@@ -80,28 +80,30 @@ def __init__(
8080 None if node_config is None else node_config .get ("scrape_do" , None )
8181 )
8282
def is_valid_url(self, source: str) -> bool:
    """
    Validate that ``source`` looks like an absolute http(s) URL.

    The check is a lightweight regex test, not a full RFC 3986 parse: the
    string must start with ``http://`` or ``https://``, continue with one
    character that is neither whitespace nor one of ``/ $ . ? #``, then any
    single character, and end with a run of non-whitespace characters.

    Parameters:
        source (str): The URL string to validate.

    Returns:
        bool: Always ``True`` when the URL is valid. Invalid input raises
        instead of returning ``False``.

    Raises:
        ValueError: If ``source`` is not a string or does not match the
            expected URL format.
    """
    import re

    url_pattern = r"^https?://[^\s/$.?#].[^\s]*$"
    # fullmatch (not match) is required here: with re.match the trailing "$"
    # also matches just before a final newline, so "http://host.tld\n" would
    # be accepted as valid.
    # The isinstance guard turns non-string input into the documented
    # ValueError instead of a TypeError raised from inside the re module.
    if not isinstance(source, str) or re.fullmatch(url_pattern, source) is None:
        raise ValueError(
            f"Invalid URL format: {source} . URL must start with http(s):// and contain a valid domain."
        )
    return True
98+
8399 def execute (self , state ):
84100 """
85101 Executes the node's logic to fetch HTML content from a specified URL and
86102 update the state with this content.
87-
88- Args:
89- state (dict): The current state of the graph. The input keys will be used
90- to fetch the correct data types from the state.
91-
92- Returns:
93- dict: The updated state with a new output key containing the fetched HTML content.
94-
95- Raises:
96- KeyError: If the input key is not found in the state, indicating that the
97- necessary information to perform the operation is missing.
98103 """
99-
100104 self .logger .info (f"--- Executing { self .node_name } Node ---" )
101105
102- # Interpret input keys based on the provided input expression
103106 input_keys = self .get_input_keys (state )
104- # Fetching data from the state based on the input keys
105107 input_data = [state [key ] for key in input_keys ]
106108
107109 source = input_data [0 ]
@@ -124,10 +126,16 @@ def execute(self, state):
124126 return handlers [input_type ](state , input_type , source )
125127 elif self .input == "pdf_dir" :
126128 return state
127- elif not source .startswith ("http" ) and not source .startswith ("www" ):
128- return self .handle_local_source (state , source )
129- else :
130- return self .handle_web_source (state , source )
129+
130+ # For web sources, validate URL before proceeding
131+ try :
132+ if self .is_valid_url (source ):
133+ return self .handle_web_source (state , source )
134+ except ValueError as e :
135+ # Re-raise the exception from is_valid_url
136+ raise
137+
138+ return self .handle_local_source (state , source )
131139
132140 def handle_directory (self , state , input_type , source ):
133141 """
0 commit comments