|
1 | 1 | """ |
2 | | -Module for checking if a website is scrapepable or not |
| 2 | +Module for checking if a website is scrapepable or not |
3 | 3 | """ |
4 | 4 | from typing import List |
5 | 5 | from urllib.parse import urlparse |
|
12 | 12 |
|
13 | 13 | class RobotsNode(BaseNode):
14 | 14 | """
15 | | - A node responsible for checking if a website is scrapepable or not.
| 15 | + A node responsible for checking if a website is scrapable or not.
16 | 16 | It uses the AsyncHtmlLoader for asynchronous
17 | 17 | document loading.
18 | 18 |
|
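The AsyncHtmlLoader named in this docstring is LangChain's asynchronous HTML loader. As a minimal sketch of what the node relies on, assuming the current langchain_community import path and an illustrative URL:

```python
from langchain_community.document_loaders import AsyncHtmlLoader

# Fetch one page; the loader gathers pages asynchronously internally,
# while load() blocks until the list of Documents is ready.
loader = AsyncHtmlLoader("https://example.com")  # illustrative URL
docs = loader.load()
print(docs[0].page_content[:200])  # first characters of the fetched HTML
```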
@@ -59,7 +59,7 @@ def __init__(self, input: str, output: List[str], node_config: dict, force_scra
59 | 59 | node_config (dict): Configuration parameters for the node.
60 | 60 | force_scraping (bool): A flag indicating whether scraping should be enforced even
61 | 61 | if disallowed by robots.txt. Defaults to True.
62 | | - node_name (str, optional): The unique identifier name for the node.
| 62 | + node_name (str, optional): The unique identifier name for the node.
63 | 63 | Defaults to "Robots".
64 | 64 | """
65 | 65 | super().__init__(node_name, "node", input, output, 1)
@@ -112,11 +112,12 @@ def execute(self, state):
112 | 112 | base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
113 | 113 | loader = AsyncHtmlLoader(f"{base_url}/robots.txt")
114 | 114 | document = loader.load()
115 | | - model = self.llm_model.model_name
116 | | -
117 | | - if "ollama" in model:
118 | | -     model = model.split("/", maxsplit=1)[-1]
| 115 | + if "ollama" in self.llm_model.model:
| 116 | +     self.llm_model.model = self.llm_model.model.split("/")[-1]
| 117 | +     model = self.llm_model.model
119 | 118 |
| 119 | + else:
| 120 | +     model = self.llm_model.model_name
120 | 121 | try:
121 | 122 |     agent = robots_dictionary[model]
122 | 123 |
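For context on the unchanged lines at the top of this hunk: the robots.txt location is derived from the page URL with urllib.parse.urlparse (imported at the top of the module), keeping only scheme and host. A self-contained sketch with an illustrative input URL:

```python
from urllib.parse import urlparse

url = "https://example.com/products/page?id=42"  # illustrative input
parsed_url = urlparse(url)
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"

print(base_url)                  # https://example.com
print(f"{base_url}/robots.txt")  # https://example.com/robots.txt
```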
|
|
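The new branch exists because Ollama-style model identifiers carry a provider prefix (e.g. "ollama/llama3"), while the robots_dictionary lookup in the following try block is keyed on the bare model name. A sketch of that normalization in isolation; the helper name and dictionary entries are illustrative assumptions, not taken from the library:

```python
# Illustrative stand-in for robots_dictionary; the real keys and
# values are defined elsewhere in the library.
robots_dictionary = {
    "llama3": ["llama3-agent"],
    "gpt-4": ["gpt-4-agent"],
}

def resolve_model_key(model_id: str) -> str:
    """Hypothetical helper: strip the provider prefix from an
    Ollama-style id so it matches robots_dictionary keys."""
    if "ollama" in model_id:
        # "ollama/llama3" -> "llama3"; a bare name is returned unchanged
        return model_id.split("/")[-1]
    return model_id

assert resolve_model_key("ollama/llama3") == "llama3"
assert resolve_model_key("gpt-4") == "gpt-4"
agent = robots_dictionary[resolve_model_key("ollama/llama3")]
```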