diff --git a/pyproject.toml b/pyproject.toml
index 811622d742..16c43e1df4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -50,7 +50,7 @@ dependencies = [
 ]
 
 [project.optional-dependencies]
-all = ["crawlee[adaptive-crawler,beautifulsoup,cli,curl-impersonate,httpx,parsel,playwright,otel,sql_sqlite,sql_postgres,sql_mysql,stagehand,redis]"]
+all = ["crawlee[adaptive-crawler,ai,beautifulsoup,cli,curl-impersonate,httpx,parsel,playwright,otel,sql_sqlite,sql_postgres,sql_mysql,stagehand,redis]"]
 adaptive-crawler = [
     "jaro-winkler>=2.0.3",
     "playwright>=1.27.0",
@@ -58,6 +58,7 @@ adaptive-crawler = [
     "apify_fingerprint_datapoints>=0.0.3",
     "browserforge>=1.2.4"
 ]
+ai = ["pydantic-ai-slim[openai]>=1.106.0", "parsel>=1.10.0", "lxml[html_clean]>=5.2.0"]
 beautifulsoup = ["beautifulsoup4[lxml]>=4.12.0", "html5lib>=1.0"]
 cli = ["cookiecutter>=2.6.0", "inquirer>=3.3.0", "rich>=13.9.0", "typer>=0.12.0"]
 curl-impersonate = ["curl-cffi>=0.9.0"]
diff --git a/src/crawlee/crawlers/__init__.py b/src/crawlee/crawlers/__init__.py
index ac97581bb0..2e67efa985 100644
--- a/src/crawlee/crawlers/__init__.py
+++ b/src/crawlee/crawlers/__init__.py
@@ -65,6 +65,36 @@
         StagehandPreNavCrawlingContext,
     )
 
+with _try_import(
+    __name__,
+    'AiCleanHtmlDistiller',
+    'AiCrawler',
+    'AiCrawlingContext',
+    'AiDirectExtractor',
+    'AiHtmlDistiller',
+    'AiHtmlExtractor',
+    'AiSelectorExtractor',
+    'AiSkeletonDistiller',
+    'AiUsageStats',
+    'BaseAiHtmlDistiller',
+    'BaseAiHtmlExtractor',
+    'get_basic_ai_cleaner',
+):
+    from ._ai import (
+        AiCleanHtmlDistiller,
+        AiCrawler,
+        AiCrawlingContext,
+        AiDirectExtractor,
+        AiHtmlDistiller,
+        AiHtmlExtractor,
+        AiSelectorExtractor,
+        AiSkeletonDistiller,
+        AiUsageStats,
+        BaseAiHtmlDistiller,
+        BaseAiHtmlExtractor,
+        get_basic_ai_cleaner,
+    )
+
 
 __all__ = [
     'AbstractHttpCrawler',
@@ -74,6 +104,17 @@
     'AdaptivePlaywrightCrawlingContext',
     'AdaptivePlaywrightPostNavCrawlingContext',
     'AdaptivePlaywrightPreNavCrawlingContext',
+    'AiCleanHtmlDistiller',
+    'AiCrawler',
+    'AiCrawlingContext',
+    'AiDirectExtractor',
+    'AiHtmlDistiller',
+    'AiHtmlExtractor',
+    'AiSelectorExtractor',
+    'AiSkeletonDistiller',
+    'AiUsageStats',
+    'BaseAiHtmlDistiller',
+    'BaseAiHtmlExtractor',
     'BasicCrawler',
     'BasicCrawlerOptions',
     'BasicCrawlingContext',
@@ -99,4 +140,5 @@
     'StagehandCrawlingContext',
     'StagehandPostNavCrawlingContext',
     'StagehandPreNavCrawlingContext',
+    'get_basic_ai_cleaner',
 ]
diff --git a/src/crawlee/crawlers/_ai/__init__.py b/src/crawlee/crawlers/_ai/__init__.py
new file mode 100644
index 0000000000..90571efc04
--- /dev/null
+++ b/src/crawlee/crawlers/_ai/__init__.py
@@ -0,0 +1,42 @@
+from crawlee._utils.try_import import install_import_hook as _install_import_hook
+from crawlee._utils.try_import import try_import as _try_import
+
+_install_import_hook(__name__)
+
+# The following imports are wrapped in try_import to handle optional dependencies (the `ai` extra),
+# ensuring the module can still function even if these dependencies are missing.
+with _try_import(__name__, 'AiCrawler'):
+    from ._ai_crawler import AiCrawler
+with _try_import(__name__, 'AiCrawlingContext'):
+    from ._ai_crawling_context import AiCrawlingContext
+with _try_import(__name__, 'BaseAiHtmlExtractor'):
+    from ._base_extractor import BaseAiHtmlExtractor
+with _try_import(__name__, 'AiDirectExtractor'):
+    from ._direct_extractor import AiDirectExtractor
+with _try_import(__name__, 'AiSelectorExtractor'):
+    from ._selector_extractor import AiSelectorExtractor
+with _try_import(__name__, 'BaseAiHtmlDistiller'):
+    from ._base_distiller import BaseAiHtmlDistiller
+with _try_import(__name__, 'AiCleanHtmlDistiller'):
+    from ._clean_html_distiller import AiCleanHtmlDistiller
+with _try_import(__name__, 'AiSkeletonDistiller'):
+    from ._skeleton_distiller import AiSkeletonDistiller
+with _try_import(__name__, 'AiHtmlDistiller', 'AiHtmlExtractor', 'AiUsageStats'):
+    from ._types import AiHtmlDistiller, AiHtmlExtractor, AiUsageStats
+with _try_import(__name__, 'get_basic_ai_cleaner'):
+    from ._utils import get_basic_ai_cleaner
+
+__all__ = [
+    'AiCleanHtmlDistiller',
+    'AiCrawler',
+    'AiCrawlingContext',
+    'AiDirectExtractor',
+    'AiHtmlDistiller',
+    'AiHtmlExtractor',
+    'AiSelectorExtractor',
+    'AiSkeletonDistiller',
+    'AiUsageStats',
+    'BaseAiHtmlDistiller',
+    'BaseAiHtmlExtractor',
+    'get_basic_ai_cleaner',
+]
diff --git a/src/crawlee/crawlers/_ai/_ai_crawler.py b/src/crawlee/crawlers/_ai/_ai_crawler.py
new file mode 100644
index 0000000000..5c89d20e1f
--- /dev/null
+++ b/src/crawlee/crawlers/_ai/_ai_crawler.py
@@ -0,0 +1,173 @@
+from __future__ import annotations
+
+import warnings
+from contextlib import AbstractAsyncContextManager
+from logging import getLogger
+from typing import TYPE_CHECKING
+
+from parsel import Selector
+
+from crawlee._utils.docs import docs_group
+from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions
+from crawlee.crawlers._parsel._parsel_crawling_context import ParselCrawlingContext
+from crawlee.crawlers._parsel._parsel_parser import ParselParser
+
+from ._ai_crawling_context import AiCrawlingContext
+from ._direct_extractor import AiDirectExtractor
+
+if TYPE_CHECKING:
+    from collections.abc import AsyncGenerator
+
+    from pydantic_ai.models import Model
+    from typing_extensions import Unpack
+
+    from crawlee import Request
+    from crawlee.crawlers._abstract_http import ParsedHttpCrawlingContext
+
+    from ._types import AiHtmlExtractor, AiUsageStats, ExtractFunction, TSchema
+
+
+logger = getLogger(__name__)
+
+
+@docs_group('Crawlers')
+class AiCrawler(AbstractHttpCrawler[AiCrawlingContext, Selector, Selector]):
+    """A web crawler that extracts structured data from pages using an AI model.
+
+    Builds on `AbstractHttpCrawler` and parses responses with Parsel, so the request handler has both the usual
+    Parsel `selector` and the AI-powered `extract` helper: pass a Pydantic model and get a validated instance back.
+
+    The model layer is Pydantic AI, so any provider it supports (OpenAI, Anthropic, Gemini, Ollama, ...) works
+    through the `model` argument. The default extractor is an `AiDirectExtractor`: each page is distilled and sent
+    to the model in one call. For cached CSS-selector extraction at near-zero LLM cost, pass an `AiSelectorExtractor`
+    through the `extractor` argument.
+
+    Warning:
+        This is an experimental crawler. Its public API may change in future versions.
+
+    ### Usage
+
+    ```python
+    from pydantic import BaseModel
+    from pydantic_ai.models.openai import OpenAIChatModel
+    from pydantic_ai.providers.openai import OpenAIProvider
+
+    from crawlee.crawlers import AiCrawler, AiCrawlingContext
+
+
+    class Article(BaseModel):
+        title: str
+        author: str | None
+
+
+    crawler = AiCrawler(model=OpenAIChatModel('gpt-5.4-nano', provider=OpenAIProvider(api_key='...')))
+
+
+    @crawler.router.default_handler
+    async def request_handler(context: AiCrawlingContext) -> None:
+        article = await context.extract(Article)
+        await context.push_data(article.model_dump())
+
+
+    await crawler.run(['https://crawlee.dev/'])
+    ```
+    """
+
+    def __init__(
+        self,
+        *,
+        model: str | Model | None = None,
+        extractor: AiHtmlExtractor | None = None,
+        **kwargs: Unpack[HttpCrawlerOptions[AiCrawlingContext]],
+    ) -> None:
+        """Initialize a new instance.
+
+        Args:
+            model: The model used for extraction, given to the default extractor (`AiDirectExtractor`). A
+                provider-prefixed name (e.g. `'openai:gpt-5.4-nano'`) or a Pydantic AI `Model` instance. When given
+                as a string, the provider reads credentials from its environment variable (e.g. `OPENAI_API_KEY`).
+                Pass a `Model` instance to supply them explicitly. Provide exactly one of `model` or `extractor`.
+            extractor: A pre-configured `AiHtmlExtractor`, for full control over the distiller, instructions,
+                caching, usage limits, and model fallback. Pass an `AiSelectorExtractor` here for cached-selector
+                extraction. Provide exactly one of `model` or `extractor`.
+            kwargs: Additional keyword arguments to pass to the underlying `AbstractHttpCrawler`.
+        """
+        if (model is None) == (extractor is None):
+            raise ValueError('Provide exactly one of `model` or `extractor`.')
+
+        if extractor is None and model is not None:
+            extractor = AiDirectExtractor(model)
+
+        if extractor is None:
+            raise ValueError('Extractor initialization failed; check the provided model or extractor configuration.')
+
+        # Flag the experimental API; `stacklevel=2` attributes the warning to the code constructing the crawler.
+        warnings.warn(
+            'The AiCrawler is experimental and its public API may change in future releases.',
+            category=UserWarning,
+            stacklevel=2,
+        )
+
+        self._ai_usage = extractor.ai_usage
+        self._extractor = extractor
+
+        async def final_step(
+            context: ParsedHttpCrawlingContext[Selector],
+        ) -> AsyncGenerator[AiCrawlingContext, None]:
+            """Enhance `ParsedHttpCrawlingContext[Selector]` with the `extract` helper and `ai_usage`."""
+            parsel_context = ParselCrawlingContext.from_parsed_http_crawling_context(context)
+            yield AiCrawlingContext.from_parsel_crawling_context(
+                parsel_context,
+                extract=self._create_extract_function(parsel_context.selector, parsel_context.request),
+                ai_usage=self._ai_usage,
+            )
+
+        kwargs['_context_pipeline'] = self._create_static_content_crawler_pipeline().compose(final_step)
+
+        # If the extractor is an async context manager, add it to the crawler's additional context managers so it's
+        # properly entered and exited around the crawl.
+        if isinstance(extractor, AbstractAsyncContextManager):
+            kwargs['_additional_context_managers'] = [
+                *kwargs.get('_additional_context_managers', []),
+                extractor,
+            ]
+        super().__init__(
+            parser=ParselParser(),
+            **kwargs,
+        )
+
+    @property
+    def extractor(self) -> AiHtmlExtractor:
+        """The extractor used to turn pages into structured data."""
+        return self._extractor
+
+    @property
+    def ai_usage(self) -> AiUsageStats:
+        """Accumulated token usage across extraction calls."""
+        return self._ai_usage
+
+    def _create_extract_function(self, selector: Selector, request: Request) -> ExtractFunction:
+        """Build an `extract` helper bound to the page's parsed tree.
+
+        When the caller omits `cache_tag`, it defaults to `request.label` so an `AiSelectorExtractor` buckets
+        selectors per route without extra wiring. An explicit `cache_tag` overrides this.
+        """
+
+        async def extract(
+            schema: type[TSchema],
+            *,
+            scope: str | None = None,
+            cache_tag: str | None = None,
+            additional_instructions: str | None = None,
+        ) -> TSchema:
+            # `AiHtmlExtractor.extract` accepts a Selector directly, so the already-parsed tree is handed over
+            # without a serialize round trip.
+            return await self._extractor.extract(
+                selector,
+                schema,
+                scope=scope,
+                cache_tag=cache_tag if cache_tag is not None else request.label,
+                additional_instructions=additional_instructions,
+            )
+
+        return extract
diff --git a/src/crawlee/crawlers/_ai/_ai_crawling_context.py b/src/crawlee/crawlers/_ai/_ai_crawling_context.py
new file mode 100644
index 0000000000..18377a6644
--- /dev/null
+++ b/src/crawlee/crawlers/_ai/_ai_crawling_context.py
@@ -0,0 +1,44 @@
+from __future__ import annotations
+
+from dataclasses import dataclass, fields
+from typing import TYPE_CHECKING
+
+from crawlee._utils.docs import docs_group
+from crawlee.crawlers._parsel._parsel_crawling_context import ParselCrawlingContext
+
+if TYPE_CHECKING:
+    from typing_extensions import Self
+
+    from ._types import AiUsageStats, ExtractFunction
+
+
+@dataclass(frozen=True)
+@docs_group('Crawling contexts')
+class AiCrawlingContext(ParselCrawlingContext):
+    """The crawling context used by the `AiCrawler`.
+
+    It extends `ParselCrawlingContext`, so the full Parsel `selector` (and `enqueue_links`) remain available
+    alongside the AI-powered `extract` helper. Handlers can mix cheap manual selectors with AI extraction on the
+    same page.
+    """
+
+    extract: ExtractFunction
+    """Extract a structured Pydantic model from the page using the configured AI extractor."""
+
+    ai_usage: AiUsageStats
+    """The cumulative token usage stats of the extractor across calls in this crawl."""
+
+    @classmethod
+    def from_parsel_crawling_context(
+        cls,
+        context: ParselCrawlingContext,
+        *,
+        extract: ExtractFunction,
+        ai_usage: AiUsageStats,
+    ) -> Self:
+        """Create a new context from an existing `ParselCrawlingContext`."""
+        return cls(
+            extract=extract,
+            ai_usage=ai_usage,
+            **{field.name: getattr(context, field.name) for field in fields(context)},
+        )
diff --git a/src/crawlee/crawlers/_ai/_base_distiller.py b/src/crawlee/crawlers/_ai/_base_distiller.py
new file mode 100644
index 0000000000..3567054167
--- /dev/null
+++ b/src/crawlee/crawlers/_ai/_base_distiller.py
@@ -0,0 +1,66 @@
+from __future__ import annotations
+
+import re
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING
+
+from crawlee._utils.docs import docs_group
+
+if TYPE_CHECKING:
+    from lxml.html import HtmlElement
+
+
+# Placeholder tag used to hide JSON scripts from the cleaning pass. The cleaner removes `