-
-
Notifications
You must be signed in to change notification settings - Fork 2.1k
Expand file tree
/
Copy pathchromium_selenium.py
More file actions
148 lines (127 loc) · 4.83 KB
/
chromium_selenium.py
File metadata and controls
148 lines (127 loc) · 4.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# Example pipeline: scrape pages with ChromiumLoader (Playwright/Selenium
# backends) and analyze the scraped content with ScrapegraphAI.
import asyncio
import json
import os
from aiohttp import ClientError
from dotenv import load_dotenv
from scrapegraphai.docloaders.chromium import ( # Import your ChromiumLoader class
ChromiumLoader,
)
from scrapegraphai.graphs import SmartScraperGraph
# Load environment variables for API keys
# (reads a local .env file so OPENAI_API_KEY is available via os.getenv)
load_dotenv()
# ************************************************
# Define function to analyze content with ScrapegraphAI
# ************************************************
async def analyze_content_with_scrapegraph(content: str) -> dict:
    """
    Analyze scraped content using ScrapegraphAI.

    Args:
        content (str): The scraped HTML or text content.

    Returns:
        dict: The result from ScrapegraphAI analysis, or
            ``{"error": <message>}`` if the analysis fails.
    """
    try:
        # Initialize ScrapegraphAI SmartScraperGraph
        smart_scraper = SmartScraperGraph(
            prompt="Summarize the main content of this webpage and extract any contact information.",
            source=content,  # Pass the content directly
            config={
                "llm": {
                    "api_key": os.getenv("OPENAI_API_KEY"),
                    "model": "openai/gpt-4o",
                },
                "verbose": True,
            },
        )
        # BUGFIX: smart_scraper.run() is a blocking call; invoking it directly
        # inside this coroutine would stall the whole event loop (and every
        # concurrent scrape). Run it in the default thread-pool executor so
        # the loop stays responsive.
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(None, smart_scraper.run)
        return result
    except Exception as e:
        # Best-effort contract: never raise out of the analysis step; report
        # the failure to the caller as a structured error instead.
        print(f"❌ ScrapegraphAI analysis failed: {e}")
        return {"error": str(e)}
# ************************************************
# Test scraper and ScrapegraphAI pipeline
# ************************************************
async def test_scraper_with_analysis(scraper: ChromiumLoader, urls: list):
    """
    Test scraper for the given backend and URLs, then analyze content with ScrapegraphAI.

    Args:
        scraper (ChromiumLoader): The ChromiumLoader instance.
        urls (list): A list of URLs to scrape.
    """
    for target in urls:
        try:
            print(f"\n🔎 Scraping: {target} using {scraper.backend}...")
            page_content = await scraper.scrape(target)
            # Treat an embedded "Error" marker or an empty/whitespace-only
            # payload as a failed scrape and skip the analysis step.
            failed = "Error" in page_content or not page_content.strip()
            if failed:
                print(f"❌ Failed to scrape {target}: {page_content}")
                continue
            print(
                f"✅ Successfully scraped {target}. Content (first 200 chars): {page_content[:200]}"
            )
            # Pass scraped content to ScrapegraphAI for analysis
            print("🤖 Analyzing content with ScrapegraphAI...")
            analysis = await analyze_content_with_scrapegraph(page_content)
            print("📝 Analysis Result:")
            print(json.dumps(analysis, indent=4))
        except ClientError as ce:
            print(f"❌ Network error while scraping {target}: {ce}")
        except Exception as e:
            print(f"❌ Unexpected error while scraping {target}: {e}")
# ************************************************
# Main Execution
# ************************************************
async def _run_backend(backend: str, label: str, urls: list):
    """Exercise one scraping backend with both chromium and firefox browsers.

    Args:
        backend (str): Backend identifier passed to ChromiumLoader
            ("playwright" or "selenium").
        label (str): Human-readable backend name used in status messages.
        urls (list): URLs to scrape with each browser.
    """
    print(f"\n--- Testing {label} Backend ---")
    try:
        # Same try/except scope as before: a failure with chromium also
        # skips the firefox run for this backend.
        for browser in ("chromium", "firefox"):
            scraper = ChromiumLoader(
                urls=urls,
                backend=backend,
                headless=True,
                browser_name=browser,
            )
            await test_scraper_with_analysis(scraper, urls)
    except ImportError as ie:
        print(f"❌ {label} ImportError: {ie}")
    except Exception as e:
        print(f"❌ Error initializing {label} ChromiumLoader: {e}")


async def main():
    """Run the full demo: scrape the sample URLs with every backend/browser
    combination and analyze each result with ScrapegraphAI.

    The third URL is intentionally invalid to exercise the error paths.
    """
    urls_to_scrape = [
        "https://example.com",
        "https://www.python.org",
        "https://invalid-url.test",
    ]
    # The four near-identical loader stanzas collapse into one helper,
    # preserving the original execution order and every printed message.
    await _run_backend("playwright", "Playwright", urls_to_scrape)
    await _run_backend("selenium", "Selenium", urls_to_scrape)
def _entry() -> None:
    """Script entry point: run the async pipeline, reporting interrupts and
    unexpected crashes instead of letting them propagate."""
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        print("❌ Program interrupted by user.")
    except Exception as exc:
        print(f"❌ Program crashed: {exc}")


if __name__ == "__main__":
    _entry()