From 928fb9b274b55caaec3024bf1c3ca5b865120aa2 Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Tue, 10 Jun 2025 15:33:11 +0200
Subject: [PATCH 1/8] feat: add infinite scrolling

---
 ...nc_smartscraper_infinite_scroll_example.py | 71 +++++++++++++++++++
 .../smartscraper_infinite_scroll_example.py   | 46 ++++++++++++
 scrapegraph-py/scrapegraph_py/async_client.py |  4 ++
 scrapegraph-py/scrapegraph_py/client.py       |  4 ++
 .../scrapegraph_py/models/smartscraper.py     |  7 +-
 scrapegraph-py/uv.lock                        |  2 +-
 6 files changed, 132 insertions(+), 2 deletions(-)
 create mode 100644 scrapegraph-py/examples/async/async_smartscraper_infinite_scroll_example.py
 create mode 100644 scrapegraph-py/examples/sync/smartscraper_infinite_scroll_example.py

diff --git a/scrapegraph-py/examples/async/async_smartscraper_infinite_scroll_example.py b/scrapegraph-py/examples/async/async_smartscraper_infinite_scroll_example.py
new file mode 100644
index 0000000..8b9aa00
--- /dev/null
+++ b/scrapegraph-py/examples/async/async_smartscraper_infinite_scroll_example.py
@@ -0,0 +1,71 @@
+import asyncio
+from typing import List
+from pydantic import BaseModel
+
+from scrapegraph_py import AsyncClient
+from scrapegraph_py.logger import sgai_logger
+
+sgai_logger.set_logging(level="INFO")
+
+# Define the output schema
+class Company(BaseModel):
+    name: str
+    category: str
+    location: str
+
+class CompaniesResponse(BaseModel):
+    companies: List[Company]
+
+async def scrape_companies(client: AsyncClient, url: str, batch: str) -> None:
+    """Scrape companies from a specific YC batch with infinite scroll."""
+    try:
+        response = await client.smartscraper(
+            website_url=f"{url}?batch={batch}",
+            user_prompt="Extract all company names and their categories from the page",
+            output_schema=CompaniesResponse,
+            number_of_scrolls=10  # Scroll 10 times to load more companies
+        )
+
+        # Parse and print the results
+        result = CompaniesResponse.model_validate(response['result'])
+        print(f"\nCompanies from {batch} batch:")
+        print("=" * 80)
+        for company in result.companies:
+            print(f"Name: {company.name}")
+            print(f"Category: {company.category}")
+            print(f"Location: {company.location}")
+            print("-" * 80)
+
+    except Exception as e:
+        print(f"Error scraping {batch} batch: {e}")
+
+async def main():
+    # Initialize async client
+    sgai_client = AsyncClient(api_key="your-api-key-here")
+
+    try:
+        # Define batches to scrape
+        base_url = "https://www.ycombinator.com/companies"
+        batches = [
+            "Spring%202025",
+            "Winter%202025",
+            "Summer%202024"
+        ]
+
+        # Create tasks for each batch
+        tasks = [
+            scrape_companies(sgai_client, base_url, batch)
+            for batch in batches
+        ]
+
+        # Execute all scraping tasks concurrently
+        await asyncio.gather(*tasks)
+
+    except Exception as e:
+        print(f"An error occurred: {e}")
+
+    finally:
+        await sgai_client.close()
+
+if __name__ == "__main__":
+    asyncio.run(main())
\ No newline at end of file
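A note on the example above: the SDK call returns a plain dict, and the example validates its "result" key against the Pydantic schema it passed as output_schema. A minimal offline sketch of that parsing step; the request_id/result envelope keys are taken from these examples, while the company values here are made up:

    from typing import List

    from pydantic import BaseModel


    class Company(BaseModel):
        name: str
        category: str
        location: str


    class CompaniesResponse(BaseModel):
        companies: List[Company]


    # Fabricated envelope mirroring only the keys the examples read off the response
    response = {
        "request_id": "123e4567-e89b-12d3-a456-426614174000",
        "result": {
            "companies": [
                {"name": "Acme AI", "category": "B2B", "location": "San Francisco"}
            ]
        },
    }

    # The same parsing step both new examples perform on the real response
    result = CompaniesResponse.model_validate(response["result"])
    assert result.companies[0].location == "San Francisco"

The sync example that follows relies on the same envelope.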
diff --git a/scrapegraph-py/examples/sync/smartscraper_infinite_scroll_example.py b/scrapegraph-py/examples/sync/smartscraper_infinite_scroll_example.py
new file mode 100644
index 0000000..c153d84
--- /dev/null
+++ b/scrapegraph-py/examples/sync/smartscraper_infinite_scroll_example.py
@@ -0,0 +1,46 @@
+from scrapegraph_py import Client
+from scrapegraph_py.logger import sgai_logger
+from pydantic import BaseModel
+from typing import List
+
+sgai_logger.set_logging(level="INFO")
+
+# Define the output schema
+class Company(BaseModel):
+    name: str
+    category: str
+    location: str
+
+class CompaniesResponse(BaseModel):
+    companies: List[Company]
+
+# Initialize the client with explicit API key
+sgai_client = Client(api_key="sgai-4cf4a4f5-87f7-457a-8c58-0790ecaf323e")
+
+try:
+    # SmartScraper request with infinite scroll
+    response = sgai_client.smartscraper(
+        website_url="https://www.ycombinator.com/companies?batch=Spring%202025",
+        user_prompt="Extract all company names and their categories from the page",
+        output_schema=CompaniesResponse,
+        number_of_scrolls=10  # Scroll 10 times to load more companies
+    )
+
+    # Print the response
+    print(f"Request ID: {response['request_id']}")
+
+    # Parse and print the results in a structured way
+    result = CompaniesResponse.model_validate(response['result'])
+    print("\nExtracted Companies:")
+    print("-" * 80)
+    for company in result.companies:
+        print(f"Name: {company.name}")
+        print(f"Category: {company.category}")
+        print(f"Location: {company.location}")
+        print("-" * 80)
+
+except Exception as e:
+    print(f"An error occurred: {e}")
+
+finally:
+    sgai_client.close()
\ No newline at end of file
diff --git a/scrapegraph-py/scrapegraph_py/async_client.py b/scrapegraph-py/scrapegraph_py/async_client.py
index 99b6212..7c7ea38 100644
--- a/scrapegraph-py/scrapegraph_py/async_client.py
+++ b/scrapegraph-py/scrapegraph_py/async_client.py
@@ -174,6 +174,7 @@ async def smartscraper(
         website_html: Optional[str] = None,
         headers: Optional[dict[str, str]] = None,
         output_schema: Optional[BaseModel] = None,
+        number_of_scrolls: Optional[int] = None,
     ):
         """Send a smartscraper request"""
         logger.info("🔍 Starting smartscraper request")
@@ -183,6 +184,8 @@
             logger.debug("📄 Using provided HTML content")
         if headers:
             logger.debug("🔧 Using custom headers")
+        if number_of_scrolls is not None:
+            logger.debug(f"🔄 Number of scrolls: {number_of_scrolls}")
         logger.debug(f"📝 Prompt: {user_prompt}")
 
         request = SmartScraperRequest(
@@ -191,6 +194,7 @@
             headers=headers,
             user_prompt=user_prompt,
             output_schema=output_schema,
+            number_of_scrolls=number_of_scrolls,
         )
         logger.debug("✅ Request validation passed")
diff --git a/scrapegraph-py/scrapegraph_py/client.py b/scrapegraph-py/scrapegraph_py/client.py
index 1168557..7cb6c3b 100644
--- a/scrapegraph-py/scrapegraph_py/client.py
+++ b/scrapegraph-py/scrapegraph_py/client.py
@@ -182,6 +182,7 @@ def smartscraper(
         website_html: Optional[str] = None,
         headers: Optional[dict[str, str]] = None,
         output_schema: Optional[BaseModel] = None,
+        number_of_scrolls: Optional[int] = None,
     ):
         """Send a smartscraper request"""
         logger.info("🔍 Starting smartscraper request")
@@ -191,6 +192,8 @@
             logger.debug("📄 Using provided HTML content")
         if headers:
             logger.debug("🔧 Using custom headers")
+        if number_of_scrolls is not None:
+            logger.debug(f"🔄 Number of scrolls: {number_of_scrolls}")
         logger.debug(f"📝 Prompt: {user_prompt}")
 
         request = SmartScraperRequest(
@@ -199,6 +202,7 @@
             headers=headers,
             user_prompt=user_prompt,
             output_schema=output_schema,
+            number_of_scrolls=number_of_scrolls,
         )
         logger.debug("✅ Request validation passed")
diff --git a/scrapegraph-py/scrapegraph_py/models/smartscraper.py b/scrapegraph-py/scrapegraph_py/models/smartscraper.py
index 21b346e..986144f 100644
--- a/scrapegraph-py/scrapegraph_py/models/smartscraper.py
+++ b/scrapegraph-py/scrapegraph_py/models/smartscraper.py
@@ -4,7 +4,7 @@
 from uuid import UUID
 
 from bs4 import BeautifulSoup
-from pydantic import BaseModel, Field, model_validator
+from pydantic import BaseModel, Field, model_validator, conint
 
 
 class SmartScraperRequest(BaseModel):
@@ -29,6 +29,11 @@ class SmartScraperRequest(BaseModel):
         description="Optional headers to send with the request, including cookies and user agent",
     )
     output_schema: Optional[Type[BaseModel]] = None
+    number_of_scrolls: Optional[conint(ge=0, le=100)] = Field(
+        default=None,
+        description="Number of times to scroll the page (0-100). If None, no scrolling will be performed.",
+        example=10
+    )
 
     @model_validator(mode="after")
     def validate_user_prompt(self) -> "SmartScraperRequest":
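The conint(ge=0, le=100) annotation above means an out-of-range scroll count fails at request construction, before any HTTP call is made. A quick sketch of that behavior, assuming only the fields visible in this diff:

    from pydantic import ValidationError

    from scrapegraph_py.models.smartscraper import SmartScraperRequest

    try:
        SmartScraperRequest(
            user_prompt="Extract the page title",
            website_url="https://example.com",
            number_of_scrolls=500,  # outside the 0-100 bound declared above
        )
    except ValidationError as exc:
        # pydantic reports the violated ge/le constraint on number_of_scrolls
        print(exc)

Because the constraint lives on the shared request model, both the sync Client and the AsyncClient get the same validation for free.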
diff --git a/scrapegraph-py/uv.lock b/scrapegraph-py/uv.lock
index c250817..290ee64 100644
--- a/scrapegraph-py/uv.lock
+++ b/scrapegraph-py/uv.lock
@@ -1525,7 +1525,7 @@ dev = [
 
 [package.metadata]
 requires-dist = [
-    { name = "aiohttp", specifier = ">=3.11.8" },
+    { name = "aiohttp", specifier = ">=3.10" },
     { name = "beautifulsoup4", specifier = ">=4.12.3" },
     { name = "furo", marker = "extra == 'docs'", specifier = "==2024.5.6" },
     { name = "pydantic", specifier = ">=2.10.2" },

From ed866eb0c2b2b5f48bfa598533a061b8a03a408d Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Tue, 10 Jun 2025 17:10:57 +0200
Subject: [PATCH 2/8] Update smartscraper_infinite_scroll_example.py

---
 .../examples/sync/smartscraper_infinite_scroll_example.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scrapegraph-py/examples/sync/smartscraper_infinite_scroll_example.py b/scrapegraph-py/examples/sync/smartscraper_infinite_scroll_example.py
index c153d84..5795936 100644
--- a/scrapegraph-py/examples/sync/smartscraper_infinite_scroll_example.py
+++ b/scrapegraph-py/examples/sync/smartscraper_infinite_scroll_example.py
@@ -15,7 +15,7 @@ class CompaniesResponse(BaseModel):
     companies: List[Company]
 
 # Initialize the client with explicit API key
-sgai_client = Client(api_key="sgai-4cf4a4f5-87f7-457a-8c58-0790ecaf323e")
+sgai_client = Client(api_key="sgai-api-key")
 
 try:
     # SmartScraper request with infinite scroll

From dc95174db97340b241e74f5e938d8d7acf830ecb Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Tue, 10 Jun 2025 17:12:03 +0200
Subject: [PATCH 3/8] Update async_smartscraper_infinite_scroll_example.py

---
 ...nc_smartscraper_infinite_scroll_example.py | 88 +++++++++----------
 1 file changed, 43 insertions(+), 45 deletions(-)

diff --git a/scrapegraph-py/examples/async/async_smartscraper_infinite_scroll_example.py b/scrapegraph-py/examples/async/async_smartscraper_infinite_scroll_example.py
index 8b9aa00..e6d9fea 100644
--- a/scrapegraph-py/examples/async/async_smartscraper_infinite_scroll_example.py
+++ b/scrapegraph-py/examples/async/async_smartscraper_infinite_scroll_example.py
@@ -1,71 +1,69 @@
 import asyncio
-from typing import List
-from pydantic import BaseModel
+from typing import List, Dict, Any
 
 from scrapegraph_py import AsyncClient
 from scrapegraph_py.logger import sgai_logger
 
 sgai_logger.set_logging(level="INFO")
 
-# Define the output schema
-class Company(BaseModel):
-    name: str
-    category: str
-    location: str
-
-class CompaniesResponse(BaseModel):
-    companies: List[Company]
 
 async def scrape_companies(client: AsyncClient, url: str, batch: str) -> None:
     """Scrape companies from a specific YC batch with infinite scroll."""
     try:
+        # Initial scrape with infinite scroll enabled
         response = await client.smartscraper(
-            website_url=f"{url}?batch={batch}",
-            user_prompt="Extract all company names and their categories from the page",
-            output_schema=CompaniesResponse,
-            number_of_scrolls=10  # Scroll 10 times to load more companies
+            website_url=url,
+            user_prompt="Extract all company information from this page, including name, description, and website",
+            infinite_scroll=True,
+            scroll_options={
+                "max_scrolls": 10,  # Adjust based on page size
+                "scroll_delay": 2,  # Seconds between scrolls
+                "scroll_to_bottom": True
+            }
         )
-
-        # Parse and print the results
-        result = CompaniesResponse.model_validate(response['result'])
-        print(f"\nCompanies from {batch} batch:")
-        print("=" * 80)
-        for company in result.companies:
-            print(f"Name: {company.name}")
-            print(f"Category: {company.category}")
-            print(f"Location: {company.location}")
-            print("-" * 80)
-
+
+        # Process the results
+        companies = response.get("result", [])
+        if not companies:
+            print(f"No companies found for batch {batch}")
+            return
+
+        # Save or process the companies data
+        print(f"Found {len(companies)} companies in batch {batch}")
+        for company in companies:
+            print(f"Company: {company.get('name', 'N/A')}")
+            print(f"Description: {company.get('description', 'N/A')}")
+            print(f"Website: {company.get('website', 'N/A')}")
+            print("-" * 50)
+
     except Exception as e:
-        print(f"Error scraping {batch} batch: {e}")
+        print(f"Error scraping batch {batch}: {str(e)}")
+
 
 async def main():
     # Initialize async client
-    sgai_client = AsyncClient(api_key="your-api-key-here")
-
+    client = AsyncClient(api_key="your-api-key-here")
+
     try:
-        # Define batches to scrape
-        base_url = "https://www.ycombinator.com/companies"
-        batches = [
-            "Spring%202025",
-            "Winter%202025",
-            "Summer%202024"
-        ]
-
+        # Example YC batch URLs
+        batch_urls = {
+            "W24": "https://www.ycombinator.com/companies?batch=W24",
+            "S23": "https://www.ycombinator.com/companies?batch=S23"
+        }
+
         # Create tasks for each batch
         tasks = [
-            scrape_companies(sgai_client, base_url, batch)
-            for batch in batches
+            scrape_companies(client, url, batch)
+            for batch, url in batch_urls.items()
         ]
-
-        # Execute all scraping tasks concurrently
+
+        # Execute all batch scraping concurrently
         await asyncio.gather(*tasks)
-
-    except Exception as e:
-        print(f"An error occurred: {e}")
-
+
     finally:
-        await sgai_client.close()
+        # Ensure client is properly closed
+        await client.close()
+
 
 if __name__ == "__main__":
     asyncio.run(main())
\ No newline at end of file

From ffc32ce05a5c3546579142181466e34c3027ec67 Mon Sep 17 00:00:00 2001
From: Vikrant-Khedkar
Date: Wed, 11 Jun 2025 14:38:17 +0530
Subject: [PATCH 4/8] refactor: simplify infinite scroll config by replacing
 scroll_options with number_of_scrolls parameter

---
 .../async/async_smartscraper_infinite_scroll_example.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/scrapegraph-py/examples/async/async_smartscraper_infinite_scroll_example.py b/scrapegraph-py/examples/async/async_smartscraper_infinite_scroll_example.py
index e6d9fea..ea4c513 100644
--- a/scrapegraph-py/examples/async/async_smartscraper_infinite_scroll_example.py
+++ b/scrapegraph-py/examples/async/async_smartscraper_infinite_scroll_example.py
@@ -14,12 +14,7 @@ async def scrape_companies(client: AsyncClient, url: str, batch: str) -> None:
         response = await client.smartscraper(
             website_url=url,
             user_prompt="Extract all company information from this page, including name, description, and website",
-            infinite_scroll=True,
-            scroll_options={
-                "max_scrolls": 10,  # Adjust based on page size
-                "scroll_delay": 2,  # Seconds between scrolls
-                "scroll_to_bottom": True
-            }
+            number_of_scrolls=10,
         )
 
         # Process the results
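Worth flagging for reviewers: the infinite_scroll/scroll_options keywords removed here are not part of the SDK surface anywhere in this series. PATCH 1 only added number_of_scrolls to smartscraper(), so this refactor brings the example back in line with the signature that actually shipped. A minimal call after this change (a sketch with a placeholder key, not part of the patch):

    import asyncio

    from scrapegraph_py import AsyncClient


    async def main() -> None:
        client = AsyncClient(api_key="your-api-key-here")
        try:
            response = await client.smartscraper(
                website_url="https://www.ycombinator.com/companies?batch=Winter%202024",
                user_prompt="Extract all company names from the page",
                number_of_scrolls=10,  # one validated integer instead of an options dict
            )
            print(response.get("result"))
        finally:
            await client.close()


    if __name__ == "__main__":
        asyncio.run(main())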
From fd2f5d1bf3e8d34ecc12068f4471e7031e730722 Mon Sep 17 00:00:00 2001
From: mohammadehsanansari
Date: Mon, 16 Jun 2025 13:50:47 +0530
Subject: [PATCH 5/8] fixed the examples

---
 __pycache__/test_sdk.cpython-312.pyc               | Bin 0 -> 838 bytes
 .../async_smartscraper_infinite_scroll_example.py  |  11 +++++------
 2 files changed, 5 insertions(+), 6 deletions(-)
 create mode 100644 __pycache__/test_sdk.cpython-312.pyc

diff --git a/__pycache__/test_sdk.cpython-312.pyc b/__pycache__/test_sdk.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8cccba0cbe89b54332d0afba7f9ad557eb723c8b
Binary files /dev/null and b/__pycache__/test_sdk.cpython-312.pyc differ
diff --git a/scrapegraph-py/examples/async/async_smartscraper_infinite_scroll_example.py b/scrapegraph-py/examples/async/async_smartscraper_infinite_scroll_example.py
index ea4c513..e3fd59e 100644
--- a/scrapegraph-py/examples/async/async_smartscraper_infinite_scroll_example.py
+++ b/scrapegraph-py/examples/async/async_smartscraper_infinite_scroll_example.py
@@ -16,15 +16,14 @@ async def scrape_companies(client: AsyncClient, url: str, batch: str) -> None:
             user_prompt="Extract all company information from this page, including name, description, and website",
             number_of_scrolls=10,
         )
-
-        # Process the results
-        companies = response.get("result", [])
+        # Process the results
+        companies = response.get("result", {}).get("companies", [])
         if not companies:
             print(f"No companies found for batch {batch}")
             return
 
         # Save or process the companies data
         print(f"Found {len(companies)} companies in batch {batch}")
         for company in companies:
             print(f"Company: {company.get('name', 'N/A')}")
             print(f"Description: {company.get('description', 'N/A')}")
@@ -37,13 +36,13 @@ async def scrape_companies(client: AsyncClient, url: str, batch: str) -> None:
 
 async def main():
     # Initialize async client
-    client = AsyncClient(api_key="your-api-key-here")
+    client = AsyncClient(api_key="sgai-4cf4a4f5-87f7-457a-8c58-0790ecaf323e")
 
     try:
         # Example YC batch URLs
         batch_urls = {
-            "W24": "https://www.ycombinator.com/companies?batch=W24",
-            "S23": "https://www.ycombinator.com/companies?batch=S23"
+            "W24": "https://www.ycombinator.com/companies?batch=Winter%202024",
+            "S23": "https://www.ycombinator.com/companies?batch=Summer%202023"
         }
 
         # Create tasks for each batch

From 2dbb83375933b32169df68dbbff728ab377587fa Mon Sep 17 00:00:00 2001
From: mohammadehsanansari
Date: Mon, 16 Jun 2025 13:52:57 +0530
Subject: [PATCH 6/8] hotfix

---
 scrapegraph-py/scrapegraph_py/async_client.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scrapegraph-py/scrapegraph_py/async_client.py b/scrapegraph-py/scrapegraph_py/async_client.py
index 7c7ea38..aa54e6c 100644
--- a/scrapegraph-py/scrapegraph_py/async_client.py
+++ b/scrapegraph-py/scrapegraph_py/async_client.py
@@ -196,6 +196,7 @@ async def smartscraper(
             output_schema=output_schema,
             number_of_scrolls=number_of_scrolls,
         )
+
         logger.debug("✅ Request validation passed")
 
         result = await self._make_request(

From 28cf2141c75998ac27ad59bbb4685e3d540d7aa8 Mon Sep 17 00:00:00 2001
From: mohammadehsanansari
Date: Mon, 16 Jun 2025 14:02:13 +0530
Subject: [PATCH 7/8] git log --oneline

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 2348049..c0b5069 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,4 @@
 .DS_Store
 **/.DS_Store
 *.csv
+venv/
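One more review note before the final patch: PATCH 7 ignores venv/, but the __pycache__/test_sdk.cpython-312.pyc blob committed in PATCH 5 is already tracked, so an ignore rule alone will not remove it. A follow-up along these lines (a suggestion, not part of this series) would keep bytecode out of the repository:

    # .gitignore
    __pycache__/
    *.pyc

together with git rm --cached __pycache__/test_sdk.cpython-312.pyc to untrack the file that was already committed.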
From 6272be99a663cc260772a1f50eb5c763eaa87f48 Mon Sep 17 00:00:00 2001
From: mohammadehsanansari
Date: Mon, 16 Jun 2025 14:20:45 +0530
Subject: [PATCH 8/8] removed the API_KEY

---
 .../async/async_smartscraper_infinite_scroll_example.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scrapegraph-py/examples/async/async_smartscraper_infinite_scroll_example.py b/scrapegraph-py/examples/async/async_smartscraper_infinite_scroll_example.py
index e3fd59e..a53cb53 100644
--- a/scrapegraph-py/examples/async/async_smartscraper_infinite_scroll_example.py
+++ b/scrapegraph-py/examples/async/async_smartscraper_infinite_scroll_example.py
@@ -36,7 +36,7 @@ async def scrape_companies(client: AsyncClient, url: str, batch: str) -> None:
 
 async def main():
     # Initialize async client
-    client = AsyncClient(api_key="sgai-4cf4a4f5-87f7-457a-8c58-0790ecaf323e")
+    client = AsyncClient(api_key="Your-API-Key")
 
     try:
         # Example YC batch URLs