feat: add ImpitHttpClient HTTP client using the impit library #1151

Draft · wants to merge 10 commits into master
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -60,6 +60,7 @@ all = [
    "curl-cffi>=0.9.0",
    "html5lib>=1.0",
    "inquirer>=3.3.0",
    "impit>=0.2.0",
    "jaro-winkler>=2.0.3",
    "parsel>=1.10.0",
    "playwright>=1.27.0",
@@ -76,6 +77,7 @@ adaptive-crawler = [
beautifulsoup = ["beautifulsoup4[lxml]>=4.12.0", "html5lib>=1.0"]
cli = ["cookiecutter>=2.6.0", "inquirer>=3.3.0", "rich>=13.9.0", "typer>=0.12.0"]
curl-impersonate = ["curl-cffi>=0.9.0"]
impit = ["impit>=0.2.0"]
parsel = ["parsel>=1.10.0"]
playwright = ["playwright>=1.27.0"]
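
For reference, extras declared here are installed the standard way, e.g. `pip install 'crawlee[impit]'` (or via the umbrella `all` extra). A minimal availability check, assuming only that the dependency's import name is `impit`:

```python
# A minimal availability check for the optional dependency (sketch; the
# helper name and message are illustrative, not part of this PR).
import importlib.util


def impit_available() -> bool:
    """Return True if the optional `impit` package can be imported."""
    return importlib.util.find_spec('impit') is not None


if not impit_available():
    print("impit is missing - install it with: pip install 'crawlee[impit]'")
```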

1 change: 1 addition & 0 deletions src/crawlee/_utils/blocked.py
@@ -21,6 +21,7 @@
    'ERR_PROXY_CONNECTION_FAILED',
    'ERR_TUNNEL_CONNECTION_FAILED',
    'Proxy responded with',
    'unsuccessful tunnel',
]
"""
Content of proxy errors that should trigger a retry, as the proxy is likely getting blocked / is malfunctioning.
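
The new `'unsuccessful tunnel'` marker is consumed by plain substring matching; `ImpitHttpClient._is_proxy_error` later in this diff does exactly this. A minimal sketch (the error message below is a made-up example of an impit tunneling failure):

```python
# Substring matching against ROTATE_PROXY_ERRORS (sketch; the error text is
# a hypothetical example of what a tunneling failure might look like).
from crawlee._utils.blocked import ROTATE_PROXY_ERRORS


def is_rotatable_proxy_error(error: Exception) -> bool:
    """Return True if the error message matches a known proxy-failure marker."""
    return any(needle in str(error) for needle in ROTATE_PROXY_ERRORS)


assert is_rotatable_proxy_error(RuntimeError('error sending request: unsuccessful tunnel'))
```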
4 changes: 4 additions & 0 deletions src/crawlee/http_clients/__init__.py
@@ -12,11 +12,15 @@
with _try_import(__name__, 'CurlImpersonateHttpClient'):
    from ._curl_impersonate import CurlImpersonateHttpClient

with _try_import(__name__, 'ImpitHttpClient'):
    from ._impit import ImpitHttpClient


__all__ = [
    'CurlImpersonateHttpClient',
    'HttpClient',
    'HttpCrawlingResult',
    'HttpResponse',
    'HttpxHttpClient',
    'ImpitHttpClient',
]
185 changes: 185 additions & 0 deletions src/crawlee/http_clients/_impit.py
@@ -0,0 +1,185 @@
from __future__ import annotations

from logging import getLogger
from typing import TYPE_CHECKING, Any, Optional

from impit import AsyncClient, Response
from typing_extensions import override

from crawlee._types import HttpHeaders
from crawlee._utils.blocked import ROTATE_PROXY_ERRORS
from crawlee._utils.docs import docs_group
from crawlee.errors import ProxyError
from crawlee.http_clients import HttpClient, HttpCrawlingResult, HttpResponse

if TYPE_CHECKING:
    from crawlee import Request
    from crawlee._types import HttpMethod, HttpPayload
    from crawlee.proxy_configuration import ProxyInfo
    from crawlee.sessions import Session
    from crawlee.statistics import Statistics

logger = getLogger(__name__)


class _ImpitResponse:
    """Adapter class for `impit.Response` to conform to the `HttpResponse` protocol."""

    def __init__(self, response: Response) -> None:
        self._response = response

    @property
    def http_version(self) -> str:
        return str(self._response.http_version)

    @property
    def status_code(self) -> int:
        return int(self._response.status_code)

    @property
    def headers(self) -> HttpHeaders:
        return HttpHeaders(dict(self._response.headers))

    def read(self) -> bytes:
        return self._response.content


@docs_group('Classes')
class ImpitHttpClient(HttpClient):
    """HTTP client based on the `impit` library.

    This client uses the `impit` library to perform HTTP requests in crawlers (`BasicCrawler` subclasses)
    and to manage sessions, proxies, and error handling.

    See the `HttpClient` class for more information about HTTP clients in general.

    ### Usage

    ```python
    from crawlee.crawlers import HttpCrawler  # or any other HTTP client-based crawler
    from crawlee.http_clients import ImpitHttpClient

    http_client = ImpitHttpClient()
    crawler = HttpCrawler(http_client=http_client)
    ```
    """

    def __init__(
        self,
        *,
        persist_cookies_per_session: bool = True,
        http3: bool = True,
        verify: bool = True,
        **async_client_kwargs: Any,
    ) -> None:
        """Initialize a new instance.

        Args:
            persist_cookies_per_session: Whether to persist cookies per HTTP session.
            http3: Whether to enable HTTP/3 support.
            verify: Whether to verify SSL certificates of requested hosts.
            async_client_kwargs: Additional keyword arguments for `impit.AsyncClient`.
        """
        super().__init__(
            persist_cookies_per_session=persist_cookies_per_session,
        )
        self._http3 = http3
        self._verify = verify

        self._async_client_kwargs = async_client_kwargs

        self._client_by_proxy_url = dict[Optional[str], AsyncClient]()

    @override
    async def crawl(
        self,
        request: Request,
        *,
        session: Session | None = None,
        proxy_info: ProxyInfo | None = None,
        statistics: Statistics | None = None,
    ) -> HttpCrawlingResult:
        client = self._get_client(proxy_info.url if proxy_info else None)

        try:
            response = await client.request(
                url=request.url,
                method=request.method,
                content=request.payload,
                headers=dict(request.headers) if request.headers else None,
            )
        except RuntimeError as exc:
            if self._is_proxy_error(exc):
                raise ProxyError from exc
            raise

        if statistics:
            statistics.register_status_code(response.status_code)

        request.loaded_url = str(response.url)

        return HttpCrawlingResult(
            http_response=_ImpitResponse(response),
        )

    @override
    async def send_request(
        self,
        url: str,
        *,
        method: HttpMethod = 'GET',
        headers: HttpHeaders | dict[str, str] | None = None,
        payload: HttpPayload | None = None,
        session: Session | None = None,
        proxy_info: ProxyInfo | None = None,
    ) -> HttpResponse:
        if isinstance(headers, dict) or headers is None:
            headers = HttpHeaders(headers or {})

        client = self._get_client(proxy_info.url if proxy_info else None)

        try:
            response = await client.request(
                url=url,
                method=method,
                headers=dict(headers) if headers else None,
                content=payload,
            )
        except RuntimeError as exc:
            if self._is_proxy_error(exc):
                raise ProxyError from exc
            raise

        return _ImpitResponse(response)

    def _get_client(self, proxy_url: str | None) -> AsyncClient:
        """Retrieve or create an HTTP client for the given proxy URL.

        If a client for the specified proxy URL does not exist, create and store a new one.
        """
        if proxy_url not in self._client_by_proxy_url:
            # Prepare the default kwargs for the new client.
            kwargs: dict[str, Any] = {
                'proxy': proxy_url,
                'http3': self._http3,
                'verify': self._verify,
                'follow_redirects': True,
                'browser': 'firefox',
            }

            # Update the default kwargs with any additional user-provided kwargs.
            kwargs.update(self._async_client_kwargs)

            client = AsyncClient(**kwargs)
            self._client_by_proxy_url[proxy_url] = client

        return self._client_by_proxy_url[proxy_url]

    @staticmethod
    def _is_proxy_error(error: RuntimeError) -> bool:
        """Determine whether the given error is related to a proxy issue.

        Check if the error message contains known proxy-related error keywords.
        """
        return any(needle in str(error) for needle in ROTATE_PROXY_ERRORS)
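
Since `_get_client` merges user-supplied `async_client_kwargs` over its defaults, the `'firefox'` browser default (or `follow_redirects`) can be overridden per client. A usage sketch, assuming `'chrome'` is a browser impersonation target accepted by `impit.AsyncClient`:

```python
import asyncio

from crawlee.http_clients import ImpitHttpClient


async def main() -> None:
    # http3/verify map to constructor parameters; `browser` is forwarded to
    # impit.AsyncClient via **async_client_kwargs and overrides the default.
    client = ImpitHttpClient(http3=False, browser='chrome')
    response = await client.send_request('https://example.com/')
    print(response.status_code, response.http_version)


asyncio.run(main())
```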
5 changes: 4 additions & 1 deletion tests/unit/conftest.py
@@ -15,7 +15,7 @@
from crawlee import service_locator
from crawlee.configuration import Configuration
from crawlee.fingerprint_suite._browserforge_adapter import get_available_header_network
from crawlee.http_clients import CurlImpersonateHttpClient, HttpxHttpClient
from crawlee.http_clients import CurlImpersonateHttpClient, HttpxHttpClient, ImpitHttpClient
from crawlee.proxy_configuration import ProxyInfo
from crawlee.storage_clients import MemoryStorageClient
from crawlee.storages import KeyValueStore, _creation_management
@@ -206,9 +206,12 @@ def redirect_server_url(redirect_http_server: TestServer) -> URL:
    params=[
        pytest.param('curl', id='curl'),
        pytest.param('httpx', id='httpx'),
        pytest.param('impit', id='impit'),
    ]
)
async def http_client(request: pytest.FixtureRequest) -> HttpClient:
    if request.param == 'curl':
        return CurlImpersonateHttpClient(http_version=CurlHttpVersion.V1_1)
    if request.param == 'impit':
        return ImpitHttpClient(http3=False)
    return HttpxHttpClient(http2=False)
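
With the new `impit` param, every test consuming this shared `http_client` fixture also runs against `ImpitHttpClient`; `pytest -k impit` selects just those runs. A hypothetical consumer test (relies on this suite's `server_url` fixture):

```python
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from yarl import URL

    from crawlee.http_clients import HttpClient


async def test_get_returns_ok(http_client: HttpClient, server_url: URL) -> None:
    # Runs once per fixture param: curl, httpx, and now impit.
    response = await http_client.send_request(str(server_url))
    assert response.status_code == 200
```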
109 changes: 109 additions & 0 deletions tests/unit/http_clients/test_impit.py
@@ -0,0 +1,109 @@
from __future__ import annotations

import os
from typing import TYPE_CHECKING

import pytest

from crawlee import Request
from crawlee.errors import ProxyError
from crawlee.http_clients import ImpitHttpClient
from crawlee.statistics import Statistics

if TYPE_CHECKING:
    from yarl import URL

    from crawlee.proxy_configuration import ProxyInfo


@pytest.fixture
def http_client() -> ImpitHttpClient:
    return ImpitHttpClient()


async def test_http_1(server_url: URL) -> None:
    http_client = ImpitHttpClient()
    response = await http_client.send_request(str(server_url))
    assert response.http_version == 'HTTP/1.1'


async def test_http_2() -> None:
    http_client = ImpitHttpClient()
    response = await http_client.send_request('https://apify.com/')
    assert response.http_version == 'HTTP/2'


@pytest.mark.skipif(os.name == 'nt', reason='Skipped on Windows')
async def test_proxy(
    http_client: ImpitHttpClient,
    proxy: ProxyInfo,
    server_url: URL,
) -> None:
    url = str(server_url / 'status/222')
    request = Request.from_url(url)

    async with Statistics.with_default_state() as statistics:
        result = await http_client.crawl(request, proxy_info=proxy, statistics=statistics)

    assert result.http_response.status_code == 222


@pytest.mark.skipif(os.name == 'nt', reason='Skipped on Windows')
async def test_proxy_disabled(
    http_client: ImpitHttpClient,
    disabled_proxy: ProxyInfo,
) -> None:
    url = 'https://apify.com/'
    request = Request.from_url(url)

    with pytest.raises(ProxyError):
        async with Statistics.with_default_state() as statistics:
            await http_client.crawl(request, proxy_info=disabled_proxy, statistics=statistics)


@pytest.mark.skipif(os.name == 'nt', reason='Skipped on Windows')
async def test_send_request_with_proxy(
    http_client: ImpitHttpClient,
    proxy: ProxyInfo,
    server_url: URL,
) -> None:
    url = str(server_url / 'status/222')

    response = await http_client.send_request(url, proxy_info=proxy)
    assert response.status_code == 222  # 222 - authentication successful


@pytest.mark.skipif(os.name == 'nt', reason='Skipped on Windows')
async def test_send_request_with_proxy_disabled(
    http_client: ImpitHttpClient,
    disabled_proxy: ProxyInfo,
) -> None:
    url = 'https://apify.com/'

    with pytest.raises(ProxyError):
        await http_client.send_request(url, proxy_info=disabled_proxy)


async def test_crawl_follow_redirects_by_default(http_client: ImpitHttpClient, server_url: URL) -> None:
    target_url = str(server_url / 'status/200')
    redirect_url = str((server_url / 'redirect').update_query(url=target_url))
    request = Request.from_url(redirect_url)

    crawling_result = await http_client.crawl(request)

    assert crawling_result.http_response.status_code == 200
    assert request.loaded_url == target_url


async def test_crawl_follow_redirects_false(server_url: URL) -> None:
    http_client = ImpitHttpClient(follow_redirects=False)

    target_url = str(server_url / 'status/200')
    redirect_url = str((server_url / 'redirect').update_query(url=target_url))
    request = Request.from_url(redirect_url)

    crawling_result = await http_client.crawl(request)

    assert crawling_result.http_response.status_code == 302
    assert crawling_result.http_response.headers['Location'] == target_url
    assert request.loaded_url == redirect_url