feat: add ImpitHttpClient HTTP client using the impit library #1151

Draft · wants to merge 10 commits into master
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -60,6 +60,7 @@ all = [
    "curl-cffi>=0.9.0",
    "html5lib>=1.0",
    "inquirer>=3.3.0",
    "impit>=0.2.0",
    "jaro-winkler>=2.0.3",
    "parsel>=1.10.0",
    "playwright>=1.27.0",
@@ -76,6 +77,7 @@ adaptive-crawler = [
beautifulsoup = ["beautifulsoup4[lxml]>=4.12.0", "html5lib>=1.0"]
cli = ["cookiecutter>=2.6.0", "inquirer>=3.3.0", "rich>=13.9.0", "typer>=0.12.0"]
curl-impersonate = ["curl-cffi>=0.9.0"]
impit = ["impit>=0.2.0"]
parsel = ["parsel>=1.10.0"]
playwright = ["playwright>=1.27.0"]
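
For reference, extras declared here are installed the standard way, e.g. `pip install 'crawlee[impit]'` (or via the umbrella `all` extra). A minimal availability check, assuming only that the dependency's import name is `impit`:

```python
# A minimal availability check for the optional dependency (sketch; the
# helper name and message are illustrative, not part of this PR).
import importlib.util


def impit_available() -> bool:
    """Return True if the optional `impit` package can be imported."""
    return importlib.util.find_spec('impit') is not None


if not impit_available():
    print("impit is missing - install it with: pip install 'crawlee[impit]'")
```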

1 change: 1 addition & 0 deletions src/crawlee/_utils/blocked.py
@@ -21,6 +21,7 @@
    'ERR_PROXY_CONNECTION_FAILED',
    'ERR_TUNNEL_CONNECTION_FAILED',
    'Proxy responded with',
    'unsuccessful tunnel',
]
"""
Content of proxy errors that should trigger a retry, as the proxy is likely getting blocked / is malfunctioning.
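
The new `'unsuccessful tunnel'` marker is consumed by plain substring matching; `ImpitHttpClient._is_proxy_error` later in this diff does exactly this. A minimal sketch (the error message below is a made-up example of an impit tunneling failure):

```python
# Substring matching against ROTATE_PROXY_ERRORS (sketch; the error text is
# a hypothetical example of what a tunneling failure might look like).
from crawlee._utils.blocked import ROTATE_PROXY_ERRORS


def is_rotatable_proxy_error(error: Exception) -> bool:
    """Return True if the error message matches a known proxy-failure marker."""
    return any(needle in str(error) for needle in ROTATE_PROXY_ERRORS)


assert is_rotatable_proxy_error(RuntimeError('error sending request: unsuccessful tunnel'))
```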
4 changes: 4 additions & 0 deletions src/crawlee/http_clients/__init__.py
@@ -12,11 +12,15 @@
with _try_import(__name__, 'CurlImpersonateHttpClient'):
    from ._curl_impersonate import CurlImpersonateHttpClient

with _try_import(__name__, 'ImpitHttpClient'):
    from ._impit import ImpitHttpClient


__all__ = [
    'CurlImpersonateHttpClient',
    'HttpClient',
    'HttpCrawlingResult',
    'HttpResponse',
    'HttpxHttpClient',
    'ImpitHttpClient',
]
185 changes: 185 additions & 0 deletions src/crawlee/http_clients/_impit.py
@@ -0,0 +1,185 @@
from __future__ import annotations

from logging import getLogger
from typing import TYPE_CHECKING, Any, Optional

from impit import AsyncClient, Response
from typing_extensions import override

from crawlee._types import HttpHeaders
from crawlee._utils.blocked import ROTATE_PROXY_ERRORS
from crawlee._utils.docs import docs_group
from crawlee.errors import ProxyError
from crawlee.http_clients import HttpClient, HttpCrawlingResult, HttpResponse

if TYPE_CHECKING:
    from crawlee import Request
    from crawlee._types import HttpMethod, HttpPayload
    from crawlee.proxy_configuration import ProxyInfo
    from crawlee.sessions import Session
    from crawlee.statistics import Statistics

logger = getLogger(__name__)


class _ImpitResponse:
    """Adapter class for `impit.Response` to conform to the `HttpResponse` protocol."""

    def __init__(self, response: Response) -> None:
        self._response = response

    @property
    def http_version(self) -> str:
        return str(self._response.http_version)

    @property
    def status_code(self) -> int:
        return int(self._response.status_code)

    @property
    def headers(self) -> HttpHeaders:
        return HttpHeaders(dict(self._response.headers))

    def read(self) -> bytes:
        return self._response.content


@docs_group('Classes')
class ImpitHttpClient(HttpClient):
    """HTTP client based on the `impit` library.

    This client uses the `impit` library to perform HTTP requests in crawlers (`BasicCrawler` subclasses)
    and to manage sessions, proxies, and error handling.

    See the `HttpClient` class for more information about HTTP clients in general.

    ### Usage

    ```python
    from crawlee.crawlers import HttpCrawler  # or any other HTTP client-based crawler
    from crawlee.http_clients import ImpitHttpClient

    http_client = ImpitHttpClient()
    crawler = HttpCrawler(http_client=http_client)
    ```
    """

    def __init__(
        self,
        *,
        persist_cookies_per_session: bool = True,
        http3: bool = True,
        verify: bool = True,
        **async_client_kwargs: Any,
    ) -> None:
        """Initialize a new instance.

        Args:
            persist_cookies_per_session: Whether to persist cookies per HTTP session.
            http3: Whether to enable HTTP/3 support.
            verify: Whether to verify SSL certificates of requested hosts.
            async_client_kwargs: Additional keyword arguments for `impit.AsyncClient`.
        """
        super().__init__(
            persist_cookies_per_session=persist_cookies_per_session,
        )
        self._http3 = http3
        self._verify = verify

        self._async_client_kwargs = async_client_kwargs

        self._client_by_proxy_url = dict[Optional[str], AsyncClient]()

    @override
    async def crawl(
        self,
        request: Request,
        *,
        session: Session | None = None,
        proxy_info: ProxyInfo | None = None,
        statistics: Statistics | None = None,
    ) -> HttpCrawlingResult:
        client = self._get_client(proxy_info.url if proxy_info else None)

        try:
            response = await client.request(
                url=request.url,
                method=request.method,
                content=request.payload,
                headers=dict(request.headers) if request.headers else None,
            )
        except RuntimeError as exc:
            if self._is_proxy_error(exc):
                raise ProxyError from exc
            raise

        if statistics:
            statistics.register_status_code(response.status_code)

        request.loaded_url = str(response.url)

        return HttpCrawlingResult(
            http_response=_ImpitResponse(response),
        )

    @override
    async def send_request(
        self,
        url: str,
        *,
        method: HttpMethod = 'GET',
        headers: HttpHeaders | dict[str, str] | None = None,
        payload: HttpPayload | None = None,
        session: Session | None = None,
        proxy_info: ProxyInfo | None = None,
    ) -> HttpResponse:
        if isinstance(headers, dict) or headers is None:
            headers = HttpHeaders(headers or {})

        client = self._get_client(proxy_info.url if proxy_info else None)

        try:
            response = await client.request(
                url=url,
                method=method,
                headers=dict(headers) if headers else None,
                content=payload,
            )
        except RuntimeError as exc:
            if self._is_proxy_error(exc):
                raise ProxyError from exc
            raise

        return _ImpitResponse(response)

    def _get_client(self, proxy_url: str | None) -> AsyncClient:
        """Retrieve or create an HTTP client for the given proxy URL.

        If a client for the specified proxy URL does not exist, create and store a new one.
        """
        if proxy_url not in self._client_by_proxy_url:
            # Prepare the default kwargs for the new client.
            kwargs: dict[str, Any] = {
                'proxy': proxy_url,
                'http3': self._http3,
                'verify': self._verify,
                'follow_redirects': True,
                'browser': 'firefox',
            }

            # Update the default kwargs with any additional user-provided kwargs.
            kwargs.update(self._async_client_kwargs)

            client = AsyncClient(**kwargs)
            self._client_by_proxy_url[proxy_url] = client

        return self._client_by_proxy_url[proxy_url]

    @staticmethod
    def _is_proxy_error(error: RuntimeError) -> bool:
        """Determine whether the given error is related to a proxy issue.

        Check if the error message contains known proxy-related error keywords.
        """
        return any(needle in str(error) for needle in ROTATE_PROXY_ERRORS)
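
Since `_get_client` merges user-supplied `async_client_kwargs` over its defaults, the `'firefox'` browser default (or `follow_redirects`) can be overridden per client. A usage sketch, assuming `'chrome'` is a browser impersonation target accepted by `impit.AsyncClient`:

```python
import asyncio

from crawlee.http_clients import ImpitHttpClient


async def main() -> None:
    # http3/verify map to constructor parameters; `browser` is forwarded to
    # impit.AsyncClient via **async_client_kwargs and overrides the default.
    client = ImpitHttpClient(http3=False, browser='chrome')
    response = await client.send_request('https://example.com/')
    print(response.status_code, response.http_version)


asyncio.run(main())
```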
5 changes: 4 additions & 1 deletion tests/unit/conftest.py
@@ -15,7 +15,7 @@
from crawlee import service_locator
from crawlee.configuration import Configuration
from crawlee.fingerprint_suite._browserforge_adapter import get_available_header_network
from crawlee.http_clients import CurlImpersonateHttpClient, HttpxHttpClient
from crawlee.http_clients import CurlImpersonateHttpClient, HttpxHttpClient, ImpitHttpClient
from crawlee.proxy_configuration import ProxyInfo
from crawlee.storage_clients import MemoryStorageClient
from crawlee.storages import KeyValueStore, _creation_management
@@ -206,9 +206,12 @@ def redirect_server_url(redirect_http_server: TestServer) -> URL:
    params=[
        pytest.param('curl', id='curl'),
        pytest.param('httpx', id='httpx'),
        pytest.param('impit', id='impit'),
    ]
)
async def http_client(request: pytest.FixtureRequest) -> HttpClient:
    if request.param == 'curl':
        return CurlImpersonateHttpClient(http_version=CurlHttpVersion.V1_1)
    if request.param == 'impit':
        return ImpitHttpClient(http3=False)
    return HttpxHttpClient(http2=False)
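
With the new `impit` param, every test consuming this shared `http_client` fixture also runs against `ImpitHttpClient`; `pytest -k impit` selects just those runs. A hypothetical consumer test (relies on this suite's `server_url` fixture):

```python
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from yarl import URL

    from crawlee.http_clients import HttpClient


async def test_get_returns_ok(http_client: HttpClient, server_url: URL) -> None:
    # Runs once per fixture param: curl, httpx, and now impit.
    response = await http_client.send_request(str(server_url))
    assert response.status_code == 200
```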
109 changes: 109 additions & 0 deletions tests/unit/http_clients/test_impit.py
@@ -0,0 +1,109 @@
from __future__ import annotations

import os
from typing import TYPE_CHECKING

import pytest

from crawlee import Request
from crawlee.errors import ProxyError
from crawlee.http_clients import ImpitHttpClient
from crawlee.statistics import Statistics

if TYPE_CHECKING:
    from yarl import URL

    from crawlee.proxy_configuration import ProxyInfo


@pytest.fixture
def http_client() -> ImpitHttpClient:
    return ImpitHttpClient()


async def test_http_1(server_url: URL) -> None:
    http_client = ImpitHttpClient()
    response = await http_client.send_request(str(server_url))
    assert response.http_version == 'HTTP/1.1'


async def test_http_2() -> None:
    http_client = ImpitHttpClient()
    response = await http_client.send_request('https://apify.com/')
    assert response.http_version == 'HTTP/2'


@pytest.mark.skipif(os.name == 'nt', reason='Skipped on Windows')
async def test_proxy(
    http_client: ImpitHttpClient,
    proxy: ProxyInfo,
    server_url: URL,
) -> None:
    url = str(server_url / 'status/222')
    request = Request.from_url(url)

    async with Statistics.with_default_state() as statistics:
        result = await http_client.crawl(request, proxy_info=proxy, statistics=statistics)

    assert result.http_response.status_code == 222


@pytest.mark.skipif(os.name == 'nt', reason='Skipped on Windows')
async def test_proxy_disabled(
    http_client: ImpitHttpClient,
    disabled_proxy: ProxyInfo,
) -> None:
    url = 'https://apify.com/'
    request = Request.from_url(url)

    with pytest.raises(ProxyError):
        async with Statistics.with_default_state() as statistics:
            await http_client.crawl(request, proxy_info=disabled_proxy, statistics=statistics)


@pytest.mark.skipif(os.name == 'nt', reason='Skipped on Windows')
async def test_send_request_with_proxy(
    http_client: ImpitHttpClient,
    proxy: ProxyInfo,
    server_url: URL,
) -> None:
    url = str(server_url / 'status/222')

    response = await http_client.send_request(url, proxy_info=proxy)
    assert response.status_code == 222  # 222 - authentication successful


@pytest.mark.skipif(os.name == 'nt', reason='Skipped on Windows')
async def test_send_request_with_proxy_disabled(
    http_client: ImpitHttpClient,
    disabled_proxy: ProxyInfo,
) -> None:
    url = 'https://apify.com/'

    with pytest.raises(ProxyError):
        await http_client.send_request(url, proxy_info=disabled_proxy)


async def test_crawl_follow_redirects_by_default(http_client: ImpitHttpClient, server_url: URL) -> None:
    target_url = str(server_url / 'status/200')
    redirect_url = str((server_url / 'redirect').update_query(url=target_url))
    request = Request.from_url(redirect_url)

    crawling_result = await http_client.crawl(request)

    assert crawling_result.http_response.status_code == 200
    assert request.loaded_url == target_url


async def test_crawl_follow_redirects_false(server_url: URL) -> None:
    http_client = ImpitHttpClient(follow_redirects=False)

    target_url = str(server_url / 'status/200')
    redirect_url = str((server_url / 'redirect').update_query(url=target_url))
    request = Request.from_url(redirect_url)

    crawling_result = await http_client.crawl(request)

    assert crawling_result.http_response.status_code == 302
    assert crawling_result.http_response.headers['Location'] == target_url
    assert request.loaded_url == redirect_url