diff --git a/README.md b/README.md index b642de9..42b84fb 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,7 @@ You will also need one of the Selenium [compatible browsers](http://www.selenium ## Configuration 1. Add the browser to use, the path to the driver executable, and the arguments to pass to the executable to the scrapy settings: + ```python from shutil import which @@ -20,17 +21,22 @@ You will also need one of the Selenium [compatible browsers](http://www.selenium SELENIUM_DRIVER_ARGUMENTS=['-headless'] # '--headless' if using chrome instead of firefox ``` -Optionally, set the path to the browser executable: +- Optionally, set the path to the browser executable: + ```python SELENIUM_BROWSER_EXECUTABLE_PATH = which('firefox') ``` -In order to use a remote Selenium driver, specify `SELENIUM_COMMAND_EXECUTOR` instead of `SELENIUM_DRIVER_EXECUTABLE_PATH`: +- In order to use a **remote Selenium driver**, specify `SELENIUM_COMMAND_EXECUTOR` instead of `SELENIUM_DRIVER_EXECUTABLE_PATH`: + + Important: `SELENIUM_DRIVER_NAME` and `SELENIUM_DRIVER_ARGUMENTS` must still be set when using a remote driver. + ```python SELENIUM_COMMAND_EXECUTOR = 'http://localhost:4444/wd/hub' ``` 2. Add the `SeleniumMiddleware` to the downloader middlewares: + ```python DOWNLOADER_MIDDLEWARES = { 'scrapy_selenium.SeleniumMiddleware': 800 @@ -38,19 +44,24 @@ In order to use a remote Selenium driver, specify `SELENIUM_COMMAND_EXECUTOR` in ``` ## Usage Use the `scrapy_selenium.SeleniumRequest` instead of the scrapy built-in `Request` like below: + ```python from scrapy_selenium import SeleniumRequest yield SeleniumRequest(url=url, callback=self.parse_result) ``` + The request will be handled by selenium, and the request will have an additional `meta` key, named `driver` containing the selenium driver with the request processed. 
+ ```python def parse_result(self, response): print(response.request.meta['driver'].title) ``` + For more information about the available driver methods and attributes, refer to the [selenium python documentation](http://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.remote.webdriver) The `selector` response attribute work as usual (but contains the html processed by the selenium driver). + ```python def parse_result(self, response): print(response.selector.xpath('//title/@text')) @@ -62,6 +73,7 @@ The `scrapy_selenium.SeleniumRequest` accept 4 additional arguments: #### `wait_time` / `wait_until` When used, selenium will perform an [Explicit wait](http://selenium-python.readthedocs.io/waits.html#explicit-waits) before returning the response to the spider. + ```python from selenium.webdriver.common.by import By from selenium.webdriver.support import expected_conditions as EC @@ -76,6 +88,7 @@ yield SeleniumRequest( #### `screenshot` When used, selenium will take a screenshot of the page and the binary data of the .png captured will be added to the response `meta`: + ```python yield SeleniumRequest( url=url, @@ -90,6 +103,7 @@ def parse_result(self, response): #### `script` When used, selenium will execute custom JavaScript code. 
+ ```python yield SeleniumRequest( url=url, diff --git a/requirements/requirements.txt b/requirements/requirements.txt index e6e2710..927ded1 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -1,2 +1,3 @@ scrapy>=1.0.0 selenium>=3.9.0 +webdriver_manager>=4.0.1 \ No newline at end of file diff --git a/scrapy_selenium/middlewares.py b/scrapy_selenium/middlewares.py index 201db2c..7edc129 100644 --- a/scrapy_selenium/middlewares.py +++ b/scrapy_selenium/middlewares.py @@ -1,140 +1,146 @@ -"""This module contains the ``SeleniumMiddleware`` scrapy middleware""" - from importlib import import_module - +from selenium.webdriver.support.ui import WebDriverWait +from selenium.common.exceptions import WebDriverException +from selenium import webdriver +from webdriver_manager.chrome import ChromeDriverManager +from selenium.webdriver.chrome.service import Service as ChromeService from scrapy import signals from scrapy.exceptions import NotConfigured from scrapy.http import HtmlResponse -from selenium.webdriver.support.ui import WebDriverWait +from .http import SeleniumRequest # SeleniumRequest is defined in the sibling http.py module of this package +import time + +import logging -from .http import SeleniumRequest +selenium_logger = logging.getLogger("selenium.webdriver.remote.remote_connection") +selenium_logger.setLevel(logging.INFO) class SeleniumMiddleware: - """Scrapy middleware handling the requests using selenium""" - - def __init__(self, driver_name, driver_executable_path, - browser_executable_path, command_executor, driver_arguments): - """Initialize the selenium webdriver - - Parameters - ---------- - driver_name: str - The selenium ``WebDriver`` to use - driver_executable_path: str - The path of the executable binary of the driver - driver_arguments: list - A list of arguments to initialize the driver - browser_executable_path: str - The path of the executable binary of the browser - command_executor: str - Selenium remote server endpoint - """ - - 
webdriver_base_path = f'selenium.webdriver.{driver_name}' - - driver_klass_module = import_module(f'{webdriver_base_path}.webdriver') - driver_klass = getattr(driver_klass_module, 'WebDriver') - - driver_options_module = import_module(f'{webdriver_base_path}.options') - driver_options_klass = getattr(driver_options_module, 'Options') - - driver_options = driver_options_klass() - - if browser_executable_path: - driver_options.binary_location = browser_executable_path - for argument in driver_arguments: - driver_options.add_argument(argument) - - driver_kwargs = { - 'executable_path': driver_executable_path, - f'{driver_name}_options': driver_options - } - - # locally installed driver - if driver_executable_path is not None: - driver_kwargs = { - 'executable_path': driver_executable_path, - f'{driver_name}_options': driver_options - } - self.driver = driver_klass(**driver_kwargs) - # remote driver - elif command_executor is not None: - from selenium import webdriver - capabilities = driver_options.to_capabilities() - self.driver = webdriver.Remote(command_executor=command_executor, - desired_capabilities=capabilities) + def __init__( + self, + driver_name, + driver_executable_path, + browser_executable_path, + command_executor, + driver_arguments, + ): + self.driver_name = driver_name + self.driver_executable_path = driver_executable_path + self.browser_executable_path = browser_executable_path + self.command_executor = command_executor + self.driver_arguments = driver_arguments + self._initialize_driver() + self.retry_count = 0 + + def _initialize_driver(self): + retries = 3 + for i in range(retries): + try: + webdriver_base_path = f"selenium.webdriver.{self.driver_name}" + driver_klass_module = import_module(f"{webdriver_base_path}.webdriver") + driver_klass = getattr(driver_klass_module, "WebDriver") + driver_options_module = import_module(f"{webdriver_base_path}.options") + driver_options_klass = getattr(driver_options_module, "Options") + + driver_options = 
driver_options_klass() + if self.browser_executable_path: + driver_options.binary_location = self.browser_executable_path + for argument in self.driver_arguments: + driver_options.add_argument(argument) + + driver_kwargs = { + "executable_path": self.driver_executable_path, + f"{self.driver_name}_options": driver_options, + } + + if self.driver_executable_path is not None: + self.driver = driver_klass(**driver_kwargs) + elif self.command_executor is not None: + self.driver = webdriver.Remote( + command_executor=self.command_executor, options=driver_options + ) + else: + if self.driver_name and self.driver_name.lower() == "chrome": + self.driver = webdriver.Chrome( + options=driver_options, + service=ChromeService(ChromeDriverManager().install()), + ) + break + except WebDriverException: + if i < retries - 1: # not the last retry + print( + f"Encountered WebDriverException during driver initialization. Retrying... ({i+1})" + ) + time.sleep(2**i) # exponential backoff + else: + print("Max retries reached. 
Could not initialize the driver.") + raise # re-raise the exception @classmethod def from_crawler(cls, crawler): - """Initialize the middleware with the crawler settings""" - - driver_name = crawler.settings.get('SELENIUM_DRIVER_NAME') - driver_executable_path = crawler.settings.get('SELENIUM_DRIVER_EXECUTABLE_PATH') - browser_executable_path = crawler.settings.get('SELENIUM_BROWSER_EXECUTABLE_PATH') - command_executor = crawler.settings.get('SELENIUM_COMMAND_EXECUTOR') - driver_arguments = crawler.settings.get('SELENIUM_DRIVER_ARGUMENTS') + driver_name = crawler.settings.get("SELENIUM_DRIVER_NAME") + driver_executable_path = crawler.settings.get("SELENIUM_DRIVER_EXECUTABLE_PATH") + browser_executable_path = crawler.settings.get("SELENIUM_BROWSER_EXECUTABLE_PATH") + command_executor = crawler.settings.get("SELENIUM_COMMAND_EXECUTOR") + driver_arguments = crawler.settings.get("SELENIUM_DRIVER_ARGUMENTS") - if driver_name is None: - raise NotConfigured('SELENIUM_DRIVER_NAME must be set') - - if driver_executable_path is None and command_executor is None: - raise NotConfigured('Either SELENIUM_DRIVER_EXECUTABLE_PATH ' - 'or SELENIUM_COMMAND_EXECUTOR must be set') + if not driver_name: + raise NotConfigured("SELENIUM_DRIVER_NAME must be set") middleware = cls( - driver_name=driver_name, - driver_executable_path=driver_executable_path, - browser_executable_path=browser_executable_path, - command_executor=command_executor, - driver_arguments=driver_arguments + driver_name, + driver_executable_path, + browser_executable_path, + command_executor, + driver_arguments, ) crawler.signals.connect(middleware.spider_closed, signals.spider_closed) - return middleware def process_request(self, request, spider): - """Process a request using the selenium driver if applicable""" + try: + if not isinstance(request, SeleniumRequest): + return None - if not isinstance(request, SeleniumRequest): - return None + # OPENING WEBSITE + self.driver.get(request.url) - 
self.driver.get(request.url) + for cookie_name, cookie_value in request.cookies.items(): + self.driver.add_cookie({"name": cookie_name, "value": cookie_value}) - for cookie_name, cookie_value in request.cookies.items(): - self.driver.add_cookie( - { - 'name': cookie_name, - 'value': cookie_value - } - ) - - if request.wait_until: - WebDriverWait(self.driver, request.wait_time).until( - request.wait_until - ) - - if request.screenshot: - request.meta['screenshot'] = self.driver.get_screenshot_as_png() + if request.wait_until: + WebDriverWait(self.driver, request.wait_time).until(request.wait_until) - if request.script: - self.driver.execute_script(request.script) + if request.screenshot: + request.meta["screenshot"] = self.driver.get_screenshot_as_png() - body = str.encode(self.driver.page_source) + if request.script: + self.driver.execute_script(request.script) - # Expose the driver via the "meta" attribute - request.meta.update({'driver': self.driver}) + body = str.encode(self.driver.page_source) + request.meta.update({"driver": self.driver}) - return HtmlResponse( - self.driver.current_url, - body=body, - encoding='utf-8', - request=request - ) + processed_request = HtmlResponse( + self.driver.current_url, body=body, encoding="utf-8", request=request + ) + self.retry_count = 0 # Reset the retry counter if successful + return processed_request + + except WebDriverException: + if self.retry_count < 3: # Maximum retry limit + print( + f"Encountered WebDriverException with {request.url}\nRetrying in {2 ** self.retry_count}s..." 
+ ) + self.retry_count += 1 + time.sleep(2**self.retry_count) # Exponential backoff + request.meta["retrying"] = True + self._initialize_driver() + return request + else: + self.retry_count = 0 # Reset the retry counter + raise # Reraise the exception if maximum retries reached def spider_closed(self): - """Shutdown the driver when spider is closed""" - self.driver.quit() - diff --git a/setup.cfg b/setup.cfg index 2ca31e9..bb89663 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [metadata] -name = scrapy-selenium +name = scrapy-selenium-modernized version = 0.0.7 url = https://github.com/clemfromspace/scrapy-selenium licence = DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE diff --git a/setup.py b/setup.py index 16fd185..0800ce5 100644 --- a/setup.py +++ b/setup.py @@ -1,34 +1,10 @@ -"""This module contains the packaging routine for the pybook package""" - from setuptools import setup, find_packages -try: - from pip.download import PipSession - from pip.req import parse_requirements -except ImportError: - # It is quick hack to support pip 10 that has changed its internal - # structure of the modules. - from pip._internal.download import PipSession - from pip._internal.req.req_file import parse_requirements def get_requirements(source): - """Get the requirements from the given ``source`` - - Parameters - ---------- - source: str - The filename containing the requirements - - """ - - install_reqs = parse_requirements(filename=source, session=PipSession()) - - return [str(ir.req) for ir in install_reqs] - - -setup( - packages=find_packages(), - install_requires=get_requirements('requirements/requirements.txt') -) + with open(source, "r") as f: + requirements = f.read().splitlines() + return requirements +setup(packages=find_packages(), install_requires=get_requirements("requirements/requirements.txt"))