Fix for remote driver functionality by fixing desired_capabilities #134

Open
wants to merge 5 commits into base: develop
18 changes: 16 additions & 2 deletions README.md
@@ -12,6 +12,7 @@ You will also need one of the Selenium [compatible browsers](http://www.selenium

## Configuration
1. Add the browser to use, the path to the driver executable, and the arguments to pass to the executable to the scrapy settings:

```python
from shutil import which

@@ -20,37 +21,47 @@ You will also need one of the Selenium [compatible browsers](http://www.selenium
SELENIUM_DRIVER_ARGUMENTS=['-headless'] # '--headless' if using chrome instead of firefox
```
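The hunk above hides the middle of this settings block; a minimal sketch of the full local-driver configuration, assuming Firefox with geckodriver (the setting names match those read in `from_crawler` further down):

```python
from shutil import which

SELENIUM_DRIVER_NAME = 'firefox'
SELENIUM_DRIVER_EXECUTABLE_PATH = which('geckodriver')
SELENIUM_DRIVER_ARGUMENTS = ['-headless']  # '--headless' if using chrome instead of firefox
```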

Optionally, set the path to the browser executable:
- Optionally, set the path to the browser executable:

```python
SELENIUM_BROWSER_EXECUTABLE_PATH = which('firefox')
```

In order to use a remote Selenium driver, specify `SELENIUM_COMMAND_EXECUTOR` instead of `SELENIUM_DRIVER_EXECUTABLE_PATH`:
- In order to use a **remote Selenium driver**, specify `SELENIUM_COMMAND_EXECUTOR` instead of `SELENIUM_DRIVER_EXECUTABLE_PATH`:

Important: keep `SELENIUM_DRIVER_NAME` and `SELENIUM_DRIVER_ARGUMENTS` set as well; the driver options built from them are still passed to the remote driver.

```python
SELENIUM_COMMAND_EXECUTOR = 'http://localhost:4444/wd/hub'
```
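Putting the remote pieces together, a minimal sketch of a settings module for a remote driver (the Grid URL is just the conventional local default):

```python
SELENIUM_DRIVER_NAME = 'firefox'
SELENIUM_DRIVER_ARGUMENTS = ['-headless']
SELENIUM_COMMAND_EXECUTOR = 'http://localhost:4444/wd/hub'
# SELENIUM_DRIVER_EXECUTABLE_PATH is not needed in the remote case;
# the options built from the name and arguments are passed to webdriver.Remote.
```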

2. Add the `SeleniumMiddleware` to the downloader middlewares:

```python
DOWNLOADER_MIDDLEWARES = {
'scrapy_selenium.SeleniumMiddleware': 800
}
```
## Usage
Use `scrapy_selenium.SeleniumRequest` instead of the built-in Scrapy `Request`, as below:

```python
from scrapy_selenium import SeleniumRequest

yield SeleniumRequest(url=url, callback=self.parse_result)
```

The request will be handled by selenium, and the request will have an additional `meta` key named `driver`, containing the selenium driver that processed it.

```python
def parse_result(self, response):
print(response.request.meta['driver'].title)
```

For more information about the available driver methods and attributes, refer to the [selenium python documentation](http://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.remote.webdriver)
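As an illustration (not part of the upstream README), the driver exposed in `meta` can be used to interact further with the page; a minimal sketch using standard WebDriver calls, with a hypothetical button selector:

```python
from selenium.webdriver.common.by import By

def parse_result(self, response):
    driver = response.request.meta['driver']
    # Click a (hypothetical) "load more" button, then read the updated page source.
    driver.find_element(By.CSS_SELECTOR, 'button.load-more').click()
    print(driver.page_source[:200])
```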

The `selector` response attribute works as usual (but contains the HTML processed by the selenium driver).

```python
def parse_result(self, response):
    print(response.selector.xpath('//title/text()'))
@@ -62,6 +73,7 @@ The `scrapy_selenium.SeleniumRequest` accepts 4 additional arguments:
#### `wait_time` / `wait_until`

When used, selenium will perform an [Explicit wait](http://selenium-python.readthedocs.io/waits.html#explicit-waits) before returning the response to the spider.

```python
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
@@ -76,6 +88,7 @@ yield SeleniumRequest(

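The diff cuts the example off at the hunk boundary; a minimal sketch of the remaining request, assuming a hypothetical element id and the `By`/`EC` imports shown above:

```python
yield SeleniumRequest(
    url=url,
    callback=self.parse_result,
    wait_time=10,
    wait_until=EC.element_to_be_clickable((By.ID, 'someid')),
)
```
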
#### `screenshot`
When used, selenium will take a screenshot of the page, and the binary data of the captured .png will be added to the response `meta`:

```python
yield SeleniumRequest(
url=url,
@@ -90,6 +103,7 @@ def parse_result(self, response):

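The example is again truncated by the hunk; a minimal sketch of a screenshot request and a callback writing the PNG bytes from `response.meta` (the output filename is hypothetical):

```python
yield SeleniumRequest(
    url=url,
    callback=self.parse_result,
    screenshot=True,
)

def parse_result(self, response):
    with open('screenshot.png', 'wb') as image_file:
        image_file.write(response.meta['screenshot'])
```
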
#### `script`
When used, selenium will execute custom JavaScript code.

```python
yield SeleniumRequest(
url=url,
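This example is also cut off; a minimal sketch of a request with a `script` argument (the scroll snippet is just an illustrative piece of JavaScript):

```python
yield SeleniumRequest(
    url=url,
    callback=self.parse_result,
    script='window.scrollTo(0, document.body.scrollHeight);',
)
```
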
1 change: 1 addition & 0 deletions requirements/requirements.txt
@@ -1,2 +1,3 @@
scrapy>=1.0.0
selenium>=3.9.0
webdriver_manager>=4.0.1
226 changes: 116 additions & 110 deletions scrapy_selenium/middlewares.py
@@ -1,140 +1,146 @@
"""This module contains the ``SeleniumMiddleware`` scrapy middleware"""

from importlib import import_module

from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import WebDriverException
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service as ChromeService
from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.http import HtmlResponse
from selenium.webdriver.support.ui import WebDriverWait
from .http import SeleniumRequest # Assuming SeleniumRequest is in http.py in the same folder
import time

import logging

from .http import SeleniumRequest
selenium_logger = logging.getLogger("selenium.webdriver.remote.remote_connection")
selenium_logger.setLevel(logging.INFO)


class SeleniumMiddleware:
"""Scrapy middleware handling the requests using selenium"""

def __init__(self, driver_name, driver_executable_path,
browser_executable_path, command_executor, driver_arguments):
"""Initialize the selenium webdriver

Parameters
----------
driver_name: str
The selenium ``WebDriver`` to use
driver_executable_path: str
The path of the executable binary of the driver
driver_arguments: list
A list of arguments to initialize the driver
browser_executable_path: str
The path of the executable binary of the browser
command_executor: str
Selenium remote server endpoint
"""

webdriver_base_path = f'selenium.webdriver.{driver_name}'

driver_klass_module = import_module(f'{webdriver_base_path}.webdriver')
driver_klass = getattr(driver_klass_module, 'WebDriver')

driver_options_module = import_module(f'{webdriver_base_path}.options')
driver_options_klass = getattr(driver_options_module, 'Options')

driver_options = driver_options_klass()

if browser_executable_path:
driver_options.binary_location = browser_executable_path
for argument in driver_arguments:
driver_options.add_argument(argument)

driver_kwargs = {
'executable_path': driver_executable_path,
f'{driver_name}_options': driver_options
}

# locally installed driver
if driver_executable_path is not None:
driver_kwargs = {
'executable_path': driver_executable_path,
f'{driver_name}_options': driver_options
}
self.driver = driver_klass(**driver_kwargs)
# remote driver
elif command_executor is not None:
from selenium import webdriver
capabilities = driver_options.to_capabilities()
self.driver = webdriver.Remote(command_executor=command_executor,
desired_capabilities=capabilities)
def __init__(
self,
driver_name,
driver_executable_path,
browser_executable_path,
command_executor,
driver_arguments,
):
self.driver_name = driver_name
self.driver_executable_path = driver_executable_path
self.browser_executable_path = browser_executable_path
self.command_executor = command_executor
self.driver_arguments = driver_arguments
self._initialize_driver()
self.retry_count = 0

def _initialize_driver(self):
retries = 3
for i in range(retries):
try:
webdriver_base_path = f"selenium.webdriver.{self.driver_name}"
driver_klass_module = import_module(f"{webdriver_base_path}.webdriver")
driver_klass = getattr(driver_klass_module, "WebDriver")
driver_options_module = import_module(f"{webdriver_base_path}.options")
driver_options_klass = getattr(driver_options_module, "Options")

driver_options = driver_options_klass()
if self.browser_executable_path:
driver_options.binary_location = self.browser_executable_path
for argument in self.driver_arguments:
driver_options.add_argument(argument)

driver_kwargs = {
"executable_path": self.driver_executable_path,
f"{self.driver_name}_options": driver_options,
}

if self.driver_executable_path is not None:
self.driver = driver_klass(**driver_kwargs)
elif self.command_executor is not None:
self.driver = webdriver.Remote(
command_executor=self.command_executor, options=driver_options
)
else:
if self.driver_name and self.driver_name.lower() == "chrome":
self.driver = webdriver.Chrome(
options=driver_options,
service=ChromeService(ChromeDriverManager().install()),
)
break
except WebDriverException:
if i < retries - 1: # not the last retry
print(
f"Encountered WebDriverException during driver initialization. Retrying... ({i+1})"
)
time.sleep(2**i) # exponential backoff
else:
print("Max retries reached. Could not initialize the driver.")
raise # re-raise the exception

@classmethod
def from_crawler(cls, crawler):
"""Initialize the middleware with the crawler settings"""

driver_name = crawler.settings.get('SELENIUM_DRIVER_NAME')
driver_executable_path = crawler.settings.get('SELENIUM_DRIVER_EXECUTABLE_PATH')
browser_executable_path = crawler.settings.get('SELENIUM_BROWSER_EXECUTABLE_PATH')
command_executor = crawler.settings.get('SELENIUM_COMMAND_EXECUTOR')
driver_arguments = crawler.settings.get('SELENIUM_DRIVER_ARGUMENTS')
driver_name = crawler.settings.get("SELENIUM_DRIVER_NAME")
driver_executable_path = crawler.settings.get("SELENIUM_DRIVER_EXECUTABLE_PATH")
browser_executable_path = crawler.settings.get("SELENIUM_BROWSER_EXECUTABLE_PATH")
command_executor = crawler.settings.get("SELENIUM_COMMAND_EXECUTOR")
driver_arguments = crawler.settings.get("SELENIUM_DRIVER_ARGUMENTS")

if driver_name is None:
raise NotConfigured('SELENIUM_DRIVER_NAME must be set')

if driver_executable_path is None and command_executor is None:
raise NotConfigured('Either SELENIUM_DRIVER_EXECUTABLE_PATH '
'or SELENIUM_COMMAND_EXECUTOR must be set')
if not driver_name:
raise NotConfigured("SELENIUM_DRIVER_NAME must be set")

middleware = cls(
driver_name=driver_name,
driver_executable_path=driver_executable_path,
browser_executable_path=browser_executable_path,
command_executor=command_executor,
driver_arguments=driver_arguments
driver_name,
driver_executable_path,
browser_executable_path,
command_executor,
driver_arguments,
)

crawler.signals.connect(middleware.spider_closed, signals.spider_closed)

return middleware

def process_request(self, request, spider):
"""Process a request using the selenium driver if applicable"""
try:
if not isinstance(request, SeleniumRequest):
return None

if not isinstance(request, SeleniumRequest):
return None
# OPENING WEBSITE
self.driver.get(request.url)

self.driver.get(request.url)
for cookie_name, cookie_value in request.cookies.items():
self.driver.add_cookie({"name": cookie_name, "value": cookie_value})

for cookie_name, cookie_value in request.cookies.items():
self.driver.add_cookie(
{
'name': cookie_name,
'value': cookie_value
}
)

if request.wait_until:
WebDriverWait(self.driver, request.wait_time).until(
request.wait_until
)

if request.screenshot:
request.meta['screenshot'] = self.driver.get_screenshot_as_png()
if request.wait_until:
WebDriverWait(self.driver, request.wait_time).until(request.wait_until)

if request.script:
self.driver.execute_script(request.script)
if request.screenshot:
request.meta["screenshot"] = self.driver.get_screenshot_as_png()

body = str.encode(self.driver.page_source)
if request.script:
self.driver.execute_script(request.script)

# Expose the driver via the "meta" attribute
request.meta.update({'driver': self.driver})
body = str.encode(self.driver.page_source)
request.meta.update({"driver": self.driver})

return HtmlResponse(
self.driver.current_url,
body=body,
encoding='utf-8',
request=request
)
processed_request = HtmlResponse(
self.driver.current_url, body=body, encoding="utf-8", request=request
)
self.retry_count = 0 # Reset the retry counter if successful
return processed_request

except WebDriverException:
if self.retry_count < 3: # Maximum retry limit
print(
f"Encountered WebDriverException with {request.url}\nRetrying in {2 ** self.retry_count}s..."
)
self.retry_count += 1
time.sleep(2**self.retry_count) # Exponential backoff
request.meta["retrying"] = True
self._initialize_driver()
return request
else:
self.retry_count = 0 # Reset the retry counter
raise # Reraise the exception if maximum retries reached

def spider_closed(self):
"""Shutdown the driver when spider is closed"""

self.driver.quit()

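For reference, the core of this PR's remote-driver fix is switching `webdriver.Remote` from the deprecated `desired_capabilities` keyword (dropped in newer Selenium 4 releases) to the `options` keyword. A standalone sketch of the two call styles, assuming a Firefox options object and a Grid at the conventional local URL:

```python
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

options = Options()
options.add_argument('-headless')

# Old style (what the middleware used before this PR); rejected by newer Selenium 4 releases:
# driver = webdriver.Remote(command_executor='http://localhost:4444/wd/hub',
#                           desired_capabilities=options.to_capabilities())

# New style used by the updated middleware:
driver = webdriver.Remote(command_executor='http://localhost:4444/wd/hub',
                          options=options)
driver.get('https://example.com')
print(driver.title)
driver.quit()
```
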
2 changes: 1 addition & 1 deletion setup.cfg
@@ -1,5 +1,5 @@
[metadata]
name = scrapy-selenium
name = scrapy-selenium-modernized
version = 0.0.7
url = https://github.com/clemfromspace/scrapy-selenium
licence = DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
32 changes: 4 additions & 28 deletions setup.py
@@ -1,34 +1,10 @@
"""This module contains the packaging routine for the pybook package"""

from setuptools import setup, find_packages
try:
from pip.download import PipSession
from pip.req import parse_requirements
except ImportError:
# It is quick hack to support pip 10 that has changed its internal
# structure of the modules.
from pip._internal.download import PipSession
from pip._internal.req.req_file import parse_requirements


def get_requirements(source):
"""Get the requirements from the given ``source``

Parameters
----------
source: str
The filename containing the requirements

"""

install_reqs = parse_requirements(filename=source, session=PipSession())

return [str(ir.req) for ir in install_reqs]


setup(
packages=find_packages(),
install_requires=get_requirements('requirements/requirements.txt')
)
with open(source, "r") as f:
requirements = f.read().splitlines()
return requirements


setup(packages=find_packages(), install_requires=get_requirements("requirements/requirements.txt"))
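
Pieced together, the new setup.py reads roughly as the sketch below (reassembled from the hunks above, not copied verbatim from the branch):

```python
"""This module contains the packaging routine for the scrapy-selenium package"""

from setuptools import setup, find_packages


def get_requirements(source):
    """Read the requirement strings from the given ``source`` file."""
    with open(source, "r") as f:
        requirements = f.read().splitlines()
    return requirements


setup(packages=find_packages(), install_requires=get_requirements("requirements/requirements.txt"))
```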