cache html page fetching by link

cosmicexplorer · cosmicexplorer · commit 8809012a836a · 2020-02-13T15:44:01.000-08:00
diff --git a/src/pip/_internal/index/collector.py b/src/pip/_internal/index/collector.py
@@ -9,6 +9,19 @@
 import os
 from collections import OrderedDict
 
+from pip._vendor import html5lib, requests
+from pip._vendor.distlib.compat import unescape
+from pip._vendor.requests.exceptions import HTTPError, RetryError, SSLError
+from pip._vendor.six.moves.urllib import parse as urllib_parse
+from pip._vendor.six.moves.urllib import request as urllib_request
+
+from pip._internal.models.link import Link
+from pip._internal.utils.filetypes import ARCHIVE_EXTENSIONS
+from pip._internal.utils.misc import redact_auth_from_url
+from pip._internal.utils.typing import MYPY_CHECK_RUNNING
+from pip._internal.utils.urls import path_to_url, url_to_path
+from pip._internal.vcs import is_url, vcs
+
 try:
     from functools import lru_cache
 except ImportError:
@@ -26,18 +39,6 @@ def wrapped(arg):
             return wrapped
         return wrapper
 
-from pip._vendor import html5lib, requests
-from pip._vendor.distlib.compat import unescape
-from pip._vendor.requests.exceptions import HTTPError, RetryError, SSLError
-from pip._vendor.six.moves.urllib import parse as urllib_parse
-from pip._vendor.six.moves.urllib import request as urllib_request
-
-from pip._internal.models.link import Link
-from pip._internal.utils.filetypes import ARCHIVE_EXTENSIONS
-from pip._internal.utils.misc import redact_auth_from_url
-from pip._internal.utils.typing import MYPY_CHECK_RUNNING
-from pip._internal.utils.urls import path_to_url, url_to_path
-from pip._internal.vcs import is_url, vcs
 
 if MYPY_CHECK_RUNNING:
     from typing import (
@@ -261,15 +262,17 @@ def _create_link_from_element(
     return link
 
 
-class CacheablePage(object):
+class CacheablePageContent(object):  # hashable key for caching a page
     def __init__(self, page):
         self.page = page
 
     def __eq__(self, other):
-        return isinstance(other, type(self)) and self.page.url == other.page.url
+        return isinstance(other, type(self)) and (  # url is part of the key
+            (self.page.url, self.page.content, self.page.encoding) ==
+            (other.page.url, other.page.content, other.page.encoding))
 
     def __hash__(self):
-        return hash(self.page.url)
+        return hash((self.page.url, self.page.content, self.page.encoding))
 
 
 def with_cached_html_pages(fn):
@@ -278,7 +281,7 @@ def wrapper(cacheable_page):
         return list(fn(cacheable_page.page))
 
     def wrapper_wrapper(page):
-        return wrapper(CacheablePage(page))
+        return wrapper(CacheablePageContent(page))
 
     return wrapper_wrapper
 
@@ -348,6 +351,15 @@ def _make_html_page(response):
     return HTMLPage(response.content, encoding=encoding, url=response.url)
 
 
+def with_cached_link_fetch(fn):
+    """Memoize ``fn`` via lru_cache, keyed on the link and session passed."""
+    def wrapper(link, session=None):
+        return fn(link, session=session)
+
+    return lru_cache(maxsize=None)(wrapper)
+
+
+@with_cached_link_fetch
 def _get_html_page(link, session=None):
     # type: (Link, Optional[PipSession]) -> Optional[HTMLPage]
     if session is None:
diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py
@@ -1,6 +1,5 @@
 import logging
 import os.path
-import uuid
 from textwrap import dedent
 
 import mock
@@ -11,6 +10,7 @@
 from pip._vendor.six.moves.urllib import request as urllib_request
 
 from pip._internal.index.collector import (
+    CacheablePageContent,
     HTMLPage,
     _clean_link,
     _determine_base_url,
@@ -270,7 +270,7 @@ def test_parse_links__yanked_reason(anchor_html, expected):
     page = HTMLPage(
         html_bytes,
         encoding=None,
-        url='https://example.com/find-links-{}'.format(uuid.uuid4()),
+        url='https://example.com/simple/',
     )
     links = list(parse_links(page))
     link, = links
@@ -287,19 +287,19 @@ def test_parse_links_caches_same_page():
     )
     html_bytes = html.encode('utf-8')
 
-    # The caching is only keyed on having the same `url`.
     page_1 = HTMLPage(
         html_bytes,
         encoding=None,
-        url='https://example.com/some-find-links-url/',
+        url='https://example.com/simple/',
     )
     page_2 = HTMLPage(
         html_bytes,
         encoding=None,
-        url='https://example.com/some-find-links-url/',
+        url='https://example.com/simple/',
     )
 
-    with mock.patch("pip._internal.index.collector.html5lib.parse") as mock_parse:
+    mock_parse = mock.patch("pip._internal.index.collector.html5lib.parse")
+    with mock_parse as mock_parse:
         mock_parse.return_value = html5lib.parse(
             page_1.content,
             transport_encoding=page_1.encoding,
@@ -308,7 +308,7 @@ def test_parse_links_caches_same_page():
         parsed_links_1 = list(parse_links(page_1))
         mock_parse.assert_called()
 
-    with mock.patch("pip._internal.index.collector.html5lib.parse") as mock_parse:
+    with mock_parse as mock_parse:
         parsed_links_2 = list(parse_links(page_2))
         assert parsed_links_2 == parsed_links_1
         mock_parse.assert_not_called()
@@ -378,6 +378,25 @@ def test_get_html_page_invalid_scheme(caplog, url, vcs_scheme):
     ]
 
 
+def test_get_html_page_caches_same_link():
+    """Fetching the same link twice only requests the page once."""
+    link = Link('https://example.com/link-1/')
+    session = mock.Mock(PipSession)
+    # Name the patcher apart from its mock so both blocks really patch.
+    patcher = mock.patch("pip._internal.index.collector._get_html_response")
+    with patcher as mock_func:
+        mock_func.return_value = make_fake_html_response(link.url)
+        page_1 = _get_html_page(link, session=session)
+        mock_func.assert_called_once()
+
+    with patcher as mock_func:
+        page_2 = _get_html_page(link, session=session)
+        # Assert that the result of the cached html page fetch will also then
+        # be cached by parse_links() and @with_cached_html_pages.
+        assert CacheablePageContent(page_1) == CacheablePageContent(page_2)
+        mock_func.assert_not_called()
+
+
 def make_fake_html_response(url):
     """
     Create a fake requests.Response object.