Commit d7318e6

add failing test
apply the fix
add template NEWS entry according to https://pip.pypa.io/en/latest/development/contributing/#news-entries (wrong PR #)
rename news entry to the current PR #
respond to review comments
fix test failures
fix tests by adding uuid salt in urls
cache html page fetching by link
make CI pass (?)
make the types much better
finally listen to the maintainer and cache parse_links() by url :)
avoid caching parse_links() when the url is an index url
cleanup
add testing for uncachable marking
only conditionally vendor _lru_cache for py2
bugfix => feature
python 2 does not cache!
Do away with type: ignore with getattr()
respond to review comments
1 parent 451f5d9 commit d7318e6

File tree

4 files changed: +154 −13 lines changed

news/7729.feature
src/pip/_internal/index/collector.py
src/pip/_internal/models/link.py
tests/unit/test_collector.py

news/7729.feature

+3

@@ -0,0 +1,3 @@
+Cache the result of parse_links() to avoid re-tokenizing a find-links page multiple times over a pip run.
+
+This change significantly improves resolve performance when --find-links points to a very large html page.
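
As a rough illustration of the behaviour this entry describes (a minimal sketch against pip's internal API as it appears in this diff; the URL is made up), a second HTMLPage with the same url is served from the cache instead of being re-tokenized:

    from pip._internal.index.collector import HTMLPage, parse_links

    html = b'<html><body><a href="/pkg-1.0.tar.gz"></a></body></html>'
    url = 'https://example.com/find-links/'  # hypothetical find-links page

    page_a = HTMLPage(html, encoding=None, url=url)
    page_b = HTMLPage(html, encoding=None, url=url)

    links_a = list(parse_links(page_a))  # tokenizes the HTML
    links_b = list(parse_links(page_b))  # on Python 3, returned from the per-url cache
    assert links_a == links_b

On Python 2 the lru_cache fallback added in collector.py below is a no-op, so the second call simply re-parses; the result is the same either way.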

src/pip/_internal/index/collector.py

+86 −10

@@ -3,6 +3,7 @@
 """
 
 import cgi
+import functools
 import itertools
 import logging
 import mimetypes
@@ -25,8 +26,8 @@
 
 if MYPY_CHECK_RUNNING:
     from typing import (
-        Callable, Iterable, List, MutableMapping, Optional, Sequence, Tuple,
-        Union,
+        Callable, Iterable, List, MutableMapping, Optional,
+        Protocol, Sequence, Tuple, TypeVar, Union,
     )
     import xml.etree.ElementTree
 
@@ -38,10 +39,31 @@
     HTMLElement = xml.etree.ElementTree.Element
     ResponseHeaders = MutableMapping[str, str]
 
+    # Used in the @lru_cache polyfill.
+    F = TypeVar('F')
+
+    class LruCache(Protocol):
+        def __call__(self, maxsize=None):
+            # type: (Optional[int]) -> Callable[[F], F]
+            raise NotImplementedError
+
 
 logger = logging.getLogger(__name__)
 
 
+# Fallback to noop_lru_cache in Python 2
+# TODO: this can be removed when python 2 support is dropped!
+def noop_lru_cache(maxsize=None):
+    # type: (Optional[int]) -> Callable[[F], F]
+    def _wrapper(f):
+        # type: (F) -> F
+        return f
+    return _wrapper
+
+
+_lru_cache = getattr(functools, "lru_cache", noop_lru_cache)  # type: LruCache
+
+
 def _match_vcs_scheme(url):
     # type: (str) -> Optional[str]
     """Look for VCS schemes in the URL.
@@ -285,6 +307,48 @@ def _create_link_from_element(
     return link
 
 
+class CacheablePageContent(object):
+    def __init__(self, page):
+        # type: (HTMLPage) -> None
+        assert page.cache_link_parsing
+        self.page = page
+
+    def __eq__(self, other):
+        # type: (object) -> bool
+        return (isinstance(other, type(self)) and
+                self.page.url == other.page.url)
+
+    def __hash__(self):
+        # type: () -> int
+        return hash(self.page.url)
+
+
+def with_cached_html_pages(
+    fn,  # type: Callable[[HTMLPage], Iterable[Link]]
+):
+    # type: (...) -> Callable[[HTMLPage], List[Link]]
+    """
+    Given a function that parses an Iterable[Link] from an HTMLPage, cache the
+    function's result (keyed by CacheablePageContent), unless the HTMLPage
+    `page` has `page.cache_link_parsing == False`.
+    """
+
+    @_lru_cache(maxsize=None)
+    def wrapper(cacheable_page):
+        # type: (CacheablePageContent) -> List[Link]
+        return list(fn(cacheable_page.page))
+
+    @functools.wraps(fn)
+    def wrapper_wrapper(page):
+        # type: (HTMLPage) -> List[Link]
+        if page.cache_link_parsing:
+            return wrapper(CacheablePageContent(page))
+        return list(fn(page))
+
+    return wrapper_wrapper
+
+
+@with_cached_html_pages
 def parse_links(page):
     # type: (HTMLPage) -> Iterable[Link]
     """
@@ -314,18 +378,23 @@ class HTMLPage(object):
 
     def __init__(
         self,
-        content,   # type: bytes
-        encoding,  # type: Optional[str]
-        url,       # type: str
+        content,                  # type: bytes
+        encoding,                 # type: Optional[str]
+        url,                      # type: str
+        cache_link_parsing=True,  # type: bool
     ):
         # type: (...) -> None
         """
         :param encoding: the encoding to decode the given content.
         :param url: the URL from which the HTML was downloaded.
+        :param cache_link_parsing: whether links parsed from this page's url
+                                   should be cached. PyPI index urls should
+                                   have this set to False, for example.
         """
         self.content = content
         self.encoding = encoding
         self.url = url
+        self.cache_link_parsing = cache_link_parsing
 
     def __str__(self):
         # type: () -> str
@@ -343,10 +412,14 @@ def _handle_get_page_fail(
     meth("Could not fetch URL %s: %s - skipping", link, reason)
 
 
-def _make_html_page(response):
-    # type: (Response) -> HTMLPage
+def _make_html_page(response, cache_link_parsing=True):
+    # type: (Response, bool) -> HTMLPage
     encoding = _get_encoding_from_headers(response.headers)
-    return HTMLPage(response.content, encoding=encoding, url=response.url)
+    return HTMLPage(
+        response.content,
+        encoding=encoding,
+        url=response.url,
+        cache_link_parsing=cache_link_parsing)
 
 
 def _get_html_page(link, session=None):
@@ -399,7 +472,8 @@ def _get_html_page(link, session=None):
     except requests.Timeout:
         _handle_get_page_fail(link, "timed out")
     else:
-        return _make_html_page(resp)
+        return _make_html_page(resp,
+                               cache_link_parsing=link.cache_link_parsing)
     return None
 
 
@@ -562,7 +636,9 @@ def collect_links(self, project_name):
         # We want to filter out anything that does not have a secure origin.
         url_locations = [
            link for link in itertools.chain(
-                (Link(url) for url in index_url_loc),
+                # Mark PyPI indices as "cache_link_parsing == False" -- this
+                # will avoid caching the result of parsing the page for links.
+                (Link(url, cache_link_parsing=False) for url in index_url_loc),
                (Link(url) for url in fl_url_loc),
            )
            if self.session.is_secure_origin(link)
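
To see the decorator in isolation, here is a hedged usage sketch (Python 3 assumed; `noisy_parse` is a hypothetical parser used only to count real invocations, not part of pip): `with_cached_html_pages` can wrap any `HTMLPage -> Iterable[Link]` function, and a second call for a page with the same url never reaches the wrapped function.

    from pip._internal.index.collector import HTMLPage, with_cached_html_pages
    from pip._internal.models.link import Link

    calls = []

    @with_cached_html_pages
    def noisy_parse(page):
        # Hypothetical parser; records each time it actually runs.
        calls.append(page.url)
        return [Link('https://example.com/pkg-1.0.tar.gz')]

    page = HTMLPage(b'<html></html>', encoding=None, url='https://example.com/fl/')

    first = noisy_parse(page)   # parses; the List[Link] is memoized by url
    second = noisy_parse(page)  # equal CacheablePageContent key, so the cached list is reused
    assert first == second
    assert len(calls) == 1  # on Python 2 the no-op fallback would have re-parsed

Constructing the page with cache_link_parsing=False bypasses the cache entirely, which is how index pages are treated below.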

src/pip/_internal/models/link.py

+8

@@ -30,6 +30,7 @@ def __init__(
         comes_from=None,  # type: Optional[Union[str, HTMLPage]]
         requires_python=None,  # type: Optional[str]
         yanked_reason=None,  # type: Optional[Text]
+        cache_link_parsing=True,  # type: bool
     ):
         # type: (...) -> None
         """
@@ -46,6 +47,11 @@ def __init__(
             a simple repository HTML link. If the file has been yanked but
             no reason was provided, this should be the empty string. See
             PEP 592 for more information and the specification.
+        :param cache_link_parsing: A flag that is used elsewhere to determine
+                                   whether resources retrieved from this link
+                                   should be cached. PyPI index urls should
+                                   generally have this set to False, for
+                                   example.
         """
 
         # url can be a UNC windows share
@@ -63,6 +69,8 @@ def __init__(
 
         super(Link, self).__init__(key=url, defining_class=Link)
 
+        self.cache_link_parsing = cache_link_parsing
+
     def __str__(self):
         # type: () -> str
         if self.requires_python:
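
The flag itself is plain data on the Link; it is only consulted by collect_links() and _get_html_page() in collector.py above. A quick sketch of how the two kinds of locations end up marked (the find-links URL is illustrative):

    from pip._internal.models.link import Link

    # --find-links locations keep the default, so their parsed links are memoized.
    find_links = Link('https://example.com/wheels/')
    assert find_links.cache_link_parsing

    # Index locations are constructed with the flag off by collect_links(),
    # so parse_links() results for their pages are never cached.
    index = Link('https://pypi.org/simple/twine/', cache_link_parsing=False)
    assert not index.cache_link_parsing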

tests/unit/test_collector.py

+57 −3

@@ -1,5 +1,7 @@
 import logging
 import os.path
+import re
+import uuid
 from textwrap import dedent
 
 import mock
@@ -26,7 +28,7 @@
 from pip._internal.models.index import PyPI
 from pip._internal.models.link import Link
 from pip._internal.network.session import PipSession
-from tests.lib import make_test_link_collector
+from tests.lib import make_test_link_collector, skip_if_python2
 
 
 @pytest.mark.parametrize(
@@ -355,14 +357,61 @@ def test_parse_links__yanked_reason(anchor_html, expected):
     page = HTMLPage(
         html_bytes,
         encoding=None,
-        url='https://example.com/simple/',
+        # parse_links() is cached by url, so we inject a random uuid to ensure
+        # the page content isn't cached.
+        url='https://example.com/simple-{}/'.format(uuid.uuid4()),
     )
     links = list(parse_links(page))
     link, = links
     actual = link.yanked_reason
     assert actual == expected
 
 
+@skip_if_python2
+def test_parse_links_caches_same_page_by_url():
+    html = (
+        '<html><head><meta charset="utf-8"><head>'
+        '<body><a href="/pkg1-1.0.tar.gz"></a></body></html>'
+    )
+    html_bytes = html.encode('utf-8')
+
+    url = 'https://example.com/simple/'
+
+    page_1 = HTMLPage(
+        html_bytes,
+        encoding=None,
+        url=url,
+    )
+    # Make a second page with zero content, to ensure that it's not accessed,
+    # because the page was cached by url.
+    page_2 = HTMLPage(
+        b'',
+        encoding=None,
+        url=url,
+    )
+    # Make a third page which represents an index url, which should not be
+    # cached, even for the same url. We modify the page content slightly to
+    # verify that the result is not cached.
+    page_3 = HTMLPage(
+        re.sub(b'pkg1', b'pkg2', html_bytes),
+        encoding=None,
+        url=url,
+        cache_link_parsing=False,
+    )
+
+    parsed_links_1 = list(parse_links(page_1))
+    assert len(parsed_links_1) == 1
+    assert 'pkg1' in parsed_links_1[0].url
+
+    parsed_links_2 = list(parse_links(page_2))
+    assert parsed_links_2 == parsed_links_1
+
+    parsed_links_3 = list(parse_links(page_3))
+    assert len(parsed_links_3) == 1
+    assert parsed_links_3 != parsed_links_1
+    assert 'pkg2' in parsed_links_3[0].url
+
+
 def test_request_http_error(caplog):
     caplog.set_level(logging.DEBUG)
     link = Link('http://localhost')
@@ -528,13 +577,14 @@ def test_fetch_page(self, mock_get_html_response):
         fake_response = make_fake_html_response(url)
         mock_get_html_response.return_value = fake_response
 
-        location = Link(url)
+        location = Link(url, cache_link_parsing=False)
         link_collector = make_test_link_collector()
         actual = link_collector.fetch_page(location)
 
         assert actual.content == fake_response.content
         assert actual.encoding is None
         assert actual.url == url
+        assert actual.cache_link_parsing == location.cache_link_parsing
 
         # Also check that the right session object was passed to
         # _get_html_response().
@@ -559,8 +609,12 @@ def test_collect_links(self, caplog, data):
 
         assert len(actual.find_links) == 1
         check_links_include(actual.find_links, names=['packages'])
+        # Check that find-links URLs are marked as cacheable.
+        assert actual.find_links[0].cache_link_parsing
 
         assert actual.project_urls == [Link('https://pypi.org/simple/twine/')]
+        # Check that index URLs are marked as *un*cacheable.
+        assert not actual.project_urls[0].cache_link_parsing
 
         expected_message = dedent("""\
             1 location(s) to search for versions of twine:
