Commit d7318e6

add failing test
apply the fix
add template NEWS entry according to https://pip.pypa.io/en/latest/development/contributing/#news-entries (wrong PR #)
rename news entry to the current PR #
respond to review comments
fix test failures
fix tests by adding uuid salt in urls
cache html page fetching by link
make CI pass (?)
make the types much better
finally listen to the maintainer and cache parse_links() by url :)
avoid caching parse_links() when the url is an index url
cleanup
add testing for uncachable marking
only conditionally vendor _lru_cache for py2
bugfix => feature
python 2 does not cache!
Do away with type: ignore with getattr()
respond to review comments
1 parent 451f5d9 commit d7318e6

File tree

4 files changed: +154 −13 lines changed

news/7729.feature
src/pip/_internal/index/collector.py
src/pip/_internal/models/link.py
tests/unit/test_collector.py

news/7729.feature

+3

@@ -0,0 +1,3 @@
+Cache the result of parse_links() to avoid re-tokenizing a find-links page multiple times over a pip run.
+
+This change significantly improves resolve performance when --find-links points to a very large html page.
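
As a rough illustration of the behaviour this entry describes (a minimal sketch against pip's internal API as it appears in this diff; the URL is made up), a second HTMLPage with the same url is served from the cache instead of being re-tokenized:

    from pip._internal.index.collector import HTMLPage, parse_links

    html = b'<html><body><a href="/pkg-1.0.tar.gz"></a></body></html>'
    url = 'https://example.com/find-links/'  # hypothetical find-links page

    page_a = HTMLPage(html, encoding=None, url=url)
    page_b = HTMLPage(html, encoding=None, url=url)

    links_a = list(parse_links(page_a))  # tokenizes the HTML
    links_b = list(parse_links(page_b))  # on Python 3, returned from the per-url cache
    assert links_a == links_b

On Python 2 the lru_cache fallback added in collector.py below is a no-op, so the second call simply re-parses; the result is the same either way.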

src/pip/_internal/index/collector.py

+86 −10

@@ -3,6 +3,7 @@
 """
 
 import cgi
+import functools
 import itertools
 import logging
 import mimetypes
@@ -25,8 +26,8 @@
 
 if MYPY_CHECK_RUNNING:
     from typing import (
-        Callable, Iterable, List, MutableMapping, Optional, Sequence, Tuple,
-        Union,
+        Callable, Iterable, List, MutableMapping, Optional,
+        Protocol, Sequence, Tuple, TypeVar, Union,
     )
     import xml.etree.ElementTree
 
@@ -38,10 +39,31 @@
     HTMLElement = xml.etree.ElementTree.Element
     ResponseHeaders = MutableMapping[str, str]
 
+    # Used in the @lru_cache polyfill.
+    F = TypeVar('F')
+
+    class LruCache(Protocol):
+        def __call__(self, maxsize=None):
+            # type: (Optional[int]) -> Callable[[F], F]
+            raise NotImplementedError
+
 
 logger = logging.getLogger(__name__)
 
 
+# Fallback to noop_lru_cache in Python 2
+# TODO: this can be removed when python 2 support is dropped!
+def noop_lru_cache(maxsize=None):
+    # type: (Optional[int]) -> Callable[[F], F]
+    def _wrapper(f):
+        # type: (F) -> F
+        return f
+    return _wrapper
+
+
+_lru_cache = getattr(functools, "lru_cache", noop_lru_cache)  # type: LruCache
+
+
 def _match_vcs_scheme(url):
     # type: (str) -> Optional[str]
     """Look for VCS schemes in the URL.
@@ -285,6 +307,48 @@ def _create_link_from_element(
     return link
 
 
+class CacheablePageContent(object):
+    def __init__(self, page):
+        # type: (HTMLPage) -> None
+        assert page.cache_link_parsing
+        self.page = page
+
+    def __eq__(self, other):
+        # type: (object) -> bool
+        return (isinstance(other, type(self)) and
+                self.page.url == other.page.url)
+
+    def __hash__(self):
+        # type: () -> int
+        return hash(self.page.url)
+
+
+def with_cached_html_pages(
+    fn,  # type: Callable[[HTMLPage], Iterable[Link]]
+):
+    # type: (...) -> Callable[[HTMLPage], List[Link]]
+    """
+    Given a function that parses an Iterable[Link] from an HTMLPage, cache the
+    function's result (keyed by CacheablePageContent), unless the HTMLPage
+    `page` has `page.cache_link_parsing == False`.
+    """
+
+    @_lru_cache(maxsize=None)
+    def wrapper(cacheable_page):
+        # type: (CacheablePageContent) -> List[Link]
+        return list(fn(cacheable_page.page))
+
+    @functools.wraps(fn)
+    def wrapper_wrapper(page):
+        # type: (HTMLPage) -> List[Link]
+        if page.cache_link_parsing:
+            return wrapper(CacheablePageContent(page))
+        return list(fn(page))
+
+    return wrapper_wrapper
+
+
+@with_cached_html_pages
 def parse_links(page):
     # type: (HTMLPage) -> Iterable[Link]
     """
@@ -314,18 +378,23 @@ class HTMLPage(object):
 
     def __init__(
         self,
-        content,   # type: bytes
-        encoding,  # type: Optional[str]
-        url,       # type: str
+        content,                  # type: bytes
+        encoding,                 # type: Optional[str]
+        url,                      # type: str
+        cache_link_parsing=True,  # type: bool
     ):
         # type: (...) -> None
         """
         :param encoding: the encoding to decode the given content.
         :param url: the URL from which the HTML was downloaded.
+        :param cache_link_parsing: whether links parsed from this page's url
+                                   should be cached. PyPI index urls should
+                                   have this set to False, for example.
         """
         self.content = content
         self.encoding = encoding
         self.url = url
+        self.cache_link_parsing = cache_link_parsing
 
     def __str__(self):
         # type: () -> str
@@ -343,10 +412,14 @@ def _handle_get_page_fail(
     meth("Could not fetch URL %s: %s - skipping", link, reason)
 
 
-def _make_html_page(response):
-    # type: (Response) -> HTMLPage
+def _make_html_page(response, cache_link_parsing=True):
+    # type: (Response, bool) -> HTMLPage
     encoding = _get_encoding_from_headers(response.headers)
-    return HTMLPage(response.content, encoding=encoding, url=response.url)
+    return HTMLPage(
+        response.content,
+        encoding=encoding,
+        url=response.url,
+        cache_link_parsing=cache_link_parsing)
 
 
 def _get_html_page(link, session=None):
@@ -399,7 +472,8 @@ def _get_html_page(link, session=None):
     except requests.Timeout:
         _handle_get_page_fail(link, "timed out")
     else:
-        return _make_html_page(resp)
+        return _make_html_page(resp,
+                               cache_link_parsing=link.cache_link_parsing)
     return None
 
 
@@ -562,7 +636,9 @@ def collect_links(self, project_name):
         # We want to filter out anything that does not have a secure origin.
         url_locations = [
            link for link in itertools.chain(
-                (Link(url) for url in index_url_loc),
+                # Mark PyPI indices as "cache_link_parsing == False" -- this
+                # will avoid caching the result of parsing the page for links.
+                (Link(url, cache_link_parsing=False) for url in index_url_loc),
                (Link(url) for url in fl_url_loc),
            )
            if self.session.is_secure_origin(link)
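
To see the decorator in isolation, here is a hedged usage sketch (Python 3 assumed; `noisy_parse` is a hypothetical parser used only to count real invocations, not part of pip): `with_cached_html_pages` can wrap any `HTMLPage -> Iterable[Link]` function, and a second call for a page with the same url never reaches the wrapped function.

    from pip._internal.index.collector import HTMLPage, with_cached_html_pages
    from pip._internal.models.link import Link

    calls = []

    @with_cached_html_pages
    def noisy_parse(page):
        # Hypothetical parser; records each time it actually runs.
        calls.append(page.url)
        return [Link('https://example.com/pkg-1.0.tar.gz')]

    page = HTMLPage(b'<html></html>', encoding=None, url='https://example.com/fl/')

    first = noisy_parse(page)   # parses; the List[Link] is memoized by url
    second = noisy_parse(page)  # equal CacheablePageContent key, so the cached list is reused
    assert first == second
    assert len(calls) == 1  # on Python 2 the no-op fallback would have re-parsed

Constructing the page with cache_link_parsing=False bypasses the cache entirely, which is how index pages are treated below.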

src/pip/_internal/models/link.py

+8

@@ -30,6 +30,7 @@ def __init__(
         comes_from=None,  # type: Optional[Union[str, HTMLPage]]
         requires_python=None,  # type: Optional[str]
         yanked_reason=None,  # type: Optional[Text]
+        cache_link_parsing=True,  # type: bool
     ):
         # type: (...) -> None
         """
@@ -46,6 +47,11 @@ def __init__(
             a simple repository HTML link. If the file has been yanked but
             no reason was provided, this should be the empty string. See
             PEP 592 for more information and the specification.
+        :param cache_link_parsing: A flag that is used elsewhere to determine
+                                   whether resources retrieved from this link
+                                   should be cached. PyPI index urls should
+                                   generally have this set to False, for
+                                   example.
         """
 
         # url can be a UNC windows share
@@ -63,6 +69,8 @@ def __init__(
 
         super(Link, self).__init__(key=url, defining_class=Link)
 
+        self.cache_link_parsing = cache_link_parsing
+
     def __str__(self):
         # type: () -> str
         if self.requires_python:
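
The flag itself is plain data on the Link; it is only consulted by collect_links() and _get_html_page() in collector.py above. A quick sketch of how the two kinds of locations end up marked (the find-links URL is illustrative):

    from pip._internal.models.link import Link

    # --find-links locations keep the default, so their parsed links are memoized.
    find_links = Link('https://example.com/wheels/')
    assert find_links.cache_link_parsing

    # Index locations are constructed with the flag off by collect_links(),
    # so parse_links() results for their pages are never cached.
    index = Link('https://pypi.org/simple/twine/', cache_link_parsing=False)
    assert not index.cache_link_parsing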

tests/unit/test_collector.py

+57 −3

@@ -1,5 +1,7 @@
 import logging
 import os.path
+import re
+import uuid
 from textwrap import dedent
 
 import mock
@@ -26,7 +28,7 @@
 from pip._internal.models.index import PyPI
 from pip._internal.models.link import Link
 from pip._internal.network.session import PipSession
-from tests.lib import make_test_link_collector
+from tests.lib import make_test_link_collector, skip_if_python2
 
 
 @pytest.mark.parametrize(
@@ -355,14 +357,61 @@ def test_parse_links__yanked_reason(anchor_html, expected):
     page = HTMLPage(
         html_bytes,
         encoding=None,
-        url='https://example.com/simple/',
+        # parse_links() is cached by url, so we inject a random uuid to ensure
+        # the page content isn't cached.
+        url='https://example.com/simple-{}/'.format(uuid.uuid4()),
     )
     links = list(parse_links(page))
     link, = links
     actual = link.yanked_reason
     assert actual == expected
 
 
+@skip_if_python2
+def test_parse_links_caches_same_page_by_url():
+    html = (
+        '<html><head><meta charset="utf-8"><head>'
+        '<body><a href="/pkg1-1.0.tar.gz"></a></body></html>'
+    )
+    html_bytes = html.encode('utf-8')
+
+    url = 'https://example.com/simple/'
+
+    page_1 = HTMLPage(
+        html_bytes,
+        encoding=None,
+        url=url,
+    )
+    # Make a second page with zero content, to ensure that it's not accessed,
+    # because the page was cached by url.
+    page_2 = HTMLPage(
+        b'',
+        encoding=None,
+        url=url,
+    )
+    # Make a third page which represents an index url, which should not be
+    # cached, even for the same url. We modify the page content slightly to
+    # verify that the result is not cached.
+    page_3 = HTMLPage(
+        re.sub(b'pkg1', b'pkg2', html_bytes),
+        encoding=None,
+        url=url,
+        cache_link_parsing=False,
+    )
+
+    parsed_links_1 = list(parse_links(page_1))
+    assert len(parsed_links_1) == 1
+    assert 'pkg1' in parsed_links_1[0].url
+
+    parsed_links_2 = list(parse_links(page_2))
+    assert parsed_links_2 == parsed_links_1
+
+    parsed_links_3 = list(parse_links(page_3))
+    assert len(parsed_links_3) == 1
+    assert parsed_links_3 != parsed_links_1
+    assert 'pkg2' in parsed_links_3[0].url
+
+
 def test_request_http_error(caplog):
     caplog.set_level(logging.DEBUG)
     link = Link('http://localhost')
@@ -528,13 +577,14 @@ def test_fetch_page(self, mock_get_html_response):
         fake_response = make_fake_html_response(url)
         mock_get_html_response.return_value = fake_response
 
-        location = Link(url)
+        location = Link(url, cache_link_parsing=False)
         link_collector = make_test_link_collector()
         actual = link_collector.fetch_page(location)
 
         assert actual.content == fake_response.content
         assert actual.encoding is None
         assert actual.url == url
+        assert actual.cache_link_parsing == location.cache_link_parsing
 
         # Also check that the right session object was passed to
         # _get_html_response().
@@ -559,8 +609,12 @@ def test_collect_links(self, caplog, data):
 
         assert len(actual.find_links) == 1
         check_links_include(actual.find_links, names=['packages'])
+        # Check that find-links URLs are marked as cacheable.
+        assert actual.find_links[0].cache_link_parsing
 
         assert actual.project_urls == [Link('https://pypi.org/simple/twine/')]
+        # Check that index URLs are marked as *un*cacheable.
+        assert not actual.project_urls[0].cache_link_parsing
 
         expected_message = dedent("""\
             1 location(s) to search for versions of twine:
