Skip to content

Commit 8809012

Browse files
cache html page fetching by link
1 parent c399ba2 commit 8809012

File tree

2 files changed

+54
-23
lines changed

2 files changed

+54
-23
lines changed

src/pip/_internal/index/collector.py

Lines changed: 28 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,19 @@
99
import os
1010
from collections import OrderedDict
1111

12+
from pip._vendor import html5lib, requests
13+
from pip._vendor.distlib.compat import unescape
14+
from pip._vendor.requests.exceptions import HTTPError, RetryError, SSLError
15+
from pip._vendor.six.moves.urllib import parse as urllib_parse
16+
from pip._vendor.six.moves.urllib import request as urllib_request
17+
18+
from pip._internal.models.link import Link
19+
from pip._internal.utils.filetypes import ARCHIVE_EXTENSIONS
20+
from pip._internal.utils.misc import redact_auth_from_url
21+
from pip._internal.utils.typing import MYPY_CHECK_RUNNING
22+
from pip._internal.utils.urls import path_to_url, url_to_path
23+
from pip._internal.vcs import is_url, vcs
24+
1225
try:
1326
from functools import lru_cache
1427
except ImportError:
@@ -26,18 +39,6 @@ def wrapped(arg):
2639
return wrapped
2740
return wrapper
2841

29-
from pip._vendor import html5lib, requests
30-
from pip._vendor.distlib.compat import unescape
31-
from pip._vendor.requests.exceptions import HTTPError, RetryError, SSLError
32-
from pip._vendor.six.moves.urllib import parse as urllib_parse
33-
from pip._vendor.six.moves.urllib import request as urllib_request
34-
35-
from pip._internal.models.link import Link
36-
from pip._internal.utils.filetypes import ARCHIVE_EXTENSIONS
37-
from pip._internal.utils.misc import redact_auth_from_url
38-
from pip._internal.utils.typing import MYPY_CHECK_RUNNING
39-
from pip._internal.utils.urls import path_to_url, url_to_path
40-
from pip._internal.vcs import is_url, vcs
4142

4243
if MYPY_CHECK_RUNNING:
4344
from typing import (
@@ -261,15 +262,17 @@ def _create_link_from_element(
261262
return link
262263

263264

264-
class CacheablePage(object):
265+
class CacheablePageContent(object):
265266
def __init__(self, page):
266267
self.page = page
267268

268269
def __eq__(self, other):
269-
return isinstance(other, type(self)) and self.page.url == other.page.url
270+
return (isinstance(other, type(self)) and
271+
self.page.content == other.page.content and
272+
self.page.encoding == other.page.encoding)
270273

271274
def __hash__(self):
272-
return hash(self.page.url)
275+
return hash((self.page.content, self.page.encoding))
273276

274277

275278
def with_cached_html_pages(fn):
@@ -278,7 +281,7 @@ def wrapper(cacheable_page):
278281
return list(fn(cacheable_page.page))
279282

280283
def wrapper_wrapper(page):
281-
return wrapper(CacheablePage(page))
284+
return wrapper(CacheablePageContent(page))
282285

283286
return wrapper_wrapper
284287

@@ -348,6 +351,15 @@ def _make_html_page(response):
348351
return HTMLPage(response.content, encoding=encoding, url=response.url)
349352

350353

354+
def with_cached_link_fetch(fn):
355+
@lru_cache(maxsize=None)
356+
def wrapper(link, session=None):
357+
return fn(link, session=session)
358+
359+
return wrapper
360+
361+
362+
@with_cached_link_fetch
351363
def _get_html_page(link, session=None):
352364
# type: (Link, Optional[PipSession]) -> Optional[HTMLPage]
353365
if session is None:

tests/unit/test_collector.py

Lines changed: 26 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
import logging
22
import os.path
3-
import uuid
43
from textwrap import dedent
54

65
import mock
@@ -11,6 +10,7 @@
1110
from pip._vendor.six.moves.urllib import request as urllib_request
1211

1312
from pip._internal.index.collector import (
13+
CacheablePageContent,
1414
HTMLPage,
1515
_clean_link,
1616
_determine_base_url,
@@ -270,7 +270,7 @@ def test_parse_links__yanked_reason(anchor_html, expected):
270270
page = HTMLPage(
271271
html_bytes,
272272
encoding=None,
273-
url='https://example.com/find-links-{}'.format(uuid.uuid4()),
273+
url='https://example.com/simple/',
274274
)
275275
links = list(parse_links(page))
276276
link, = links
@@ -287,19 +287,19 @@ def test_parse_links_caches_same_page():
287287
)
288288
html_bytes = html.encode('utf-8')
289289

290-
# The caching is only keyed on having the same `url`.
291290
page_1 = HTMLPage(
292291
html_bytes,
293292
encoding=None,
294-
url='https://example.com/some-find-links-url/',
293+
url='https://example.com/simple/',
295294
)
296295
page_2 = HTMLPage(
297296
html_bytes,
298297
encoding=None,
299-
url='https://example.com/some-find-links-url/',
298+
url='https://example.com/simple/',
300299
)
301300

302-
with mock.patch("pip._internal.index.collector.html5lib.parse") as mock_parse:
301+
mock_parse = mock.patch("pip._internal.index.collector.html5lib.parse")
302+
with mock_parse as mock_parse:
303303
mock_parse.return_value = html5lib.parse(
304304
page_1.content,
305305
transport_encoding=page_1.encoding,
@@ -308,7 +308,7 @@ def test_parse_links_caches_same_page():
308308
parsed_links_1 = list(parse_links(page_1))
309309
mock_parse.assert_called()
310310

311-
with mock.patch("pip._internal.index.collector.html5lib.parse") as mock_parse:
311+
with mock_parse as mock_parse:
312312
parsed_links_2 = list(parse_links(page_2))
313313
assert parsed_links_2 == parsed_links_1
314314
mock_parse.assert_not_called()
@@ -378,6 +378,25 @@ def test_get_html_page_invalid_scheme(caplog, url, vcs_scheme):
378378
]
379379

380380

381+
def test_get_html_page_caches_same_link():
382+
link = Link('https://example.com/link-1/')
383+
session = mock.Mock(PipSession)
384+
385+
fake_response = make_fake_html_response(link.url)
386+
mock_func = mock.patch("pip._internal.index.collector._get_html_response")
387+
with mock_func as mock_func:
388+
mock_func.return_value = fake_response
389+
page_1 = _get_html_page(link, session=session)
390+
mock_func.assert_called_once()
391+
392+
with mock_func as mock_func:
393+
page_2 = _get_html_page(link, session=session)
394+
# Assert that the result of the cached html page fetch will also then
395+
# be cached by parse_links() and @with_cached_html_pages.
396+
assert CacheablePageContent(page_1) == CacheablePageContent(page_2)
397+
mock_func.assert_not_called()
398+
399+
381400
def make_fake_html_response(url):
382401
"""
383402
Create a fake requests.Response object.

0 commit comments

Comments
 (0)