@@ -3,6 +3,7 @@
 """
 
 import cgi
+import functools
 import itertools
 import logging
 import mimetypes
@@ -25,8 +26,8 @@
 
 if MYPY_CHECK_RUNNING:
     from typing import (
-        Callable, Iterable, List, MutableMapping, Optional, Sequence, Tuple,
-        Union,
+        Callable, Iterable, List, MutableMapping, Optional,
+        Protocol, Sequence, Tuple, TypeVar, Union,
     )
     import xml.etree.ElementTree
 
@@ -38,10 +39,31 @@
 HTMLElement = xml.etree.ElementTree.Element
 ResponseHeaders = MutableMapping[str, str]
 
+# Used in the @lru_cache polyfill.
+F = TypeVar('F')
+
+class LruCache(Protocol):
+    def __call__(self, maxsize=None):
+        # type: (Optional[int]) -> Callable[[F], F]
+        raise NotImplementedError
+
 
 logger = logging.getLogger(__name__)
 
 
+# Fallback to noop_lru_cache in Python 2
+# TODO: this can be removed when python 2 support is dropped!
+def noop_lru_cache(maxsize=None):
+    # type: (Optional[int]) -> Callable[[F], F]
+    def _wrapper(f):
+        # type: (F) -> F
+        return f
+    return _wrapper
+
+
+_lru_cache = getattr(functools, "lru_cache", noop_lru_cache)  # type: LruCache
+
+
 def _match_vcs_scheme(url):
     # type: (str) -> Optional[str]
     """Look for VCS schemes in the URL.
@@ -285,6 +307,48 @@ def _create_link_from_element(
     return link
 
 
+class CacheablePageContent(object):
+    def __init__(self, page):
+        # type: (HTMLPage) -> None
+        assert page.cache_link_parsing
+        self.page = page
+
+    def __eq__(self, other):
+        # type: (object) -> bool
+        return (isinstance(other, type(self)) and
+                self.page.url == other.page.url)
+
+    def __hash__(self):
+        # type: () -> int
+        return hash(self.page.url)
+
+
+def with_cached_html_pages(
+    fn,  # type: Callable[[HTMLPage], Iterable[Link]]
+):
+    # type: (...) -> Callable[[HTMLPage], List[Link]]
+    """
+    Given a function that parses an Iterable[Link] from an HTMLPage, cache the
+    function's result (keyed by CacheablePageContent), unless the HTMLPage
+    `page` has `page.cache_link_parsing == False`.
+    """
+
+    @_lru_cache(maxsize=None)
+    def wrapper(cacheable_page):
+        # type: (CacheablePageContent) -> List[Link]
+        return list(fn(cacheable_page.page))
+
+    @functools.wraps(fn)
+    def wrapper_wrapper(page):
+        # type: (HTMLPage) -> List[Link]
+        if page.cache_link_parsing:
+            return wrapper(CacheablePageContent(page))
+        return list(fn(page))
+
+    return wrapper_wrapper
+
+
+@with_cached_html_pages
 def parse_links(page):
     # type: (HTMLPage) -> Iterable[Link]
     """
@@ -314,18 +378,23 @@ class HTMLPage(object):
 
     def __init__(
         self,
-        content,   # type: bytes
-        encoding,  # type: Optional[str]
-        url,       # type: str
+        content,                  # type: bytes
+        encoding,                 # type: Optional[str]
+        url,                      # type: str
+        cache_link_parsing=True,  # type: bool
     ):
         # type: (...) -> None
         """
         :param encoding: the encoding to decode the given content.
         :param url: the URL from which the HTML was downloaded.
+        :param cache_link_parsing: whether links parsed from this page's url
+                                   should be cached. PyPI index urls should
+                                   have this set to False, for example.
         """
         self.content = content
         self.encoding = encoding
         self.url = url
+        self.cache_link_parsing = cache_link_parsing
 
     def __str__(self):
         # type: () -> str
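A quick usage sketch of the new constructor contract (assuming the `HTMLPage` class above is in scope; the byte strings and URLs here are made up):

    page = HTMLPage(
        b"<html></html>",
        encoding="utf-8",
        url="https://example.com/wheels/",
    )
    assert page.cache_link_parsing  # the new flag defaults to True

    index_page = HTMLPage(
        b"<html></html>",
        encoding=None,
        url="https://pypi.org/simple/foo/",
        cache_link_parsing=False,  # opt this page out of parse_links caching
    )

Since the keyword defaults to True, existing callers are unaffected.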
@@ -343,10 +412,14 @@ def _handle_get_page_fail(
     meth("Could not fetch URL %s: %s - skipping", link, reason)
 
 
-def _make_html_page(response):
-    # type: (Response) -> HTMLPage
+def _make_html_page(response, cache_link_parsing=True):
+    # type: (Response, bool) -> HTMLPage
     encoding = _get_encoding_from_headers(response.headers)
-    return HTMLPage(response.content, encoding=encoding, url=response.url)
+    return HTMLPage(
+        response.content,
+        encoding=encoding,
+        url=response.url,
+        cache_link_parsing=cache_link_parsing)
 
 
 def _get_html_page(link, session=None):
@@ -399,7 +472,8 @@ def _get_html_page(link, session=None):
     except requests.Timeout:
         _handle_get_page_fail(link, "timed out")
     else:
-        return _make_html_page(resp)
+        return _make_html_page(resp,
+                               cache_link_parsing=link.cache_link_parsing)
     return None
 
 
@@ -562,7 +636,9 @@ def collect_links(self, project_name):
         # We want to filter out anything that does not have a secure origin.
         url_locations = [
             link for link in itertools.chain(
-                (Link(url) for url in index_url_loc),
+                # Mark PyPI indices as "cache_link_parsing == False" -- this
+                # will avoid caching the result of parsing the page for links.
+                (Link(url, cache_link_parsing=False) for url in index_url_loc),
                 (Link(url) for url in fl_url_loc),
             )
             if self.session.is_secure_origin(link)
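Putting the pieces together, here is a self-contained sketch of the end-to-end behavior this commit produces (Python 3 only, using `functools.lru_cache` directly instead of the polyfill, and hypothetical stand-ins for `HTMLPage` and `parse_links`): find-links pages are parsed once and then served from the cache, while index pages marked `cache_link_parsing=False` are re-parsed on every call:

    import functools

    class FakePage(object):  # hypothetical stand-in for HTMLPage
        def __init__(self, url, cache_link_parsing=True):
            self.url = url
            self.cache_link_parsing = cache_link_parsing

    class CacheablePageContent(object):  # as in the diff above
        def __init__(self, page):
            self.page = page
        def __eq__(self, other):
            return (isinstance(other, type(self)) and
                    self.page.url == other.page.url)
        def __hash__(self):
            return hash(self.page.url)

    def with_cached_html_pages(fn):  # as in the diff, minus the Py2 polyfill
        @functools.lru_cache(maxsize=None)
        def wrapper(cacheable_page):
            return list(fn(cacheable_page.page))
        @functools.wraps(fn)
        def wrapper_wrapper(page):
            if page.cache_link_parsing:
                return wrapper(CacheablePageContent(page))
            return list(fn(page))
        return wrapper_wrapper

    calls = []

    @with_cached_html_pages
    def parse_links(page):  # records each actual parse
        calls.append(page.url)
        return []

    find_links = FakePage("https://example.com/wheels/")
    index = FakePage("https://pypi.org/simple/foo/", cache_link_parsing=False)

    parse_links(find_links)
    parse_links(find_links)  # cache hit: not re-parsed
    parse_links(index)
    parse_links(index)       # re-parsed every time
    assert calls == ["https://example.com/wheels/",
                     "https://pypi.org/simple/foo/",
                     "https://pypi.org/simple/foo/"]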