add --unstable-feature=regex_link_parsing for a 12% resolve speedup #8488

Closed
wants to merge 2 commits
1 change: 1 addition & 0 deletions news/8488.feature
@@ -0,0 +1 @@
The ``--unstable-feature=regex_link_parsing`` option was added; it uses regular expressions to find href links instead of parsing each index page with html5lib. Enabling this option improved resolve performance by 12% when resolving ``tensorflow==1.14.0``.
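For context, the core idea is small enough to sketch: instead of building a full DOM with html5lib, scan the raw page text for href attributes with a compiled regex. The snippet below is a minimal illustration of that approach, not code from this PR; the sample page and the `extract_hrefs` helper are invented.

```python
import re

# Hypothetical fragment of a PyPI "simple" index page.
SAMPLE_PAGE = '''<html><body>
<a href="https://files.example.org/tensorflow-1.14.0-cp37-none-any.whl">tensorflow-1.14.0-cp37-none-any.whl</a>
</body></html>'''

# The same loose pattern the PR uses: "href", at most one stray character,
# "=", at most one stray character, then a double-quoted value.
link_pattern = re.compile(r'href.?=.?"([^"]+)"')


def extract_hrefs(page_content):
    # type: (str) -> list
    """Return every double-quoted href value found in the page text."""
    return [m.group(1) for m in link_pattern.finditer(page_content)]


print(extract_hrefs(SAMPLE_PAGE))
# ['https://files.example.org/tensorflow-1.14.0-cp37-none-any.whl']
```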
2 changes: 1 addition & 1 deletion src/pip/_internal/cli/cmdoptions.py
@@ -919,7 +919,7 @@ def check_list_path_option(options):
metavar='feature',
action='append',
default=[],
choices=['resolver'],
choices=['resolver', 'regex_link_parsing'],
help=SUPPRESS_HELP, # TODO: Enable this when the resolver actually works.
# help='Enable unstable feature(s) that may be backward incompatible.',
) # type: Callable[..., Option]
2 changes: 2 additions & 0 deletions src/pip/_internal/cli/req_command.py
@@ -394,6 +394,8 @@ def _build_package_finder(
allow_all_prereleases=options.pre,
prefer_binary=options.prefer_binary,
ignore_requires_python=ignore_requires_python,
use_regex_link_parsing=(
'regex_link_parsing' in options.unstable_features),
)

return PackageFinder.create(
2 changes: 2 additions & 0 deletions src/pip/_internal/commands/list.py
@@ -129,6 +129,8 @@ def _build_package_finder(self, options, session):
selection_prefs = SelectionPreferences(
allow_yanked=False,
allow_all_prereleases=options.pre,
use_regex_link_parsing=(
'regex_link_parsing' in options.unstable_features),
)

return PackageFinder.create(
115 changes: 104 additions & 11 deletions src/pip/_internal/index/collector.py
@@ -9,7 +9,7 @@
import mimetypes
import os
import re
from collections import OrderedDict
from collections import OrderedDict, namedtuple

from pip._vendor import html5lib, requests
from pip._vendor.distlib.compat import unescape
@@ -324,9 +324,9 @@ def __hash__(self):


def with_cached_html_pages(
fn, # type: Callable[[HTMLPage], Iterable[Link]]
fn, # type: Callable[[HTMLPage, bool], Iterable[Link]]
):
# type: (...) -> Callable[[HTMLPage], List[Link]]
# type: (...) -> Callable[[HTMLPage, bool], List[Link]]
"""
Given a function that parses an Iterable[Link] from an HTMLPage, cache the
function's result (keyed by CacheablePageContent), unless the HTMLPage
@@ -339,21 +339,97 @@ def wrapper(cacheable_page, use_regex_link_parsing):
return list(fn(cacheable_page.page, use_regex_link_parsing))

@functools.wraps(fn)
def wrapper_wrapper(page):
# type: (HTMLPage) -> List[Link]
def wrapper_wrapper(page, use_regex_link_parsing):
# type: (HTMLPage, bool) -> List[Link]
if page.cache_link_parsing:
return wrapper(CacheablePageContent(page), use_regex_link_parsing)
return list(fn(page))
return list(fn(page, use_regex_link_parsing))

return wrapper_wrapper
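To see how the new flag interacts with the cache, here is a self-contained sketch of the decorator's shape. It assumes an `lru_cache` keyed on a hashable page wrapper; `PageKey` and `with_cached_pages` are illustrative names, not pip's exact implementation.

```python
import functools


class PageKey(object):
    """Hashable stand-in for CacheablePageContent: compares by page URL."""

    def __init__(self, page):
        self.page = page

    def __eq__(self, other):
        return isinstance(other, PageKey) and self.page.url == other.page.url

    def __hash__(self):
        return hash(self.page.url)


def with_cached_pages(fn):
    @functools.lru_cache(maxsize=None)
    def cached(key, use_regex_link_parsing):
        # The flag is part of the cache key, so results parsed one way are
        # never served to a caller asking for the other parser.
        return list(fn(key.page, use_regex_link_parsing))

    @functools.wraps(fn)
    def wrapper(page, use_regex_link_parsing):
        if page.cache_link_parsing:
            return cached(PageKey(page), use_regex_link_parsing)
        return list(fn(page, use_regex_link_parsing))

    return wrapper
```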


@with_cached_html_pages
def parse_links(page):
# Loosely match href="...", tolerating at most one stray character (such as
# whitespace) on either side of the '='.
_link_pattern = re.compile(r'href.?=.?"([^"]+)"')

# How far to scan backwards from an href match for its opening tag.
_max_search_gap = 50 # type: int


class MatchedHref(namedtuple('MatchedHref', [
'tag',
'href',
'requires_python',
'yanked_reason',
])):
def __new__(cls, tag, href, requires_python=None, yanked_reason=None):
# type: (str, str, Optional[str], Optional[str]) -> MatchedHref
return super(MatchedHref, cls).__new__(
cls,
tag=tag,
href=href,
requires_python=requires_python,
yanked_reason=yanked_reason)


def _match_hrefs_with_regex(page_content):
# type: (str) -> Iterable[MatchedHref]
for href_match in _link_pattern.finditer(page_content):
href = href_match.group(1)
# (1) Find the tag name by searching backwards.
start = href_match.start()
backwards_start = max(0, start - _max_search_gap)
a_tag_start = page_content.rfind('<a', backwards_start, start)
base_tag_start = page_content.rfind('<base', backwards_start, start)
tag_start = max(a_tag_start, base_tag_start)
if tag_start == -1:
# This is neither <a> nor <base> -- ignore it.
continue
tag_first_char = page_content[tag_start + 1:tag_start + 2]
if tag_first_char == 'a':
tag = 'a'
else:
assert tag_first_char == 'b'
tag = 'base'
# (2) Find any data-requires-python or data-yanked attributes.
# FIXME: TODO!!
pyrequire = None
yanked_reason = None

result = MatchedHref(
tag=tag,
href=href,
requires_python=pyrequire,
yanked_reason=yanked_reason,
)
yield result
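A quick demonstration of step (1) on a toy page; `find_tagged_hrefs` and the sample markup are made up for illustration, but the backward search mirrors the function above.

```python
import re

link_pattern = re.compile(r'href.?=.?"([^"]+)"')
max_search_gap = 50


def find_tagged_hrefs(page_content):
    # type: (str) -> list
    """Return (tag, href) pairs for hrefs sitting on <a> or <base> tags."""
    results = []
    for match in link_pattern.finditer(page_content):
        start = match.start()
        window_start = max(0, start - max_search_gap)
        a_start = page_content.rfind('<a', window_start, start)
        base_start = page_content.rfind('<base', window_start, start)
        tag_start = max(a_start, base_start)
        if tag_start == -1:
            continue  # href on some other tag: ignore it
        tag = 'a' if page_content[tag_start + 1] == 'a' else 'base'
        results.append((tag, match.group(1)))
    return results


page = '<base href="/simple/"><a href="pip-20.2.tar.gz">pip</a>'
print(find_tagged_hrefs(page))
# [('base', '/simple/'), ('a', 'pip-20.2.tar.gz')]
```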


def _parse_links_regex(page_url, page_content):
# type: (str, str) -> Iterable[Link]
base_url = None # type: Optional[str]
anchor_links = [] # type: List[MatchedHref]
for matched_href in _match_hrefs_with_regex(page_content):
if matched_href.tag == 'a':
anchor_links.append(matched_href)
else:
assert matched_href.tag == 'base'
if base_url is None:
base_url = matched_href.href
if base_url is None:
base_url = page_url

for matched_href in anchor_links:
href = matched_href.href
url = _clean_link(urllib_parse.urljoin(base_url, href))
link = Link(
url,
comes_from=page_url,
requires_python=matched_href.requires_python,
yanked_reason=matched_href.yanked_reason,
)
yield link
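The fallback chain here matches browser behavior for `<base>`: relative hrefs resolve against the first `<base href>` when one exists, otherwise against the page's own URL. A quick illustration of the `urljoin` step, with made-up URLs:

```python
from urllib.parse import urljoin  # pip vendors this as urllib_parse

page_url = 'https://pypi.org/simple/pip/'
base_url = 'https://files.example.org/packages/'  # from a <base> tag

# With a <base> tag, relative hrefs can leave the index host entirely:
print(urljoin(base_url, 'pip-20.2.tar.gz'))
# https://files.example.org/packages/pip-20.2.tar.gz

# Without one, they resolve against the page itself:
print(urljoin(page_url, 'pip-20.2.tar.gz'))
# https://pypi.org/simple/pip/pip-20.2.tar.gz
```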


def _parse_links_html5lib(page):
# type: (HTMLPage) -> Iterable[Link]
"""
Parse an HTML document, and yield its anchor elements as Link objects.
"""
document = html5lib.parse(
page.content,
transport_encoding=page.encoding,
@@ -373,6 +449,19 @@ def parse_links(page):
yield link


@with_cached_html_pages
def parse_links(page, use_regex_link_parsing):
# type: (HTMLPage, bool) -> Iterable[Link]
"""
Parse an HTML document, and yield its anchor elements as Link objects.
"""
if use_regex_link_parsing:
yield from _parse_links_regex(page.url, page.decode_content())
else:
yield from _parse_links_html5lib(page)



class HTMLPage(object):
"""Represents one page, along with its URL"""

@@ -400,6 +489,10 @@ def __str__(self):
# type: () -> str
return redact_auth_from_url(self.url)

def decode_content(self):
# type: () -> str
# ``self.encoding`` may be None when no charset was declared; fall back
# to UTF-8 rather than crashing in ``bytes.decode``.
return self.content.decode(self.encoding or 'utf-8')

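As a usage sketch, both parsers should agree on a well-formed page. The `HTMLPage` constructor arguments below are guessed from the attributes this diff touches (`content`, `encoding`, `url`, `cache_link_parsing`), so treat the exact signature as an assumption.

```python
page = HTMLPage(
    content=b'<a href="pkg-1.0.tar.gz">pkg-1.0.tar.gz</a>',
    encoding='utf-8',
    url='https://pypi.org/simple/pkg/',
    cache_link_parsing=False,  # bypass the lru_cache for a one-off page
)

html5lib_links = parse_links(page, False)  # DOM-based, slower
regex_links = parse_links(page, True)      # regex fast path
assert [l.url for l in html5lib_links] == [l.url for l in regex_links]
```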

def _handle_get_page_fail(
link, # type: Link
6 changes: 5 additions & 1 deletion src/pip/_internal/index/package_finder.py
@@ -593,6 +593,7 @@ def __init__(
format_control=None, # type: Optional[FormatControl]
candidate_prefs=None, # type: CandidatePreferences
ignore_requires_python=None, # type: Optional[bool]
use_regex_link_parsing=False, # type: bool
):
# type: (...) -> None
"""
@@ -621,6 +622,8 @@ def __init__(
# These are boring links that have already been logged somehow.
self._logged_links = set() # type: Set[Link]

self._use_regex_link_parsing = use_regex_link_parsing

# Don't include an allow_yanked default value to make sure each call
# site considers whether yanked releases are allowed. This also causes
# that decision to be made explicit in the calling code, which helps
@@ -656,6 +659,7 @@ def create(
allow_yanked=selection_prefs.allow_yanked,
format_control=selection_prefs.format_control,
ignore_requires_python=selection_prefs.ignore_requires_python,
use_regex_link_parsing=selection_prefs.use_regex_link_parsing,
)

@property
@@ -791,7 +795,7 @@ def process_project_url(self, project_url, link_evaluator):
if html_page is None:
return []

page_links = list(parse_links(html_page))
page_links = list(parse_links(html_page, self._use_regex_link_parsing))

with indent_log():
package_links = self.evaluate_links(
5 changes: 4 additions & 1 deletion src/pip/_internal/models/selection_prefs.py
@@ -12,7 +12,8 @@ class SelectionPreferences(object):
"""

__slots__ = ['allow_yanked', 'allow_all_prereleases', 'format_control',
'prefer_binary', 'ignore_requires_python']
'prefer_binary', 'ignore_requires_python',
'use_regex_link_parsing']

# Don't include an allow_yanked default value to make sure each call
# site considers whether yanked releases are allowed. This also causes
@@ -25,6 +26,7 @@ def __init__(
format_control=None, # type: Optional[FormatControl]
prefer_binary=False, # type: bool
ignore_requires_python=None, # type: Optional[bool]
use_regex_link_parsing=False, # type: bool
):
# type: (...) -> None
"""Create a SelectionPreferences object.
Expand All @@ -47,3 +49,4 @@ def __init__(
self.format_control = format_control
self.prefer_binary = prefer_binary
self.ignore_requires_python = ignore_requires_python
self.use_regex_link_parsing = use_regex_link_parsing
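End to end, the flag travels from the CLI into the finder roughly as follows. This condenses the call sites changed above rather than introducing new API; the `link_collector` argument is shown only for shape.

```python
selection_prefs = SelectionPreferences(
    allow_yanked=False,
    use_regex_link_parsing=(
        'regex_link_parsing' in options.unstable_features),
)

finder = PackageFinder.create(
    link_collector=link_collector,
    selection_prefs=selection_prefs,
)
```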