add --unstable-feature=regex_link_parsing for a 12% resolve speedup #8488

Closed
wants to merge 2 commits
1 change: 1 addition & 0 deletions news/8488.feature
@@ -0,0 +1 @@
The ``--unstable-feature=regex_link_parsing`` option was added; it uses regular expressions to find href links instead of parsing each index page with html5lib. Enabling this option improved resolve performance by 12% when resolving ``tensorflow==1.14.0``.
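For context, the core idea is small enough to sketch: instead of building a full DOM with html5lib, scan the raw page text for href attributes with a compiled regex. The snippet below is a minimal illustration of that approach, not code from this PR; the sample page and the `extract_hrefs` helper are invented.

```python
import re

# Hypothetical fragment of a PyPI "simple" index page.
SAMPLE_PAGE = '''<html><body>
<a href="https://files.example.org/tensorflow-1.14.0-cp37-none-any.whl">tensorflow-1.14.0-cp37-none-any.whl</a>
</body></html>'''

# The same loose pattern the PR uses: "href", at most one stray character,
# "=", at most one stray character, then a double-quoted value.
link_pattern = re.compile(r'href.?=.?"([^"]+)"')


def extract_hrefs(page_content):
    # type: (str) -> list
    """Return every double-quoted href value found in the page text."""
    return [m.group(1) for m in link_pattern.finditer(page_content)]


print(extract_hrefs(SAMPLE_PAGE))
# ['https://files.example.org/tensorflow-1.14.0-cp37-none-any.whl']
```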
2 changes: 1 addition & 1 deletion src/pip/_internal/cli/cmdoptions.py
@@ -919,7 +919,7 @@ def check_list_path_option(options):
metavar='feature',
action='append',
default=[],
choices=['resolver'],
choices=['resolver', 'regex_link_parsing'],
help=SUPPRESS_HELP, # TODO: Enable this when the resolver actually works.
# help='Enable unstable feature(s) that may be backward incompatible.',
) # type: Callable[..., Option]
2 changes: 2 additions & 0 deletions src/pip/_internal/cli/req_command.py
@@ -394,6 +394,8 @@ def _build_package_finder(
allow_all_prereleases=options.pre,
prefer_binary=options.prefer_binary,
ignore_requires_python=ignore_requires_python,
use_regex_link_parsing=(
'regex_link_parsing' in options.unstable_features),
)

return PackageFinder.create(
2 changes: 2 additions & 0 deletions src/pip/_internal/commands/list.py
@@ -129,6 +129,8 @@ def _build_package_finder(self, options, session):
selection_prefs = SelectionPreferences(
allow_yanked=False,
allow_all_prereleases=options.pre,
use_regex_link_parsing=(
'regex_link_parsing' in options.unstable_features),
)

return PackageFinder.create(
115 changes: 104 additions & 11 deletions src/pip/_internal/index/collector.py
@@ -9,7 +9,7 @@
import mimetypes
import os
import re
from collections import OrderedDict
from collections import OrderedDict, namedtuple

from pip._vendor import html5lib, requests
from pip._vendor.distlib.compat import unescape
@@ -324,9 +324,9 @@ def __hash__(self):


def with_cached_html_pages(
fn, # type: Callable[[HTMLPage], Iterable[Link]]
fn, # type: Callable[[HTMLPage, bool], Iterable[Link]]
):
# type: (...) -> Callable[[HTMLPage], List[Link]]
# type: (...) -> Callable[[HTMLPage, bool], List[Link]]
"""
Given a function that parses an Iterable[Link] from an HTMLPage, cache the
function's result (keyed by CacheablePageContent), unless the HTMLPage
@@ -339,21 +339,97 @@ def wrapper(cacheable_page, use_regex_link_parsing):
return list(fn(cacheable_page.page, use_regex_link_parsing))

@functools.wraps(fn)
def wrapper_wrapper(page):
# type: (HTMLPage) -> List[Link]
def wrapper_wrapper(page, use_regex_link_parsing):
# type: (HTMLPage, bool) -> List[Link]
if page.cache_link_parsing:
return wrapper(CacheablePageContent(page), use_regex_link_parsing)
return list(fn(page))
return list(fn(page, use_regex_link_parsing))

return wrapper_wrapper
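To see how the new flag interacts with the cache, here is a self-contained sketch of the decorator's shape. It assumes an `lru_cache` keyed on a hashable page wrapper; `PageKey` and `with_cached_pages` are illustrative names, not pip's exact implementation.

```python
import functools


class PageKey(object):
    """Hashable stand-in for CacheablePageContent: compares by page URL."""

    def __init__(self, page):
        self.page = page

    def __eq__(self, other):
        return isinstance(other, PageKey) and self.page.url == other.page.url

    def __hash__(self):
        return hash(self.page.url)


def with_cached_pages(fn):
    @functools.lru_cache(maxsize=None)
    def cached(key, use_regex_link_parsing):
        # The flag is part of the cache key, so results parsed one way are
        # never served to a caller asking for the other parser.
        return list(fn(key.page, use_regex_link_parsing))

    @functools.wraps(fn)
    def wrapper(page, use_regex_link_parsing):
        if page.cache_link_parsing:
            return cached(PageKey(page), use_regex_link_parsing)
        return list(fn(page, use_regex_link_parsing))

    return wrapper
```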


@with_cached_html_pages
def parse_links(page):
# Loosely match href="...", tolerating at most one stray character (such as
# whitespace) on either side of the '='.
_link_pattern = re.compile(r'href.?=.?"([^"]+)"')

# How far to scan backwards from an href match for its opening tag.
_max_search_gap = 50 # type: int


class MatchedHref(namedtuple('MatchedHref', [
'tag',
'href',
'requires_python',
'yanked_reason',
])):
def __new__(cls, tag, href, requires_python=None, yanked_reason=None):
# type: (str, str, Optional[str], Optional[str]) -> MatchedHref
return super(MatchedHref, cls).__new__(
cls,
tag=tag,
href=href,
requires_python=requires_python,
yanked_reason=yanked_reason)


def _match_hrefs_with_regex(page_content):
# type: (str) -> Iterable[MatchedHref]
for href_match in _link_pattern.finditer(page_content):
href = href_match.group(1)
# (1) Find the tag name by searching backwards.
start = href_match.start()
backwards_start = max(0, start - _max_search_gap)
a_tag_start = page_content.rfind('<a', backwards_start, start)
base_tag_start = page_content.rfind('<base', backwards_start, start)
tag_start = max(a_tag_start, base_tag_start)
if tag_start == -1:
# This is neither <a> nor <base> -- ignore it.
continue
tag_first_char = page_content[tag_start + 1:tag_start + 2]
if tag_first_char == 'a':
tag = 'a'
else:
assert tag_first_char == 'b'
tag = 'base'
# (2) Find any data-requires-python or data-yanked attributes.
# FIXME: TODO!!
pyrequire = None
yanked_reason = None

result = MatchedHref(
tag=tag,
href=href,
requires_python=pyrequire,
yanked_reason=yanked_reason,
)
yield result
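A quick demonstration of step (1) on a toy page; `find_tagged_hrefs` and the sample markup are made up for illustration, but the backward search mirrors the function above.

```python
import re

link_pattern = re.compile(r'href.?=.?"([^"]+)"')
max_search_gap = 50


def find_tagged_hrefs(page_content):
    # type: (str) -> list
    """Return (tag, href) pairs for hrefs sitting on <a> or <base> tags."""
    results = []
    for match in link_pattern.finditer(page_content):
        start = match.start()
        window_start = max(0, start - max_search_gap)
        a_start = page_content.rfind('<a', window_start, start)
        base_start = page_content.rfind('<base', window_start, start)
        tag_start = max(a_start, base_start)
        if tag_start == -1:
            continue  # href on some other tag: ignore it
        tag = 'a' if page_content[tag_start + 1] == 'a' else 'base'
        results.append((tag, match.group(1)))
    return results


page = '<base href="/simple/"><a href="pip-20.2.tar.gz">pip</a>'
print(find_tagged_hrefs(page))
# [('base', '/simple/'), ('a', 'pip-20.2.tar.gz')]
```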


def _parse_links_regex(page_url, page_content):
# type: (str, str) -> Iterable[Link]
base_url = None # type: Optional[str]
anchor_links = [] # type: List[MatchedHref]
for matched_href in _match_hrefs_with_regex(page_content):
if matched_href.tag == 'a':
anchor_links.append(matched_href)
else:
assert matched_href.tag == 'base'
if base_url is None:
base_url = matched_href.href
if base_url is None:
base_url = page_url

for matched_href in anchor_links:
href = matched_href.href
url = _clean_link(urllib_parse.urljoin(base_url, href))
link = Link(
url,
comes_from=page_url,
requires_python=matched_href.requires_python,
yanked_reason=matched_href.yanked_reason,
)
yield link
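The fallback chain here matches browser behavior for `<base>`: relative hrefs resolve against the first `<base href>` when one exists, otherwise against the page's own URL. A quick illustration of the `urljoin` step, with made-up URLs:

```python
from urllib.parse import urljoin  # pip vendors this as urllib_parse

page_url = 'https://pypi.org/simple/pip/'
base_url = 'https://files.example.org/packages/'  # from a <base> tag

# With a <base> tag, relative hrefs can leave the index host entirely:
print(urljoin(base_url, 'pip-20.2.tar.gz'))
# https://files.example.org/packages/pip-20.2.tar.gz

# Without one, they resolve against the page itself:
print(urljoin(page_url, 'pip-20.2.tar.gz'))
# https://pypi.org/simple/pip/pip-20.2.tar.gz
```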


def _parse_links_html5lib(page):
# type: (HTMLPage) -> Iterable[Link]
"""
Parse an HTML document, and yield its anchor elements as Link objects.
"""
document = html5lib.parse(
page.content,
transport_encoding=page.encoding,
@@ -373,6 +449,19 @@ def parse_links(page):
yield link


@with_cached_html_pages
def parse_links(page, use_regex_link_parsing):
# type: (HTMLPage, bool) -> Iterable[Link]
"""
Parse an HTML document, and yield its anchor elements as Link objects.
"""
if use_regex_link_parsing:
yield from _parse_links_regex(page.url, page.decode_content())
else:
yield from _parse_links_html5lib(page)



class HTMLPage(object):
"""Represents one page, along with its URL"""

@@ -400,6 +489,10 @@ def __str__(self):
# type: () -> str
return redact_auth_from_url(self.url)

def decode_content(self):
# type: () -> str
# ``self.encoding`` may be None when no charset was declared; fall back
# to UTF-8 rather than crashing in ``bytes.decode``.
return self.content.decode(self.encoding or 'utf-8')

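As a usage sketch, both parsers should agree on a well-formed page. The `HTMLPage` constructor arguments below are guessed from the attributes this diff touches (`content`, `encoding`, `url`, `cache_link_parsing`), so treat the exact signature as an assumption.

```python
page = HTMLPage(
    content=b'<a href="pkg-1.0.tar.gz">pkg-1.0.tar.gz</a>',
    encoding='utf-8',
    url='https://pypi.org/simple/pkg/',
    cache_link_parsing=False,  # bypass the lru_cache for a one-off page
)

html5lib_links = parse_links(page, False)  # DOM-based, slower
regex_links = parse_links(page, True)      # regex fast path
assert [l.url for l in html5lib_links] == [l.url for l in regex_links]
```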

def _handle_get_page_fail(
link, # type: Link
6 changes: 5 additions & 1 deletion src/pip/_internal/index/package_finder.py
@@ -593,6 +593,7 @@ def __init__(
format_control=None, # type: Optional[FormatControl]
candidate_prefs=None, # type: CandidatePreferences
ignore_requires_python=None, # type: Optional[bool]
use_regex_link_parsing=False, # type: bool
):
# type: (...) -> None
"""
@@ -621,6 +622,8 @@ def __init__(
# These are boring links that have already been logged somehow.
self._logged_links = set() # type: Set[Link]

self._use_regex_link_parsing = use_regex_link_parsing

# Don't include an allow_yanked default value to make sure each call
# site considers whether yanked releases are allowed. This also causes
# that decision to be made explicit in the calling code, which helps
@@ -656,6 +659,7 @@ def create(
allow_yanked=selection_prefs.allow_yanked,
format_control=selection_prefs.format_control,
ignore_requires_python=selection_prefs.ignore_requires_python,
use_regex_link_parsing=selection_prefs.use_regex_link_parsing,
)

@property
@@ -791,7 +795,7 @@ def process_project_url(self, project_url, link_evaluator):
if html_page is None:
return []

page_links = list(parse_links(html_page))
page_links = list(parse_links(html_page, self._use_regex_link_parsing))

with indent_log():
package_links = self.evaluate_links(
5 changes: 4 additions & 1 deletion src/pip/_internal/models/selection_prefs.py
@@ -12,7 +12,8 @@ class SelectionPreferences(object):
"""

__slots__ = ['allow_yanked', 'allow_all_prereleases', 'format_control',
'prefer_binary', 'ignore_requires_python']
'prefer_binary', 'ignore_requires_python',
'use_regex_link_parsing']

# Don't include an allow_yanked default value to make sure each call
# site considers whether yanked releases are allowed. This also causes
@@ -25,6 +26,7 @@ def __init__(
format_control=None, # type: Optional[FormatControl]
prefer_binary=False, # type: bool
ignore_requires_python=None, # type: Optional[bool]
use_regex_link_parsing=False, # type: bool
):
# type: (...) -> None
"""Create a SelectionPreferences object.
Expand All @@ -47,3 +49,4 @@ def __init__(
self.format_control = format_control
self.prefer_binary = prefer_binary
self.ignore_requires_python = ignore_requires_python
self.use_regex_link_parsing = use_regex_link_parsing
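End to end, the flag travels from the CLI into the finder roughly as follows. This condenses the call sites changed above rather than introducing new API; the `link_collector` argument is shown only for shape.

```python
selection_prefs = SelectionPreferences(
    allow_yanked=False,
    use_regex_link_parsing=(
        'regex_link_parsing' in options.unstable_features),
)

finder = PackageFinder.create(
    link_collector=link_collector,
    selection_prefs=selection_prefs,
)
```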