Skip to content

Commit 649048b

Browse files
authored
Merge pull request #10291 from jdufresne/html5lib
2 parents 98b1022 + 20fe83f commit 649048b

File tree

26 files changed

+177
-28
lines changed

26 files changed

+177
-28
lines changed

news/10291.feature.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
Changed ``PackageFinder`` to parse HTML documents using the stdlib
2+
:class:`html.parser.HTMLParser` class instead of the ``html5lib`` package. For
3+
now, the deprecated ``html5lib`` code remains and can be used with the
4+
``--use-deprecated=html5lib`` command line option, but it will be removed in a
5+
future pip release.

src/pip/_internal/cli/cmdoptions.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -964,7 +964,12 @@ def check_list_path_option(options: Values) -> None:
964964
metavar="feature",
965965
action="append",
966966
default=[],
967-
choices=["legacy-resolver", "out-of-tree-build", "backtrack-on-build-failures"],
967+
choices=[
968+
"legacy-resolver",
969+
"out-of-tree-build",
970+
"backtrack-on-build-failures",
971+
"html5lib",
972+
],
968973
help=("Enable deprecated functionality, that will be removed in the future."),
969974
)
970975

src/pip/_internal/cli/req_command.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -502,4 +502,5 @@ def _build_package_finder(
502502
link_collector=link_collector,
503503
selection_prefs=selection_prefs,
504504
target_python=target_python,
505+
use_deprecated_html5lib="html5lib" in options.deprecated_features_enabled,
505506
)

src/pip/_internal/commands/index.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ def _build_package_finder(
9797
link_collector=link_collector,
9898
selection_prefs=selection_prefs,
9999
target_python=target_python,
100+
use_deprecated_html5lib="html5lib" in options.deprecated_features_enabled,
100101
)
101102

102103
def get_available_package_versions(self, options: Values, args: List[Any]) -> None:

src/pip/_internal/commands/list.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,7 @@ def _build_package_finder(
149149
return PackageFinder.create(
150150
link_collector=link_collector,
151151
selection_prefs=selection_prefs,
152+
use_deprecated_html5lib="html5lib" in options.deprecated_features_enabled,
152153
)
153154

154155
def run(self, options: Values, args: List[str]) -> int:

src/pip/_internal/index/collector.py

Lines changed: 103 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -12,15 +12,20 @@
1212
import urllib.parse
1313
import urllib.request
1414
import xml.etree.ElementTree
15+
from html.parser import HTMLParser
1516
from optparse import Values
1617
from typing import (
18+
TYPE_CHECKING,
19+
Any,
1720
Callable,
21+
Dict,
1822
Iterable,
1923
List,
2024
MutableMapping,
2125
NamedTuple,
2226
Optional,
2327
Sequence,
28+
Tuple,
2429
Union,
2530
)
2631

@@ -39,6 +44,11 @@
3944

4045
from .sources import CandidatesFromPage, LinkSource, build_source
4146

47+
if TYPE_CHECKING:
48+
from typing import Protocol
49+
else:
50+
Protocol = object
51+
4252
logger = logging.getLogger(__name__)
4353

4454
HTMLElement = xml.etree.ElementTree.Element
@@ -163,6 +173,8 @@ def _determine_base_url(document: HTMLElement, page_url: str) -> str:
163173
:param document: An HTML document representation. The current
164174
implementation expects the result of ``html5lib.parse()``.
165175
:param page_url: The URL of the HTML document.
176+
177+
TODO: Remove when `html5lib` is dropped.
166178
"""
167179
for base in document.findall(".//base"):
168180
href = base.get("href")
@@ -234,20 +246,20 @@ def _clean_link(url: str) -> str:
234246

235247

236248
def _create_link_from_element(
237-
anchor: HTMLElement,
249+
element_attribs: Dict[str, Optional[str]],
238250
page_url: str,
239251
base_url: str,
240252
) -> Optional[Link]:
241253
"""
242-
Convert an anchor element in a simple repository page to a Link.
254+
Convert an anchor element's attributes in a simple repository page to a Link.
243255
"""
244-
href = anchor.get("href")
256+
href = element_attribs.get("href")
245257
if not href:
246258
return None
247259

248260
url = _clean_link(urllib.parse.urljoin(base_url, href))
249-
pyrequire = anchor.get("data-requires-python")
250-
yanked_reason = anchor.get("data-yanked")
261+
pyrequire = element_attribs.get("data-requires-python")
262+
yanked_reason = element_attribs.get("data-yanked")
251263

252264
link = Link(
253265
url,
@@ -271,32 +283,40 @@ def __hash__(self) -> int:
271283
return hash(self.page.url)
272284

273285

274-
def with_cached_html_pages(
275-
fn: Callable[["HTMLPage"], Iterable[Link]],
276-
) -> Callable[["HTMLPage"], List[Link]]:
286+
class ParseLinks(Protocol):
    """
    Call signature of a link-parsing function.

    A conforming callable takes the page to parse plus the
    ``use_deprecated_html5lib`` flag and returns an iterable of ``Link``
    objects. ``with_cached_html_pages`` both accepts and returns a callable
    of this shape.
    """

    def __call__(
        self,
        page: "HTMLPage",
        use_deprecated_html5lib: bool,
    ) -> Iterable[Link]:
        ...
291+
292+
293+
def with_cached_html_pages(fn: ParseLinks) -> ParseLinks:
277294
"""
278295
Given a function that parses an Iterable[Link] from an HTMLPage, cache the
279296
function's result (keyed by CacheablePageContent), unless the HTMLPage
280297
`page` has `page.cache_link_parsing == False`.
281298
"""
282299

283300
@functools.lru_cache(maxsize=None)
284-
def wrapper(cacheable_page: CacheablePageContent) -> List[Link]:
285-
return list(fn(cacheable_page.page))
301+
def wrapper(
302+
cacheable_page: CacheablePageContent, use_deprecated_html5lib: bool
303+
) -> List[Link]:
304+
return list(fn(cacheable_page.page, use_deprecated_html5lib))
286305

287306
@functools.wraps(fn)
288-
def wrapper_wrapper(page: "HTMLPage") -> List[Link]:
307+
def wrapper_wrapper(page: "HTMLPage", use_deprecated_html5lib: bool) -> List[Link]:
289308
if page.cache_link_parsing:
290-
return wrapper(CacheablePageContent(page))
291-
return list(fn(page))
309+
return wrapper(CacheablePageContent(page), use_deprecated_html5lib)
310+
return list(fn(page, use_deprecated_html5lib))
292311

293312
return wrapper_wrapper
294313

295314

296-
@with_cached_html_pages
297-
def parse_links(page: "HTMLPage") -> Iterable[Link]:
315+
def _parse_links_html5lib(page: "HTMLPage") -> Iterable[Link]:
298316
"""
299317
Parse an HTML document, and yield its anchor elements as Link objects.
318+
319+
TODO: Remove when `html5lib` is dropped.
300320
"""
301321
document = html5lib.parse(
302322
page.content,
@@ -307,6 +327,31 @@ def parse_links(page: "HTMLPage") -> Iterable[Link]:
307327
url = page.url
308328
base_url = _determine_base_url(document, url)
309329
for anchor in document.findall(".//a"):
330+
link = _create_link_from_element(
331+
anchor.attrib,
332+
page_url=url,
333+
base_url=base_url,
334+
)
335+
if link is None:
336+
continue
337+
yield link
338+
339+
340+
@with_cached_html_pages
341+
def parse_links(page: "HTMLPage", use_deprecated_html5lib: bool) -> Iterable[Link]:
342+
"""
343+
Parse an HTML document, and yield its anchor elements as Link objects.
344+
"""
345+
if use_deprecated_html5lib:
346+
return _parse_links_html5lib(page)
347+
348+
parser = HTMLLinkParser()
349+
encoding = page.encoding or "utf-8"
350+
parser.feed(page.content.decode(encoding))
351+
352+
url = page.url
353+
base_url = parser.base_url or url
354+
for anchor in parser.anchors:
310355
link = _create_link_from_element(
311356
anchor,
312357
page_url=url,
@@ -343,6 +388,49 @@ def __str__(self) -> str:
343388
return redact_auth_from_url(self.url)
344389

345390

391+
class HTMLLinkParser(HTMLParser):
    """
    HTMLParser that keeps the first base HREF and a list of all anchor
    elements' attributes.

    After ``feed()``:

    * ``base_url`` holds the ``href`` of the first ``<base>`` element that
      carries a non-``None`` ``href`` value, or ``None`` if there was none.
    * ``anchors`` holds one ``{attribute: value}`` dict per ``<a>`` start tag,
      in document order.

    An HTML5 doctype declaration must be seen before any start tag;
    otherwise ``ValueError`` is raised while feeding.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        self._seen_decl = False
        self.base_url: Optional[str] = None
        self.anchors: List[Dict[str, Optional[str]]] = []

    def handle_decl(self, decl: str) -> None:
        """Validate the doctype declaration and record that it was seen.

        :param decl: The declaration text between ``<!`` and ``>``.
        :raises ValueError: If the declaration is not an HTML5 doctype.
        """
        # The HTML5 doctype is matched ASCII case-insensitively and may use
        # any run of whitespace between "DOCTYPE" and "html", so normalize
        # both before comparing. "<!doctype html>" is therefore accepted.
        normalized = " ".join(decl.split()).lower()
        if normalized != "doctype html":
            self._raise_error()
        self._seen_decl = True

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
        """Collect ``<base>`` and ``<a>`` start tags.

        :param tag: The tag name as reported by ``HTMLParser``.
        :param attrs: The tag's ``(name, value)`` attribute pairs.
        :raises ValueError: If no valid doctype has been seen yet.
        """
        if not self._seen_decl:
            self._raise_error()

        if tag == "base" and self.base_url is None:
            # Only the first usable <base href> is kept; later ones are
            # ignored once base_url has been set.
            href = self.get_href(attrs)
            if href is not None:
                self.base_url = href
        elif tag == "a":
            self.anchors.append(dict(attrs))

    def get_href(self, attrs: List[Tuple[str, Optional[str]]]) -> Optional[str]:
        """Return the value of the first ``href`` attribute, or ``None``."""
        return next((value for name, value in attrs if name == "href"), None)

    def _raise_error(self) -> None:
        """Raise the ``ValueError`` used for a missing or wrong doctype."""
        raise ValueError(
            "HTML doctype missing or incorrect. Expected <!DOCTYPE html>.\n\n"
            "If you believe this error to be incorrect, try passing the "
            "command line option --use-deprecated=html5lib and please leave "
            "a comment on the pip issue at https://github.com/pypa/pip/issues/10825."
        )
432+
433+
346434
def _handle_get_page_fail(
347435
link: Link,
348436
reason: Union[str, Exception],

src/pip/_internal/index/package_finder.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -580,6 +580,7 @@ def __init__(
580580
link_collector: LinkCollector,
581581
target_python: TargetPython,
582582
allow_yanked: bool,
583+
use_deprecated_html5lib: bool,
583584
format_control: Optional[FormatControl] = None,
584585
candidate_prefs: Optional[CandidatePreferences] = None,
585586
ignore_requires_python: Optional[bool] = None,
@@ -604,6 +605,7 @@ def __init__(
604605
self._ignore_requires_python = ignore_requires_python
605606
self._link_collector = link_collector
606607
self._target_python = target_python
608+
self._use_deprecated_html5lib = use_deprecated_html5lib
607609

608610
self.format_control = format_control
609611

@@ -620,6 +622,8 @@ def create(
620622
link_collector: LinkCollector,
621623
selection_prefs: SelectionPreferences,
622624
target_python: Optional[TargetPython] = None,
625+
*,
626+
use_deprecated_html5lib: bool,
623627
) -> "PackageFinder":
624628
"""Create a PackageFinder.
625629
@@ -644,6 +648,7 @@ def create(
644648
allow_yanked=selection_prefs.allow_yanked,
645649
format_control=selection_prefs.format_control,
646650
ignore_requires_python=selection_prefs.ignore_requires_python,
651+
use_deprecated_html5lib=use_deprecated_html5lib,
647652
)
648653

649654
@property
@@ -765,7 +770,7 @@ def process_project_url(
765770
if html_page is None:
766771
return []
767772

768-
page_links = list(parse_links(html_page))
773+
page_links = list(parse_links(html_page, self._use_deprecated_html5lib))
769774

770775
with indent_log():
771776
package_links = self.evaluate_links(

src/pip/_internal/self_outdated_check.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,9 @@ def pip_self_version_check(session: PipSession, options: optparse.Values) -> Non
141141
finder = PackageFinder.create(
142142
link_collector=link_collector,
143143
selection_prefs=selection_prefs,
144+
use_deprecated_html5lib=(
145+
"html5lib" in options.deprecated_features_enabled
146+
),
144147
)
145148
best_candidate = finder.find_best_candidate("pip").best_candidate
146149
if best_candidate is None:

tests/data/indexes/datarequire/fakepackage/index.html

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
<!DOCTYPE html>
12
<html><head><title>Links for fakepackage</title><meta name="api-version" value="2" /></head><body><h1>Links for fakepackage</h1>
23
<a data-requires-python='' href="/fakepackage-1.0.0.tar.gz#md5=00000000000000000000000000000000" rel="internal">fakepackage-1.0.0.tar.gz</a><br/>
34
<a data-requires-python='&lt;2.7' href="/fakepackage-2.6.0.tar.gz#md5=00000000000000000000000000000000" rel="internal">fakepackage-2.6.0.tar.gz</a><br/>

tests/data/indexes/dev/bar/index.html

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
<!DOCTYPE html>
12
<html>
23
<body>
34
<a href="bar-1.0.tar.gz">bar-1.0.tar.gz</a>

tests/data/indexes/in dex/simple/index.html

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
<!DOCTYPE html>
12
<html>
23
<body>
34
<a href="../../../packages/simple-1.0.tar.gz#md5=4bdf78ebb7911f215c1972cf71b378f0">simple-1.0.tar.gz</a>

tests/data/indexes/pre/bar/index.html

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
<!DOCTYPE html>
12
<html>
23
<body>
34
<a href="bar-1.0.tar.gz">bar-1.0.tar.gz</a>

tests/data/indexes/simple/simple/index.html

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
<!DOCTYPE html>
12
<html>
23
<body>
34
<a href="../../../packages/simple-1.0.tar.gz#md5=4bdf78ebb7911f215c1972cf71b378f0">simple-1.0.tar.gz</a>

tests/data/indexes/yanked/simple/index.html

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
<!DOCTYPE html>
12
<html>
23
<body>
34
<a href="../../../packages/simple-1.0.tar.gz">simple-1.0.tar.gz</a>

tests/data/indexes/yanked_all/simple/index.html

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
<!DOCTYPE html>
12
<html>
23
<body>
34
<a data-yanked="test reason message" href="../../../packages/simple-1.0.tar.gz">simple-1.0.tar.gz</a>

tests/data/packages3/dinner/index.html

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
<!DOCTYPE html>
12
<html><head><title>PyPI Mirror</title></head>
23
<body>
34
<h1>PyPI Mirror</h1>

tests/data/packages3/index.html

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
<!DOCTYPE html>
12
<html><head><title>PyPI Mirror</title></head>
23
<body>
34
<h1>PyPI Mirror</h1>

tests/data/packages3/requiredinner/index.html

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
<!DOCTYPE html>
12
<html><head><title>PyPI Mirror</title></head>
23
<body>
34
<h1>PyPI Mirror</h1>

tests/functional/test_build_env.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ def run_with_build_env(
4848
finder = PackageFinder.create(
4949
link_collector=link_collector,
5050
selection_prefs=selection_prefs,
51+
use_deprecated_html5lib=False,
5152
)
5253
5354
with global_tempdir_manager():

tests/functional/test_new_resolver_hashes.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ def _create_find_links(script: PipTestEnvironment) -> _FindLinks:
2626
index_html = script.scratch_path / "index.html"
2727
index_html.write_text(
2828
"""
29+
<!DOCTYPE html>
2930
<a href="{sdist_url}#sha256={sdist_hash}">{sdist_path.stem}</a>
3031
<a href="{wheel_url}#sha256={wheel_hash}">{wheel_path.stem}</a>
3132
""".format(

tests/lib/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,7 @@ def make_test_finder(
141141
allow_all_prereleases: bool = False,
142142
session: Optional[PipSession] = None,
143143
target_python: Optional[TargetPython] = None,
144+
use_deprecated_html5lib: bool = False,
144145
) -> PackageFinder:
145146
"""
146147
Create a PackageFinder for testing purposes.
@@ -159,6 +160,7 @@ def make_test_finder(
159160
link_collector=link_collector,
160161
selection_prefs=selection_prefs,
161162
target_python=target_python,
163+
use_deprecated_html5lib=use_deprecated_html5lib,
162164
)
163165

164166

tests/unit/resolution_resolvelib/conftest.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ def finder(data: TestData) -> Iterator[PackageFinder]:
2626
scope = SearchScope([str(data.packages)], [])
2727
collector = LinkCollector(session, scope)
2828
prefs = SelectionPreferences(allow_yanked=False)
29-
finder = PackageFinder.create(collector, prefs)
29+
finder = PackageFinder.create(collector, prefs, use_deprecated_html5lib=False)
3030
yield finder
3131

3232

0 commit comments

Comments
 (0)