Skip to content

Drop html5lib #11259

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits on
Jul 16, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions news/10825.removal.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Remove the ``html5lib`` deprecated feature flag.
1 change: 1 addition & 0 deletions news/html5lib.vendor.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Remove vendored html5lib.
1 change: 0 additions & 1 deletion src/pip/_internal/cli/cmdoptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1013,7 +1013,6 @@ def check_list_path_option(options: Values) -> None:
default=[],
choices=[
"legacy-resolver",
"html5lib",
],
help=("Enable deprecated functionality, that will be removed in the future."),
)
Expand Down
1 change: 0 additions & 1 deletion src/pip/_internal/cli/req_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -499,5 +499,4 @@ def _build_package_finder(
link_collector=link_collector,
selection_prefs=selection_prefs,
target_python=target_python,
use_deprecated_html5lib="html5lib" in options.deprecated_features_enabled,
)
1 change: 0 additions & 1 deletion src/pip/_internal/commands/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,6 @@ def _build_package_finder(
link_collector=link_collector,
selection_prefs=selection_prefs,
target_python=target_python,
use_deprecated_html5lib="html5lib" in options.deprecated_features_enabled,
)

def get_available_package_versions(self, options: Values, args: List[Any]) -> None:
Expand Down
1 change: 0 additions & 1 deletion src/pip/_internal/commands/list.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,6 @@ def _build_package_finder(
return PackageFinder.create(
link_collector=link_collector,
selection_prefs=selection_prefs,
use_deprecated_html5lib="html5lib" in options.deprecated_features_enabled,
)

def run(self, options: Values, args: List[str]) -> int:
Expand Down
72 changes: 8 additions & 64 deletions src/pip/_internal/index/collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
Union,
)

from pip._vendor import html5lib, requests
from pip._vendor import requests
from pip._vendor.requests import Response
from pip._vendor.requests.exceptions import RetryError, SSLError

Expand Down Expand Up @@ -191,27 +191,6 @@ def _get_encoding_from_headers(headers: ResponseHeaders) -> Optional[str]:
return None


def _determine_base_url(document: HTMLElement, page_url: str) -> str:
"""Determine the HTML document's base URL.

This looks for a ``<base>`` tag in the HTML document. If present, its href
attribute denotes the base URL of anchor tags in the document. If there is
no such tag (or if it does not have a valid href attribute), the HTML
file's URL is used as the base URL.

:param document: An HTML document representation. The current
implementation expects the result of ``html5lib.parse()``.
:param page_url: The URL of the HTML document.

TODO: Remove when `html5lib` is dropped.
"""
for base in document.findall(".//base"):
href = base.get("href")
if href is not None:
return href
return page_url


def _clean_url_path_part(part: str) -> str:
"""
Clean a "part" of a URL path (i.e. after splitting on "@" characters).
Expand Down Expand Up @@ -313,9 +292,7 @@ def __hash__(self) -> int:


class ParseLinks(Protocol):
def __call__(
self, page: "IndexContent", use_deprecated_html5lib: bool
) -> Iterable[Link]:
def __call__(self, page: "IndexContent") -> Iterable[Link]:
...


Expand All @@ -327,49 +304,20 @@ def with_cached_index_content(fn: ParseLinks) -> ParseLinks:
"""

@functools.lru_cache(maxsize=None)
def wrapper(
cacheable_page: CacheablePageContent, use_deprecated_html5lib: bool
) -> List[Link]:
return list(fn(cacheable_page.page, use_deprecated_html5lib))
def wrapper(cacheable_page: CacheablePageContent) -> List[Link]:
return list(fn(cacheable_page.page))

@functools.wraps(fn)
def wrapper_wrapper(
page: "IndexContent", use_deprecated_html5lib: bool
) -> List[Link]:
def wrapper_wrapper(page: "IndexContent") -> List[Link]:
if page.cache_link_parsing:
return wrapper(CacheablePageContent(page), use_deprecated_html5lib)
return list(fn(page, use_deprecated_html5lib))
return wrapper(CacheablePageContent(page))
return list(fn(page))

return wrapper_wrapper


def _parse_links_html5lib(page: "IndexContent") -> Iterable[Link]:
"""
Parse an HTML document, and yield its anchor elements as Link objects.

TODO: Remove when `html5lib` is dropped.
"""
document = html5lib.parse(
page.content,
transport_encoding=page.encoding,
namespaceHTMLElements=False,
)

url = page.url
base_url = _determine_base_url(document, url)
for anchor in document.findall(".//a"):
link = _create_link_from_element(
anchor.attrib,
page_url=url,
base_url=base_url,
)
if link is None:
continue
yield link


@with_cached_index_content
def parse_links(page: "IndexContent", use_deprecated_html5lib: bool) -> Iterable[Link]:
def parse_links(page: "IndexContent") -> Iterable[Link]:
"""
Parse a Simple API's Index Content, and yield its anchor elements as Link objects.
"""
Expand Down Expand Up @@ -398,10 +346,6 @@ def parse_links(page: "IndexContent", use_deprecated_html5lib: bool) -> Iterable
hashes=file.get("hashes", {}),
)

if use_deprecated_html5lib:
yield from _parse_links_html5lib(page)
return

parser = HTMLLinkParser(page.url)
encoding = page.encoding or "utf-8"
parser.feed(page.content.decode(encoding))
Expand Down
7 changes: 1 addition & 6 deletions src/pip/_internal/index/package_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -598,7 +598,6 @@ def __init__(
link_collector: LinkCollector,
target_python: TargetPython,
allow_yanked: bool,
use_deprecated_html5lib: bool,
format_control: Optional[FormatControl] = None,
candidate_prefs: Optional[CandidatePreferences] = None,
ignore_requires_python: Optional[bool] = None,
Expand All @@ -623,7 +622,6 @@ def __init__(
self._ignore_requires_python = ignore_requires_python
self._link_collector = link_collector
self._target_python = target_python
self._use_deprecated_html5lib = use_deprecated_html5lib

self.format_control = format_control

Expand All @@ -640,8 +638,6 @@ def create(
link_collector: LinkCollector,
selection_prefs: SelectionPreferences,
target_python: Optional[TargetPython] = None,
*,
use_deprecated_html5lib: bool,
) -> "PackageFinder":
"""Create a PackageFinder.

Expand All @@ -666,7 +662,6 @@ def create(
allow_yanked=selection_prefs.allow_yanked,
format_control=selection_prefs.format_control,
ignore_requires_python=selection_prefs.ignore_requires_python,
use_deprecated_html5lib=use_deprecated_html5lib,
)

@property
Expand Down Expand Up @@ -796,7 +791,7 @@ def process_project_url(
if index_response is None:
return []

page_links = list(parse_links(index_response, self._use_deprecated_html5lib))
page_links = list(parse_links(index_response))

with indent_log():
package_links = self.evaluate_links(
Expand Down
1 change: 0 additions & 1 deletion src/pip/_internal/self_outdated_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,6 @@ def _get_current_remote_pip_version(
finder = PackageFinder.create(
link_collector=link_collector,
selection_prefs=selection_prefs,
use_deprecated_html5lib=("html5lib" in options.deprecated_features_enabled),
)
best_candidate = finder.find_best_candidate("pip").best_candidate
if best_candidate is None:
Expand Down
3 changes: 0 additions & 3 deletions src/pip/_vendor/README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -104,9 +104,6 @@ Modifications
rather than ``appdirs``.
* ``packaging`` has been modified to import its dependencies from
``pip._vendor``.
* ``html5lib`` has been modified to import six from ``pip._vendor``, to prefer
importing from ``collections.abc`` instead of ``collections`` and does not
import ``xml.etree.cElementTree`` on Python 3.
* ``CacheControl`` has been modified to import its dependencies from
``pip._vendor``.
* ``requests`` has been modified to import its other dependencies from
Expand Down
1 change: 0 additions & 1 deletion src/pip/_vendor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,6 @@ def vendored(modulename):
vendored("colorama")
vendored("distlib")
vendored("distro")
vendored("html5lib")
vendored("six")
vendored("six.moves")
vendored("six.moves.urllib")
Expand Down
1 change: 0 additions & 1 deletion src/pip/_vendor/html5lib.pyi

This file was deleted.

20 changes: 0 additions & 20 deletions src/pip/_vendor/html5lib/LICENSE

This file was deleted.

35 changes: 0 additions & 35 deletions src/pip/_vendor/html5lib/__init__.py

This file was deleted.

Loading