From 630dd0a6b1a757ba57ca72cb0bb64433e7198169 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith" Date: Sat, 1 May 2021 15:48:32 -0700 Subject: [PATCH 1/6] bpo-44002: Switch to lru_cache in urllib.parse. urllib.parse now uses functool.lru_cache for its internal URL splitting and quoting caches instead of rolling its own like its the 90s. The undocumented internal Quoted class and clear_cache() APIs are now deprecated, for removal in 3.14. --- Lib/test/test_urlparse.py | 24 ++++++- Lib/urllib/parse.py | 63 ++++++++++--------- .../2021-05-01-15-43-37.bpo-44002.KLT_wd.rst | 5 ++ 3 files changed, 61 insertions(+), 31 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2021-05-01-15-43-37.bpo-44002.KLT_wd.rst diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py index 67341fecef94cd..bfef6de0aa9823 100644 --- a/Lib/test/test_urlparse.py +++ b/Lib/test/test_urlparse.py @@ -2,6 +2,7 @@ import unicodedata import unittest import urllib.parse +import warnings RFC1808_BASE = "http://a/b/c/d;p?q#f" RFC2396_BASE = "http://a/b/c/d;p?q" @@ -1035,8 +1036,29 @@ def test_telurl_params(self): self.assertEqual(p1.path, '863-1234') self.assertEqual(p1.params, 'phone-context=+1-914-555') + def test_clear_cache_deprecation(self): + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + urllib.parse.clear_cache() + self.assertEqual(len(w), 1, msg=repr(w)) + v = sys.version_info + exc = PendingDeprecationWarning if v <= (3, 11) else DeprecationWarning + self.assertIs(w[0].category, exc) + self.assertIn('clear_cache() will be removed', str(w[0].message)) + + def test_Quoter_deprecation(self): + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + old_class = urllib.parse.Quoter + self.assertIs(old_class, urllib.parse._Quoter) + self.assertEqual(len(w), 1, msg=repr(w)) + v = sys.version_info + exc = PendingDeprecationWarning if v <= (3, 11) else DeprecationWarning + self.assertIs(w[0].category, exc) + self.assertIn('Quoter will be removed', str(w[0].message)) + def test_Quoter_repr(self): - quoter = urllib.parse.Quoter(urllib.parse._ALWAYS_SAFE) + quoter = urllib.parse._Quoter(urllib.parse._ALWAYS_SAFE) self.assertIn('Quoter', repr(quoter)) def test_all(self): diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index 4249163f0edde7..b7fea437ac63ae 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -27,10 +27,11 @@ test_urlparse.py provides a good indicator of parsing behavior. """ +from collections import namedtuple +import functools import re import sys import types -import collections import warnings __all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag", @@ -81,15 +82,15 @@ # Unsafe bytes to be removed per WHATWG spec _UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n'] -# XXX: Consider replacing with functools.lru_cache -MAX_CACHE_SIZE = 20 -_parse_cache = {} - def clear_cache(): """Clear the parse cache and the quoters cache.""" - _parse_cache.clear() - _safe_quoters.clear() - + warnings.warn( + 'Deprecated in 3.11. ' + 'urllib.parse.clear_cache() will be removed in Python 3.14. ' + 'Use urllib.parse.urlsplit.cache_clear() on Python 3.11 or later.', + PendingDeprecationWarning) + urlsplit.cache_clear() + _byte_quoter_factory.cache_clear() # Helpers for bytes handling # For 3.2, we deliberately require applications that @@ -243,8 +244,6 @@ def _hostinfo(self): return hostname, port -from collections import namedtuple - _DefragResultBase = namedtuple('DefragResult', 'url fragment') _SplitResultBase = namedtuple( 'SplitResult', 'scheme netloc path query fragment') @@ -434,6 +433,8 @@ def _checknetloc(netloc): raise ValueError("netloc '" + netloc + "' contains invalid " + "characters under NFKC normalization") +# Cache size chosen by random die roll; prior to functools it was 20 in 1997. +@functools.lru_cache(maxsize=99) def urlsplit(url, scheme='', allow_fragments=True): """Parse a URL into 5 components: :///?# @@ -457,12 +458,6 @@ def urlsplit(url, scheme='', allow_fragments=True): url, scheme, _coerce_result = _coerce_args(url, scheme) allow_fragments = bool(allow_fragments) - key = url, scheme, allow_fragments, type(url), type(scheme) - cached = _parse_cache.get(key, None) - if cached: - return _coerce_result(cached) - if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth - clear_cache() netloc = query = fragment = '' i = url.find(':') if i > 0: @@ -486,7 +481,6 @@ def urlsplit(url, scheme='', allow_fragments=True): url, query = url.split('?', 1) _checknetloc(netloc) v = SplitResult(scheme, netloc, url, query, fragment) - _parse_cache[key] = v return _coerce_result(v) def urlunparse(components): @@ -789,23 +783,30 @@ def unquote_plus(string, encoding='utf-8', errors='replace'): b'0123456789' b'_.-~') _ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE) -_safe_quoters = {} -class Quoter(collections.defaultdict): - """A mapping from bytes (in range(0,256)) to strings. +def __getattr__(name): + if name == 'Quoter': + warnings.warn('Deprecated in 3.11. ' + 'urllib.parse.Quoter will be removed in Python 3.14. ' + 'It was not intended to be a public API.', + PendingDeprecationWarning, stacklevel=2) + return _Quoter + raise AttributeError(f'module {__name__!r} has no attribute {name!r}') + +class _Quoter(dict): + """A mapping from bytes numbers (in range(0,256)) to strings. String values are percent-encoded byte values, unless the key < 128, and - in the "safe" set (either the specified safe set, or default set). + in either of the specified safe set, or the always safe set. """ - # Keeps a cache internally, using defaultdict, for efficiency (lookups + # Keeps a cache internally, via __missing__, for efficiency (lookups # of cached keys don't call Python code at all). def __init__(self, safe): """safe: bytes object.""" self.safe = _ALWAYS_SAFE.union(safe) def __repr__(self): - # Without this, will just display as a defaultdict - return "<%s %r>" % (self.__class__.__name__, dict(self)) + return f"" def __missing__(self, b): # Handle a cache miss. Store quoted string in cache and return. @@ -884,6 +885,11 @@ def quote_plus(string, safe='', encoding=None, errors=None): string = quote(string, safe + space, encoding, errors) return string.replace(' ', '+') +# Expectation: A typical program is unlikely to create more than 5 of these. +@functools.lru_cache(maxsize=30) +def _byte_quoter_factory(safe): + return _Quoter(safe).__getitem__ + def quote_from_bytes(bs, safe='/'): """Like quote(), but accepts a bytes object rather than a str, and does not perform string-to-bytes encoding. It always returns an ASCII string. @@ -897,14 +903,11 @@ def quote_from_bytes(bs, safe='/'): # Normalize 'safe' by converting to bytes and removing non-ASCII chars safe = safe.encode('ascii', 'ignore') else: - safe = bytes([c for c in safe if c < 128]) + safe = bytes(c for c in safe if c < 128) if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe): return bs.decode() - try: - quoter = _safe_quoters[safe] - except KeyError: - _safe_quoters[safe] = quoter = Quoter(safe).__getitem__ - return ''.join([quoter(char) for char in bs]) + quoter = _byte_quoter_factory(safe) + return ''.join(quoter(char_num) for char_num in bs) def urlencode(query, doseq=False, safe='', encoding=None, errors=None, quote_via=quote_plus): diff --git a/Misc/NEWS.d/next/Library/2021-05-01-15-43-37.bpo-44002.KLT_wd.rst b/Misc/NEWS.d/next/Library/2021-05-01-15-43-37.bpo-44002.KLT_wd.rst new file mode 100644 index 00000000000000..e91dbcaaf65c35 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2021-05-01-15-43-37.bpo-44002.KLT_wd.rst @@ -0,0 +1,5 @@ +:mod:`urllib.parse` now uses :func:`functool.lru_cache` for its internal URL +splitting and quoting caches instead of rolling its own like its the 90s. + +The undocumented internal :mod:`urllib.parse` ``Quoted`` class and +``clear_cache()`` APIs are now deprecated, for removal in 3.14. From 7397c2c71faf8c201d9c57d0139273fd414644d0 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith" Date: Sat, 1 May 2021 18:20:58 -0700 Subject: [PATCH 2/6] Use the default lru_cache size. --- Lib/urllib/parse.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index b7fea437ac63ae..50fe6f7a5e2ce1 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -433,8 +433,7 @@ def _checknetloc(netloc): raise ValueError("netloc '" + netloc + "' contains invalid " + "characters under NFKC normalization") -# Cache size chosen by random die roll; prior to functools it was 20 in 1997. -@functools.lru_cache(maxsize=99) +@functools.lru_cache def urlsplit(url, scheme='', allow_fragments=True): """Parse a URL into 5 components: :///?# @@ -886,7 +885,7 @@ def quote_plus(string, safe='', encoding=None, errors=None): return string.replace(' ', '+') # Expectation: A typical program is unlikely to create more than 5 of these. -@functools.lru_cache(maxsize=30) +@functools.lru_cache def _byte_quoter_factory(safe): return _Quoter(safe).__getitem__ From 8a54534eb6f0c76900700dd64ed075a93d1c7266 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith" Date: Sat, 1 May 2021 18:58:26 -0700 Subject: [PATCH 3/6] Use a typed lru_cache on urlsplit. --- Lib/urllib/parse.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index 50fe6f7a5e2ce1..654b2ad274b406 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -433,7 +433,9 @@ def _checknetloc(netloc): raise ValueError("netloc '" + netloc + "' contains invalid " + "characters under NFKC normalization") -@functools.lru_cache +# typed=True avoids BytesWarnings being emitted during cache key +# comparison since this API supports both bytes and str input. +@functools.lru_cache(typed=True) def urlsplit(url, scheme='', allow_fragments=True): """Parse a URL into 5 components: :///?# From 7b70e99f80ce712650abc45fabf12326b7c22d69 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith" Date: Sat, 1 May 2021 23:49:33 -0700 Subject: [PATCH 4/6] Restore the list comprehensions. --- Lib/urllib/parse.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index 654b2ad274b406..3b9f23cee34ff3 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -904,11 +904,12 @@ def quote_from_bytes(bs, safe='/'): # Normalize 'safe' by converting to bytes and removing non-ASCII chars safe = safe.encode('ascii', 'ignore') else: - safe = bytes(c for c in safe if c < 128) + # List comprehensions are faster than generator expressions. + safe = bytes([c for c in safe if c < 128]) if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe): return bs.decode() quoter = _byte_quoter_factory(safe) - return ''.join(quoter(char_num) for char_num in bs) + return ''.join([quoter(char) for char in bs]) def urlencode(query, doseq=False, safe='', encoding=None, errors=None, quote_via=quote_plus): From c2dbd84110cf439b88016db20e92c8a55f463748 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith [Google LLC]" Date: Mon, 3 May 2021 16:34:20 -0700 Subject: [PATCH 5/6] don't deprecate clear_cache(). twisted, gevent, and our own libregrtest use it in test suites. We'll keep it for them. ... and skip Pending on Quoter's DeprecationWarning. No uses of that have been found externally. Also reoragnizes the unittest a bit to be cleaner and fixes a whitespace problem. --- Lib/test/test_urlparse.py | 33 ++++++------------- Lib/urllib/parse.py | 11 ++----- .../2021-05-01-15-43-37.bpo-44002.KLT_wd.rst | 6 ++-- 3 files changed, 16 insertions(+), 34 deletions(-) diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py index bfef6de0aa9823..de5f7712bc8b1c 100644 --- a/Lib/test/test_urlparse.py +++ b/Lib/test/test_urlparse.py @@ -2,7 +2,6 @@ import unicodedata import unittest import urllib.parse -import warnings RFC1808_BASE = "http://a/b/c/d;p?q#f" RFC2396_BASE = "http://a/b/c/d;p?q" @@ -1036,38 +1035,20 @@ def test_telurl_params(self): self.assertEqual(p1.path, '863-1234') self.assertEqual(p1.params, 'phone-context=+1-914-555') - def test_clear_cache_deprecation(self): - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - urllib.parse.clear_cache() - self.assertEqual(len(w), 1, msg=repr(w)) - v = sys.version_info - exc = PendingDeprecationWarning if v <= (3, 11) else DeprecationWarning - self.assertIs(w[0].category, exc) - self.assertIn('clear_cache() will be removed', str(w[0].message)) - - def test_Quoter_deprecation(self): - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - old_class = urllib.parse.Quoter - self.assertIs(old_class, urllib.parse._Quoter) - self.assertEqual(len(w), 1, msg=repr(w)) - v = sys.version_info - exc = PendingDeprecationWarning if v <= (3, 11) else DeprecationWarning - self.assertIs(w[0].category, exc) - self.assertIn('Quoter will be removed', str(w[0].message)) - def test_Quoter_repr(self): quoter = urllib.parse._Quoter(urllib.parse._ALWAYS_SAFE) self.assertIn('Quoter', repr(quoter)) + def test_clear_cache_for_code_coverage(self): + urllib.parse.clear_cache() + def test_all(self): expected = [] undocumented = { 'splitattr', 'splithost', 'splitnport', 'splitpasswd', 'splitport', 'splitquery', 'splittag', 'splittype', 'splituser', 'splitvalue', - 'Quoter', 'ResultBase', 'clear_cache', 'to_bytes', 'unwrap', + 'ResultBase', 'clear_cache', 'to_bytes', 'unwrap', } for name in dir(urllib.parse): if name.startswith('_') or name in undocumented: @@ -1259,6 +1240,12 @@ def test_unwrap(self): class DeprecationTest(unittest.TestCase): + def test_Quoter_deprecation(self): + with self.assertWarns(DeprecationWarning) as cm: + old_class = urllib.parse.Quoter + self.assertIs(old_class, urllib.parse._Quoter) + self.assertIn('Quoter will be removed', str(cm.warning)) + def test_splittype_deprecation(self): with self.assertWarns(DeprecationWarning) as cm: urllib.parse.splittype('') diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index 3b9f23cee34ff3..6ef42a462752ca 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -83,12 +83,7 @@ _UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n'] def clear_cache(): - """Clear the parse cache and the quoters cache.""" - warnings.warn( - 'Deprecated in 3.11. ' - 'urllib.parse.clear_cache() will be removed in Python 3.14. ' - 'Use urllib.parse.urlsplit.cache_clear() on Python 3.11 or later.', - PendingDeprecationWarning) + """Clear internal performance caches. Undocumented; some tests want it.""" urlsplit.cache_clear() _byte_quoter_factory.cache_clear() @@ -790,7 +785,7 @@ def __getattr__(name): warnings.warn('Deprecated in 3.11. ' 'urllib.parse.Quoter will be removed in Python 3.14. ' 'It was not intended to be a public API.', - PendingDeprecationWarning, stacklevel=2) + DeprecationWarning, stacklevel=2) return _Quoter raise AttributeError(f'module {__name__!r} has no attribute {name!r}') @@ -889,7 +884,7 @@ def quote_plus(string, safe='', encoding=None, errors=None): # Expectation: A typical program is unlikely to create more than 5 of these. @functools.lru_cache def _byte_quoter_factory(safe): - return _Quoter(safe).__getitem__ + return _Quoter(safe).__getitem__ def quote_from_bytes(bs, safe='/'): """Like quote(), but accepts a bytes object rather than a str, and does diff --git a/Misc/NEWS.d/next/Library/2021-05-01-15-43-37.bpo-44002.KLT_wd.rst b/Misc/NEWS.d/next/Library/2021-05-01-15-43-37.bpo-44002.KLT_wd.rst index e91dbcaaf65c35..9d662d9827a91d 100644 --- a/Misc/NEWS.d/next/Library/2021-05-01-15-43-37.bpo-44002.KLT_wd.rst +++ b/Misc/NEWS.d/next/Library/2021-05-01-15-43-37.bpo-44002.KLT_wd.rst @@ -1,5 +1,5 @@ :mod:`urllib.parse` now uses :func:`functool.lru_cache` for its internal URL -splitting and quoting caches instead of rolling its own like its the 90s. +splitting and quoting caches instead of rolling its own like its the '90s. -The undocumented internal :mod:`urllib.parse` ``Quoted`` class and -``clear_cache()`` APIs are now deprecated, for removal in 3.14. +The undocumented internal :mod:`urllib.parse` ``Quoted`` class API is now +deprecated, for removal in 3.14. From 1129c9b4301de4faaa9101df4c0c6b868458f46b Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith" Date: Tue, 11 May 2021 15:56:00 -0700 Subject: [PATCH 6/6] Add a test for the module level __getattr__(). --- Lib/test/test_urlparse.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py index ea1293ccc93a9c..dff9a8ede9b601 100644 --- a/Lib/test/test_urlparse.py +++ b/Lib/test/test_urlparse.py @@ -1050,6 +1050,11 @@ def test_Quoter_repr(self): def test_clear_cache_for_code_coverage(self): urllib.parse.clear_cache() + def test_urllib_parse_getattr_failure(self): + """Test that urllib.parse.__getattr__() fails correctly.""" + with self.assertRaises(AttributeError): + unused = urllib.parse.this_does_not_exist + def test_all(self): expected = [] undocumented = {