From bbfd404a23191f4b59ae71ee53129fe4de6e0799 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sun, 18 Jun 2023 19:10:24 +0100 Subject: [PATCH 01/22] GH-72904: Add optional *seps* argument to `fnmatch.translate()` If a sequence of path separators is given to the new argument, `translate()` produces a pattern that matches similarly to `pathlib.Path.glob()`. Specifically: - A `*` pattern segment matches precisely one path segment. - A `**` pattern segment matches any number of path segments - If `**` appears in any other position within the pattern, `ValueError` is raised. - `*` and `?` wildcards in other positions don't match path separators. This change allows us to factor out a lot of complex code in pathlib. --- Doc/library/fnmatch.rst | 17 +- Lib/fnmatch.py | 34 +++- Lib/pathlib.py | 157 +++++------------- Lib/test/test_fnmatch.py | 10 ++ ...3-07-13-00-24-52.gh-issue-72904.Yn5-j0.rst | 3 + 5 files changed, 96 insertions(+), 125 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2023-07-13-00-24-52.gh-issue-72904.Yn5-j0.rst diff --git a/Doc/library/fnmatch.rst b/Doc/library/fnmatch.rst index aed8991d44772f..55ae453c2f9686 100644 --- a/Doc/library/fnmatch.rst +++ b/Doc/library/fnmatch.rst @@ -82,7 +82,7 @@ cache the compiled regex patterns in the following functions: :func:`fnmatch`, ``[n for n in names if fnmatch(n, pattern)]``, but implemented more efficiently. -.. function:: translate(pattern) +.. function:: translate(pattern, seps='') Return the shell-style *pattern* converted to a regular expression for using with :func:`re.match`. @@ -98,6 +98,21 @@ cache the compiled regex patterns in the following functions: :func:`fnmatch`, >>> reobj.match('foobar.txt') + A sequence of path separator characters may be supplied to the *seps* + argument. If given, the separators are used to split the pattern into + segments, where: + + - A ``*`` pattern segment matches precisely one path segment. + - A ``**`` pattern segment matches any number of path segments. + - If ``**`` appears in any other position within the pattern, + :exc:`ValueError` is raised. + - ``*`` and ``?`` wildcards in other positions don't match path separators. + + This closely approximates the matching rules of the :mod:`glob` module. + + .. versionchanged:: 3.13 + The *seps* parameter was added. + .. seealso:: diff --git a/Lib/fnmatch.py b/Lib/fnmatch.py index d5e296f7748c1c..14d8c3d214b56a 100644 --- a/Lib/fnmatch.py +++ b/Lib/fnmatch.py @@ -71,13 +71,19 @@ def fnmatchcase(name, pat): return match(name) is not None -def translate(pat): +def translate(pat, seps=None): """Translate a shell PATTERN to a regular expression. There is no way to quote meta-characters. """ STAR = object() + if seps: + SEPS = re.escape(seps) + DOT = f'[^{SEPS}]' + else: + SEPS = None + DOT = '.' res = [] add = res.append i, n = 0, len(pat) @@ -86,10 +92,30 @@ def translate(pat): i = i+1 if c == '*': # compress consecutive `*` into one - if (not res) or res[-1] is not STAR: + h = i - 1 + while i < n and pat[i] == '*': + i = i + 1 + + if seps: + star_count = i - h + is_segment = (h == 0 or pat[h - 1] in seps) and (i == n or pat[i] in seps) + if star_count == 1: + if is_segment: + add(f'{DOT}+') + else: + add(f'{DOT}*') + elif star_count == 2 and is_segment: + if i == n: + add('.*') + else: + add(f'(.*[{SEPS}])?') + i += 1 + else: + raise ValueError("Invalid pattern: '**' can only be an entire path component") + else: add(STAR) elif c == '?': - add('.') + add(DOT) elif c == '[': j = i if j < n and pat[j] == '!': @@ -136,7 +162,7 @@ def translate(pat): add('(?!)') elif stuff == '!': # Negated empty range: match any character. - add('.') + add(DOT) else: if stuff[0] == '!': stuff = '^' + stuff[1:] diff --git a/Lib/pathlib.py b/Lib/pathlib.py index f3813e04109904..20a0684b6762c1 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -64,78 +64,12 @@ def _is_case_sensitive(flavour): # -# fnmatch.translate() returns a regular expression that includes a prefix and -# a suffix, which enable matching newlines and ensure the end of the string is -# matched, respectively. These features are undesirable for our implementation -# of PurePatch.match(), which represents path separators as newlines and joins -# pattern segments together. As a workaround, we define a slice object that -# can remove the prefix and suffix from any translate() result. See the -# _compile_pattern_lines() function for more details. -_FNMATCH_PREFIX, _FNMATCH_SUFFIX = fnmatch.translate('_').split('_') -_FNMATCH_SLICE = slice(len(_FNMATCH_PREFIX), -len(_FNMATCH_SUFFIX)) -_SWAP_SEP_AND_NEWLINE = { - '/': str.maketrans({'/': '\n', '\n': '/'}), - '\\': str.maketrans({'\\': '\n', '\n': '\\'}), -} - - @functools.lru_cache(maxsize=256) -def _compile_pattern(pat, case_sensitive): +def _compile_pattern(pat, sep, case_sensitive): """Compile given glob pattern to a re.Pattern object (observing case sensitivity), or None if the pattern should match everything.""" - if pat == '*': - return None flags = re.NOFLAG if case_sensitive else re.IGNORECASE - return re.compile(fnmatch.translate(pat), flags).match - - -@functools.lru_cache() -def _compile_pattern_lines(pattern_lines, case_sensitive): - """Compile the given pattern lines to an `re.Pattern` object. - - The *pattern_lines* argument is a glob-style pattern (e.g. '**/*.py') with - its path separators and newlines swapped (e.g. '**\n*.py`). By using - newlines to separate path components, and not setting `re.DOTALL`, we - ensure that the `*` wildcard cannot match path separators. - - The returned `re.Pattern` object may have its `match()` method called to - match a complete pattern, or `search()` to match from the right. The - argument supplied to these methods must also have its path separators and - newlines swapped. - """ - - # Match the start of the path, or just after a path separator - parts = ['^'] - for part in pattern_lines.splitlines(keepends=True): - if part == '*\n': - part = r'.+\n' - elif part == '*': - part = r'.+' - elif part == '**\n': - # '**/' component: we use '[\s\S]' rather than '.' so that path - # separators (i.e. newlines) are matched. The trailing '^' ensures - # we terminate after a path separator (i.e. on a new line). - part = r'[\s\S]*^' - elif part == '**': - # '**' component. - part = r'[\s\S]*' - elif '**' in part: - raise ValueError("Invalid pattern: '**' can only be an entire path component") - else: - # Any other component: pass to fnmatch.translate(). We slice off - # the common prefix and suffix added by translate() to ensure that - # re.DOTALL is not set, and the end of the string not matched, - # respectively. With DOTALL not set, '*' wildcards will not match - # path separators, because the '.' characters in the pattern will - # not match newlines. - part = fnmatch.translate(part)[_FNMATCH_SLICE] - parts.append(part) - # Match the end of the path, always. - parts.append(r'\Z') - flags = re.MULTILINE - if not case_sensitive: - flags |= re.IGNORECASE - return re.compile(''.join(parts), flags=flags) + return re.compile(fnmatch.translate(pat, sep), flags).match def _select_children(parent_paths, dir_only, follow_symlinks, match): @@ -159,7 +93,7 @@ def _select_children(parent_paths, dir_only, follow_symlinks, match): except OSError: continue name = entry.name - if match is None or match(name): + if match(name): yield parent_path._make_child_relpath(name) @@ -196,7 +130,7 @@ def _select_unique(paths): yielded = set() try: for path in paths: - path_str = str(path) + path_str = path._str if path_str not in yielded: yield path yielded.add(path_str) @@ -268,10 +202,10 @@ class PurePath: # tail are normalized. '_drv', '_root', '_tail_cached', - # The `_str` slot stores the string representation of the path, + # The `_str_cached` slot stores the string representation of the path, # computed from the drive, root and tail when `__str__()` is called # for the first time. It's used to implement `_str_normcase` - '_str', + '_str_cached', # The `_str_normcase_cached` slot stores the string path with # normalized case. It is set when the `_str_normcase` property is @@ -285,10 +219,6 @@ class PurePath: # to implement comparison methods like `__lt__()`. '_parts_normcase_cached', - # The `_lines_cached` slot stores the string path with path separators - # and newlines swapped. This is used to implement `match()`. - '_lines_cached', - # The `_hash` slot stores the hash of the case-normalized string # path. It's set when `__hash__()` is called for the first time. '_hash', @@ -375,7 +305,7 @@ def _load_parts(self): def _from_parsed_parts(self, drv, root, tail): path_str = self._format_parsed_parts(drv, root, tail) path = self.with_segments(path_str) - path._str = path_str or '.' + path._str_cached = path_str path._drv = drv path._root = root path._tail_cached = tail @@ -392,12 +322,7 @@ def _format_parsed_parts(cls, drv, root, tail): def __str__(self): """Return the string representation of the path, suitable for passing to system calls.""" - try: - return self._str - except AttributeError: - self._str = self._format_parsed_parts(self.drive, self.root, - self._tail) or '.' - return self._str + return self._str or '.' def __fspath__(self): return str(self) @@ -436,6 +361,15 @@ def as_uri(self): path = str(self) return prefix + urlquote_from_bytes(os.fsencode(path)) + @property + def _str(self): + try: + return self._str_cached + except AttributeError: + self._str_cached = self._format_parsed_parts( + self.drive, self.root, self._tail) + return self._str_cached + @property def _str_normcase(self): # String with normalized case, for hashing and equality checks @@ -443,9 +377,9 @@ def _str_normcase(self): return self._str_normcase_cached except AttributeError: if _is_case_sensitive(self._flavour): - self._str_normcase_cached = str(self) + self._str_normcase_cached = self._str else: - self._str_normcase_cached = str(self).lower() + self._str_normcase_cached = self._str.lower() return self._str_normcase_cached @property @@ -457,20 +391,6 @@ def _parts_normcase(self): self._parts_normcase_cached = self._str_normcase.split(self._flavour.sep) return self._parts_normcase_cached - @property - def _lines(self): - # Path with separators and newlines swapped, for pattern matching. - try: - return self._lines_cached - except AttributeError: - path_str = str(self) - if path_str == '.': - self._lines_cached = '' - else: - trans = _SWAP_SEP_AND_NEWLINE[self._flavour.sep] - self._lines_cached = path_str.translate(trans) - return self._lines_cached - def __eq__(self, other): if not isinstance(other, PurePath): return NotImplemented @@ -738,13 +658,16 @@ def match(self, path_pattern, *, case_sensitive=None): path_pattern = self.with_segments(path_pattern) if case_sensitive is None: case_sensitive = _is_case_sensitive(self._flavour) - pattern = _compile_pattern_lines(path_pattern._lines, case_sensitive) + sep = path_pattern._flavour.sep + pattern_str = path_pattern._str if path_pattern.drive or path_pattern.root: - return pattern.match(self._lines) is not None + pass elif path_pattern._tail: - return pattern.search(self._lines) is not None + pattern_str = f'**{sep}{pattern_str}' else: raise ValueError("empty pattern") + match = _compile_pattern(pattern_str, sep, case_sensitive) + return match(self._str) is not None # Subclassing os.PathLike makes isinstance() checks slower, @@ -1017,26 +940,19 @@ def _scandir(self): return os.scandir(self) def _make_child_relpath(self, name): - sep = self._flavour.sep - lines_name = name.replace('\n', sep) - lines_str = self._lines - path_str = str(self) + path_str = self._str tail = self._tail if tail: - path_str = f'{path_str}{sep}{name}' - lines_str = f'{lines_str}\n{lines_name}' - elif path_str != '.': + path_str = f'{path_str}{self._flavour.sep}{name}' + elif path_str: path_str = f'{path_str}{name}' - lines_str = f'{lines_str}{lines_name}' else: path_str = name - lines_str = lines_name path = self.with_segments(path_str) - path._str = path_str + path._str_cached = path_str path._drv = self.drive path._root = self.root path._tail_cached = tail + [name] - path._lines_cached = lines_str return path def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None): @@ -1082,6 +998,7 @@ def _glob(self, pattern, case_sensitive, follow_symlinks): # do not perform any filesystem access, which can be much faster! filter_paths = follow_symlinks is not None and '..' not in pattern_parts deduplicate_paths = False + sep = self._flavour.sep paths = iter([self] if self.is_dir() else []) part_idx = 0 while part_idx < len(pattern_parts): @@ -1102,9 +1019,9 @@ def _glob(self, pattern, case_sensitive, follow_symlinks): paths = _select_recursive(paths, dir_only, follow_symlinks) # Filter out paths that don't match pattern. - prefix_len = len(self._make_child_relpath('_')._lines) - 1 - match = _compile_pattern_lines(path_pattern._lines, case_sensitive).match - paths = (path for path in paths if match(path._lines[prefix_len:])) + prefix_len = len(self._make_child_relpath('_')._str) - 1 + match = _compile_pattern(path_pattern._str, sep, case_sensitive) + paths = (path for path in paths if match(path._str[prefix_len:])) return paths dir_only = part_idx < len(pattern_parts) @@ -1117,7 +1034,7 @@ def _glob(self, pattern, case_sensitive, follow_symlinks): raise ValueError("Invalid pattern: '**' can only be an entire path component") else: dir_only = part_idx < len(pattern_parts) - match = _compile_pattern(part, case_sensitive) + match = _compile_pattern(part, sep, case_sensitive) paths = _select_children(paths, dir_only, follow_symlinks, match) return paths @@ -1210,11 +1127,11 @@ def absolute(self): # Fast path for "empty" paths, e.g. Path("."), Path("") or Path(). # We pass only one argument to with_segments() to avoid the cost # of joining, and we exploit the fact that getcwd() returns a - # fully-normalized string by storing it in _str. This is used to - # implement Path.cwd(). + # fully-normalized string by storing it in _str_cached. This is + # used to implement Path.cwd(). if not self.root and not self._tail: result = self.with_segments(cwd) - result._str = cwd + result._str_cached = cwd return result return self.with_segments(cwd, self) diff --git a/Lib/test/test_fnmatch.py b/Lib/test/test_fnmatch.py index 10ed496d4e2f37..5670d573fe2023 100644 --- a/Lib/test/test_fnmatch.py +++ b/Lib/test/test_fnmatch.py @@ -250,6 +250,16 @@ def test_translate(self): self.assertTrue(re.match(fatre, 'cbabcaxc')) self.assertFalse(re.match(fatre, 'dabccbad')) + def test_translate_seps(self): + self.assertEqual(translate('*', seps='/'), r'(?s:[^/]+)\Z') + self.assertEqual(translate('?', seps='/'), r'(?s:[^/])\Z') + self.assertEqual(translate('a?b*', seps='/'), r'(?s:a[^/]b[^/]*)\Z') + self.assertEqual(translate('/**/*/*.*/**', seps='/'), + r'(?s:/(.*[/])?[^/]+/[^/]*\.[^/]*/.*)\Z') + self.assertRaises(ValueError, translate, 'a**', seps='/') + self.assertRaises(ValueError, translate, '**b', seps='/') + + class FilterTestCase(unittest.TestCase): def test_filter(self): diff --git a/Misc/NEWS.d/next/Library/2023-07-13-00-24-52.gh-issue-72904.Yn5-j0.rst b/Misc/NEWS.d/next/Library/2023-07-13-00-24-52.gh-issue-72904.Yn5-j0.rst new file mode 100644 index 00000000000000..75249391ca83ab --- /dev/null +++ b/Misc/NEWS.d/next/Library/2023-07-13-00-24-52.gh-issue-72904.Yn5-j0.rst @@ -0,0 +1,3 @@ +Add optional *seps* argument to :func:`fnmatch.translate`. If a sequence of +path separators is given, the resulting pattern matches paths similarly to +:func:`glob.glob`. From da9948d5cf7ee78a3d37d47044d636bdb58285d2 Mon Sep 17 00:00:00 2001 From: barneygale Date: Thu, 13 Jul 2023 12:43:32 +0100 Subject: [PATCH 02/22] Simplify `_make_child_relpath()` further --- Lib/pathlib.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 20a0684b6762c1..51c1875d081082 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -940,14 +940,11 @@ def _scandir(self): return os.scandir(self) def _make_child_relpath(self, name): - path_str = self._str tail = self._tail if tail: - path_str = f'{path_str}{self._flavour.sep}{name}' - elif path_str: - path_str = f'{path_str}{name}' + path_str = f'{self._str}{self._flavour.sep}{name}' else: - path_str = name + path_str = f'{self._str}{name}' path = self.with_segments(path_str) path._str_cached = path_str path._drv = self.drive From a07118bcba731d444e6295c1d189b5b6880c814e Mon Sep 17 00:00:00 2001 From: barneygale Date: Thu, 13 Jul 2023 12:46:52 +0100 Subject: [PATCH 03/22] Fix default value in docs --- Doc/library/fnmatch.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/library/fnmatch.rst b/Doc/library/fnmatch.rst index 55ae453c2f9686..7ad00cf6c775cc 100644 --- a/Doc/library/fnmatch.rst +++ b/Doc/library/fnmatch.rst @@ -82,7 +82,7 @@ cache the compiled regex patterns in the following functions: :func:`fnmatch`, ``[n for n in names if fnmatch(n, pattern)]``, but implemented more efficiently. -.. function:: translate(pattern, seps='') +.. function:: translate(pattern, seps=None) Return the shell-style *pattern* converted to a regular expression for using with :func:`re.match`. From 2728dcd268025b0b272da3d51cb4aebe48e15b75 Mon Sep 17 00:00:00 2001 From: barneygale Date: Thu, 13 Jul 2023 13:17:26 +0100 Subject: [PATCH 04/22] Match style of surrounding `fnmatch` code a little better. --- Lib/fnmatch.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/Lib/fnmatch.py b/Lib/fnmatch.py index 14d8c3d214b56a..b6181597c43bdc 100644 --- a/Lib/fnmatch.py +++ b/Lib/fnmatch.py @@ -92,13 +92,12 @@ def translate(pat, seps=None): i = i+1 if c == '*': # compress consecutive `*` into one - h = i - 1 + h = i-1 while i < n and pat[i] == '*': - i = i + 1 - + i = i+1 if seps: - star_count = i - h - is_segment = (h == 0 or pat[h - 1] in seps) and (i == n or pat[i] in seps) + star_count = i-h + is_segment = (h == 0 or pat[h-1] in seps) and (i == n or pat[i] in seps) if star_count == 1: if is_segment: add(f'{DOT}+') @@ -109,7 +108,7 @@ def translate(pat, seps=None): add('.*') else: add(f'(.*[{SEPS}])?') - i += 1 + i = i+1 else: raise ValueError("Invalid pattern: '**' can only be an entire path component") else: From a0ce9c45eca50229ed481038bbdd2846f18aad76 Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 19 Jul 2023 22:17:09 +0100 Subject: [PATCH 05/22] Docs + naming improvements --- Doc/library/fnmatch.rst | 4 +++- Lib/fnmatch.py | 8 +++++++- Lib/pathlib.py | 2 +- Lib/test/test_fnmatch.py | 2 ++ 4 files changed, 13 insertions(+), 3 deletions(-) diff --git a/Doc/library/fnmatch.rst b/Doc/library/fnmatch.rst index 7ad00cf6c775cc..fac07e293465b2 100644 --- a/Doc/library/fnmatch.rst +++ b/Doc/library/fnmatch.rst @@ -108,7 +108,9 @@ cache the compiled regex patterns in the following functions: :func:`fnmatch`, :exc:`ValueError` is raised. - ``*`` and ``?`` wildcards in other positions don't match path separators. - This closely approximates the matching rules of the :mod:`glob` module. + These rules approximate shell recursive globbing. The :mod:`pathlib` module + calls this function and supplies *seps* to implement + :meth:`~pathlib.PurePath.match` and :meth:`~pathlib.Path.glob`. .. versionchanged:: 3.13 The *seps* parameter was added. diff --git a/Lib/fnmatch.py b/Lib/fnmatch.py index b6181597c43bdc..3c6157df0ab97b 100644 --- a/Lib/fnmatch.py +++ b/Lib/fnmatch.py @@ -74,12 +74,18 @@ def fnmatchcase(name, pat): def translate(pat, seps=None): """Translate a shell PATTERN to a regular expression. + A sequence of path separator characters may be supplied to the *seps* + argument. If given, '*' and '?' wildcards will not match separators. + '*' wildcards in standalone pattern segments match precisely one path + segment; '**' wildcards in standalone segments match any number of path + segments. + There is no way to quote meta-characters. """ STAR = object() if seps: - SEPS = re.escape(seps) + SEPS = re.escape(''.join(seps)) DOT = f'[^{SEPS}]' else: SEPS = None diff --git a/Lib/pathlib.py b/Lib/pathlib.py index d84bf78c57642d..3049cfe18e069a 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -67,7 +67,7 @@ def _is_case_sensitive(pathmod): @functools.lru_cache(maxsize=256) def _compile_pattern(pat, sep, case_sensitive): """Compile given glob pattern to a re.Pattern object (observing case - sensitivity), or None if the pattern should match everything.""" + sensitivity).""" flags = re.NOFLAG if case_sensitive else re.IGNORECASE return re.compile(fnmatch.translate(pat, sep), flags).match diff --git a/Lib/test/test_fnmatch.py b/Lib/test/test_fnmatch.py index 5670d573fe2023..a2154889456e9f 100644 --- a/Lib/test/test_fnmatch.py +++ b/Lib/test/test_fnmatch.py @@ -256,6 +256,8 @@ def test_translate_seps(self): self.assertEqual(translate('a?b*', seps='/'), r'(?s:a[^/]b[^/]*)\Z') self.assertEqual(translate('/**/*/*.*/**', seps='/'), r'(?s:/(.*[/])?[^/]+/[^/]*\.[^/]*/.*)\Z') + self.assertEqual(translate('foo/bar\\baz', seps=('/', '\\')), + r'(?s:foo/bar\\baz)\Z') self.assertRaises(ValueError, translate, 'a**', seps='/') self.assertRaises(ValueError, translate, '**b', seps='/') From 5b620fb51eb5382d2ee6afbc1b5ec27883eb66fb Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 26 Jul 2023 18:45:35 +0100 Subject: [PATCH 06/22] Replace *seps* with *sep* --- Doc/library/fnmatch.rst | 11 ++++---- Lib/fnmatch.py | 25 +++++++++---------- Lib/test/test_fnmatch.py | 20 +++++++-------- ...3-07-13-00-24-52.gh-issue-72904.Yn5-j0.rst | 7 +++--- 4 files changed, 31 insertions(+), 32 deletions(-) diff --git a/Doc/library/fnmatch.rst b/Doc/library/fnmatch.rst index fac07e293465b2..e7091e31981079 100644 --- a/Doc/library/fnmatch.rst +++ b/Doc/library/fnmatch.rst @@ -82,7 +82,7 @@ cache the compiled regex patterns in the following functions: :func:`fnmatch`, ``[n for n in names if fnmatch(n, pattern)]``, but implemented more efficiently. -.. function:: translate(pattern, seps=None) +.. function:: translate(pattern, sep=None) Return the shell-style *pattern* converted to a regular expression for using with :func:`re.match`. @@ -98,9 +98,8 @@ cache the compiled regex patterns in the following functions: :func:`fnmatch`, >>> reobj.match('foobar.txt') - A sequence of path separator characters may be supplied to the *seps* - argument. If given, the separators are used to split the pattern into - segments, where: + A path separator character may be supplied to the *sep* argument. If given, + the separator is sed to split the pattern into segments, where: - A ``*`` pattern segment matches precisely one path segment. - A ``**`` pattern segment matches any number of path segments. @@ -109,11 +108,11 @@ cache the compiled regex patterns in the following functions: :func:`fnmatch`, - ``*`` and ``?`` wildcards in other positions don't match path separators. These rules approximate shell recursive globbing. The :mod:`pathlib` module - calls this function and supplies *seps* to implement + calls this function and supplies *sep* to implement :meth:`~pathlib.PurePath.match` and :meth:`~pathlib.Path.glob`. .. versionchanged:: 3.13 - The *seps* parameter was added. + The *sep* parameter was added. .. seealso:: diff --git a/Lib/fnmatch.py b/Lib/fnmatch.py index 3c6157df0ab97b..a08b8d7bdc2e1f 100644 --- a/Lib/fnmatch.py +++ b/Lib/fnmatch.py @@ -71,24 +71,23 @@ def fnmatchcase(name, pat): return match(name) is not None -def translate(pat, seps=None): +def translate(pat, sep=None): """Translate a shell PATTERN to a regular expression. - A sequence of path separator characters may be supplied to the *seps* - argument. If given, '*' and '?' wildcards will not match separators. - '*' wildcards in standalone pattern segments match precisely one path - segment; '**' wildcards in standalone segments match any number of path - segments. + A path separator character may be supplied to the *sep* argument. If + given, '*' and '?' wildcards will not match separators; '*' wildcards in + standalone pattern segments match precisely one path segment; and '**' + wildcards in standalone segments match any number of path segments. There is no way to quote meta-characters. """ STAR = object() - if seps: - SEPS = re.escape(''.join(seps)) - DOT = f'[^{SEPS}]' + if sep: + SEP = re.escape(sep) + DOT = f'[^{SEP}]' else: - SEPS = None + SEP = None DOT = '.' res = [] add = res.append @@ -101,9 +100,9 @@ def translate(pat, seps=None): h = i-1 while i < n and pat[i] == '*': i = i+1 - if seps: + if sep: star_count = i-h - is_segment = (h == 0 or pat[h-1] in seps) and (i == n or pat[i] in seps) + is_segment = (h == 0 or pat[h-1] == sep) and (i == n or pat[i] == sep) if star_count == 1: if is_segment: add(f'{DOT}+') @@ -113,7 +112,7 @@ def translate(pat, seps=None): if i == n: add('.*') else: - add(f'(.*[{SEPS}])?') + add(f'(.*{SEP})?') i = i+1 else: raise ValueError("Invalid pattern: '**' can only be an entire path component") diff --git a/Lib/test/test_fnmatch.py b/Lib/test/test_fnmatch.py index a2154889456e9f..db6b7e0bc94237 100644 --- a/Lib/test/test_fnmatch.py +++ b/Lib/test/test_fnmatch.py @@ -250,16 +250,16 @@ def test_translate(self): self.assertTrue(re.match(fatre, 'cbabcaxc')) self.assertFalse(re.match(fatre, 'dabccbad')) - def test_translate_seps(self): - self.assertEqual(translate('*', seps='/'), r'(?s:[^/]+)\Z') - self.assertEqual(translate('?', seps='/'), r'(?s:[^/])\Z') - self.assertEqual(translate('a?b*', seps='/'), r'(?s:a[^/]b[^/]*)\Z') - self.assertEqual(translate('/**/*/*.*/**', seps='/'), - r'(?s:/(.*[/])?[^/]+/[^/]*\.[^/]*/.*)\Z') - self.assertEqual(translate('foo/bar\\baz', seps=('/', '\\')), - r'(?s:foo/bar\\baz)\Z') - self.assertRaises(ValueError, translate, 'a**', seps='/') - self.assertRaises(ValueError, translate, '**b', seps='/') + def test_translate_sep(self): + self.assertEqual(translate('*', sep='/'), r'(?s:[^/]+)\Z') + self.assertEqual(translate('?', sep='/'), r'(?s:[^/])\Z') + self.assertEqual(translate('a?b*', sep='/'), r'(?s:a[^/]b[^/]*)\Z') + self.assertEqual(translate('/**/*/*.*/**', sep='/'), + r'(?s:/(.*/)?[^/]+/[^/]*\.[^/]*/.*)\Z') + self.assertEqual(translate(r'\**\*\*.*\**', sep='\\'), + r'(?s:\\(.*\\)?[^\\]+\\[^\\]*\.[^\\]*\\.*)\Z') + self.assertRaises(ValueError, translate, 'a**', sep='/') + self.assertRaises(ValueError, translate, '**b', sep='/') class FilterTestCase(unittest.TestCase): diff --git a/Misc/NEWS.d/next/Library/2023-07-13-00-24-52.gh-issue-72904.Yn5-j0.rst b/Misc/NEWS.d/next/Library/2023-07-13-00-24-52.gh-issue-72904.Yn5-j0.rst index 75249391ca83ab..7be11c7fd86cc3 100644 --- a/Misc/NEWS.d/next/Library/2023-07-13-00-24-52.gh-issue-72904.Yn5-j0.rst +++ b/Misc/NEWS.d/next/Library/2023-07-13-00-24-52.gh-issue-72904.Yn5-j0.rst @@ -1,3 +1,4 @@ -Add optional *seps* argument to :func:`fnmatch.translate`. If a sequence of -path separators is given, the resulting pattern matches paths similarly to -:func:`glob.glob`. +Add optional *sep* argument to :func:`fnmatch.translate`. If a path separator +character is given, the resulting pattern matches paths like +:meth:`pathlib.PurePath.match` and :meth:`pathlib.Path.glob`. For example, the +``*`` wildcard will not match path separators. From 51f269881007d9c90a070e9ad4646f352fead833 Mon Sep 17 00:00:00 2001 From: Barney Gale Date: Fri, 4 Aug 2023 22:06:34 +0100 Subject: [PATCH 07/22] Update Doc/library/fnmatch.rst Co-authored-by: Jason R. Coombs --- Doc/library/fnmatch.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/library/fnmatch.rst b/Doc/library/fnmatch.rst index e7091e31981079..a0778960d39cea 100644 --- a/Doc/library/fnmatch.rst +++ b/Doc/library/fnmatch.rst @@ -99,7 +99,7 @@ cache the compiled regex patterns in the following functions: :func:`fnmatch`, A path separator character may be supplied to the *sep* argument. If given, - the separator is sed to split the pattern into segments, where: + the separator is used to split the pattern into segments, where: - A ``*`` pattern segment matches precisely one path segment. - A ``**`` pattern segment matches any number of path segments. From 9c8c3f3cb6f53c0d48cadeef4f2d98a5c1958b3a Mon Sep 17 00:00:00 2001 From: barneygale Date: Fri, 11 Aug 2023 23:39:17 +0100 Subject: [PATCH 08/22] Move to `glob.translate()` --- Doc/library/fnmatch.rst | 18 +-- Doc/library/glob.rst | 41 +++++++ Lib/fnmatch.py | 38 +----- Lib/glob.py | 111 ++++++++++++++++++ Lib/pathlib.py | 5 +- Lib/test/test_fnmatch.py | 12 -- Lib/test/test_glob.py | 35 ++++++ ...3-07-13-00-24-52.gh-issue-72904.Yn5-j0.rst | 6 +- 8 files changed, 197 insertions(+), 69 deletions(-) diff --git a/Doc/library/fnmatch.rst b/Doc/library/fnmatch.rst index a0778960d39cea..aed8991d44772f 100644 --- a/Doc/library/fnmatch.rst +++ b/Doc/library/fnmatch.rst @@ -82,7 +82,7 @@ cache the compiled regex patterns in the following functions: :func:`fnmatch`, ``[n for n in names if fnmatch(n, pattern)]``, but implemented more efficiently. -.. function:: translate(pattern, sep=None) +.. function:: translate(pattern) Return the shell-style *pattern* converted to a regular expression for using with :func:`re.match`. @@ -98,22 +98,6 @@ cache the compiled regex patterns in the following functions: :func:`fnmatch`, >>> reobj.match('foobar.txt') - A path separator character may be supplied to the *sep* argument. If given, - the separator is used to split the pattern into segments, where: - - - A ``*`` pattern segment matches precisely one path segment. - - A ``**`` pattern segment matches any number of path segments. - - If ``**`` appears in any other position within the pattern, - :exc:`ValueError` is raised. - - ``*`` and ``?`` wildcards in other positions don't match path separators. - - These rules approximate shell recursive globbing. The :mod:`pathlib` module - calls this function and supplies *sep* to implement - :meth:`~pathlib.PurePath.match` and :meth:`~pathlib.Path.glob`. - - .. versionchanged:: 3.13 - The *sep* parameter was added. - .. seealso:: diff --git a/Doc/library/glob.rst b/Doc/library/glob.rst index 0e4cfe7ebed797..9a8d8f97da8a52 100644 --- a/Doc/library/glob.rst +++ b/Doc/library/glob.rst @@ -145,6 +145,47 @@ default. For example, consider a directory containing :file:`card.gif` and >>> glob.glob('.c*') ['.card.gif'] + +.. function:: translate(pathname, *, recursive=False, seps=None) + + Convert the given path specification to a regular expression for use with + :func:`re.match`. The path specification can contain shell-style wildcards. + + For example: + + >>> import glob, re + >>> + >>> regex = glob.translate('**/*.txt', recursive=True) + >>> regex + '(?s:(?:.*/)?[^/]*\\.txt)\\Z' + >>> reobj = re.compile(regex) + >>> reobj.match('foo/bar/baz.txt') + + + Path separators and segments are meaningful to this function, unlike + :func:`fnmatch.translate`. By default wildcards do not match path + separators, and ``*`` pattern segments match precisely one path segment. + + If *recursive* is true, the pattern segment "``**``" will match any number + of path segments. If "``**``" occurs in any position other than a full + pattern segment, :exc:`ValueError` is raised. + + A sequence of path separators may be supplied to the *seps* argument. If + not given, :data:`os.sep` and :data:`~os.altsep` (if available) are used. + + .. note:: + + Filenames that begin with a dot (``.``) are matched by wildcards, unlike + :func:`glob`. + + .. seealso:: + + :meth:`pathlib.PurePath.match` and :meth:`pathlib.Path.glob` methods, + which call this function to implement pattern matching and globbing. + + .. versionadded:: 3.13 + + .. seealso:: Module :mod:`fnmatch` diff --git a/Lib/fnmatch.py b/Lib/fnmatch.py index a08b8d7bdc2e1f..d5e296f7748c1c 100644 --- a/Lib/fnmatch.py +++ b/Lib/fnmatch.py @@ -71,24 +71,13 @@ def fnmatchcase(name, pat): return match(name) is not None -def translate(pat, sep=None): +def translate(pat): """Translate a shell PATTERN to a regular expression. - A path separator character may be supplied to the *sep* argument. If - given, '*' and '?' wildcards will not match separators; '*' wildcards in - standalone pattern segments match precisely one path segment; and '**' - wildcards in standalone segments match any number of path segments. - There is no way to quote meta-characters. """ STAR = object() - if sep: - SEP = re.escape(sep) - DOT = f'[^{SEP}]' - else: - SEP = None - DOT = '.' res = [] add = res.append i, n = 0, len(pat) @@ -97,29 +86,10 @@ def translate(pat, sep=None): i = i+1 if c == '*': # compress consecutive `*` into one - h = i-1 - while i < n and pat[i] == '*': - i = i+1 - if sep: - star_count = i-h - is_segment = (h == 0 or pat[h-1] == sep) and (i == n or pat[i] == sep) - if star_count == 1: - if is_segment: - add(f'{DOT}+') - else: - add(f'{DOT}*') - elif star_count == 2 and is_segment: - if i == n: - add('.*') - else: - add(f'(.*{SEP})?') - i = i+1 - else: - raise ValueError("Invalid pattern: '**' can only be an entire path component") - else: + if (not res) or res[-1] is not STAR: add(STAR) elif c == '?': - add(DOT) + add('.') elif c == '[': j = i if j < n and pat[j] == '!': @@ -166,7 +136,7 @@ def translate(pat, sep=None): add('(?!)') elif stuff == '!': # Negated empty range: match any character. - add(DOT) + add('.') else: if stuff[0] == '!': stuff = '^' + stuff[1:] diff --git a/Lib/glob.py b/Lib/glob.py index a7256422d520fb..59110865f182cb 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -249,3 +249,114 @@ def escape(pathname): _dir_open_flags = os.O_RDONLY | getattr(os, 'O_DIRECTORY', 0) + + +def translate(pat, *, recursive=False, seps=None): + """Translate a pathname with shell wildcards to a regular expression. + + If `recursive` is true, the pattern segment '**' will match any number of + path segments; if '**' appears outside its own segment, ValueError will be + raised. + + If a sequence of separator characters is given to `seps`, they will be + used to split the pattern into segments and match path separators. If not + given, os.path.sep and os.path.altsep (where available) are used. + + Filenames beginning with a dot ('.') are NOT special in this method; they + are matched by wildcards, unlike in glob(). + """ + if not seps: + if os.path.altsep: + seps = [os.path.sep, os.path.altsep] + else: + seps = os.path.sep + escaped_seps = ''.join(re.escape(sep) for sep in seps) + any_sep = f'[{escaped_seps}]' if len(seps) > 1 else escaped_seps + not_sep = f'[^{escaped_seps}]' + res = [] + add = res.append + i, n = 0, len(pat) + while i < n: + c = pat[i] + i = i+1 + if c == '*': + # compress consecutive `*` into one + h = i-1 + while i < n and pat[i] == '*': + i = i+1 + star_count = i-h + is_segment = (h == 0 or pat[h-1] in seps) and (i == n or pat[i] in seps) + if star_count == 1 or not recursive: + if is_segment: + add(f'{not_sep}+') + else: + add(f'{not_sep}*') + elif star_count == 2 and is_segment: + if i == n: + add('.*') + else: + add(f'(?:.*{any_sep})?') + i = i+1 + else: + raise ValueError("Invalid pattern: '**' can only be an entire path component") + elif c in seps: + add(any_sep) + elif c == '?': + add(not_sep) + elif c == '[': + j = i + if j < n and pat[j] == '!': + j = j+1 + if j < n and pat[j] == ']': + j = j+1 + while j < n and pat[j] != ']': + j = j+1 + if j >= n: + add('\\[') + else: + stuff = pat[i:j] + if '-' not in stuff: + stuff = stuff.replace('\\', r'\\') + else: + chunks = [] + k = i+2 if pat[i] == '!' else i+1 + while True: + k = pat.find('-', k, j) + if k < 0: + break + chunks.append(pat[i:k]) + i = k+1 + k = k+3 + chunk = pat[i:j] + if chunk: + chunks.append(chunk) + else: + chunks[-1] += '-' + # Remove empty ranges -- invalid in RE. + for k in range(len(chunks)-1, 0, -1): + if chunks[k-1][-1] > chunks[k][0]: + chunks[k-1] = chunks[k-1][:-1] + chunks[k][1:] + del chunks[k] + # Escape backslashes and hyphens for set difference (--). + # Hyphens that create ranges shouldn't be escaped. + stuff = '-'.join(s.replace('\\', r'\\').replace('-', r'\-') + for s in chunks) + # Escape set operations (&&, ~~ and ||). + stuff = re.sub(r'([&~|])', r'\\\1', stuff) + i = j+1 + if not stuff: + # Empty range: never match. + add('(?!)') + elif stuff == '!': + # Negated empty range: match any character. + add(not_sep) + else: + if stuff[0] == '!': + stuff = '^' + stuff[1:] + elif stuff[0] in ('^', '['): + stuff = '\\' + stuff + add(f'[{stuff}]') + else: + add(re.escape(c)) + res = "".join(res) + return fr'(?s:{res})\Z' diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 3049cfe18e069a..5add4a2d40a3e9 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -5,8 +5,8 @@ operating systems. """ -import fnmatch import functools +import glob import io import ntpath import os @@ -69,7 +69,8 @@ def _compile_pattern(pat, sep, case_sensitive): """Compile given glob pattern to a re.Pattern object (observing case sensitivity).""" flags = re.NOFLAG if case_sensitive else re.IGNORECASE - return re.compile(fnmatch.translate(pat, sep), flags).match + regex = glob.translate(pat, recursive=True, include_hidden=True, seps=sep) + return re.compile(regex, flags).match def _select_children(parent_paths, dir_only, follow_symlinks, match): diff --git a/Lib/test/test_fnmatch.py b/Lib/test/test_fnmatch.py index db6b7e0bc94237..10ed496d4e2f37 100644 --- a/Lib/test/test_fnmatch.py +++ b/Lib/test/test_fnmatch.py @@ -250,18 +250,6 @@ def test_translate(self): self.assertTrue(re.match(fatre, 'cbabcaxc')) self.assertFalse(re.match(fatre, 'dabccbad')) - def test_translate_sep(self): - self.assertEqual(translate('*', sep='/'), r'(?s:[^/]+)\Z') - self.assertEqual(translate('?', sep='/'), r'(?s:[^/])\Z') - self.assertEqual(translate('a?b*', sep='/'), r'(?s:a[^/]b[^/]*)\Z') - self.assertEqual(translate('/**/*/*.*/**', sep='/'), - r'(?s:/(.*/)?[^/]+/[^/]*\.[^/]*/.*)\Z') - self.assertEqual(translate(r'\**\*\*.*\**', sep='\\'), - r'(?s:\\(.*\\)?[^\\]+\\[^\\]*\.[^\\]*\\.*)\Z') - self.assertRaises(ValueError, translate, 'a**', sep='/') - self.assertRaises(ValueError, translate, '**b', sep='/') - - class FilterTestCase(unittest.TestCase): def test_filter(self): diff --git a/Lib/test/test_glob.py b/Lib/test/test_glob.py index f4b5821f408cb4..85f82eb6fdd405 100644 --- a/Lib/test/test_glob.py +++ b/Lib/test/test_glob.py @@ -349,6 +349,41 @@ def test_glob_many_open_files(self): for it in iters: self.assertEqual(next(it), p) + def test_translate(self): + def fn(pat): + return glob.translate(pat, seps='/') + self.assertEqual(fn('foo'), r'(?s:foo)\Z') + self.assertEqual(fn('foo/bar'), r'(?s:foo/bar)\Z') + self.assertEqual(fn('*'), r'(?s:[^/]+)\Z') + self.assertEqual(fn('?'), r'(?s:[^/])\Z') + self.assertEqual(fn('a*'), r'(?s:a[^/]*)\Z') + self.assertEqual(fn('*a'), r'(?s:[^/]*a)\Z') + self.assertEqual(fn('.*'), r'(?s:\.[^/]*)\Z') + self.assertEqual(fn('?aa'), r'(?s:[^/]aa)\Z') + self.assertEqual(fn('aa?'), r'(?s:aa[^/])\Z') + self.assertEqual(fn('aa[ab]'), r'(?s:aa[ab])\Z') + self.assertEqual(fn('**'), r'(?s:[^/]+)\Z') + self.assertEqual(fn('***'), r'(?s:[^/]+)\Z') + self.assertEqual(fn('a**'), r'(?s:a[^/]*)\Z') + self.assertEqual(fn('**b'), r'(?s:[^/]*b)\Z') + self.assertEqual(fn('/**/*/*.*/**'), r'(?s:/[^/]+/[^/]+/[^/]*\.[^/]*/[^/]+)\Z') + + def test_translate_recursive(self): + def fn(pat): + return glob.translate(pat, recursive=True, seps='/') + self.assertEqual(fn('*'), r'(?s:[^/]+)\Z') + self.assertEqual(fn('?'), r'(?s:[^/])\Z') + self.assertEqual(fn('**'), r'(?s:.*)\Z') + self.assertRaises(ValueError, fn, '***') + self.assertRaises(ValueError, fn, 'a**') + self.assertRaises(ValueError, fn, '**b') + self.assertEqual(fn('/**/*/*.*/**'), r'(?s:/(?:.*/)?[^/]+/[^/]*\.[^/]*/.*)\Z') + + def test_translate_seps(self): + def fn(pat): + return glob.translate(pat, recursive=True, seps=['/', '\\']) + self.assertEqual(fn('foo/bar\\baz'), r'(?s:foo[/\\]bar[/\\]baz)\Z') + self.assertEqual(fn('**/**'), r'(?s:(?:.*[/\\])?.*)\Z') @skip_unless_symlink class SymlinkLoopGlobTests(unittest.TestCase): diff --git a/Misc/NEWS.d/next/Library/2023-07-13-00-24-52.gh-issue-72904.Yn5-j0.rst b/Misc/NEWS.d/next/Library/2023-07-13-00-24-52.gh-issue-72904.Yn5-j0.rst index 7be11c7fd86cc3..edc8ab07bb06b3 100644 --- a/Misc/NEWS.d/next/Library/2023-07-13-00-24-52.gh-issue-72904.Yn5-j0.rst +++ b/Misc/NEWS.d/next/Library/2023-07-13-00-24-52.gh-issue-72904.Yn5-j0.rst @@ -1,4 +1,2 @@ -Add optional *sep* argument to :func:`fnmatch.translate`. If a path separator -character is given, the resulting pattern matches paths like -:meth:`pathlib.PurePath.match` and :meth:`pathlib.Path.glob`. For example, the -``*`` wildcard will not match path separators. +Add :func:`glob.translate`. This function converts a pathname with shell-style +wildcards to a regular expression. From 8518ea2f9ff6c342e66301b4247d875b8ed1446f Mon Sep 17 00:00:00 2001 From: barneygale Date: Sat, 12 Aug 2023 02:53:01 +0100 Subject: [PATCH 09/22] Whoops --- Lib/pathlib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 5add4a2d40a3e9..30b43f0ad81935 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -69,7 +69,7 @@ def _compile_pattern(pat, sep, case_sensitive): """Compile given glob pattern to a re.Pattern object (observing case sensitivity).""" flags = re.NOFLAG if case_sensitive else re.IGNORECASE - regex = glob.translate(pat, recursive=True, include_hidden=True, seps=sep) + regex = glob.translate(pat, recursive=True, seps=sep) return re.compile(regex, flags).match From 75129c8dbe3c6969d08ad2819d1ca4d1f0124a6c Mon Sep 17 00:00:00 2001 From: barneygale Date: Sun, 13 Aug 2023 21:33:44 +0100 Subject: [PATCH 10/22] Deduplicate code to handle character sets --- Lib/fnmatch.py | 88 ++++++++++++++++++++++++++------------------------ Lib/glob.py | 45 ++------------------------ 2 files changed, 48 insertions(+), 85 deletions(-) diff --git a/Lib/fnmatch.py b/Lib/fnmatch.py index d5e296f7748c1c..360065981cb3c6 100644 --- a/Lib/fnmatch.py +++ b/Lib/fnmatch.py @@ -71,6 +71,50 @@ def fnmatchcase(name, pat): return match(name) is not None +def _translate_char_set(stuff): + if '-' not in stuff: + stuff = stuff.replace('\\', r'\\') + else: + chunks = [] + i = 0 + k = 2 if stuff[0] == '!' else 1 + while True: + k = stuff.find('-', k) + if k < 0: + break + chunks.append(stuff[i:k]) + i = k + 1 + k = k + 3 + chunk = stuff[i:] + if chunk: + chunks.append(chunk) + else: + chunks[-1] += '-' + # Remove empty ranges -- invalid in RE. + for k in range(len(chunks) - 1, 0, -1): + if chunks[k - 1][-1] > chunks[k][0]: + chunks[k - 1] = chunks[k - 1][:-1] + chunks[k][1:] + del chunks[k] + # Escape backslashes and hyphens for set difference (--). + # Hyphens that create ranges shouldn't be escaped. + stuff = '-'.join(s.replace('\\', r'\\').replace('-', r'\-') + for s in chunks) + # Escape set operations (&&, ~~ and ||). + stuff = re.sub(r'([&~|])', r'\\\1', stuff) + if not stuff: + # Empty range: never match. + return '(?!)' + elif stuff == '!': + # Negated empty range: match any character. + return '.' + else: + if stuff[0] == '!': + stuff = '^' + stuff[1:] + elif stuff[0] in ('^', '['): + stuff = '\\' + stuff + return f'[{stuff}]' + + def translate(pat): """Translate a shell PATTERN to a regular expression. @@ -101,48 +145,8 @@ def translate(pat): if j >= n: add('\\[') else: - stuff = pat[i:j] - if '-' not in stuff: - stuff = stuff.replace('\\', r'\\') - else: - chunks = [] - k = i+2 if pat[i] == '!' else i+1 - while True: - k = pat.find('-', k, j) - if k < 0: - break - chunks.append(pat[i:k]) - i = k+1 - k = k+3 - chunk = pat[i:j] - if chunk: - chunks.append(chunk) - else: - chunks[-1] += '-' - # Remove empty ranges -- invalid in RE. - for k in range(len(chunks)-1, 0, -1): - if chunks[k-1][-1] > chunks[k][0]: - chunks[k-1] = chunks[k-1][:-1] + chunks[k][1:] - del chunks[k] - # Escape backslashes and hyphens for set difference (--). - # Hyphens that create ranges shouldn't be escaped. - stuff = '-'.join(s.replace('\\', r'\\').replace('-', r'\-') - for s in chunks) - # Escape set operations (&&, ~~ and ||). - stuff = re.sub(r'([&~|])', r'\\\1', stuff) - i = j+1 - if not stuff: - # Empty range: never match. - add('(?!)') - elif stuff == '!': - # Negated empty range: match any character. - add('.') - else: - if stuff[0] == '!': - stuff = '^' + stuff[1:] - elif stuff[0] in ('^', '['): - stuff = '\\' + stuff - add(f'[{stuff}]') + add(_translate_char_set(pat[i:j])) + i = j + 1 else: add(re.escape(c)) assert i == n diff --git a/Lib/glob.py b/Lib/glob.py index 59110865f182cb..0bcec8b1ba6014 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -280,7 +280,6 @@ def translate(pat, *, recursive=False, seps=None): c = pat[i] i = i+1 if c == '*': - # compress consecutive `*` into one h = i-1 while i < n and pat[i] == '*': i = i+1 @@ -314,48 +313,8 @@ def translate(pat, *, recursive=False, seps=None): if j >= n: add('\\[') else: - stuff = pat[i:j] - if '-' not in stuff: - stuff = stuff.replace('\\', r'\\') - else: - chunks = [] - k = i+2 if pat[i] == '!' else i+1 - while True: - k = pat.find('-', k, j) - if k < 0: - break - chunks.append(pat[i:k]) - i = k+1 - k = k+3 - chunk = pat[i:j] - if chunk: - chunks.append(chunk) - else: - chunks[-1] += '-' - # Remove empty ranges -- invalid in RE. - for k in range(len(chunks)-1, 0, -1): - if chunks[k-1][-1] > chunks[k][0]: - chunks[k-1] = chunks[k-1][:-1] + chunks[k][1:] - del chunks[k] - # Escape backslashes and hyphens for set difference (--). - # Hyphens that create ranges shouldn't be escaped. - stuff = '-'.join(s.replace('\\', r'\\').replace('-', r'\-') - for s in chunks) - # Escape set operations (&&, ~~ and ||). - stuff = re.sub(r'([&~|])', r'\\\1', stuff) - i = j+1 - if not stuff: - # Empty range: never match. - add('(?!)') - elif stuff == '!': - # Negated empty range: match any character. - add(not_sep) - else: - if stuff[0] == '!': - stuff = '^' + stuff[1:] - elif stuff[0] in ('^', '['): - stuff = '\\' + stuff - add(f'[{stuff}]') + add(fnmatch._translate_char_set(pat[i:j])) + i = j + 1 else: add(re.escape(c)) res = "".join(res) From 2505590e77dd7ebe56546b4fcdaafb63f1177834 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sat, 23 Sep 2023 21:24:45 +0100 Subject: [PATCH 11/22] Add support for `include_hidden=False` --- Doc/library/glob.rst | 12 +++++------ Lib/glob.py | 28 +++++++++++++++++++------- Lib/pathlib.py | 2 +- Lib/test/test_glob.py | 46 ++++++++++++++++++++++++++++++++++++++----- 4 files changed, 68 insertions(+), 20 deletions(-) diff --git a/Doc/library/glob.rst b/Doc/library/glob.rst index 9a8d8f97da8a52..e2f1f48509894e 100644 --- a/Doc/library/glob.rst +++ b/Doc/library/glob.rst @@ -146,7 +146,7 @@ default. For example, consider a directory containing :file:`card.gif` and ['.card.gif'] -.. function:: translate(pathname, *, recursive=False, seps=None) +.. function:: translate(pathname, *, recursive=False, include_hidden=False, seps=None) Convert the given path specification to a regular expression for use with :func:`re.match`. The path specification can contain shell-style wildcards. @@ -155,7 +155,7 @@ default. For example, consider a directory containing :file:`card.gif` and >>> import glob, re >>> - >>> regex = glob.translate('**/*.txt', recursive=True) + >>> regex = glob.translate('**/*.txt', recursive=True, include_hidden=True) >>> regex '(?s:(?:.*/)?[^/]*\\.txt)\\Z' >>> reobj = re.compile(regex) @@ -170,14 +170,12 @@ default. For example, consider a directory containing :file:`card.gif` and of path segments. If "``**``" occurs in any position other than a full pattern segment, :exc:`ValueError` is raised. + If *include_hidden* is true, wildcards can match path segments that start + with a dot (``.``). + A sequence of path separators may be supplied to the *seps* argument. If not given, :data:`os.sep` and :data:`~os.altsep` (if available) are used. - .. note:: - - Filenames that begin with a dot (``.``) are matched by wildcards, unlike - :func:`glob`. - .. seealso:: :meth:`pathlib.PurePath.match` and :meth:`pathlib.Path.glob` methods, diff --git a/Lib/glob.py b/Lib/glob.py index 0bcec8b1ba6014..168cd6bcbd8567 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -251,19 +251,19 @@ def escape(pathname): _dir_open_flags = os.O_RDONLY | getattr(os, 'O_DIRECTORY', 0) -def translate(pat, *, recursive=False, seps=None): +def translate(pat, *, recursive=False, include_hidden=False, seps=None): """Translate a pathname with shell wildcards to a regular expression. If `recursive` is true, the pattern segment '**' will match any number of path segments; if '**' appears outside its own segment, ValueError will be raised. + If `include_hidden` is true, wildcards can match path segments beginning + with a dot ('.'). + If a sequence of separator characters is given to `seps`, they will be used to split the pattern into segments and match path separators. If not given, os.path.sep and os.path.altsep (where available) are used. - - Filenames beginning with a dot ('.') are NOT special in this method; they - are matched by wildcards, unlike in glob(). """ if not seps: if os.path.altsep: @@ -273,9 +273,12 @@ def translate(pat, *, recursive=False, seps=None): escaped_seps = ''.join(re.escape(sep) for sep in seps) any_sep = f'[{escaped_seps}]' if len(seps) > 1 else escaped_seps not_sep = f'[^{escaped_seps}]' + not_dot = r'(?!\.)' res = [] add = res.append i, n = 0, len(pat) + if pat[:1] != '.' and not include_hidden: + add(not_dot) while i < n: c = pat[i] i = i+1 @@ -292,14 +295,25 @@ def translate(pat, *, recursive=False, seps=None): add(f'{not_sep}*') elif star_count == 2 and is_segment: if i == n: - add('.*') + if include_hidden: + add('.*') + else: + add(fr'(?:{not_dot}{not_sep}+{any_sep})*{not_dot}{not_sep}*') else: - add(f'(?:.*{any_sep})?') - i = i+1 + if include_hidden: + add(f'(?:.+{any_sep})?') + i = i+1 + else: + add(fr'(?:{not_dot}{not_sep}+{any_sep})*') + i = i+1 + if i < n and pat[i] != '.': + add(not_dot) else: raise ValueError("Invalid pattern: '**' can only be an entire path component") elif c in seps: add(any_sep) + if i < n and pat[i] != '.' and not include_hidden: + add(not_dot) elif c == '?': add(not_sep) elif c == '[': diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 30b43f0ad81935..5add4a2d40a3e9 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -69,7 +69,7 @@ def _compile_pattern(pat, sep, case_sensitive): """Compile given glob pattern to a re.Pattern object (observing case sensitivity).""" flags = re.NOFLAG if case_sensitive else re.IGNORECASE - regex = glob.translate(pat, recursive=True, seps=sep) + regex = glob.translate(pat, recursive=True, include_hidden=True, seps=sep) return re.compile(regex, flags).match diff --git a/Lib/test/test_glob.py b/Lib/test/test_glob.py index 85f82eb6fdd405..83b030116ce0f0 100644 --- a/Lib/test/test_glob.py +++ b/Lib/test/test_glob.py @@ -1,5 +1,6 @@ import glob import os +import re import shutil import sys import unittest @@ -349,9 +350,43 @@ def test_glob_many_open_files(self): for it in iters: self.assertEqual(next(it), p) + def test_translate(self): + match = re.compile(glob.translate('*')).match + self.assertIsNotNone(match('foo')) + self.assertIsNotNone(match('foo.bar')) + self.assertIsNone(match('.foo')) + match = re.compile(glob.translate('.*')).match + self.assertIsNotNone(match('.foo')) + match = re.compile(glob.translate('**', recursive=True)).match + self.assertIsNotNone(match('foo')) + self.assertIsNone(match('.foo')) + self.assertIsNotNone(match(os.path.join('foo', 'bar'))) + self.assertIsNone(match(os.path.join('foo', '.bar'))) + self.assertIsNone(match(os.path.join('.foo', 'bar'))) + self.assertIsNone(match(os.path.join('.foo', '.bar'))) + match = re.compile(glob.translate('**/*', recursive=True)).match + self.assertIsNotNone(match(os.path.join('foo', 'bar'))) + self.assertIsNone(match(os.path.join('foo', '.bar'))) + self.assertIsNone(match(os.path.join('.foo', 'bar'))) + self.assertIsNone(match(os.path.join('.foo', '.bar'))) + match = re.compile(glob.translate('*/**', recursive=True)).match + self.assertIsNotNone(match(os.path.join('foo', 'bar'))) + self.assertIsNone(match(os.path.join('foo', '.bar'))) + self.assertIsNone(match(os.path.join('.foo', 'bar'))) + self.assertIsNone(match(os.path.join('.foo', '.bar'))) + match = re.compile(glob.translate('**/.bar', recursive=True)).match + self.assertIsNotNone(match(os.path.join('foo', '.bar'))) + self.assertIsNone(match(os.path.join('.foo', '.bar'))) + match = re.compile(glob.translate('**/*.*', recursive=True)).match + self.assertIsNone(match(os.path.join('foo', 'bar'))) + self.assertIsNone(match(os.path.join('foo', '.bar'))) + self.assertIsNotNone(match(os.path.join('foo', 'bar.txt'))) + self.assertIsNone(match(os.path.join('foo', '.bar.txt'))) + + def test_translate_include_hidden(self): def fn(pat): - return glob.translate(pat, seps='/') + return glob.translate(pat, include_hidden=True, seps='/') self.assertEqual(fn('foo'), r'(?s:foo)\Z') self.assertEqual(fn('foo/bar'), r'(?s:foo/bar)\Z') self.assertEqual(fn('*'), r'(?s:[^/]+)\Z') @@ -370,20 +405,21 @@ def fn(pat): def test_translate_recursive(self): def fn(pat): - return glob.translate(pat, recursive=True, seps='/') + return glob.translate(pat, recursive=True, include_hidden=True, seps='/') self.assertEqual(fn('*'), r'(?s:[^/]+)\Z') self.assertEqual(fn('?'), r'(?s:[^/])\Z') self.assertEqual(fn('**'), r'(?s:.*)\Z') self.assertRaises(ValueError, fn, '***') self.assertRaises(ValueError, fn, 'a**') self.assertRaises(ValueError, fn, '**b') - self.assertEqual(fn('/**/*/*.*/**'), r'(?s:/(?:.*/)?[^/]+/[^/]*\.[^/]*/.*)\Z') + self.assertEqual(fn('/**/*/*.*/**'), r'(?s:/(?:.+/)?[^/]+/[^/]*\.[^/]*/.*)\Z') def test_translate_seps(self): def fn(pat): - return glob.translate(pat, recursive=True, seps=['/', '\\']) + return glob.translate(pat, recursive=True, include_hidden=True, seps=['/', '\\']) self.assertEqual(fn('foo/bar\\baz'), r'(?s:foo[/\\]bar[/\\]baz)\Z') - self.assertEqual(fn('**/**'), r'(?s:(?:.*[/\\])?.*)\Z') + self.assertEqual(fn('**/**'), r'(?s:(?:.+[/\\])?.*)\Z') + @skip_unless_symlink class SymlinkLoopGlobTests(unittest.TestCase): From 1754d42b4090ca6b201b0359fdf94cb37338ea5a Mon Sep 17 00:00:00 2001 From: barneygale Date: Sat, 23 Sep 2023 21:42:09 +0100 Subject: [PATCH 12/22] Fix doctest --- Doc/library/glob.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/library/glob.rst b/Doc/library/glob.rst index e2f1f48509894e..8e76d2d5f16535 100644 --- a/Doc/library/glob.rst +++ b/Doc/library/glob.rst @@ -157,7 +157,7 @@ default. For example, consider a directory containing :file:`card.gif` and >>> >>> regex = glob.translate('**/*.txt', recursive=True, include_hidden=True) >>> regex - '(?s:(?:.*/)?[^/]*\\.txt)\\Z' + '(?s:(?:.+/)?[^/]*\\.txt)\\Z' >>> reobj = re.compile(regex) >>> reobj.match('foo/bar/baz.txt') From 7b1ad63e0dbcd255514adead73dc27dd24bad99c Mon Sep 17 00:00:00 2001 From: barneygale Date: Mon, 25 Sep 2023 23:05:29 +0100 Subject: [PATCH 13/22] Improve implementation; minimise fnmatch and pathlib diffs. --- Lib/fnmatch.py | 99 ++++++++++++++++++++++--------------------- Lib/glob.py | 84 ++++++++++++------------------------ Lib/pathlib.py | 50 ++++++++++------------ Lib/test/test_glob.py | 6 +-- 4 files changed, 105 insertions(+), 134 deletions(-) diff --git a/Lib/fnmatch.py b/Lib/fnmatch.py index 360065981cb3c6..73acb1fe8d4106 100644 --- a/Lib/fnmatch.py +++ b/Lib/fnmatch.py @@ -71,50 +71,6 @@ def fnmatchcase(name, pat): return match(name) is not None -def _translate_char_set(stuff): - if '-' not in stuff: - stuff = stuff.replace('\\', r'\\') - else: - chunks = [] - i = 0 - k = 2 if stuff[0] == '!' else 1 - while True: - k = stuff.find('-', k) - if k < 0: - break - chunks.append(stuff[i:k]) - i = k + 1 - k = k + 3 - chunk = stuff[i:] - if chunk: - chunks.append(chunk) - else: - chunks[-1] += '-' - # Remove empty ranges -- invalid in RE. - for k in range(len(chunks) - 1, 0, -1): - if chunks[k - 1][-1] > chunks[k][0]: - chunks[k - 1] = chunks[k - 1][:-1] + chunks[k][1:] - del chunks[k] - # Escape backslashes and hyphens for set difference (--). - # Hyphens that create ranges shouldn't be escaped. - stuff = '-'.join(s.replace('\\', r'\\').replace('-', r'\-') - for s in chunks) - # Escape set operations (&&, ~~ and ||). - stuff = re.sub(r'([&~|])', r'\\\1', stuff) - if not stuff: - # Empty range: never match. - return '(?!)' - elif stuff == '!': - # Negated empty range: match any character. - return '.' - else: - if stuff[0] == '!': - stuff = '^' + stuff[1:] - elif stuff[0] in ('^', '['): - stuff = '\\' + stuff - return f'[{stuff}]' - - def translate(pat): """Translate a shell PATTERN to a regular expression. @@ -122,6 +78,11 @@ def translate(pat): """ STAR = object() + parts = _translate(pat, STAR, '.') + return _join_translated_parts(parts, STAR) + + +def _translate(pat, STAR, QUESTION_MARK): res = [] add = res.append i, n = 0, len(pat) @@ -133,7 +94,7 @@ def translate(pat): if (not res) or res[-1] is not STAR: add(STAR) elif c == '?': - add('.') + add(QUESTION_MARK) elif c == '[': j = i if j < n and pat[j] == '!': @@ -145,14 +106,56 @@ def translate(pat): if j >= n: add('\\[') else: - add(_translate_char_set(pat[i:j])) - i = j + 1 + stuff = pat[i:j] + if '-' not in stuff: + stuff = stuff.replace('\\', r'\\') + else: + chunks = [] + k = i+2 if pat[i] == '!' else i+1 + while True: + k = pat.find('-', k, j) + if k < 0: + break + chunks.append(pat[i:k]) + i = k+1 + k = k+3 + chunk = pat[i:j] + if chunk: + chunks.append(chunk) + else: + chunks[-1] += '-' + # Remove empty ranges -- invalid in RE. + for k in range(len(chunks)-1, 0, -1): + if chunks[k-1][-1] > chunks[k][0]: + chunks[k-1] = chunks[k-1][:-1] + chunks[k][1:] + del chunks[k] + # Escape backslashes and hyphens for set difference (--). + # Hyphens that create ranges shouldn't be escaped. + stuff = '-'.join(s.replace('\\', r'\\').replace('-', r'\-') + for s in chunks) + # Escape set operations (&&, ~~ and ||). + stuff = re.sub(r'([&~|])', r'\\\1', stuff) + i = j+1 + if not stuff: + # Empty range: never match. + add('(?!)') + elif stuff == '!': + # Negated empty range: match any character. + add('.') + else: + if stuff[0] == '!': + stuff = '^' + stuff[1:] + elif stuff[0] in ('^', '['): + stuff = '\\' + stuff + add(f'[{stuff}]') else: add(re.escape(c)) assert i == n + return res + +def _join_translated_parts(inp, STAR): # Deal with STARs. - inp = res res = [] add = res.append i, n = 0, len(inp) diff --git a/Lib/glob.py b/Lib/glob.py index 168cd6bcbd8567..3588bf0ad99e98 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -273,63 +273,35 @@ def translate(pat, *, recursive=False, include_hidden=False, seps=None): escaped_seps = ''.join(re.escape(sep) for sep in seps) any_sep = f'[{escaped_seps}]' if len(seps) > 1 else escaped_seps not_sep = f'[^{escaped_seps}]' - not_dot = r'(?!\.)' - res = [] - add = res.append - i, n = 0, len(pat) - if pat[:1] != '.' and not include_hidden: - add(not_dot) - while i < n: - c = pat[i] - i = i+1 - if c == '*': - h = i-1 - while i < n and pat[i] == '*': - i = i+1 - star_count = i-h - is_segment = (h == 0 or pat[h-1] in seps) and (i == n or pat[i] in seps) - if star_count == 1 or not recursive: - if is_segment: - add(f'{not_sep}+') - else: - add(f'{not_sep}*') - elif star_count == 2 and is_segment: - if i == n: - if include_hidden: - add('.*') - else: - add(fr'(?:{not_dot}{not_sep}+{any_sep})*{not_dot}{not_sep}*') + if include_hidden: + one_segment = f'{not_sep}+' + any_segments = f'(?:.+{any_sep})?' + any_final_segments = '.*' + else: + one_segment = f'[^{escaped_seps}.]{not_sep}*' + any_segments = fr'(?:{one_segment}{any_sep})*' + any_final_segments = fr'{any_segments}(?:{one_segment})?' + + results = ['(\.\Z)?+'] + parts = re.split(any_sep, pat) + last_part_idx = len(parts) - 1 + for idx, part in enumerate(parts): + if recursive: + if part == '**': + if idx < last_part_idx: + results.append(any_segments) else: - if include_hidden: - add(f'(?:.+{any_sep})?') - i = i+1 - else: - add(fr'(?:{not_dot}{not_sep}+{any_sep})*') - i = i+1 - if i < n and pat[i] != '.': - add(not_dot) - else: + results.append(any_final_segments) + continue + elif '**' in part: raise ValueError("Invalid pattern: '**' can only be an entire path component") - elif c in seps: - add(any_sep) - if i < n and pat[i] != '.' and not include_hidden: - add(not_dot) - elif c == '?': - add(not_sep) - elif c == '[': - j = i - if j < n and pat[j] == '!': - j = j+1 - if j < n and pat[j] == ']': - j = j+1 - while j < n and pat[j] != ']': - j = j+1 - if j >= n: - add('\\[') - else: - add(fnmatch._translate_char_set(pat[i:j])) - i = j + 1 + if part == '*': + results.append(one_segment) else: - add(re.escape(c)) - res = "".join(res) + if not (include_hidden or part.startswith('.')): + results.append(r'(?!\.)') + results.extend(fnmatch._translate(part, f'{not_sep}*', not_sep)) + if idx < last_part_idx: + results.append(any_sep) + res = ''.join(results) return fr'(?s:{res})\Z' diff --git a/Lib/pathlib.py b/Lib/pathlib.py index bed5e3eb95b699..178929f7e28923 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -131,7 +131,7 @@ def _select_unique(paths): yielded = set() try: for path in paths: - path_str = path._str + path_str = str(path) if path_str not in yielded: yield path yielded.add(path_str) @@ -203,10 +203,10 @@ class PurePath: # tail are normalized. '_drv', '_root', '_tail_cached', - # The `_str_cached` slot stores the string representation of the path, + # The `_str` slot stores the string representation of the path, # computed from the drive, root and tail when `__str__()` is called # for the first time. It's used to implement `_str_normcase` - '_str_cached', + '_str', # The `_str_normcase_cached` slot stores the string path with # normalized case. It is set when the `_str_normcase` property is @@ -306,7 +306,7 @@ def _load_parts(self): def _from_parsed_parts(self, drv, root, tail): path_str = self._format_parsed_parts(drv, root, tail) path = self.with_segments(path_str) - path._str_cached = path_str + path._str = path_str or '.' path._drv = drv path._root = root path._tail_cached = tail @@ -323,7 +323,12 @@ def _format_parsed_parts(cls, drv, root, tail): def __str__(self): """Return the string representation of the path, suitable for passing to system calls.""" - return self._str or '.' + try: + return self._str + except AttributeError: + self._str = self._format_parsed_parts(self.drive, self.root, + self._tail) or '.' + return self._str def __fspath__(self): return str(self) @@ -361,15 +366,6 @@ def as_uri(self): path = str(self) return prefix + urlquote_from_bytes(os.fsencode(path)) - @property - def _str(self): - try: - return self._str_cached - except AttributeError: - self._str_cached = self._format_parsed_parts( - self.drive, self.root, self._tail) - return self._str_cached - @property def _str_normcase(self): # String with normalized case, for hashing and equality checks @@ -377,9 +373,9 @@ def _str_normcase(self): return self._str_normcase_cached except AttributeError: if _is_case_sensitive(self.pathmod): - self._str_normcase_cached = self._str + self._str_normcase_cached = str(self) else: - self._str_normcase_cached = self._str.lower() + self._str_normcase_cached = str(self).lower() return self._str_normcase_cached @property @@ -661,7 +657,7 @@ def match(self, path_pattern, *, case_sensitive=None): if case_sensitive is None: case_sensitive = _is_case_sensitive(self.pathmod) sep = path_pattern.pathmod.sep - pattern_str = path_pattern._str + pattern_str = str(path_pattern) if path_pattern.drive or path_pattern.root: pass elif path_pattern._tail: @@ -669,7 +665,7 @@ def match(self, path_pattern, *, case_sensitive=None): else: raise ValueError("empty pattern") match = _compile_pattern(pattern_str, sep, case_sensitive) - return match(self._str) is not None + return match(str(self)) is not None # Subclassing os.PathLike makes isinstance() checks slower, @@ -944,11 +940,11 @@ def _scandir(self): def _make_child_relpath(self, name): tail = self._tail if tail: - path_str = f'{self._str}{self.pathmod.sep}{name}' + path_str = f'{self}{self.pathmod.sep}{name}' else: - path_str = f'{self._str}{name}' + path_str = f'{self}{name}' path = self.with_segments(path_str) - path._str_cached = path_str + path._str = path_str path._drv = self.drive path._root = self.root path._tail_cached = tail + [name] @@ -1023,9 +1019,9 @@ def _glob(self, pattern, case_sensitive, follow_symlinks): paths = _select_recursive(paths, dir_only, follow_symlinks) # Filter out paths that don't match pattern. - prefix_len = len(self._make_child_relpath('_')._str) - 1 - match = _compile_pattern(path_pattern._str, sep, case_sensitive) - paths = (path for path in paths if match(path._str[prefix_len:])) + prefix_len = len(str(self._make_child_relpath('_'))) - 1 + match = _compile_pattern(str(path_pattern), sep, case_sensitive) + paths = (path for path in paths if match(str(path), prefix_len)) return paths dir_only = part_idx < len(pattern_parts) @@ -1131,11 +1127,11 @@ def absolute(self): # Fast path for "empty" paths, e.g. Path("."), Path("") or Path(). # We pass only one argument to with_segments() to avoid the cost # of joining, and we exploit the fact that getcwd() returns a - # fully-normalized string by storing it in _str_cached. This is - # used to implement Path.cwd(). + # fully-normalized string by storing it in _str. This is used to + # implement Path.cwd(). if not self.root and not self._tail: result = self.with_segments(cwd) - result._str_cached = cwd + result._str = cwd return result return self.with_segments(cwd, self) diff --git a/Lib/test/test_glob.py b/Lib/test/test_glob.py index 83b030116ce0f0..c27fc406993c10 100644 --- a/Lib/test/test_glob.py +++ b/Lib/test/test_glob.py @@ -397,11 +397,11 @@ def fn(pat): self.assertEqual(fn('?aa'), r'(?s:[^/]aa)\Z') self.assertEqual(fn('aa?'), r'(?s:aa[^/])\Z') self.assertEqual(fn('aa[ab]'), r'(?s:aa[ab])\Z') - self.assertEqual(fn('**'), r'(?s:[^/]+)\Z') - self.assertEqual(fn('***'), r'(?s:[^/]+)\Z') + self.assertEqual(fn('**'), r'(?s:[^/]*)\Z') + self.assertEqual(fn('***'), r'(?s:[^/]*)\Z') self.assertEqual(fn('a**'), r'(?s:a[^/]*)\Z') self.assertEqual(fn('**b'), r'(?s:[^/]*b)\Z') - self.assertEqual(fn('/**/*/*.*/**'), r'(?s:/[^/]+/[^/]+/[^/]*\.[^/]*/[^/]+)\Z') + self.assertEqual(fn('/**/*/*.*/**'), r'(?s:/[^/]*/[^/]+/[^/]*\.[^/]*/[^/]*)\Z') def test_translate_recursive(self): def fn(pat): From 1485ff3deb2180d3da8ba3d33f80e7b26329eabf Mon Sep 17 00:00:00 2001 From: barneygale Date: Tue, 26 Sep 2023 23:43:53 +0100 Subject: [PATCH 14/22] Fix tests --- Lib/glob.py | 2 +- Lib/pathlib.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index 3588bf0ad99e98..1d515331993585 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -282,7 +282,7 @@ def translate(pat, *, recursive=False, include_hidden=False, seps=None): any_segments = fr'(?:{one_segment}{any_sep})*' any_final_segments = fr'{any_segments}(?:{one_segment})?' - results = ['(\.\Z)?+'] + results = [] parts = re.split(any_sep, pat) last_part_idx = len(parts) - 1 for idx, part in enumerate(parts): diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 178929f7e28923..7d7df7dc9d0f74 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -70,7 +70,7 @@ def _compile_pattern(pat, sep, case_sensitive): sensitivity).""" flags = re.NOFLAG if case_sensitive else re.IGNORECASE regex = glob.translate(pat, recursive=True, include_hidden=True, seps=sep) - return re.compile(regex, flags).match + return re.compile(r'(\.\Z)?+' + regex, flags).match def _select_children(parent_paths, dir_only, follow_symlinks, match): From 4c6d6f0f3cbcc1d01a9f88bf0317721cfc520eb9 Mon Sep 17 00:00:00 2001 From: barneygale Date: Tue, 26 Sep 2023 23:52:16 +0100 Subject: [PATCH 15/22] Tiny performance tweak --- Lib/pathlib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 7d7df7dc9d0f74..8f4125f2cfc3ba 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -1019,7 +1019,7 @@ def _glob(self, pattern, case_sensitive, follow_symlinks): paths = _select_recursive(paths, dir_only, follow_symlinks) # Filter out paths that don't match pattern. - prefix_len = len(str(self._make_child_relpath('_'))) - 1 + prefix_len = len(str(self)) + bool(self._tail) match = _compile_pattern(str(path_pattern), sep, case_sensitive) paths = (path for path in paths if match(str(path), prefix_len)) return paths From afb2d43fdf499a3054b44846de4908aaf0e4ff19 Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 27 Sep 2023 00:11:03 +0100 Subject: [PATCH 16/22] Fix `_make_child_relpath()` --- Lib/pathlib.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index b8a619507150f7..f7750764a70334 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -938,11 +938,14 @@ def _scandir(self): return os.scandir(self) def _make_child_relpath(self, name): + path_str = str(self) tail = self._tail if tail: - path_str = f'{self}{self.pathmod.sep}{name}' + path_str = f'{path_str}{self.pathmod.sep}{name}' + elif path_str != '.': + path_str = f'{path_str}{name}' else: - path_str = f'{self}{name}' + path_str = name path = self.with_segments(path_str) path._str = path_str path._drv = self.drive @@ -1019,7 +1022,7 @@ def _glob(self, pattern, case_sensitive, follow_symlinks): paths = _select_recursive(paths, dir_only, follow_symlinks) # Filter out paths that don't match pattern. - prefix_len = len(str(self)) + bool(self._tail) + prefix_len = len(str(self._make_child_relpath('_'))) - 1 match = _compile_pattern(str(path_pattern), sep, case_sensitive) paths = (path for path in paths if match(str(path), prefix_len)) return paths From d73df1b0f935a4ff6d49a536c4a16412a8a81a46 Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 27 Sep 2023 00:42:18 +0100 Subject: [PATCH 17/22] Minor code improvements --- Lib/glob.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index 1d515331993585..5963b1691e0deb 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -274,31 +274,31 @@ def translate(pat, *, recursive=False, include_hidden=False, seps=None): any_sep = f'[{escaped_seps}]' if len(seps) > 1 else escaped_seps not_sep = f'[^{escaped_seps}]' if include_hidden: - one_segment = f'{not_sep}+' + one_last_segment = f'{not_sep}+' + one_segment = f'{one_last_segment}{any_sep}' any_segments = f'(?:.+{any_sep})?' - any_final_segments = '.*' + any_last_segments = '.*' else: - one_segment = f'[^{escaped_seps}.]{not_sep}*' - any_segments = fr'(?:{one_segment}{any_sep})*' - any_final_segments = fr'{any_segments}(?:{one_segment})?' + one_last_segment = f'[^{escaped_seps}.]{not_sep}*' + one_segment = f'{one_last_segment}{any_sep}' + any_segments = fr'(?:{one_segment})*' + any_last_segments = fr'{any_segments}(?:{one_last_segment})?' results = [] parts = re.split(any_sep, pat) last_part_idx = len(parts) - 1 for idx, part in enumerate(parts): + if part == '*': + results.append(one_segment if idx < last_part_idx else one_last_segment) + continue if recursive: if part == '**': - if idx < last_part_idx: - results.append(any_segments) - else: - results.append(any_final_segments) + results.append(any_segments if idx < last_part_idx else any_last_segments) continue elif '**' in part: raise ValueError("Invalid pattern: '**' can only be an entire path component") - if part == '*': - results.append(one_segment) - else: - if not (include_hidden or part.startswith('.')): + if part: + if not include_hidden and part[0] in '*?': results.append(r'(?!\.)') results.extend(fnmatch._translate(part, f'{not_sep}*', not_sep)) if idx < last_part_idx: From c70afe33d93a0b7f9f13b86f1b8213405d9a1c89 Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 27 Sep 2023 00:50:22 +0100 Subject: [PATCH 18/22] Add another test for `include_hidden=False` --- Lib/test/test_glob.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_glob.py b/Lib/test/test_glob.py index c27fc406993c10..6a96cc2cf89d3e 100644 --- a/Lib/test/test_glob.py +++ b/Lib/test/test_glob.py @@ -350,8 +350,7 @@ def test_glob_many_open_files(self): for it in iters: self.assertEqual(next(it), p) - - def test_translate(self): + def test_translate_matching(self): match = re.compile(glob.translate('*')).match self.assertIsNotNone(match('foo')) self.assertIsNotNone(match('foo.bar')) @@ -384,6 +383,26 @@ def test_translate(self): self.assertIsNotNone(match(os.path.join('foo', 'bar.txt'))) self.assertIsNone(match(os.path.join('foo', '.bar.txt'))) + def test_translate(self): + def fn(pat): + return glob.translate(pat, seps='/') + self.assertEqual(fn('foo'), r'(?s:foo)\Z') + self.assertEqual(fn('foo/bar'), r'(?s:foo/bar)\Z') + self.assertEqual(fn('*'), r'(?s:[^/.][^/]*)\Z') + self.assertEqual(fn('?'), r'(?s:(?!\.)[^/])\Z') + self.assertEqual(fn('a*'), r'(?s:a[^/]*)\Z') + self.assertEqual(fn('*a'), r'(?s:(?!\.)[^/]*a)\Z') + self.assertEqual(fn('.*'), r'(?s:\.[^/]*)\Z') + self.assertEqual(fn('?aa'), r'(?s:(?!\.)[^/]aa)\Z') + self.assertEqual(fn('aa?'), r'(?s:aa[^/])\Z') + self.assertEqual(fn('aa[ab]'), r'(?s:aa[ab])\Z') + self.assertEqual(fn('**'), r'(?s:(?!\.)[^/]*)\Z') + self.assertEqual(fn('***'), r'(?s:(?!\.)[^/]*)\Z') + self.assertEqual(fn('a**'), r'(?s:a[^/]*)\Z') + self.assertEqual(fn('**b'), r'(?s:(?!\.)[^/]*b)\Z') + self.assertEqual(fn('/**/*/*.*/**'), + r'(?s:/(?!\.)[^/]*/[^/.][^/]*/(?!\.)[^/]*\.[^/]*/(?!\.)[^/]*)\Z') + def test_translate_include_hidden(self): def fn(pat): return glob.translate(pat, include_hidden=True, seps='/') From f178b14bfef791f149b2cad0d6a49eed051b344d Mon Sep 17 00:00:00 2001 From: barneygale Date: Sat, 30 Sep 2023 21:13:36 +0100 Subject: [PATCH 19/22] Add whatsnew entry --- Doc/whatsnew/3.13.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst index c9e6ca8bf88866..79cd7b70628228 100644 --- a/Doc/whatsnew/3.13.rst +++ b/Doc/whatsnew/3.13.rst @@ -142,6 +142,13 @@ doctest :attr:`doctest.TestResults.skipped` attributes. (Contributed by Victor Stinner in :gh:`108794`.) +glob +---- + +* Add :func:`glob.translate` function that converts a path specification with + shell-style wildcards to a regular expression. + (Contributed by Barney Gale in :gh:`72904`.) + io -- From 4a726aa8063addb2d94e56d3e73ea8d132f8c7dd Mon Sep 17 00:00:00 2001 From: barneygale Date: Sat, 30 Sep 2023 21:33:44 +0100 Subject: [PATCH 20/22] Collapse adjacent `**` segments. --- Lib/glob.py | 6 +++++- Lib/test/test_glob.py | 3 ++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index 5963b1691e0deb..8b921569501d82 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -293,7 +293,11 @@ def translate(pat, *, recursive=False, include_hidden=False, seps=None): continue if recursive: if part == '**': - results.append(any_segments if idx < last_part_idx else any_last_segments) + if idx < last_part_idx: + if parts[idx + 1] != '**': + results.append(any_segments) + else: + results.append(any_last_segments) continue elif '**' in part: raise ValueError("Invalid pattern: '**' can only be an entire path component") diff --git a/Lib/test/test_glob.py b/Lib/test/test_glob.py index 6a96cc2cf89d3e..aa5fac8eca1354 100644 --- a/Lib/test/test_glob.py +++ b/Lib/test/test_glob.py @@ -428,6 +428,7 @@ def fn(pat): self.assertEqual(fn('*'), r'(?s:[^/]+)\Z') self.assertEqual(fn('?'), r'(?s:[^/])\Z') self.assertEqual(fn('**'), r'(?s:.*)\Z') + self.assertEqual(fn('**/**'), r'(?s:.*)\Z') self.assertRaises(ValueError, fn, '***') self.assertRaises(ValueError, fn, 'a**') self.assertRaises(ValueError, fn, '**b') @@ -437,7 +438,7 @@ def test_translate_seps(self): def fn(pat): return glob.translate(pat, recursive=True, include_hidden=True, seps=['/', '\\']) self.assertEqual(fn('foo/bar\\baz'), r'(?s:foo[/\\]bar[/\\]baz)\Z') - self.assertEqual(fn('**/**'), r'(?s:(?:.+[/\\])?.*)\Z') + self.assertEqual(fn('**/*'), r'(?s:(?:.+[/\\])?[^/\\]+)\Z') @skip_unless_symlink From 78292eb172d019d94bbe5962b3dd85736e1026ac Mon Sep 17 00:00:00 2001 From: Barney Gale Date: Sat, 30 Sep 2023 23:21:57 +0100 Subject: [PATCH 21/22] Apply suggestions from code review Co-authored-by: Adam Turner <9087854+AA-Turner@users.noreply.github.com> --- Lib/glob.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index 8b921569501d82..4a335a10766cf4 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -267,10 +267,10 @@ def translate(pat, *, recursive=False, include_hidden=False, seps=None): """ if not seps: if os.path.altsep: - seps = [os.path.sep, os.path.altsep] + seps = (os.path.sep, os.path.altsep) else: seps = os.path.sep - escaped_seps = ''.join(re.escape(sep) for sep in seps) + escaped_seps = ''.join(map(re.escape, seps)) any_sep = f'[{escaped_seps}]' if len(seps) > 1 else escaped_seps not_sep = f'[^{escaped_seps}]' if include_hidden: @@ -281,8 +281,8 @@ def translate(pat, *, recursive=False, include_hidden=False, seps=None): else: one_last_segment = f'[^{escaped_seps}.]{not_sep}*' one_segment = f'{one_last_segment}{any_sep}' - any_segments = fr'(?:{one_segment})*' - any_last_segments = fr'{any_segments}(?:{one_last_segment})?' + any_segments = f'(?:{one_segment})*' + any_last_segments = f'{any_segments}(?:{one_last_segment})?' results = [] parts = re.split(any_sep, pat) From 5d4062c126d9cb72f75016a499aaeadf6103d7e5 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sat, 30 Sep 2023 23:42:34 +0100 Subject: [PATCH 22/22] Add comment explaining regex that consumes "empty" paths. --- Lib/pathlib.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index cad4ca8d15eeec..e57728c16962c4 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -83,7 +83,10 @@ def _compile_pattern(pat, sep, case_sensitive): sensitivity).""" flags = re.NOFLAG if case_sensitive else re.IGNORECASE regex = glob.translate(pat, recursive=True, include_hidden=True, seps=sep) - return re.compile(r'(\.\Z)?+' + regex, flags).match + # The string representation of an empty path is a single dot ('.'). Empty + # paths shouldn't match wildcards, so we consume it with an atomic group. + regex = r'(\.\Z)?+' + regex + return re.compile(regex, flags).match def _select_children(parent_paths, dir_only, follow_symlinks, match):