Skip to content

Commit cf67ebf

Browse files
barneygale, jaraco, and AA-Turner
authored
GH-72904: Add glob.translate() function (#106703)
Add `glob.translate()` function that converts a pathname with shell wildcards to a regular expression. The regular expression is used by pathlib to implement `match()` and `glob()`. This function differs from `fnmatch.translate()` in that wildcards do not match path separators by default, and that a `*` pattern segment matches precisely one path segment. When *recursive* is set to true, `**` pattern segments match any number of path segments, and `**` cannot appear outside its own segment. In pathlib, this change speeds up directory walking (because `_make_child_relpath()` does less work), makes path objects smaller (they don't need a `_lines` slot), and removes the need for some gnarly code. Co-authored-by: Jason R. Coombs <[email protected]> Co-authored-by: Adam Turner <[email protected]>
1 parent babb787 commit cf67ebf

File tree

7 files changed

+229
-106
lines changed

7 files changed

+229
-106
lines changed

Doc/library/glob.rst

+39
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,45 @@ default. For example, consider a directory containing :file:`card.gif` and
145145
>>> glob.glob('.c*')
146146
['.card.gif']
147147

148+
149+
.. function:: translate(pathname, *, recursive=False, include_hidden=False, seps=None)
150+
151+
Convert the given path specification to a regular expression for use with
152+
:func:`re.match`. The path specification can contain shell-style wildcards.
153+
154+
For example:
155+
156+
>>> import glob, re
157+
>>>
158+
>>> regex = glob.translate('**/*.txt', recursive=True, include_hidden=True)
159+
>>> regex
160+
'(?s:(?:.+/)?[^/]*\\.txt)\\Z'
161+
>>> reobj = re.compile(regex)
162+
>>> reobj.match('foo/bar/baz.txt')
163+
<re.Match object; span=(0, 15), match='foo/bar/baz.txt'>
164+
165+
Path separators and segments are meaningful to this function, unlike
166+
:func:`fnmatch.translate`. By default wildcards do not match path
167+
separators, and ``*`` pattern segments match precisely one path segment.
168+
169+
If *recursive* is true, the pattern segment "``**``" will match any number
170+
of path segments. If "``**``" occurs in any position other than a full
171+
pattern segment, :exc:`ValueError` is raised.
172+
173+
If *include_hidden* is true, wildcards can match path segments that start
174+
with a dot (``.``).
175+
176+
A sequence of path separators may be supplied to the *seps* argument. If
177+
not given, :data:`os.sep` and :data:`os.altsep` (if available) are used.
178+
179+
.. seealso::
180+
181+
:meth:`pathlib.PurePath.match` and :meth:`pathlib.Path.glob` methods,
182+
which call this function to implement pattern matching and globbing.
183+
184+
.. versionadded:: 3.13
185+
186+
148187
.. seealso::
149188

150189
Module :mod:`fnmatch`

Doc/whatsnew/3.13.rst

+7
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,13 @@ doctest
183183
:attr:`doctest.TestResults.skipped` attributes.
184184
(Contributed by Victor Stinner in :gh:`108794`.)
185185

186+
glob
187+
----
188+
189+
* Add :func:`glob.translate` function that converts a path specification with
190+
shell-style wildcards to a regular expression.
191+
(Contributed by Barney Gale in :gh:`72904`.)
192+
186193
io
187194
--
188195

Lib/fnmatch.py

+9-2
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,11 @@ def translate(pat):
7878
"""
7979

8080
STAR = object()
81+
parts = _translate(pat, STAR, '.')
82+
return _join_translated_parts(parts, STAR)
83+
84+
85+
def _translate(pat, STAR, QUESTION_MARK):
8186
res = []
8287
add = res.append
8388
i, n = 0, len(pat)
@@ -89,7 +94,7 @@ def translate(pat):
8994
if (not res) or res[-1] is not STAR:
9095
add(STAR)
9196
elif c == '?':
92-
add('.')
97+
add(QUESTION_MARK)
9398
elif c == '[':
9499
j = i
95100
if j < n and pat[j] == '!':
@@ -146,9 +151,11 @@ def translate(pat):
146151
else:
147152
add(re.escape(c))
148153
assert i == n
154+
return res
155+
149156

157+
def _join_translated_parts(inp, STAR):
150158
# Deal with STARs.
151-
inp = res
152159
res = []
153160
add = res.append
154161
i, n = 0, len(inp)

Lib/glob.py

+60
Original file line numberDiff line numberDiff line change
@@ -249,3 +249,63 @@ def escape(pathname):
249249

250250

251251
_dir_open_flags = os.O_RDONLY | getattr(os, 'O_DIRECTORY', 0)
252+
253+
254+
def translate(pat, *, recursive=False, include_hidden=False, seps=None):
255+
"""Translate a pathname with shell wildcards to a regular expression.
256+
257+
If `recursive` is true, the pattern segment '**' will match any number of
258+
path segments; if '**' appears outside its own segment, ValueError will be
259+
raised.
260+
261+
If `include_hidden` is true, wildcards can match path segments beginning
262+
with a dot ('.').
263+
264+
If a sequence of separator characters is given to `seps`, they will be
265+
used to split the pattern into segments and match path separators. If not
266+
given, os.path.sep and os.path.altsep (where available) are used.
267+
"""
268+
if not seps:
269+
if os.path.altsep:
270+
seps = (os.path.sep, os.path.altsep)
271+
else:
272+
seps = os.path.sep
273+
escaped_seps = ''.join(map(re.escape, seps))
274+
any_sep = f'[{escaped_seps}]' if len(seps) > 1 else escaped_seps
275+
not_sep = f'[^{escaped_seps}]'
276+
if include_hidden:
277+
one_last_segment = f'{not_sep}+'
278+
one_segment = f'{one_last_segment}{any_sep}'
279+
any_segments = f'(?:.+{any_sep})?'
280+
any_last_segments = '.*'
281+
else:
282+
one_last_segment = f'[^{escaped_seps}.]{not_sep}*'
283+
one_segment = f'{one_last_segment}{any_sep}'
284+
any_segments = f'(?:{one_segment})*'
285+
any_last_segments = f'{any_segments}(?:{one_last_segment})?'
286+
287+
results = []
288+
parts = re.split(any_sep, pat)
289+
last_part_idx = len(parts) - 1
290+
for idx, part in enumerate(parts):
291+
if part == '*':
292+
results.append(one_segment if idx < last_part_idx else one_last_segment)
293+
continue
294+
if recursive:
295+
if part == '**':
296+
if idx < last_part_idx:
297+
if parts[idx + 1] != '**':
298+
results.append(any_segments)
299+
else:
300+
results.append(any_last_segments)
301+
continue
302+
elif '**' in part:
303+
raise ValueError("Invalid pattern: '**' can only be an entire path component")
304+
if part:
305+
if not include_hidden and part[0] in '*?':
306+
results.append(r'(?!\.)')
307+
results.extend(fnmatch._translate(part, f'{not_sep}*', not_sep))
308+
if idx < last_part_idx:
309+
results.append(any_sep)
310+
res = ''.join(results)
311+
return fr'(?s:{res})\Z'

Lib/pathlib.py

+21-104
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@
66
"""
77

88
import contextlib
9-
import fnmatch
109
import functools
10+
import glob
1111
import io
1212
import ntpath
1313
import os
@@ -76,78 +76,16 @@ def _is_case_sensitive(pathmod):
7676
#
7777

7878

79-
# fnmatch.translate() returns a regular expression that includes a prefix and
80-
# a suffix, which enable matching newlines and ensure the end of the string is
81-
# matched, respectively. These features are undesirable for our implementation
82-
# of PurePatch.match(), which represents path separators as newlines and joins
83-
# pattern segments together. As a workaround, we define a slice object that
84-
# can remove the prefix and suffix from any translate() result. See the
85-
# _compile_pattern_lines() function for more details.
86-
_FNMATCH_PREFIX, _FNMATCH_SUFFIX = fnmatch.translate('_').split('_')
87-
_FNMATCH_SLICE = slice(len(_FNMATCH_PREFIX), -len(_FNMATCH_SUFFIX))
88-
_SWAP_SEP_AND_NEWLINE = {
89-
'/': str.maketrans({'/': '\n', '\n': '/'}),
90-
'\\': str.maketrans({'\\': '\n', '\n': '\\'}),
91-
}
92-
93-
9479
@functools.lru_cache(maxsize=256)
95-
def _compile_pattern(pat, case_sensitive):
80+
def _compile_pattern(pat, sep, case_sensitive):
9681
"""Compile given glob pattern to a re.Pattern object (observing case
97-
sensitivity), or None if the pattern should match everything."""
98-
if pat == '*':
99-
return None
82+
sensitivity)."""
10083
flags = re.NOFLAG if case_sensitive else re.IGNORECASE
101-
return re.compile(fnmatch.translate(pat), flags).match
102-
103-
104-
@functools.lru_cache()
105-
def _compile_pattern_lines(pattern_lines, case_sensitive):
106-
"""Compile the given pattern lines to an `re.Pattern` object.
107-
108-
The *pattern_lines* argument is a glob-style pattern (e.g. '**/*.py') with
109-
its path separators and newlines swapped (e.g. '**\n*.py`). By using
110-
newlines to separate path components, and not setting `re.DOTALL`, we
111-
ensure that the `*` wildcard cannot match path separators.
112-
113-
The returned `re.Pattern` object may have its `match()` method called to
114-
match a complete pattern, or `search()` to match from the right. The
115-
argument supplied to these methods must also have its path separators and
116-
newlines swapped.
117-
"""
118-
119-
# Match the start of the path, or just after a path separator
120-
parts = ['^']
121-
for part in pattern_lines.splitlines(keepends=True):
122-
if part == '*\n':
123-
part = r'.+\n'
124-
elif part == '*':
125-
part = r'.+'
126-
elif part == '**\n':
127-
# '**/' component: we use '(?s:.)' rather than '.' so that path
128-
# separators (i.e. newlines) are matched. The trailing '^' ensures
129-
# we terminate after a path separator (i.e. on a new line).
130-
part = r'(?s:.)*^'
131-
elif part == '**':
132-
# '**' component.
133-
part = r'(?s:.)*'
134-
elif '**' in part:
135-
raise ValueError("Invalid pattern: '**' can only be an entire path component")
136-
else:
137-
# Any other component: pass to fnmatch.translate(). We slice off
138-
# the common prefix and suffix added by translate() to ensure that
139-
# re.DOTALL is not set, and the end of the string not matched,
140-
# respectively. With DOTALL not set, '*' wildcards will not match
141-
# path separators, because the '.' characters in the pattern will
142-
# not match newlines.
143-
part = fnmatch.translate(part)[_FNMATCH_SLICE]
144-
parts.append(part)
145-
# Match the end of the path, always.
146-
parts.append(r'\Z')
147-
flags = re.MULTILINE
148-
if not case_sensitive:
149-
flags |= re.IGNORECASE
150-
return re.compile(''.join(parts), flags=flags)
84+
regex = glob.translate(pat, recursive=True, include_hidden=True, seps=sep)
85+
# The string representation of an empty path is a single dot ('.'). Empty
86+
# paths shouldn't match wildcards, so we consume it with a possessive quantifier.
87+
regex = r'(\.\Z)?+' + regex
88+
return re.compile(regex, flags).match
15189

15290

15391
def _select_children(parent_paths, dir_only, follow_symlinks, match):
@@ -171,7 +109,7 @@ def _select_children(parent_paths, dir_only, follow_symlinks, match):
171109
except OSError:
172110
continue
173111
name = entry.name
174-
if match is None or match(name):
112+
if match(name):
175113
yield parent_path._make_child_relpath(name)
176114

177115

@@ -297,10 +235,6 @@ class PurePath:
297235
# to implement comparison methods like `__lt__()`.
298236
'_parts_normcase_cached',
299237

300-
# The `_lines_cached` slot stores the string path with path separators
301-
# and newlines swapped. This is used to implement `match()`.
302-
'_lines_cached',
303-
304238
# The `_hash` slot stores the hash of the case-normalized string
305239
# path. It's set when `__hash__()` is called for the first time.
306240
'_hash',
@@ -475,20 +409,6 @@ def _parts_normcase(self):
475409
self._parts_normcase_cached = self._str_normcase.split(self.pathmod.sep)
476410
return self._parts_normcase_cached
477411

478-
@property
479-
def _lines(self):
480-
# Path with separators and newlines swapped, for pattern matching.
481-
try:
482-
return self._lines_cached
483-
except AttributeError:
484-
path_str = str(self)
485-
if path_str == '.':
486-
self._lines_cached = ''
487-
else:
488-
trans = _SWAP_SEP_AND_NEWLINE[self.pathmod.sep]
489-
self._lines_cached = path_str.translate(trans)
490-
return self._lines_cached
491-
492412
def __eq__(self, other):
493413
if not isinstance(other, PurePath):
494414
return NotImplemented
@@ -763,13 +683,16 @@ def match(self, path_pattern, *, case_sensitive=None):
763683
path_pattern = self.with_segments(path_pattern)
764684
if case_sensitive is None:
765685
case_sensitive = _is_case_sensitive(self.pathmod)
766-
pattern = _compile_pattern_lines(path_pattern._lines, case_sensitive)
686+
sep = path_pattern.pathmod.sep
687+
pattern_str = str(path_pattern)
767688
if path_pattern.drive or path_pattern.root:
768-
return pattern.match(self._lines) is not None
689+
pass
769690
elif path_pattern._tail:
770-
return pattern.search(self._lines) is not None
691+
pattern_str = f'**{sep}{pattern_str}'
771692
else:
772693
raise ValueError("empty pattern")
694+
match = _compile_pattern(pattern_str, sep, case_sensitive)
695+
return match(str(self)) is not None
773696

774697

775698
# Subclassing os.PathLike makes isinstance() checks slower,
@@ -1069,26 +992,19 @@ def _scandir(self):
1069992
return contextlib.nullcontext(self.iterdir())
1070993

1071994
def _make_child_relpath(self, name):
1072-
sep = self.pathmod.sep
1073-
lines_name = name.replace('\n', sep)
1074-
lines_str = self._lines
1075995
path_str = str(self)
1076996
tail = self._tail
1077997
if tail:
1078-
path_str = f'{path_str}{sep}{name}'
1079-
lines_str = f'{lines_str}\n{lines_name}'
998+
path_str = f'{path_str}{self.pathmod.sep}{name}'
1080999
elif path_str != '.':
10811000
path_str = f'{path_str}{name}'
1082-
lines_str = f'{lines_str}{lines_name}'
10831001
else:
10841002
path_str = name
1085-
lines_str = lines_name
10861003
path = self.with_segments(path_str)
10871004
path._str = path_str
10881005
path._drv = self.drive
10891006
path._root = self.root
10901007
path._tail_cached = tail + [name]
1091-
path._lines_cached = lines_str
10921008
return path
10931009

10941010
def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
@@ -1139,6 +1055,7 @@ def _glob(self, pattern, case_sensitive, follow_symlinks):
11391055
# do not perform any filesystem access, which can be much faster!
11401056
filter_paths = follow_symlinks is not None and '..' not in pattern_parts
11411057
deduplicate_paths = False
1058+
sep = self.pathmod.sep
11421059
paths = iter([self] if self.is_dir() else [])
11431060
part_idx = 0
11441061
while part_idx < len(pattern_parts):
@@ -1159,9 +1076,9 @@ def _glob(self, pattern, case_sensitive, follow_symlinks):
11591076
paths = _select_recursive(paths, dir_only, follow_symlinks)
11601077

11611078
# Filter out paths that don't match pattern.
1162-
prefix_len = len(self._make_child_relpath('_')._lines) - 1
1163-
match = _compile_pattern_lines(path_pattern._lines, case_sensitive).match
1164-
paths = (path for path in paths if match(path._lines[prefix_len:]))
1079+
prefix_len = len(str(self._make_child_relpath('_'))) - 1
1080+
match = _compile_pattern(str(path_pattern), sep, case_sensitive)
1081+
paths = (path for path in paths if match(str(path), prefix_len))
11651082
return paths
11661083

11671084
dir_only = part_idx < len(pattern_parts)
@@ -1174,7 +1091,7 @@ def _glob(self, pattern, case_sensitive, follow_symlinks):
11741091
raise ValueError("Invalid pattern: '**' can only be an entire path component")
11751092
else:
11761093
dir_only = part_idx < len(pattern_parts)
1177-
match = _compile_pattern(part, case_sensitive)
1094+
match = _compile_pattern(part, sep, case_sensitive)
11781095
paths = _select_children(paths, dir_only, follow_symlinks, match)
11791096
return paths
11801097

0 commit comments

Comments
 (0)