Skip to content

Commit 6313cdd

Browse files
authored
GH-79634: Accept path-like objects as pathlib glob patterns. (#114017)
Allow `os.PathLike` objects to be passed as patterns to `pathlib.Path.glob()` and `rglob()`. (It's already possible to use them in `PurePath.match()`.)

While we're in the area:

- Allow empty glob patterns in `PathBase` (but not `Path`).
- Speed up globbing in `PathBase` by generating paths with trailing slashes only as a final step, rather than for every intermediate directory.
- Simplify and speed up handling of rare patterns involving both `**` and `..` segments.
1 parent 681e9e8 commit 6313cdd

File tree

6 files changed

+115
-72
lines changed

6 files changed

+115
-72
lines changed

Doc/library/pathlib.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1036,6 +1036,9 @@ call fails (for example because the path doesn't exist).
10361036
future Python release, patterns with this ending will match both files
10371037
and directories. Add a trailing slash to match only directories.
10381038

1039+
.. versionchanged:: 3.13
1040+
The *pattern* parameter accepts a :term:`path-like object`.
1041+
10391042
.. method:: Path.group(*, follow_symlinks=True)
10401043

10411044
Return the name of the group owning the file. :exc:`KeyError` is raised
@@ -1498,6 +1501,9 @@ call fails (for example because the path doesn't exist).
14981501
.. versionchanged:: 3.13
14991502
The *follow_symlinks* parameter was added.
15001503

1504+
.. versionchanged:: 3.13
1505+
The *pattern* parameter accepts a :term:`path-like object`.
1506+
15011507
.. method:: Path.rmdir()
15021508

15031509
Remove this directory. The directory must be empty.

Lib/pathlib/__init__.py

Lines changed: 31 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -467,6 +467,29 @@ def as_uri(self):
467467
from urllib.parse import quote_from_bytes
468468
return prefix + quote_from_bytes(os.fsencode(path))
469469

470+
@property
471+
def _pattern_stack(self):
472+
"""Stack of path components, to be used with patterns in glob()."""
473+
parts = self._tail.copy()
474+
pattern = self._raw_path
475+
if self.anchor:
476+
raise NotImplementedError("Non-relative patterns are unsupported")
477+
elif not parts:
478+
raise ValueError("Unacceptable pattern: {!r}".format(pattern))
479+
elif pattern[-1] in (self.pathmod.sep, self.pathmod.altsep):
480+
# GH-65238: pathlib doesn't preserve trailing slash. Add it back.
481+
parts.append('')
482+
elif parts[-1] == '**':
483+
# GH-70303: '**' only matches directories. Add trailing slash.
484+
warnings.warn(
485+
"Pattern ending '**' will match files and directories in a "
486+
"future Python release. Add a trailing slash to match only "
487+
"directories and remove this warning.",
488+
FutureWarning, 4)
489+
parts.append('')
490+
parts.reverse()
491+
return parts
492+
470493

471494
# Subclassing os.PathLike makes isinstance() checks slower,
472495
# which in turn makes Path construction slower. Register instead!
@@ -580,7 +603,7 @@ def iterdir(self):
580603
def _scandir(self):
581604
return os.scandir(self)
582605

583-
def _make_child_entry(self, entry, is_dir=False):
606+
def _make_child_entry(self, entry):
584607
# Transform an entry yielded from _scandir() into a path object.
585608
path_str = entry.name if str(self) == '.' else entry.path
586609
path = self.with_segments(path_str)
@@ -591,6 +614,8 @@ def _make_child_entry(self, entry, is_dir=False):
591614
return path
592615

593616
def _make_child_relpath(self, name):
617+
if not name:
618+
return self
594619
path_str = str(self)
595620
tail = self._tail
596621
if tail:
@@ -611,14 +636,8 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
611636
kind, including directories) matching the given relative pattern.
612637
"""
613638
sys.audit("pathlib.Path.glob", self, pattern)
614-
if pattern.endswith('**'):
615-
# GH-70303: '**' only matches directories. Add trailing slash.
616-
warnings.warn(
617-
"Pattern ending '**' will match files and directories in a "
618-
"future Python release. Add a trailing slash to match only "
619-
"directories and remove this warning.",
620-
FutureWarning, 2)
621-
pattern = f'{pattern}/'
639+
if not isinstance(pattern, PurePath):
640+
pattern = self.with_segments(pattern)
622641
return _abc.PathBase.glob(
623642
self, pattern, case_sensitive=case_sensitive, follow_symlinks=follow_symlinks)
624643

@@ -628,15 +647,9 @@ def rglob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
628647
this subtree.
629648
"""
630649
sys.audit("pathlib.Path.rglob", self, pattern)
631-
if pattern.endswith('**'):
632-
# GH-70303: '**' only matches directories. Add trailing slash.
633-
warnings.warn(
634-
"Pattern ending '**' will match files and directories in a "
635-
"future Python release. Add a trailing slash to match only "
636-
"directories and remove this warning.",
637-
FutureWarning, 2)
638-
pattern = f'{pattern}/'
639-
pattern = f'**/{pattern}'
650+
if not isinstance(pattern, PurePath):
651+
pattern = self.with_segments(pattern)
652+
pattern = '**' / pattern
640653
return _abc.PathBase.glob(
641654
self, pattern, case_sensitive=case_sensitive, follow_symlinks=follow_symlinks)
642655

Lib/pathlib/_abc.py

Lines changed: 47 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,12 @@ def _compile_pattern(pat, sep, case_sensitive):
6363
return re.compile(regex, flags=flags).match
6464

6565

66+
def _select_special(paths, part):
67+
"""Yield special literal children of the given paths."""
68+
for path in paths:
69+
yield path._make_child_relpath(part)
70+
71+
6672
def _select_children(parent_paths, dir_only, follow_symlinks, match):
6773
"""Yield direct children of given paths, filtering by name and type."""
6874
if follow_symlinks is None:
@@ -84,7 +90,7 @@ def _select_children(parent_paths, dir_only, follow_symlinks, match):
8490
except OSError:
8591
continue
8692
if match(entry.name):
87-
yield parent_path._make_child_entry(entry, dir_only)
93+
yield parent_path._make_child_entry(entry)
8894

8995

9096
def _select_recursive(parent_paths, dir_only, follow_symlinks):
@@ -107,7 +113,7 @@ def _select_recursive(parent_paths, dir_only, follow_symlinks):
107113
for entry in entries:
108114
try:
109115
if entry.is_dir(follow_symlinks=follow_symlinks):
110-
paths.append(path._make_child_entry(entry, dir_only))
116+
paths.append(path._make_child_entry(entry))
111117
continue
112118
except OSError:
113119
pass
@@ -427,6 +433,14 @@ def is_absolute(self):
427433
a drive)."""
428434
return self.pathmod.isabs(self._raw_path)
429435

436+
@property
437+
def _pattern_stack(self):
438+
"""Stack of path components, to be used with patterns in glob()."""
439+
anchor, parts = self._stack
440+
if anchor:
441+
raise NotImplementedError("Non-relative patterns are unsupported")
442+
return parts
443+
430444
def match(self, path_pattern, *, case_sensitive=None):
431445
"""
432446
Return True if this path matches the given pattern.
@@ -436,11 +450,10 @@ def match(self, path_pattern, *, case_sensitive=None):
436450
if case_sensitive is None:
437451
case_sensitive = _is_case_sensitive(self.pathmod)
438452
sep = path_pattern.pathmod.sep
439-
pattern_str = str(path_pattern)
440453
if path_pattern.anchor:
441-
pass
454+
pattern_str = str(path_pattern)
442455
elif path_pattern.parts:
443-
pattern_str = f'**{sep}{pattern_str}'
456+
pattern_str = str('**' / path_pattern)
444457
else:
445458
raise ValueError("empty pattern")
446459
match = _compile_pattern(pattern_str, sep, case_sensitive)
@@ -714,10 +727,8 @@ def _scandir(self):
714727
from contextlib import nullcontext
715728
return nullcontext(self.iterdir())
716729

717-
def _make_child_entry(self, entry, is_dir=False):
730+
def _make_child_entry(self, entry):
718731
# Transform an entry yielded from _scandir() into a path object.
719-
if is_dir:
720-
return entry.joinpath('')
721732
return entry
722733

723734
def _make_child_relpath(self, name):
@@ -727,57 +738,35 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
727738
"""Iterate over this subtree and yield all existing files (of any
728739
kind, including directories) matching the given relative pattern.
729740
"""
730-
path_pattern = self.with_segments(pattern)
731-
if path_pattern.anchor:
732-
raise NotImplementedError("Non-relative patterns are unsupported")
733-
elif not path_pattern.parts:
734-
raise ValueError("Unacceptable pattern: {!r}".format(pattern))
735-
736-
pattern_parts = list(path_pattern.parts)
737-
if not self.pathmod.split(pattern)[1]:
738-
# GH-65238: pathlib doesn't preserve trailing slash. Add it back.
739-
pattern_parts.append('')
740-
741+
if not isinstance(pattern, PurePathBase):
742+
pattern = self.with_segments(pattern)
741743
if case_sensitive is None:
742744
# TODO: evaluate case-sensitivity of each directory in _select_children().
743745
case_sensitive = _is_case_sensitive(self.pathmod)
744746

745-
# If symlinks are handled consistently, and the pattern does not
746-
# contain '..' components, then we can use a 'walk-and-match' strategy
747-
# when expanding '**' wildcards. When a '**' wildcard is encountered,
748-
# all following pattern parts are immediately consumed and used to
749-
# build a `re.Pattern` object. This pattern is used to filter the
750-
# recursive walk. As a result, pattern parts following a '**' wildcard
751-
# do not perform any filesystem access, which can be much faster!
752-
filter_paths = follow_symlinks is not None and '..' not in pattern_parts
747+
stack = pattern._pattern_stack
748+
specials = ('', '.', '..')
749+
filter_paths = False
753750
deduplicate_paths = False
754751
sep = self.pathmod.sep
755752
paths = iter([self.joinpath('')] if self.is_dir() else [])
756-
part_idx = 0
757-
while part_idx < len(pattern_parts):
758-
part = pattern_parts[part_idx]
759-
part_idx += 1
760-
if part == '':
761-
# Trailing slash.
762-
pass
763-
elif part == '..':
764-
paths = (path._make_child_relpath('..') for path in paths)
753+
while stack:
754+
part = stack.pop()
755+
if part in specials:
756+
paths = _select_special(paths, part)
765757
elif part == '**':
766758
# Consume adjacent '**' components.
767-
while part_idx < len(pattern_parts) and pattern_parts[part_idx] == '**':
768-
part_idx += 1
769-
770-
if filter_paths and part_idx < len(pattern_parts) and pattern_parts[part_idx] != '':
771-
dir_only = pattern_parts[-1] == ''
772-
paths = _select_recursive(paths, dir_only, follow_symlinks)
759+
while stack and stack[-1] == '**':
760+
stack.pop()
773761

774-
# Filter out paths that don't match pattern.
775-
prefix_len = len(str(self._make_child_relpath('_'))) - 1
776-
match = _compile_pattern(str(path_pattern), sep, case_sensitive)
777-
paths = (path for path in paths if match(str(path), prefix_len))
778-
return paths
762+
# Consume adjacent non-special components and enable post-walk
763+
# regex filtering, provided we're treating symlinks consistently.
764+
if follow_symlinks is not None:
765+
while stack and stack[-1] not in specials:
766+
filter_paths = True
767+
stack.pop()
779768

780-
dir_only = part_idx < len(pattern_parts)
769+
dir_only = bool(stack)
781770
paths = _select_recursive(paths, dir_only, follow_symlinks)
782771
if deduplicate_paths:
783772
# De-duplicate if we've already seen a '**' component.
@@ -786,18 +775,25 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
786775
elif '**' in part:
787776
raise ValueError("Invalid pattern: '**' can only be an entire path component")
788777
else:
789-
dir_only = part_idx < len(pattern_parts)
778+
dir_only = bool(stack)
790779
match = _compile_pattern(part, sep, case_sensitive)
791780
paths = _select_children(paths, dir_only, follow_symlinks, match)
781+
if filter_paths:
782+
# Filter out paths that don't match pattern.
783+
prefix_len = len(str(self._make_child_relpath('_'))) - 1
784+
match = _compile_pattern(str(pattern), sep, case_sensitive)
785+
paths = (path for path in paths if match(str(path), prefix_len))
792786
return paths
793787

794788
def rglob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
795789
"""Recursively yield all existing files (of any kind, including
796790
directories) matching the given relative pattern, anywhere in
797791
this subtree.
798792
"""
799-
return self.glob(
800-
f'**/{pattern}', case_sensitive=case_sensitive, follow_symlinks=follow_symlinks)
793+
if not isinstance(pattern, PurePathBase):
794+
pattern = self.with_segments(pattern)
795+
pattern = '**' / pattern
796+
return self.glob(pattern, case_sensitive=case_sensitive, follow_symlinks=follow_symlinks)
801797

802798
def walk(self, top_down=True, on_error=None, follow_symlinks=False):
803799
"""Walk the directory tree from this directory, similar to os.walk()."""

Lib/test/test_pathlib/test_pathlib.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1818,6 +1818,13 @@ def test_walk_above_recursion_limit(self):
18181818
list(base.walk())
18191819
list(base.walk(top_down=False))
18201820

1821+
def test_glob_empty_pattern(self):
1822+
p = self.cls('')
1823+
with self.assertRaisesRegex(ValueError, 'Unacceptable pattern'):
1824+
list(p.glob(''))
1825+
with self.assertRaisesRegex(ValueError, 'Unacceptable pattern'):
1826+
list(p.glob('.'))
1827+
18211828
def test_glob_many_open_files(self):
18221829
depth = 30
18231830
P = self.cls
@@ -1860,6 +1867,22 @@ def test_glob_recursive_no_trailing_slash(self):
18601867
with self.assertWarns(FutureWarning):
18611868
p.rglob('*/**')
18621869

1870+
def test_glob_pathlike(self):
1871+
P = self.cls
1872+
p = P(self.base)
1873+
pattern = "dir*/file*"
1874+
expect = {p / "dirB/fileB", p / "dirC/fileC"}
1875+
self.assertEqual(expect, set(p.glob(P(pattern))))
1876+
self.assertEqual(expect, set(p.glob(FakePath(pattern))))
1877+
1878+
def test_rglob_pathlike(self):
1879+
P = self.cls
1880+
p = P(self.base, "dirC")
1881+
pattern = "**/file*"
1882+
expect = {p / "fileC", p / "dirD/fileD"}
1883+
self.assertEqual(expect, set(p.rglob(P(pattern))))
1884+
self.assertEqual(expect, set(p.rglob(FakePath(pattern))))
1885+
18631886

18641887
@only_posix
18651888
class PosixPathTest(PathTest, PurePosixPathTest):

Lib/test/test_pathlib/test_pathlib_abc.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1045,9 +1045,12 @@ def _check(glob, expected):
10451045
_check(p.glob("*/"), ["dirA/", "dirB/", "dirC/", "dirE/", "linkB/"])
10461046

10471047
def test_glob_empty_pattern(self):
1048-
p = self.cls('')
1049-
with self.assertRaisesRegex(ValueError, 'Unacceptable pattern'):
1050-
list(p.glob(''))
1048+
def _check(glob, expected):
1049+
self.assertEqual(set(glob), { P(self.base, q) for q in expected })
1050+
P = self.cls
1051+
p = P(self.base)
1052+
_check(p.glob(""), [""])
1053+
_check(p.glob("."), ["."])
10511054

10521055
def test_glob_case_sensitive(self):
10531056
P = self.cls
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Accept :term:`path-like objects <path-like object>` as patterns in
2+
:meth:`pathlib.Path.glob` and :meth:`~pathlib.Path.rglob`.

0 commit comments

Comments
 (0)