Skip to content

GH-79634: Accept path-like objects as pathlib glob patterns. #114017

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Jan 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions Doc/library/pathlib.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1020,6 +1020,9 @@ call fails (for example because the path doesn't exist).
future Python release, patterns with this ending will match both files
and directories. Add a trailing slash to match only directories.

.. versionchanged:: 3.13
The *pattern* parameter accepts a :term:`path-like object`.

.. method:: Path.group(*, follow_symlinks=True)

Return the name of the group owning the file. :exc:`KeyError` is raised
Expand Down Expand Up @@ -1482,6 +1485,9 @@ call fails (for example because the path doesn't exist).
.. versionchanged:: 3.13
The *follow_symlinks* parameter was added.

.. versionchanged:: 3.13
The *pattern* parameter accepts a :term:`path-like object`.

.. method:: Path.rmdir()

Remove this directory. The directory must be empty.
Expand Down
49 changes: 31 additions & 18 deletions Lib/pathlib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -467,6 +467,29 @@ def as_uri(self):
from urllib.parse import quote_from_bytes
return prefix + quote_from_bytes(os.fsencode(path))

@property
def _pattern_stack(self):
"""Stack of path components, to be used with patterns in glob()."""
# Copy the tail so the append()/reverse() calls below do not mutate
# this path object's own component list.
parts = self._tail.copy()
pattern = self._raw_path
# Validation order: anchored patterns are rejected first, then patterns
# with no components at all.
if self.anchor:
raise NotImplementedError("Non-relative patterns are unsupported")
elif not parts:
raise ValueError("Unacceptable pattern: {!r}".format(pattern))
# The raw (unnormalised) pattern string is inspected here because the
# parsed components no longer carry the trailing separator.
elif pattern[-1] in (self.pathmod.sep, self.pathmod.altsep):
# GH-65238: pathlib doesn't preserve trailing slash. Add it back.
parts.append('')
elif parts[-1] == '**':
# GH-70303: '**' only matches directories. Add trailing slash.
# NOTE(review): the positional 4 is the stacklevel; presumably chosen so
# the warning points at the caller of glob()/rglob() -- confirm against
# the actual call depth.
warnings.warn(
"Pattern ending '**' will match files and directories in a "
"future Python release. Add a trailing slash to match only "
"directories and remove this warning.",
FutureWarning, 4)
parts.append('')
# Reverse in place so that popping from the end of the list yields the
# components in left-to-right pattern order.
parts.reverse()
return parts


# Subclassing os.PathLike makes isinstance() checks slower,
# which in turn makes Path construction slower. Register instead!
Expand Down Expand Up @@ -580,7 +603,7 @@ def iterdir(self):
def _scandir(self):
# Thin wrapper: hands this path object directly to os.scandir().
return os.scandir(self)

def _make_child_entry(self, entry, is_dir=False):
def _make_child_entry(self, entry):
# Transform an entry yielded from _scandir() into a path object.
path_str = entry.name if str(self) == '.' else entry.path
path = self.with_segments(path_str)
Expand All @@ -591,6 +614,8 @@ def _make_child_entry(self, entry, is_dir=False):
return path

def _make_child_relpath(self, name):
if not name:
return self
path_str = str(self)
tail = self._tail
if tail:
Expand All @@ -611,14 +636,8 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
kind, including directories) matching the given relative pattern.
"""
sys.audit("pathlib.Path.glob", self, pattern)
if pattern.endswith('**'):
# GH-70303: '**' only matches directories. Add trailing slash.
warnings.warn(
"Pattern ending '**' will match files and directories in a "
"future Python release. Add a trailing slash to match only "
"directories and remove this warning.",
FutureWarning, 2)
pattern = f'{pattern}/'
if not isinstance(pattern, PurePath):
pattern = self.with_segments(pattern)
return _abc.PathBase.glob(
self, pattern, case_sensitive=case_sensitive, follow_symlinks=follow_symlinks)

Expand All @@ -628,15 +647,9 @@ def rglob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
this subtree.
"""
sys.audit("pathlib.Path.rglob", self, pattern)
if pattern.endswith('**'):
# GH-70303: '**' only matches directories. Add trailing slash.
warnings.warn(
"Pattern ending '**' will match files and directories in a "
"future Python release. Add a trailing slash to match only "
"directories and remove this warning.",
FutureWarning, 2)
pattern = f'{pattern}/'
pattern = f'**/{pattern}'
if not isinstance(pattern, PurePath):
pattern = self.with_segments(pattern)
pattern = '**' / pattern
return _abc.PathBase.glob(
self, pattern, case_sensitive=case_sensitive, follow_symlinks=follow_symlinks)

Expand Down
98 changes: 47 additions & 51 deletions Lib/pathlib/_abc.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,12 @@ def _compile_pattern(pat, sep, case_sensitive):
return re.compile(regex, flags=flags).match


def _select_special(paths, part):
"""Yield special literal children of the given paths."""
# Lazily maps each incoming path to path._make_child_relpath(part); no
# filtering or filesystem checks happen in this helper itself.
# glob() routes the '', '.' and '..' pattern components through here.
for path in paths:
yield path._make_child_relpath(part)


def _select_children(parent_paths, dir_only, follow_symlinks, match):
"""Yield direct children of given paths, filtering by name and type."""
if follow_symlinks is None:
Expand All @@ -84,7 +90,7 @@ def _select_children(parent_paths, dir_only, follow_symlinks, match):
except OSError:
continue
if match(entry.name):
yield parent_path._make_child_entry(entry, dir_only)
yield parent_path._make_child_entry(entry)


def _select_recursive(parent_paths, dir_only, follow_symlinks):
Expand All @@ -107,7 +113,7 @@ def _select_recursive(parent_paths, dir_only, follow_symlinks):
for entry in entries:
try:
if entry.is_dir(follow_symlinks=follow_symlinks):
paths.append(path._make_child_entry(entry, dir_only))
paths.append(path._make_child_entry(entry))
continue
except OSError:
pass
Expand Down Expand Up @@ -427,6 +433,14 @@ def is_absolute(self):
a drive)."""
return self.pathmod.isabs(self._raw_path)

@property
def _pattern_stack(self):
"""Stack of path components, to be used with patterns in glob()."""
# self._stack is unpacked as an (anchor, parts) pair.
# NOTE(review): parts is returned unchanged, so it is presumably already
# ordered for pop()-based consumption -- confirm against _stack.
anchor, parts = self._stack
# Any truthy anchor means the pattern is not relative, which is rejected.
if anchor:
raise NotImplementedError("Non-relative patterns are unsupported")
return parts

def match(self, path_pattern, *, case_sensitive=None):
"""
Return True if this path matches the given pattern.
Expand All @@ -436,11 +450,10 @@ def match(self, path_pattern, *, case_sensitive=None):
if case_sensitive is None:
case_sensitive = _is_case_sensitive(self.pathmod)
sep = path_pattern.pathmod.sep
pattern_str = str(path_pattern)
if path_pattern.anchor:
pass
pattern_str = str(path_pattern)
elif path_pattern.parts:
pattern_str = f'**{sep}{pattern_str}'
pattern_str = str('**' / path_pattern)
else:
raise ValueError("empty pattern")
match = _compile_pattern(pattern_str, sep, case_sensitive)
Expand Down Expand Up @@ -714,10 +727,8 @@ def _scandir(self):
from contextlib import nullcontext
return nullcontext(self.iterdir())

def _make_child_entry(self, entry, is_dir=False):
def _make_child_entry(self, entry):
# Transform an entry yielded from _scandir() into a path object.
if is_dir:
return entry.joinpath('')
return entry

def _make_child_relpath(self, name):
Expand All @@ -727,57 +738,35 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
"""Iterate over this subtree and yield all existing files (of any
kind, including directories) matching the given relative pattern.
"""
path_pattern = self.with_segments(pattern)
if path_pattern.anchor:
raise NotImplementedError("Non-relative patterns are unsupported")
elif not path_pattern.parts:
raise ValueError("Unacceptable pattern: {!r}".format(pattern))

pattern_parts = list(path_pattern.parts)
if not self.pathmod.split(pattern)[1]:
# GH-65238: pathlib doesn't preserve trailing slash. Add it back.
pattern_parts.append('')

if not isinstance(pattern, PurePathBase):
pattern = self.with_segments(pattern)
if case_sensitive is None:
# TODO: evaluate case-sensitivity of each directory in _select_children().
case_sensitive = _is_case_sensitive(self.pathmod)

# If symlinks are handled consistently, and the pattern does not
# contain '..' components, then we can use a 'walk-and-match' strategy
# when expanding '**' wildcards. When a '**' wildcard is encountered,
# all following pattern parts are immediately consumed and used to
# build a `re.Pattern` object. This pattern is used to filter the
# recursive walk. As a result, pattern parts following a '**' wildcard
# do not perform any filesystem access, which can be much faster!
filter_paths = follow_symlinks is not None and '..' not in pattern_parts
stack = pattern._pattern_stack
specials = ('', '.', '..')
filter_paths = False
deduplicate_paths = False
sep = self.pathmod.sep
paths = iter([self.joinpath('')] if self.is_dir() else [])
part_idx = 0
while part_idx < len(pattern_parts):
part = pattern_parts[part_idx]
part_idx += 1
if part == '':
# Trailing slash.
pass
elif part == '..':
paths = (path._make_child_relpath('..') for path in paths)
while stack:
part = stack.pop()
if part in specials:
paths = _select_special(paths, part)
elif part == '**':
# Consume adjacent '**' components.
while part_idx < len(pattern_parts) and pattern_parts[part_idx] == '**':
part_idx += 1

if filter_paths and part_idx < len(pattern_parts) and pattern_parts[part_idx] != '':
dir_only = pattern_parts[-1] == ''
paths = _select_recursive(paths, dir_only, follow_symlinks)
while stack and stack[-1] == '**':
stack.pop()

# Filter out paths that don't match pattern.
prefix_len = len(str(self._make_child_relpath('_'))) - 1
match = _compile_pattern(str(path_pattern), sep, case_sensitive)
paths = (path for path in paths if match(str(path), prefix_len))
return paths
# Consume adjacent non-special components and enable post-walk
# regex filtering, provided we're treating symlinks consistently.
if follow_symlinks is not None:
while stack and stack[-1] not in specials:
filter_paths = True
stack.pop()

dir_only = part_idx < len(pattern_parts)
dir_only = bool(stack)
paths = _select_recursive(paths, dir_only, follow_symlinks)
if deduplicate_paths:
# De-duplicate if we've already seen a '**' component.
Expand All @@ -786,18 +775,25 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
elif '**' in part:
raise ValueError("Invalid pattern: '**' can only be an entire path component")
else:
dir_only = part_idx < len(pattern_parts)
dir_only = bool(stack)
match = _compile_pattern(part, sep, case_sensitive)
paths = _select_children(paths, dir_only, follow_symlinks, match)
if filter_paths:
# Filter out paths that don't match pattern.
prefix_len = len(str(self._make_child_relpath('_'))) - 1
match = _compile_pattern(str(pattern), sep, case_sensitive)
paths = (path for path in paths if match(str(path), prefix_len))
return paths

def rglob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
"""Recursively yield all existing files (of any kind, including
directories) matching the given relative pattern, anywhere in
this subtree.
"""
return self.glob(
f'**/{pattern}', case_sensitive=case_sensitive, follow_symlinks=follow_symlinks)
if not isinstance(pattern, PurePathBase):
pattern = self.with_segments(pattern)
pattern = '**' / pattern
return self.glob(pattern, case_sensitive=case_sensitive, follow_symlinks=follow_symlinks)

def walk(self, top_down=True, on_error=None, follow_symlinks=False):
"""Walk the directory tree from this directory, similar to os.walk()."""
Expand Down
23 changes: 23 additions & 0 deletions Lib/test/test_pathlib/test_pathlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -1818,6 +1818,13 @@ def test_walk_above_recursion_limit(self):
list(base.walk())
list(base.walk(top_down=False))

def test_glob_empty_pattern(self):
# Both the empty pattern and '.' must be rejected with a ValueError whose
# message matches 'Unacceptable pattern'.
p = self.cls('')
with self.assertRaisesRegex(ValueError, 'Unacceptable pattern'):
# list() forces the glob to be consumed, in case the error is raised lazily.
list(p.glob(''))
with self.assertRaisesRegex(ValueError, 'Unacceptable pattern'):
list(p.glob('.'))

def test_glob_many_open_files(self):
depth = 30
P = self.cls
Expand Down Expand Up @@ -1860,6 +1867,22 @@ def test_glob_recursive_no_trailing_slash(self):
with self.assertWarns(FutureWarning):
p.rglob('*/**')

def test_glob_pathlike(self):
# glob() must accept a non-string pattern: the same pattern is passed once
# as an instance of the class under test and once wrapped in FakePath, and
# both calls must produce the same set of matches.
# NOTE(review): FakePath and the dirB/dirC fixture tree are defined outside
# this excerpt; FakePath is presumably a minimal path-like helper.
P = self.cls
p = P(self.base)
pattern = "dir*/file*"
expect = {p / "dirB/fileB", p / "dirC/fileC"}
self.assertEqual(expect, set(p.glob(P(pattern))))
self.assertEqual(expect, set(p.glob(FakePath(pattern))))

def test_rglob_pathlike(self):
# rglob() counterpart of the path-like pattern test: the pattern is supplied
# as a path object and as a FakePath, starting from the dirC subdirectory.
# NOTE(review): FakePath and the dirC/dirD fixture tree are defined outside
# this excerpt; FakePath is presumably a minimal path-like helper.
P = self.cls
p = P(self.base, "dirC")
pattern = "**/file*"
expect = {p / "fileC", p / "dirD/fileD"}
self.assertEqual(expect, set(p.rglob(P(pattern))))
self.assertEqual(expect, set(p.rglob(FakePath(pattern))))


@only_posix
class PosixPathTest(PathTest, PurePosixPathTest):
Expand Down
9 changes: 6 additions & 3 deletions Lib/test/test_pathlib/test_pathlib_abc.py
Original file line number Diff line number Diff line change
Expand Up @@ -1045,9 +1045,12 @@ def _check(glob, expected):
_check(p.glob("*/"), ["dirA/", "dirB/", "dirC/", "dirE/", "linkB/"])

def test_glob_empty_pattern(self):
p = self.cls('')
with self.assertRaisesRegex(ValueError, 'Unacceptable pattern'):
list(p.glob(''))
def _check(glob, expected):
self.assertEqual(set(glob), { P(self.base, q) for q in expected })
P = self.cls
p = P(self.base)
_check(p.glob(""), [""])
_check(p.glob("."), ["."])

def test_glob_case_sensitive(self):
P = self.cls
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Accept :term:`path-like objects <path-like object>` as patterns in
:meth:`pathlib.Path.glob` and :meth:`~pathlib.Path.rglob`.