Skip to content

Commit 0eb52f5

Browse files
authored
GH-115060: Speed up pathlib.Path.glob() by not scanning literal parts (#117732)
Don't bother calling `os.scandir()` to scan for literal pattern segments, like `foo` in `foo/*.py`. Instead, append the segment(s) as-is and call through to the next selector with `exists=False`, which signals that the path might not exist. Subsequent selectors will call `os.scandir()` or `os.lstat()` to filter out missing paths as needed.
1 parent 069de14 commit 0eb52f5

File tree

4 files changed

+42
-11
lines changed

4 files changed

+42
-11
lines changed

Lib/glob.py

+21-1
Original file line number | Diff line number | Diff line change
@@ -331,9 +331,10 @@ class _Globber:
331331
"""Class providing shell-style pattern matching and globbing.
332332
"""
333333

334-
def __init__(self, sep, case_sensitive, recursive=False):
334+
def __init__(self, sep, case_sensitive, case_pedantic=False, recursive=False):
335335
self.sep = sep
336336
self.case_sensitive = case_sensitive
337+
self.case_pedantic = case_pedantic
337338
self.recursive = recursive
338339

339340
# Low-level methods
@@ -373,6 +374,8 @@ def selector(self, parts):
373374
selector = self.recursive_selector
374375
elif part in _special_parts:
375376
selector = self.special_selector
377+
elif not self.case_pedantic and magic_check.search(part) is None:
378+
selector = self.literal_selector
376379
else:
377380
selector = self.wildcard_selector
378381
return selector(part, parts)
@@ -387,6 +390,23 @@ def select_special(path, exists=False):
387390
return select_next(path, exists)
388391
return select_special
389392

393+
def literal_selector(self, part, parts):
394+
"""Returns a function that selects a literal descendant of a path.
395+
"""
396+
397+
# Optimization: consume and join any subsequent literal parts here,
398+
# rather than leaving them for the next selector. This reduces the
399+
# number of string concatenation operations and calls to add_slash().
400+
while parts and magic_check.search(parts[-1]) is None:
401+
part += self.sep + parts.pop()
402+
403+
select_next = self.selector(parts)
404+
405+
def select_literal(path, exists=False):
406+
path = self.concat_path(self.add_slash(path), part)
407+
return select_next(path, exists=False)
408+
return select_literal
409+
390410
def wildcard_selector(self, part, parts):
391411
"""Returns a function that selects direct children of a given path,
392412
filtering by pattern.

Lib/pathlib/_abc.py

+7-1
Original file line number | Diff line number | Diff line change
@@ -686,8 +686,14 @@ def iterdir(self):
686686
def _glob_selector(self, parts, case_sensitive, recurse_symlinks):
687687
if case_sensitive is None:
688688
case_sensitive = _is_case_sensitive(self.parser)
689+
case_pedantic = False
690+
else:
691+
# The user has expressed a case sensitivity choice, but we don't
692+
# know the case sensitivity of the underlying filesystem, so we
693+
# must use scandir() for everything, including non-wildcard parts.
694+
case_pedantic = True
689695
recursive = True if recurse_symlinks else glob._no_recurse_symlinks
690-
globber = self._globber(self.parser.sep, case_sensitive, recursive)
696+
globber = self._globber(self.parser.sep, case_sensitive, case_pedantic, recursive)
691697
return globber.selector(parts)
692698

693699
def glob(self, pattern, *, case_sensitive=None, recurse_symlinks=True):

Lib/test/test_pathlib/test_pathlib_abc.py

+12-9
Original file line number | Diff line number | Diff line change
@@ -1429,10 +1429,10 @@ def __repr__(self):
14291429
return "{}({!r})".format(self.__class__.__name__, self.as_posix())
14301430

14311431
def stat(self, *, follow_symlinks=True):
1432-
if follow_symlinks:
1433-
path = str(self.resolve())
1432+
if follow_symlinks or self.name in ('', '.', '..'):
1433+
path = str(self.resolve(strict=True))
14341434
else:
1435-
path = str(self.parent.resolve() / self.name)
1435+
path = str(self.parent.resolve(strict=True) / self.name)
14361436
if path in self._files:
14371437
st_mode = stat.S_IFREG
14381438
elif path in self._directories:
@@ -1741,8 +1741,9 @@ def _check(glob, expected):
17411741
def test_glob_posix(self):
17421742
P = self.cls
17431743
p = P(self.base)
1744+
q = p / "FILEa"
17441745
given = set(p.glob("FILEa"))
1745-
expect = set()
1746+
expect = {q} if q.exists() else set()
17461747
self.assertEqual(given, expect)
17471748
self.assertEqual(set(p.glob("FILEa*")), set())
17481749

@@ -1753,8 +1754,6 @@ def test_glob_windows(self):
17531754
self.assertEqual(set(p.glob("FILEa")), { P(self.base, "fileA") })
17541755
self.assertEqual(set(p.glob("*a\\")), { P(self.base, "dirA/") })
17551756
self.assertEqual(set(p.glob("F*a")), { P(self.base, "fileA") })
1756-
self.assertEqual(set(map(str, p.glob("FILEa"))), {f"{p}\\fileA"})
1757-
self.assertEqual(set(map(str, p.glob("F*a"))), {f"{p}\\fileA"})
17581757

17591758
def test_glob_empty_pattern(self):
17601759
P = self.cls
@@ -1857,8 +1856,9 @@ def _check(path, glob, expected):
18571856
def test_rglob_posix(self):
18581857
P = self.cls
18591858
p = P(self.base, "dirC")
1859+
q = p / "dirD" / "FILEd"
18601860
given = set(p.rglob("FILEd"))
1861-
expect = set()
1861+
expect = {q} if q.exists() else set()
18621862
self.assertEqual(given, expect)
18631863
self.assertEqual(set(p.rglob("FILEd*")), set())
18641864

@@ -1868,7 +1868,6 @@ def test_rglob_windows(self):
18681868
p = P(self.base, "dirC")
18691869
self.assertEqual(set(p.rglob("FILEd")), { P(self.base, "dirC/dirD/fileD") })
18701870
self.assertEqual(set(p.rglob("*\\")), { P(self.base, "dirC/dirD/") })
1871-
self.assertEqual(set(map(str, p.rglob("FILEd"))), {f"{p}\\dirD\\fileD"})
18721871

18731872
@needs_symlinks
18741873
def test_rglob_recurse_symlinks_common(self):
@@ -1931,7 +1930,11 @@ def test_glob_dotdot(self):
19311930
self.assertEqual(set(p.glob("dirA/../file*")), { P(self.base, "dirA/../fileA") })
19321931
self.assertEqual(set(p.glob("dirA/../file*/..")), set())
19331932
self.assertEqual(set(p.glob("../xyzzy")), set())
1934-
self.assertEqual(set(p.glob("xyzzy/..")), set())
1933+
if self.cls.parser is posixpath:
1934+
self.assertEqual(set(p.glob("xyzzy/..")), set())
1935+
else:
1936+
# ".." segments are normalized first on Windows, so this path is stat()able.
1937+
self.assertEqual(set(p.glob("xyzzy/..")), { P(self.base, "xyzzy", "..") })
19351938
self.assertEqual(set(p.glob("/".join([".."] * 50))), { P(self.base, *[".."] * 50)})
19361939

19371940
@needs_symlinks
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,2 @@
1+
Speed up :meth:`pathlib.Path.glob` by not scanning directories for
2+
non-wildcard pattern segments.

0 commit comments

Comments
 (0)