From 74aed7628efb5c1b25840d03d730e6f9c42096fe Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 6 Aug 2025 17:18:29 +0100 Subject: [PATCH 1/7] GH-116380: Speed up `glob.[i]glob()` by making fewer system calls (take 2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Filtered recursive walk Expanding a recursive `**` segment entails walking the entire directory tree, and so any subsequent pattern segments (except special segments) can be evaluated by filtering the expanded paths through a regex. For example, `glob.glob("foo/**/*.py", recursive=True)` recursively walks `foo/` with `os.scandir()`, and then filters paths through a regex based on "`**/*.py`, with no further filesystem access needed. This fixes an issue where `glob()` could return duplicate results. ## Tracking path existence We store a flag alongside each path indicating whether the path is guaranteed to exist. As we process the pattern: - Certain special pattern segments (`""`, `"."` and `".."`) leave the flag unchanged - Literal pattern segments (e.g. `foo/bar`) set the flag to false - Wildcard pattern segments (e.g. `*/*.py`) set the flag to true (because children are found via `os.scandir()`) - Recursive pattern segments (e.g. `**`) leave the flag unchanged for the root path, and set it to true for descendants discovered via `os.scandir()`. If the flag is false at the end, we call `lstat()` on each path to filter out missing paths. ## Minor speed-ups - Exclude paths that don't match a non-terminal non-recursive wildcard pattern _prior_ to calling `is_dir()`. - Use a stack rather than recursion to implement recursive wildcards. - This fixes a recursion error when globbing deep trees. - Pre-compile regular expressions and pre-join literal pattern segments. - Convert to/from `bytes` (a minor use-case) in `iglob()` rather than supporting `bytes` throughout. This particularly simplifies the code needed to handle relative bytes paths with `dir_fd`. - Avoid calling `os.path.join()`; instead we keep paths in a normalized form and append trailing slashes when needed. - Avoid calling `os.path.normcase()`; instead we use case-insensitive regex matching. ## Implementation notes Much of this functionality is already present in pathlib's implementation of globbing. The specific additions we make are: 1. Support for `dir_fd` 2. Support for `include_hidden` 3. Support for generating paths relative to `root_dir` This unifies the implementations of globbing in the `glob` and `pathlib` modules. Co-authored-by: Pieter Eendebak Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com> --- Doc/library/glob.rst | 18 +- Doc/whatsnew/3.15.rst | 8 + Lib/glob.py | 363 ++++++++---------- Lib/pathlib/__init__.py | 6 +- Lib/pathlib/types.py | 6 +- Lib/test/test_glob.py | 33 +- ...-03-05-23-08-11.gh-issue-116380.56HU7I.rst | 2 + 7 files changed, 210 insertions(+), 226 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2024-03-05-23-08-11.gh-issue-116380.56HU7I.rst diff --git a/Doc/library/glob.rst b/Doc/library/glob.rst index 59ad1b07f27338..eaf51f14cb3660 100644 --- a/Doc/library/glob.rst +++ b/Doc/library/glob.rst @@ -75,10 +75,6 @@ The :mod:`glob` module defines the following functions: Using the "``**``" pattern in large directory trees may consume an inordinate amount of time. - .. note:: - This function may return duplicate path names if *pathname* - contains multiple "``**``" patterns and *recursive* is true. - .. versionchanged:: 3.5 Support for recursive globs using "``**``". @@ -88,6 +84,11 @@ The :mod:`glob` module defines the following functions: .. versionchanged:: 3.11 Added the *include_hidden* parameter. + .. versionchanged:: 3.14 + Matching path names are returned only once. In previous versions, this + function may return duplicate path names if *pathname* contains multiple + "``**``" patterns and *recursive* is true. + .. function:: iglob(pathname, *, root_dir=None, dir_fd=None, recursive=False, \ include_hidden=False) @@ -98,10 +99,6 @@ The :mod:`glob` module defines the following functions: .. audit-event:: glob.glob pathname,recursive glob.iglob .. audit-event:: glob.glob/2 pathname,recursive,root_dir,dir_fd glob.iglob - .. note:: - This function may return duplicate path names if *pathname* - contains multiple "``**``" patterns and *recursive* is true. - .. versionchanged:: 3.5 Support for recursive globs using "``**``". @@ -111,6 +108,11 @@ The :mod:`glob` module defines the following functions: .. versionchanged:: 3.11 Added the *include_hidden* parameter. + .. versionchanged:: 3.14 + Matching path names are yielded only once. In previous versions, this + function may yield duplicate path names if *pathname* contains multiple + "``**``" patterns and *recursive* is true. + .. function:: escape(pathname) diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst index 89644a509a0bb4..5ad5021e89fcdc 100644 --- a/Doc/whatsnew/3.15.rst +++ b/Doc/whatsnew/3.15.rst @@ -234,6 +234,14 @@ difflib (Contributed by Jiahao Li in :gh:`134580`.) +glob +---- + +* Reduce the number of system calls in :func:`glob.glob` and :func:`~glob.iglob`, + thereby improving the speed of globbing operations by 20-80%. + (Contributed by Barney Gale in :gh:`116380`.) + + hashlib ------- diff --git a/Lib/glob.py b/Lib/glob.py index 5d42077003a240..9d6e26d96b6b1a 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -1,13 +1,10 @@ """Filename globbing utility.""" -import contextlib import os import re import fnmatch import functools -import itertools import operator -import stat import sys @@ -45,172 +42,35 @@ def iglob(pathname, *, root_dir=None, dir_fd=None, recursive=False, """ sys.audit("glob.glob", pathname, recursive) sys.audit("glob.glob/2", pathname, recursive, root_dir, dir_fd) - if root_dir is not None: - root_dir = os.fspath(root_dir) - else: - root_dir = pathname[:0] - it = _iglob(pathname, root_dir, dir_fd, recursive, False, - include_hidden=include_hidden) - if not pathname or recursive and _isrecursive(pathname[:2]): - try: - s = next(it) # skip empty string - if s: - it = itertools.chain((s,), it) - except StopIteration: - pass - return it - -def _iglob(pathname, root_dir, dir_fd, recursive, dironly, - include_hidden=False): - dirname, basename = os.path.split(pathname) - if not has_magic(pathname): - assert not dironly - if basename: - if _lexists(_join(root_dir, pathname), dir_fd): - yield pathname - else: - # Patterns ending with a slash should match only directories - if _isdir(_join(root_dir, dirname), dir_fd): - yield pathname - return - if not dirname: - if recursive and _isrecursive(basename): - yield from _glob2(root_dir, basename, dir_fd, dironly, - include_hidden=include_hidden) - else: - yield from _glob1(root_dir, basename, dir_fd, dironly, - include_hidden=include_hidden) - return - # `os.path.split()` returns the argument itself as a dirname if it is a - # drive or UNC path. Prevent an infinite recursion if a drive or UNC path - # contains magic characters (i.e. r'\\?\C:'). - if dirname != pathname and has_magic(dirname): - dirs = _iglob(dirname, root_dir, dir_fd, recursive, True, - include_hidden=include_hidden) - else: - dirs = [dirname] - if has_magic(basename): - if recursive and _isrecursive(basename): - glob_in_dir = _glob2 - else: - glob_in_dir = _glob1 - else: - glob_in_dir = _glob0 - for dirname in dirs: - for name in glob_in_dir(_join(root_dir, dirname), basename, dir_fd, dironly, - include_hidden=include_hidden): - yield os.path.join(dirname, name) - -# These 2 helper functions non-recursively glob inside a literal directory. -# They return a list of basenames. _glob1 accepts a pattern while _glob0 -# takes a literal basename (so it only has to check for its existence). - -def _glob1(dirname, pattern, dir_fd, dironly, include_hidden=False): - names = _listdir(dirname, dir_fd, dironly) - if not (include_hidden or _ishidden(pattern)): - names = (x for x in names if not _ishidden(x)) - return fnmatch.filter(names, pattern) - -def _glob0(dirname, basename, dir_fd, dironly, include_hidden=False): - if basename: - if _lexists(_join(dirname, basename), dir_fd): - return [basename] - else: - # `os.path.split()` returns an empty basename for paths ending with a - # directory separator. 'q*x/' should match only directories. - if _isdir(dirname, dir_fd): - return [basename] - return [] - -# This helper function recursively yields relative pathnames inside a literal -# directory. - -def _glob2(dirname, pattern, dir_fd, dironly, include_hidden=False): - assert _isrecursive(pattern) - if not dirname or _isdir(dirname, dir_fd): - yield pattern[:0] - yield from _rlistdir(dirname, dir_fd, dironly, - include_hidden=include_hidden) - -# If dironly is false, yields all file names inside a directory. -# If dironly is true, yields only directory names. -def _iterdir(dirname, dir_fd, dironly): - try: - fd = None - fsencode = None - if dir_fd is not None: - if dirname: - fd = arg = os.open(dirname, _dir_open_flags, dir_fd=dir_fd) - else: - arg = dir_fd - if isinstance(dirname, bytes): - fsencode = os.fsencode - elif dirname: - arg = dirname - elif isinstance(dirname, bytes): - arg = bytes(os.curdir, 'ASCII') - else: - arg = os.curdir - try: - with os.scandir(arg) as it: - for entry in it: - try: - if not dironly or entry.is_dir(): - if fsencode is not None: - yield fsencode(entry.name) - else: - yield entry.name - except OSError: - pass - finally: - if fd is not None: - os.close(fd) - except OSError: - return - -def _listdir(dirname, dir_fd, dironly): - with contextlib.closing(_iterdir(dirname, dir_fd, dironly)) as it: - return list(it) - -# Recursively yields relative pathnames inside a literal directory. -def _rlistdir(dirname, dir_fd, dironly, include_hidden=False): - names = _listdir(dirname, dir_fd, dironly) - for x in names: - if include_hidden or not _ishidden(x): - yield x - path = _join(dirname, x) if dirname else x - for y in _rlistdir(path, dir_fd, dironly, - include_hidden=include_hidden): - yield _join(x, y) - - -def _lexists(pathname, dir_fd): - # Same as os.path.lexists(), but with dir_fd - if dir_fd is None: - return os.path.lexists(pathname) - try: - os.lstat(pathname, dir_fd=dir_fd) - except (OSError, ValueError): - return False + pathname = os.fspath(pathname) + if isinstance(pathname, bytes): + pathname = os.fsdecode(pathname) + if root_dir is not None: + root_dir = os.fsdecode(root_dir) + paths = _iglob(pathname, root_dir, dir_fd, recursive, include_hidden) + return map(os.fsencode, paths) else: - return True - -def _isdir(pathname, dir_fd): - # Same as os.path.isdir(), but with dir_fd - if dir_fd is None: - return os.path.isdir(pathname) - try: - st = os.stat(pathname, dir_fd=dir_fd) - except (OSError, ValueError): - return False + return _iglob(pathname, root_dir, dir_fd, recursive, include_hidden) + +def _iglob(pathname, root_dir, dir_fd, recursive, include_hidden): + if os.path.altsep: + pathname = pathname.replace(os.path.altsep, os.path.sep) + drive, root, tail = os.path.splitroot(pathname) + parts = tail.split(os.path.sep)[::-1] if tail else [] + globber = _StringGlobber(recursive=recursive, include_hidden=include_hidden) + select = globber.selector(parts) + if drive: + root = drive + root + return select(root, dir_fd, root) + elif root: + return select(root, dir_fd, root, exists=True) + elif not root_dir: + return select(root, dir_fd, root, empty=True) else: - return stat.S_ISDIR(st.st_mode) - -def _join(dirname, basename): - # It is common if dirname or basename is empty - if not dirname or not basename: - return dirname or basename - return os.path.join(dirname, basename) + root = os.path.join(root_dir, '') + root_len = len(root) + paths = select(root, dir_fd, root, empty=True) + return (path[root_len:] for path in paths) magic_check = re.compile('([*?[])') magic_check_bytes = re.compile(b'([*?[])') @@ -222,15 +82,6 @@ def has_magic(s): match = magic_check.search(s) return match is not None -def _ishidden(path): - return path[0] in ('.', b'.'[0]) - -def _isrecursive(pattern): - if isinstance(pattern, bytes): - return pattern == b'**' - else: - return pattern == '**' - def escape(pathname): """Escape all special characters. """ @@ -304,12 +155,13 @@ def translate(pat, *, recursive=False, include_hidden=False, seps=None): return fr'(?s:{res})\z' -@functools.lru_cache(maxsize=512) -def _compile_pattern(pat, seps, case_sensitive, recursive=True): +@functools.lru_cache(maxsize=1024) +def _compile_pattern(pat, seps, case_sensitive, recursive, include_hidden): """Compile given glob pattern to a re.Pattern object (observing case sensitivity).""" flags = re.NOFLAG if case_sensitive else re.IGNORECASE - regex = translate(pat, recursive=recursive, include_hidden=True, seps=seps) + regex = translate(pat, recursive=recursive, + include_hidden=include_hidden, seps=seps) return re.compile(regex, flags=flags).match @@ -317,11 +169,13 @@ class _GlobberBase: """Abstract class providing shell-style pattern matching and globbing. """ - def __init__(self, sep, case_sensitive, case_pedantic=False, recursive=False): + def __init__(self, sep=os.path.sep, case_sensitive=os.name != 'nt', + case_pedantic=False, recursive=False, include_hidden=False): self.sep = sep self.case_sensitive = case_sensitive self.case_pedantic = case_pedantic self.recursive = recursive + self.include_hidden = include_hidden # Abstract methods @@ -331,12 +185,38 @@ def lexists(path): """ raise NotImplementedError + @staticmethod + def lstat(path, dir_fd=None): + """Implements os.lstat() + """ + raise NotImplementedError + + @staticmethod + def open(path, flags, dir_fd=None): + """Implements os.open() + """ + raise NotImplementedError + @staticmethod def scandir(path): """Like os.scandir(), but generates (entry, name, path) tuples. """ raise NotImplementedError + @staticmethod + def scandir_cwd(): + raise NotImplementedError + + @staticmethod + def scandir_fd(fd, prefix): + raise NotImplementedError + + @staticmethod + def close(fd): + """Implements os.close(). + """ + raise NotImplementedError + @staticmethod def concat_path(path, text): """Implements path concatenation. @@ -353,7 +233,7 @@ def stringify_path(path): def compile(self, pat, altsep=None): seps = (self.sep, altsep) if altsep else self.sep - return _compile_pattern(pat, seps, self.case_sensitive, self.recursive) + return _compile_pattern(pat, seps, self.case_sensitive, self.recursive, self.include_hidden) def selector(self, parts): """Returns a function that selects from a given path, walking and @@ -378,10 +258,14 @@ def special_selector(self, part, parts): if parts: part += self.sep select_next = self.selector(parts) + if not part: + return select_next - def select_special(path, exists=False): + def select_special(path, dir_fd=None, rel_path=None, exists=False, empty=False): path = self.concat_path(path, part) - return select_next(path, exists) + if dir_fd is not None: + rel_path = self.concat_path(rel_path, part) + return select_next(path, dir_fd, rel_path, exists) return select_special def literal_selector(self, part, parts): @@ -398,9 +282,11 @@ def literal_selector(self, part, parts): select_next = self.selector(parts) - def select_literal(path, exists=False): + def select_literal(path, dir_fd=None, rel_path=None, exists=False, empty=False): path = self.concat_path(path, part) - return select_next(path, exists=False) + if dir_fd is not None: + rel_path = self.concat_path(rel_path, part) + return select_next(path, dir_fd, rel_path) return select_literal def wildcard_selector(self, part, parts): @@ -408,14 +294,24 @@ def wildcard_selector(self, part, parts): filtering by pattern. """ - match = None if part == '*' else self.compile(part) + match = None if self.include_hidden and part == '*' else self.compile(part) dir_only = bool(parts) if dir_only: select_next = self.selector(parts) - def select_wildcard(path, exists=False): + def select_wildcard(path, dir_fd=None, rel_path=None, exists=False, empty=False): + close_fd = False try: - entries = self.scandir(path) + if dir_fd is None: + fd = None + entries = self.scandir(path) if path else self.scandir_cwd() + elif not rel_path: + fd = dir_fd + entries = self.scandir_fd(fd, path) + else: + fd = self.open(rel_path, _dir_open_flags, dir_fd=dir_fd) + close_fd = True + entries = self.scandir_fd(fd, path) except OSError: pass else: @@ -428,9 +324,17 @@ def select_wildcard(path, exists=False): except OSError: continue entry_path = self.concat_path(entry_path, self.sep) - yield from select_next(entry_path, exists=True) + if fd is not None: + entry_name = entry_name + self.sep + yield from select_next( + entry_path, fd, entry_name, exists=True) else: + # Optimization: directly yield the path if this is + # last pattern part. yield entry_path + finally: + if close_fd: + self.close(fd) return select_wildcard def recursive_selector(self, part, parts): @@ -452,27 +356,50 @@ def recursive_selector(self, part, parts): while parts and parts[-1] not in _special_parts: part += self.sep + parts.pop() - match = None if part == '**' else self.compile(part) + match = None if self.include_hidden and part == '**' else self.compile(part) dir_only = bool(parts) select_next = self.selector(parts) - def select_recursive(path, exists=False): + def select_recursive(path, dir_fd=None, rel_path=None, exists=False, empty=False): path_str = self.stringify_path(path) match_pos = len(path_str) if match is None or match(path_str, match_pos): - yield from select_next(path, exists) - stack = [path] - while stack: - yield from select_recursive_step(stack, match_pos) + yield from select_next(path, dir_fd, rel_path, exists, empty) + stack = [(path, dir_fd, rel_path)] + try: + while stack: + yield from select_recursive_step(stack, match_pos) + finally: + # Close any file descriptors still on the stack. + while stack: + path, dir_fd, _rel_path = stack.pop() + if path is None: + try: + self.close(dir_fd) + except OSError: + pass def select_recursive_step(stack, match_pos): - path = stack.pop() + path, dir_fd, rel_path = stack.pop() try: - entries = self.scandir(path) + if path is None: + self.close(dir_fd) + return + elif dir_fd is None: + fd = None + entries = self.scandir(path) if path else self.scandir_cwd() + elif not rel_path: + fd = dir_fd + entries = self.scandir_fd(fd, path) + else: + fd = self.open(rel_path, _dir_open_flags, dir_fd=dir_fd) + # Schedule the file descriptor to be closed next step. + stack.append((None, fd, None)) + entries = self.scandir_fd(fd, path) except OSError: pass else: - for entry, _entry_name, entry_path in entries: + for entry, entry_name, entry_path in entries: is_dir = False try: if entry.is_dir(follow_symlinks=follow_symlinks): @@ -484,25 +411,38 @@ def select_recursive_step(stack, match_pos): entry_path_str = self.stringify_path(entry_path) if dir_only: entry_path = self.concat_path(entry_path, self.sep) + if fd is not None: + entry_name = entry_name + self.sep if match is None or match(entry_path_str, match_pos): if dir_only: - yield from select_next(entry_path, exists=True) + yield from select_next( + entry_path, fd, entry_name, exists=True) else: # Optimization: directly yield the path if this is # last pattern part. yield entry_path if is_dir: - stack.append(entry_path) + stack.append((entry_path, fd, entry_name)) return select_recursive - def select_exists(self, path, exists=False): - """Yields the given path, if it exists. + def select_exists(self, path, dir_fd=None, rel_path=None, exists=False, empty=False): + """Yields the given path, if it exists. If *dir_fd* is given, we check + whether *rel_path* exists relative to the fd. """ - if exists: + if empty: + # Suppress initial path so iglob() doesn't yield the empty string. + pass + elif exists: # Optimization: this path is already known to exist, e.g. because # it was returned from os.scandir(), so we skip calling lstat(). yield path + elif dir_fd is not None: + try: + self.lstat(rel_path, dir_fd=dir_fd) + yield path + except OSError: + pass elif self.lexists(path): yield path @@ -511,6 +451,9 @@ class _StringGlobber(_GlobberBase): """Provides shell-style pattern matching and globbing for string paths. """ lexists = staticmethod(os.path.lexists) + lstat = staticmethod(os.lstat) + open = staticmethod(os.open) + close = staticmethod(os.close) concat_path = operator.add @staticmethod @@ -521,6 +464,20 @@ def scandir(path): entries = list(scandir_it) return ((entry, entry.name, entry.path) for entry in entries) + @staticmethod + def scandir_cwd(): + with os.scandir() as scandir_it: + entries = list(scandir_it) + # Suppress leading dot when scanning current directory. + return ((entry, entry.name, entry.name) for entry in entries) + + @staticmethod + def scandir_fd(fd, prefix): + prefix = os.path.join(prefix, prefix[:0]) + with os.scandir(fd) as scandir_it: + entries = list(scandir_it) + return ((entry, entry.name, prefix + entry.name) for entry in entries) + @staticmethod def stringify_path(path): return path # Already a string. diff --git a/Lib/pathlib/__init__.py b/Lib/pathlib/__init__.py index cea1a9fe57eedf..52b5504bd99aed 100644 --- a/Lib/pathlib/__init__.py +++ b/Lib/pathlib/__init__.py @@ -558,7 +558,8 @@ def full_match(self, pattern, *, case_sensitive=None): # paths shouldn't match wildcards, so we change it to the empty string. path = str(self) if self.parts else '' pattern = str(pattern) if pattern.parts else '' - globber = _StringGlobber(self.parser.sep, case_sensitive, recursive=True) + globber = _StringGlobber(self.parser.sep, case_sensitive, + recursive=True, include_hidden=True) return globber.compile(pattern)(path) is not None def match(self, path_pattern, *, case_sensitive=None): @@ -849,7 +850,8 @@ def glob(self, pattern, *, case_sensitive=None, recurse_symlinks=False): case_pedantic = True parts = self._parse_pattern(pattern) recursive = True if recurse_symlinks else _no_recurse_symlinks - globber = _StringGlobber(self.parser.sep, case_sensitive, case_pedantic, recursive) + globber = _StringGlobber(self.parser.sep, case_sensitive, case_pedantic, + recursive, include_hidden=True) select = globber.selector(parts[::-1]) root = str(self) paths = select(self.parser.join(root, '')) diff --git a/Lib/pathlib/types.py b/Lib/pathlib/types.py index 42b80221608bcc..ff7d86786fcb49 100644 --- a/Lib/pathlib/types.py +++ b/Lib/pathlib/types.py @@ -240,7 +240,8 @@ def full_match(self, pattern): pattern is matched against the entire path. """ case_sensitive = self.parser.normcase('Aa') == 'Aa' - globber = _PathGlobber(self.parser.sep, case_sensitive, recursive=True) + globber = _PathGlobber(self.parser.sep, case_sensitive, + recursive=True, include_hidden=True) match = globber.compile(pattern, altsep=self.parser.altsep) return match(vfspath(self)) is not None @@ -309,7 +310,8 @@ def glob(self, pattern, *, recurse_symlinks=True): elif not recurse_symlinks: raise NotImplementedError("recurse_symlinks=False is unsupported") case_sensitive = self.parser.normcase('Aa') == 'Aa' - globber = _PathGlobber(self.parser.sep, case_sensitive, recursive=True) + globber = _PathGlobber(self.parser.sep, case_sensitive, + recursive=True, include_hidden=True) select = globber.selector(parts) return select(self.joinpath('')) diff --git a/Lib/test/test_glob.py b/Lib/test/test_glob.py index 9e4e82b2542c15..a7743489efa63f 100644 --- a/Lib/test/test_glob.py +++ b/Lib/test/test_glob.py @@ -5,7 +5,7 @@ import sys import unittest -from test.support import is_wasi, Py_DEBUG +from test.support import is_wasi, Py_DEBUG, infinite_recursion from test.support.os_helper import (TESTFN, skip_unless_symlink, can_symlink, create_empty_file, change_cwd) @@ -175,20 +175,18 @@ def test_glob_directory_with_trailing_slash(self): self.assertEqual(glob.glob(self.norm('Z*Z') + sep), []) self.assertEqual(glob.glob(self.norm('ZZZ') + sep), []) self.assertEqual(glob.glob(self.norm('aaa') + sep), - [self.norm('aaa') + sep]) - # Preserving the redundant separators is an implementation detail. + [self.norm('aaa') + os.sep]) + # Redundant separators are preserved and normalized self.assertEqual(glob.glob(self.norm('aaa') + sep*2), - [self.norm('aaa') + sep*2]) + [self.norm('aaa') + os.sep*2]) # When there is a wildcard pattern which ends with a pathname # separator, glob() doesn't blow. # The result should end with the pathname separator. - # Normalizing the trailing separator is an implementation detail. eq = self.assertSequencesEqual_noorder eq(glob.glob(self.norm('aa*') + sep), [self.norm('aaa') + os.sep, self.norm('aab') + os.sep]) - # Stripping the redundant separators is an implementation detail. eq(glob.glob(self.norm('aa*') + sep*2), - [self.norm('aaa') + os.sep, self.norm('aab') + os.sep]) + [self.norm('aaa') + os.sep*2, self.norm('aab') + os.sep*2]) def test_glob_bytes_directory_with_trailing_slash(self): # Same as test_glob_directory_with_trailing_slash, but with a @@ -198,16 +196,16 @@ def test_glob_bytes_directory_with_trailing_slash(self): self.assertEqual(glob.glob(os.fsencode(self.norm('Z*Z') + sep)), []) self.assertEqual(glob.glob(os.fsencode(self.norm('ZZZ') + sep)), []) self.assertEqual(glob.glob(os.fsencode(self.norm('aaa') + sep)), - [os.fsencode(self.norm('aaa') + sep)]) + [os.fsencode(self.norm('aaa') + os.sep)]) self.assertEqual(glob.glob(os.fsencode(self.norm('aaa') + sep*2)), - [os.fsencode(self.norm('aaa') + sep*2)]) + [os.fsencode(self.norm('aaa') + os.sep*2)]) eq = self.assertSequencesEqual_noorder eq(glob.glob(os.fsencode(self.norm('aa*') + sep)), [os.fsencode(self.norm('aaa') + os.sep), os.fsencode(self.norm('aab') + os.sep)]) eq(glob.glob(os.fsencode(self.norm('aa*') + sep*2)), - [os.fsencode(self.norm('aaa') + os.sep), - os.fsencode(self.norm('aab') + os.sep)]) + [os.fsencode(self.norm('aaa') + os.sep*2), + os.fsencode(self.norm('aab') + os.sep*2)]) @skip_unless_symlink def test_glob_symlinks(self): @@ -324,8 +322,12 @@ def test_recursive_glob(self): with change_cwd(self.tempdir): join = os.path.join eq(glob.glob('**', recursive=True), [join(*i) for i in full]) + eq(glob.glob(join('**', '**'), recursive=True), + [join(*i) for i in full]) eq(glob.glob(join('**', ''), recursive=True), [join(*i) for i in dirs]) + eq(glob.glob(join('**', '**', ''), recursive=True), + [join(*i) for i in dirs]) eq(glob.glob(join('**', '*'), recursive=True), [join(*i) for i in full]) eq(glob.glob(join(os.curdir, '**'), recursive=True), @@ -392,6 +394,15 @@ def test_glob_many_open_files(self): for it in iters: self.assertEqual(next(it), p) + def test_glob_above_recursion_limit(self): + depth = 30 + base = os.path.join(self.tempdir, 'deep') + p = os.path.join(base, *(['d']*depth)) + os.makedirs(p) + pattern = os.path.join(base, '**', 'd') + with infinite_recursion(depth - 5): + glob.glob(pattern, recursive=True) + def test_translate_matching(self): match = re.compile(glob.translate('*')).match self.assertIsNotNone(match('foo')) diff --git a/Misc/NEWS.d/next/Library/2024-03-05-23-08-11.gh-issue-116380.56HU7I.rst b/Misc/NEWS.d/next/Library/2024-03-05-23-08-11.gh-issue-116380.56HU7I.rst new file mode 100644 index 00000000000000..b7f27ab7191a96 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-03-05-23-08-11.gh-issue-116380.56HU7I.rst @@ -0,0 +1,2 @@ +Speed up :func:`glob.glob` and :func:`glob.iglob` by making use of +:func:`glob.translate` and tracking path existence more precisely. From 7cc555f5416aeef4e28fa701a38e89e0d65c1d6c Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 6 Aug 2025 17:39:18 +0100 Subject: [PATCH 2/7] Fix version number --- Doc/library/glob.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Doc/library/glob.rst b/Doc/library/glob.rst index eaf51f14cb3660..df86ef545cb625 100644 --- a/Doc/library/glob.rst +++ b/Doc/library/glob.rst @@ -84,7 +84,7 @@ The :mod:`glob` module defines the following functions: .. versionchanged:: 3.11 Added the *include_hidden* parameter. - .. versionchanged:: 3.14 + .. versionchanged:: 3.15 Matching path names are returned only once. In previous versions, this function may return duplicate path names if *pathname* contains multiple "``**``" patterns and *recursive* is true. @@ -108,7 +108,7 @@ The :mod:`glob` module defines the following functions: .. versionchanged:: 3.11 Added the *include_hidden* parameter. - .. versionchanged:: 3.14 + .. versionchanged:: 3.15 Matching path names are yielded only once. In previous versions, this function may yield duplicate path names if *pathname* contains multiple "``**``" patterns and *recursive* is true. From 51515ce19008bb94d3370500596b5b23f0c1fa94 Mon Sep 17 00:00:00 2001 From: Barney Gale Date: Wed, 6 Aug 2025 18:58:03 +0100 Subject: [PATCH 3/7] Apply suggestions from code review Co-authored-by: Adam Turner <9087854+AA-Turner@users.noreply.github.com> --- Doc/library/glob.rst | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Doc/library/glob.rst b/Doc/library/glob.rst index df86ef545cb625..b5dbfaa6671006 100644 --- a/Doc/library/glob.rst +++ b/Doc/library/glob.rst @@ -84,10 +84,10 @@ The :mod:`glob` module defines the following functions: .. versionchanged:: 3.11 Added the *include_hidden* parameter. - .. versionchanged:: 3.15 - Matching path names are returned only once. In previous versions, this - function may return duplicate path names if *pathname* contains multiple - "``**``" patterns and *recursive* is true. + .. versionchanged:: next + Matching path names are only returned once. In previous versions, this + function could have returned duplicate path names if *pathname* + contained multiple "``**``" patterns and *recursive* was true. .. function:: iglob(pathname, *, root_dir=None, dir_fd=None, recursive=False, \ @@ -109,9 +109,9 @@ The :mod:`glob` module defines the following functions: Added the *include_hidden* parameter. .. versionchanged:: 3.15 - Matching path names are yielded only once. In previous versions, this - function may yield duplicate path names if *pathname* contains multiple - "``**``" patterns and *recursive* is true. + Matching path names are only yielded once. In previous versions, this + function could have returned duplicate path names if *pathname* + contained multiple "``**``" patterns and *recursive* was true. .. function:: escape(pathname) From 9843b6ae21120f92229fe2dfe658ea2fffd83727 Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 6 Aug 2025 18:58:55 +0100 Subject: [PATCH 4/7] Address review feedback --- Doc/library/glob.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Doc/library/glob.rst b/Doc/library/glob.rst index b5dbfaa6671006..b6ed6e79ac19b7 100644 --- a/Doc/library/glob.rst +++ b/Doc/library/glob.rst @@ -87,7 +87,7 @@ The :mod:`glob` module defines the following functions: .. versionchanged:: next Matching path names are only returned once. In previous versions, this function could have returned duplicate path names if *pathname* - contained multiple "``**``" patterns and *recursive* was true. + contained multiple "``**``" patterns and *recursive* was true. .. function:: iglob(pathname, *, root_dir=None, dir_fd=None, recursive=False, \ @@ -108,10 +108,10 @@ The :mod:`glob` module defines the following functions: .. versionchanged:: 3.11 Added the *include_hidden* parameter. - .. versionchanged:: 3.15 + .. versionchanged:: next Matching path names are only yielded once. In previous versions, this function could have returned duplicate path names if *pathname* - contained multiple "``**``" patterns and *recursive* was true. + contained multiple "``**``" patterns and *recursive* was true. .. function:: escape(pathname) From a116d94cf26998a9853a4e4c1b4272ed20e895ff Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 6 Aug 2025 19:02:51 +0100 Subject: [PATCH 5/7] Errant tabs --- Doc/library/glob.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Doc/library/glob.rst b/Doc/library/glob.rst index b6ed6e79ac19b7..35d332d4a91ff0 100644 --- a/Doc/library/glob.rst +++ b/Doc/library/glob.rst @@ -87,7 +87,7 @@ The :mod:`glob` module defines the following functions: .. versionchanged:: next Matching path names are only returned once. In previous versions, this function could have returned duplicate path names if *pathname* - contained multiple "``**``" patterns and *recursive* was true. + contained multiple "``**``" patterns and *recursive* was true. .. function:: iglob(pathname, *, root_dir=None, dir_fd=None, recursive=False, \ @@ -111,7 +111,7 @@ The :mod:`glob` module defines the following functions: .. versionchanged:: next Matching path names are only yielded once. In previous versions, this function could have returned duplicate path names if *pathname* - contained multiple "``**``" patterns and *recursive* was true. + contained multiple "``**``" patterns and *recursive* was true. .. function:: escape(pathname) From e2bb3cb7fa099532b60950f10310da643921d0f9 Mon Sep 17 00:00:00 2001 From: barneygale Date: Thu, 7 Aug 2025 21:30:43 +0100 Subject: [PATCH 6/7] Make Globber arguments keyword-only --- Lib/glob.py | 2 +- Lib/pathlib/__init__.py | 16 +++++++++++----- Lib/pathlib/types.py | 12 ++++++++---- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index 9d6e26d96b6b1a..eb433b878a979e 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -169,7 +169,7 @@ class _GlobberBase: """Abstract class providing shell-style pattern matching and globbing. """ - def __init__(self, sep=os.path.sep, case_sensitive=os.name != 'nt', + def __init__(self, *, sep=os.path.sep, case_sensitive=os.name != 'nt', case_pedantic=False, recursive=False, include_hidden=False): self.sep = sep self.case_sensitive = case_sensitive diff --git a/Lib/pathlib/__init__.py b/Lib/pathlib/__init__.py index 52b5504bd99aed..dfad0102fbdda6 100644 --- a/Lib/pathlib/__init__.py +++ b/Lib/pathlib/__init__.py @@ -558,8 +558,10 @@ def full_match(self, pattern, *, case_sensitive=None): # paths shouldn't match wildcards, so we change it to the empty string. path = str(self) if self.parts else '' pattern = str(pattern) if pattern.parts else '' - globber = _StringGlobber(self.parser.sep, case_sensitive, - recursive=True, include_hidden=True) + globber = _StringGlobber(sep=self.parser.sep, + case_sensitive=case_sensitive, + recursive=True, + include_hidden=True) return globber.compile(pattern)(path) is not None def match(self, path_pattern, *, case_sensitive=None): @@ -581,7 +583,8 @@ def match(self, path_pattern, *, case_sensitive=None): return False if len(path_parts) > len(pattern_parts) and path_pattern.anchor: return False - globber = _StringGlobber(self.parser.sep, case_sensitive) + globber = _StringGlobber(sep=self.parser.sep, + case_sensitive=case_sensitive) for path_part, pattern_part in zip(path_parts, pattern_parts): match = globber.compile(pattern_part) if match(path_part) is None: @@ -850,8 +853,11 @@ def glob(self, pattern, *, case_sensitive=None, recurse_symlinks=False): case_pedantic = True parts = self._parse_pattern(pattern) recursive = True if recurse_symlinks else _no_recurse_symlinks - globber = _StringGlobber(self.parser.sep, case_sensitive, case_pedantic, - recursive, include_hidden=True) + globber = _StringGlobber(sep=self.parser.sep, + case_sensitive=case_sensitive, + case_pedantic=case_pedantic, + recursive=recursive, + include_hidden=True) select = globber.selector(parts[::-1]) root = str(self) paths = select(self.parser.join(root, '')) diff --git a/Lib/pathlib/types.py b/Lib/pathlib/types.py index ff7d86786fcb49..cfaec3594bd9a6 100644 --- a/Lib/pathlib/types.py +++ b/Lib/pathlib/types.py @@ -240,8 +240,10 @@ def full_match(self, pattern): pattern is matched against the entire path. """ case_sensitive = self.parser.normcase('Aa') == 'Aa' - globber = _PathGlobber(self.parser.sep, case_sensitive, - recursive=True, include_hidden=True) + globber = _PathGlobber(sep=self.parser.sep, + case_sensitive=case_sensitive, + recursive=True, + include_hidden=True) match = globber.compile(pattern, altsep=self.parser.altsep) return match(vfspath(self)) is not None @@ -310,8 +312,10 @@ def glob(self, pattern, *, recurse_symlinks=True): elif not recurse_symlinks: raise NotImplementedError("recurse_symlinks=False is unsupported") case_sensitive = self.parser.normcase('Aa') == 'Aa' - globber = _PathGlobber(self.parser.sep, case_sensitive, - recursive=True, include_hidden=True) + globber = _PathGlobber(sep=self.parser.sep, + case_sensitive=case_sensitive, + recursive=True, + include_hidden=True) select = globber.selector(parts) return select(self.joinpath('')) From c093c4eaa287930220dfdee31ecf262b5c1c5b7d Mon Sep 17 00:00:00 2001 From: barneygale Date: Thu, 7 Aug 2025 21:39:28 +0100 Subject: [PATCH 7/7] Tweak Globber argument defaults --- Lib/glob.py | 2 +- Lib/pathlib/__init__.py | 7 ++----- Lib/pathlib/types.py | 8 ++------ 3 files changed, 5 insertions(+), 12 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index eb433b878a979e..90190dd7084763 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -170,7 +170,7 @@ class _GlobberBase: """ def __init__(self, *, sep=os.path.sep, case_sensitive=os.name != 'nt', - case_pedantic=False, recursive=False, include_hidden=False): + case_pedantic=False, recursive=True, include_hidden=True): self.sep = sep self.case_sensitive = case_sensitive self.case_pedantic = case_pedantic diff --git a/Lib/pathlib/__init__.py b/Lib/pathlib/__init__.py index dfad0102fbdda6..e5dcc6f3f05f4e 100644 --- a/Lib/pathlib/__init__.py +++ b/Lib/pathlib/__init__.py @@ -559,9 +559,7 @@ def full_match(self, pattern, *, case_sensitive=None): path = str(self) if self.parts else '' pattern = str(pattern) if pattern.parts else '' globber = _StringGlobber(sep=self.parser.sep, - case_sensitive=case_sensitive, - recursive=True, - include_hidden=True) + case_sensitive=case_sensitive) return globber.compile(pattern)(path) is not None def match(self, path_pattern, *, case_sensitive=None): @@ -856,8 +854,7 @@ def glob(self, pattern, *, case_sensitive=None, recurse_symlinks=False): globber = _StringGlobber(sep=self.parser.sep, case_sensitive=case_sensitive, case_pedantic=case_pedantic, - recursive=recursive, - include_hidden=True) + recursive=recursive) select = globber.selector(parts[::-1]) root = str(self) paths = select(self.parser.join(root, '')) diff --git a/Lib/pathlib/types.py b/Lib/pathlib/types.py index cfaec3594bd9a6..11b143d721e2b1 100644 --- a/Lib/pathlib/types.py +++ b/Lib/pathlib/types.py @@ -241,9 +241,7 @@ def full_match(self, pattern): """ case_sensitive = self.parser.normcase('Aa') == 'Aa' globber = _PathGlobber(sep=self.parser.sep, - case_sensitive=case_sensitive, - recursive=True, - include_hidden=True) + case_sensitive=case_sensitive) match = globber.compile(pattern, altsep=self.parser.altsep) return match(vfspath(self)) is not None @@ -313,9 +311,7 @@ def glob(self, pattern, *, recurse_symlinks=True): raise NotImplementedError("recurse_symlinks=False is unsupported") case_sensitive = self.parser.normcase('Aa') == 'Aa' globber = _PathGlobber(sep=self.parser.sep, - case_sensitive=case_sensitive, - recursive=True, - include_hidden=True) + case_sensitive=case_sensitive) select = globber.selector(parts) return select(self.joinpath(''))