From 74a495c6b1dee7b1c71e0b82ca1ab8362aceda78 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sun, 19 May 2024 20:07:23 +0100 Subject: [PATCH 1/8] GH-119169: Speed up `os.walk()` Handle errors from `os.scandir()` and `ScandirIterator` similarly, which lets us loop over directory entries with `for`. In top-down mode, call `os.path.join()` at most once per iteration. --- Lib/os.py | 70 ++++++++----------- ...-05-19-20-13-09.gh-issue-119169.CuE7q0.rst | 2 + 2 files changed, 31 insertions(+), 41 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2024-05-19-20-13-09.gh-issue-119169.CuE7q0.rst diff --git a/Lib/os.py b/Lib/os.py index 7661ce68ca3be2..3ced7c18522fd4 100644 --- a/Lib/os.py +++ b/Lib/os.py @@ -361,39 +361,23 @@ def walk(top, topdown=True, onerror=None, followlinks=False): # minor reason when (say) a thousand readable directories are still # left to visit. try: - scandir_it = scandir(top) - except OSError as error: - if onerror is not None: - onerror(error) - continue - - cont = False - with scandir_it: - while True: - try: + with scandir(top) as entries: + for entry in entries: try: - entry = next(scandir_it) - except StopIteration: - break - except OSError as error: - if onerror is not None: - onerror(error) - cont = True - break - - try: - is_dir = entry.is_dir() - except OSError: - # If is_dir() raises an OSError, consider the entry not to - # be a directory, same behaviour as os.path.isdir(). - is_dir = False - - if is_dir: - dirs.append(entry.name) - else: - nondirs.append(entry.name) + is_dir = entry.is_dir() + except OSError: + # If is_dir() raises an OSError, consider the entry not to + # be a directory, same behaviour as os.path.isdir(). + is_dir = False + + if is_dir: + dirs.append(entry.name) + else: + nondirs.append(entry.name) + continue + if topdown: + continue - if not topdown and is_dir: # Bottom-up: traverse into sub-directory, but exclude # symlinks to directories if followlinks is False if followlinks: @@ -410,21 +394,25 @@ def walk(top, topdown=True, onerror=None, followlinks=False): if walk_into: walk_dirs.append(entry.path) - if cont: + except OSError as error: + if onerror is not None: + onerror(error) continue if topdown: # Yield before sub-directory traversal if going top down yield top, dirs, nondirs - # Traverse into sub-directories - for dirname in reversed(dirs): - new_path = join(top, dirname) - # bpo-23605: os.path.islink() is used instead of caching - # entry.is_symlink() result during the loop on os.scandir() because - # the caller can replace the directory entry during the "yield" - # above. - if followlinks or not islink(new_path): - stack.append(new_path) + if dirs: + # Traverse into sub-directories + prefix = join(top, top[:0]) + for dirname in reversed(dirs): + new_path = prefix + dirname + # bpo-23605: os.path.islink() is used instead of caching + # entry.is_symlink() result during the loop on os.scandir() because + # the caller can replace the directory entry during the "yield" + # above. + if followlinks or not islink(new_path): + stack.append(new_path) else: # Yield after sub-directory traversal if going bottom up stack.append((top, dirs, nondirs)) diff --git a/Misc/NEWS.d/next/Library/2024-05-19-20-13-09.gh-issue-119169.CuE7q0.rst b/Misc/NEWS.d/next/Library/2024-05-19-20-13-09.gh-issue-119169.CuE7q0.rst new file mode 100644 index 00000000000000..4e1b3258f63c32 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-05-19-20-13-09.gh-issue-119169.CuE7q0.rst @@ -0,0 +1,2 @@ +Slightly speed up :func:`os.walk` by simplifying exception handling and +reducing path joining. From 46a3ba54ac805786649f8ab0e5dd0ace2051f58b Mon Sep 17 00:00:00 2001 From: barneygale Date: Thu, 23 May 2024 09:29:00 +0100 Subject: [PATCH 2/8] Simplify bottom-up implementation. --- Lib/os.py | 43 +++++++++++++++---------------------------- 1 file changed, 15 insertions(+), 28 deletions(-) diff --git a/Lib/os.py b/Lib/os.py index 3ced7c18522fd4..32f47078f1bd98 100644 --- a/Lib/os.py +++ b/Lib/os.py @@ -353,7 +353,9 @@ def walk(top, topdown=True, onerror=None, followlinks=False): dirs = [] nondirs = [] - walk_dirs = [] + if not topdown: + # Yield after sub-directory traversal if going bottom up + stack.append((top, dirs, nondirs)) # We may not have read permission for top, in which case we can't # get a list of the files the directory contains. @@ -363,40 +365,31 @@ def walk(top, topdown=True, onerror=None, followlinks=False): try: with scandir(top) as entries: for entry in entries: + is_dir = False try: - is_dir = entry.is_dir() + if entry.is_dir(): + is_dir = True + # Bottom-up: traverse into sub-directory, but exclude + # symlinks to directories if followlinks is False + if not topdown and (followlinks or not entry.is_symlink()): + stack.append(entry.path) except OSError: # If is_dir() raises an OSError, consider the entry not to # be a directory, same behaviour as os.path.isdir(). - is_dir = False + pass if is_dir: dirs.append(entry.name) else: nondirs.append(entry.name) - continue - if topdown: - continue - # Bottom-up: traverse into sub-directory, but exclude - # symlinks to directories if followlinks is False - if followlinks: - walk_into = True - else: - try: - is_symlink = entry.is_symlink() - except OSError: - # If is_symlink() raises an OSError, consider the - # entry not to be a symbolic link, same behaviour - # as os.path.islink(). - is_symlink = False - walk_into = not is_symlink - - if walk_into: - walk_dirs.append(entry.path) except OSError as error: if onerror is not None: onerror(error) + if not topdown: + # Undo additions to stack. + while not isinstance(stack.pop(), tuple): + pass continue if topdown: @@ -413,12 +406,6 @@ def walk(top, topdown=True, onerror=None, followlinks=False): # above. if followlinks or not islink(new_path): stack.append(new_path) - else: - # Yield after sub-directory traversal if going bottom up - stack.append((top, dirs, nondirs)) - # Traverse into sub-directories - for new_path in reversed(walk_dirs): - stack.append(new_path) __all__.append("walk") From a6382026d696e3be9f6d057e347949bd1204e3b1 Mon Sep 17 00:00:00 2001 From: barneygale Date: Thu, 30 May 2024 05:16:37 +0100 Subject: [PATCH 3/8] Apply the same changes to fwalk() --- Lib/os.py | 61 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 34 insertions(+), 27 deletions(-) diff --git a/Lib/os.py b/Lib/os.py index d88cb760e0e24a..46c3b0dbfffffd 100644 --- a/Lib/os.py +++ b/Lib/os.py @@ -401,8 +401,8 @@ def walk(top, topdown=True, onerror=None, followlinks=False): if topdown: # Yield before sub-directory traversal if going top down yield top, dirs, nondirs + # Traverse into sub-directories if dirs: - # Traverse into sub-directories prefix = join(top, top[:0]) for dirname in reversed(dirs): new_path = prefix + dirname @@ -498,43 +498,50 @@ def _fwalk(stack, isbytes, topdown, onerror, follow_symlinks): if not path.samestat(orig_st, stat(topfd)): return - scandir_it = scandir(topfd) dirs = [] nondirs = [] - entries = None if topdown or follow_symlinks else [] - for entry in scandir_it: - name = entry.name - if isbytes: - name = fsencode(name) - try: - if entry.is_dir(): + if not topdown: + # Yield after sub-directory traversal if going bottom up. + stack.append((_fwalk_yield, (toppath, dirs, nondirs, topfd))) + + topprefix = path.join(toppath, toppath[:0]) # Add trailing slash. + try: + for entry in scandir(topfd): + name = entry.name + if isbytes: + name = fsencode(name) + is_dir = False + try: + is_dir = entry.is_dir() + if is_dir and not topdown: + # Bottom-up: traverse into sub-directory. + stack.append( + (_fwalk_walk, ( + False, topfd, topprefix + name, name, + None if follow_symlinks else entry))) + except OSError: + # If is_dir() raises an OSError, consider the entry not to + # be a directory, same behaviour as os.path.isdir(). + pass + + if is_dir: dirs.append(name) - if entries is not None: - entries.append(entry) else: nondirs.append(name) - except OSError: - try: - # Add dangling symlinks, ignore disappeared files - if entry.is_symlink(): - nondirs.append(name) - except OSError: + except: + # Undo additions to stack in bottom-up mode. + if not topdown: + while stack.pop()[0] != _fwalk_yield: pass + raise if topdown: + # Yield before sub-directory traversal if going top down yield toppath, dirs, nondirs, topfd - else: - stack.append((_fwalk_yield, (toppath, dirs, nondirs, topfd))) - - toppath = path.join(toppath, toppath[:0]) # Add trailing slash. - if entries is None: + # Traverse into sub-directories stack.extend( - (_fwalk_walk, (False, topfd, toppath + name, name, None)) + (_fwalk_walk, (False, topfd, topprefix + name, name, None)) for name in dirs[::-1]) - else: - stack.extend( - (_fwalk_walk, (False, topfd, toppath + name, name, entry)) - for name, entry in zip(dirs[::-1], entries[::-1])) __all__.append("fwalk") From 91b5a78584c1701af18938870c7fb4b33cd5ba4c Mon Sep 17 00:00:00 2001 From: barneygale Date: Thu, 30 May 2024 05:27:49 +0100 Subject: [PATCH 4/8] Simplify code slightly --- Lib/os.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/Lib/os.py b/Lib/os.py index 46c3b0dbfffffd..f15345e32f0c29 100644 --- a/Lib/os.py +++ b/Lib/os.py @@ -510,21 +510,21 @@ def _fwalk(stack, isbytes, topdown, onerror, follow_symlinks): name = entry.name if isbytes: name = fsencode(name) - is_dir = False + try: is_dir = entry.is_dir() - if is_dir and not topdown: - # Bottom-up: traverse into sub-directory. - stack.append( - (_fwalk_walk, ( - False, topfd, topprefix + name, name, - None if follow_symlinks else entry))) except OSError: # If is_dir() raises an OSError, consider the entry not to # be a directory, same behaviour as os.path.isdir(). - pass + is_dir = False if is_dir: + if not topdown: + # Bottom-up: traverse into sub-directory. + stack.append( + (_fwalk_walk, ( + False, topfd, topprefix + name, name, + None if follow_symlinks else entry))) dirs.append(name) else: nondirs.append(name) From b32c1c0641f9cec2e6788b0ae49ba61d9b9b3d75 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sat, 6 Jul 2024 14:39:56 +0100 Subject: [PATCH 5/8] Simplify exception handling in fwalk() --- Lib/os.py | 49 +++++++++++++++++++++---------------------------- 1 file changed, 21 insertions(+), 28 deletions(-) diff --git a/Lib/os.py b/Lib/os.py index f15345e32f0c29..26a739c8b0f9e6 100644 --- a/Lib/os.py +++ b/Lib/os.py @@ -505,35 +505,28 @@ def _fwalk(stack, isbytes, topdown, onerror, follow_symlinks): stack.append((_fwalk_yield, (toppath, dirs, nondirs, topfd))) topprefix = path.join(toppath, toppath[:0]) # Add trailing slash. - try: - for entry in scandir(topfd): - name = entry.name - if isbytes: - name = fsencode(name) - - try: - is_dir = entry.is_dir() - except OSError: - # If is_dir() raises an OSError, consider the entry not to - # be a directory, same behaviour as os.path.isdir(). - is_dir = False + for entry in scandir(topfd): + name = entry.name + if isbytes: + name = fsencode(name) - if is_dir: - if not topdown: - # Bottom-up: traverse into sub-directory. - stack.append( - (_fwalk_walk, ( - False, topfd, topprefix + name, name, - None if follow_symlinks else entry))) - dirs.append(name) - else: - nondirs.append(name) - except: - # Undo additions to stack in bottom-up mode. - if not topdown: - while stack.pop()[0] != _fwalk_yield: - pass - raise + try: + is_dir = entry.is_dir() + except OSError: + # If is_dir() raises an OSError, consider the entry not to + # be a directory, same behaviour as os.path.isdir(). + is_dir = False + + if is_dir: + if not topdown: + # Bottom-up: traverse into sub-directory. + stack.append( + (_fwalk_walk, ( + False, topfd, topprefix + name, name, + None if follow_symlinks else entry))) + dirs.append(name) + else: + nondirs.append(name) if topdown: # Yield before sub-directory traversal if going top down From efd31e2ef5ce2c8d5132d165bef861b8dca6ae90 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sat, 6 Jul 2024 14:43:12 +0100 Subject: [PATCH 6/8] Undo trailing slash optimisation (moved to another PR) --- Lib/os.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/Lib/os.py b/Lib/os.py index 5fb36603ad18be..6eb07b8300def0 100644 --- a/Lib/os.py +++ b/Lib/os.py @@ -410,16 +410,14 @@ def walk(top, topdown=True, onerror=None, followlinks=False): # Yield before sub-directory traversal if going top down yield top, dirs, nondirs # Traverse into sub-directories - if dirs: - prefix = join(top, top[:0]) - for dirname in reversed(dirs): - new_path = prefix + dirname - # bpo-23605: os.path.islink() is used instead of caching - # entry.is_symlink() result during the loop on os.scandir() because - # the caller can replace the directory entry during the "yield" - # above. - if followlinks or not islink(new_path): - stack.append(new_path) + for dirname in reversed(dirs): + new_path = join(top, dirname) + # bpo-23605: os.path.islink() is used instead of caching + # entry.is_symlink() result during the loop on os.scandir() because + # the caller can replace the directory entry during the "yield" + # above. + if followlinks or not islink(new_path): + stack.append(new_path) __all__.append("walk") From 5b89c5759f898f616fd2c51f3b38294c8c079369 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sat, 6 Jul 2024 14:46:06 +0100 Subject: [PATCH 7/8] Undo fwalk() changes. --- Lib/os.py | 50 +++++++++++++++++++++++++------------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/Lib/os.py b/Lib/os.py index 6eb07b8300def0..24f1706c758f02 100644 --- a/Lib/os.py +++ b/Lib/os.py @@ -511,43 +511,43 @@ def _fwalk(stack, isbytes, topdown, onerror, follow_symlinks): if not path.samestat(orig_st, stat(topfd)): return + scandir_it = scandir(topfd) dirs = [] nondirs = [] - if not topdown: - # Yield after sub-directory traversal if going bottom up. - stack.append((_fwalk_yield, (toppath, dirs, nondirs, topfd))) - - topprefix = path.join(toppath, toppath[:0]) # Add trailing slash. - for entry in scandir(topfd): + entries = None if topdown or follow_symlinks else [] + for entry in scandir_it: name = entry.name if isbytes: name = fsencode(name) - try: - is_dir = entry.is_dir() + if entry.is_dir(): + dirs.append(name) + if entries is not None: + entries.append(entry) + else: + nondirs.append(name) except OSError: - # If is_dir() raises an OSError, consider the entry not to - # be a directory, same behaviour as os.path.isdir(). - is_dir = False - - if is_dir: - if not topdown: - # Bottom-up: traverse into sub-directory. - stack.append( - (_fwalk_walk, ( - False, topfd, topprefix + name, name, - None if follow_symlinks else entry))) - dirs.append(name) - else: - nondirs.append(name) + try: + # Add dangling symlinks, ignore disappeared files + if entry.is_symlink(): + nondirs.append(name) + except OSError: + pass if topdown: - # Yield before sub-directory traversal if going top down yield toppath, dirs, nondirs, topfd - # Traverse into sub-directories + else: + stack.append((_fwalk_yield, (toppath, dirs, nondirs, topfd))) + + toppath = path.join(toppath, toppath[:0]) # Add trailing slash. + if entries is None: stack.extend( - (_fwalk_walk, (False, topfd, topprefix + name, name, None)) + (_fwalk_walk, (False, topfd, toppath + name, name, None)) for name in dirs[::-1]) + else: + stack.extend( + (_fwalk_walk, (False, topfd, toppath + name, name, entry)) + for name, entry in zip(dirs[::-1], entries[::-1])) __all__.append("fwalk") From 525618ce65a525eae29c64d414644f45f994647f Mon Sep 17 00:00:00 2001 From: barneygale Date: Sat, 6 Jul 2024 14:48:04 +0100 Subject: [PATCH 8/8] Fix news entry. --- .../Library/2024-05-19-20-13-09.gh-issue-119169.CuE7q0.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Misc/NEWS.d/next/Library/2024-05-19-20-13-09.gh-issue-119169.CuE7q0.rst b/Misc/NEWS.d/next/Library/2024-05-19-20-13-09.gh-issue-119169.CuE7q0.rst index 4e1b3258f63c32..5d9b50d452a9cd 100644 --- a/Misc/NEWS.d/next/Library/2024-05-19-20-13-09.gh-issue-119169.CuE7q0.rst +++ b/Misc/NEWS.d/next/Library/2024-05-19-20-13-09.gh-issue-119169.CuE7q0.rst @@ -1,2 +1 @@ -Slightly speed up :func:`os.walk` by simplifying exception handling and -reducing path joining. +Slightly speed up :func:`os.walk` by simplifying exception handling.