Skip to content

Commit dfedd03

Browse files
committed
dvcignore: optimize matching x10
On a repo with a dvcignore containing 1 pattern and a directory with 400K files, `dvc status` now takes ~8 sec instead of ~30 sec. To achieve that, we make some assumptions about the path formats that we are dealing with, so that we can use simpler logic instead of calling the very slow `relpath`, `abspath`, etc. on every entry in a directory. It is also clear that CleanTree behavior is inconsistent (even tests expect very different outputs from it), so we will need to look into this later.
1 parent 2fc7033 commit dfedd03

File tree

2 files changed

+15
-8
lines changed

2 files changed

+15
-8
lines changed

dvc/ignore.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
from pathspec.patterns import GitWildMatchPattern
77

88
from dvc.scm.tree import BaseTree
9-
from dvc.utils import relpath
109

1110
logger = logging.getLogger(__name__)
1211

@@ -35,12 +34,19 @@ def __call__(self, root, dirs, files):
3534
return dirs, files
3635

3736
def matches(self, dirname, basename):
38-
abs_path = os.path.join(dirname, basename)
39-
rel_path = relpath(abs_path, self.dirname)
40-
41-
if os.pardir + os.sep in rel_path:
37+
# NOTE: `relpath` is too slow, so we have to assume that both
38+
# `dirname` and `self.dirname` are relative or absolute together.
39+
prefix = self.dirname + os.sep
40+
if dirname == self.dirname:
41+
path = basename
42+
elif dirname.startswith(prefix):
43+
rel = dirname[len(prefix) :]
44+
# NOTE: `os.path.join` is ~x5.5 slower
45+
path = f"{rel}{os.sep}{basename}"
46+
else:
4247
return False
43-
return self.ignore_spec.match_file(rel_path)
48+
49+
return self.ignore_spec.match_file(path)
4450

4551
def __hash__(self):
4652
return hash(self.ignore_file_path)
@@ -135,7 +141,9 @@ def isexec(self, path):
135141

136142
def walk(self, top, topdown=True):
137143
for root, dirs, files in self.tree.walk(top, topdown):
138-
dirs[:], files[:] = self.dvcignore(root, dirs, files)
144+
dirs[:], files[:] = self.dvcignore(
145+
os.path.abspath(root), dirs, files
146+
)
139147

140148
yield root, dirs, files
141149

tests/unit/test_ignore.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,6 @@ def test_ignore_from_file_should_filter_dirs_and_files():
6969
),
7070
("dont_ignore.txt", ["dont_ignore"], False),
7171
("dont_ignore.txt", ["dont*", "!dont_ignore.txt"], False),
72-
("../../../something.txt", ["**/something.txt"], False),
7372
],
7473
)
7574
def test_match_ignore_from_file(

0 commit comments

Comments
 (0)