Skip to content

Commit 58f6534

Browse files
karajan1001Suorefiop
authored
Optimize ignore performance (#4120)
* Remove Duplicate ignore Match * Continue Optimize Dvcignore fix #3869 * Add a new test with multi ignore files * Solve merging two dvc files * Solve Code Climate * For Windows * Complete addition of patterns. Add one test * Systematic test * Change request * Change request * Separate path specification math * Rename and add comment * rename change_dirname to private * Update dvc/pathspec_math.py list comprehension Co-authored-by: Alexander Schepanovski <[email protected]> * Change request * Update dvc/ignore.py Co-authored-by: karajan1001 <[email protected]> Co-authored-by: Alexander Schepanovski <[email protected]> Co-authored-by: Ruslan Kuprieiev <[email protected]>
1 parent 66def03 commit 58f6534

File tree

6 files changed

+341
-29
lines changed

6 files changed

+341
-29
lines changed

dvc/ignore.py

Lines changed: 73 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,10 @@
66
from funcy import cached_property
77
from pathspec.patterns import GitWildMatchPattern
88
from pathspec.util import normalize_file
9+
from pygtrie import StringTrie
910

1011
from dvc.path_info import PathInfo
12+
from dvc.pathspec_math import merge_patterns
1113
from dvc.scm.tree import BaseTree
1214
from dvc.system import System
1315
from dvc.utils import relpath
@@ -23,25 +25,33 @@ def __call__(self, root, dirs, files):
2325

2426

2527
class DvcIgnorePatterns(DvcIgnore):
26-
def __init__(self, ignore_file_path, tree):
27-
assert os.path.isabs(ignore_file_path)
28+
def __init__(self, pattern_list, dirname):
29+
30+
self.pattern_list = pattern_list
31+
self.dirname = dirname
32+
self.prefix = self.dirname + os.sep
2833

29-
self.ignore_file_path = ignore_file_path
30-
self.dirname = os.path.normpath(os.path.dirname(ignore_file_path))
34+
regex_pattern_list = map(
35+
GitWildMatchPattern.pattern_to_regex, pattern_list
36+
)
3137

38+
self.ignore_spec = [
39+
(ignore, re.compile("|".join(item[0] for item in group)))
40+
for ignore, group in groupby(regex_pattern_list, lambda x: x[1])
41+
if ignore is not None
42+
]
43+
44+
@classmethod
45+
def from_files(cls, ignore_file_path, tree):
46+
assert os.path.isabs(ignore_file_path)
47+
dirname = os.path.normpath(os.path.dirname(ignore_file_path))
3248
with tree.open(ignore_file_path, encoding="utf-8") as fobj:
33-
path_spec_lines = fobj.readlines()
34-
regex_pattern_list = map(
35-
GitWildMatchPattern.pattern_to_regex, path_spec_lines
36-
)
37-
self.ignore_spec = [
38-
(ignore, re.compile("|".join(item[0] for item in group)))
39-
for ignore, group in groupby(
40-
regex_pattern_list, lambda x: x[1]
41-
)
42-
if ignore is not None
49+
path_spec_lines = [
50+
line for line in map(str.strip, fobj.readlines()) if line
4351
]
4452

53+
return cls(path_spec_lines, dirname)
54+
4555
def __call__(self, root, dirs, files):
4656
files = [f for f in files if not self.matches(root, f)]
4757
dirs = [d for d in dirs if not self.matches(root, d, True)]
@@ -51,11 +61,10 @@ def __call__(self, root, dirs, files):
5161
def matches(self, dirname, basename, is_dir=False):
5262
# NOTE: `relpath` is too slow, so we have to assume that both
5363
# `dirname` and `self.dirname` are relative or absolute together.
54-
prefix = self.dirname + os.sep
5564
if dirname == self.dirname:
5665
path = basename
57-
elif dirname.startswith(prefix):
58-
rel = dirname[len(prefix) :]
66+
elif dirname.startswith(self.prefix):
67+
rel = dirname[len(self.prefix) :]
5968
# NOTE: `os.path.join` is ~x5.5 slower
6069
path = f"{rel}{os.sep}{basename}"
6170
else:
@@ -79,13 +88,47 @@ def ignore(self, path, is_dir):
7988
return result
8089

8190
def __hash__(self):
82-
return hash(self.ignore_file_path)
91+
return hash(self.dirname + ":" + "\n".join(self.pattern_list))
8392

8493
def __eq__(self, other):
8594
if not isinstance(other, DvcIgnorePatterns):
8695
return NotImplemented
96+
return (self.dirname == other.dirname) & (
97+
self.pattern_list == other.pattern_list
98+
)
99+
100+
def __bool__(self):
101+
return bool(self.pattern_list)
87102

88-
return self.ignore_file_path == other.ignore_file_path
103+
104+
class DvcIgnorePatternsTrie(DvcIgnore):
    """Ignore filter that keeps per-directory patterns in a path trie.

    Keys are directory paths split on ``os.sep``; values are
    ``DvcIgnorePatterns`` objects. Looking a directory up returns the value
    stored under the longest prefix of that path, so a directory without
    its own entry gets the patterns of the closest ancestor that has one.
    """

    trie = None

    def __init__(self):
        # Only create a trie when none is already present on the object.
        if self.trie is None:
            self.trie = StringTrie(separator=os.sep)

    def __call__(self, root, dirs, files):
        """Filter ``dirs`` and ``files`` with the patterns found for ``root``.

        When the lookup yields an empty pattern set, both lists are
        returned untouched.
        """
        patterns = self[root]
        if not patterns:
            return dirs, files
        return patterns(root, dirs, files)

    def __setitem__(self, root, ignore_pattern):
        """Store ``ignore_pattern`` at ``root``, merged with what applies there.

        The patterns currently returned for ``root`` are merged with the new
        ones through ``merge_patterns`` and the merged result is stored.
        """
        inherited = self[root]
        merged_dirname, merged_rules = merge_patterns(
            inherited.dirname,
            inherited.pattern_list,
            ignore_pattern.dirname,
            ignore_pattern.pattern_list,
        )
        self.trie[root] = DvcIgnorePatterns(merged_rules, merged_dirname)

    def __getitem__(self, root):
        """Return the patterns stored under the longest prefix of ``root``.

        Falls back to an empty ``DvcIgnorePatterns`` based at ``root`` when
        the trie has no matching prefix.
        """
        found = self.trie.longest_prefix(root)
        if not found:
            return DvcIgnorePatterns([], root)
        return found.value
89132

90133

91134
class DvcIgnoreDirs(DvcIgnore):
@@ -127,14 +170,19 @@ def __init__(self, tree, root_dir):
127170
DvcIgnoreDirs([".git", ".hg", ".dvc"]),
128171
DvcIgnoreRepo(),
129172
}
130-
for root, dirs, files in self.tree.walk(self.root_dir):
131-
self._update(root)
132-
dirs[:], files[:] = self(root, dirs, files)
133-
134-
def _update(self, dirname):
173+
ignore_pattern_trie = DvcIgnorePatternsTrie()
174+
for root, dirs, _ in self.tree.walk(self.root_dir):
175+
ignore_pattern = self._get_ignore_pattern(root)
176+
if ignore_pattern:
177+
ignore_pattern_trie[root] = ignore_pattern
178+
self.ignores.add(ignore_pattern_trie)
179+
dirs[:], _ = self(root, dirs, [])
180+
181+
def _get_ignore_pattern(self, dirname):
135182
ignore_file_path = os.path.join(dirname, DvcIgnore.DVCIGNORE_FILE)
136183
if self.tree.exists(ignore_file_path):
137-
self.ignores.add(DvcIgnorePatterns(ignore_file_path, self.tree))
184+
return DvcIgnorePatterns.from_files(ignore_file_path, self.tree)
185+
return None
138186

139187
def __call__(self, root, dirs, files):
140188
for ignore in self.ignores:

dvc/pathspec_math.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
# Path Specification Pattern Math
2+
# Including changing base dir of path specification patterns and merging
3+
# of two path specification patterns with different base
4+
# All the operations follow the `gitignore` documentation
5+
import os
6+
7+
from pathspec.util import normalize_file
8+
9+
10+
def _not_ignore(rule):
    """Split a leading ``!`` marker off ``rule``.

    Returns a ``(negated, rule)`` tuple: ``negated`` is True when the rule
    started with ``!``, in which case the returned rule has it removed.
    """
    if rule.startswith("!"):
        return True, rule[1:]
    return False, rule
12+
13+
14+
def _is_comment(rule):
    """Tell whether ``rule`` starts with the ``#`` character."""
    return rule[:1] == "#"
16+
17+
18+
def _remove_slash(rule):
    """Drop a single leading backslash from ``rule`` when there is one."""
    return rule[1:] if rule.startswith("\\") else rule
22+
23+
24+
def _match_all_level(rule):
    """Classify ``rule`` as matching at every depth or only at its base.

    Returns a ``(match_all, rule)`` tuple:

    * a rule starting with ``**/`` matches at every level; the prefix is
      stripped from the returned rule;
    * a rule with a ``/`` anywhere except its last character is tied to the
      base directory; one leading ``/`` is stripped;
    * any other rule matches at every level and is returned unchanged.
    """
    if rule.startswith("**/"):
        return True, rule[3:]
    # The last character is excluded so a trailing "/" does not count.
    if "/" in rule[:-1]:
        return False, rule[1:] if rule.startswith("/") else rule
    return True, rule
32+
33+
34+
def change_rule(rule, rel):
    """Rewrite ``rule`` so that it applies from a directory above its base.

    ``rel`` is the path of the rule's original base directory relative to
    the new base. The rewritten rule is anchored under ``/<rel>/``; rules
    that matched at every level get ``/**/`` inserted after that anchor,
    and a leading ``!`` is kept in front of the rewritten rule. The result
    is passed through ``normalize_file``.

    Comments and blank rules are returned as they are (after stripping
    surrounding whitespace).
    """
    rule = rule.strip()
    # A blank gitignore line matches nothing. Rebasing it would produce the
    # non-empty pattern "/<rel>/**/", so it is returned untouched instead.
    if not rule or _is_comment(rule):
        return rule
    not_ignore, rule = _not_ignore(rule)
    match_all, rule = _match_all_level(rule)
    rule = _remove_slash(rule)
    anchor = "/**/" if match_all else "/"
    negation = "!" if not_ignore else ""
    return normalize_file(f"{negation}/{rel}{anchor}{rule}")
51+
52+
53+
def _change_dirname(dirname, pattern_list, new_dirname):
    """Rebase ``pattern_list`` from ``dirname`` onto ``new_dirname``.

    Returns ``pattern_list`` itself when both directories are equal;
    otherwise returns a new list with every rule rewritten by
    ``change_rule`` relative to ``new_dirname``.

    Raises:
        ValueError: if the relative path from ``new_dirname`` to ``dirname``
            leads upwards, i.e. ``new_dirname`` is not a parent of
            ``dirname``.
    """
    if new_dirname == dirname:
        return pattern_list
    rel = os.path.relpath(dirname, new_dirname)
    # Compare whole path components: a bare ``startswith("..")`` would also
    # reject a child directory whose own name begins with two dots.
    if rel == os.pardir or rel.startswith(os.pardir + os.sep):
        raise ValueError("change dirname can only change to parent path")

    return [change_rule(rule, rel) for rule in pattern_list]
61+
62+
63+
def merge_patterns(prefix_a, pattern_a, prefix_b, pattern_b):
    """
    Combine two pattern lists that are based on different directories.

    When one of the lists is empty, the other prefix and list are returned
    unchanged. Otherwise both lists are rebased onto the longest common
    parent directory of the two prefixes and concatenated, with the
    patterns of the shorter prefix placed first.

    Returns a ``(common_dir, merged_patterns)`` tuple.
    """
    if not pattern_a:
        return prefix_b, pattern_b
    if not pattern_b:
        return prefix_a, pattern_a

    common_base = os.path.commonpath([prefix_a, prefix_b])
    rebased_a = _change_dirname(prefix_a, pattern_a, common_base)
    rebased_b = _change_dirname(prefix_b, pattern_b, common_base)

    # Patterns belonging to the shorter prefix go first.
    if len(prefix_a) < len(prefix_b):
        ordered = (rebased_a, rebased_b)
    else:
        ordered = (rebased_b, rebased_a)

    return common_base, ordered[0] + ordered[1]

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ count=true
1717
[isort]
1818
include_trailing_comma=true
1919
known_first_party=dvc,tests
20-
known_third_party=PyInstaller,RangeHTTPServer,boto3,colorama,configobj,distro,dpath,flaky,flufl,funcy,git,grandalf,mock,moto,nanotime,networkx,packaging,pathspec,pylint,pytest,requests,ruamel,setuptools,shortuuid,shtab,tqdm,voluptuous,yaml,zc
20+
known_third_party=PyInstaller,RangeHTTPServer,boto3,colorama,configobj,distro,dpath,flaky,flufl,funcy,git,grandalf,mock,moto,nanotime,networkx,packaging,pathspec,pygtrie,pylint,pytest,requests,ruamel,setuptools,shortuuid,shtab,tqdm,voluptuous,yaml,zc
2121
line_length=79
2222
force_grid_wrap=0
2323
use_parentheses=True

tests/func/test_ignore.py

Lines changed: 109 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
DvcIgnore,
99
DvcIgnoreDirs,
1010
DvcIgnorePatterns,
11+
DvcIgnorePatternsTrie,
1112
DvcIgnoreRepo,
1213
)
1314
from dvc.repo import Repo
@@ -98,12 +99,19 @@ def test_ignore_collecting_dvcignores(tmp_dir, dvc, dname):
9899

99100
assert len(dvc.tree.dvcignore.ignores) == 3
100101
assert DvcIgnoreDirs([".git", ".hg", ".dvc"]) in dvc.tree.dvcignore.ignores
102+
ignore_pattern_trie = None
103+
for ignore in dvc.tree.dvcignore.ignores:
104+
if isinstance(ignore, DvcIgnorePatternsTrie):
105+
ignore_pattern_trie = ignore
106+
107+
assert ignore_pattern_trie is not None
101108
assert (
102-
DvcIgnorePatterns(
109+
DvcIgnorePatterns.from_files(
103110
os.fspath(top_ignore_file), WorkingTree(dvc.root_dir)
104111
)
105-
in dvc.tree.dvcignore.ignores
112+
== ignore_pattern_trie[os.fspath(ignore_file)]
106113
)
114+
107115
assert any(
108116
i for i in dvc.tree.dvcignore.ignores if isinstance(i, DvcIgnoreRepo)
109117
)
@@ -236,3 +244,102 @@ def test_ignore_directory(tmp_dir, dvc):
236244
assert _files_set("dir", dvc.tree) == {
237245
"dir/{}".format(DvcIgnore.DVCIGNORE_FILE),
238246
}
247+
248+
249+
def test_multi_ignore_file(tmp_dir, dvc, monkeypatch):
    """A nested ignore file can re-include a file the top-level one ignores."""
    tmp_dir.gen({"dir": {"subdir": {"should_ignore": "1", "not_ignore": "1"}}})
    # Top-level rule ignores both files; the rule in "dir" negates one of them.
    tmp_dir.gen(DvcIgnore.DVCIGNORE_FILE, "dir/subdir/*_ignore")
    tmp_dir.gen({"dir": {DvcIgnore.DVCIGNORE_FILE: "!subdir/not_ignore"}})

    expected = {
        "dir/subdir/not_ignore",
        "dir/{}".format(DvcIgnore.DVCIGNORE_FILE),
    }
    assert _files_set("dir", dvc.tree) == expected
258+
259+
260+
def test_pattern_trie_tree(tmp_dir, dvc):
    """Check the patterns the trie reports at each level of a nested tree.

    Ignore files exist in ``top/first``, ``top/first/middle/second`` and
    ``other``; every other directory is expected to report the patterns of
    the closest directory above it that has an entry.
    """
    tmp_dir.gen(
        {
            "top": {
                "first": {
                    DvcIgnore.DVCIGNORE_FILE: "a\nb\nc",
                    "middle": {
                        "second": {
                            DvcIgnore.DVCIGNORE_FILE: "d\ne\nf",
                            "bottom": {},
                        }
                    },
                },
            },
            "other": {DvcIgnore.DVCIGNORE_FILE: "1\n2\n3"},
        }
    )
    ignore_pattern_trie = next(
        (
            ignore
            for ignore in dvc.tree.dvcignore.ignores
            if isinstance(ignore, DvcIgnorePatternsTrie)
        ),
        None,
    )
    assert ignore_pattern_trie is not None

    top_dir = os.fspath(tmp_dir / "top")
    other_dir = os.fspath(tmp_dir / "other")
    first_dir = os.fspath(tmp_dir / "top" / "first")
    middle_dir = os.fspath(tmp_dir / "top" / "first" / "middle")
    second_dir = os.fspath(tmp_dir / "top" / "first" / "middle" / "second")
    bottom_dir = os.fspath(
        tmp_dir / "top" / "first" / "middle" / "second" / "bottom"
    )

    found_top = ignore_pattern_trie[top_dir]
    found_other = ignore_pattern_trie[other_dir]
    found_first = ignore_pattern_trie[first_dir]
    found_middle = ignore_pattern_trie[middle_dir]
    found_second = ignore_pattern_trie[second_dir]
    found_bottom = ignore_pattern_trie[bottom_dir]

    first_rules = ["a", "b", "c"]
    # Rules from "second" appear rebased onto the "first" directory.
    merged_rules = [
        "a",
        "b",
        "c",
        "/middle/second/**/d",
        "/middle/second/**/e",
        "/middle/second/**/f",
    ]

    assert not found_top
    assert DvcIgnorePatterns([], top_dir) == found_top
    assert DvcIgnorePatterns(["1", "2", "3"], other_dir) == found_other
    assert DvcIgnorePatterns(list(first_rules), first_dir) == found_first
    assert DvcIgnorePatterns(list(first_rules), first_dir) == found_middle
    assert DvcIgnorePatterns(list(merged_rules), first_dir) == found_second
    assert DvcIgnorePatterns(list(merged_rules), first_dir) == found_bottom

tests/unit/test_ignore.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
def mock_dvcignore(dvcignore_path, patterns):
1010
tree = MagicMock()
1111
with patch.object(tree, "open", mock_open(read_data="\n".join(patterns))):
12-
ignore_patterns = DvcIgnorePatterns(dvcignore_path, tree)
12+
ignore_patterns = DvcIgnorePatterns.from_files(dvcignore_path, tree)
1313

1414
return ignore_patterns
1515

0 commit comments

Comments
 (0)