Skip to content

Commit 67896aa

Browse files
authored
Optimize dvcignore (#4242)
* Update 1. unit test update 2. two might cause bugs
* dvc ignore
* Some refactoring
* clean
* Config tree use
* Solve sub dir repo problem. Add more tests
* typo error
* Change request

Co-authored-by: karajan1001 <[email protected]>
1 parent a4b3c6d commit 67896aa

17 files changed

+171
-209
lines changed

dvc/config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -316,7 +316,7 @@ def _load_config(self, level):
316316
filename = self.files[level]
317317
tree = self.tree if level == "repo" else self.wtree
318318

319-
if tree.exists(filename):
319+
if tree.exists(filename, use_dvcignore=False):
320320
with tree.open(filename) as fobj:
321321
conf_obj = configobj.ConfigObj(fobj)
322322
else:

dvc/ignore.py

Lines changed: 73 additions & 107 deletions
Original file line numberDiff line numberDiff line change
@@ -99,67 +99,6 @@ def __bool__(self):
9999
return bool(self.pattern_list)
100100

101101

102-
class DvcIgnorePatternsTrie(DvcIgnore):
103-
trie = None
104-
105-
def __init__(self):
106-
if self.trie is None:
107-
self.trie = StringTrie(separator=os.sep)
108-
109-
def __call__(self, root, dirs, files):
110-
ignore_pattern = self[root]
111-
if ignore_pattern:
112-
return ignore_pattern(root, dirs, files)
113-
return dirs, files
114-
115-
def __setitem__(self, root, ignore_pattern):
116-
base_pattern = self[root]
117-
common_dirname, merged_pattern = merge_patterns(
118-
base_pattern.dirname,
119-
base_pattern.pattern_list,
120-
ignore_pattern.dirname,
121-
ignore_pattern.pattern_list,
122-
)
123-
self.trie[root] = DvcIgnorePatterns(merged_pattern, common_dirname)
124-
125-
def __getitem__(self, root):
126-
ignore_pattern = self.trie.longest_prefix(root)
127-
if ignore_pattern:
128-
return ignore_pattern.value
129-
return DvcIgnorePatterns([], root)
130-
131-
132-
class DvcIgnoreDirs(DvcIgnore):
133-
def __init__(self, basenames):
134-
self.basenames = set(basenames)
135-
136-
def __call__(self, root, dirs, files):
137-
dirs = [d for d in dirs if d not in self.basenames]
138-
139-
return dirs, files
140-
141-
def __hash__(self):
142-
return hash(tuple(self.basenames))
143-
144-
def __eq__(self, other):
145-
if not isinstance(other, DvcIgnoreDirs):
146-
return NotImplemented
147-
148-
return self.basenames == other.basenames
149-
150-
151-
class DvcIgnoreRepo(DvcIgnore):
152-
def __call__(self, root, dirs, files):
153-
def is_dvc_repo(directory):
154-
from dvc.repo import Repo
155-
156-
return os.path.isdir(os.path.join(root, directory, Repo.DVC_DIR))
157-
158-
dirs = [d for d in dirs if not is_dvc_repo(d)]
159-
160-
return dirs, files
161-
162-
163102
class DvcIgnoreFilterNoop:
164103
def __init__(self, tree, root_dir):
165104
pass
@@ -175,61 +114,99 @@ def is_ignored_file(self, _):
175114

176115

177116
class DvcIgnoreFilter:
117+
@staticmethod
118+
def _is_dvc_repo(root, directory):
119+
from dvc.repo import Repo
120+
121+
return os.path.isdir(os.path.join(root, directory, Repo.DVC_DIR))
122+
178123
def __init__(self, tree, root_dir):
124+
from dvc.repo import Repo
125+
126+
default_ignore_patterns = [".hg/", ".git/", "{}/".format(Repo.DVC_DIR)]
127+
179128
self.tree = tree
180129
self.root_dir = root_dir
181-
self.ignores = {
182-
DvcIgnoreDirs([".git", ".hg", ".dvc"]),
183-
DvcIgnoreRepo(),
184-
}
185-
ignore_pattern_trie = DvcIgnorePatternsTrie()
130+
self.ignores_trie_tree = StringTrie(separator=os.sep)
131+
self.ignores_trie_tree[root_dir] = DvcIgnorePatterns(
132+
default_ignore_patterns, root_dir
133+
)
186134
for root, dirs, _ in self.tree.walk(self.root_dir):
187-
ignore_pattern = self._get_ignore_pattern(root)
188-
if ignore_pattern:
189-
ignore_pattern_trie[root] = ignore_pattern
190-
self.ignores.add(ignore_pattern_trie)
135+
self._update(root)
136+
self._update_sub_repo(root, dirs)
191137
dirs[:], _ = self(root, dirs, [])
192138

193-
def _get_ignore_pattern(self, dirname):
139+
def _update(self, dirname):
194140
ignore_file_path = os.path.join(dirname, DvcIgnore.DVCIGNORE_FILE)
195141
if self.tree.exists(ignore_file_path):
196-
return DvcIgnorePatterns.from_files(ignore_file_path, self.tree)
197-
return None
142+
new_pattern = DvcIgnorePatterns.from_files(
143+
ignore_file_path, self.tree
144+
)
145+
old_pattern = self._get_trie_pattern(dirname)
146+
if old_pattern:
147+
self.ignores_trie_tree[dirname] = DvcIgnorePatterns(
148+
*merge_patterns(
149+
old_pattern.pattern_list,
150+
old_pattern.dirname,
151+
new_pattern.pattern_list,
152+
new_pattern.dirname,
153+
)
154+
)
155+
else:
156+
self.ignores_trie_tree[dirname] = new_pattern
157+
158+
def _update_sub_repo(self, root, dirs):
159+
for d in dirs:
160+
if self._is_dvc_repo(root, d):
161+
old_pattern = self._get_trie_pattern(root)
162+
if old_pattern:
163+
self.ignores_trie_tree[root] = DvcIgnorePatterns(
164+
*merge_patterns(
165+
old_pattern.pattern_list,
166+
old_pattern.dirname,
167+
["/{}/".format(d)],
168+
root,
169+
)
170+
)
171+
else:
172+
self.ignores_trie_tree[root] = DvcIgnorePatterns(
173+
["/{}/".format(d)], root
174+
)
198175

199176
def __call__(self, root, dirs, files):
200-
for ignore in self.ignores:
201-
dirs, files = ignore(root, dirs, files)
177+
ignore_pattern = self._get_trie_pattern(root)
178+
if ignore_pattern:
179+
return ignore_pattern(root, dirs, files)
180+
else:
181+
return dirs, files
202182

203-
return dirs, files
183+
def _get_trie_pattern(self, dirname):
184+
ignore_pattern = self.ignores_trie_tree.longest_prefix(dirname).value
185+
return ignore_pattern
204186

205-
def is_ignored_dir(self, path):
206-
if not self._parents_exist(path):
187+
def _is_ignored(self, path, is_dir=False):
188+
if self._outside_repo(path):
207189
return True
190+
dirname, basename = os.path.split(os.path.normpath(path))
191+
ignore_pattern = self._get_trie_pattern(dirname)
192+
if ignore_pattern:
193+
return ignore_pattern.matches(dirname, basename, is_dir)
194+
else:
195+
return False
208196

197+
def is_ignored_dir(self, path):
209198
path = os.path.abspath(path)
210199
if path == self.root_dir:
211200
return False
212-
dirname, basename = os.path.split(path)
213-
dirs, _ = self(dirname, [basename], [])
214-
return not dirs
215-
216-
def is_ignored_file(self, path):
217-
if not self._parents_exist(path):
218-
return True
219201

220-
dirname, basename = os.path.split(os.path.normpath(path))
221-
_, files = self(os.path.abspath(dirname), [], [basename])
222-
return not files
202+
return self._is_ignored(path, True)
223203

224-
def _parents_exist(self, path):
225-
from dvc.repo import Repo
204+
def is_ignored_file(self, path):
205+
return self._is_ignored(path, False)
226206

207+
def _outside_repo(self, path):
227208
path = PathInfo(path)
228209

229-
# if parent is root_dir or inside a .dvc dir we can skip this check
230-
if path.parent == self.root_dir or Repo.DVC_DIR in path.parts:
231-
return True
232-
233210
# paths outside of the repo should be ignored
234211
path = relpath(path, self.root_dir)
235212
if path.startswith("..") or (
@@ -238,16 +215,5 @@ def _parents_exist(self, path):
238215
[os.path.abspath(path), self.root_dir]
239216
)
240217
):
241-
return False
242-
243-
# check if parent directories are in our ignores, starting from
244-
# root_dir
245-
for parent_dir in reversed(PathInfo(path).parents):
246-
dirname, basename = os.path.split(parent_dir)
247-
if basename == ".":
248-
# parent_dir == root_dir
249-
continue
250-
dirs, _ = self(os.path.abspath(dirname), [basename], [])
251-
if not dirs:
252-
return False
253-
return True
218+
return True
219+
return False

dvc/pathspec_math.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66

77
from pathspec.util import normalize_file
88

9+
from dvc.utils import relpath
10+
911

1012
def _not_ignore(rule):
1113
return (True, rule[1:]) if rule.startswith("!") else (False, rule)
@@ -53,14 +55,14 @@ def change_rule(rule, rel):
5355
def _change_dirname(dirname, pattern_list, new_dirname):
5456
if new_dirname == dirname:
5557
return pattern_list
56-
rel = os.path.relpath(dirname, new_dirname)
58+
rel = relpath(dirname, new_dirname)
5759
if rel.startswith(".."):
5860
raise ValueError("change dirname can only change to parent path")
5961

6062
return [change_rule(rule, rel) for rule in pattern_list]
6163

6264

63-
def merge_patterns(prefix_a, pattern_a, prefix_b, pattern_b):
65+
def merge_patterns(pattern_a, prefix_a, pattern_b, prefix_b):
6466
"""
6567
Merge two path specification patterns.
6668
@@ -69,17 +71,17 @@ def merge_patterns(prefix_a, pattern_a, prefix_b, pattern_b):
6971
based on this new base directory.
7072
"""
7173
if not pattern_a:
72-
return prefix_b, pattern_b
74+
return pattern_b, prefix_b
7375
elif not pattern_b:
74-
return prefix_a, pattern_a
76+
return pattern_a, prefix_a
7577

7678
longest_common_dir = os.path.commonpath([prefix_a, prefix_b])
7779
new_pattern_a = _change_dirname(prefix_a, pattern_a, longest_common_dir)
7880
new_pattern_b = _change_dirname(prefix_b, pattern_b, longest_common_dir)
7981

80-
if len(prefix_a) < len(prefix_b):
82+
if len(prefix_a) <= len(prefix_b):
8183
merged_pattern = new_pattern_a + new_pattern_b
8284
else:
8385
merged_pattern = new_pattern_b + new_pattern_a
8486

85-
return longest_common_dir, merged_pattern
87+
return merged_pattern, longest_common_dir

dvc/repo/tree.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -277,7 +277,9 @@ def open(
277277
)
278278
return self.repo.tree.open(path, mode=mode, encoding=encoding)
279279

280-
def exists(self, path): # pylint: disable=arguments-differ
280+
def exists(
281+
self, path, use_dvcignore=True
282+
): # pylint: disable=arguments-differ
281283
return self.repo.tree.exists(path) or (
282284
self.dvctree and self.dvctree.exists(path)
283285
)

dvc/tree/azure.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ def _generate_download_url(self, path_info, expires=3600):
101101
)
102102
return download_url
103103

104-
def exists(self, path_info):
104+
def exists(self, path_info, use_dvcignore=True):
105105
paths = self._list_paths(path_info.bucket, path_info.path)
106106
return any(path_info.path == path for path in paths)
107107

dvc/tree/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ def open(self, path_info, mode="r", encoding=None):
156156

157157
raise RemoteActionNotImplemented("open", self.scheme)
158158

159-
def exists(self, path_info):
159+
def exists(self, path_info, use_dvcignore=True):
160160
raise NotImplementedError
161161

162162
# pylint: disable=unused-argument

dvc/tree/gdrive.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -522,7 +522,7 @@ def _get_item_id(self, path_info, create=False, use_cache=True, hint=None):
522522
assert not create
523523
raise FileMissingError(path_info, hint)
524524

525-
def exists(self, path_info):
525+
def exists(self, path_info, use_dvcignore=True):
526526
try:
527527
self._get_item_id(path_info)
528528
except FileMissingError:

dvc/tree/git.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,9 +76,13 @@ def open(
7676
return io.BytesIO(data)
7777
return io.StringIO(data.decode(encoding))
7878

79-
def exists(self, path): # pylint: disable=arguments-differ
79+
def exists(
80+
self, path, use_dvcignore=True
81+
): # pylint: disable=arguments-differ
8082
if self._git_object_by_path(path) is None:
8183
return False
84+
if not use_dvcignore:
85+
return True
8286

8387
return not self.dvcignore.is_ignored_file(
8488
path

dvc/tree/gs.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ def _generate_download_url(self, path_info, expires=3600):
119119
)
120120
return signing_credentials.signer.sign(blob)
121121

122-
def exists(self, path_info):
122+
def exists(self, path_info, use_dvcignore=True):
123123
"""Check if the blob exists. If it does not exist,
124124
it could be a part of a directory path.
125125

dvc/tree/hdfs.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ def open(self, path_info, mode="r", encoding=None):
7272
raise FileNotFoundError(*e.args)
7373
raise
7474

75-
def exists(self, path_info):
75+
def exists(self, path_info, use_dvcignore=True):
7676
assert not isinstance(path_info, list)
7777
assert path_info.scheme == "hdfs"
7878
with self.hdfs(path_info) as hdfs:

dvc/tree/http.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ def request(self, method, url, **kwargs):
122122
except requests.exceptions.RequestException:
123123
raise DvcException(f"could not perform a {method} request")
124124

125-
def exists(self, path_info):
125+
def exists(self, path_info, use_dvcignore=True):
126126
return bool(self.request("HEAD", path_info.url))
127127

128128
def get_file_hash(self, path_info):

dvc/tree/local.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,14 +69,16 @@ def dvcignore(self):
6969
def open(path_info, mode="r", encoding=None):
7070
return open(path_info, mode=mode, encoding=encoding)
7171

72-
def exists(self, path_info):
72+
def exists(self, path_info, use_dvcignore=True):
7373
assert isinstance(path_info, str) or path_info.scheme == "local"
7474
if self.repo:
7575
ret = os.path.lexists(path_info)
7676
else:
7777
ret = os.path.exists(path_info)
7878
if not ret:
7979
return False
80+
if not use_dvcignore:
81+
return True
8082

8183
return not self.dvcignore.is_ignored_file(
8284
path_info

dvc/tree/oss.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ def _generate_download_url(self, path_info, expires=3600):
8888

8989
return self.oss_service.sign_url("GET", path_info.path, expires)
9090

91-
def exists(self, path_info):
91+
def exists(self, path_info, use_dvcignore=True):
9292
paths = self._list_paths(path_info)
9393
return any(path_info.path == path for path in paths)
9494

dvc/tree/s3.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ def _generate_download_url(self, path_info, expires=3600):
131131
ClientMethod="get_object", Params=params, ExpiresIn=int(expires)
132132
)
133133

134-
def exists(self, path_info):
134+
def exists(self, path_info, use_dvcignore=True):
135135
"""Check if the blob exists. If it does not exist,
136136
it could be a part of a directory path.
137137

0 commit comments

Comments
 (0)