Skip to content

Optimize dvcignore #4242

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Jul 22, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion dvc/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,7 @@ def _load_config(self, level):
filename = self.files[level]
tree = self.tree if level == "repo" else self.wtree

if tree.exists(filename):
if tree.exists(filename, use_dvcignore=False):
with tree.open(filename) as fobj:
conf_obj = configobj.ConfigObj(fobj)
else:
Expand Down
180 changes: 73 additions & 107 deletions dvc/ignore.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,67 +99,6 @@ def __bool__(self):
return bool(self.pattern_list)


class DvcIgnorePatternsTrie(DvcIgnore):
trie = None

def __init__(self):
if self.trie is None:
self.trie = StringTrie(separator=os.sep)

def __call__(self, root, dirs, files):
ignore_pattern = self[root]
if ignore_pattern:
return ignore_pattern(root, dirs, files)
return dirs, files

def __setitem__(self, root, ignore_pattern):
base_pattern = self[root]
common_dirname, merged_pattern = merge_patterns(
base_pattern.dirname,
base_pattern.pattern_list,
ignore_pattern.dirname,
ignore_pattern.pattern_list,
)
self.trie[root] = DvcIgnorePatterns(merged_pattern, common_dirname)

def __getitem__(self, root):
ignore_pattern = self.trie.longest_prefix(root)
if ignore_pattern:
return ignore_pattern.value
return DvcIgnorePatterns([], root)


class DvcIgnoreDirs(DvcIgnore):
def __init__(self, basenames):
self.basenames = set(basenames)

def __call__(self, root, dirs, files):
dirs = [d for d in dirs if d not in self.basenames]

return dirs, files

def __hash__(self):
return hash(tuple(self.basenames))

def __eq__(self, other):
if not isinstance(other, DvcIgnoreDirs):
return NotImplemented

return self.basenames == other.basenames


class DvcIgnoreRepo(DvcIgnore):
def __call__(self, root, dirs, files):
def is_dvc_repo(directory):
from dvc.repo import Repo

return os.path.isdir(os.path.join(root, directory, Repo.DVC_DIR))

dirs = [d for d in dirs if not is_dvc_repo(d)]

return dirs, files


class DvcIgnoreFilterNoop:
def __init__(self, tree, root_dir):
pass
Expand All @@ -175,61 +114,99 @@ def is_ignored_file(self, _):


class DvcIgnoreFilter:
@staticmethod
def _is_dvc_repo(root, directory):
from dvc.repo import Repo

return os.path.isdir(os.path.join(root, directory, Repo.DVC_DIR))

def __init__(self, tree, root_dir):
from dvc.repo import Repo

default_ignore_patterns = [".hg/", ".git/", "{}/".format(Repo.DVC_DIR)]

self.tree = tree
self.root_dir = root_dir
self.ignores = {
DvcIgnoreDirs([".git", ".hg", ".dvc"]),
DvcIgnoreRepo(),
}
ignore_pattern_trie = DvcIgnorePatternsTrie()
self.ignores_trie_tree = StringTrie(separator=os.sep)
self.ignores_trie_tree[root_dir] = DvcIgnorePatterns(
default_ignore_patterns, root_dir
)
for root, dirs, _ in self.tree.walk(self.root_dir):
ignore_pattern = self._get_ignore_pattern(root)
if ignore_pattern:
ignore_pattern_trie[root] = ignore_pattern
self.ignores.add(ignore_pattern_trie)
self._update(root)
self._update_sub_repo(root, dirs)
dirs[:], _ = self(root, dirs, [])

def _get_ignore_pattern(self, dirname):
def _update(self, dirname):
ignore_file_path = os.path.join(dirname, DvcIgnore.DVCIGNORE_FILE)
if self.tree.exists(ignore_file_path):
return DvcIgnorePatterns.from_files(ignore_file_path, self.tree)
return None
new_pattern = DvcIgnorePatterns.from_files(
ignore_file_path, self.tree
)
old_pattern = self._get_trie_pattern(dirname)
if old_pattern:
self.ignores_trie_tree[dirname] = DvcIgnorePatterns(
*merge_patterns(
old_pattern.pattern_list,
old_pattern.dirname,
new_pattern.pattern_list,
new_pattern.dirname,
)
)
else:
self.ignores_trie_tree[dirname] = new_pattern

def _update_sub_repo(self, root, dirs):
for d in dirs:
if self._is_dvc_repo(root, d):
old_pattern = self._get_trie_pattern(root)
if old_pattern:
self.ignores_trie_tree[root] = DvcIgnorePatterns(
*merge_patterns(
old_pattern.pattern_list,
old_pattern.dirname,
["/{}/".format(d)],
root,
)
)
else:
self.ignores_trie_tree[root] = DvcIgnorePatterns(
["/{}/".format(d)], root
)

def __call__(self, root, dirs, files):
for ignore in self.ignores:
dirs, files = ignore(root, dirs, files)
ignore_pattern = self._get_trie_pattern(root)
if ignore_pattern:
return ignore_pattern(root, dirs, files)
else:
return dirs, files

return dirs, files
def _get_trie_pattern(self, dirname):
ignore_pattern = self.ignores_trie_tree.longest_prefix(dirname).value
return ignore_pattern

def is_ignored_dir(self, path):
if not self._parents_exist(path):
def _is_ignored(self, path, is_dir=False):
if self._outside_repo(path):
return True
dirname, basename = os.path.split(os.path.normpath(path))
ignore_pattern = self._get_trie_pattern(dirname)
if ignore_pattern:
return ignore_pattern.matches(dirname, basename, is_dir)
else:
return False

def is_ignored_dir(self, path):
path = os.path.abspath(path)
if path == self.root_dir:
return False
dirname, basename = os.path.split(path)
dirs, _ = self(dirname, [basename], [])
return not dirs

def is_ignored_file(self, path):
if not self._parents_exist(path):
return True

dirname, basename = os.path.split(os.path.normpath(path))
_, files = self(os.path.abspath(dirname), [], [basename])
return not files
return self._is_ignored(path, True)

def _parents_exist(self, path):
from dvc.repo import Repo
def is_ignored_file(self, path):
return self._is_ignored(path, False)

def _outside_repo(self, path):
path = PathInfo(path)

# if parent is root_dir or inside a .dvc dir we can skip this check
if path.parent == self.root_dir or Repo.DVC_DIR in path.parts:
return True

# paths outside of the repo should be ignored
path = relpath(path, self.root_dir)
if path.startswith("..") or (
Expand All @@ -238,16 +215,5 @@ def _parents_exist(self, path):
[os.path.abspath(path), self.root_dir]
)
):
return False

# check if parent directories are in our ignores, starting from
# root_dir
for parent_dir in reversed(PathInfo(path).parents):
dirname, basename = os.path.split(parent_dir)
if basename == ".":
# parent_dir == root_dir
continue
dirs, _ = self(os.path.abspath(dirname), [basename], [])
if not dirs:
return False
return True
return True
return False
14 changes: 8 additions & 6 deletions dvc/pathspec_math.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

from pathspec.util import normalize_file

from dvc.utils import relpath


def _not_ignore(rule):
return (True, rule[1:]) if rule.startswith("!") else (False, rule)
Expand Down Expand Up @@ -53,14 +55,14 @@ def change_rule(rule, rel):
def _change_dirname(dirname, pattern_list, new_dirname):
if new_dirname == dirname:
return pattern_list
rel = os.path.relpath(dirname, new_dirname)
rel = relpath(dirname, new_dirname)
if rel.startswith(".."):
raise ValueError("change dirname can only change to parent path")

return [change_rule(rule, rel) for rule in pattern_list]


def merge_patterns(prefix_a, pattern_a, prefix_b, pattern_b):
def merge_patterns(pattern_a, prefix_a, pattern_b, prefix_b):
"""
Merge two path specification patterns.

Expand All @@ -69,17 +71,17 @@ def merge_patterns(prefix_a, pattern_a, prefix_b, pattern_b):
based on this new base directory.
"""
if not pattern_a:
return prefix_b, pattern_b
return pattern_b, prefix_b
elif not pattern_b:
return prefix_a, pattern_a
return pattern_a, prefix_a

longest_common_dir = os.path.commonpath([prefix_a, prefix_b])
new_pattern_a = _change_dirname(prefix_a, pattern_a, longest_common_dir)
new_pattern_b = _change_dirname(prefix_b, pattern_b, longest_common_dir)

if len(prefix_a) < len(prefix_b):
if len(prefix_a) <= len(prefix_b):
merged_pattern = new_pattern_a + new_pattern_b
else:
merged_pattern = new_pattern_b + new_pattern_a

return longest_common_dir, merged_pattern
return merged_pattern, longest_common_dir
4 changes: 3 additions & 1 deletion dvc/repo/tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,9 @@ def open(
)
return self.repo.tree.open(path, mode=mode, encoding=encoding)

def exists(self, path): # pylint: disable=arguments-differ
def exists(
self, path, use_dvcignore=True
): # pylint: disable=arguments-differ
return self.repo.tree.exists(path) or (
self.dvctree and self.dvctree.exists(path)
)
Expand Down
2 changes: 1 addition & 1 deletion dvc/tree/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def _generate_download_url(self, path_info, expires=3600):
)
return download_url

def exists(self, path_info):
def exists(self, path_info, use_dvcignore=True):
paths = self._list_paths(path_info.bucket, path_info.path)
return any(path_info.path == path for path in paths)

Expand Down
2 changes: 1 addition & 1 deletion dvc/tree/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ def open(self, path_info, mode="r", encoding=None):

raise RemoteActionNotImplemented("open", self.scheme)

def exists(self, path_info):
def exists(self, path_info, use_dvcignore=True):
raise NotImplementedError

# pylint: disable=unused-argument
Expand Down
2 changes: 1 addition & 1 deletion dvc/tree/gdrive.py
Original file line number Diff line number Diff line change
Expand Up @@ -522,7 +522,7 @@ def _get_item_id(self, path_info, create=False, use_cache=True, hint=None):
assert not create
raise FileMissingError(path_info, hint)

def exists(self, path_info):
def exists(self, path_info, use_dvcignore=True):
try:
self._get_item_id(path_info)
except FileMissingError:
Expand Down
6 changes: 5 additions & 1 deletion dvc/tree/git.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,13 @@ def open(
return io.BytesIO(data)
return io.StringIO(data.decode(encoding))

def exists(self, path): # pylint: disable=arguments-differ
def exists(
self, path, use_dvcignore=True
): # pylint: disable=arguments-differ
if self._git_object_by_path(path) is None:
return False
if not use_dvcignore:
return True

return not self.dvcignore.is_ignored_file(
path
Expand Down
2 changes: 1 addition & 1 deletion dvc/tree/gs.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ def _generate_download_url(self, path_info, expires=3600):
)
return signing_credentials.signer.sign(blob)

def exists(self, path_info):
def exists(self, path_info, use_dvcignore=True):
"""Check if the blob exists. If it does not exist,
it could be a part of a directory path.

Expand Down
2 changes: 1 addition & 1 deletion dvc/tree/hdfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def open(self, path_info, mode="r", encoding=None):
raise FileNotFoundError(*e.args)
raise

def exists(self, path_info):
def exists(self, path_info, use_dvcignore=True):
assert not isinstance(path_info, list)
assert path_info.scheme == "hdfs"
with self.hdfs(path_info) as hdfs:
Expand Down
2 changes: 1 addition & 1 deletion dvc/tree/http.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ def request(self, method, url, **kwargs):
except requests.exceptions.RequestException:
raise DvcException(f"could not perform a {method} request")

def exists(self, path_info):
def exists(self, path_info, use_dvcignore=True):
return bool(self.request("HEAD", path_info.url))

def get_file_hash(self, path_info):
Expand Down
4 changes: 3 additions & 1 deletion dvc/tree/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,14 +69,16 @@ def dvcignore(self):
def open(path_info, mode="r", encoding=None):
return open(path_info, mode=mode, encoding=encoding)

def exists(self, path_info):
def exists(self, path_info, use_dvcignore=True):
assert isinstance(path_info, str) or path_info.scheme == "local"
if self.repo:
ret = os.path.lexists(path_info)
else:
ret = os.path.exists(path_info)
if not ret:
return False
if not use_dvcignore:
return True

return not self.dvcignore.is_ignored_file(
path_info
Expand Down
2 changes: 1 addition & 1 deletion dvc/tree/oss.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def _generate_download_url(self, path_info, expires=3600):

return self.oss_service.sign_url("GET", path_info.path, expires)

def exists(self, path_info):
def exists(self, path_info, use_dvcignore=True):
paths = self._list_paths(path_info)
return any(path_info.path == path for path in paths)

Expand Down
2 changes: 1 addition & 1 deletion dvc/tree/s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ def _generate_download_url(self, path_info, expires=3600):
ClientMethod="get_object", Params=params, ExpiresIn=int(expires)
)

def exists(self, path_info):
def exists(self, path_info, use_dvcignore=True):
"""Check if the blob exists. If it does not exist,
it could be a part of a directory path.

Expand Down
Loading