diff --git a/dvc/config.py b/dvc/config.py index d4b3b34eeb..8ed0e56fe0 100644 --- a/dvc/config.py +++ b/dvc/config.py @@ -316,7 +316,7 @@ def _load_config(self, level): filename = self.files[level] tree = self.tree if level == "repo" else self.wtree - if tree.exists(filename): + if tree.exists(filename, use_dvcignore=False): with tree.open(filename) as fobj: conf_obj = configobj.ConfigObj(fobj) else: diff --git a/dvc/ignore.py b/dvc/ignore.py index f72623295e..05463403cf 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -99,67 +99,6 @@ def __bool__(self): return bool(self.pattern_list) -class DvcIgnorePatternsTrie(DvcIgnore): - trie = None - - def __init__(self): - if self.trie is None: - self.trie = StringTrie(separator=os.sep) - - def __call__(self, root, dirs, files): - ignore_pattern = self[root] - if ignore_pattern: - return ignore_pattern(root, dirs, files) - return dirs, files - - def __setitem__(self, root, ignore_pattern): - base_pattern = self[root] - common_dirname, merged_pattern = merge_patterns( - base_pattern.dirname, - base_pattern.pattern_list, - ignore_pattern.dirname, - ignore_pattern.pattern_list, - ) - self.trie[root] = DvcIgnorePatterns(merged_pattern, common_dirname) - - def __getitem__(self, root): - ignore_pattern = self.trie.longest_prefix(root) - if ignore_pattern: - return ignore_pattern.value - return DvcIgnorePatterns([], root) - - -class DvcIgnoreDirs(DvcIgnore): - def __init__(self, basenames): - self.basenames = set(basenames) - - def __call__(self, root, dirs, files): - dirs = [d for d in dirs if d not in self.basenames] - - return dirs, files - - def __hash__(self): - return hash(tuple(self.basenames)) - - def __eq__(self, other): - if not isinstance(other, DvcIgnoreDirs): - return NotImplemented - - return self.basenames == other.basenames - - -class DvcIgnoreRepo(DvcIgnore): - def __call__(self, root, dirs, files): - def is_dvc_repo(directory): - from dvc.repo import Repo - - return os.path.isdir(os.path.join(root, directory, Repo.DVC_DIR)) - - dirs = [d for d in dirs if not is_dvc_repo(d)] - - return dirs, files - - class DvcIgnoreFilterNoop: def __init__(self, tree, root_dir): pass @@ -175,61 +114,99 @@ def is_ignored_file(self, _): class DvcIgnoreFilter: + @staticmethod + def _is_dvc_repo(root, directory): + from dvc.repo import Repo + + return os.path.isdir(os.path.join(root, directory, Repo.DVC_DIR)) + def __init__(self, tree, root_dir): + from dvc.repo import Repo + + default_ignore_patterns = [".hg/", ".git/", "{}/".format(Repo.DVC_DIR)] + self.tree = tree self.root_dir = root_dir - self.ignores = { - DvcIgnoreDirs([".git", ".hg", ".dvc"]), - DvcIgnoreRepo(), - } - ignore_pattern_trie = DvcIgnorePatternsTrie() + self.ignores_trie_tree = StringTrie(separator=os.sep) + self.ignores_trie_tree[root_dir] = DvcIgnorePatterns( + default_ignore_patterns, root_dir + ) for root, dirs, _ in self.tree.walk(self.root_dir): - ignore_pattern = self._get_ignore_pattern(root) - if ignore_pattern: - ignore_pattern_trie[root] = ignore_pattern - self.ignores.add(ignore_pattern_trie) + self._update(root) + self._update_sub_repo(root, dirs) dirs[:], _ = self(root, dirs, []) - def _get_ignore_pattern(self, dirname): + def _update(self, dirname): ignore_file_path = os.path.join(dirname, DvcIgnore.DVCIGNORE_FILE) if self.tree.exists(ignore_file_path): - return DvcIgnorePatterns.from_files(ignore_file_path, self.tree) - return None + new_pattern = DvcIgnorePatterns.from_files( + ignore_file_path, self.tree + ) + old_pattern = self._get_trie_pattern(dirname) + if old_pattern: + self.ignores_trie_tree[dirname] = DvcIgnorePatterns( + *merge_patterns( + old_pattern.pattern_list, + old_pattern.dirname, + new_pattern.pattern_list, + new_pattern.dirname, + ) + ) + else: + self.ignores_trie_tree[dirname] = new_pattern + + def _update_sub_repo(self, root, dirs): + for d in dirs: + if self._is_dvc_repo(root, d): + old_pattern = self._get_trie_pattern(root) + if old_pattern: + self.ignores_trie_tree[root] = DvcIgnorePatterns( + *merge_patterns( + old_pattern.pattern_list, + old_pattern.dirname, + ["/{}/".format(d)], + root, + ) + ) + else: + self.ignores_trie_tree[root] = DvcIgnorePatterns( + ["/{}/".format(d)], root + ) def __call__(self, root, dirs, files): - for ignore in self.ignores: - dirs, files = ignore(root, dirs, files) + ignore_pattern = self._get_trie_pattern(root) + if ignore_pattern: + return ignore_pattern(root, dirs, files) + else: + return dirs, files - return dirs, files + def _get_trie_pattern(self, dirname): + ignore_pattern = self.ignores_trie_tree.longest_prefix(dirname).value + return ignore_pattern - def is_ignored_dir(self, path): - if not self._parents_exist(path): + def _is_ignored(self, path, is_dir=False): + if self._outside_repo(path): return True + dirname, basename = os.path.split(os.path.normpath(path)) + ignore_pattern = self._get_trie_pattern(dirname) + if ignore_pattern: + return ignore_pattern.matches(dirname, basename, is_dir) + else: + return False + def is_ignored_dir(self, path): path = os.path.abspath(path) if path == self.root_dir: return False - dirname, basename = os.path.split(path) - dirs, _ = self(dirname, [basename], []) - return not dirs - - def is_ignored_file(self, path): - if not self._parents_exist(path): - return True - dirname, basename = os.path.split(os.path.normpath(path)) - _, files = self(os.path.abspath(dirname), [], [basename]) - return not files + return self._is_ignored(path, True) - def _parents_exist(self, path): - from dvc.repo import Repo + def is_ignored_file(self, path): + return self._is_ignored(path, False) + def _outside_repo(self, path): path = PathInfo(path) - # if parent is root_dir or inside a .dvc dir we can skip this check - if path.parent == self.root_dir or Repo.DVC_DIR in path.parts: - return True - # paths outside of the repo should be ignored path = relpath(path, self.root_dir) if path.startswith("..") or ( @@ -238,16 +215,5 @@ def _parents_exist(self, path): [os.path.abspath(path), self.root_dir] ) ): - return False - - # check if parent directories are in our ignores, starting from - # root_dir - for parent_dir in reversed(PathInfo(path).parents): - dirname, basename = os.path.split(parent_dir) - if basename == ".": - # parent_dir == root_dir - continue - dirs, _ = self(os.path.abspath(dirname), [basename], []) - if not dirs: - return False - return True + return True + return False diff --git a/dvc/pathspec_math.py b/dvc/pathspec_math.py index 7377fe4bc4..5af503cd2d 100644 --- a/dvc/pathspec_math.py +++ b/dvc/pathspec_math.py @@ -6,6 +6,8 @@ from pathspec.util import normalize_file +from dvc.utils import relpath + def _not_ignore(rule): return (True, rule[1:]) if rule.startswith("!") else (False, rule) @@ -53,14 +55,14 @@ def change_rule(rule, rel): def _change_dirname(dirname, pattern_list, new_dirname): if new_dirname == dirname: return pattern_list - rel = os.path.relpath(dirname, new_dirname) + rel = relpath(dirname, new_dirname) if rel.startswith(".."): raise ValueError("change dirname can only change to parent path") return [change_rule(rule, rel) for rule in pattern_list] -def merge_patterns(prefix_a, pattern_a, prefix_b, pattern_b): +def merge_patterns(pattern_a, prefix_a, pattern_b, prefix_b): """ Merge two path specification patterns. @@ -69,17 +71,17 @@ def merge_patterns(prefix_a, pattern_a, prefix_b, pattern_b): based on this new base directory. """ if not pattern_a: - return prefix_b, pattern_b + return pattern_b, prefix_b elif not pattern_b: - return prefix_a, pattern_a + return pattern_a, prefix_a longest_common_dir = os.path.commonpath([prefix_a, prefix_b]) new_pattern_a = _change_dirname(prefix_a, pattern_a, longest_common_dir) new_pattern_b = _change_dirname(prefix_b, pattern_b, longest_common_dir) - if len(prefix_a) < len(prefix_b): + if len(prefix_a) <= len(prefix_b): merged_pattern = new_pattern_a + new_pattern_b else: merged_pattern = new_pattern_b + new_pattern_a - return longest_common_dir, merged_pattern + return merged_pattern, longest_common_dir diff --git a/dvc/repo/tree.py b/dvc/repo/tree.py index 51ceacfbb9..2b7810c29d 100644 --- a/dvc/repo/tree.py +++ b/dvc/repo/tree.py @@ -277,7 +277,9 @@ def open( ) return self.repo.tree.open(path, mode=mode, encoding=encoding) - def exists(self, path): # pylint: disable=arguments-differ + def exists( + self, path, use_dvcignore=True + ): # pylint: disable=arguments-differ return self.repo.tree.exists(path) or ( self.dvctree and self.dvctree.exists(path) ) diff --git a/dvc/tree/azure.py b/dvc/tree/azure.py index 799ffa35a9..19f9ecdcb9 100644 --- a/dvc/tree/azure.py +++ b/dvc/tree/azure.py @@ -101,7 +101,7 @@ def _generate_download_url(self, path_info, expires=3600): ) return download_url - def exists(self, path_info): + def exists(self, path_info, use_dvcignore=True): paths = self._list_paths(path_info.bucket, path_info.path) return any(path_info.path == path for path in paths) diff --git a/dvc/tree/base.py b/dvc/tree/base.py index 6f7db8debc..700513b19f 100644 --- a/dvc/tree/base.py +++ b/dvc/tree/base.py @@ -156,7 +156,7 @@ def open(self, path_info, mode="r", encoding=None): raise RemoteActionNotImplemented("open", self.scheme) - def exists(self, path_info): + def exists(self, path_info, use_dvcignore=True): raise NotImplementedError # pylint: disable=unused-argument diff --git a/dvc/tree/gdrive.py b/dvc/tree/gdrive.py index d5c7a67c88..5e34fa4551 100644 --- a/dvc/tree/gdrive.py +++ b/dvc/tree/gdrive.py @@ -522,7 +522,7 @@ def _get_item_id(self, path_info, create=False, use_cache=True, hint=None): assert not create raise FileMissingError(path_info, hint) - def exists(self, path_info): + def exists(self, path_info, use_dvcignore=True): try: self._get_item_id(path_info) except FileMissingError: diff --git a/dvc/tree/git.py b/dvc/tree/git.py index a500fa2fb4..ff5be8209f 100644 --- a/dvc/tree/git.py +++ b/dvc/tree/git.py @@ -76,9 +76,13 @@ def open( return io.BytesIO(data) return io.StringIO(data.decode(encoding)) - def exists(self, path): # pylint: disable=arguments-differ + def exists( + self, path, use_dvcignore=True + ): # pylint: disable=arguments-differ if self._git_object_by_path(path) is None: return False + if not use_dvcignore: + return True return not self.dvcignore.is_ignored_file( path diff --git a/dvc/tree/gs.py b/dvc/tree/gs.py index c8372ae0ec..767154de35 100644 --- a/dvc/tree/gs.py +++ b/dvc/tree/gs.py @@ -119,7 +119,7 @@ def _generate_download_url(self, path_info, expires=3600): ) return signing_credentials.signer.sign(blob) - def exists(self, path_info): + def exists(self, path_info, use_dvcignore=True): """Check if the blob exists. If it does not exist, it could be a part of a directory path. diff --git a/dvc/tree/hdfs.py b/dvc/tree/hdfs.py index 55b9909416..cdc9a2c339 100644 --- a/dvc/tree/hdfs.py +++ b/dvc/tree/hdfs.py @@ -72,7 +72,7 @@ def open(self, path_info, mode="r", encoding=None): raise FileNotFoundError(*e.args) raise - def exists(self, path_info): + def exists(self, path_info, use_dvcignore=True): assert not isinstance(path_info, list) assert path_info.scheme == "hdfs" with self.hdfs(path_info) as hdfs: diff --git a/dvc/tree/http.py b/dvc/tree/http.py index a683ec5ea5..abed95ca4f 100644 --- a/dvc/tree/http.py +++ b/dvc/tree/http.py @@ -122,7 +122,7 @@ def request(self, method, url, **kwargs): except requests.exceptions.RequestException: raise DvcException(f"could not perform a {method} request") - def exists(self, path_info): + def exists(self, path_info, use_dvcignore=True): return bool(self.request("HEAD", path_info.url)) def get_file_hash(self, path_info): diff --git a/dvc/tree/local.py b/dvc/tree/local.py index 61af21e01e..24361f4c8d 100644 --- a/dvc/tree/local.py +++ b/dvc/tree/local.py @@ -69,7 +69,7 @@ def dvcignore(self): def open(path_info, mode="r", encoding=None): return open(path_info, mode=mode, encoding=encoding) - def exists(self, path_info): + def exists(self, path_info, use_dvcignore=True): assert isinstance(path_info, str) or path_info.scheme == "local" if self.repo: ret = os.path.lexists(path_info) @@ -77,6 +77,8 @@ def exists(self, path_info): ret = os.path.exists(path_info) if not ret: return False + if not use_dvcignore: + return True return not self.dvcignore.is_ignored_file( path_info diff --git a/dvc/tree/oss.py b/dvc/tree/oss.py index 634ad0d8d4..3e0a3d4775 100644 --- a/dvc/tree/oss.py +++ b/dvc/tree/oss.py @@ -88,7 +88,7 @@ def _generate_download_url(self, path_info, expires=3600): return self.oss_service.sign_url("GET", path_info.path, expires) - def exists(self, path_info): + def exists(self, path_info, use_dvcignore=True): paths = self._list_paths(path_info) return any(path_info.path == path for path in paths) diff --git a/dvc/tree/s3.py b/dvc/tree/s3.py index d07a5389a1..44368d7461 100644 --- a/dvc/tree/s3.py +++ b/dvc/tree/s3.py @@ -131,7 +131,7 @@ def _generate_download_url(self, path_info, expires=3600): ClientMethod="get_object", Params=params, ExpiresIn=int(expires) ) - def exists(self, path_info): + def exists(self, path_info, use_dvcignore=True): """Check if the blob exists. If it does not exist, it could be a part of a directory path. diff --git a/dvc/tree/ssh/__init__.py b/dvc/tree/ssh/__init__.py index 916e5a6521..26bee19388 100644 --- a/dvc/tree/ssh/__init__.py +++ b/dvc/tree/ssh/__init__.py @@ -156,7 +156,7 @@ def open(self, path_info, mode="r", encoding=None): else: yield io.TextIOWrapper(fd, encoding=encoding) - def exists(self, path_info): + def exists(self, path_info, use_dvcignore=True): with self.ssh(path_info) as ssh: return ssh.exists(path_info.path) diff --git a/tests/func/test_ignore.py b/tests/func/test_ignore.py index 6becd800ac..16afb978af 100644 --- a/tests/func/test_ignore.py +++ b/tests/func/test_ignore.py @@ -4,16 +4,10 @@ import pytest from dvc.exceptions import DvcIgnoreInCollectedDirError -from dvc.ignore import ( - DvcIgnore, - DvcIgnoreDirs, - DvcIgnorePatterns, - DvcIgnorePatternsTrie, - DvcIgnoreRepo, -) +from dvc.ignore import DvcIgnore, DvcIgnorePatterns from dvc.path_info import PathInfo +from dvc.pathspec_math import merge_patterns from dvc.repo import Repo -from dvc.tree.local import LocalTree from dvc.utils import relpath from dvc.utils.fs import get_mtime_and_size from tests.dir_helpers import TmpDir @@ -103,23 +97,23 @@ def test_ignore_collecting_dvcignores(tmp_dir, dvc, dname): ignore_file = tmp_dir / dname / DvcIgnore.DVCIGNORE_FILE ignore_file.write_text("foo") - assert len(dvc.tree.dvcignore.ignores) == 3 - assert DvcIgnoreDirs([".git", ".hg", ".dvc"]) in dvc.tree.dvcignore.ignores - ignore_pattern_trie = None - for ignore in dvc.tree.dvcignore.ignores: - if isinstance(ignore, DvcIgnorePatternsTrie): - ignore_pattern_trie = ignore + dvcignore = dvc.tree.dvcignore + + top_ignore_path = os.path.dirname(os.fspath(top_ignore_file)) + + sub_dir_path = os.path.dirname(os.fspath(ignore_file)) - assert ignore_pattern_trie is not None assert ( - DvcIgnorePatterns.from_files( - os.fspath(top_ignore_file), LocalTree(None, {"url": dvc.root_dir}), + DvcIgnorePatterns( + *merge_patterns( + [".hg/", ".git/", ".dvc/"], + os.fspath(tmp_dir), + [os.path.basename(dname)], + top_ignore_path, + ) ) - == ignore_pattern_trie[os.fspath(ignore_file)] - ) - - assert any( - i for i in dvc.tree.dvcignore.ignores if isinstance(i, DvcIgnoreRepo) + == dvcignore._get_trie_pattern(top_ignore_path) + == dvcignore._get_trie_pattern(sub_dir_path) ) @@ -294,73 +288,50 @@ def test_pattern_trie_tree(tmp_dir, dvc): } ) dvc.tree.__dict__.pop("dvcignore", None) - ignore_pattern_trie = None - for ignore in dvc.tree.dvcignore.ignores: - if isinstance(ignore, DvcIgnorePatternsTrie): - ignore_pattern_trie = ignore - break - - assert ignore_pattern_trie is not None - ignore_pattern_top = ignore_pattern_trie[os.fspath(tmp_dir / "top")] - ignore_pattern_other = ignore_pattern_trie[os.fspath(tmp_dir / "other")] - ignore_pattern_first = ignore_pattern_trie[ + dvcignore = dvc.tree.dvcignore + + ignore_pattern_top = dvcignore._get_trie_pattern( + os.fspath(tmp_dir / "top") + ) + ignore_pattern_other = dvcignore._get_trie_pattern( + os.fspath(tmp_dir / "other") + ) + ignore_pattern_first = dvcignore._get_trie_pattern( os.fspath(tmp_dir / "top" / "first") - ] - ignore_pattern_middle = ignore_pattern_trie[ + ) + ignore_pattern_middle = dvcignore._get_trie_pattern( os.fspath(tmp_dir / "top" / "first" / "middle") - ] - ignore_pattern_second = ignore_pattern_trie[ + ) + ignore_pattern_second = dvcignore._get_trie_pattern( os.fspath(tmp_dir / "top" / "first" / "middle" / "second") - ] - ignore_pattern_bottom = ignore_pattern_trie[ + ) + ignore_pattern_bottom = dvcignore._get_trie_pattern( os.fspath(tmp_dir / "top" / "first" / "middle" / "second" / "bottom") - ] - assert not ignore_pattern_top - assert ( - DvcIgnorePatterns([], os.fspath(tmp_dir / "top")) == ignore_pattern_top ) - assert ( - DvcIgnorePatterns(["1", "2", "3"], os.fspath(tmp_dir / "other")) - == ignore_pattern_other + + base_pattern = [".hg/", ".git/", ".dvc/"], os.fspath(tmp_dir) + first_pattern = merge_patterns( + *base_pattern, ["a", "b", "c"], os.fspath(tmp_dir / "top" / "first") ) - assert ( - DvcIgnorePatterns( - ["a", "b", "c"], os.fspath(tmp_dir / "top" / "first") - ) - == ignore_pattern_first + second_pattern = merge_patterns( + *first_pattern, + ["d", "e", "f"], + os.fspath(tmp_dir / "top" / "first" / "middle" / "second") ) + other_pattern = merge_patterns( + *base_pattern, ["1", "2", "3"], os.fspath(tmp_dir / "other") + ) + + assert DvcIgnorePatterns(*base_pattern) == ignore_pattern_top + assert DvcIgnorePatterns(*other_pattern) == ignore_pattern_other assert ( - DvcIgnorePatterns( - ["a", "b", "c"], os.fspath(tmp_dir / "top" / "first") - ) + DvcIgnorePatterns(*first_pattern) + == ignore_pattern_first == ignore_pattern_middle ) assert ( - DvcIgnorePatterns( - [ - "a", - "b", - "c", - "/middle/second/**/d", - "/middle/second/**/e", - "/middle/second/**/f", - ], - os.fspath(tmp_dir / "top" / "first"), - ) + DvcIgnorePatterns(*second_pattern) == ignore_pattern_second - ) - assert ( - DvcIgnorePatterns( - [ - "a", - "b", - "c", - "/middle/second/**/d", - "/middle/second/**/e", - "/middle/second/**/f", - ], - os.fspath(tmp_dir / "top" / "first"), - ) == ignore_pattern_bottom ) diff --git a/tests/unit/test_ignore.py b/tests/unit/test_ignore.py index 0dcde9c910..3bc0344e79 100644 --- a/tests/unit/test_ignore.py +++ b/tests/unit/test_ignore.py @@ -3,7 +3,7 @@ import pytest from mock import MagicMock, mock_open, patch -from dvc.ignore import DvcIgnoreDirs, DvcIgnorePatterns +from dvc.ignore import DvcIgnorePatterns def mock_dvcignore(dvcignore_path, patterns): @@ -85,6 +85,14 @@ def mock_dvcignore(dvcignore_path, patterns): ("to_ignore.txt", ["/*.txt"], True), (os.path.join("path", "to_ignore.txt"), ["/*.txt"], False), (os.path.join("data", "file.txt"), ["data/*"], True), + (os.path.join("data", "subdir", "file.txt"), ["data/*"], False), + (os.path.join("data", "file.txt"), ["data/"], True), + (os.path.join("data", "subdir", "file.txt"), ["data/"], True), + (os.path.join("data", "subdir", "file.txt"), ["subdir/"], True), + (os.path.join("data", "subdir", "file.txt"), ["/subdir/"], False), + (os.path.join("data", "path"), ["path/"], False), + (os.path.join(".git", "file.txt"), [".git/"], True), + (os.path.join("data", ".dvc", "file.txt"), [".dvc/"], True), # wait for Git # (os.path.join("data", "sub", "file.txt"), ["data/*"], True), ( @@ -181,14 +189,21 @@ def test_match_ignore_from_file( ) +@pytest.mark.parametrize("sub_dir", ["", "dir"]) @pytest.mark.parametrize("omit_dir", [".git", ".hg", ".dvc"]) -def test_should_ignore_dir(omit_dir): - ignore = DvcIgnoreDirs([".git", ".hg", ".dvc"]) - +def test_should_ignore_dir(omit_dir, sub_dir): root = os.path.join(os.path.sep, "walk", "dir", "root") + ignore = DvcIgnorePatterns([".git/", ".hg/", ".dvc/"], root) + dirs = [omit_dir, "dir1", "dir2"] - files = [] + files = [omit_dir, "file1", "file2"] + + if sub_dir: + current = os.path.join(root, sub_dir) + else: + current = root - new_dirs, _ = ignore(root, dirs, files) + new_dirs, new_files = ignore(current, dirs, files) assert set(new_dirs) == {"dir1", "dir2"} + assert set(new_files) == {"file1", "file2", omit_dir}