From 63ad69aa4d5b6324adb7b27fff0abfd47c65faa3 Mon Sep 17 00:00:00 2001 From: karajan1001 Date: Sat, 18 Jul 2020 22:51:00 +0800 Subject: [PATCH 1/8] Update 1. unit test update 2. two might cause bugs --- dvc/pathspec_math.py | 6 ++++-- tests/unit/test_ignore.py | 22 ++++++++++++++++------ 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/dvc/pathspec_math.py b/dvc/pathspec_math.py index 7377fe4bc4..5c15967453 100644 --- a/dvc/pathspec_math.py +++ b/dvc/pathspec_math.py @@ -6,6 +6,8 @@ from pathspec.util import normalize_file +from dvc.utils import relpath + def _not_ignore(rule): return (True, rule[1:]) if rule.startswith("!") else (False, rule) @@ -53,7 +55,7 @@ def change_rule(rule, rel): def _change_dirname(dirname, pattern_list, new_dirname): if new_dirname == dirname: return pattern_list - rel = os.path.relpath(dirname, new_dirname) + rel = relpath(dirname, new_dirname) if rel.startswith(".."): raise ValueError("change dirname can only change to parent path") @@ -77,7 +79,7 @@ def merge_patterns(prefix_a, pattern_a, prefix_b, pattern_b): new_pattern_a = _change_dirname(prefix_a, pattern_a, longest_common_dir) new_pattern_b = _change_dirname(prefix_b, pattern_b, longest_common_dir) - if len(prefix_a) < len(prefix_b): + if len(prefix_a) <= len(prefix_b): merged_pattern = new_pattern_a + new_pattern_b else: merged_pattern = new_pattern_b + new_pattern_a diff --git a/tests/unit/test_ignore.py b/tests/unit/test_ignore.py index 0dcde9c910..147ae80d57 100644 --- a/tests/unit/test_ignore.py +++ b/tests/unit/test_ignore.py @@ -3,7 +3,7 @@ import pytest from mock import MagicMock, mock_open, patch -from dvc.ignore import DvcIgnoreDirs, DvcIgnorePatterns +from dvc.ignore import DvcIgnorePatterns def mock_dvcignore(dvcignore_path, patterns): @@ -85,6 +85,9 @@ def mock_dvcignore(dvcignore_path, patterns): ("to_ignore.txt", ["/*.txt"], True), (os.path.join("path", "to_ignore.txt"), ["/*.txt"], False), (os.path.join("data", "file.txt"), ["data/*"], True), + (os.path.join("data", "path"), ["path/"], False), + (os.path.join(".git", "file.txt"), [".git/"], True), + (os.path.join("data", ".dvc", "file.txt"), [".dvc/"], True), # wait for Git # (os.path.join("data", "sub", "file.txt"), ["data/*"], True), ( @@ -181,14 +184,21 @@ def test_match_ignore_from_file( ) +@pytest.mark.parametrize("sub_dir", ["", "dir"]) @pytest.mark.parametrize("omit_dir", [".git", ".hg", ".dvc"]) -def test_should_ignore_dir(omit_dir): - ignore = DvcIgnoreDirs([".git", ".hg", ".dvc"]) - +def test_should_ignore_dir(omit_dir, sub_dir): root = os.path.join(os.path.sep, "walk", "dir", "root") + ignore = DvcIgnorePatterns([".git/", ".hg/", ".dvc/"], root) + dirs = [omit_dir, "dir1", "dir2"] - files = [] + files = [omit_dir, "file1", "file2"] + + if sub_dir: + current = os.path.join(root, sub_dir) + else: + current = root - new_dirs, _ = ignore(root, dirs, files) + new_dirs, new_files = ignore(current, dirs, files) assert set(new_dirs) == {"dir1", "dir2"} + assert set(new_files) == {"file1", "file2", omit_dir} From 24bad492f6c628d47c1214650814a3539443cc99 Mon Sep 17 00:00:00 2001 From: karajan1001 Date: Sun, 19 Jul 2020 22:03:46 +0800 Subject: [PATCH 2/8] dvc ignore --- dvc/ignore.py | 111 ++++++++++---------------------------- dvc/pathspec_math.py | 8 +-- dvc/repo/__init__.py | 2 +- tests/func/test_ignore.py | 100 ++++++++++++---------------------- 4 files changed, 66 insertions(+), 155 deletions(-) diff --git a/dvc/ignore.py b/dvc/ignore.py index f72623295e..5e68266493 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -100,11 +100,9 @@ def __bool__(self): class DvcIgnorePatternsTrie(DvcIgnore): - trie = None - - def __init__(self): - if self.trie is None: - self.trie = StringTrie(separator=os.sep) + def __init__(self, default_ignore_pattern, root): + self.trie = StringTrie(separator=os.sep) + self.trie[root] = DvcIgnorePatterns(default_ignore_pattern, root) def __call__(self, root, dirs, files): ignore_pattern = self[root] @@ -114,50 +112,18 @@ def __call__(self, root, dirs, files): def __setitem__(self, root, ignore_pattern): base_pattern = self[root] - common_dirname, merged_pattern = merge_patterns( - base_pattern.dirname, - base_pattern.pattern_list, - ignore_pattern.dirname, - ignore_pattern.pattern_list, + self.trie[root] = DvcIgnorePatterns( + *merge_patterns( + base_pattern.pattern_list, + base_pattern.dirname, + ignore_pattern.pattern_list, + ignore_pattern.dirname, + ) ) - self.trie[root] = DvcIgnorePatterns(merged_pattern, common_dirname) def __getitem__(self, root): ignore_pattern = self.trie.longest_prefix(root) - if ignore_pattern: - return ignore_pattern.value - return DvcIgnorePatterns([], root) - - -class DvcIgnoreDirs(DvcIgnore): - def __init__(self, basenames): - self.basenames = set(basenames) - - def __call__(self, root, dirs, files): - dirs = [d for d in dirs if d not in self.basenames] - - return dirs, files - - def __hash__(self): - return hash(tuple(self.basenames)) - - def __eq__(self, other): - if not isinstance(other, DvcIgnoreDirs): - return NotImplemented - - return self.basenames == other.basenames - - -class DvcIgnoreRepo(DvcIgnore): - def __call__(self, root, dirs, files): - def is_dvc_repo(directory): - from dvc.repo import Repo - - return os.path.isdir(os.path.join(root, directory, Repo.DVC_DIR)) - - dirs = [d for d in dirs if not is_dvc_repo(d)] - - return dirs, files + return ignore_pattern.value class DvcIgnoreFilterNoop: @@ -176,34 +142,30 @@ def is_ignored_file(self, _): class DvcIgnoreFilter: def __init__(self, tree, root_dir): + from dvc.repo import Repo + self.tree = tree self.root_dir = root_dir - self.ignores = { - DvcIgnoreDirs([".git", ".hg", ".dvc"]), - DvcIgnoreRepo(), - } - ignore_pattern_trie = DvcIgnorePatternsTrie() + default_ignore_pattern = [".hg/", ".git/", "{}/".format(Repo.DVC_DIR)] + self.ignores_trie_tree = DvcIgnorePatternsTrie( + default_ignore_pattern, root_dir + ) for root, dirs, _ in self.tree.walk(self.root_dir): - ignore_pattern = self._get_ignore_pattern(root) - if ignore_pattern: - ignore_pattern_trie[root] = ignore_pattern - self.ignores.add(ignore_pattern_trie) + self._update(root) dirs[:], _ = self(root, dirs, []) - def _get_ignore_pattern(self, dirname): + def _update(self, dirname): ignore_file_path = os.path.join(dirname, DvcIgnore.DVCIGNORE_FILE) if self.tree.exists(ignore_file_path): - return DvcIgnorePatterns.from_files(ignore_file_path, self.tree) - return None + self.ignores_trie_tree[dirname] = DvcIgnorePatterns.from_files( + ignore_file_path, self.tree + ) def __call__(self, root, dirs, files): - for ignore in self.ignores: - dirs, files = ignore(root, dirs, files) - - return dirs, files + return self.ignores_trie_tree(root, dirs, files) def is_ignored_dir(self, path): - if not self._parents_exist(path): + if self._outside_repo(path): return True path = os.path.abspath(path) @@ -214,22 +176,16 @@ def is_ignored_dir(self, path): return not dirs def is_ignored_file(self, path): - if not self._parents_exist(path): + if self._outside_repo(path): return True dirname, basename = os.path.split(os.path.normpath(path)) _, files = self(os.path.abspath(dirname), [], [basename]) return not files - def _parents_exist(self, path): - from dvc.repo import Repo - + def _outside_repo(self, path): path = PathInfo(path) - # if parent is root_dir or inside a .dvc dir we can skip this check - if path.parent == self.root_dir or Repo.DVC_DIR in path.parts: - return True - # paths outside of the repo should be ignored path = relpath(path, self.root_dir) if path.startswith("..") or ( @@ -238,16 +194,5 @@ def _parents_exist(self, path): [os.path.abspath(path), self.root_dir] ) ): - return False - - # check if parent directories are in our ignores, starting from - # root_dir - for parent_dir in reversed(PathInfo(path).parents): - dirname, basename = os.path.split(parent_dir) - if basename == ".": - # parent_dir == root_dir - continue - dirs, _ = self(os.path.abspath(dirname), [basename], []) - if not dirs: - return False - return True + return True + return False diff --git a/dvc/pathspec_math.py b/dvc/pathspec_math.py index 5c15967453..5af503cd2d 100644 --- a/dvc/pathspec_math.py +++ b/dvc/pathspec_math.py @@ -62,7 +62,7 @@ def _change_dirname(dirname, pattern_list, new_dirname): return [change_rule(rule, rel) for rule in pattern_list] -def merge_patterns(prefix_a, pattern_a, prefix_b, pattern_b): +def merge_patterns(pattern_a, prefix_a, pattern_b, prefix_b): """ Merge two path specification patterns. @@ -71,9 +71,9 @@ def merge_patterns(prefix_a, pattern_a, prefix_b, pattern_b): based on this new base directory. """ if not pattern_a: - return prefix_b, pattern_b + return pattern_b, prefix_b elif not pattern_b: - return prefix_a, pattern_a + return pattern_a, prefix_a longest_common_dir = os.path.commonpath([prefix_a, prefix_b]) new_pattern_a = _change_dirname(prefix_a, pattern_a, longest_common_dir) @@ -84,4 +84,4 @@ def merge_patterns(prefix_a, pattern_a, prefix_b, pattern_b): else: merged_pattern = new_pattern_b + new_pattern_a - return longest_common_dir, merged_pattern + return merged_pattern, longest_common_dir diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index d26cf4c68c..a8ff867b23 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -99,7 +99,7 @@ def __init__(self, root_dir=None, scm=None, rev=None): ) self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR) - self.config = Config(self.dvc_dir, tree=self.tree) + self.config = Config(self.dvc_dir) if not scm: no_scm = self.config["core"].get("no_scm", False) diff --git a/tests/func/test_ignore.py b/tests/func/test_ignore.py index 41533ac389..34d263771d 100644 --- a/tests/func/test_ignore.py +++ b/tests/func/test_ignore.py @@ -4,16 +4,10 @@ import pytest from dvc.exceptions import DvcIgnoreInCollectedDirError -from dvc.ignore import ( - DvcIgnore, - DvcIgnoreDirs, - DvcIgnorePatterns, - DvcIgnorePatternsTrie, - DvcIgnoreRepo, -) +from dvc.ignore import DvcIgnore, DvcIgnorePatterns from dvc.path_info import PathInfo +from dvc.pathspec_math import merge_patterns from dvc.repo import Repo -from dvc.tree.local import LocalRemoteTree from dvc.utils import relpath from dvc.utils.fs import get_mtime_and_size from tests.dir_helpers import TmpDir @@ -103,24 +97,23 @@ def test_ignore_collecting_dvcignores(tmp_dir, dvc, dname): ignore_file = tmp_dir / dname / DvcIgnore.DVCIGNORE_FILE ignore_file.write_text("foo") - assert len(dvc.tree.dvcignore.ignores) == 3 - assert DvcIgnoreDirs([".git", ".hg", ".dvc"]) in dvc.tree.dvcignore.ignores - ignore_pattern_trie = None - for ignore in dvc.tree.dvcignore.ignores: - if isinstance(ignore, DvcIgnorePatternsTrie): - ignore_pattern_trie = ignore + ignore_pattern_trie = dvc.tree.dvcignore.ignores_trie_tree + + top_ignore_path = os.path.dirname(os.fspath(top_ignore_file)) + + sub_dir_path = os.path.dirname(os.fspath(ignore_file)) - assert ignore_pattern_trie is not None assert ( - DvcIgnorePatterns.from_files( - os.fspath(top_ignore_file), - LocalRemoteTree(None, {"url": dvc.root_dir}), + DvcIgnorePatterns( + *merge_patterns( + [".hg/", ".git/", ".dvc/"], + os.fspath(tmp_dir), + [os.path.basename(dname)], + top_ignore_path, + ) ) - == ignore_pattern_trie[os.fspath(ignore_file)] - ) - - assert any( - i for i in dvc.tree.dvcignore.ignores if isinstance(i, DvcIgnoreRepo) + == ignore_pattern_trie[top_ignore_path] + == ignore_pattern_trie[sub_dir_path] ) @@ -295,13 +288,8 @@ def test_pattern_trie_tree(tmp_dir, dvc): } ) dvc.tree.__dict__.pop("dvcignore", None) - ignore_pattern_trie = None - for ignore in dvc.tree.dvcignore.ignores: - if isinstance(ignore, DvcIgnorePatternsTrie): - ignore_pattern_trie = ignore - break + ignore_pattern_trie = dvc.tree.dvcignore.ignores_trie_tree - assert ignore_pattern_trie is not None ignore_pattern_top = ignore_pattern_trie[os.fspath(tmp_dir / "top")] ignore_pattern_other = ignore_pattern_trie[os.fspath(tmp_dir / "other")] ignore_pattern_first = ignore_pattern_trie[ @@ -316,52 +304,30 @@ def test_pattern_trie_tree(tmp_dir, dvc): ignore_pattern_bottom = ignore_pattern_trie[ os.fspath(tmp_dir / "top" / "first" / "middle" / "second" / "bottom") ] - assert not ignore_pattern_top - assert ( - DvcIgnorePatterns([], os.fspath(tmp_dir / "top")) == ignore_pattern_top + + base_pattern = [".hg/", ".git/", ".dvc/"], os.fspath(tmp_dir) + first_pattern = merge_patterns( + *base_pattern, ["a", "b", "c"], os.fspath(tmp_dir / "top" / "first") ) - assert ( - DvcIgnorePatterns(["1", "2", "3"], os.fspath(tmp_dir / "other")) - == ignore_pattern_other + second_pattern = merge_patterns( + *first_pattern, + ["d", "e", "f"], + os.fspath(tmp_dir / "top" / "first" / "middle" / "second") ) - assert ( - DvcIgnorePatterns( - ["a", "b", "c"], os.fspath(tmp_dir / "top" / "first") - ) - == ignore_pattern_first + other_pattern = merge_patterns( + *base_pattern, ["1", "2", "3"], os.fspath(tmp_dir / "other") ) + + assert DvcIgnorePatterns(*base_pattern) == ignore_pattern_top + assert DvcIgnorePatterns(*other_pattern) == ignore_pattern_other assert ( - DvcIgnorePatterns( - ["a", "b", "c"], os.fspath(tmp_dir / "top" / "first") - ) + DvcIgnorePatterns(*first_pattern) + == ignore_pattern_first == ignore_pattern_middle ) assert ( - DvcIgnorePatterns( - [ - "a", - "b", - "c", - "/middle/second/**/d", - "/middle/second/**/e", - "/middle/second/**/f", - ], - os.fspath(tmp_dir / "top" / "first"), - ) + DvcIgnorePatterns(*second_pattern) == ignore_pattern_second - ) - assert ( - DvcIgnorePatterns( - [ - "a", - "b", - "c", - "/middle/second/**/d", - "/middle/second/**/e", - "/middle/second/**/f", - ], - os.fspath(tmp_dir / "top" / "first"), - ) == ignore_pattern_bottom ) From 8740e2cb87c6ab0a0a1e86934e5c2df34bb18c5f Mon Sep 17 00:00:00 2001 From: karajan1001 Date: Sun, 19 Jul 2020 22:35:32 +0800 Subject: [PATCH 3/8] Some refactoring --- dvc/ignore.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/dvc/ignore.py b/dvc/ignore.py index 5e68266493..2b23171d97 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -111,15 +111,7 @@ def __call__(self, root, dirs, files): return dirs, files def __setitem__(self, root, ignore_pattern): - base_pattern = self[root] - self.trie[root] = DvcIgnorePatterns( - *merge_patterns( - base_pattern.pattern_list, - base_pattern.dirname, - ignore_pattern.pattern_list, - ignore_pattern.dirname, - ) - ) + self.trie[root] = ignore_pattern def __getitem__(self, root): ignore_pattern = self.trie.longest_prefix(root) @@ -157,9 +149,18 @@ def __init__(self, tree, root_dir): def _update(self, dirname): ignore_file_path = os.path.join(dirname, DvcIgnore.DVCIGNORE_FILE) if self.tree.exists(ignore_file_path): - self.ignores_trie_tree[dirname] = DvcIgnorePatterns.from_files( + new_pattern = DvcIgnorePatterns.from_files( ignore_file_path, self.tree ) + old_pattern = self.ignores_trie_tree[dirname] + self.ignores_trie_tree[dirname] = DvcIgnorePatterns( + *merge_patterns( + old_pattern.pattern_list, + old_pattern.dirname, + new_pattern.pattern_list, + new_pattern.dirname, + ) + ) def __call__(self, root, dirs, files): return self.ignores_trie_tree(root, dirs, files) From e92ee9e473b366419c6f270c6d2d33a806a3f99b Mon Sep 17 00:00:00 2001 From: karajan1001 Date: Mon, 20 Jul 2020 17:38:29 +0800 Subject: [PATCH 4/8] clean --- dvc/ignore.py | 51 +++++++++++++++------------------------ tests/func/test_ignore.py | 32 +++++++++++++----------- 2 files changed, 37 insertions(+), 46 deletions(-) diff --git a/dvc/ignore.py b/dvc/ignore.py index 2b23171d97..a111fb4af0 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -99,25 +99,6 @@ def __bool__(self): return bool(self.pattern_list) -class DvcIgnorePatternsTrie(DvcIgnore): - def __init__(self, default_ignore_pattern, root): - self.trie = StringTrie(separator=os.sep) - self.trie[root] = DvcIgnorePatterns(default_ignore_pattern, root) - - def __call__(self, root, dirs, files): - ignore_pattern = self[root] - if ignore_pattern: - return ignore_pattern(root, dirs, files) - return dirs, files - - def __setitem__(self, root, ignore_pattern): - self.trie[root] = ignore_pattern - - def __getitem__(self, root): - ignore_pattern = self.trie.longest_prefix(root) - return ignore_pattern.value - - class DvcIgnoreFilterNoop: def __init__(self, tree, root_dir): pass @@ -139,7 +120,8 @@ def __init__(self, tree, root_dir): self.tree = tree self.root_dir = root_dir default_ignore_pattern = [".hg/", ".git/", "{}/".format(Repo.DVC_DIR)] - self.ignores_trie_tree = DvcIgnorePatternsTrie( + self.ignores_trie_tree = StringTrie(separator=os.sep) + self.ignores_trie_tree[root_dir] = DvcIgnorePatterns( default_ignore_pattern, root_dir ) for root, dirs, _ in self.tree.walk(self.root_dir): @@ -152,7 +134,7 @@ def _update(self, dirname): new_pattern = DvcIgnorePatterns.from_files( ignore_file_path, self.tree ) - old_pattern = self.ignores_trie_tree[dirname] + old_pattern = self._get_tire_pattern(dirname) self.ignores_trie_tree[dirname] = DvcIgnorePatterns( *merge_patterns( old_pattern.pattern_list, @@ -163,26 +145,31 @@ def _update(self, dirname): ) def __call__(self, root, dirs, files): - return self.ignores_trie_tree(root, dirs, files) + ignore_pattern = self._get_tire_pattern(root) + return ignore_pattern(root, dirs, files) - def is_ignored_dir(self, path): + def _get_tire_pattern(self, dirname): + ignore_pattern = self.ignores_trie_tree.longest_prefix(dirname).value + if not ignore_pattern: + return DvcIgnorePatterns([], dirname) + return ignore_pattern + + def _is_ignored(self, path, is_dir=False): if self._outside_repo(path): return True + dirname, basename = os.path.split(os.path.normpath(path)) + ignore_pattern = self._get_tire_pattern(dirname) + return ignore_pattern.matches(dirname, basename, is_dir) + def is_ignored_dir(self, path): path = os.path.abspath(path) if path == self.root_dir: return False - dirname, basename = os.path.split(path) - dirs, _ = self(dirname, [basename], []) - return not dirs - def is_ignored_file(self, path): - if self._outside_repo(path): - return True + return self._is_ignored(path, True) - dirname, basename = os.path.split(os.path.normpath(path)) - _, files = self(os.path.abspath(dirname), [], [basename]) - return not files + def is_ignored_file(self, path): + return self._is_ignored(path, False) def _outside_repo(self, path): path = PathInfo(path) diff --git a/tests/func/test_ignore.py b/tests/func/test_ignore.py index 34d263771d..e4ce250562 100644 --- a/tests/func/test_ignore.py +++ b/tests/func/test_ignore.py @@ -97,7 +97,7 @@ def test_ignore_collecting_dvcignores(tmp_dir, dvc, dname): ignore_file = tmp_dir / dname / DvcIgnore.DVCIGNORE_FILE ignore_file.write_text("foo") - ignore_pattern_trie = dvc.tree.dvcignore.ignores_trie_tree + dvcignore = dvc.tree.dvcignore top_ignore_path = os.path.dirname(os.fspath(top_ignore_file)) @@ -112,8 +112,8 @@ def test_ignore_collecting_dvcignores(tmp_dir, dvc, dname): top_ignore_path, ) ) - == ignore_pattern_trie[top_ignore_path] - == ignore_pattern_trie[sub_dir_path] + == dvcignore._get_tire_pattern(top_ignore_path) + == dvcignore._get_tire_pattern(sub_dir_path) ) @@ -288,22 +288,26 @@ def test_pattern_trie_tree(tmp_dir, dvc): } ) dvc.tree.__dict__.pop("dvcignore", None) - ignore_pattern_trie = dvc.tree.dvcignore.ignores_trie_tree + dvcignore = dvc.tree.dvcignore - ignore_pattern_top = ignore_pattern_trie[os.fspath(tmp_dir / "top")] - ignore_pattern_other = ignore_pattern_trie[os.fspath(tmp_dir / "other")] - ignore_pattern_first = ignore_pattern_trie[ + ignore_pattern_top = dvcignore._get_tire_pattern( + os.fspath(tmp_dir / "top") + ) + ignore_pattern_other = dvcignore._get_tire_pattern( + os.fspath(tmp_dir / "other") + ) + ignore_pattern_first = dvcignore._get_tire_pattern( os.fspath(tmp_dir / "top" / "first") - ] - ignore_pattern_middle = ignore_pattern_trie[ + ) + ignore_pattern_middle = dvcignore._get_tire_pattern( os.fspath(tmp_dir / "top" / "first" / "middle") - ] - ignore_pattern_second = ignore_pattern_trie[ + ) + ignore_pattern_second = dvcignore._get_tire_pattern( os.fspath(tmp_dir / "top" / "first" / "middle" / "second") - ] - ignore_pattern_bottom = ignore_pattern_trie[ + ) + ignore_pattern_bottom = dvcignore._get_tire_pattern( os.fspath(tmp_dir / "top" / "first" / "middle" / "second" / "bottom") - ] + ) base_pattern = [".hg/", ".git/", ".dvc/"], os.fspath(tmp_dir) first_pattern = merge_patterns( From 9a9b67bdcbb96c459cbbc99aa673e1dbcb72e0a4 Mon Sep 17 00:00:00 2001 From: karajan1001 Date: Mon, 20 Jul 2020 18:34:57 +0800 Subject: [PATCH 5/8] Config tree use --- dvc/config.py | 2 +- dvc/repo/__init__.py | 2 +- dvc/repo/tree.py | 4 +++- dvc/tree/azure.py | 2 +- dvc/tree/base.py | 2 +- dvc/tree/gdrive.py | 2 +- dvc/tree/git.py | 6 +++++- dvc/tree/gs.py | 2 +- dvc/tree/hdfs.py | 2 +- dvc/tree/http.py | 2 +- dvc/tree/local.py | 4 +++- dvc/tree/oss.py | 2 +- dvc/tree/s3.py | 2 +- dvc/tree/ssh/__init__.py | 2 +- 14 files changed, 22 insertions(+), 14 deletions(-) diff --git a/dvc/config.py b/dvc/config.py index d4b3b34eeb..8ed0e56fe0 100644 --- a/dvc/config.py +++ b/dvc/config.py @@ -316,7 +316,7 @@ def _load_config(self, level): filename = self.files[level] tree = self.tree if level == "repo" else self.wtree - if tree.exists(filename): + if tree.exists(filename, use_dvcignore=False): with tree.open(filename) as fobj: conf_obj = configobj.ConfigObj(fobj) else: diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index e604aea4d1..6d98682727 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -99,7 +99,7 @@ def __init__(self, root_dir=None, scm=None, rev=None): ) self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR) - self.config = Config(self.dvc_dir) + self.config = Config(self.dvc_dir, tree=self.tree) if not scm: no_scm = self.config["core"].get("no_scm", False) diff --git a/dvc/repo/tree.py b/dvc/repo/tree.py index 51ceacfbb9..2b7810c29d 100644 --- a/dvc/repo/tree.py +++ b/dvc/repo/tree.py @@ -277,7 +277,9 @@ def open( ) return self.repo.tree.open(path, mode=mode, encoding=encoding) - def exists(self, path): # pylint: disable=arguments-differ + def exists( + self, path, use_dvcignore=True + ): # pylint: disable=arguments-differ return self.repo.tree.exists(path) or ( self.dvctree and self.dvctree.exists(path) ) diff --git a/dvc/tree/azure.py b/dvc/tree/azure.py index 799ffa35a9..19f9ecdcb9 100644 --- a/dvc/tree/azure.py +++ b/dvc/tree/azure.py @@ -101,7 +101,7 @@ def _generate_download_url(self, path_info, expires=3600): ) return download_url - def exists(self, path_info): + def exists(self, path_info, use_dvcignore=True): paths = self._list_paths(path_info.bucket, path_info.path) return any(path_info.path == path for path in paths) diff --git a/dvc/tree/base.py b/dvc/tree/base.py index 6f7db8debc..700513b19f 100644 --- a/dvc/tree/base.py +++ b/dvc/tree/base.py @@ -156,7 +156,7 @@ def open(self, path_info, mode="r", encoding=None): raise RemoteActionNotImplemented("open", self.scheme) - def exists(self, path_info): + def exists(self, path_info, use_dvcignore=True): raise NotImplementedError # pylint: disable=unused-argument diff --git a/dvc/tree/gdrive.py b/dvc/tree/gdrive.py index d5c7a67c88..5e34fa4551 100644 --- a/dvc/tree/gdrive.py +++ b/dvc/tree/gdrive.py @@ -522,7 +522,7 @@ def _get_item_id(self, path_info, create=False, use_cache=True, hint=None): assert not create raise FileMissingError(path_info, hint) - def exists(self, path_info): + def exists(self, path_info, use_dvcignore=True): try: self._get_item_id(path_info) except FileMissingError: diff --git a/dvc/tree/git.py b/dvc/tree/git.py index a500fa2fb4..ff5be8209f 100644 --- a/dvc/tree/git.py +++ b/dvc/tree/git.py @@ -76,9 +76,13 @@ def open( return io.BytesIO(data) return io.StringIO(data.decode(encoding)) - def exists(self, path): # pylint: disable=arguments-differ + def exists( + self, path, use_dvcignore=True + ): # pylint: disable=arguments-differ if self._git_object_by_path(path) is None: return False + if not use_dvcignore: + return True return not self.dvcignore.is_ignored_file( path diff --git a/dvc/tree/gs.py b/dvc/tree/gs.py index c8372ae0ec..767154de35 100644 --- a/dvc/tree/gs.py +++ b/dvc/tree/gs.py @@ -119,7 +119,7 @@ def _generate_download_url(self, path_info, expires=3600): ) return signing_credentials.signer.sign(blob) - def exists(self, path_info): + def exists(self, path_info, use_dvcignore=True): """Check if the blob exists. If it does not exist, it could be a part of a directory path. diff --git a/dvc/tree/hdfs.py b/dvc/tree/hdfs.py index 55b9909416..cdc9a2c339 100644 --- a/dvc/tree/hdfs.py +++ b/dvc/tree/hdfs.py @@ -72,7 +72,7 @@ def open(self, path_info, mode="r", encoding=None): raise FileNotFoundError(*e.args) raise - def exists(self, path_info): + def exists(self, path_info, use_dvcignore=True): assert not isinstance(path_info, list) assert path_info.scheme == "hdfs" with self.hdfs(path_info) as hdfs: diff --git a/dvc/tree/http.py b/dvc/tree/http.py index a683ec5ea5..abed95ca4f 100644 --- a/dvc/tree/http.py +++ b/dvc/tree/http.py @@ -122,7 +122,7 @@ def request(self, method, url, **kwargs): except requests.exceptions.RequestException: raise DvcException(f"could not perform a {method} request") - def exists(self, path_info): + def exists(self, path_info, use_dvcignore=True): return bool(self.request("HEAD", path_info.url)) def get_file_hash(self, path_info): diff --git a/dvc/tree/local.py b/dvc/tree/local.py index 61af21e01e..24361f4c8d 100644 --- a/dvc/tree/local.py +++ b/dvc/tree/local.py @@ -69,7 +69,7 @@ def dvcignore(self): def open(path_info, mode="r", encoding=None): return open(path_info, mode=mode, encoding=encoding) - def exists(self, path_info): + def exists(self, path_info, use_dvcignore=True): assert isinstance(path_info, str) or path_info.scheme == "local" if self.repo: ret = os.path.lexists(path_info) @@ -77,6 +77,8 @@ def exists(self, path_info): ret = os.path.exists(path_info) if not ret: return False + if not use_dvcignore: + return True return not self.dvcignore.is_ignored_file( path_info diff --git a/dvc/tree/oss.py b/dvc/tree/oss.py index 634ad0d8d4..3e0a3d4775 100644 --- a/dvc/tree/oss.py +++ b/dvc/tree/oss.py @@ -88,7 +88,7 @@ def _generate_download_url(self, path_info, expires=3600): return self.oss_service.sign_url("GET", path_info.path, expires) - def exists(self, path_info): + def exists(self, path_info, use_dvcignore=True): paths = self._list_paths(path_info) return any(path_info.path == path for path in paths) diff --git a/dvc/tree/s3.py b/dvc/tree/s3.py index d07a5389a1..44368d7461 100644 --- a/dvc/tree/s3.py +++ b/dvc/tree/s3.py @@ -131,7 +131,7 @@ def _generate_download_url(self, path_info, expires=3600): ClientMethod="get_object", Params=params, ExpiresIn=int(expires) ) - def exists(self, path_info): + def exists(self, path_info, use_dvcignore=True): """Check if the blob exists. If it does not exist, it could be a part of a directory path. diff --git a/dvc/tree/ssh/__init__.py b/dvc/tree/ssh/__init__.py index 916e5a6521..26bee19388 100644 --- a/dvc/tree/ssh/__init__.py +++ b/dvc/tree/ssh/__init__.py @@ -156,7 +156,7 @@ def open(self, path_info, mode="r", encoding=None): else: yield io.TextIOWrapper(fd, encoding=encoding) - def exists(self, path_info): + def exists(self, path_info, use_dvcignore=True): with self.ssh(path_info) as ssh: return ssh.exists(path_info.path) From 2c86b5ca62ac2629cc4264e1351ba17879520a7d Mon Sep 17 00:00:00 2001 From: karajan1001 Date: Mon, 20 Jul 2020 21:56:10 +0800 Subject: [PATCH 6/8] Solve sub dir repo problem. Add more tests --- dvc/ignore.py | 20 ++++++++++++++++++++ tests/unit/test_ignore.py | 5 +++++ 2 files changed, 25 insertions(+) diff --git a/dvc/ignore.py b/dvc/ignore.py index a111fb4af0..214ffc5e75 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -114,6 +114,12 @@ def is_ignored_file(self, _): class DvcIgnoreFilter: + @staticmethod + def _is_dvc_repo(root, directory): + from dvc.repo import Repo + + return os.path.isdir(os.path.join(root, directory, Repo.DVC_DIR)) + def __init__(self, tree, root_dir): from dvc.repo import Repo @@ -126,6 +132,7 @@ def __init__(self, tree, root_dir): ) for root, dirs, _ in self.tree.walk(self.root_dir): self._update(root) + self._update_sub_repo(root, dirs) dirs[:], _ = self(root, dirs, []) def _update(self, dirname): @@ -144,6 +151,19 @@ def _update(self, dirname): ) ) + def _update_sub_repo(self, root, dirs): + for d in dirs: + if self._is_dvc_repo(root, d): + old_pattern = self._get_tire_pattern(root) + self.ignores_trie_tree[root] = DvcIgnorePatterns( + *merge_patterns( + old_pattern.pattern_list, + old_pattern.dirname, + ["/{}/".format(d)], + root, + ) + ) + def __call__(self, root, dirs, files): ignore_pattern = self._get_tire_pattern(root) return ignore_pattern(root, dirs, files) diff --git a/tests/unit/test_ignore.py b/tests/unit/test_ignore.py index 147ae80d57..3bc0344e79 100644 --- a/tests/unit/test_ignore.py +++ b/tests/unit/test_ignore.py @@ -85,6 +85,11 @@ def mock_dvcignore(dvcignore_path, patterns): ("to_ignore.txt", ["/*.txt"], True), (os.path.join("path", "to_ignore.txt"), ["/*.txt"], False), (os.path.join("data", "file.txt"), ["data/*"], True), + (os.path.join("data", "subdir", "file.txt"), ["data/*"], False), + (os.path.join("data", "file.txt"), ["data/"], True), + (os.path.join("data", "subdir", "file.txt"), ["data/"], True), + (os.path.join("data", "subdir", "file.txt"), ["subdir/"], True), + (os.path.join("data", "subdir", "file.txt"), ["/subdir/"], False), (os.path.join("data", "path"), ["path/"], False), (os.path.join(".git", "file.txt"), [".git/"], True), (os.path.join("data", ".dvc", "file.txt"), [".dvc/"], True), From 97ad6e8091b0d6613303f58280aa824af5d8e0b6 Mon Sep 17 00:00:00 2001 From: karajan1001 Date: Tue, 21 Jul 2020 18:15:46 +0800 Subject: [PATCH 7/8] typo error --- dvc/ignore.py | 15 ++++++++------- tests/func/test_ignore.py | 16 ++++++++-------- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/dvc/ignore.py b/dvc/ignore.py index 214ffc5e75..02e939d2cb 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -123,12 +123,13 @@ def _is_dvc_repo(root, directory): def __init__(self, tree, root_dir): from dvc.repo import Repo + default_ignore_patterns = [".hg/", ".git/", "{}/".format(Repo.DVC_DIR)] + self.tree = tree self.root_dir = root_dir - default_ignore_pattern = [".hg/", ".git/", "{}/".format(Repo.DVC_DIR)] self.ignores_trie_tree = StringTrie(separator=os.sep) self.ignores_trie_tree[root_dir] = DvcIgnorePatterns( - default_ignore_pattern, root_dir + default_ignore_patterns, root_dir ) for root, dirs, _ in self.tree.walk(self.root_dir): self._update(root) @@ -141,7 +142,7 @@ def _update(self, dirname): new_pattern = DvcIgnorePatterns.from_files( ignore_file_path, self.tree ) - old_pattern = self._get_tire_pattern(dirname) + old_pattern = self._get_trie_pattern(dirname) self.ignores_trie_tree[dirname] = DvcIgnorePatterns( *merge_patterns( old_pattern.pattern_list, @@ -154,7 +155,7 @@ def _update(self, dirname): def _update_sub_repo(self, root, dirs): for d in dirs: if self._is_dvc_repo(root, d): - old_pattern = self._get_tire_pattern(root) + old_pattern = self._get_trie_pattern(root) self.ignores_trie_tree[root] = DvcIgnorePatterns( *merge_patterns( old_pattern.pattern_list, @@ -165,10 +166,10 @@ def _update_sub_repo(self, root, dirs): ) def __call__(self, root, dirs, files): - ignore_pattern = self._get_tire_pattern(root) + ignore_pattern = self._get_trie_pattern(root) return ignore_pattern(root, dirs, files) - def _get_tire_pattern(self, dirname): + def _get_trie_pattern(self, dirname): ignore_pattern = self.ignores_trie_tree.longest_prefix(dirname).value if not ignore_pattern: return DvcIgnorePatterns([], dirname) @@ -178,7 +179,7 @@ def _is_ignored(self, path, is_dir=False): if self._outside_repo(path): return True dirname, basename = os.path.split(os.path.normpath(path)) - ignore_pattern = self._get_tire_pattern(dirname) + ignore_pattern = self._get_trie_pattern(dirname) return ignore_pattern.matches(dirname, basename, is_dir) def is_ignored_dir(self, path): diff --git a/tests/func/test_ignore.py b/tests/func/test_ignore.py index e4ce250562..16afb978af 100644 --- a/tests/func/test_ignore.py +++ b/tests/func/test_ignore.py @@ -112,8 +112,8 @@ def test_ignore_collecting_dvcignores(tmp_dir, dvc, dname): top_ignore_path, ) ) - == dvcignore._get_tire_pattern(top_ignore_path) - == dvcignore._get_tire_pattern(sub_dir_path) + == dvcignore._get_trie_pattern(top_ignore_path) + == dvcignore._get_trie_pattern(sub_dir_path) ) @@ -290,22 +290,22 @@ def test_pattern_trie_tree(tmp_dir, dvc): dvc.tree.__dict__.pop("dvcignore", None) dvcignore = dvc.tree.dvcignore - ignore_pattern_top = dvcignore._get_tire_pattern( + ignore_pattern_top = dvcignore._get_trie_pattern( os.fspath(tmp_dir / "top") ) - ignore_pattern_other = dvcignore._get_tire_pattern( + ignore_pattern_other = dvcignore._get_trie_pattern( os.fspath(tmp_dir / "other") ) - ignore_pattern_first = dvcignore._get_tire_pattern( + ignore_pattern_first = dvcignore._get_trie_pattern( os.fspath(tmp_dir / "top" / "first") ) - ignore_pattern_middle = dvcignore._get_tire_pattern( + ignore_pattern_middle = dvcignore._get_trie_pattern( os.fspath(tmp_dir / "top" / "first" / "middle") ) - ignore_pattern_second = dvcignore._get_tire_pattern( + ignore_pattern_second = dvcignore._get_trie_pattern( os.fspath(tmp_dir / "top" / "first" / "middle" / "second") ) - ignore_pattern_bottom = dvcignore._get_tire_pattern( + ignore_pattern_bottom = dvcignore._get_trie_pattern( os.fspath(tmp_dir / "top" / "first" / "middle" / "second" / "bottom") ) From 4207f40251b8450b8805e8b110dca6cdbef46e9b Mon Sep 17 00:00:00 2001 From: karajan1001 Date: Wed, 22 Jul 2020 09:19:03 +0800 Subject: [PATCH 8/8] Change request --- dvc/ignore.py | 48 ++++++++++++++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/dvc/ignore.py b/dvc/ignore.py index 02e939d2cb..05463403cf 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -143,36 +143,45 @@ def _update(self, dirname): ignore_file_path, self.tree ) old_pattern = self._get_trie_pattern(dirname) - self.ignores_trie_tree[dirname] = DvcIgnorePatterns( - *merge_patterns( - old_pattern.pattern_list, - old_pattern.dirname, - new_pattern.pattern_list, - new_pattern.dirname, + if old_pattern: + self.ignores_trie_tree[dirname] = DvcIgnorePatterns( + *merge_patterns( + old_pattern.pattern_list, + old_pattern.dirname, + new_pattern.pattern_list, + new_pattern.dirname, + ) ) - ) + else: + self.ignores_trie_tree[dirname] = new_pattern def _update_sub_repo(self, root, dirs): for d in dirs: if self._is_dvc_repo(root, d): old_pattern = self._get_trie_pattern(root) - self.ignores_trie_tree[root] = DvcIgnorePatterns( - *merge_patterns( - old_pattern.pattern_list, - old_pattern.dirname, - ["/{}/".format(d)], - root, + if old_pattern: + self.ignores_trie_tree[root] = DvcIgnorePatterns( + *merge_patterns( + old_pattern.pattern_list, + old_pattern.dirname, + ["/{}/".format(d)], + root, + ) + ) + else: + self.ignores_trie_tree[root] = DvcIgnorePatterns( + ["/{}/".format(d)], root ) - ) def __call__(self, root, dirs, files): ignore_pattern = self._get_trie_pattern(root) - return ignore_pattern(root, dirs, files) + if ignore_pattern: + return ignore_pattern(root, dirs, files) + else: + return dirs, files def _get_trie_pattern(self, dirname): ignore_pattern = self.ignores_trie_tree.longest_prefix(dirname).value - if not ignore_pattern: - return DvcIgnorePatterns([], dirname) return ignore_pattern def _is_ignored(self, path, is_dir=False): @@ -180,7 +189,10 @@ def _is_ignored(self, path, is_dir=False): return True dirname, basename = os.path.split(os.path.normpath(path)) ignore_pattern = self._get_trie_pattern(dirname) - return ignore_pattern.matches(dirname, basename, is_dir) + if ignore_pattern: + return ignore_pattern.matches(dirname, basename, is_dir) + else: + return False def is_ignored_dir(self, path): path = os.path.abspath(path)