Skip to content

ignore: CleanTree should always check ignores, not just for walk() #3876

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
May 27, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 70 additions & 6 deletions dvc/ignore.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
from pathspec import PathSpec
from pathspec.patterns import GitWildMatchPattern

from dvc.path_info import PathInfo
from dvc.scm.tree import BaseTree
from dvc.utils import relpath

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -125,19 +127,79 @@ def tree_root(self):
return self.tree.tree_root

def open(self, path, mode="r", encoding="utf-8"):
return self.tree.open(path, mode, encoding)
if self.isfile(path):
return self.tree.open(path, mode, encoding)
raise FileNotFoundError

def exists(self, path):
return self.tree.exists(path)
if self.tree.exists(path) and self._parents_exist(path):
if self.tree.isdir(path):
return self._valid_dirname(path)
return self._valid_filename(path)
return False

def isdir(self, path):
return self.tree.isdir(path)
return (
self.tree.isdir(path)
and self._parents_exist(path)
and self._valid_dirname(path)
)

def _valid_dirname(self, path):
dirname, basename = os.path.split(os.path.normpath(path))
dirs, _ = self.dvcignore(os.path.abspath(dirname), [basename], [])
if dirs:
return True
return False

def isfile(self, path):
return self.tree.isfile(path)
return (
self.tree.isfile(path)
and self._parents_exist(path)
and self._valid_filename(path)
)

def _valid_filename(self, path):
dirname, basename = os.path.split(os.path.normpath(path))
_, files = self.dvcignore(os.path.abspath(dirname), [], [basename])
if files:
return True
return False

def isexec(self, path):
return self.tree.isexec(path)
return self.exists(path) and self.tree.isexec(path)

def _parents_exist(self, path):
from dvc.repo import Repo

path = PathInfo(path)

# if parent is tree_root or inside a .dvc dir we can skip this check
if path.parent == self.tree_root or Repo.DVC_DIR in path.parts:
return True

# if path is outside of tree, assume this is a local remote/local cache
# link/move operation where we do not need to filter ignores
path = relpath(path, self.tree_root)
if path.startswith("..") or (
os.name == "nt"
and not os.path.commonprefix(
[os.path.abspath(path), self.tree_root]
)
):
return True
Comment on lines +181 to +190
Copy link
Contributor Author

@pmrowla pmrowla May 27, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is non-intuitive, it seems to me that paths outside of repo root should by definition not exist in a repo CleanTree, but the problem is that for local cache/remote we do not currently differentiate between WorkingTree/CleanTree/filesystem paths.

This can be cleaned up by the remote/cache/tree separation:

  • local remote/cache should have their own working tree that would handle the case where DVC cache dir is outside of the repo root or on a different logical drive
  • DVC repo CleanTree could then disallow (ignore) paths outside of the repo root, since CleanTree paths would always and only be used to refer to paths inside the DVC repo, and not arbitrary local filesystem paths.


# check if parent directories are in our ignores, starting from
# tree_root
for parent_dir in reversed(PathInfo(path).parents):
dirname, basename = os.path.split(parent_dir)
if basename == ".":
# parent_dir == tree_root
continue
dirs, _ = self.dvcignore(os.path.abspath(dirname), [basename], [])
if not dirs:
return False
return True

def walk(self, top, topdown=True):
for root, dirs, files in self.tree.walk(top, topdown):
Expand All @@ -148,4 +210,6 @@ def walk(self, top, topdown=True):
yield root, dirs, files

def stat(self, path):
return self.tree.stat(path)
if self.exists(path):
return self.tree.stat(path)
raise FileNotFoundError
21 changes: 21 additions & 0 deletions tests/func/test_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from dvc.ignore import CleanTree
from dvc.path_info import PathInfo
from dvc.repo import Repo
from dvc.repo.tree import RepoTree
from dvc.scm import SCM
from dvc.scm.git import GitTree
Expand Down Expand Up @@ -220,3 +221,23 @@ def test_repotree_cache_save(tmp_dir, dvc, scm, erepo_dir, setup_remote):
cache.save(PathInfo(erepo_dir / "dir"), None, tree=tree)
for checksum in expected:
assert os.path.exists(cache.checksum_to_path_info(checksum))


def test_cleantree_subrepo(tmp_dir, dvc, scm, monkeypatch):
tmp_dir.gen({"subdir": {}})
subrepo_dir = tmp_dir / "subdir"
with subrepo_dir.chdir():
subrepo = Repo.init(subdir=True)
subrepo_dir.gen({"foo": "foo", "dir": {"bar": "bar"}})

assert isinstance(dvc.tree, CleanTree)
assert not dvc.tree.exists(subrepo_dir / "foo")
assert not dvc.tree.isfile(subrepo_dir / "foo")
assert not dvc.tree.exists(subrepo_dir / "dir")
assert not dvc.tree.isdir(subrepo_dir / "dir")

assert isinstance(subrepo.tree, CleanTree)
assert subrepo.tree.exists(subrepo_dir / "foo")
assert subrepo.tree.isfile(subrepo_dir / "foo")
assert subrepo.tree.exists(subrepo_dir / "dir")
assert subrepo.tree.isdir(subrepo_dir / "dir")