From 9fa241e48f4e1fbcad33d5a1f6a139e7cd7eaef7 Mon Sep 17 00:00:00 2001 From: Ruslan Kuprieiev Date: Fri, 31 Jul 2020 07:44:41 +0300 Subject: [PATCH 1/2] cache: don't make save() compute the hash This is incorrect and leads to weird bugs like #4305 --- dvc/cache/base.py | 2 -- dvc/external_repo.py | 28 +++++++++++++++++++++------- dvc/repo/tree.py | 3 +++ tests/func/test_tree.py | 5 ++++- 4 files changed, 28 insertions(+), 10 deletions(-) diff --git a/dvc/cache/base.py b/dvc/cache/base.py index 64617ef994..a6c5596d77 100644 --- a/dvc/cache/base.py +++ b/dvc/cache/base.py @@ -276,8 +276,6 @@ def save(self, path_info, tree, hash_info, save_link=True, **kwargs): self.tree.scheme, ) - if not hash_info: - hash_info = self.tree.save_info(path_info, tree=tree, **kwargs) hash_ = hash_info[self.tree.PARAM_CHECKSUM] return self._save(path_info, tree, hash_, save_link, **kwargs) diff --git a/dvc/external_repo.py b/dvc/external_repo.py index 15332e7de1..c2de3d8e81 100644 --- a/dvc/external_repo.py +++ b/dvc/external_repo.py @@ -90,15 +90,26 @@ def use_cache(self, cache): """Use the specified cache in place of default tmpdir cache for download operations. """ - if hasattr(self, "cache"): - save_cache = self.cache.local - self.cache.local = cache + has_cache = hasattr(self, "cache") + + if has_cache: + save_cache = self.cache.local # pylint: disable=E0203 + self.cache.local = cache # pylint: disable=E0203 + else: + from collections import namedtuple + + mock_cache = namedtuple("MockCache", ["local"])(local=cache) + self.cache = mock_cache # pylint: disable=W0201 + self._local_cache = cache yield - if hasattr(self, "cache"): + if has_cache: self.cache.local = save_cache + else: + del self.cache + self._local_cache = None @cached_property @@ -130,14 +141,17 @@ def download_update(result): for path in paths: if not self.repo_tree.exists(path): raise PathMissingError(path, self.url) - save_info = self.local_cache.save( + hash_info = self.repo_tree.save_info( + path, download_callback=download_update + ) + self.local_cache.save( path, self.repo_tree, - None, + hash_info, save_link=False, download_callback=download_update, ) - save_infos.append(save_info) + save_infos.append(hash_info) return sum(download_results), failed, save_infos diff --git a/dvc/repo/tree.py b/dvc/repo/tree.py index 2b7810c29d..8011d0318c 100644 --- a/dvc/repo/tree.py +++ b/dvc/repo/tree.py @@ -245,6 +245,9 @@ class RepoTree(BaseTree): # pylint:disable=abstract-method Any kwargs will be passed to `DvcTree()`. """ + scheme = "local" + PARAM_CHECKSUM = "md5" + def __init__(self, repo, **kwargs): super().__init__(repo, {"url": repo.root_dir}) if hasattr(repo, "dvc_dir"): diff --git a/tests/func/test_tree.py b/tests/func/test_tree.py index 93b671bd75..3ab93efc9b 100644 --- a/tests/func/test_tree.py +++ b/tests/func/test_tree.py @@ -217,7 +217,10 @@ def test_repotree_cache_save(tmp_dir, dvc, scm, erepo_dir, local_cloud): with erepo_dir.dvc.state: cache = dvc.cache.local with cache.tree.state: - cache.save(PathInfo(erepo_dir / "dir"), tree, None) + path_info = PathInfo(erepo_dir / "dir") + hash_info = cache.tree.save_info(path_info) + cache.save(path_info, tree, hash_info) + for hash_ in expected: assert os.path.exists(cache.tree.hash_to_path_info(hash_)) From b2e106928a125e704136a897a129a996243d362a Mon Sep 17 00:00:00 2001 From: Ruslan Kuprieiev Date: Fri, 31 Jul 2020 07:52:19 +0300 Subject: [PATCH 2/2] tree: get rid of explicit tree arguments --- dvc/tree/base.py | 44 ++++++++++++++++++-------------------------- 1 file changed, 18 insertions(+), 26 deletions(-) diff --git a/dvc/tree/base.py b/dvc/tree/base.py index 700513b19f..d5df932b52 100644 --- a/dvc/tree/base.py +++ b/dvc/tree/base.py @@ -236,22 +236,17 @@ def is_dir_hash(cls, hash_): return False return hash_.endswith(cls.CHECKSUM_DIR_SUFFIX) - def get_hash(self, path_info, tree=None, **kwargs): + def get_hash(self, path_info, **kwargs): assert path_info and ( isinstance(path_info, str) or path_info.scheme == self.scheme ) - if not tree: - tree = self - - if not tree.exists(path_info): + if not self.exists(path_info): return None - if tree == self: - # pylint: disable=assignment-from-none - hash_ = self.state.get(path_info) - else: - hash_ = None + # pylint: disable=assignment-from-none + hash_ = self.state.get(path_info) + # If we have dir hash in state db, but dir cache file is lost, # then we need to recollect the dir via .get_dir_hash() call below, # see https://github.com/iterative/dvc/issues/2219 for context @@ -267,10 +262,10 @@ def get_hash(self, path_info, tree=None, **kwargs): if hash_: return hash_ - if tree.isdir(path_info): - hash_ = self.get_dir_hash(path_info, tree, **kwargs) + if self.isdir(path_info): + hash_ = self.get_dir_hash(path_info, **kwargs) else: - hash_ = tree.get_file_hash(path_info) + hash_ = self.get_file_hash(path_info) if hash_ and self.exists(path_info): self.state.save(path_info, hash_) @@ -280,11 +275,11 @@ def get_hash(self, path_info, tree=None, **kwargs): def get_file_hash(self, path_info): raise NotImplementedError - def get_dir_hash(self, path_info, tree, **kwargs): + def get_dir_hash(self, path_info, **kwargs): if not self.cache: raise RemoteCacheRequiredError(path_info) - dir_info = self._collect_dir(path_info, tree, **kwargs) + dir_info = self._collect_dir(path_info, **kwargs) return self._save_dir_info(dir_info, path_info) def hash_to_path_info(self, hash_): @@ -298,29 +293,26 @@ def path_to_hash(self, path): return "".join(parts) - def save_info(self, path_info, tree=None, **kwargs): - return { - self.PARAM_CHECKSUM: self.get_hash(path_info, tree=tree, **kwargs) - } + def save_info(self, path_info, **kwargs): + return {self.PARAM_CHECKSUM: self.get_hash(path_info, **kwargs)} - @staticmethod - def _calculate_hashes(file_infos, tree): + def _calculate_hashes(self, file_infos): file_infos = list(file_infos) with Tqdm( total=len(file_infos), unit="md5", desc="Computing file/dir hashes (only done once)", ) as pbar: - worker = pbar.wrap_fn(tree.get_file_hash) - with ThreadPoolExecutor(max_workers=tree.hash_jobs) as executor: + worker = pbar.wrap_fn(self.get_file_hash) + with ThreadPoolExecutor(max_workers=self.hash_jobs) as executor: tasks = executor.map(worker, file_infos) hashes = dict(zip(file_infos, tasks)) return hashes - def _collect_dir(self, path_info, tree, **kwargs): + def _collect_dir(self, path_info, **kwargs): file_infos = set() - for fname in tree.walk_files(path_info, **kwargs): + for fname in self.walk_files(path_info, **kwargs): if DvcIgnore.DVCIGNORE_FILE == fname.name: raise DvcIgnoreInCollectedDirError(fname.parent) @@ -329,7 +321,7 @@ def _collect_dir(self, path_info, tree, **kwargs): hashes = {fi: self.state.get(fi) for fi in file_infos} not_in_state = {fi for fi, hash_ in hashes.items() if hash_ is None} - new_hashes = self._calculate_hashes(not_in_state, tree) + new_hashes = self._calculate_hashes(not_in_state) hashes.update(new_hashes) result = [