Skip to content

Commit 023dec4

Browse files
authored
tree: make get_hash return type and hash pair (#4397)
Currently we implicitly assume that whatever is returned by `get_file_hash` is of type `self.PARAM_CHECKSUM`, which is not actually true. E.g. for HTTP it might return an `etag` or an `md5`, but we don't distinguish between them and label both as `etag`. This is becoming more relevant for dir hashes, which are computed in a few different ways (e.g. in-memory md5, or upload to a remote and get the etag for the dir file). Prerequisite for #4144 and #3069.
1 parent b773ba7 commit 023dec4

File tree

17 files changed

+51
-34
lines changed

17 files changed

+51
-34
lines changed

dvc/cache/base.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,9 @@ def changed(self, path_info, hash_info):
127127
logger.debug("cache for '%s'('%s') has changed.", path_info, hash_)
128128
return True
129129

130-
actual = self.tree.get_hash(path_info)
130+
typ, actual = self.tree.get_hash(path_info)
131+
assert typ == self.tree.PARAM_CHECKSUM
132+
131133
if hash_ != actual:
132134
logger.debug(
133135
"hash value '%s' for '%s' has changed (actual '%s').",
@@ -312,7 +314,7 @@ def changed_cache_file(self, hash_):
312314
)
313315
return False
314316

315-
actual = self.tree.get_hash(cache_info)
317+
_, actual = self.tree.get_hash(cache_info)
316318

317319
logger.debug(
318320
"cache '%s' expected '%s' actual '%s'", cache_info, hash_, actual,
@@ -358,7 +360,7 @@ def changed_cache(self, hash_, path_info=None, filter_info=None):
358360
return self.changed_cache_file(hash_)
359361

360362
def already_cached(self, path_info):
361-
current = self.tree.get_hash(path_info)
363+
_, current = self.tree.get_hash(path_info)
362364

363365
if not current:
364366
return False

dvc/cache/local.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,9 @@ def hashes_exist(
9191
def already_cached(self, path_info):
9292
assert path_info.scheme in ["", "local"]
9393

94-
current_md5 = self.tree.get_hash(path_info)
94+
typ, current_md5 = self.tree.get_hash(path_info)
95+
96+
assert typ == "md5"
9597

9698
if not current_md5:
9799
return False

dvc/dependency/repo.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,9 @@ def _get_checksum(self, locked=True):
6464

6565
# We are polluting our repo cache with some dir listing here
6666
if tree.isdir(path):
67-
return self.repo.cache.local.tree.get_hash(path, tree=tree)
67+
return self.repo.cache.local.tree.get_hash(
68+
path, tree=tree
69+
)[1]
6870
return tree.get_file_hash(path)
6971

7072
def workspace_status(self):

dvc/output/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,7 @@ def checksum(self, checksum):
184184
self.info[self.tree.PARAM_CHECKSUM] = checksum
185185

186186
def get_checksum(self):
187-
return self.tree.get_hash(self.path_info)
187+
return self.tree.get_hash(self.path_info)[1]
188188

189189
@property
190190
def is_dir_checksum(self):

dvc/repo/diff.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ def _to_path(output):
3737

3838
def _to_checksum(output):
3939
if on_working_tree:
40-
return self.cache.local.tree.get_hash(output.path_info)
40+
return self.cache.local.tree.get_hash(output.path_info)[1]
4141
return output.checksum
4242

4343
def _exists(output):

dvc/repo/tree.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -238,8 +238,11 @@ def get_file_hash(self, path_info):
238238
raise OutputNotFoundError
239239
out = outs[0]
240240
if out.is_dir_checksum:
241-
return self._get_granular_checksum(path_info, out)
242-
return out.checksum
241+
return (
242+
out.tree.PARAM_CHECKSUM,
243+
self._get_granular_checksum(path_info, out),
244+
)
245+
return out.tree.PARAM_CHECKSUM, out.checksum
243246

244247

245248
class RepoTree(BaseTree): # pylint:disable=abstract-method
@@ -504,7 +507,7 @@ def get_file_hash(self, path_info):
504507
return dvc_tree.get_file_hash(path_info)
505508
except OutputNotFoundError:
506509
pass
507-
return file_md5(path_info, self)[0]
510+
return self.PARAM_CHECKSUM, file_md5(path_info, self)[0]
508511

509512
def copytree(self, top, dest):
510513
top = PathInfo(top)

dvc/tree/azure.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@ def remove(self, path_info):
153153
).delete_blob()
154154

155155
def get_file_hash(self, path_info):
156-
return self.get_etag(path_info)
156+
return self.PARAM_CHECKSUM, self.get_etag(path_info)
157157

158158
def _upload(
159159
self, from_file, to_info, name=None, no_progress_bar=False, **_kwargs

dvc/tree/base.py

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,7 @@ def get_hash(self, path_info, **kwargs):
242242
)
243243

244244
if not self.exists(path_info):
245-
return None
245+
return self.PARAM_CHECKSUM, None
246246

247247
# pylint: disable=assignment-from-none
248248
hash_ = self.state.get(path_info)
@@ -260,17 +260,17 @@ def get_hash(self, path_info, **kwargs):
260260
hash_ = None
261261

262262
if hash_:
263-
return hash_
263+
return self.PARAM_CHECKSUM, hash_
264264

265265
if self.isdir(path_info):
266-
hash_ = self.get_dir_hash(path_info, **kwargs)
266+
typ, hash_ = self.get_dir_hash(path_info, **kwargs)
267267
else:
268-
hash_ = self.get_file_hash(path_info)
268+
typ, hash_ = self.get_file_hash(path_info)
269269

270270
if hash_ and self.exists(path_info):
271271
self.state.save(path_info, hash_)
272272

273-
return hash_
273+
return typ, hash_
274274

275275
def get_file_hash(self, path_info):
276276
raise NotImplementedError
@@ -294,7 +294,8 @@ def path_to_hash(self, path):
294294
return "".join(parts)
295295

296296
def save_info(self, path_info, **kwargs):
297-
return {self.PARAM_CHECKSUM: self.get_hash(path_info, **kwargs)}
297+
typ, hash_ = self.get_hash(path_info, **kwargs)
298+
return {typ: hash_}
298299

299300
def _calculate_hashes(self, file_infos):
300301
file_infos = list(file_infos)
@@ -305,9 +306,10 @@ def _calculate_hashes(self, file_infos):
305306
) as pbar:
306307
worker = pbar.wrap_fn(self.get_file_hash)
307308
with ThreadPoolExecutor(max_workers=self.hash_jobs) as executor:
308-
tasks = executor.map(worker, file_infos)
309-
hashes = dict(zip(file_infos, tasks))
310-
return hashes
309+
hashes = (
310+
value for typ, value in executor.map(worker, file_infos)
311+
)
312+
return dict(zip(file_infos, hashes))
311313

312314
def _collect_dir(self, path_info, **kwargs):
313315
file_infos = set()
@@ -344,7 +346,7 @@ def _collect_dir(self, path_info, **kwargs):
344346
return sorted(result, key=itemgetter(self.PARAM_RELPATH))
345347

346348
def _save_dir_info(self, dir_info):
347-
hash_, tmp_info = self._get_dir_info_hash(dir_info)
349+
typ, hash_, tmp_info = self._get_dir_info_hash(dir_info)
348350
new_info = self.cache.tree.hash_to_path_info(hash_)
349351
if self.cache.changed_cache_file(hash_):
350352
self.cache.tree.makedirs(new_info.parent)
@@ -354,7 +356,7 @@ def _save_dir_info(self, dir_info):
354356

355357
self.state.save(new_info, hash_)
356358

357-
return hash_
359+
return typ, hash_
358360

359361
def _get_dir_info_hash(self, dir_info):
360362
tmp = tempfile.NamedTemporaryFile(delete=False).name
@@ -366,8 +368,8 @@ def _get_dir_info_hash(self, dir_info):
366368
to_info = tree.path_info / tmp_fname("")
367369
tree.upload(from_info, to_info, no_progress_bar=True)
368370

369-
hash_ = tree.get_file_hash(to_info) + self.CHECKSUM_DIR_SUFFIX
370-
return hash_, to_info
371+
typ, hash_ = tree.get_file_hash(to_info)
372+
return typ, hash_ + self.CHECKSUM_DIR_SUFFIX, to_info
371373

372374
def upload(self, from_info, to_info, name=None, no_progress_bar=False):
373375
if not hasattr(self, "_upload"):

dvc/tree/gs.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -190,11 +190,14 @@ def get_file_hash(self, path_info):
190190
path = path_info.path
191191
blob = self.gs.bucket(bucket).get_blob(path)
192192
if not blob:
193-
return None
193+
return self.PARAM_CHECKSUM, None
194194

195195
b64_md5 = blob.md5_hash
196196
md5 = base64.b64decode(b64_md5)
197-
return codecs.getencoder("hex")(md5)[0].decode("utf-8")
197+
return (
198+
self.PARAM_CHECKSUM,
199+
codecs.getencoder("hex")(md5)[0].decode("utf-8"),
200+
)
198201

199202
def _upload(self, from_file, to_info, name=None, no_progress_bar=False):
200203
bucket = self.gs.bucket(to_info.bucket)

dvc/tree/hdfs.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,7 @@ def get_file_hash(self, path_info):
167167
stdout = self.hadoop_fs(
168168
f"checksum {path_info.url}", user=path_info.user
169169
)
170-
return self._group(regex, stdout, "checksum")
170+
return self.PARAM_CHECKSUM, self._group(regex, stdout, "checksum")
171171

172172
def _upload(self, from_file, to_info, **_kwargs):
173173
with self.hdfs(to_info) as hdfs:

dvc/tree/http.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ def get_file_hash(self, path_info):
136136
"Content-MD5 header for '{url}'".format(url=url)
137137
)
138138

139-
return etag
139+
return self.PARAM_CHECKSUM, etag
140140

141141
def _download(self, from_info, to_file, name=None, no_progress_bar=False):
142142
response = self.request("GET", from_info.url, stream=True)

dvc/tree/local.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -309,7 +309,7 @@ def is_protected(self, path_info):
309309
return stat.S_IMODE(mode) == self.CACHE_MODE
310310

311311
def get_file_hash(self, path_info):
312-
return file_md5(path_info)[0]
312+
return self.PARAM_CHECKSUM, file_md5(path_info)[0]
313313

314314
@staticmethod
315315
def getsize(path_info):

dvc/tree/s3.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -318,7 +318,10 @@ def _copy(cls, s3, from_info, to_info, extra_args):
318318
raise ETagMismatchError(etag, cached_etag)
319319

320320
def get_file_hash(self, path_info):
321-
return self.get_etag(self.s3, path_info.bucket, path_info.path)
321+
return (
322+
self.PARAM_CHECKSUM,
323+
self.get_etag(self.s3, path_info.bucket, path_info.path),
324+
)
322325

323326
def _upload(self, from_file, to_info, name=None, no_progress_bar=False):
324327
total = os.path.getsize(from_file)

dvc/tree/ssh/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -238,7 +238,7 @@ def get_file_hash(self, path_info):
238238
raise NotImplementedError
239239

240240
with self.ssh(path_info) as ssh:
241-
return ssh.md5(path_info.path)
241+
return self.PARAM_CHECKSUM, ssh.md5(path_info.path)
242242

243243
def getsize(self, path_info):
244244
with self.ssh(path_info) as ssh:

dvc/tree/webdav.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,7 @@ def get_file_hash(self, path_info):
142142
"Content-MD5 header for '{url}'".format(url=path_info.url)
143143
)
144144

145-
return etag
145+
return self.PARAM_CHECKSUM, etag
146146

147147
# Checks whether path points to directory
148148
def isdir(self, path_info):

tests/func/test_tree.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,7 @@ def test_repotree_cache_save(tmp_dir, dvc, scm, erepo_dir, local_cloud):
211211
# into dvc.cache, not fetched or streamed from a remote
212212
tree = RepoTree(erepo_dir.dvc, stream=True)
213213
expected = [
214-
tree.get_file_hash(PathInfo(erepo_dir / path))
214+
tree.get_file_hash(PathInfo(erepo_dir / path))[1]
215215
for path in ("dir/bar", "dir/subdir/foo")
216216
]
217217

tests/unit/remote/test_azure.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def test_get_file_hash(tmp_dir, azure):
3636
to_info = azure
3737
tree.upload(PathInfo("foo"), to_info)
3838
assert tree.exists(to_info)
39-
hash_ = tree.get_file_hash(to_info)
39+
_, hash_ = tree.get_file_hash(to_info)
4040
assert hash_
4141
assert isinstance(hash_, str)
4242
assert hash_.strip("'").strip('"') == hash_

0 commit comments

Comments (0)