-
Notifications
You must be signed in to change notification settings - Fork 1.2k
remote: use .dir checksum existence to infer file contents existence #3632
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
1443626
5a77d6a
ad68c27
edf2ec1
b5e5574
60ff14c
45f2deb
02aaf42
e914aae
266d1ab
448876d
9bc3d78
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,9 +2,11 @@ | |
import logging | ||
import os | ||
import stat | ||
from concurrent.futures import ThreadPoolExecutor | ||
from concurrent.futures import as_completed, ThreadPoolExecutor | ||
from functools import partial | ||
|
||
from funcy import concat | ||
|
||
from shortuuid import uuid | ||
|
||
from dvc.compat import fspath_py35 | ||
|
@@ -255,37 +257,102 @@ def status( | |
show_checksums=False, | ||
download=False, | ||
): | ||
# Return flattened dict containing all status info | ||
dir_status, file_status, _ = self._status( | ||
named_cache, | ||
remote, | ||
jobs=jobs, | ||
show_checksums=show_checksums, | ||
download=download, | ||
) | ||
return dict(dir_status, **file_status) | ||
|
||
def _status( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. a bit late to the party :) excellent PR overall. will mention a few things that I noticed reviewing that will might help later somehow. this function definitely wants some split I would say... extract presentation logic out for example, and the that checks dir and excludes it? or may be some other refactoring ... but it's too long and complicated now overall "internal-client" functions should be easy to read, even if it feels that extraction does not make much sense (e.g. helpers that are used only in a single place) think how will developers read this. It's easier to read story-like main function and go into details if needed |
||
self, | ||
named_cache, | ||
remote, | ||
jobs=None, | ||
show_checksums=False, | ||
download=False, | ||
): | ||
"""Return a tuple of (dir_status_info, file_status_info, dir_mapping). | ||
|
||
dir_status_info contains status for .dir files, file_status_info | ||
contains status for all other files, and dir_mapping is a dict of | ||
{dir_path_info: set(file_path_info...)} which can be used to map | ||
a .dir file to its file contents. | ||
""" | ||
logger.debug( | ||
"Preparing to collect status from {}".format(remote.path_info) | ||
) | ||
md5s = list(named_cache[self.scheme]) | ||
md5s = set(named_cache.scheme_keys(self.scheme)) | ||
|
||
logger.debug("Collecting information from local cache...") | ||
local_exists = self.cache_exists(md5s, jobs=jobs, name=self.cache_dir) | ||
local_exists = frozenset( | ||
self.cache_exists(md5s, jobs=jobs, name=self.cache_dir) | ||
) | ||
|
||
# This is a performance optimization. We can safely assume that, | ||
# if the resources that we want to fetch are already cached, | ||
# there's no need to check the remote storage for the existence of | ||
# those files. | ||
if download and sorted(local_exists) == sorted(md5s): | ||
if download and local_exists == md5s: | ||
remote_exists = local_exists | ||
else: | ||
logger.debug("Collecting information from remote cache...") | ||
remote_exists = list( | ||
remote.cache_exists( | ||
md5s, jobs=jobs, name=str(remote.path_info) | ||
remote_exists = set() | ||
dir_md5s = set(named_cache.dir_keys(self.scheme)) | ||
if dir_md5s: | ||
# If .dir checksum exists on the remote, assume directory | ||
# contents also exists on the remote | ||
for dir_checksum in remote._cache_object_exists(dir_md5s): | ||
file_checksums = list( | ||
named_cache.child_keys(self.scheme, dir_checksum) | ||
) | ||
logger.debug( | ||
"'{}' exists on remote, " | ||
"assuming '{}' files also exist".format( | ||
dir_checksum, len(file_checksums) | ||
) | ||
) | ||
md5s.remove(dir_checksum) | ||
remote_exists.add(dir_checksum) | ||
md5s.difference_update(file_checksums) | ||
remote_exists.update(file_checksums) | ||
if md5s: | ||
remote_exists.update( | ||
remote.cache_exists( | ||
md5s, jobs=jobs, name=str(remote.path_info) | ||
) | ||
) | ||
) | ||
|
||
ret = { | ||
checksum: {"name": checksum if show_checksums else " ".join(names)} | ||
for checksum, names in named_cache[self.scheme].items() | ||
} | ||
self._fill_statuses(ret, local_exists, remote_exists) | ||
def make_names(checksum, names): | ||
return {"name": checksum if show_checksums else " ".join(names)} | ||
|
||
dir_status = {} | ||
file_status = {} | ||
dir_paths = {} | ||
for checksum, item in named_cache[self.scheme].items(): | ||
if item.children: | ||
dir_status[checksum] = make_names(checksum, item.names) | ||
file_status.update( | ||
{ | ||
child_checksum: make_names(child_checksum, child.names) | ||
for child_checksum, child in item.children.items() | ||
} | ||
) | ||
dir_paths[remote.checksum_to_path_info(checksum)] = frozenset( | ||
map(remote.checksum_to_path_info, item.child_keys()) | ||
) | ||
else: | ||
file_status[checksum] = make_names(checksum, item.names) | ||
|
||
self._fill_statuses(dir_status, local_exists, remote_exists) | ||
self._fill_statuses(file_status, local_exists, remote_exists) | ||
|
||
self._log_missing_caches(ret) | ||
self._log_missing_caches(dict(dir_status, **file_status)) | ||
|
||
return ret | ||
return dir_status, file_status, dir_paths | ||
|
||
@staticmethod | ||
def _fill_statuses(checksum_info_dir, local_exists, remote_exists): | ||
|
@@ -347,31 +414,76 @@ def _process( | |
if jobs is None: | ||
jobs = remote.JOBS | ||
|
||
status_info = self.status( | ||
dir_status, file_status, dir_paths = self._status( | ||
named_cache, | ||
remote, | ||
jobs=jobs, | ||
show_checksums=show_checksums, | ||
download=download, | ||
) | ||
|
||
plans = self._get_plans(download, remote, status_info, status) | ||
dir_plans = self._get_plans(download, remote, dir_status, status) | ||
file_plans = self._get_plans(download, remote, file_status, status) | ||
|
||
if len(plans[0]) == 0: | ||
if len(dir_plans[0]) + len(file_plans[0]) == 0: | ||
return 0 | ||
|
||
if jobs > 1: | ||
with ThreadPoolExecutor(max_workers=jobs) as executor: | ||
fails = sum(executor.map(func, *plans)) | ||
else: | ||
fails = sum(map(func, *plans)) | ||
with ThreadPoolExecutor(max_workers=jobs) as executor: | ||
if download: | ||
fails = sum(executor.map(func, *dir_plans)) | ||
fails += sum(executor.map(func, *file_plans)) | ||
else: | ||
# for uploads, push files first, and any .dir files last | ||
|
||
file_futures = {} | ||
for from_info, to_info, name in zip(*file_plans): | ||
file_futures[to_info] = executor.submit( | ||
func, from_info, to_info, name | ||
) | ||
dir_futures = {} | ||
for from_info, to_info, name in zip(*dir_plans): | ||
wait_futures = { | ||
future | ||
for file_path, future in file_futures.items() | ||
if file_path in dir_paths[to_info] | ||
} | ||
dir_futures[to_info] = executor.submit( | ||
self._dir_upload, | ||
func, | ||
wait_futures, | ||
from_info, | ||
to_info, | ||
name, | ||
) | ||
fails = sum( | ||
future.result() | ||
for future in concat( | ||
file_futures.values(), dir_futures.values() | ||
) | ||
) | ||
|
||
if fails: | ||
if download: | ||
raise DownloadError(fails) | ||
raise UploadError(fails) | ||
|
||
return len(plans[0]) | ||
return len(dir_plans[0]) + len(file_plans[0]) | ||
|
||
@staticmethod | ||
def _dir_upload(func, futures, from_info, to_info, name): | ||
for future in as_completed(futures): | ||
if future.result(): | ||
# do not upload this .dir file if any file in this | ||
# directory failed to upload | ||
logger.debug( | ||
"failed to upload full contents of '{}', " | ||
"aborting .dir file upload".format(name) | ||
) | ||
logger.error( | ||
"failed to upload '{}' to '{}'".format(from_info, to_info) | ||
) | ||
return 1 | ||
return func(from_info, to_info, name) | ||
|
||
def push(self, named_cache, remote, jobs=None, show_checksums=False): | ||
return self._process( | ||
|
Uh oh!
There was an error while loading. Please reload this page.