From c57e74f523cc74aa6ad03a980d5c72eece826c6e Mon Sep 17 00:00:00 2001 From: Batuhan Taskaya Date: Mon, 21 Dec 2020 20:26:29 +0300 Subject: [PATCH 01/36] [WIP] Implement export-to-remote functionality --- dvc/command/data_sync.py | 48 ++++++++++++++ dvc/istextfile.py | 29 ++++---- dvc/repo/__init__.py | 1 + dvc/repo/export_to_remote.py | 29 ++++++++ dvc/tree/base.py | 124 ++++++++++++++++++++++++++++++++++- dvc/tree/s3.py | 9 +++ dvc/utils/http.py | 5 +- dvc/utils/stream.py | 31 +++++++++ 8 files changed, 258 insertions(+), 18 deletions(-) create mode 100644 dvc/repo/export_to_remote.py diff --git a/dvc/command/data_sync.py b/dvc/command/data_sync.py index 5bbfdf4dce..c6ad8185c6 100644 --- a/dvc/command/data_sync.py +++ b/dvc/command/data_sync.py @@ -94,6 +94,22 @@ def run(self): return 0 +class CmdDataExportToRemote(CmdDataBase): + def run(self): + try: + processed_files_count = self.repo.export_to_remote( + source=self.args.source, + destination=self.args.destination, + remote=self.args.remote, + jobs=self.args.jobs, + ) + self.log_summary({"exported": processed_files_count}) + except DvcException: + logger.exception("failed to export data to the remote") + return 1 + return 0 + + def shared_parent_parser(): from dvc.cli import get_parent_parser @@ -125,8 +141,40 @@ def shared_parent_parser(): def add_parser(subparsers, _parent_parser): + from dvc.cli import get_parent_parser from dvc.command.status import CmdDataStatus + # Export to Remote + EXPORT_HELP = "Exports given data to the remote." + + export_parser = subparsers.add_parser( + "export-to-remote", + parents=[get_parent_parser()], + description=append_doc_link(EXPORT_HELP, "export-to-remote"), + help=EXPORT_HELP, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + export_parser.add_argument( + "source", help="Source location to the file that is being exported" + ) + export_parser.add_argument("destination", help="The name of the file") + export_parser.add_argument( + "-r", "--remote", help="Remote storage to export to", metavar="", + ) + export_parser.add_argument( + "-j", + "--jobs", + type=int, + help=( + "Number of jobs to run simultaneously. " + "The default value is 4 * cpu_count(). " + "For SSH remotes, the default is 4. " + ), + metavar="", + ) + + export_parser.set_defaults(func=CmdDataExportToRemote) + # Pull PULL_HELP = "Download tracked files or directories from remote storage." diff --git a/dvc/istextfile.py b/dvc/istextfile.py index e4c29fdcf1..917a6e3541 100644 --- a/dvc/istextfile.py +++ b/dvc/istextfile.py @@ -7,19 +7,7 @@ TEXT_CHARS = bytes(range(32, 127)) + b"\n\r\t\f\b" -def istextfile(fname, blocksize=512, tree=None): - """ Uses heuristics to guess whether the given file is text or binary, - by reading a single block of bytes from the file. - If more than 30% of the chars in the block are non-text, or there - are NUL ('\x00') bytes in the block, assume this is a binary file. - """ - if tree: - open_func = tree.open - else: - open_func = open - with open_func(fname, "rb") as fobj: - block = fobj.read(blocksize) - +def istextblock(block): if not block: # An empty file is considered a valid text file return True @@ -32,3 +20,18 @@ def istextfile(fname, blocksize=512, tree=None): # occurrences of TEXT_CHARS from the block nontext = block.translate(None, TEXT_CHARS) return float(len(nontext)) / len(block) <= 0.30 + + +def istextfile(fname, blocksize=512, tree=None): + """ Uses heuristics to guess whether the given file is text or binary, + by reading a single block of bytes from the file. 
+ If more than 30% of the chars in the block are non-text, or there + are NUL ('\x00') bytes in the block, assume this is a binary file. + """ + if tree: + open_func = tree.open + else: + open_func = open + with open_func(fname, "rb") as fobj: + block = fobj.read(blocksize) + return istextblock(block) diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index 4b64fd0cda..b5267243eb 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -60,6 +60,7 @@ class Repo: from dvc.repo.commit import commit from dvc.repo.destroy import destroy from dvc.repo.diff import diff + from dvc.repo.export_to_remote import export_to_remote from dvc.repo.fetch import fetch from dvc.repo.freeze import freeze, unfreeze from dvc.repo.gc import gc diff --git a/dvc/repo/export_to_remote.py b/dvc/repo/export_to_remote.py new file mode 100644 index 0000000000..d2997a4f23 --- /dev/null +++ b/dvc/repo/export_to_remote.py @@ -0,0 +1,29 @@ +from dvc.utils import resolve_paths + +from . import locked + + +@locked +def export_to_remote(self, source, destination, remote=None, jobs=None): + from dvc.dvcfile import Dvcfile + from dvc.stage import Stage, create_stage + from dvc.tree import get_cloud_tree + + from_tree = get_cloud_tree(self, url=source) + from_tree.config["jobs"] = jobs + + remote_tree = self.cloud.get_remote(remote).tree + + hash_info = from_tree.export( + remote_tree, from_tree.path_info, remote_tree.path_info, repo=self, + ) + + path, _, _ = resolve_paths(self, destination) + stage = create_stage(Stage, self, path, outs=[destination]) + + dvcfile = Dvcfile(self, stage.path) + dvcfile.remove() + + stage.outs[0].hash_info = hash_info + dvcfile.dump(stage) + return hash_info.nfiles diff --git a/dvc/tree/base.py b/dvc/tree/base.py index 311b9263ac..5ef3ab9bfe 100644 --- a/dvc/tree/base.py +++ b/dvc/tree/base.py @@ -1,12 +1,15 @@ import logging +import tempfile +import uuid from concurrent.futures import ThreadPoolExecutor, as_completed from functools import partial from multiprocessing import cpu_count -from typing import Any, ClassVar, Dict, Optional +from typing import Any, ClassVar, Dict, Optional, Type from urllib.parse import urlparse from funcy import cached_property, decorator +from dvc import tree as dvc_tree from dvc.dir_info import DirInfo from dvc.exceptions import DvcException, DvcIgnoreInCollectedDirError from dvc.ignore import DvcIgnore @@ -16,6 +19,7 @@ from dvc.utils import tmp_fname from dvc.utils.fs import makedirs, move from dvc.utils.http import open_url +from dvc.utils.stream import HashedIterStream logger = logging.getLogger(__name__) @@ -59,6 +63,7 @@ class BaseTree: TRAVERSE_PREFIX_LEN = 3 TRAVERSE_THRESHOLD_SIZE = 500000 CAN_TRAVERSE = True + CHUNK_SIZE = 5 * 1024 ** 3 SHARED_MODE_MAP = {None: (None, None), "group": (None, None)} PARAM_CHECKSUM: ClassVar[Optional[str]] = None @@ -168,12 +173,20 @@ def dir_mode(self): def cache(self): return getattr(self.repo.cache, self.scheme) - def open(self, path_info, mode: str = "r", encoding: str = None): + def open( + self, + path_info, + mode: str = "r", + encoding: str = None, + stream_cls: Optional[Type] = None, + ): if hasattr(self, "_generate_download_url"): # pylint:disable=no-member func = self._generate_download_url # type: ignore[attr-defined] get_url = partial(func, path_info) - return open_url(get_url, mode=mode, encoding=encoding) + return open_url( + get_url, mode=mode, encoding=encoding, stream_cls=stream_cls + ) raise RemoteActionNotImplemented("open", self.scheme) @@ -382,6 +395,14 @@ def upload( file_mode=file_mode, ) 
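
For orientation, the pieces this patch introduces are meant to compose roughly
as follows -- a minimal sketch, where src_tree, dst_tree and the path infos are
illustrative names rather than anything from the diff:

    from dvc.utils.stream import HashedIterStream

    # Stream the source object through an md5-computing wrapper; the hash
    # only becomes known once the destination has drained the stream.
    with src_tree.open(info, mode="rb", stream_cls=HashedIterStream) as fobj:
        dst_tree.upload_multipart(fobj, tmp_info)
    hash_info = fobj.hash_info  # filled in while the upload consumed the data
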
+ def upload_multipart(self, stream, to_info, chunk_size=CHUNK_SIZE): + if hasattr(self, "_upload_multipart"): + self._upload_multipart( # pylint: disable=no-member + stream, to_info, chunk_size=chunk_size + ) + else: + raise RemoteActionNotImplemented("upload_multipart", self.scheme) + def download( self, from_info, @@ -489,3 +510,100 @@ def _download_file( ) move(tmp_file, to_info, mode=file_mode) + + def _get_cache_location(self, to_info, hash_info): + return to_info / hash_info.value[:2] / hash_info.value[2:] + + def _export_file_fallback(self, remote_tree, file_info, to_info, repo): + with tempfile.NamedTemporaryFile() as temp_file: + temp_tree = dvc_tree.get_cloud_tree(repo, url=temp_file.name) + self.download(file_info, temp_tree.path_info, no_progress_bar=True) + + hash_info = temp_tree.get_file_hash(temp_tree.path_info) + remote_tree.upload( + temp_tree.path_info, + self._get_cache_location(to_info, hash_info), + no_progress_bar=True, + ) + + return hash_info + + def _export_file_chunked(self, remote_tree, file_info, to_info): + temporary_location = to_info / str(uuid.uuid4()) + with self.open( + file_info, mode="rb", stream_cls=HashedIterStream + ) as hashed_file: + # Since we don't know the hash beforehand, we'll + # upload it to a temporary location and then move + # it. + remote_tree.upload_multipart(hashed_file, temporary_location) + hash_info = hashed_file.hash_info + + self.move( + temporary_location, self._get_cache_location(to_info, hash_info) + ) + return hash_info + + def _export_file(self, remote_tree, from_info, to_info, file_info, repo): + try: + hash_info = self._export_file_chunked( + remote_tree, file_info, to_info + ) + except RemoteActionNotImplemented: + hash_info = self._export_file_fallback( + remote_tree, file_info, to_info, repo + ) + + rel_target_filename = file_info.relative_to(from_info) + return rel_target_filename.parts, hash_info + + def export( + self, remote_tree, from_info, to_info, repo, no_progress_bar=False, + ): + try: + file_infos = list(self.walk_files(from_info)) + except NotImplementedError: + file_infos = [from_info] + + with Tqdm( + total=len(file_infos), + desc="Exporting files", + unit="Files", + disable=no_progress_bar, + ) as pbar: + export_single = pbar.wrap_fn( + partial( + self._export_file, + remote_tree, + from_info, + to_info, + repo=repo, + ) + ) + + if self.isdir(from_info): + dir_info = DirInfo() + with ThreadPoolExecutor(max_workers=self.jobs) as executor: + futures = [ + executor.submit(export_single, file_info) + for file_info in file_infos + ] + for future in futures: + key, file_hash_info = future.result() + dir_info.trie[key] = file_hash_info + + ( + hash_info, + hash_file_info, + ) = repo.cache.local._get_dir_info_hash( # noqa, pylint: disable=protected-access + dir_info + ) + remote_tree.upload( + hash_file_info, + to_info / hash_info.value[:2] / hash_info.value[2:], + no_progress_bar=True, + ) + else: + _, hash_info = export_single(from_info) + + return hash_info diff --git a/dvc/tree/s3.py b/dvc/tree/s3.py index 7b3eb0c10f..7b72c03dff 100644 --- a/dvc/tree/s3.py +++ b/dvc/tree/s3.py @@ -362,6 +362,15 @@ def _upload( from_file, Callback=pbar.update, ExtraArgs=self.extra_args, ) + def _upload_multipart(self, stream, to_info, chunk_size): + from boto3.s3.transfer import TransferConfig + + config = TransferConfig(multipart_threshold=chunk_size) + with self._get_s3() as s3: + s3.meta.client.upload_fileobj( + stream, to_info.bucket, to_info.path, Config=config + ) + def _download(self, from_info, to_file, name=None, 
no_progress_bar=False):
         with self._get_obj(from_info) as obj:
             with Tqdm(
diff --git a/dvc/utils/http.py b/dvc/utils/http.py
index 4472b80912..66eda51dbd 100644
--- a/dvc/utils/http.py
+++ b/dvc/utils/http.py
@@ -5,16 +5,17 @@
 
 
 @contextmanager
-def open_url(url, mode="r", encoding=None):
+def open_url(url, mode="r", encoding=None, stream_cls=None):
     """Opens an url as a readable stream.
 
     Resumes on connection error.
 
     Url could be a string or a callable returning a string.
     """
+    stream_cls = stream_cls or IterStream
     assert mode in {"r", "rt", "rb"}
 
     with iter_url(url) as (response, it):
-        bytes_stream = IterStream(it)
+        bytes_stream = stream_cls(it)
 
         if mode == "rb":
             yield bytes_stream
diff --git a/dvc/utils/stream.py b/dvc/utils/stream.py
index c2d3fe33ea..9581c19e72 100644
--- a/dvc/utils/stream.py
+++ b/dvc/utils/stream.py
@@ -1,5 +1,10 @@
+import hashlib
 import io
 
+from dvc.hash_info import HashInfo
+from dvc.istextfile import istextblock
+from dvc.utils import dos2unix
+
 
 class IterStream(io.RawIOBase):
     """Wraps an iterator yielding bytes as a file object"""
@@ -54,3 +59,29 @@ def peek(self, n):
             except StopIteration:
                 break
         return self.leftover[:n]
+
+class HashedIterStream(IterStream):
+
+    PARAM_CHECKSUM = "md5"
+
+    def __init__(self, iterator):
+        super().__init__(iterator)
+        self.md5 = hashlib.md5()
+        self.is_text_file = None
+
+    def read(self, n=-1):
+        chunk = super().read(n)
+        if self.is_text_file is None:
+            self.is_text_file = istextblock(chunk)
+
+        if self.is_text_file:
+            data = dos2unix(chunk)
+        else:
+            data = chunk
+        self.md5.update(data)
+
+        return chunk
+
+    @property
+    def hash_info(self):
+        return HashInfo(self.PARAM_CHECKSUM, self.md5.hexdigest(), nfiles=1)

From 64a3f48d0d554fb13baba46c766890fbb456b95f Mon Sep 17 00:00:00 2001
From: Batuhan Taskaya
Date: Mon, 4 Jan 2021 14:48:50 +0300
Subject: [PATCH 02/36] Redistribution of functionality from tree to remote

---
 dvc/data_cloud.py            |  15 ++++
 dvc/remote/base.py           | 133 +++++++++++++++++++++++++++++++++++
 dvc/repo/export_to_remote.py |  10 +--
 dvc/tree/base.py             | 101 --------------------------
 4 files changed, 150 insertions(+), 109 deletions(-)

diff --git a/dvc/data_cloud.py b/dvc/data_cloud.py
index c4dab8e8cd..09610f3306 100644
--- a/dvc/data_cloud.py
+++ b/dvc/data_cloud.py
@@ -92,6 +92,21 @@ def pull(
             show_checksums=show_checksums,
         )
 
+    def transfer_straight_to_remote(self, source, jobs=None, remote=None):
+        from dvc.tree import get_cloud_tree
+
+        from_tree = get_cloud_tree(self.repo, url=source)
+        from_tree.config["jobs"] = jobs
+        remote = self.get_remote(remote, "export-to-remote")
+
+        return remote.transfer_straight_to_remote(
+            from_tree,
+            remote.tree,
+            from_tree.path_info,
+            remote.tree.path_info,
+            self.repo,
+        )
+
     def status(
         self,
         cache,
diff --git a/dvc/remote/base.py b/dvc/remote/base.py
index eb2d2f86b2..2127a0b670 100644
--- a/dvc/remote/base.py
+++ b/dvc/remote/base.py
@@ -1,13 +1,19 @@
 import errno
 import hashlib
 import logging
+import tempfile
+import uuid
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from functools import partial, wraps
 
 from funcy import concat
 
+from dvc import tree as dvc_tree
+from dvc.dir_info import DirInfo
 from dvc.exceptions import DownloadError, UploadError
 from dvc.hash_info import HashInfo
+from dvc.tree.base import RemoteActionNotImplemented
+from dvc.utils.stream import HashedIterStream
 
 from ..progress import Tqdm
 from .index import RemoteIndex, RemoteIndexNoop
@@ -472,6 +478,133 @@ def pull(self, cache, named_cache, jobs=None, show_checksums=False):
 
         return ret
 
+    def 
_hash_to_cache_path(self, to_info, hash_info): + return to_info / hash_info.value[:2] / hash_info.value[2:] + + def _transfer_file_as_whole( + self, from_tree, to_tree, file_info, to_info, repo + ): + # When we can't use the chunked upload, we have to first download + # and then calculate the hash as if it were a local file and then + # upload it. + with tempfile.NamedTemporaryFile() as temp_file: + temp_tree = dvc_tree.get_cloud_tree(repo, url=temp_file.name) + from_tree.download( + file_info, temp_tree.path_info, no_progress_bar=True + ) + + hash_info = temp_tree.get_file_hash(temp_tree.path_info) + to_tree.upload( + temp_tree.path_info, + self._hash_to_cache_path(to_info, hash_info), + no_progress_bar=True, + ) + + return hash_info + + def _transfer_file_as_chunked( + self, from_tree, to_tree, file_info, to_info + ): + temporary_location = to_info / str(uuid.uuid4()) + with from_tree.open( + file_info, mode="rb", stream_cls=HashedIterStream + ) as hashed_file: + # Since we don't know the hash beforehand, we'll + # upload it to a temporary location and then move + # it. + to_tree.upload_multipart(hashed_file, temporary_location) + hash_info = hashed_file.hash_info + + to_tree.move( + temporary_location, self._hash_to_cache_path(to_info, hash_info) + ) + return hash_info + + def _transfer_file( + self, from_tree, to_tree, from_info, to_info, file_info, repo + ): + try: + hash_info = self._transfer_file_as_chunked( + from_tree, to_tree, file_info, to_info + ) + except RemoteActionNotImplemented: + hash_info = self._transfer_file_as_whole( + from_tree, to_tree, file_info, to_info, repo + ) + + rel_target_filename = file_info.relative_to(from_info) + return rel_target_filename.parts, hash_info + + def _transfer_directory( + self, from_tree, to_tree, to_info, repo, transfer_func, file_infos + ): + dir_info = DirInfo() + with ThreadPoolExecutor(max_workers=from_tree.jobs) as executor: + futures = [ + executor.submit(transfer_func, file_info) + for file_info in file_infos + ] + for future in as_completed(futures): + key, file_hash_info = future.result() + dir_info.trie[key] = file_hash_info + + ( + hash_info, + hash_file_info, + ) = repo.cache.local._get_dir_info_hash( # noqa, pylint: disable=protected-access + dir_info + ) + to_tree.upload( + hash_file_info, + self._hash_to_cache_path(to_info, hash_info), + no_progress_bar=True, + ) + return hash_info + + def transfer_straight_to_remote( + self, + from_tree, + to_tree, + from_info, + to_info, + repo, + no_progress_bar=False, + ): + try: + file_infos = list(from_tree.walk_files(from_info)) + except NotImplementedError: + file_infos = [from_info] + + with Tqdm( + total=len(file_infos), + desc="Transfering files to the remote", + unit="Files", + disable=no_progress_bar, + ) as pbar: + transfer_single = pbar.wrap_fn( + partial( + self._transfer_file, + from_tree, + to_tree, + from_info, + to_info, + repo=repo, + ) + ) + if from_tree.isdir(from_info): + hash_info = self._transfer_directory( + from_tree, + to_tree, + to_info, + repo, + transfer_single, + file_infos, + ) + else: + _, hash_info = transfer_single(from_info) + + return hash_info + @staticmethod def _log_missing_caches(hash_info_dict): missing_caches = [ diff --git a/dvc/repo/export_to_remote.py b/dvc/repo/export_to_remote.py index d2997a4f23..dca1b27b39 100644 --- a/dvc/repo/export_to_remote.py +++ b/dvc/repo/export_to_remote.py @@ -7,15 +7,9 @@ def export_to_remote(self, source, destination, remote=None, jobs=None): from dvc.dvcfile import Dvcfile from dvc.stage import Stage, 
create_stage - from dvc.tree import get_cloud_tree - from_tree = get_cloud_tree(self, url=source) - from_tree.config["jobs"] = jobs - - remote_tree = self.cloud.get_remote(remote).tree - - hash_info = from_tree.export( - remote_tree, from_tree.path_info, remote_tree.path_info, repo=self, + hash_info = self.cloud.transfer_straight_to_remote( + source, jobs=jobs, remote=remote ) path, _, _ = resolve_paths(self, destination) diff --git a/dvc/tree/base.py b/dvc/tree/base.py index 5ef3ab9bfe..10fe4ba5ce 100644 --- a/dvc/tree/base.py +++ b/dvc/tree/base.py @@ -1,6 +1,4 @@ import logging -import tempfile -import uuid from concurrent.futures import ThreadPoolExecutor, as_completed from functools import partial from multiprocessing import cpu_count @@ -9,7 +7,6 @@ from funcy import cached_property, decorator -from dvc import tree as dvc_tree from dvc.dir_info import DirInfo from dvc.exceptions import DvcException, DvcIgnoreInCollectedDirError from dvc.ignore import DvcIgnore @@ -19,7 +16,6 @@ from dvc.utils import tmp_fname from dvc.utils.fs import makedirs, move from dvc.utils.http import open_url -from dvc.utils.stream import HashedIterStream logger = logging.getLogger(__name__) @@ -510,100 +506,3 @@ def _download_file( ) move(tmp_file, to_info, mode=file_mode) - - def _get_cache_location(self, to_info, hash_info): - return to_info / hash_info.value[:2] / hash_info.value[2:] - - def _export_file_fallback(self, remote_tree, file_info, to_info, repo): - with tempfile.NamedTemporaryFile() as temp_file: - temp_tree = dvc_tree.get_cloud_tree(repo, url=temp_file.name) - self.download(file_info, temp_tree.path_info, no_progress_bar=True) - - hash_info = temp_tree.get_file_hash(temp_tree.path_info) - remote_tree.upload( - temp_tree.path_info, - self._get_cache_location(to_info, hash_info), - no_progress_bar=True, - ) - - return hash_info - - def _export_file_chunked(self, remote_tree, file_info, to_info): - temporary_location = to_info / str(uuid.uuid4()) - with self.open( - file_info, mode="rb", stream_cls=HashedIterStream - ) as hashed_file: - # Since we don't know the hash beforehand, we'll - # upload it to a temporary location and then move - # it. 
- remote_tree.upload_multipart(hashed_file, temporary_location) - hash_info = hashed_file.hash_info - - self.move( - temporary_location, self._get_cache_location(to_info, hash_info) - ) - return hash_info - - def _export_file(self, remote_tree, from_info, to_info, file_info, repo): - try: - hash_info = self._export_file_chunked( - remote_tree, file_info, to_info - ) - except RemoteActionNotImplemented: - hash_info = self._export_file_fallback( - remote_tree, file_info, to_info, repo - ) - - rel_target_filename = file_info.relative_to(from_info) - return rel_target_filename.parts, hash_info - - def export( - self, remote_tree, from_info, to_info, repo, no_progress_bar=False, - ): - try: - file_infos = list(self.walk_files(from_info)) - except NotImplementedError: - file_infos = [from_info] - - with Tqdm( - total=len(file_infos), - desc="Exporting files", - unit="Files", - disable=no_progress_bar, - ) as pbar: - export_single = pbar.wrap_fn( - partial( - self._export_file, - remote_tree, - from_info, - to_info, - repo=repo, - ) - ) - - if self.isdir(from_info): - dir_info = DirInfo() - with ThreadPoolExecutor(max_workers=self.jobs) as executor: - futures = [ - executor.submit(export_single, file_info) - for file_info in file_infos - ] - for future in futures: - key, file_hash_info = future.result() - dir_info.trie[key] = file_hash_info - - ( - hash_info, - hash_file_info, - ) = repo.cache.local._get_dir_info_hash( # noqa, pylint: disable=protected-access - dir_info - ) - remote_tree.upload( - hash_file_info, - to_info / hash_info.value[:2] / hash_info.value[2:], - no_progress_bar=True, - ) - else: - _, hash_info = export_single(from_info) - - return hash_info From e081a45f3aacc3c6e04131f8fbb48d402da1e783 Mon Sep 17 00:00:00 2001 From: Batuhan Taskaya Date: Tue, 5 Jan 2021 09:25:29 +0300 Subject: [PATCH 03/36] Pass chunk size into the open_url --- dvc/remote/base.py | 5 ++++- dvc/tree/base.py | 9 +++++++-- dvc/tree/s3.py | 9 +++++++-- dvc/utils/http.py | 4 ++-- dvc/utils/stream.py | 3 ++- 5 files changed, 22 insertions(+), 8 deletions(-) diff --git a/dvc/remote/base.py b/dvc/remote/base.py index 2127a0b670..e188107dbe 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -507,7 +507,10 @@ def _transfer_file_as_chunked( ): temporary_location = to_info / str(uuid.uuid4()) with from_tree.open( - file_info, mode="rb", stream_cls=HashedIterStream + file_info, + mode="rb", + stream_cls=HashedIterStream, + chunk_size=from_tree.CHUNK_SIZE, ) as hashed_file: # Since we don't know the hash beforehand, we'll # upload it to a temporary location and then move diff --git a/dvc/tree/base.py b/dvc/tree/base.py index 10fe4ba5ce..c7cbd2c621 100644 --- a/dvc/tree/base.py +++ b/dvc/tree/base.py @@ -59,7 +59,7 @@ class BaseTree: TRAVERSE_PREFIX_LEN = 3 TRAVERSE_THRESHOLD_SIZE = 500000 CAN_TRAVERSE = True - CHUNK_SIZE = 5 * 1024 ** 3 + CHUNK_SIZE = 512 * 1024 * 1024 SHARED_MODE_MAP = {None: (None, None), "group": (None, None)} PARAM_CHECKSUM: ClassVar[Optional[str]] = None @@ -175,13 +175,18 @@ def open( mode: str = "r", encoding: str = None, stream_cls: Optional[Type] = None, + **kwargs, ): if hasattr(self, "_generate_download_url"): # pylint:disable=no-member func = self._generate_download_url # type: ignore[attr-defined] get_url = partial(func, path_info) return open_url( - get_url, mode=mode, encoding=encoding, stream_cls=stream_cls + get_url, + mode=mode, + encoding=encoding, + stream_cls=stream_cls, + **kwargs, ) raise RemoteActionNotImplemented("open", self.scheme) diff --git a/dvc/tree/s3.py 
b/dvc/tree/s3.py index 7b72c03dff..1adeacfa1a 100644 --- a/dvc/tree/s3.py +++ b/dvc/tree/s3.py @@ -365,10 +365,15 @@ def _upload( def _upload_multipart(self, stream, to_info, chunk_size): from boto3.s3.transfer import TransferConfig - config = TransferConfig(multipart_threshold=chunk_size) + config = TransferConfig( + multipart_threshold=chunk_size, + multipart_chunksize=chunk_size, + max_concurrency=1, + use_threads=False, + ) with self._get_s3() as s3: s3.meta.client.upload_fileobj( - stream, to_info.bucket, to_info.path, Config=config + stream, to_info.bucket, to_info.path, Config=config, ) def _download(self, from_info, to_file, name=None, no_progress_bar=False): diff --git a/dvc/utils/http.py b/dvc/utils/http.py index 66eda51dbd..593fd3e413 100644 --- a/dvc/utils/http.py +++ b/dvc/utils/http.py @@ -5,7 +5,7 @@ @contextmanager -def open_url(url, mode="r", encoding=None, stream_cls=None): +def open_url(url, mode="r", encoding=None, stream_cls=None, **kwargs): """Opens an url as a readable stream. Resumes on connection error. @@ -14,7 +14,7 @@ def open_url(url, mode="r", encoding=None, stream_cls=None): stream_cls = stream_cls or IterStream assert mode in {"r", "rt", "rb"} - with iter_url(url) as (response, it): + with iter_url(url, **kwargs) as (response, it): bytes_stream = stream_cls(it) if mode == "rb": diff --git a/dvc/utils/stream.py b/dvc/utils/stream.py index 9581c19e72..d6b5845ce2 100644 --- a/dvc/utils/stream.py +++ b/dvc/utils/stream.py @@ -60,6 +60,7 @@ def peek(self, n): break return self.leftover[:n] + class HashedIterStream(IterStream): PARAM_CHECKSUM = "md5" @@ -70,7 +71,7 @@ def __init__(self, iterator): self.is_text_file = None def read(self, n=-1): - chunk = super().read(n) + chunk = super().read1(n) if self.is_text_file is None: self.is_text_file = istextblock(chunk) From bd331c920cbf3ac432e62536a1a12d72cd5e2ff8 Mon Sep 17 00:00:00 2001 From: Batuhan Taskaya Date: Tue, 5 Jan 2021 11:03:27 +0300 Subject: [PATCH 04/36] Move functionality away from remote to cache --- dvc/cache/base.py | 74 +++++++++++++++++++- dvc/data_cloud.py | 13 +--- dvc/remote/base.py | 131 +++-------------------------------- dvc/repo/export_to_remote.py | 4 +- 4 files changed, 83 insertions(+), 139 deletions(-) diff --git a/dvc/cache/base.py b/dvc/cache/base.py index 4ec3683d26..0bf4381fd8 100644 --- a/dvc/cache/base.py +++ b/dvc/cache/base.py @@ -3,7 +3,8 @@ import json import logging import os -from concurrent.futures import ThreadPoolExecutor +import tempfile +from concurrent.futures import ThreadPoolExecutor, as_completed from copy import copy from typing import Optional @@ -11,6 +12,7 @@ from shortuuid import uuid import dvc.prompt as prompt +from dvc import tree as dvc_tree from dvc.dir_info import DirInfo from dvc.exceptions import ( CacheLinkError, @@ -22,6 +24,7 @@ from dvc.remote.slow_link_detection import ( # type: ignore[attr-defined] slow_link_guard, ) +from dvc.utils.stream import HashedIterStream from ..tree.base import RemoteActionNotImplemented @@ -247,6 +250,73 @@ def _save_file(self, path_info, tree, hash_info, save_link=True, **kwargs): self.tree.state.save(cache_info, hash_info) + def _transfer_file_as_whole(self, from_tree, from_info): + # When we can't use the chunked upload, we have to first download + # and then calculate the hash as if it were a local file and then + # upload it. 
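+        # (Trees that lack an _upload_multipart implementation raise
+        # RemoteActionNotImplemented from the chunked path, and only then
+        # do we pay for the full local round trip below.)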
+ with tempfile.NamedTemporaryFile() as temp_file: + temp_tree = dvc_tree.get_cloud_tree(self.repo, url=temp_file.name) + from_tree.download( + from_info, temp_tree.path_info, no_progress_bar=True + ) + + hash_info = temp_tree.get_file_hash(temp_tree.path_info) + self.tree.upload( + temp_tree.path_info, + self.hash_to_path(hash_info), + no_progress_bar=True, + ) + + return hash_info + + def _transfer_file_as_chunked(self, from_tree, from_info): + temporary_location = self.tree.path_info / uuid() + with from_tree.open( + from_info, + mode="rb", + stream_cls=HashedIterStream, + chunk_size=from_tree.CHUNK_SIZE, + ) as hashed_file: + # Since we don't know the hash beforehand, we'll + # upload it to a temporary location and then move + # it. + self.tree.upload_multipart(hashed_file, temporary_location) + hash_info = hashed_file.hash_info + + self.tree.move(temporary_location, self.hash_to_path(hash_info.value)) + return hash_info + + def transfer_file(self, from_tree, from_info): + try: + hash_info = self._transfer_file_as_chunked(from_tree, from_info) + except RemoteActionNotImplemented: + hash_info = self._transfer_file_as_whole(from_tree, from_info) + + rel_target_filename = from_info.relative_to(from_tree.path_info) + return rel_target_filename.parts, hash_info + + def transfer_directory(self, from_tree, from_infos, func): + dir_info = DirInfo() + with ThreadPoolExecutor(max_workers=from_tree.jobs) as executor: + futures = [ + executor.submit(func, from_tree, from_info) + for from_info in from_infos + ] + for future in as_completed(futures): + key, file_hash_info = future.result() + dir_info.trie[key] = file_hash_info + + ( + hash_info, + hash_path_info, + ) = self._get_dir_info_hash( # noqa, pylint: disable=protected-access + dir_info + ) + self.tree.move( + hash_path_info, self.hash_to_path(hash_info.value), + ) + return hash_info + def _cache_is_copy(self, path_info): """Checks whether cache uses copies.""" if self.cache_type_confirmed: @@ -270,8 +340,6 @@ def _cache_is_copy(self, path_info): return self.cache_types[0] == "copy" def _get_dir_info_hash(self, dir_info): - import tempfile - from dvc.path_info import PathInfo from dvc.utils import tmp_fname diff --git a/dvc/data_cloud.py b/dvc/data_cloud.py index 09610f3306..4a4482fa8c 100644 --- a/dvc/data_cloud.py +++ b/dvc/data_cloud.py @@ -92,20 +92,11 @@ def pull( show_checksums=show_checksums, ) - def transfer_straight_to_remote(self, source, jobs=None, remote=None): - from dvc.tree import get_cloud_tree - + def transfer(self, source, jobs=None, remote=None): from_tree = get_cloud_tree(self.repo, url=source) from_tree.config["jobs"] = jobs remote = self.get_remote(remote, "export-to-remote") - - return remote.transfer_straight_to_remote( - from_tree, - remote.tree, - from_tree.path_info, - remote.tree.path_info, - self.repo, - ) + return remote.transfer(from_tree) def status( self, diff --git a/dvc/remote/base.py b/dvc/remote/base.py index e188107dbe..5674999e80 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -1,19 +1,13 @@ import errno import hashlib import logging -import tempfile -import uuid from concurrent.futures import ThreadPoolExecutor, as_completed from functools import partial, wraps from funcy import concat -from dvc import tree as dvc_tree -from dvc.dir_info import DirInfo from dvc.exceptions import DownloadError, UploadError from dvc.hash_info import HashInfo -from dvc.tree.base import RemoteActionNotImplemented -from dvc.utils.stream import HashedIterStream from ..progress import Tqdm from .index import 
RemoteIndex, RemoteIndexNoop @@ -478,133 +472,26 @@ def pull(self, cache, named_cache, jobs=None, show_checksums=False): return ret - def _hash_to_cache_path(self, to_info, hash_info): - return to_info / hash_info.value[:2] / hash_info.value[2:] - - def _transfer_file_as_whole( - self, from_tree, to_tree, file_info, to_info, repo - ): - # When we can't use the chunked upload, we have to first download - # and then calculate the hash as if it were a local file and then - # upload it. - with tempfile.NamedTemporaryFile() as temp_file: - temp_tree = dvc_tree.get_cloud_tree(repo, url=temp_file.name) - from_tree.download( - file_info, temp_tree.path_info, no_progress_bar=True - ) - - hash_info = temp_tree.get_file_hash(temp_tree.path_info) - to_tree.upload( - temp_tree.path_info, - self._hash_to_cache_path(to_info, hash_info), - no_progress_bar=True, - ) - - return hash_info - - def _transfer_file_as_chunked( - self, from_tree, to_tree, file_info, to_info - ): - temporary_location = to_info / str(uuid.uuid4()) - with from_tree.open( - file_info, - mode="rb", - stream_cls=HashedIterStream, - chunk_size=from_tree.CHUNK_SIZE, - ) as hashed_file: - # Since we don't know the hash beforehand, we'll - # upload it to a temporary location and then move - # it. - to_tree.upload_multipart(hashed_file, temporary_location) - hash_info = hashed_file.hash_info - - to_tree.move( - temporary_location, self._hash_to_cache_path(to_info, hash_info) - ) - return hash_info - - def _transfer_file( - self, from_tree, to_tree, from_info, to_info, file_info, repo - ): - try: - hash_info = self._transfer_file_as_chunked( - from_tree, to_tree, file_info, to_info - ) - except RemoteActionNotImplemented: - hash_info = self._transfer_file_as_whole( - from_tree, to_tree, file_info, to_info, repo - ) - - rel_target_filename = file_info.relative_to(from_info) - return rel_target_filename.parts, hash_info - - def _transfer_directory( - self, from_tree, to_tree, to_info, repo, transfer_func, file_infos - ): - dir_info = DirInfo() - with ThreadPoolExecutor(max_workers=from_tree.jobs) as executor: - futures = [ - executor.submit(transfer_func, file_info) - for file_info in file_infos - ] - for future in as_completed(futures): - key, file_hash_info = future.result() - dir_info.trie[key] = file_hash_info - - ( - hash_info, - hash_file_info, - ) = repo.cache.local._get_dir_info_hash( # noqa, pylint: disable=protected-access - dir_info - ) - to_tree.upload( - hash_file_info, - self._hash_to_cache_path(to_info, hash_info), - no_progress_bar=True, - ) - return hash_info - - def transfer_straight_to_remote( - self, - from_tree, - to_tree, - from_info, - to_info, - repo, - no_progress_bar=False, - ): + def transfer(self, from_tree, no_progress_bar=False): + from_info = from_tree.path_info try: - file_infos = list(from_tree.walk_files(from_info)) + from_infos = list(from_tree.walk_files(from_info)) except NotImplementedError: - file_infos = [from_info] + from_infos = [from_info] with Tqdm( - total=len(file_infos), + total=len(from_infos), desc="Transfering files to the remote", unit="Files", disable=no_progress_bar, ) as pbar: - transfer_single = pbar.wrap_fn( - partial( - self._transfer_file, - from_tree, - to_tree, - from_info, - to_info, - repo=repo, - ) - ) + transfer_file = pbar.wrap_fn(self.cache.transfer_file) if from_tree.isdir(from_info): - hash_info = self._transfer_directory( - from_tree, - to_tree, - to_info, - repo, - transfer_single, - file_infos, + hash_info = self.cache.transfer_directory( + from_tree, from_infos, 
transfer_file ) else: - _, hash_info = transfer_single(from_info) + _, hash_info = transfer_file(from_tree, from_info) return hash_info diff --git a/dvc/repo/export_to_remote.py b/dvc/repo/export_to_remote.py index dca1b27b39..a6617cb0cb 100644 --- a/dvc/repo/export_to_remote.py +++ b/dvc/repo/export_to_remote.py @@ -8,9 +8,7 @@ def export_to_remote(self, source, destination, remote=None, jobs=None): from dvc.dvcfile import Dvcfile from dvc.stage import Stage, create_stage - hash_info = self.cloud.transfer_straight_to_remote( - source, jobs=jobs, remote=remote - ) + hash_info = self.cloud.transfer(source, jobs=jobs, remote=remote) path, _, _ = resolve_paths(self, destination) stage = create_stage(Stage, self, path, outs=[destination]) From faf40168ccb5cdc078053d74e47df7a424f6bf72 Mon Sep 17 00:00:00 2001 From: Batuhan Taskaya Date: Tue, 5 Jan 2021 11:49:28 +0300 Subject: [PATCH 05/36] Basic progress bars for both transfer types --- dvc/cache/base.py | 10 ++-------- dvc/tree/base.py | 9 +++++++-- dvc/tree/s3.py | 15 +++++++++++---- 3 files changed, 20 insertions(+), 14 deletions(-) diff --git a/dvc/cache/base.py b/dvc/cache/base.py index 0bf4381fd8..d465eb9e9f 100644 --- a/dvc/cache/base.py +++ b/dvc/cache/base.py @@ -256,16 +256,10 @@ def _transfer_file_as_whole(self, from_tree, from_info): # upload it. with tempfile.NamedTemporaryFile() as temp_file: temp_tree = dvc_tree.get_cloud_tree(self.repo, url=temp_file.name) - from_tree.download( - from_info, temp_tree.path_info, no_progress_bar=True - ) + from_tree.download(from_info, temp_tree.path_info) hash_info = temp_tree.get_file_hash(temp_tree.path_info) - self.tree.upload( - temp_tree.path_info, - self.hash_to_path(hash_info), - no_progress_bar=True, - ) + self.tree.upload(temp_tree.path_info, self.hash_to_path(hash_info)) return hash_info diff --git a/dvc/tree/base.py b/dvc/tree/base.py index c7cbd2c621..e4121ed8e6 100644 --- a/dvc/tree/base.py +++ b/dvc/tree/base.py @@ -396,10 +396,15 @@ def upload( file_mode=file_mode, ) - def upload_multipart(self, stream, to_info, chunk_size=CHUNK_SIZE): + def upload_multipart( + self, stream, to_info, chunk_size=CHUNK_SIZE, no_progress_bar=False + ): if hasattr(self, "_upload_multipart"): self._upload_multipart( # pylint: disable=no-member - stream, to_info, chunk_size=chunk_size + stream, + to_info, + chunk_size=chunk_size, + no_progress_bar=no_progress_bar, ) else: raise RemoteActionNotImplemented("upload_multipart", self.scheme) diff --git a/dvc/tree/s3.py b/dvc/tree/s3.py index 1adeacfa1a..b724e5e324 100644 --- a/dvc/tree/s3.py +++ b/dvc/tree/s3.py @@ -362,7 +362,9 @@ def _upload( from_file, Callback=pbar.update, ExtraArgs=self.extra_args, ) - def _upload_multipart(self, stream, to_info, chunk_size): + def _upload_multipart( + self, stream, to_info, chunk_size, no_progress_bar=False + ): from boto3.s3.transfer import TransferConfig config = TransferConfig( @@ -372,9 +374,14 @@ def _upload_multipart(self, stream, to_info, chunk_size): use_threads=False, ) with self._get_s3() as s3: - s3.meta.client.upload_fileobj( - stream, to_info.bucket, to_info.path, Config=config, - ) + with Tqdm(disable=no_progress_bar, bytes=True) as pbar: + s3.meta.client.upload_fileobj( + stream, + to_info.bucket, + to_info.path, + Config=config, + Callback=pbar.update, + ) def _download(self, from_info, to_file, name=None, no_progress_bar=False): with self._get_obj(from_info) as obj: From 9cbce0c01c0e6efcbb65f4b5a91377e4596df6f5 Mon Sep 17 00:00:00 2001 From: Batuhan Taskaya Date: Tue, 5 Jan 2021 12:18:57 +0300 Subject: 
[PATCH 06/36] Proper job calculation --- dvc/cache/base.py | 4 ++-- dvc/data_cloud.py | 3 +-- dvc/remote/base.py | 5 +++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/dvc/cache/base.py b/dvc/cache/base.py index d465eb9e9f..12e4de27c6 100644 --- a/dvc/cache/base.py +++ b/dvc/cache/base.py @@ -289,9 +289,9 @@ def transfer_file(self, from_tree, from_info): rel_target_filename = from_info.relative_to(from_tree.path_info) return rel_target_filename.parts, hash_info - def transfer_directory(self, from_tree, from_infos, func): + def transfer_directory(self, from_tree, from_infos, func, jobs): dir_info = DirInfo() - with ThreadPoolExecutor(max_workers=from_tree.jobs) as executor: + with ThreadPoolExecutor(max_workers=jobs) as executor: futures = [ executor.submit(func, from_tree, from_info) for from_info in from_infos diff --git a/dvc/data_cloud.py b/dvc/data_cloud.py index 4a4482fa8c..c7dbda63e7 100644 --- a/dvc/data_cloud.py +++ b/dvc/data_cloud.py @@ -94,9 +94,8 @@ def pull( def transfer(self, source, jobs=None, remote=None): from_tree = get_cloud_tree(self.repo, url=source) - from_tree.config["jobs"] = jobs remote = self.get_remote(remote, "export-to-remote") - return remote.transfer(from_tree) + return remote.transfer(from_tree, jobs=jobs) def status( self, diff --git a/dvc/remote/base.py b/dvc/remote/base.py index 5674999e80..b2ca47fc75 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -472,7 +472,8 @@ def pull(self, cache, named_cache, jobs=None, show_checksums=False): return ret - def transfer(self, from_tree, no_progress_bar=False): + def transfer(self, from_tree, jobs=None, no_progress_bar=False): + jobs = jobs or min((from_tree.jobs, self.tree.jobs)) from_info = from_tree.path_info try: from_infos = list(from_tree.walk_files(from_info)) @@ -488,7 +489,7 @@ def transfer(self, from_tree, no_progress_bar=False): transfer_file = pbar.wrap_fn(self.cache.transfer_file) if from_tree.isdir(from_info): hash_info = self.cache.transfer_directory( - from_tree, from_infos, transfer_file + from_tree, from_infos, transfer_file, jobs=jobs ) else: _, hash_info = transfer_file(from_tree, from_info) From c590729a34b0d825e9bf96d8b04e56f8053680ec Mon Sep 17 00:00:00 2001 From: Batuhan Taskaya Date: Tue, 5 Jan 2021 13:13:35 +0300 Subject: [PATCH 07/36] Redesign .open() usage in order to work with different trees --- dvc/cache/base.py | 17 ++++++++--------- dvc/tree/base.py | 17 +++-------------- dvc/tree/hdfs.py | 2 +- dvc/tree/local.py | 2 +- dvc/utils/http.py | 7 +++---- dvc/utils/stream.py | 8 ++++---- 6 files changed, 20 insertions(+), 33 deletions(-) diff --git a/dvc/cache/base.py b/dvc/cache/base.py index 12e4de27c6..c1d27b574d 100644 --- a/dvc/cache/base.py +++ b/dvc/cache/base.py @@ -266,19 +266,18 @@ def _transfer_file_as_whole(self, from_tree, from_info): def _transfer_file_as_chunked(self, from_tree, from_info): temporary_location = self.tree.path_info / uuid() with from_tree.open( - from_info, - mode="rb", - stream_cls=HashedIterStream, - chunk_size=from_tree.CHUNK_SIZE, - ) as hashed_file: + from_info, mode="rb", chunk_size=from_tree.CHUNK_SIZE + ) as stream: + wrapped_file = HashedIterStream(stream) # Since we don't know the hash beforehand, we'll # upload it to a temporary location and then move # it. 
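             # (The final location is content-addressed: hash_to_path maps
             # an md5 such as "d3b07384..." to <cache>/d3/b07384..., which
             # is only known once the stream has been fully uploaded.)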
- self.tree.upload_multipart(hashed_file, temporary_location) - hash_info = hashed_file.hash_info + self.tree.upload_multipart(wrapped_file, temporary_location) - self.tree.move(temporary_location, self.hash_to_path(hash_info.value)) - return hash_info + self.tree.move( + temporary_location, self.hash_to_path(wrapped_file.hash_info.value) + ) + return wrapped_file.hash_info def transfer_file(self, from_tree, from_info): try: diff --git a/dvc/tree/base.py b/dvc/tree/base.py index e4121ed8e6..94e9b630f0 100644 --- a/dvc/tree/base.py +++ b/dvc/tree/base.py @@ -2,7 +2,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed from functools import partial from multiprocessing import cpu_count -from typing import Any, ClassVar, Dict, Optional, Type +from typing import Any, ClassVar, Dict, Optional from urllib.parse import urlparse from funcy import cached_property, decorator @@ -170,24 +170,13 @@ def cache(self): return getattr(self.repo.cache, self.scheme) def open( - self, - path_info, - mode: str = "r", - encoding: str = None, - stream_cls: Optional[Type] = None, - **kwargs, + self, path_info, mode: str = "r", encoding: str = None, **kwargs, ): if hasattr(self, "_generate_download_url"): # pylint:disable=no-member func = self._generate_download_url # type: ignore[attr-defined] get_url = partial(func, path_info) - return open_url( - get_url, - mode=mode, - encoding=encoding, - stream_cls=stream_cls, - **kwargs, - ) + return open_url(get_url, mode=mode, encoding=encoding, **kwargs,) raise RemoteActionNotImplemented("open", self.scheme) diff --git a/dvc/tree/hdfs.py b/dvc/tree/hdfs.py index 2c2db68c37..c086539ca7 100644 --- a/dvc/tree/hdfs.py +++ b/dvc/tree/hdfs.py @@ -63,7 +63,7 @@ def close(self): ) @contextmanager - def open(self, path_info, mode="r", encoding=None): + def open(self, path_info, mode="r", encoding=None, **kwargs): assert mode in {"r", "rt", "rb"} try: diff --git a/dvc/tree/local.py b/dvc/tree/local.py index f8aa4e0690..bec9c89ef3 100644 --- a/dvc/tree/local.py +++ b/dvc/tree/local.py @@ -56,7 +56,7 @@ def dvcignore(self): return cls(self, root) @staticmethod - def open(path_info, mode="r", encoding=None): + def open(path_info, mode="r", encoding=None, **kwargs): return open(path_info, mode=mode, encoding=encoding) def exists(self, path_info, use_dvcignore=True): diff --git a/dvc/utils/http.py b/dvc/utils/http.py index 593fd3e413..c372db561d 100644 --- a/dvc/utils/http.py +++ b/dvc/utils/http.py @@ -5,17 +5,16 @@ @contextmanager -def open_url(url, mode="r", encoding=None, stream_cls=None, **kwargs): +def open_url(url, mode="r", encoding=None, **iter_opts): """Opens an url as a readable stream. Resumes on connection error. Url could be a string or a callable returning a string. 
""" - stream_cls = stream_cls or IterStream assert mode in {"r", "rt", "rb"} - with iter_url(url, **kwargs) as (response, it): - bytes_stream = stream_cls(it) + with iter_url(url, **iter_opts) as (response, it): + bytes_stream = IterStream(it) if mode == "rb": yield bytes_stream diff --git a/dvc/utils/stream.py b/dvc/utils/stream.py index d6b5845ce2..53090d7484 100644 --- a/dvc/utils/stream.py +++ b/dvc/utils/stream.py @@ -61,17 +61,17 @@ def peek(self, n): return self.leftover[:n] -class HashedIterStream(IterStream): +class HashedIterStream: PARAM_CHECKSUM = "md5" - def __init__(self, iterator): - super().__init__(iterator) + def __init__(self, fobj): self.md5 = hashlib.md5() + self.fobj = fobj self.is_text_file = None def read(self, n=-1): - chunk = super().read1(n) + chunk = self.fobj.read1(n) if self.is_text_file is None: self.is_text_file = istextblock(chunk) From 69c5acc8dff2ee72f208ab4963d60a09df7eb450 Mon Sep 17 00:00:00 2001 From: Batuhan Taskaya Date: Tue, 5 Jan 2021 14:22:52 +0300 Subject: [PATCH 08/36] Align the signatures of other open() functions to accept **kwargs --- dvc/tree/gdrive.py | 2 +- dvc/tree/ssh/__init__.py | 2 +- dvc/tree/webhdfs.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dvc/tree/gdrive.py b/dvc/tree/gdrive.py index 00fbcfbd81..2007f8d156 100644 --- a/dvc/tree/gdrive.py +++ b/dvc/tree/gdrive.py @@ -389,7 +389,7 @@ def _gdrive_download_file( @contextmanager @_gdrive_retry - def open(self, path_info, mode="r", encoding=None): + def open(self, path_info, mode="r", encoding=None, **kwargs): assert mode in {"r", "rt", "rb"} item_id = self._get_item_id(path_info) diff --git a/dvc/tree/ssh/__init__.py b/dvc/tree/ssh/__init__.py index b2391e4c90..8463067003 100644 --- a/dvc/tree/ssh/__init__.py +++ b/dvc/tree/ssh/__init__.py @@ -147,7 +147,7 @@ def ssh(self, path_info): ) @contextmanager - def open(self, path_info, mode="r", encoding=None): + def open(self, path_info, mode="r", encoding=None, **kwargs): assert mode in {"r", "rt", "rb", "wb"} with self.ssh(path_info) as ssh, closing( diff --git a/dvc/tree/webhdfs.py b/dvc/tree/webhdfs.py index a65568f090..cc102c9428 100644 --- a/dvc/tree/webhdfs.py +++ b/dvc/tree/webhdfs.py @@ -91,7 +91,7 @@ def hdfs_client(self): return client @contextmanager - def open(self, path_info, mode="r", encoding=None): + def open(self, path_info, mode="r", encoding=None, **kwargs): assert mode in {"r", "rt", "rb"} with self.hdfs_client.read( From 49c01f3cbd9aefbbc35cf87186ee15743933fc3a Mon Sep 17 00:00:00 2001 From: Batuhan Taskaya Date: Tue, 5 Jan 2021 16:05:35 +0300 Subject: [PATCH 09/36] Implement chunked uploading to the ssh --- dvc/tree/ssh/__init__.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/dvc/tree/ssh/__init__.py b/dvc/tree/ssh/__init__.py index 8463067003..94dbc47447 100644 --- a/dvc/tree/ssh/__init__.py +++ b/dvc/tree/ssh/__init__.py @@ -10,6 +10,7 @@ import dvc.prompt as prompt from dvc.hash_info import HashInfo +from dvc.progress import Tqdm from dvc.scheme import Schemes from ..base import BaseTree @@ -260,6 +261,18 @@ def _download(self, from_info, to_file, name=None, no_progress_bar=False): no_progress_bar=no_progress_bar, ) + def _upload_multipart( + self, stream, to_info, chunk_size, no_progress_bar=False + ): + with self.ssh(to_info) as ssh: + with Tqdm(disable=no_progress_bar, bytes=True) as pbar: + while True: + data = stream.read(chunk_size) + if not data: + break + ssh.write(data) + pbar.update(len(data)) + def _upload( self, from_file, to_info, name=None, 
no_progress_bar=False, **_kwargs ): From 3af38da9d31354ce44009aac49897809fc460c9c Mon Sep 17 00:00:00 2001 From: Batuhan Taskaya Date: Wed, 6 Jan 2021 09:55:52 +0300 Subject: [PATCH 10/36] Use local cache to compute directory hash (md5) --- dvc/cache/base.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/dvc/cache/base.py b/dvc/cache/base.py index c1d27b574d..4e64d50a38 100644 --- a/dvc/cache/base.py +++ b/dvc/cache/base.py @@ -302,12 +302,10 @@ def transfer_directory(self, from_tree, from_infos, func, jobs): ( hash_info, hash_path_info, - ) = self._get_dir_info_hash( # noqa, pylint: disable=protected-access + ) = self.repo.cache.local._get_dir_info_hash( # noqa, pylint: disable=protected-access dir_info ) - self.tree.move( - hash_path_info, self.hash_to_path(hash_info.value), - ) + self.tree.upload(hash_path_info, self.hash_to_path(hash_info.value)) return hash_info def _cache_is_copy(self, path_info): From 5d89eaf659da77cb820b2dced8e816b7d0231825 Mon Sep 17 00:00:00 2001 From: Batuhan Taskaya Date: Wed, 6 Jan 2021 11:13:20 +0300 Subject: [PATCH 11/36] S3: multipart upload test --- tests/func/test_s3.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/func/test_s3.py b/tests/func/test_s3.py index 9e52aecd33..af67bac25a 100644 --- a/tests/func/test_s3.py +++ b/tests/func/test_s3.py @@ -1,3 +1,4 @@ +import io from functools import wraps import boto3 @@ -113,9 +114,25 @@ def test_copy_multipart_preserve_etag(): S3Tree._copy(s3, from_info, to_info, {}) +@mock_s3 def test_s3_isdir(tmp_dir, dvc, s3): s3.gen({"data": {"foo": "foo"}}) tree = S3Tree(dvc, s3.config) assert not tree.isdir(s3 / "data" / "foo") assert tree.isdir(s3 / "data") + +@mock_s3 +def test_multipart_upload(dvc): + from_info, to_info = _get_src_dst() + s3 = boto3.client("s3") + s3.create_bucket(Bucket=from_info.bucket) + + tree = S3Tree(dvc, {"url": str(to_info)}) + + stream = io.BytesIO(b"data") + tree._upload_multipart(stream, to_info, 1) + + assert tree.exists(to_info) + with tree.open(to_info) as stream: + assert stream.read() == "data" From 457f1e84d03eee92ee8925d488c19af030b09848 Mon Sep 17 00:00:00 2001 From: Batuhan Taskaya Date: Wed, 6 Jan 2021 16:17:15 +0300 Subject: [PATCH 12/36] Wrap local cache's hash_to_path with PathInfo --- dvc/cache/base.py | 4 +++- dvc/cache/local.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/dvc/cache/base.py b/dvc/cache/base.py index 4e64d50a38..1cb130a025 100644 --- a/dvc/cache/base.py +++ b/dvc/cache/base.py @@ -259,7 +259,9 @@ def _transfer_file_as_whole(self, from_tree, from_info): from_tree.download(from_info, temp_tree.path_info) hash_info = temp_tree.get_file_hash(temp_tree.path_info) - self.tree.upload(temp_tree.path_info, self.hash_to_path(hash_info)) + self.tree.upload( + temp_tree.path_info, self.hash_to_path(hash_info.value) + ) return hash_info diff --git a/dvc/cache/local.py b/dvc/cache/local.py index 14d76cd898..d8ffa61a74 100644 --- a/dvc/cache/local.py +++ b/dvc/cache/local.py @@ -40,7 +40,9 @@ def hash_to_path(self, hash_): # NOTE: `self.cache_path` is already normalized so we can simply use # `os.sep` instead of `os.path.join`. This results in this helper # being ~5.5 times faster. 
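         # Wrapping the result in PathInfo (below) keeps the fast string
         # concatenation while returning the path type that tree.upload()
         # and tree.move() expect in the transfer code paths.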
-        return f"{self.cache_path}{os.sep}{hash_[0:2]}{os.sep}{hash_[2:]}"
+        return PathInfo(
+            f"{self.cache_path}{os.sep}{hash_[0:2]}{os.sep}{hash_[2:]}"
+        )
 
     def hashes_exist(
         self, hashes, jobs=None, name=None

From 0cef7a9222f20698d92d485a1a6f7d1ee2ac9008 Mon Sep 17 00:00:00 2001
From: Batuhan Taskaya
Date: Wed, 6 Jan 2021 17:18:35 +0300
Subject: [PATCH 13/36] Use .isdir() instead of a try/except statement for
 calculating the from_infos

---
 dvc/remote/base.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/dvc/remote/base.py b/dvc/remote/base.py
index b2ca47fc75..72e1233aa4 100644
--- a/dvc/remote/base.py
+++ b/dvc/remote/base.py
@@ -475,9 +475,10 @@ def pull(self, cache, named_cache, jobs=None, show_checksums=False):
     def transfer(self, from_tree, jobs=None, no_progress_bar=False):
         jobs = jobs or min((from_tree.jobs, self.tree.jobs))
         from_info = from_tree.path_info
-        try:
+
+        if from_tree.isdir(from_info):
             from_infos = list(from_tree.walk_files(from_info))
-        except NotImplementedError:
+        else:
             from_infos = [from_info]
 
         with Tqdm(

From dc7179e148fff848155e8eaa4a23e9a15c5bef0f Mon Sep 17 00:00:00 2001
From: Batuhan Taskaya
Date: Wed, 6 Jan 2021 18:20:11 +0300
Subject: [PATCH 14/36] New CLI layout (add/import-url)

---
 dvc/command/add.py           | 42 +++++++++++++++++++++++++++++++
 dvc/command/data_sync.py     | 48 ------------------------------------
 dvc/command/imp_url.py       | 26 +++++++++++++++++++
 dvc/data_cloud.py            |  4 +--
 dvc/repo/__init__.py         |  1 -
 dvc/repo/add.py              | 22 +++++++++++++++++
 dvc/repo/export_to_remote.py | 21 ----------------
 dvc/repo/imp_url.py          | 24 ++++++++++++++----
 8 files changed, 111 insertions(+), 77 deletions(-)
 delete mode 100644 dvc/repo/export_to_remote.py

diff --git a/dvc/command/add.py b/dvc/command/add.py
index db112162f9..d387d833b1 100644
--- a/dvc/command/add.py
+++ b/dvc/command/add.py
@@ -18,6 +18,26 @@ def run(self):
             if len(self.args.targets) > 1 and self.args.file:
                 raise RecursiveAddingWhileUsingFilename()
 
+            if len(self.args.targets) > 1 and self.args.straight_to_remote:
+                logger.error(
+                    "--straight-to-remote can't be used with multiple targets"
+                )
+                return 1
+
+            if not self.args.straight_to_remote:
+                if self.args.remote:
+                    command = "--remote"
+                elif self.args.out:
+                    command = "--out"
+                else:
+                    command = None
+
+                if command is not None:
+                    logger.error(
+                        "--remote can't be used without --straight-to-remote"
+                    )
+                    return 1
+
             self.repo.add(
                 self.args.targets,
                 recursive=self.args.recursive,
@@ -26,6 +46,9 @@ def run(self):
                 external=self.args.external,
                 glob=self.args.glob,
                 desc=self.args.desc,
+                out=self.args.out,
+                remote=self.args.remote,
+                straight_to_remote=self.args.straight_to_remote,
             )
 
         except DvcException:
@@ -74,6 +97,25 @@ def add_parser(subparsers, parent_parser):
         help="Specify name of the DVC-file this command will generate.",
         metavar="",
     )
+    parser.add_argument(
+        "-o",
+        "--out",
+        help="Destination path to put files to.",
+        nargs="?",
+        metavar="",
+    )
+    parser.add_argument(
+        "--straight-to-remote",
+        action="store_true",
+        default=False,
+        help="Download it directly to the remote",
+    )
+    parser.add_argument(
+        "-r",
+        "--remote",
+        help="Remote storage to download to",
+        metavar="",
+    )
     parser.add_argument(
         "--desc",
         type=str,
diff --git a/dvc/command/data_sync.py b/dvc/command/data_sync.py
index c6ad8185c6..5bbfdf4dce 100644
--- a/dvc/command/data_sync.py
+++ b/dvc/command/data_sync.py
@@ -94,22 +94,6 @@ def run(self):
         return 0
 
-class CmdDataExportToRemote(CmdDataBase):
-    def run(self):
-        try:
-            processed_files_count = 
self.repo.export_to_remote( - source=self.args.source, - destination=self.args.destination, - remote=self.args.remote, - jobs=self.args.jobs, - ) - self.log_summary({"exported": processed_files_count}) - except DvcException: - logger.exception("failed to export data to the remote") - return 1 - return 0 - - def shared_parent_parser(): from dvc.cli import get_parent_parser @@ -141,40 +125,8 @@ def shared_parent_parser(): def add_parser(subparsers, _parent_parser): - from dvc.cli import get_parent_parser from dvc.command.status import CmdDataStatus - # Export to Remote - EXPORT_HELP = "Exports given data to the remote." - - export_parser = subparsers.add_parser( - "export-to-remote", - parents=[get_parent_parser()], - description=append_doc_link(EXPORT_HELP, "export-to-remote"), - help=EXPORT_HELP, - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - export_parser.add_argument( - "source", help="Source location to the file that is being exported" - ) - export_parser.add_argument("destination", help="The name of the file") - export_parser.add_argument( - "-r", "--remote", help="Remote storage to export to", metavar="", - ) - export_parser.add_argument( - "-j", - "--jobs", - type=int, - help=( - "Number of jobs to run simultaneously. " - "The default value is 4 * cpu_count(). " - "For SSH remotes, the default is 4. " - ), - metavar="", - ) - - export_parser.set_defaults(func=CmdDataExportToRemote) - # Pull PULL_HELP = "Download tracked files or directories from remote storage." diff --git a/dvc/command/imp_url.py b/dvc/command/imp_url.py index a5b5767f78..7d9b667911 100644 --- a/dvc/command/imp_url.py +++ b/dvc/command/imp_url.py @@ -10,12 +10,26 @@ class CmdImportUrl(CmdBase): def run(self): + if self.args.no_exec and self.args.straight_to_remote: + logger.error( + "--no-exec can't be combined with --straight-to-remote" + ) + return 1 + + if self.args.remote and not self.args.straight_to_remote: + logger.error( + "--remote can't be used alone without --straight-to-remote" + ) + return 1 + try: self.repo.imp_url( self.args.url, out=self.args.out, fname=self.args.file, no_exec=self.args.no_exec, + remote=self.args.remote, + straight_to_remote=self.args.straight_to_remote, desc=self.args.desc, ) except DvcException: @@ -68,6 +82,18 @@ def add_parser(subparsers, parent_parser): default=False, help="Only create DVC-file without actually downloading it.", ) + import_parser.add_argument( + "--straight-to-remote", + action="store_true", + default=False, + help="Download it directly to the remote", + ) + import_parser.add_argument( + "-r", + "--remote", + help="Remote storage to download to", + metavar="", + ) import_parser.add_argument( "--desc", type=str, diff --git a/dvc/data_cloud.py b/dvc/data_cloud.py index c7dbda63e7..9245788325 100644 --- a/dvc/data_cloud.py +++ b/dvc/data_cloud.py @@ -92,9 +92,9 @@ def pull( show_checksums=show_checksums, ) - def transfer(self, source, jobs=None, remote=None): + def transfer(self, source, jobs=None, remote=None, command=None): from_tree = get_cloud_tree(self.repo, url=source) - remote = self.get_remote(remote, "export-to-remote") + remote = self.get_remote(remote, command) return remote.transfer(from_tree, jobs=jobs) def status( diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index b5267243eb..4b64fd0cda 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -60,7 +60,6 @@ class Repo: from dvc.repo.commit import commit from dvc.repo.destroy import destroy from dvc.repo.diff import diff - from dvc.repo.export_to_remote import 
export_to_remote from dvc.repo.fetch import fetch from dvc.repo.freeze import freeze, unfreeze from dvc.repo.gc import gc diff --git a/dvc/repo/add.py b/dvc/repo/add.py index 5e59114f55..07e211fd3e 100644 --- a/dvc/repo/add.py +++ b/dvc/repo/add.py @@ -6,6 +6,7 @@ from ..exceptions import ( CacheLinkError, + InvalidArgumentError, OutputDuplicationError, OverlappingOutputPathsError, RecursiveAddingWhileUsingFilename, @@ -30,6 +31,9 @@ def add( recursive=False, no_commit=False, fname=None, + straight_to_remote=False, + out=None, + remote=None, **kwargs, ): from dvc.utils.collections import ensure_list @@ -38,6 +42,24 @@ def add( raise RecursiveAddingWhileUsingFilename() targets = ensure_list(targets) + + if straight_to_remote: + if len(targets) != 1: + raise InvalidArgumentError( + "straight_to_remote requires only 1 target" + ) + + return repo.imp_url( + url=targets[0], + out=out, + remote=remote, + desc=kwargs.get("desc"), + fname=kwargs.get("fname"), + comand="add", + track_remote_url=False, + straight_to_remote=True, + ) + link_failures = [] stages_list = [] num_targets = len(targets) diff --git a/dvc/repo/export_to_remote.py b/dvc/repo/export_to_remote.py deleted file mode 100644 index a6617cb0cb..0000000000 --- a/dvc/repo/export_to_remote.py +++ /dev/null @@ -1,21 +0,0 @@ -from dvc.utils import resolve_paths - -from . import locked - - -@locked -def export_to_remote(self, source, destination, remote=None, jobs=None): - from dvc.dvcfile import Dvcfile - from dvc.stage import Stage, create_stage - - hash_info = self.cloud.transfer(source, jobs=jobs, remote=remote) - - path, _, _ = resolve_paths(self, destination) - stage = create_stage(Stage, self, path, outs=[destination]) - - dvcfile = Dvcfile(self, stage.path) - dvcfile.remove() - - stage.outs[0].hash_info = hash_info - dvcfile.dump(stage) - return hash_info.nfiles diff --git a/dvc/repo/imp_url.py b/dvc/repo/imp_url.py index 71838b1e2f..2ab5aee786 100644 --- a/dvc/repo/imp_url.py +++ b/dvc/repo/imp_url.py @@ -18,8 +18,12 @@ def imp_url( erepo=None, frozen=True, no_exec=False, + remote=None, + track_remote_url=True, + straight_to_remote=False, desc=None, jobs=None, + command="import-url", ): from dvc.dvcfile import Dvcfile from dvc.stage import Stage, create_stage, restore_meta @@ -35,15 +39,20 @@ def imp_url( ): url = relpath(url, wdir) + deps = [url] + if not track_remote_url: + deps.clear() + stage = create_stage( Stage, self, fname or path, wdir=wdir, - deps=[url], + deps=deps, outs=[out], erepo=erepo, ) + restore_meta(stage) if stage.can_be_skipped: return None @@ -54,13 +63,18 @@ def imp_url( dvcfile = Dvcfile(self, stage.path) dvcfile.remove() - try: - self.check_modified_graph([stage]) - except OutputDuplicationError as exc: - raise OutputDuplicationError(exc.output, set(exc.stages) - {stage}) + if not straight_to_remote: + try: + self.check_modified_graph([stage]) + except OutputDuplicationError as exc: + raise OutputDuplicationError(exc.output, set(exc.stages) - {stage}) if no_exec: stage.ignore_outs() + elif straight_to_remote: + stage.outs[0].hash_info = self.cloud.transfer( + url, remote=remote, command=command + ) else: stage.run(jobs=jobs) From 37fcb960cb71fb799f088b4df5143888f9234d63 Mon Sep 17 00:00:00 2001 From: Batuhan Taskaya Date: Thu, 7 Jan 2021 09:13:07 +0300 Subject: [PATCH 15/36] Adjust CLI tests to accept new arguments --- tests/unit/command/test_add.py | 3 +++ tests/unit/command/test_imp_url.py | 16 ++++++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/tests/unit/command/test_add.py 
b/tests/unit/command/test_add.py index 22b57c6cb9..05f1d8cef8 100644 --- a/tests/unit/command/test_add.py +++ b/tests/unit/command/test_add.py @@ -31,5 +31,8 @@ def test_add(mocker, dvc): glob=True, fname="file", external=True, + out=None, + remote=None, + straight_to_remote=False, desc="stage description", ) diff --git a/tests/unit/command/test_imp_url.py b/tests/unit/command/test_imp_url.py index 3eb6dcc0d5..0be30414d5 100644 --- a/tests/unit/command/test_imp_url.py +++ b/tests/unit/command/test_imp_url.py @@ -17,7 +17,13 @@ def test_import_url(mocker): assert cmd.run() == 0 m.assert_called_once_with( - "src", out="out", fname="file", no_exec=False, desc="description" + "src", + out="out", + fname="file", + no_exec=False, + remote=None, + straight_to_remote=False, + desc="description", ) @@ -57,5 +63,11 @@ def test_import_url_no_exec(mocker): assert cmd.run() == 0 m.assert_called_once_with( - "src", out="out", fname="file", no_exec=True, desc="description" + "src", + out="out", + fname="file", + no_exec=True, + remote=None, + straight_to_remote=False, + desc="description", ) From 372bb8f2c5209d2e1d1f25e8b06ad7dcfb5a3147 Mon Sep 17 00:00:00 2001 From: Batuhan Taskaya Date: Thu, 7 Jan 2021 09:52:39 +0300 Subject: [PATCH 16/36] tests: import_url/add func/unit tests --- dvc/repo/add.py | 2 +- tests/dir_helpers.py | 3 + tests/func/test_add.py | 16 +++++ tests/func/test_import_url.py | 96 ++++++++++++++++++++++++++++++ tests/unit/command/test_add.py | 33 ++++++++++ tests/unit/command/test_imp_url.py | 31 ++++++++++ 6 files changed, 180 insertions(+), 1 deletion(-) diff --git a/dvc/repo/add.py b/dvc/repo/add.py index 07e211fd3e..1a3e65a973 100644 --- a/dvc/repo/add.py +++ b/dvc/repo/add.py @@ -55,7 +55,7 @@ def add( remote=remote, desc=kwargs.get("desc"), fname=kwargs.get("fname"), - comand="add", + command="add", track_remote_url=False, straight_to_remote=True, ) diff --git a/tests/dir_helpers.py b/tests/dir_helpers.py index 34e6dbf467..aa5f14637f 100644 --- a/tests/dir_helpers.py +++ b/tests/dir_helpers.py @@ -238,6 +238,9 @@ def read_text(self, *args, **kwargs): # pylint: disable=signature-differs } return super().read_text(*args, **kwargs) + def hash_to_path_info(self, hash_): + return self / hash_[0:2] / hash_[2:] + def _coerce_filenames(filenames): if isinstance(filenames, (str, bytes, pathlib.PurePath)): diff --git a/tests/func/test_add.py b/tests/func/test_add.py index 554efd8b22..b2e34430c9 100644 --- a/tests/func/test_add.py +++ b/tests/func/test_add.py @@ -991,3 +991,19 @@ def test_add_long_fname(tmp_dir, dvc): dvc.add("data") assert (tmp_dir / "data").read_text() == {name: "foo"} + + +def test_add_straight_to_remote(tmp_dir, dvc, local_cloud, local_remote): + local_cloud.gen("foo", "foo") + + url = "remote://upstream/foo" + stage = dvc.add(url, straight_to_remote=True) + + assert not (tmp_dir / "foo").exists() + assert (tmp_dir / "foo.dvc").exists() + + assert len(stage.deps) == 0 + assert len(stage.outs) == 1 + + hash_info = stage.outs[0].hash_info + assert local_remote.hash_to_path_info(hash_info.value).read_text() == "foo" diff --git a/tests/func/test_import_url.py b/tests/func/test_import_url.py index a314de0354..b4654ccbb7 100644 --- a/tests/func/test_import_url.py +++ b/tests/func/test_import_url.py @@ -1,3 +1,4 @@ +import json import os import textwrap from uuid import uuid4 @@ -245,3 +246,98 @@ def test_import_url_preserve_meta(tmp_dir, dvc): frozen: true """ ) + + +@pytest.mark.parametrize( + "workspace", + [ + pytest.lazy_fixture("local_cloud"), + 
pytest.lazy_fixture("s3"), + pytest.lazy_fixture("gs"), + pytest.lazy_fixture("hdfs"), + pytest.lazy_fixture("webhdfs"), + pytest.param( + pytest.lazy_fixture("ssh"), + marks=pytest.mark.skipif( + os.name == "nt", reason="disabled on windows" + ), + ), + pytest.lazy_fixture("http"), + ], + indirect=True, +) +def test_import_url_straight_to_remote_single_file( + tmp_dir, dvc, workspace, local_remote +): + workspace.gen("foo", "foo") + + url = "remote://workspace/foo" + stage = dvc.imp_url(url, straight_to_remote=True) + + assert not (tmp_dir / "foo").exists() + assert (tmp_dir / "foo.dvc").exists() + + assert len(stage.deps) == 1 + assert stage.deps[0].def_path == url + assert len(stage.outs) == 1 + + hash_info = stage.outs[0].hash_info + assert local_remote.hash_to_path_info(hash_info.value).read_text() == "foo" + + +@pytest.mark.parametrize( + "workspace", + [ + pytest.lazy_fixture("local_cloud"), + pytest.lazy_fixture("s3"), + pytest.lazy_fixture("gs"), + pytest.lazy_fixture("hdfs"), + pytest.lazy_fixture("webhdfs"), + pytest.param( + pytest.lazy_fixture("ssh"), + marks=pytest.mark.skipif( + os.name == "nt", reason="disabled on windows" + ), + ), + ], + indirect=True, +) +def test_import_url_straight_to_remote_directory( + tmp_dir, dvc, workspace, local_remote +): + workspace.gen( + { + "data": { + "foo": "foo", + "bar": "bar", + "sub_dir": {"baz": "sub_dir/baz"}, + } + } + ) + + url = "remote://workspace/data" + stage = dvc.imp_url(url, straight_to_remote=True) + + assert not (tmp_dir / "data").exists() + assert (tmp_dir / "data.dvc").exists() + + assert len(stage.deps) == 1 + assert stage.deps[0].def_path == url + assert len(stage.outs) == 1 + + hash_info = stage.outs[0].hash_info + with open(local_remote.hash_to_path_info(hash_info.value)) as stream: + file_parts = json.load(stream) + + assert len(file_parts) == 3 + assert {file_part["relpath"] for file_part in file_parts} == { + "foo", + "bar", + "sub_dir/baz", + } + + for file_part in file_parts: + assert ( + local_remote.hash_to_path_info(file_part["md5"]).read_text() + == file_part["relpath"] + ) diff --git a/tests/unit/command/test_add.py b/tests/unit/command/test_add.py index 05f1d8cef8..febd997bf2 100644 --- a/tests/unit/command/test_add.py +++ b/tests/unit/command/test_add.py @@ -36,3 +36,36 @@ def test_add(mocker, dvc): straight_to_remote=False, desc="stage description", ) + + +def test_add_straight_to_remote(mocker, dvc): + cli_args = parse_args( + [ + "add", + "s3://bucket/foo", + "--straight-to-remote", + "--out", + "bar", + "--remote", + "remote", + ] + ) + assert cli_args.func == CmdAdd + + cmd = cli_args.func(cli_args) + m = mocker.patch.object(cmd.repo, "add", autospec=True) + + assert cmd.run() == 0 + + m.assert_called_once_with( + ["s3://bucket/foo"], + recursive=False, + no_commit=False, + glob=False, + fname=None, + external=False, + out="bar", + remote="remote", + straight_to_remote=True, + desc=None, + ) diff --git a/tests/unit/command/test_imp_url.py b/tests/unit/command/test_imp_url.py index 0be30414d5..5b2ccc03be 100644 --- a/tests/unit/command/test_imp_url.py +++ b/tests/unit/command/test_imp_url.py @@ -71,3 +71,34 @@ def test_import_url_no_exec(mocker): straight_to_remote=False, desc="description", ) + + +def test_import_url_straight_to_remote(mocker): + cli_args = parse_args( + [ + "import-url", + "s3://bucket/foo", + "bar", + "--straight-to-remote", + "--remote", + "remote", + "--desc", + "description", + ] + ) + assert cli_args.func == CmdImportUrl + + cmd = cli_args.func(cli_args) + m = 
mocker.patch.object(cmd.repo, "imp_url", autospec=True) + + assert cmd.run() == 0 + + m.assert_called_once_with( + "s3://bucket/foo", + out="bar", + fname=None, + no_exec=False, + remote="remote", + straight_to_remote=True, + desc="description", + ) From 5d06ca4f976bdedca5f7feb6acd874a31f681f03 Mon Sep 17 00:00:00 2001 From: Batuhan Taskaya Date: Thu, 7 Jan 2021 10:11:33 +0300 Subject: [PATCH 17/36] Introduce cache.transfer() --- dvc/cache/base.py | 25 +++++++++++++++++++++++++ dvc/remote/base.py | 26 +++----------------------- 2 files changed, 28 insertions(+), 23 deletions(-) diff --git a/dvc/cache/base.py b/dvc/cache/base.py index 1cb130a025..191e7f1164 100644 --- a/dvc/cache/base.py +++ b/dvc/cache/base.py @@ -310,6 +310,31 @@ def transfer_directory(self, from_tree, from_infos, func, jobs): self.tree.upload(hash_path_info, self.hash_to_path(hash_info.value)) return hash_info + def transfer(self, from_tree, jobs=None, no_progress_bar=False): + jobs = jobs or min((from_tree.jobs, self.tree.jobs)) + from_info = from_tree.path_info + + if from_tree.isdir(from_info): + from_infos = list(from_tree.walk_files(from_info)) + else: + from_infos = [from_info] + + with Tqdm( + total=len(from_infos), + desc="Transfering files to the remote", + unit="Files", + disable=no_progress_bar, + ) as pbar: + transfer_file = pbar.wrap_fn(self.transfer_file) + if from_tree.isdir(from_info): + hash_info = self.transfer_directory( + from_tree, from_infos, transfer_file, jobs=jobs + ) + else: + _, hash_info = transfer_file(from_tree, from_info) + + return hash_info + def _cache_is_copy(self, path_info): """Checks whether cache uses copies.""" if self.cache_type_confirmed: diff --git a/dvc/remote/base.py b/dvc/remote/base.py index 72e1233aa4..d8c78d69e7 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -473,29 +473,9 @@ def pull(self, cache, named_cache, jobs=None, show_checksums=False): return ret def transfer(self, from_tree, jobs=None, no_progress_bar=False): - jobs = jobs or min((from_tree.jobs, self.tree.jobs)) - from_info = from_tree.path_info - - if from_tree.isdir(from_info): - from_infos = list(from_tree.walk_files(from_info)) - else: - from_infos = [from_info] - - with Tqdm( - total=len(from_infos), - desc="Transfering files to the remote", - unit="Files", - disable=no_progress_bar, - ) as pbar: - transfer_file = pbar.wrap_fn(self.cache.transfer_file) - if from_tree.isdir(from_info): - hash_info = self.cache.transfer_directory( - from_tree, from_infos, transfer_file, jobs=jobs - ) - else: - _, hash_info = transfer_file(from_tree, from_info) - - return hash_info + return self.cache.transfer( + from_tree, jobs=jobs, no_progress_bar=no_progress_bar + ) @staticmethod def _log_missing_caches(hash_info_dict): From 811b489d25b10b39cc6bf98cb30fe4f5cddfd985 Mon Sep 17 00:00:00 2001 From: Batuhan Taskaya Date: Thu, 7 Jan 2021 10:32:25 +0300 Subject: [PATCH 18/36] Invalid CLI combination tests --- dvc/cache/base.py | 4 ++-- dvc/command/add.py | 3 ++- dvc/repo/add.py | 5 ++--- tests/unit/command/test_add.py | 28 ++++++++++++++++++++++++- tests/unit/command/test_imp_url.py | 33 ++++++++++++++++++++++++++++++ 5 files changed, 66 insertions(+), 7 deletions(-) diff --git a/dvc/cache/base.py b/dvc/cache/base.py index 191e7f1164..065b08f578 100644 --- a/dvc/cache/base.py +++ b/dvc/cache/base.py @@ -315,9 +315,9 @@ def transfer(self, from_tree, jobs=None, no_progress_bar=False): from_info = from_tree.path_info if from_tree.isdir(from_info): - from_infos = list(from_tree.walk_files(from_info)) + from_infos = 
tuple(from_tree.walk_files(from_info)) else: - from_infos = [from_info] + from_infos = (from_info,) with Tqdm( total=len(from_infos), diff --git a/dvc/command/add.py b/dvc/command/add.py index d387d833b1..795a926ece 100644 --- a/dvc/command/add.py +++ b/dvc/command/add.py @@ -34,7 +34,8 @@ def run(self): if command is not None: logger.error( - "--remote can't be used without --straight-to-remote" + f"--{command} can't be used without" + " --straight-to-remote" ) return 1 diff --git a/dvc/repo/add.py b/dvc/repo/add.py index 1a3e65a973..db72c9f083 100644 --- a/dvc/repo/add.py +++ b/dvc/repo/add.py @@ -6,7 +6,6 @@ from ..exceptions import ( CacheLinkError, - InvalidArgumentError, OutputDuplicationError, OverlappingOutputPathsError, RecursiveAddingWhileUsingFilename, @@ -45,8 +44,8 @@ def add( if straight_to_remote: if len(targets) != 1: - raise InvalidArgumentError( - "straight_to_remote requires only 1 target" + raise ValueError( + "Can't use straigh_to_remote option with multiple targets" ) return repo.imp_url( diff --git a/tests/unit/command/test_add.py b/tests/unit/command/test_add.py index febd997bf2..6ca6e86230 100644 --- a/tests/unit/command/test_add.py +++ b/tests/unit/command/test_add.py @@ -1,3 +1,5 @@ +import logging + from dvc.cli import parse_args from dvc.command.add import CmdAdd @@ -38,7 +40,7 @@ def test_add(mocker, dvc): ) -def test_add_straight_to_remote(mocker, dvc): +def test_add_straight_to_remote(mocker): cli_args = parse_args( [ "add", @@ -69,3 +71,27 @@ def test_add_straight_to_remote(mocker, dvc): straight_to_remote=True, desc=None, ) + + +def test_add_straight_to_remote_invalid_combinations(mocker, caplog): + cli_args = parse_args( + ["add", "s3://bucket/foo", "s3://bucket/bar", "--straight-to-remote"] + ) + assert cli_args.func == CmdAdd + + cmd = cli_args.func(cli_args) + with caplog.at_level(logging.ERROR, logger="dvc"): + assert cmd.run() == 1 + expected_msg = "--straight-to-remote can't used with multiple targets" + assert expected_msg in caplog.text + + for option in "--remote", "--out": + cli_args = parse_args(["add", "foo", option, "bar"]) + + cmd = cli_args.func(cli_args) + with caplog.at_level(logging.ERROR, logger="dvc"): + assert cmd.run() == 1 + expected_msg = ( + f"--{option} can't be used without --straight-to-remote" + ) + assert expected_msg in caplog.text diff --git a/tests/unit/command/test_imp_url.py b/tests/unit/command/test_imp_url.py index 5b2ccc03be..979690ca30 100644 --- a/tests/unit/command/test_imp_url.py +++ b/tests/unit/command/test_imp_url.py @@ -102,3 +102,36 @@ def test_import_url_straight_to_remote(mocker): straight_to_remote=True, desc="description", ) + + +def test_import_url_straight_to_remote_invalid_combination(mocker, caplog): + cli_args = parse_args( + [ + "import-url", + "s3://bucket/foo", + "bar", + "--straight-to-remote", + "--remote", + "remote", + "--no-exec", + ] + ) + assert cli_args.func == CmdImportUrl + + cmd = cli_args.func(cli_args) + with caplog.at_level(logging.ERROR, logger="dvc"): + assert cmd.run() == 1 + expected_msg = "--no-exec can't be combined with --straight-to-remote" + assert expected_msg in caplog.text + + cli_args = parse_args( + ["import-url", "s3://bucket/foo", "bar", "--remote", "remote"] + ) + + cmd = cli_args.func(cli_args) + with caplog.at_level(logging.ERROR, logger="dvc"): + assert cmd.run() == 1 + expected_msg = ( + "--remote can't be used alone without --straight-to-remote" + ) + assert expected_msg in caplog.text From 3ba20f6a6beb0ce9680401a5010db0fa2380c60e Mon Sep 17 00:00:00 2001 From: 
Batuhan Taskaya Date: Thu, 7 Jan 2021 10:43:31 +0300 Subject: [PATCH 19/36] Store dir_info key in the transfer_directory --- dvc/cache/base.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/dvc/cache/base.py b/dvc/cache/base.py index 065b08f578..71de0f0a30 100644 --- a/dvc/cache/base.py +++ b/dvc/cache/base.py @@ -287,19 +287,19 @@ def transfer_file(self, from_tree, from_info): except RemoteActionNotImplemented: hash_info = self._transfer_file_as_whole(from_tree, from_info) - rel_target_filename = from_info.relative_to(from_tree.path_info) - return rel_target_filename.parts, hash_info + return hash_info def transfer_directory(self, from_tree, from_infos, func, jobs): dir_info = DirInfo() with ThreadPoolExecutor(max_workers=jobs) as executor: - futures = [ - executor.submit(func, from_tree, from_info) + futures = { + executor.submit( + func, from_tree, from_info + ): from_info.relative_to(from_tree.path_info) for from_info in from_infos - ] + } for future in as_completed(futures): - key, file_hash_info = future.result() - dir_info.trie[key] = file_hash_info + dir_info.trie[futures[future].parts] = future.result() ( hash_info, @@ -331,7 +331,7 @@ def transfer(self, from_tree, jobs=None, no_progress_bar=False): from_tree, from_infos, transfer_file, jobs=jobs ) else: - _, hash_info = transfer_file(from_tree, from_info) + hash_info = transfer_file(from_tree, from_info) return hash_info From 1ee02d9fc5373d6af7e50d875d7d68154fc8df17 Mon Sep 17 00:00:00 2001 From: Batuhan Taskaya Date: Thu, 7 Jan 2021 14:29:41 +0300 Subject: [PATCH 20/36] LocalTree.upload_fobj --- dvc/cache/base.py | 6 +++--- dvc/tree/base.py | 14 ++------------ dvc/tree/local.py | 23 ++++++++++++++++++++++- dvc/tree/s3.py | 10 ++++------ dvc/tree/ssh/__init__.py | 16 ++++------------ dvc/utils/fs.py | 13 +++++++++++-- dvc/utils/stream.py | 6 ++++-- tests/func/test_s3.py | 2 +- 8 files changed, 51 insertions(+), 39 deletions(-) diff --git a/dvc/cache/base.py b/dvc/cache/base.py index 71de0f0a30..ef946b7838 100644 --- a/dvc/cache/base.py +++ b/dvc/cache/base.py @@ -266,7 +266,7 @@ def _transfer_file_as_whole(self, from_tree, from_info): return hash_info def _transfer_file_as_chunked(self, from_tree, from_info): - temporary_location = self.tree.path_info / uuid() + tmp_info = self.tree.path_info / uuid() with from_tree.open( from_info, mode="rb", chunk_size=from_tree.CHUNK_SIZE ) as stream: @@ -274,10 +274,10 @@ def _transfer_file_as_chunked(self, from_tree, from_info): # Since we don't know the hash beforehand, we'll # upload it to a temporary location and then move # it. 
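# (Illustrative sketch, not part of the patch: the `upload_fobj` API that
# replaces `upload_multipart` below takes any readable binary file object,
# so a backend-agnostic caller could stream a local file like this, with
# `tree` and `to_info` as hypothetical stand-ins for a concrete tree
# instance and destination path:
#     with open("data.bin", "rb") as fobj:
#         tree.upload_fobj(fobj, to_info)
# )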
- self.tree.upload_multipart(wrapped_file, temporary_location) + self.tree.upload_fobj(wrapped_file, tmp_info) self.tree.move( - temporary_location, self.hash_to_path(wrapped_file.hash_info.value) + tmp_info, self.hash_to_path(wrapped_file.hash_info.value) ) return wrapped_file.hash_info diff --git a/dvc/tree/base.py b/dvc/tree/base.py index 94e9b630f0..7503b2576d 100644 --- a/dvc/tree/base.py +++ b/dvc/tree/base.py @@ -385,18 +385,8 @@ def upload( file_mode=file_mode, ) - def upload_multipart( - self, stream, to_info, chunk_size=CHUNK_SIZE, no_progress_bar=False - ): - if hasattr(self, "_upload_multipart"): - self._upload_multipart( # pylint: disable=no-member - stream, - to_info, - chunk_size=chunk_size, - no_progress_bar=no_progress_bar, - ) - else: - raise RemoteActionNotImplemented("upload_multipart", self.scheme) + def upload_fobj(self, fobj, to_info, no_progress_bar=False): + raise RemoteActionNotImplemented("upload_fobj", self.scheme) def download( self, diff --git a/dvc/tree/local.py b/dvc/tree/local.py index bec9c89ef3..6934dd65b1 100644 --- a/dvc/tree/local.py +++ b/dvc/tree/local.py @@ -11,7 +11,14 @@ from dvc.scheme import Schemes from dvc.system import System from dvc.utils import file_md5, is_exec, tmp_fname -from dvc.utils.fs import copy_fobj_to_file, copyfile, makedirs, move, remove +from dvc.utils.fs import ( + copy_fobj_to_file, + copy_fobj_to_fobj, + copyfile, + makedirs, + move, + remove, +) from .base import BaseTree @@ -282,6 +289,20 @@ def _upload( self.chmod(tmp_file, file_mode) os.replace(tmp_file, to_info) + def upload_fobj(self, fobj, to_info, no_progress_bar=False): + self.makedirs(to_info.parent) + tmp_info = to_info.parent / tmp_fname("") + try: + with open(tmp_info, mode="wb") as fdest: + copy_fobj_to_fobj( + fobj, fdest, self.CHUNK_SIZE, no_progress_bar + ) + os.chmod(tmp_info, self.file_mode) + os.rename(tmp_info, to_info) + except Exception: + os.remove(tmp_info) + raise + @staticmethod def _download( from_info, to_file, name=None, no_progress_bar=False, **_kwargs diff --git a/dvc/tree/s3.py b/dvc/tree/s3.py index b724e5e324..1f634eb5e7 100644 --- a/dvc/tree/s3.py +++ b/dvc/tree/s3.py @@ -362,21 +362,19 @@ def _upload( from_file, Callback=pbar.update, ExtraArgs=self.extra_args, ) - def _upload_multipart( - self, stream, to_info, chunk_size, no_progress_bar=False - ): + def upload_fobj(self, fobj, to_info, no_progress_bar=False): from boto3.s3.transfer import TransferConfig config = TransferConfig( - multipart_threshold=chunk_size, - multipart_chunksize=chunk_size, + multipart_threshold=self.CHUNK_SIZE, + multipart_chunksize=self.CHUNK_SIZE, max_concurrency=1, use_threads=False, ) with self._get_s3() as s3: with Tqdm(disable=no_progress_bar, bytes=True) as pbar: s3.meta.client.upload_fileobj( - stream, + fobj, to_info.bucket, to_info.path, Config=config, diff --git a/dvc/tree/ssh/__init__.py b/dvc/tree/ssh/__init__.py index 94dbc47447..e4b48b9ce4 100644 --- a/dvc/tree/ssh/__init__.py +++ b/dvc/tree/ssh/__init__.py @@ -10,8 +10,8 @@ import dvc.prompt as prompt from dvc.hash_info import HashInfo -from dvc.progress import Tqdm from dvc.scheme import Schemes +from dvc.utils.fs import copy_fobj_to_fobj from ..base import BaseTree from ..pool import get_connection @@ -261,17 +261,9 @@ def _download(self, from_info, to_file, name=None, no_progress_bar=False): no_progress_bar=no_progress_bar, ) - def _upload_multipart( - self, stream, to_info, chunk_size, no_progress_bar=False - ): - with self.ssh(to_info) as ssh: - with Tqdm(disable=no_progress_bar, bytes=True) 
as pbar: - while True: - data = stream.read(chunk_size) - if not data: - break - ssh.write(data) - pbar.update(len(data)) + def upload_fobj(self, fobj, to_info, no_progress_bar=False): + with self.open(to_info, mode="wb") as fdest: + copy_fobj_to_fobj(fobj, fdest, self.CHUNK_SIZE, no_progress_bar) def _upload( self, from_file, to_info, name=None, no_progress_bar=False, **_kwargs diff --git a/dvc/utils/fs.py b/dvc/utils/fs.py index 4e37601105..f1058ea121 100644 --- a/dvc/utils/fs.py +++ b/dvc/utils/fs.py @@ -6,6 +6,7 @@ import sys from dvc.exceptions import DvcException, FileOwnershipError +from dvc.progress import Tqdm from dvc.system import System from dvc.utils import dict_md5 @@ -174,8 +175,6 @@ def makedirs(path, exist_ok=False, mode=None): def copyfile(src, dest, no_progress_bar=False, name=None): """Copy file with progress bar""" - from dvc.progress import Tqdm - name = name if name else os.path.basename(dest) total = os.stat(src).st_size @@ -207,6 +206,16 @@ def copy_fobj_to_file(fsrc, dest): shutil.copyfileobj(fsrc, fdest) +def copy_fobj_to_fobj(fsrc, fdest, chunk_size=-1, no_progress_bar=False): + with Tqdm(disable=no_progress_bar, bytes=True) as pbar: + while True: + data = fsrc.read(chunk_size) + if not data: + break + fdest.write(data) + pbar.update(len(data)) + + def walk_files(directory): for root, _, files in os.walk(directory): for f in files: diff --git a/dvc/utils/stream.py b/dvc/utils/stream.py index 53090d7484..fa458e32a7 100644 --- a/dvc/utils/stream.py +++ b/dvc/utils/stream.py @@ -67,11 +67,13 @@ class HashedIterStream: def __init__(self, fobj): self.md5 = hashlib.md5() - self.fobj = fobj + self.reader = fobj.read1 if hasattr(fobj, "read1") else fobj.read self.is_text_file = None + self.total = 0 def read(self, n=-1): - chunk = self.fobj.read1(n) + chunk = self.reader(n) + self.total += len(chunk) if self.is_text_file is None: self.is_text_file = istextblock(chunk) diff --git a/tests/func/test_s3.py b/tests/func/test_s3.py index af67bac25a..67b431d85f 100644 --- a/tests/func/test_s3.py +++ b/tests/func/test_s3.py @@ -131,7 +131,7 @@ def test_multipart_upload(dvc): tree = S3Tree(dvc, {"url": str(to_info)}) stream = io.BytesIO(b"data") - tree._upload_multipart(stream, to_info, 1) + tree.upload_fobj(stream, to_info, 1) assert tree.exists(to_info) with tree.open(to_info) as stream: From bacc198abc9a9485dc91c0fbeed03ff7df9e03d3 Mon Sep 17 00:00:00 2001 From: Batuhan Taskaya Date: Thu, 7 Jan 2021 14:53:01 +0300 Subject: [PATCH 21/36] Tests for utils --- dvc/cache/base.py | 4 ++-- dvc/data_cloud.py | 2 ++ dvc/utils/fs.py | 5 ++++- dvc/utils/stream.py | 6 ++---- tests/unit/utils/test_fs.py | 11 +++++++++++ tests/unit/utils/test_stream.py | 31 +++++++++++++++++++++++++++++++ 6 files changed, 52 insertions(+), 7 deletions(-) create mode 100644 tests/unit/utils/test_stream.py diff --git a/dvc/cache/base.py b/dvc/cache/base.py index ef946b7838..f677a1905c 100644 --- a/dvc/cache/base.py +++ b/dvc/cache/base.py @@ -24,7 +24,7 @@ from dvc.remote.slow_link_detection import ( # type: ignore[attr-defined] slow_link_guard, ) -from dvc.utils.stream import HashedIterStream +from dvc.utils.stream import HashedStreamReader from ..tree.base import RemoteActionNotImplemented @@ -270,7 +270,7 @@ def _transfer_file_as_chunked(self, from_tree, from_info): with from_tree.open( from_info, mode="rb", chunk_size=from_tree.CHUNK_SIZE ) as stream: - wrapped_file = HashedIterStream(stream) + wrapped_file = HashedStreamReader(stream) # Since we don't know the hash beforehand, we'll # upload it to a 
temporary location and then move # it. diff --git a/dvc/data_cloud.py b/dvc/data_cloud.py index 9245788325..9ad105a4c4 100644 --- a/dvc/data_cloud.py +++ b/dvc/data_cloud.py @@ -93,6 +93,8 @@ def pull( ) def transfer(self, source, jobs=None, remote=None, command=None): + from dvc.tree import get_cloud_tree + from_tree = get_cloud_tree(self.repo, url=source) remote = self.get_remote(remote, command) return remote.transfer(from_tree, jobs=jobs) diff --git a/dvc/utils/fs.py b/dvc/utils/fs.py index f1058ea121..568ca2e6c6 100644 --- a/dvc/utils/fs.py +++ b/dvc/utils/fs.py @@ -6,7 +6,6 @@ import sys from dvc.exceptions import DvcException, FileOwnershipError -from dvc.progress import Tqdm from dvc.system import System from dvc.utils import dict_md5 @@ -175,6 +174,8 @@ def makedirs(path, exist_ok=False, mode=None): def copyfile(src, dest, no_progress_bar=False, name=None): """Copy file with progress bar""" + from dvc.progress import Tqdm + name = name if name else os.path.basename(dest) total = os.stat(src).st_size @@ -207,6 +208,8 @@ def copy_fobj_to_file(fsrc, dest): def copy_fobj_to_fobj(fsrc, fdest, chunk_size=-1, no_progress_bar=False): + from dvc.progress import Tqdm + with Tqdm(disable=no_progress_bar, bytes=True) as pbar: while True: data = fsrc.read(chunk_size) diff --git a/dvc/utils/stream.py b/dvc/utils/stream.py index fa458e32a7..c93f2756da 100644 --- a/dvc/utils/stream.py +++ b/dvc/utils/stream.py @@ -61,19 +61,17 @@ def peek(self, n): return self.leftover[:n] -class HashedIterStream: +class HashedStreamReader: PARAM_CHECKSUM = "md5" def __init__(self, fobj): self.md5 = hashlib.md5() - self.reader = fobj.read1 if hasattr(fobj, "read1") else fobj.read self.is_text_file = None - self.total = 0 + self.reader = fobj.read1 if hasattr(fobj, "read1") else fobj.read def read(self, n=-1): chunk = self.reader(n) - self.total += len(chunk) if self.is_text_file is None: self.is_text_file = istextblock(chunk) diff --git a/tests/unit/utils/test_fs.py b/tests/unit/utils/test_fs.py index 2db42cc093..ff6205d526 100644 --- a/tests/unit/utils/test_fs.py +++ b/tests/unit/utils/test_fs.py @@ -14,6 +14,7 @@ BasePathNotInCheckedPathException, contains_symlink_up_to, copy_fobj_to_file, + copy_fobj_to_fobj, copyfile, get_inode, get_mtime_and_size, @@ -271,5 +272,15 @@ def test_copy_fobj_to_file(tmp_dir): assert filecmp.cmp(src, dest) +def test_copy_fobj_to_fobj(tmp_dir): + tmp_dir.gen({"foo": "foo"}) + src = tmp_dir / "foo" + dest = tmp_dir / "bar" + + with open(src, "rb") as fsrc, open(dest, "wb") as fdest: + copy_fobj_to_fobj(fsrc, fdest) + assert filecmp.cmp(src, dest) + + def test_walk_files(tmp_dir): assert list(walk_files(".")) == list(walk_files(tmp_dir)) diff --git a/tests/unit/utils/test_stream.py b/tests/unit/utils/test_stream.py new file mode 100644 index 0000000000..94ce01b1ff --- /dev/null +++ b/tests/unit/utils/test_stream.py @@ -0,0 +1,31 @@ +from dvc.utils import file_md5 +from dvc.utils.stream import HashedStreamReader + + +def test_hashed_stream_reader(tmp_dir): + tmp_dir.gen({"foo": "foo"}) + + foo = tmp_dir / "foo" + with open(foo, "rb") as fobj: + stream_reader = HashedStreamReader(fobj) + stream_reader.read() + + hex_digest, _ = file_md5(foo) + assert stream_reader.is_text_file + assert hex_digest == stream_reader.hash_info.value + + +def test_hashed_stream_reader_as_chunks(tmp_dir): + tmp_dir.gen({"foo": b"foo \x00" * 16}) + + foo = tmp_dir / "foo" + with open(foo, "rb") as fobj: + stream_reader = HashedStreamReader(fobj) + while True: + chunk = stream_reader.read(16) + if not chunk: + 
break + + hex_digest, _ = file_md5(foo) + assert not stream_reader.is_text_file + assert hex_digest == stream_reader.hash_info.value From 8a73d89ca4fec67671f48d14faf3537a2e6ef6f5 Mon Sep 17 00:00:00 2001 From: Batuhan Taskaya Date: Thu, 7 Jan 2021 15:29:41 +0300 Subject: [PATCH 22/36] Lazy imports on some places --- dvc/cache/base.py | 20 +++++++++++--------- dvc/utils/__init__.py | 2 +- tests/func/test_import_url.py | 2 -- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/dvc/cache/base.py b/dvc/cache/base.py index f677a1905c..eb2cac0ce1 100644 --- a/dvc/cache/base.py +++ b/dvc/cache/base.py @@ -12,7 +12,6 @@ from shortuuid import uuid import dvc.prompt as prompt -from dvc import tree as dvc_tree from dvc.dir_info import DirInfo from dvc.exceptions import ( CacheLinkError, @@ -24,7 +23,6 @@ from dvc.remote.slow_link_detection import ( # type: ignore[attr-defined] slow_link_guard, ) -from dvc.utils.stream import HashedStreamReader from ..tree.base import RemoteActionNotImplemented @@ -251,6 +249,8 @@ def _save_file(self, path_info, tree, hash_info, save_link=True, **kwargs): self.tree.state.save(cache_info, hash_info) def _transfer_file_as_whole(self, from_tree, from_info): + from dvc import tree as dvc_tree + # When we can't use the chunked upload, we have to first download # and then calculate the hash as if it were a local file and then # upload it. @@ -266,20 +266,22 @@ def _transfer_file_as_whole(self, from_tree, from_info): return hash_info def _transfer_file_as_chunked(self, from_tree, from_info): - tmp_info = self.tree.path_info / uuid() + from dvc.utils import tmp_fname + from dvc.utils.stream import HashedStreamReader + + tmp_info = self.tree.path_info / tmp_fname() with from_tree.open( from_info, mode="rb", chunk_size=from_tree.CHUNK_SIZE ) as stream: - wrapped_file = HashedStreamReader(stream) + stream_reader = HashedStreamReader(stream) # Since we don't know the hash beforehand, we'll # upload it to a temporary location and then move # it. 
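# (Illustrative sketch, not part of the patch: HashedStreamReader computes
# the md5 incrementally while the wrapped stream is consumed, mirroring
# tests/unit/utils/test_stream.py:
#     with open("foo", "rb") as fobj:
#         reader = HashedStreamReader(fobj)
#         while reader.read(16):
#             pass
#     assert reader.hash_info.value == file_md5("foo")[0]
# where file_md5 is dvc.utils.file_md5.)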
- self.tree.upload_fobj(wrapped_file, tmp_info) + self.tree.upload_fobj(stream_reader, tmp_info) - self.tree.move( - tmp_info, self.hash_to_path(wrapped_file.hash_info.value) - ) - return wrapped_file.hash_info + hash_info = stream_reader.hash_info + self.tree.move(tmp_info, self.hash_to_path(hash_info.value)) + return hash_info def transfer_file(self, from_tree, from_info): try: diff --git a/dvc/utils/__init__.py b/dvc/utils/__init__.py index f888b76b17..49122a53ae 100644 --- a/dvc/utils/__init__.py +++ b/dvc/utils/__init__.py @@ -226,7 +226,7 @@ def fix_env(env=None): return env -def tmp_fname(fname): +def tmp_fname(fname=""): """ Temporary name for a partial download """ from shortuuid import uuid diff --git a/tests/func/test_import_url.py b/tests/func/test_import_url.py index b4654ccbb7..0537b0fa5a 100644 --- a/tests/func/test_import_url.py +++ b/tests/func/test_import_url.py @@ -255,7 +255,6 @@ def test_import_url_preserve_meta(tmp_dir, dvc): pytest.lazy_fixture("s3"), pytest.lazy_fixture("gs"), pytest.lazy_fixture("hdfs"), - pytest.lazy_fixture("webhdfs"), pytest.param( pytest.lazy_fixture("ssh"), marks=pytest.mark.skipif( @@ -292,7 +291,6 @@ def test_import_url_straight_to_remote_single_file( pytest.lazy_fixture("s3"), pytest.lazy_fixture("gs"), pytest.lazy_fixture("hdfs"), - pytest.lazy_fixture("webhdfs"), pytest.param( pytest.lazy_fixture("ssh"), marks=pytest.mark.skipif( From c77a8dd782559f2dc9a095061ed4b46ac5fd4bfd Mon Sep 17 00:00:00 2001 From: Batuhan Taskaya Date: Fri, 8 Jan 2021 10:41:23 +0300 Subject: [PATCH 23/36] --to-remote migration (and repo._transfer) --- dvc/cache/base.py | 29 ++++++------ dvc/cache/local.py | 4 +- dvc/command/add.py | 15 +++--- dvc/command/imp_url.py | 16 +++---- dvc/repo/__init__.py | 1 + dvc/repo/_transfer.py | 73 ++++++++++++++++++++++++++++++ dvc/repo/add.py | 16 +++---- dvc/repo/imp_url.py | 36 ++++++++------- dvc/tree/base.py | 2 +- tests/func/test_add.py | 4 +- tests/func/test_import_url.py | 10 ++-- tests/unit/command/test_add.py | 18 ++++---- tests/unit/command/test_imp_url.py | 20 ++++---- 13 files changed, 151 insertions(+), 93 deletions(-) create mode 100644 dvc/repo/_transfer.py diff --git a/dvc/cache/base.py b/dvc/cache/base.py index eb2cac0ce1..35cce24a54 100644 --- a/dvc/cache/base.py +++ b/dvc/cache/base.py @@ -260,7 +260,8 @@ def _transfer_file_as_whole(self, from_tree, from_info): hash_info = temp_tree.get_file_hash(temp_tree.path_info) self.tree.upload( - temp_tree.path_info, self.hash_to_path(hash_info.value) + temp_tree.path_info, + self.tree.hash_to_path_info(hash_info.value), ) return hash_info @@ -280,7 +281,7 @@ def _transfer_file_as_chunked(self, from_tree, from_info): self.tree.upload_fobj(stream_reader, tmp_info) hash_info = stream_reader.hash_info - self.tree.move(tmp_info, self.hash_to_path(hash_info.value)) + self.tree.move(tmp_info, self.tree.hash_to_path_info(hash_info.value)) return hash_info def transfer_file(self, from_tree, from_info): @@ -291,17 +292,19 @@ def transfer_file(self, from_tree, from_info): return hash_info - def transfer_directory(self, from_tree, from_infos, func, jobs): + def transfer_directory(self, from_tree, from_info, jobs, callback=None): dir_info = DirInfo() with ThreadPoolExecutor(max_workers=jobs) as executor: futures = { executor.submit( - func, from_tree, from_info + self.transfer_file, from_tree, from_info ): from_info.relative_to(from_tree.path_info) - for from_info in from_infos + for from_info in from_tree.walk_files(from_info) } for future in as_completed(futures): 
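# (Illustration, not part of the patch: the trie below is keyed by the
# tuple of path parts relative to the source directory, so a transferred
# file "sub_dir/baz" is recorded as
#     dir_info.trie[("sub_dir", "baz")] = HashInfo("md5", "...")
# before _get_dir_info_hash serializes the whole directory entry.)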
dir_info.trie[futures[future].parts] = future.result()
+ if callback is not None:
+ callback()

 (
 hash_info,
 hash_path_info,
 ) = self.repo.cache.local._get_dir_info_hash( # noqa, pylint: disable=protected-access
 dir_info
 )
- self.tree.upload(hash_path_info, self.hash_to_path(hash_info.value))
+ self.tree.upload(
+ hash_path_info, self.tree.hash_to_path_info(hash_info.value)
+ )
 return hash_info

 def transfer(self, from_tree, jobs=None, no_progress_bar=False):
 jobs = jobs or min((from_tree.jobs, self.tree.jobs))
 from_info = from_tree.path_info

- if from_tree.isdir(from_info):
- from_infos = tuple(from_tree.walk_files(from_info))
- else:
- from_infos = (from_info,)
-
 with Tqdm(
- total=len(from_infos),
 desc="Transfering files to the remote",
 unit="Files",
 disable=no_progress_bar,
 ) as pbar:
- transfer_file = pbar.wrap_fn(self.transfer_file)
 if from_tree.isdir(from_info):
 hash_info = self.transfer_directory(
- from_tree, from_infos, transfer_file, jobs=jobs
+ from_tree, from_info, jobs=jobs, callback=pbar.update
 )
 else:
- hash_info = transfer_file(from_tree, from_info)
+ hash_info = self.transfer_file(from_tree, from_info)
+ pbar.update(1)

 return hash_info
diff --git a/dvc/cache/local.py b/dvc/cache/local.py
index d8ffa61a74..14d76cd898 100644
--- a/dvc/cache/local.py
+++ b/dvc/cache/local.py
@@ -40,9 +40,7 @@ def hash_to_path(self, hash_):
 # NOTE: `self.cache_path` is already normalized so we can simply use
 # `os.sep` instead of `os.path.join`. This results in this helper
 # being ~5.5 times faster.
- return PathInfo(
- f"{self.cache_path}{os.sep}{hash_[0:2]}{os.sep}{hash_[2:]}"
- )
+ return f"{self.cache_path}{os.sep}{hash_[0:2]}{os.sep}{hash_[2:]}"

 def hashes_exist(
 self, hashes, jobs=None, name=None
diff --git a/dvc/command/add.py b/dvc/command/add.py
index 795a926ece..2da6d03ff8 100644
--- a/dvc/command/add.py
+++ b/dvc/command/add.py
@@ -18,13 +18,11 @@ def run(self):
 if len(self.args.targets) > 1 and self.args.file:
 raise RecursiveAddingWhileUsingFilename()

- if len(self.args.targets) > 1 and self.args.straight_to_remote:
- logger.error(
- "--straight-to-remote can't used with multiple targets"
- )
+ if len(self.args.targets) > 1 and self.args.to_remote:
+ logger.error("--to-remote can't be used with multiple targets")
 return 1

- if not self.args.straight_to_remote:
+ if not self.args.to_remote:
 if self.args.remote:
 command = "--remote"
 elif self.args.out:
@@ -34,8 +32,7 @@ def run(self):

 if command is not None:
 logger.error(
- f"--{command} can't be used without"
- " --straight-to-remote"
+ f"--{command} can't be used without" " --to-remote"
 )
 return 1

@@ -49,7 +46,7 @@ def run(self):
 desc=self.args.desc,
 out=self.args.out,
 remote=self.args.remote,
- straight_to_remote=self.args.straight_to_remote,
+ to_remote=self.args.to_remote,
 )

 except DvcException:
@@ -106,7 +103,7 @@ def add_parser(subparsers, parent_parser):
 metavar="",
 )
 parser.add_argument(
- "--straight-to-remote",
+ "--to-remote",
 action="store_true",
 default=False,
 help="Download it directly to the remote",
diff --git a/dvc/command/imp_url.py b/dvc/command/imp_url.py
index 7d9b667911..8cb8c612bd 100644
--- a/dvc/command/imp_url.py
+++ b/dvc/command/imp_url.py
@@ -10,16 +10,12 @@

 class CmdImportUrl(CmdBase):
 def run(self):
- if self.args.no_exec and self.args.straight_to_remote:
- logger.error(
- "--no-exec can't be combined with --straight-to-remote"
- )
+ if self.args.no_exec and self.args.to_remote:
+ logger.error("--no-exec can't be combined with 
--to-remote") return 1 - if self.args.remote and not self.args.straight_to_remote: - logger.error( - "--remote can't be used alone without --straight-to-remote" - ) + if self.args.remote and not self.args.to_remote: + logger.error("--remote can't be used alone without --to-remote") return 1 try: @@ -29,7 +25,7 @@ def run(self): fname=self.args.file, no_exec=self.args.no_exec, remote=self.args.remote, - straight_to_remote=self.args.straight_to_remote, + to_remote=self.args.to_remote, desc=self.args.desc, ) except DvcException: @@ -83,7 +79,7 @@ def add_parser(subparsers, parent_parser): help="Only create DVC-file without actually downloading it.", ) import_parser.add_argument( - "--straight-to-remote", + "--to-remote", action="store_true", default=False, help="Download it directly to the remote", diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index 4b64fd0cda..5bb557d412 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -54,6 +54,7 @@ def wrapper(repo, *args, **kwargs): class Repo: DVC_DIR = ".dvc" + from dvc.repo._transfer import _transfer from dvc.repo.add import add from dvc.repo.brancher import brancher from dvc.repo.checkout import checkout diff --git a/dvc/repo/_transfer.py b/dvc/repo/_transfer.py new file mode 100644 index 0000000000..e378ab3a16 --- /dev/null +++ b/dvc/repo/_transfer.py @@ -0,0 +1,73 @@ +import os + +from dvc.repo.scm_context import scm_context +from dvc.utils import relpath, resolve_output, resolve_paths +from dvc.utils.fs import path_isin + +from ..exceptions import OutputDuplicationError +from . import locked + + +@locked +@scm_context +def _transfer( + self, + url, + command, + out=None, + fname=None, + erepo=None, + frozen=True, + remote=None, + desc=None, + jobs=None, +): + from dvc.dvcfile import Dvcfile + from dvc.stage import Stage, create_stage, restore_meta + + out = resolve_output(url, out) + path, wdir, out = resolve_paths(self, out) + + # NOTE: when user is transfering something from within their own repository + if ( + erepo is None + and os.path.exists(url) + and path_isin(os.path.abspath(url), self.root_dir) + ): + url = relpath(url, wdir) + + deps = [] + if command == "import-url": + deps.append(url) + + stage = create_stage( + Stage, + self, + fname or path, + wdir=wdir, + deps=deps, + outs=[out], + erepo=erepo, + ) + restore_meta(stage) + + if stage.can_be_skipped: + return None + if desc: + stage.outs[0].desc = desc + + dvcfile = Dvcfile(self, stage.path) + dvcfile.remove() + + try: + self.check_modified_graph([stage]) + except OutputDuplicationError as exc: + raise OutputDuplicationError(exc.output, set(exc.stages) - {stage}) + + stage.frozen = frozen + stage.outs[0].hash_info = self.cloud.transfer( + url, jobs=jobs, remote=remote, command=command + ) + + dvcfile.dump(stage) + return stage diff --git a/dvc/repo/add.py b/dvc/repo/add.py index db72c9f083..802fd46404 100644 --- a/dvc/repo/add.py +++ b/dvc/repo/add.py @@ -30,7 +30,7 @@ def add( recursive=False, no_commit=False, fname=None, - straight_to_remote=False, + to_remote=False, out=None, remote=None, **kwargs, @@ -42,21 +42,19 @@ def add( targets = ensure_list(targets) - if straight_to_remote: + if to_remote: if len(targets) != 1: raise ValueError( "Can't use straigh_to_remote option with multiple targets" ) - return repo.imp_url( - url=targets[0], + return repo._transfer( # pylint: disable=protected-access + targets[0], + "add", out=out, - remote=remote, - desc=kwargs.get("desc"), fname=kwargs.get("fname"), - command="add", - track_remote_url=False, - 
straight_to_remote=True, + desc=kwargs.get("desc"), + remote=remote, ) link_failures = [] diff --git a/dvc/repo/imp_url.py b/dvc/repo/imp_url.py index 2ab5aee786..8360b335e3 100644 --- a/dvc/repo/imp_url.py +++ b/dvc/repo/imp_url.py @@ -19,15 +19,26 @@ def imp_url( frozen=True, no_exec=False, remote=None, - track_remote_url=True, - straight_to_remote=False, + to_remote=False, desc=None, jobs=None, - command="import-url", ): from dvc.dvcfile import Dvcfile from dvc.stage import Stage, create_stage, restore_meta + if to_remote: + return self._transfer( # pylint: disable=protected-access + url, + "import-url", + out=out, + fname=fname, + erepo=erepo, + frozen=frozen, + remote=remote, + desc=desc, + jobs=jobs, + ) + out = resolve_output(url, out) path, wdir, out = resolve_paths(self, out) @@ -39,16 +50,12 @@ def imp_url( ): url = relpath(url, wdir) - deps = [url] - if not track_remote_url: - deps.clear() - stage = create_stage( Stage, self, fname or path, wdir=wdir, - deps=deps, + deps=[url], outs=[out], erepo=erepo, ) @@ -63,18 +70,13 @@ def imp_url( dvcfile = Dvcfile(self, stage.path) dvcfile.remove() - if not straight_to_remote: - try: - self.check_modified_graph([stage]) - except OutputDuplicationError as exc: - raise OutputDuplicationError(exc.output, set(exc.stages) - {stage}) + try: + self.check_modified_graph([stage]) + except OutputDuplicationError as exc: + raise OutputDuplicationError(exc.output, set(exc.stages) - {stage}) if no_exec: stage.ignore_outs() - elif straight_to_remote: - stage.outs[0].hash_info = self.cloud.transfer( - url, remote=remote, command=command - ) else: stage.run(jobs=jobs) diff --git a/dvc/tree/base.py b/dvc/tree/base.py index 7503b2576d..6b5d2bf858 100644 --- a/dvc/tree/base.py +++ b/dvc/tree/base.py @@ -59,7 +59,7 @@ class BaseTree: TRAVERSE_PREFIX_LEN = 3 TRAVERSE_THRESHOLD_SIZE = 500000 CAN_TRAVERSE = True - CHUNK_SIZE = 512 * 1024 * 1024 + CHUNK_SIZE = 64 * 1024 * 1024 SHARED_MODE_MAP = {None: (None, None), "group": (None, None)} PARAM_CHECKSUM: ClassVar[Optional[str]] = None diff --git a/tests/func/test_add.py b/tests/func/test_add.py index b2e34430c9..9ad4be7112 100644 --- a/tests/func/test_add.py +++ b/tests/func/test_add.py @@ -993,11 +993,11 @@ def test_add_long_fname(tmp_dir, dvc): assert (tmp_dir / "data").read_text() == {name: "foo"} -def test_add_straight_to_remote(tmp_dir, dvc, local_cloud, local_remote): +def test_add_to_remote(tmp_dir, dvc, local_cloud, local_remote): local_cloud.gen("foo", "foo") url = "remote://upstream/foo" - stage = dvc.add(url, straight_to_remote=True) + stage = dvc.add(url, to_remote=True) assert not (tmp_dir / "foo").exists() assert (tmp_dir / "foo.dvc").exists() diff --git a/tests/func/test_import_url.py b/tests/func/test_import_url.py index 0537b0fa5a..84888448a0 100644 --- a/tests/func/test_import_url.py +++ b/tests/func/test_import_url.py @@ -265,13 +265,13 @@ def test_import_url_preserve_meta(tmp_dir, dvc): ], indirect=True, ) -def test_import_url_straight_to_remote_single_file( +def test_import_url_to_remote_single_file( tmp_dir, dvc, workspace, local_remote ): workspace.gen("foo", "foo") url = "remote://workspace/foo" - stage = dvc.imp_url(url, straight_to_remote=True) + stage = dvc.imp_url(url, to_remote=True) assert not (tmp_dir / "foo").exists() assert (tmp_dir / "foo.dvc").exists() @@ -300,9 +300,7 @@ def test_import_url_straight_to_remote_single_file( ], indirect=True, ) -def test_import_url_straight_to_remote_directory( - tmp_dir, dvc, workspace, local_remote -): +def 
test_import_url_to_remote_directory(tmp_dir, dvc, workspace, local_remote):
 workspace.gen(
 {
 "data": {
 "foo": "foo",
 "bar": "bar",
 "sub_dir": {"baz": "sub_dir/baz"},
 }
 }
 )

 url = "remote://workspace/data"
- stage = dvc.imp_url(url, straight_to_remote=True)
+ stage = dvc.imp_url(url, to_remote=True)

 assert not (tmp_dir / "data").exists()
 assert (tmp_dir / "data.dvc").exists()
diff --git a/tests/unit/command/test_add.py b/tests/unit/command/test_add.py
index 6ca6e86230..4278c259b6 100644
--- a/tests/unit/command/test_add.py
+++ b/tests/unit/command/test_add.py
@@ -35,17 +35,17 @@ def test_add(mocker, dvc):
 external=True,
 out=None,
 remote=None,
- straight_to_remote=False,
+ to_remote=False,
 desc="stage description",
 )

-def test_add_straight_to_remote(mocker):
+def test_add_to_remote(mocker):
 cli_args = parse_args(
 [
 "add",
 "s3://bucket/foo",
- "--straight-to-remote",
+ "--to-remote",
 "--out",
 "bar",
 "--remote",
 "remote",
 ]
 )
@@ -68,21 +68,21 @@ def test_add_straight_to_remote(mocker):
 out="bar",
 remote="remote",
- straight_to_remote=True,
+ to_remote=True,
 desc=None,
 )

-def test_add_straight_to_remote_invalid_combinations(mocker, caplog):
+def test_add_to_remote_invalid_combinations(mocker, caplog):
 cli_args = parse_args(
- ["add", "s3://bucket/foo", "s3://bucket/bar", "--straight-to-remote"]
+ ["add", "s3://bucket/foo", "s3://bucket/bar", "--to-remote"]
 )
 assert cli_args.func == CmdAdd

 cmd = cli_args.func(cli_args)
 with caplog.at_level(logging.ERROR, logger="dvc"):
 assert cmd.run() == 1
- expected_msg = "--straight-to-remote can't used with multiple targets"
+ expected_msg = "--to-remote can't be used with multiple targets"
 assert expected_msg in caplog.text

 for option in "--remote", "--out":
 cli_args = parse_args(["add", "foo", option, "bar"])

 cmd = cli_args.func(cli_args)
 with caplog.at_level(logging.ERROR, logger="dvc"):
 assert cmd.run() == 1
- expected_msg = (
- f"--{option} can't be used without --straight-to-remote"
- )
+ expected_msg = f"--{option} can't be used without --to-remote"
 assert expected_msg in caplog.text
diff --git a/tests/unit/command/test_imp_url.py b/tests/unit/command/test_imp_url.py
index 979690ca30..0288bd6d8e 100644
--- a/tests/unit/command/test_imp_url.py
+++ b/tests/unit/command/test_imp_url.py
@@ -22,7 +22,7 @@ def test_import_url(mocker):
 fname="file",
 no_exec=False,
 remote=None,
- straight_to_remote=False,
+ to_remote=False,
 desc="description",
 )
@@ -68,18 +68,18 @@ def test_import_url_no_exec(mocker):
 fname="file",
 no_exec=True,
 remote=None,
- straight_to_remote=False,
+ to_remote=False,
 desc="description",
 )

-def test_import_url_straight_to_remote(mocker):
+def test_import_url_to_remote(mocker):
 cli_args = parse_args(
 [
 "import-url",
 "s3://bucket/foo",
 "bar",
- "--straight-to-remote",
+ "--to-remote",
 "--remote",
 "remote",
 "--desc",
 "description",
 ]
 )
@@ -99,18 +99,18 @@ def test_import_url_straight_to_remote(mocker):
 fname=None,
 no_exec=False,
 remote="remote",
- straight_to_remote=True,
+ to_remote=True,
 desc="description",
 )

-def test_import_url_straight_to_remote_invalid_combination(mocker, caplog):
+def test_import_url_to_remote_invalid_combination(mocker, caplog):
 cli_args = parse_args(
 [
 "import-url",
 "s3://bucket/foo",
 "bar",
- "--straight-to-remote",
+ "--to-remote",
 "--remote",
 "remote",
 "--no-exec",
 ]
 )
 assert cli_args.func == CmdImportUrl

 cmd = cli_args.func(cli_args)
 with caplog.at_level(logging.ERROR, logger="dvc"):
 assert cmd.run() == 1
- expected_msg = "--no-exec can't be 
combined with --straight-to-remote" + expected_msg = "--no-exec can't be combined with --to-remote" assert expected_msg in caplog.text cli_args = parse_args( @@ -131,7 +131,5 @@ def test_import_url_straight_to_remote_invalid_combination(mocker, caplog): cmd = cli_args.func(cli_args) with caplog.at_level(logging.ERROR, logger="dvc"): assert cmd.run() == 1 - expected_msg = ( - "--remote can't be used alone without --straight-to-remote" - ) + expected_msg = "--remote can't be used alone without --to-remote" assert expected_msg in caplog.text From 3e90a66455c7f0a8e138dba7d598a3f51126f878 Mon Sep 17 00:00:00 2001 From: Batuhan Taskaya Date: Fri, 8 Jan 2021 12:22:45 +0300 Subject: [PATCH 24/36] Use proper temporary location, more efficient pbar calc --- dvc/cache/base.py | 35 +++++++++++++++++++---------------- dvc/repo/imp_url.py | 1 - dvc/tree/base.py | 6 ++---- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/dvc/cache/base.py b/dvc/cache/base.py index 35cce24a54..e742993562 100644 --- a/dvc/cache/base.py +++ b/dvc/cache/base.py @@ -249,22 +249,21 @@ def _save_file(self, path_info, tree, hash_info, save_link=True, **kwargs): self.tree.state.save(cache_info, hash_info) def _transfer_file_as_whole(self, from_tree, from_info): - from dvc import tree as dvc_tree + from dvc.utils import tmp_fname # When we can't use the chunked upload, we have to first download # and then calculate the hash as if it were a local file and then # upload it. - with tempfile.NamedTemporaryFile() as temp_file: - temp_tree = dvc_tree.get_cloud_tree(self.repo, url=temp_file.name) - from_tree.download(from_info, temp_tree.path_info) - - hash_info = temp_tree.get_file_hash(temp_tree.path_info) - self.tree.upload( - temp_tree.path_info, - self.tree.hash_to_path_info(hash_info.value), - ) + local_tree = self.repo.cache.local.tree + local_info = local_tree.path_info / tmp_fname() - return hash_info + from_tree.download(from_info, local_info) + hash_info = local_tree.get_file_hash(local_info) + + self.tree.upload( + local_info, self.tree.hash_to_path_info(hash_info.value), + ) + return hash_info def _transfer_file_as_chunked(self, from_tree, from_info): from dvc.utils import tmp_fname @@ -292,7 +291,7 @@ def transfer_file(self, from_tree, from_info): return hash_info - def transfer_directory(self, from_tree, from_info, jobs, callback=None): + def transfer_directory(self, from_tree, from_info, jobs, pbar=None): dir_info = DirInfo() with ThreadPoolExecutor(max_workers=jobs) as executor: futures = { @@ -301,10 +300,13 @@ def transfer_directory(self, from_tree, from_info, jobs, callback=None): ): from_info.relative_to(from_tree.path_info) for from_info in from_tree.walk_files(from_info) } + if pbar is not None: + pbar.total = len(futures) + for future in as_completed(futures): dir_info.trie[futures[future].parts] = future.result() - if callback is not None: - callback() + if pbar is not None: + pbar.update() ( hash_info, @@ -325,14 +327,15 @@ def transfer(self, from_tree, jobs=None, no_progress_bar=False): desc="Transfering files to the remote", unit="Files", disable=no_progress_bar, + total=1, ) as pbar: if from_tree.isdir(from_info): hash_info = self.transfer_directory( - from_tree, from_info, jobs=jobs, callback=pbar.update + from_tree, from_info, jobs=jobs, pbar=pbar ) else: hash_info = self.transfer_file(from_tree, from_info) - pbar.update(1) + pbar.update() return hash_info diff --git a/dvc/repo/imp_url.py b/dvc/repo/imp_url.py index 8360b335e3..e5caf958e6 100644 --- a/dvc/repo/imp_url.py +++ 
b/dvc/repo/imp_url.py @@ -59,7 +59,6 @@ def imp_url( outs=[out], erepo=erepo, ) - restore_meta(stage) if stage.can_be_skipped: return None diff --git a/dvc/tree/base.py b/dvc/tree/base.py index 6b5d2bf858..fc0d924ce6 100644 --- a/dvc/tree/base.py +++ b/dvc/tree/base.py @@ -169,14 +169,12 @@ def dir_mode(self): def cache(self): return getattr(self.repo.cache, self.scheme) - def open( - self, path_info, mode: str = "r", encoding: str = None, **kwargs, - ): + def open(self, path_info, mode: str = "r", encoding: str = None, **kwargs): if hasattr(self, "_generate_download_url"): # pylint:disable=no-member func = self._generate_download_url # type: ignore[attr-defined] get_url = partial(func, path_info) - return open_url(get_url, mode=mode, encoding=encoding, **kwargs,) + return open_url(get_url, mode=mode, encoding=encoding, **kwargs) raise RemoteActionNotImplemented("open", self.scheme) From 859d41f86deee43691a3cd43c059c52aa7a0aa26 Mon Sep 17 00:00:00 2001 From: Batuhan Taskaya Date: Fri, 8 Jan 2021 18:43:20 +0300 Subject: [PATCH 25/36] Directory transfer logic rewrite --- dvc/cache/base.py | 94 +++++++++++++++++++++++++++------------------- dvc/data_cloud.py | 13 ++++++- dvc/remote/base.py | 4 +- dvc/tree/base.py | 2 +- 4 files changed, 70 insertions(+), 43 deletions(-) diff --git a/dvc/cache/base.py b/dvc/cache/base.py index e742993562..3cbf543b5e 100644 --- a/dvc/cache/base.py +++ b/dvc/cache/base.py @@ -4,7 +4,8 @@ import logging import os import tempfile -from concurrent.futures import ThreadPoolExecutor, as_completed +from concurrent import futures +from concurrent.futures import ThreadPoolExecutor from copy import copy from typing import Optional @@ -283,7 +284,7 @@ def _transfer_file_as_chunked(self, from_tree, from_info): self.tree.move(tmp_info, self.tree.hash_to_path_info(hash_info.value)) return hash_info - def transfer_file(self, from_tree, from_info): + def _transfer_file(self, from_tree, from_info): try: hash_info = self._transfer_file_as_chunked(from_tree, from_info) except RemoteActionNotImplemented: @@ -291,53 +292,66 @@ def transfer_file(self, from_tree, from_info): return hash_info - def transfer_directory(self, from_tree, from_info, jobs, pbar=None): - dir_info = DirInfo() + def _transfer_directory_contents(self, from_tree, from_info, jobs, pbar): + from_infos = from_tree.walk_files(from_info) + + rel_path_infos = {} + + def create_futures(executor, amount): + for entry_info in itertools.islice(from_infos, amount): + pbar.total += 1 + task = executor.submit( + pbar.wrap_fn(self._transfer_file), from_tree, entry_info + ) + rel_path_infos[task] = entry_info.relative_to(from_info) + yield task + + pbar.total = 0 with ThreadPoolExecutor(max_workers=jobs) as executor: - futures = { - executor.submit( - self.transfer_file, from_tree, from_info - ): from_info.relative_to(from_tree.path_info) - for from_info in from_tree.walk_files(from_info) - } - if pbar is not None: - pbar.total = len(futures) - - for future in as_completed(futures): - dir_info.trie[futures[future].parts] = future.result() - if pbar is not None: - pbar.update() + tasks = set(create_futures(executor, jobs * 5)) + + while tasks: + done, tasks = futures.wait( + tasks, return_when=futures.FIRST_COMPLETED + ) + tasks.update(create_futures(executor, len(done))) + for task in done: + yield rel_path_infos.pop(task), task.result() + + def _transfer_directory( + self, from_tree, from_info, jobs, no_progress_bar=False + ): + dir_info = DirInfo() + + with Tqdm(total=1, unit="file", disable=no_progress_bar) as pbar: + for 
entry_info, entry_hash in self._transfer_directory_contents(
                from_tree, from_info, jobs, pbar
            ):
                dir_info.trie[entry_info.parts] = entry_hash
 
+        local_cache = self.repo.cache.local
         (
             hash_info,
-            hash_path_info,
+            to_info,
-        ) = self.repo.cache.local._get_dir_info_hash(  # noqa, pylint: disable=protected-access
+        ) = local_cache._get_dir_info_hash(  # pylint: disable=protected-access
             dir_info
         )
-        self.tree.upload(
-            hash_path_info, self.tree.hash_to_path_info(hash_info.value)
-        )
+
+        self.tree.upload(to_info, self.tree.hash_to_path_info(hash_info.value))
 
         return hash_info
 
-    def transfer(self, from_tree, jobs=None, no_progress_bar=False):
+    def transfer(self, from_tree, from_info, jobs=None, no_progress_bar=False):
         jobs = jobs or min((from_tree.jobs, self.tree.jobs))
 
-        from_info = from_tree.path_info
-
-        with Tqdm(
-            desc="Transfering files to the remote",
-            unit="Files",
-            disable=no_progress_bar,
-            total=1,
-        ) as pbar:
-            if from_tree.isdir(from_info):
-                hash_info = self.transfer_directory(
-                    from_tree, from_info, jobs=jobs, pbar=pbar
-                )
-            else:
-                hash_info = self.transfer_file(from_tree, from_info)
-            pbar.update()
-        return hash_info
+        if from_tree.isdir(from_info):
+            return self._transfer_directory(
+                from_tree,
+                from_info,
+                jobs=jobs,
+                no_progress_bar=no_progress_bar,
+            )
+        else:
+            return self._transfer_file(from_tree, from_info)
 
     def _cache_is_copy(self, path_info):
         """Checks whether cache uses copies."""
@@ -362,6 +376,8 @@ def _cache_is_copy(self, path_info):
         return self.cache_types[0] == "copy"
 
     def _get_dir_info_hash(self, dir_info):
+        import tempfile
+
         from dvc.path_info import PathInfo
         from dvc.utils import tmp_fname
 
diff --git a/dvc/data_cloud.py b/dvc/data_cloud.py
index 9ad105a4c4..93539d7621 100644
--- a/dvc/data_cloud.py
+++ b/dvc/data_cloud.py
@@ -93,11 +93,22 @@ def pull(
         )
 
     def transfer(self, source, jobs=None, remote=None, command=None):
+        """Transfer data items in a cloud-agnostic way.
+
+        Args:
+            source (str): URL of the source location.
+            jobs (int): number of jobs that can be running simultaneously.
+            remote (dvc.remote.base.BaseRemote): optional remote to compare
+                cache to. By default remote from core.remote config option
+                is used.
+            command (str): the name of the command that is using this
+                function (used for reporting better error messages).
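+
+        Example (an illustrative sketch; the URL and remote name below
+        are made up):
+
+            repo.cloud.transfer(
+                "s3://bucket/path",
+                jobs=4,
+                remote="upstream",
+                command="import-url",
+            )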
+ """ from dvc.tree import get_cloud_tree from_tree = get_cloud_tree(self.repo, url=source) remote = self.get_remote(remote, command) - return remote.transfer(from_tree, jobs=jobs) + return remote.transfer(from_tree, from_tree.path_info, jobs=jobs) def status( self, diff --git a/dvc/remote/base.py b/dvc/remote/base.py index d8c78d69e7..773cc53e2a 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -472,9 +472,9 @@ def pull(self, cache, named_cache, jobs=None, show_checksums=False): return ret - def transfer(self, from_tree, jobs=None, no_progress_bar=False): + def transfer(self, from_tree, from_info, jobs=None, no_progress_bar=False): return self.cache.transfer( - from_tree, jobs=jobs, no_progress_bar=no_progress_bar + from_tree, from_info, jobs=jobs, no_progress_bar=no_progress_bar ) @staticmethod diff --git a/dvc/tree/base.py b/dvc/tree/base.py index fc0d924ce6..9ce264b954 100644 --- a/dvc/tree/base.py +++ b/dvc/tree/base.py @@ -59,7 +59,7 @@ class BaseTree: TRAVERSE_PREFIX_LEN = 3 TRAVERSE_THRESHOLD_SIZE = 500000 CAN_TRAVERSE = True - CHUNK_SIZE = 64 * 1024 * 1024 + CHUNK_SIZE = 64 * 1024 * 1024 # 64 MiB SHARED_MODE_MAP = {None: (None, None), "group": (None, None)} PARAM_CHECKSUM: ClassVar[Optional[str]] = None From e0902f6c8f870fc6ea6ad11a42169c1089cc3850 Mon Sep 17 00:00:00 2001 From: Batuhan Taskaya Date: Mon, 11 Jan 2021 13:04:09 +0300 Subject: [PATCH 26/36] Use shutil.copyfileobj() --- dvc/tree/base.py | 2 +- dvc/tree/local.py | 30 ++++++++---------------------- dvc/tree/ssh/__init__.py | 10 +++++++--- dvc/utils/fs.py | 16 ++-------------- tests/unit/utils/test_fs.py | 11 ----------- 5 files changed, 18 insertions(+), 51 deletions(-) diff --git a/dvc/tree/base.py b/dvc/tree/base.py index 9ce264b954..7b4ed1be3f 100644 --- a/dvc/tree/base.py +++ b/dvc/tree/base.py @@ -239,7 +239,7 @@ def move(self, from_info, to_info, mode=None): def copy(self, from_info, to_info): raise RemoteActionNotImplemented("copy", self.scheme) - def copy_fobj(self, fobj, to_info): + def copy_fobj(self, fobj, to_info, chunk_size=None): raise RemoteActionNotImplemented("copy_fobj", self.scheme) def symlink(self, from_info, to_info): diff --git a/dvc/tree/local.py b/dvc/tree/local.py index 6934dd65b1..b75255d480 100644 --- a/dvc/tree/local.py +++ b/dvc/tree/local.py @@ -11,14 +11,7 @@ from dvc.scheme import Schemes from dvc.system import System from dvc.utils import file_md5, is_exec, tmp_fname -from dvc.utils.fs import ( - copy_fobj_to_file, - copy_fobj_to_fobj, - copyfile, - makedirs, - move, - remove, -) +from dvc.utils.fs import copy_fobj_to_file, copyfile, makedirs, move, remove from .base import BaseTree @@ -187,11 +180,11 @@ def copy(self, from_info, to_info): self.remove(tmp_info) raise - def copy_fobj(self, fobj, to_info): + def copy_fobj(self, fobj, to_info, chunk_size=None): self.makedirs(to_info.parent) tmp_info = to_info.parent / tmp_fname("") try: - copy_fobj_to_file(fobj, tmp_info) + copy_fobj_to_file(fobj, tmp_info, chunk_size=chunk_size) os.chmod(tmp_info, self.file_mode) os.rename(tmp_info, to_info) except Exception: @@ -290,18 +283,11 @@ def _upload( os.replace(tmp_file, to_info) def upload_fobj(self, fobj, to_info, no_progress_bar=False): - self.makedirs(to_info.parent) - tmp_info = to_info.parent / tmp_fname("") - try: - with open(tmp_info, mode="wb") as fdest: - copy_fobj_to_fobj( - fobj, fdest, self.CHUNK_SIZE, no_progress_bar - ) - os.chmod(tmp_info, self.file_mode) - os.rename(tmp_info, to_info) - except Exception: - os.remove(tmp_info) - raise + from dvc.progress import Tqdm + + 
with Tqdm(bytes=True, disable=no_progress_bar) as pbar: + with pbar.wrapattr(fobj, "read") as fobj: + self.copy_fobj(fobj, to_info, chunk_size=self.CHUNK_SIZE) @staticmethod def _download( diff --git a/dvc/tree/ssh/__init__.py b/dvc/tree/ssh/__init__.py index e4b48b9ce4..d650c4b62f 100644 --- a/dvc/tree/ssh/__init__.py +++ b/dvc/tree/ssh/__init__.py @@ -2,6 +2,7 @@ import io import logging import os +import shutil import threading from contextlib import closing, contextmanager from urllib.parse import urlparse @@ -11,7 +12,6 @@ import dvc.prompt as prompt from dvc.hash_info import HashInfo from dvc.scheme import Schemes -from dvc.utils.fs import copy_fobj_to_fobj from ..base import BaseTree from ..pool import get_connection @@ -262,8 +262,12 @@ def _download(self, from_info, to_file, name=None, no_progress_bar=False): ) def upload_fobj(self, fobj, to_info, no_progress_bar=False): - with self.open(to_info, mode="wb") as fdest: - copy_fobj_to_fobj(fobj, fdest, self.CHUNK_SIZE, no_progress_bar) + from dvc.progress import Tqdm + + with Tqdm(bytes=True, disable=no_progress_bar) as pbar: + with pbar.wrapattr(fobj, "read") as fobj: + with self.open(to_info, mode="wb") as fdest: + shutil.copyfileobj(fobj, fdest, length=self.CHUNK_SIZE) def _upload( self, from_file, to_info, name=None, no_progress_bar=False, **_kwargs diff --git a/dvc/utils/fs.py b/dvc/utils/fs.py index 568ca2e6c6..578048434b 100644 --- a/dvc/utils/fs.py +++ b/dvc/utils/fs.py @@ -201,22 +201,10 @@ def copyfile(src, dest, no_progress_bar=False, name=None): fdest_wrapped.write(buf) -def copy_fobj_to_file(fsrc, dest): +def copy_fobj_to_file(fsrc, dest, chunk_size=None): """Copy contents of open file object to destination path.""" with open(dest, "wb+") as fdest: - shutil.copyfileobj(fsrc, fdest) - - -def copy_fobj_to_fobj(fsrc, fdest, chunk_size=-1, no_progress_bar=False): - from dvc.progress import Tqdm - - with Tqdm(disable=no_progress_bar, bytes=True) as pbar: - while True: - data = fsrc.read(chunk_size) - if not data: - break - fdest.write(data) - pbar.update(len(data)) + shutil.copyfileobj(fsrc, fdest, length=chunk_size) def walk_files(directory): diff --git a/tests/unit/utils/test_fs.py b/tests/unit/utils/test_fs.py index ff6205d526..2db42cc093 100644 --- a/tests/unit/utils/test_fs.py +++ b/tests/unit/utils/test_fs.py @@ -14,7 +14,6 @@ BasePathNotInCheckedPathException, contains_symlink_up_to, copy_fobj_to_file, - copy_fobj_to_fobj, copyfile, get_inode, get_mtime_and_size, @@ -272,15 +271,5 @@ def test_copy_fobj_to_file(tmp_dir): assert filecmp.cmp(src, dest) -def test_copy_fobj_to_fobj(tmp_dir): - tmp_dir.gen({"foo": "foo"}) - src = tmp_dir / "foo" - dest = tmp_dir / "bar" - - with open(src, "rb") as fsrc, open(dest, "wb") as fdest: - copy_fobj_to_fobj(fsrc, fdest) - assert filecmp.cmp(src, dest) - - def test_walk_files(tmp_dir): assert list(walk_files(".")) == list(walk_files(tmp_dir)) From c7eee368d58f6b67516e6482525865d776c99884 Mon Sep 17 00:00:00 2001 From: Batuhan Taskaya Date: Mon, 11 Jan 2021 13:51:09 +0300 Subject: [PATCH 27/36] Rebase... 
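
The tree/dvc.py hunk below widens DvcTree.open() with **kwargs because
BaseTree.open() now accepts extra keyword arguments, so overrides must
tolerate (and may ignore) them to stay call-compatible. A minimal sketch
of the pattern against a simplified base class (illustrative only, not
part of this diff):

    class BaseTree:
        def open(self, path_info, mode="r", encoding=None, **kwargs):
            raise NotImplementedError("overridden per scheme")

    class DvcTree(BaseTree):
        # Accept **kwargs so callers can pass base-class-only options
        # without breaking this override; they are intentionally ignored.
        def open(self, path, mode="r", encoding="utf-8", remote=None, **kwargs):
            ...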
--- dvc/cache/base.py | 1 - dvc/tree/dvc.py | 4 ++-- tests/func/test_s3.py | 1 + 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dvc/cache/base.py b/dvc/cache/base.py index 3cbf543b5e..a4f27e29a4 100644 --- a/dvc/cache/base.py +++ b/dvc/cache/base.py @@ -3,7 +3,6 @@ import json import logging import os -import tempfile from concurrent import futures from concurrent.futures import ThreadPoolExecutor from copy import copy diff --git a/dvc/tree/dvc.py b/dvc/tree/dvc.py index e2e628acd5..14a094dbc5 100644 --- a/dvc/tree/dvc.py +++ b/dvc/tree/dvc.py @@ -65,8 +65,8 @@ def _get_granular_hash( return file_hash raise FileNotFoundError - def open( - self, path: PathInfo, mode="r", encoding="utf-8", remote=None + def open( # type: ignore + self, path: PathInfo, mode="r", encoding="utf-8", remote=None, **kwargs ): # pylint: disable=arguments-differ try: outs = self._find_outs(path, strict=False) diff --git a/tests/func/test_s3.py b/tests/func/test_s3.py index 67b431d85f..3c17eb5a07 100644 --- a/tests/func/test_s3.py +++ b/tests/func/test_s3.py @@ -122,6 +122,7 @@ def test_s3_isdir(tmp_dir, dvc, s3): assert not tree.isdir(s3 / "data" / "foo") assert tree.isdir(s3 / "data") + @mock_s3 def test_multipart_upload(dvc): from_info, to_info = _get_src_dst() From ef6fd4bb5e771a43fc32b288b4c25dfc41800e52 Mon Sep 17 00:00:00 2001 From: Batuhan Taskaya Date: Mon, 11 Jan 2021 14:13:36 +0300 Subject: [PATCH 28/36] Better tests --- tests/func/test_s3.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/tests/func/test_s3.py b/tests/func/test_s3.py index 3c17eb5a07..fd612d236e 100644 --- a/tests/func/test_s3.py +++ b/tests/func/test_s3.py @@ -1,4 +1,3 @@ -import io from functools import wraps import boto3 @@ -124,16 +123,12 @@ def test_s3_isdir(tmp_dir, dvc, s3): @mock_s3 -def test_multipart_upload(dvc): - from_info, to_info = _get_src_dst() - s3 = boto3.client("s3") - s3.create_bucket(Bucket=from_info.bucket) - - tree = S3Tree(dvc, {"url": str(to_info)}) +def test_s3_upload_fobj(tmp_dir, dvc, s3): + s3.gen({"data": {"foo": "foo"}}) + tree = S3Tree(dvc, s3.config) - stream = io.BytesIO(b"data") - tree.upload_fobj(stream, to_info, 1) + to_info = s3 / "data" / "bar" + with tree.open(s3 / "data" / "foo", "wb") as stream: + tree.upload_fobj(stream, to_info, 1) - assert tree.exists(to_info) - with tree.open(to_info) as stream: - assert stream.read() == "data" + assert to_info.read_text() == "foo" From 313ef1a10eadf9c378f16c2ed9fc6440cf7a1333 Mon Sep 17 00:00:00 2001 From: Batuhan Taskaya Date: Mon, 11 Jan 2021 15:25:37 +0300 Subject: [PATCH 29/36] Inline transfer operation to .add() / .transfer() --- dvc/repo/__init__.py | 1 - dvc/repo/_transfer.py | 73 ------------------------------------------ dvc/repo/add.py | 71 ++++++++++++++++++++++++++++++---------- dvc/repo/imp_url.py | 17 +++------- tests/func/test_add.py | 2 +- 5 files changed, 59 insertions(+), 105 deletions(-) delete mode 100644 dvc/repo/_transfer.py diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index 5bb557d412..4b64fd0cda 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -54,7 +54,6 @@ def wrapper(repo, *args, **kwargs): class Repo: DVC_DIR = ".dvc" - from dvc.repo._transfer import _transfer from dvc.repo.add import add from dvc.repo.brancher import brancher from dvc.repo.checkout import checkout diff --git a/dvc/repo/_transfer.py b/dvc/repo/_transfer.py deleted file mode 100644 index e378ab3a16..0000000000 --- a/dvc/repo/_transfer.py +++ /dev/null @@ -1,73 +0,0 @@ -import os - -from 
dvc.repo.scm_context import scm_context -from dvc.utils import relpath, resolve_output, resolve_paths -from dvc.utils.fs import path_isin - -from ..exceptions import OutputDuplicationError -from . import locked - - -@locked -@scm_context -def _transfer( - self, - url, - command, - out=None, - fname=None, - erepo=None, - frozen=True, - remote=None, - desc=None, - jobs=None, -): - from dvc.dvcfile import Dvcfile - from dvc.stage import Stage, create_stage, restore_meta - - out = resolve_output(url, out) - path, wdir, out = resolve_paths(self, out) - - # NOTE: when user is transfering something from within their own repository - if ( - erepo is None - and os.path.exists(url) - and path_isin(os.path.abspath(url), self.root_dir) - ): - url = relpath(url, wdir) - - deps = [] - if command == "import-url": - deps.append(url) - - stage = create_stage( - Stage, - self, - fname or path, - wdir=wdir, - deps=deps, - outs=[out], - erepo=erepo, - ) - restore_meta(stage) - - if stage.can_be_skipped: - return None - if desc: - stage.outs[0].desc = desc - - dvcfile = Dvcfile(self, stage.path) - dvcfile.remove() - - try: - self.check_modified_graph([stage]) - except OutputDuplicationError as exc: - raise OutputDuplicationError(exc.output, set(exc.stages) - {stage}) - - stage.frozen = frozen - stage.outs[0].hash_info = self.cloud.transfer( - url, jobs=jobs, remote=remote, command=command - ) - - dvcfile.dump(stage) - return stage diff --git a/dvc/repo/add.py b/dvc/repo/add.py index 802fd46404..77594f4b08 100644 --- a/dvc/repo/add.py +++ b/dvc/repo/add.py @@ -12,7 +12,7 @@ ) from ..progress import Tqdm from ..repo.scm_context import scm_context -from ..utils import LARGE_DIR_SIZE, glob_targets, resolve_paths +from ..utils import LARGE_DIR_SIZE, glob_targets, resolve_output, resolve_paths from . 
import locked if TYPE_CHECKING: @@ -31,8 +31,6 @@ def add( no_commit=False, fname=None, to_remote=False, - out=None, - remote=None, **kwargs, ): from dvc.utils.collections import ensure_list @@ -43,20 +41,19 @@ def add( targets = ensure_list(targets) if to_remote: + invalid_opt = None if len(targets) != 1: + invalid_opt = "multiple targets" + elif no_commit: + invalid_opt = "no commit option" + elif recursive: + invalid_opt = "recursive option" + + if invalid_opt is not None: raise ValueError( - "Can't use straigh_to_remote option with multiple targets" + f"to-remote option can't be used with {invalid_opt}" ) - return repo._transfer( # pylint: disable=protected-access - targets[0], - "add", - out=out, - fname=kwargs.get("fname"), - desc=kwargs.get("desc"), - remote=remote, - ) - link_failures = [] stages_list = [] num_targets = len(targets) @@ -83,7 +80,12 @@ def add( ) stages = _create_stages( - repo, sub_targets, fname, pbar=pbar, **kwargs, + repo, + sub_targets, + fname, + pbar=pbar, + to_remote=to_remote, + **kwargs, ) try: @@ -108,7 +110,15 @@ def add( ) link_failures.extend( - _process_stages(repo, stages, no_commit, pbar) + _process_stages( + repo, + sub_targets, + stages, + no_commit, + pbar, + to_remote, + **kwargs, + ) ) stages_list += stages @@ -129,12 +139,29 @@ def add( return stages_list -def _process_stages(repo, stages, no_commit, pbar): +def _process_stages( + repo, sub_targets, stages, no_commit, pbar, to_remote, **kwargs +): link_failures = [] from dvc.dvcfile import Dvcfile from ..output.base import OutputDoesNotExistError + if to_remote: + # Already verified in the add() + assert len(stages) == 1 + assert len(sub_targets) == 1 + + [stage] = stages + stage.outs[0].hash_info = repo.cloud.transfer( + sub_targets[0], + jobs=kwargs.get("jobs"), + remote=kwargs.get("remote"), + command="add", + ) + Dvcfile(repo, stage.path).dump(stage) + return link_failures + with Tqdm( total=len(stages), desc="Processing", @@ -181,7 +208,15 @@ def _find_all_targets(repo, target, recursive): def _create_stages( - repo, targets, fname, pbar=None, external=False, glob=False, desc=None, + repo, + targets, + fname, + to_remote=False, + pbar=None, + external=False, + glob=False, + desc=None, + **kwargs, ): from dvc.dvcfile import Dvcfile from dvc.stage import Stage, create_stage, restore_meta @@ -195,6 +230,8 @@ def _create_stages( disable=len(expanded_targets) < LARGE_DIR_SIZE, unit="file", ): + if to_remote: + out = resolve_output(out, kwargs.get("out")) path, wdir, out = resolve_paths(repo, out) stage = create_stage( Stage, diff --git a/dvc/repo/imp_url.py b/dvc/repo/imp_url.py index e5caf958e6..6ab35dc26d 100644 --- a/dvc/repo/imp_url.py +++ b/dvc/repo/imp_url.py @@ -26,19 +26,6 @@ def imp_url( from dvc.dvcfile import Dvcfile from dvc.stage import Stage, create_stage, restore_meta - if to_remote: - return self._transfer( # pylint: disable=protected-access - url, - "import-url", - out=out, - fname=fname, - erepo=erepo, - frozen=frozen, - remote=remote, - desc=desc, - jobs=jobs, - ) - out = resolve_output(url, out) path, wdir, out = resolve_paths(self, out) @@ -76,6 +63,10 @@ def imp_url( if no_exec: stage.ignore_outs() + elif to_remote: + stage.outs[0].hash_info = self.cloud.transfer( + url, jobs=jobs, remote=remote, command="import-url" + ) else: stage.run(jobs=jobs) diff --git a/tests/func/test_add.py b/tests/func/test_add.py index 9ad4be7112..f4a6cc08d8 100644 --- a/tests/func/test_add.py +++ b/tests/func/test_add.py @@ -997,7 +997,7 @@ def test_add_to_remote(tmp_dir, dvc, local_cloud, 
local_remote): local_cloud.gen("foo", "foo") url = "remote://upstream/foo" - stage = dvc.add(url, to_remote=True) + [stage] = dvc.add(url, to_remote=True) assert not (tmp_dir / "foo").exists() assert (tmp_dir / "foo.dvc").exists() From 41d0f1ad784e7b64c296f43e3338decedb9a9e86 Mon Sep 17 00:00:00 2001 From: Batuhan Taskaya Date: Mon, 11 Jan 2021 15:55:59 +0300 Subject: [PATCH 30/36] Open in the 'rb' mode for the tests --- tests/func/test_s3.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/func/test_s3.py b/tests/func/test_s3.py index fd612d236e..8f6f4c680c 100644 --- a/tests/func/test_s3.py +++ b/tests/func/test_s3.py @@ -113,7 +113,6 @@ def test_copy_multipart_preserve_etag(): S3Tree._copy(s3, from_info, to_info, {}) -@mock_s3 def test_s3_isdir(tmp_dir, dvc, s3): s3.gen({"data": {"foo": "foo"}}) tree = S3Tree(dvc, s3.config) @@ -122,13 +121,12 @@ def test_s3_isdir(tmp_dir, dvc, s3): assert tree.isdir(s3 / "data") -@mock_s3 def test_s3_upload_fobj(tmp_dir, dvc, s3): s3.gen({"data": {"foo": "foo"}}) tree = S3Tree(dvc, s3.config) to_info = s3 / "data" / "bar" - with tree.open(s3 / "data" / "foo", "wb") as stream: + with tree.open(s3 / "data" / "foo", "rb") as stream: tree.upload_fobj(stream, to_info, 1) assert to_info.read_text() == "foo" From 4fb69b38f54edb85d7dd2828d3c3ef92950c0dfe Mon Sep 17 00:00:00 2001 From: Batuhan Taskaya Date: Mon, 11 Jan 2021 18:15:25 +0300 Subject: [PATCH 31/36] More tests, better invalid option handling at repo level --- dvc/repo/add.py | 6 +++--- dvc/repo/imp_url.py | 3 +++ tests/func/test_add.py | 13 +++++++++++++ tests/func/test_import_url.py | 5 +++++ 4 files changed, 24 insertions(+), 3 deletions(-) diff --git a/dvc/repo/add.py b/dvc/repo/add.py index 77594f4b08..a0e1122c90 100644 --- a/dvc/repo/add.py +++ b/dvc/repo/add.py @@ -45,13 +45,13 @@ def add( if len(targets) != 1: invalid_opt = "multiple targets" elif no_commit: - invalid_opt = "no commit option" + invalid_opt = "no_commit" elif recursive: - invalid_opt = "recursive option" + invalid_opt = "recursive" if invalid_opt is not None: raise ValueError( - f"to-remote option can't be used with {invalid_opt}" + f"to-remote option can't be combined with {invalid_opt}" ) link_failures = [] diff --git a/dvc/repo/imp_url.py b/dvc/repo/imp_url.py index 6ab35dc26d..bb187b5fd7 100644 --- a/dvc/repo/imp_url.py +++ b/dvc/repo/imp_url.py @@ -29,6 +29,9 @@ def imp_url( out = resolve_output(url, out) path, wdir, out = resolve_paths(self, out) + if to_remote and no_exec: + raise ValueError("to-remote option can't be combined with no_exec") + # NOTE: when user is importing something from within their own repository if ( erepo is None diff --git a/tests/func/test_add.py b/tests/func/test_add.py index f4a6cc08d8..4c3ef5ab98 100644 --- a/tests/func/test_add.py +++ b/tests/func/test_add.py @@ -1007,3 +1007,16 @@ def test_add_to_remote(tmp_dir, dvc, local_cloud, local_remote): hash_info = stage.outs[0].hash_info assert local_remote.hash_to_path_info(hash_info.value).read_text() == "foo" + + +@pytest.mark.parametrize( + "invalid_opt, kwargs", + [ + ("multiple targets", {"targets": ["foo", "bar", "baz"]}), + ("no_commit", {"targets": ["foo"], "no_commit": True}), + ("recursive", {"targets": ["foo"], "recursive": True},), + ], +) +def test_add_to_remote_invalid_combinations(dvc, invalid_opt, kwargs): + with pytest.raises(ValueError, match=invalid_opt): + dvc.add(to_remote=True, **kwargs) diff --git a/tests/func/test_import_url.py b/tests/func/test_import_url.py index 84888448a0..d7fb5c3caa 100644 --- 
a/tests/func/test_import_url.py +++ b/tests/func/test_import_url.py @@ -337,3 +337,8 @@ def test_import_url_to_remote_directory(tmp_dir, dvc, workspace, local_remote): local_remote.hash_to_path_info(file_part["md5"]).read_text() == file_part["relpath"] ) + + +def test_import_url_to_remote_invalid_combinations(dvc): + with pytest.raises(ValueError, match="no_exec"): + dvc.imp_url("s3://bucket/foo", no_exec=True, to_remote=True) From 4f05fc04a7105f94e1a8d0976b96a551b0ef0d93 Mon Sep 17 00:00:00 2001 From: Batuhan Taskaya Date: Tue, 12 Jan 2021 09:30:05 +0300 Subject: [PATCH 32/36] .read() certain amount of bytes to validate the content --- tests/unit/utils/test_stream.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/utils/test_stream.py b/tests/unit/utils/test_stream.py index 94ce01b1ff..bde29cdb2f 100644 --- a/tests/unit/utils/test_stream.py +++ b/tests/unit/utils/test_stream.py @@ -8,7 +8,7 @@ def test_hashed_stream_reader(tmp_dir): foo = tmp_dir / "foo" with open(foo, "rb") as fobj: stream_reader = HashedStreamReader(fobj) - stream_reader.read() + assert stream_reader.read(3) == b"foo" hex_digest, _ = file_md5(foo) assert stream_reader.is_text_file From 9bd778ceeb76765f18d00abe8f26bc6df5d1603a Mon Sep 17 00:00:00 2001 From: Batuhan Taskaya Date: Tue, 12 Jan 2021 15:27:40 +0300 Subject: [PATCH 33/36] Better invalid argument handling --- dvc/cache/base.py | 11 +++++------ dvc/command/add.py | 19 ------------------- dvc/command/imp_url.py | 8 -------- dvc/repo/add.py | 24 +++++++++++++++--------- dvc/repo/imp_url.py | 11 +++++++++-- tests/func/test_add.py | 7 ++++--- tests/func/test_import_url.py | 3 ++- tests/unit/command/test_add.py | 4 ++-- tests/unit/command/test_imp_url.py | 2 +- 9 files changed, 38 insertions(+), 51 deletions(-) diff --git a/dvc/cache/base.py b/dvc/cache/base.py index a4f27e29a4..dc8c946249 100644 --- a/dvc/cache/base.py +++ b/dvc/cache/base.py @@ -292,11 +292,10 @@ def _transfer_file(self, from_tree, from_info): return hash_info def _transfer_directory_contents(self, from_tree, from_info, jobs, pbar): - from_infos = from_tree.walk_files(from_info) - rel_path_infos = {} + from_infos = from_tree.walk_files(from_info) - def create_futures(executor, amount): + def create_tasks(executor, amount): for entry_info in itertools.islice(from_infos, amount): pbar.total += 1 task = executor.submit( @@ -307,13 +306,13 @@ def create_futures(executor, amount): pbar.total = 0 with ThreadPoolExecutor(max_workers=jobs) as executor: - tasks = set(create_futures(executor, jobs * 5)) + tasks = set(create_tasks(executor, jobs * 5)) while tasks: done, tasks = futures.wait( tasks, return_when=futures.FIRST_COMPLETED ) - tasks.update(create_futures(executor, len(done))) + tasks.update(create_tasks(executor, len(done))) for task in done: yield rel_path_infos.pop(task), task.result() @@ -322,7 +321,7 @@ def _transfer_directory( ): dir_info = DirInfo() - with Tqdm(total=1, unit="file", disable=no_progress_bar) as pbar: + with Tqdm(total=1, unit="Files", disable=no_progress_bar) as pbar: for entry_info, entry_hash in self._transfer_directory_contents( from_tree, from_info, jobs, pbar ): diff --git a/dvc/command/add.py b/dvc/command/add.py index 2da6d03ff8..a71fea7d70 100644 --- a/dvc/command/add.py +++ b/dvc/command/add.py @@ -18,24 +18,6 @@ def run(self): if len(self.args.targets) > 1 and self.args.file: raise RecursiveAddingWhileUsingFilename() - if len(self.args.targets) > 1 and self.args.to_remote: - logger.error("--to-remote can't used with multiple targets") - 
return 1 - - if not self.args.to_remote: - if self.args.remote: - command = "--remote" - elif self.args.out: - command = "--out" - else: - command = None - - if command is not None: - logger.error( - f"--{command} can't be used without" " --to-remote" - ) - return 1 - self.repo.add( self.args.targets, recursive=self.args.recursive, @@ -99,7 +81,6 @@ def add_parser(subparsers, parent_parser): "-o", "--out", help="Destination path to put files to.", - nargs="?", metavar="", ) parser.add_argument( diff --git a/dvc/command/imp_url.py b/dvc/command/imp_url.py index 8cb8c612bd..b2404ebb0a 100644 --- a/dvc/command/imp_url.py +++ b/dvc/command/imp_url.py @@ -10,14 +10,6 @@ class CmdImportUrl(CmdBase): def run(self): - if self.args.no_exec and self.args.to_remote: - logger.error("--no-exec can't be combined with --to-remote") - return 1 - - if self.args.remote and not self.args.to_remote: - logger.error("--remote can't be used alone without --to-remote") - return 1 - try: self.repo.imp_url( self.args.url, diff --git a/dvc/repo/add.py b/dvc/repo/add.py index a0e1122c90..a343df7d4e 100644 --- a/dvc/repo/add.py +++ b/dvc/repo/add.py @@ -6,6 +6,7 @@ from ..exceptions import ( CacheLinkError, + InvalidArgumentError, OutputDuplicationError, OverlappingOutputPathsError, RecursiveAddingWhileUsingFilename, @@ -24,7 +25,7 @@ @locked @scm_context -def add( +def add( # noqa: C901 repo, targets: "TargetType", recursive=False, @@ -40,19 +41,24 @@ def add( targets = ensure_list(targets) + invalid_opt = None if to_remote: - invalid_opt = None + message = "{option} can't be used with --to-remote" if len(targets) != 1: invalid_opt = "multiple targets" elif no_commit: - invalid_opt = "no_commit" + invalid_opt = "--no-commit option" elif recursive: - invalid_opt = "recursive" - - if invalid_opt is not None: - raise ValueError( - f"to-remote option can't be combined with {invalid_opt}" - ) + invalid_opt = "--recursive option" + else: + message = "{option} can't be used without --to-remote" + if kwargs.get("out"): + invalid_opt = "--out" + elif kwargs.get("remote"): + invalid_opt = "--remote" + + if invalid_opt is not None: + raise InvalidArgumentError(message.format(option=invalid_opt)) link_failures = [] stages_list = [] diff --git a/dvc/repo/imp_url.py b/dvc/repo/imp_url.py index bb187b5fd7..dc607efec4 100644 --- a/dvc/repo/imp_url.py +++ b/dvc/repo/imp_url.py @@ -4,7 +4,7 @@ from dvc.utils import relpath, resolve_output, resolve_paths from dvc.utils.fs import path_isin -from ..exceptions import OutputDuplicationError +from ..exceptions import InvalidArgumentError, OutputDuplicationError from . 
import locked @@ -30,7 +30,14 @@ def imp_url( path, wdir, out = resolve_paths(self, out) if to_remote and no_exec: - raise ValueError("to-remote option can't be combined with no_exec") + raise InvalidArgumentError( + "--no-exec can't be combined with --to-remote" + ) + + if not to_remote and remote: + raise InvalidArgumentError( + "--remote can't be used without --to-remote" + ) # NOTE: when user is importing something from within their own repository if ( diff --git a/tests/func/test_add.py b/tests/func/test_add.py index 4c3ef5ab98..77dcd9e3be 100644 --- a/tests/func/test_add.py +++ b/tests/func/test_add.py @@ -16,6 +16,7 @@ from dvc.dvcfile import DVC_FILE_SUFFIX from dvc.exceptions import ( DvcException, + InvalidArgumentError, OutputDuplicationError, OverlappingOutputPathsError, RecursiveAddingWhileUsingFilename, @@ -1013,10 +1014,10 @@ def test_add_to_remote(tmp_dir, dvc, local_cloud, local_remote): "invalid_opt, kwargs", [ ("multiple targets", {"targets": ["foo", "bar", "baz"]}), - ("no_commit", {"targets": ["foo"], "no_commit": True}), - ("recursive", {"targets": ["foo"], "recursive": True},), + ("--no-commit", {"targets": ["foo"], "no_commit": True}), + ("--recursive", {"targets": ["foo"], "recursive": True},), ], ) def test_add_to_remote_invalid_combinations(dvc, invalid_opt, kwargs): - with pytest.raises(ValueError, match=invalid_opt): + with pytest.raises(InvalidArgumentError, match=invalid_opt): dvc.add(to_remote=True, **kwargs) diff --git a/tests/func/test_import_url.py b/tests/func/test_import_url.py index d7fb5c3caa..925675c6de 100644 --- a/tests/func/test_import_url.py +++ b/tests/func/test_import_url.py @@ -7,6 +7,7 @@ from dvc.cache import Cache from dvc.dependency.base import DependencyDoesNotExistError +from dvc.exceptions import InvalidArgumentError from dvc.main import main from dvc.stage import Stage from dvc.utils.fs import makedirs @@ -340,5 +341,5 @@ def test_import_url_to_remote_directory(tmp_dir, dvc, workspace, local_remote): def test_import_url_to_remote_invalid_combinations(dvc): - with pytest.raises(ValueError, match="no_exec"): + with pytest.raises(InvalidArgumentError, match="--no-exec"): dvc.imp_url("s3://bucket/foo", no_exec=True, to_remote=True) diff --git a/tests/unit/command/test_add.py b/tests/unit/command/test_add.py index 4278c259b6..10e6e58dae 100644 --- a/tests/unit/command/test_add.py +++ b/tests/unit/command/test_add.py @@ -82,7 +82,7 @@ def test_add_to_remote_invalid_combinations(mocker, caplog): cmd = cli_args.func(cli_args) with caplog.at_level(logging.ERROR, logger="dvc"): assert cmd.run() == 1 - expected_msg = "--to-remote can't used with multiple targets" + expected_msg = "multiple targets can't be used with --to-remote" assert expected_msg in caplog.text for option in "--remote", "--out": @@ -91,5 +91,5 @@ def test_add_to_remote_invalid_combinations(mocker, caplog): cmd = cli_args.func(cli_args) with caplog.at_level(logging.ERROR, logger="dvc"): assert cmd.run() == 1 - expected_msg = f"--{option} can't be used without --to-remote" + expected_msg = f"{option} can't be used without --to-remote" assert expected_msg in caplog.text diff --git a/tests/unit/command/test_imp_url.py b/tests/unit/command/test_imp_url.py index 0288bd6d8e..a816e4ea11 100644 --- a/tests/unit/command/test_imp_url.py +++ b/tests/unit/command/test_imp_url.py @@ -131,5 +131,5 @@ def test_import_url_to_remote_invalid_combination(mocker, caplog): cmd = cli_args.func(cli_args) with caplog.at_level(logging.ERROR, logger="dvc"): assert cmd.run() == 1 - expected_msg = 
"--remote can't be used alone without --to-remote" + expected_msg = "--remote can't be used without --to-remote" assert expected_msg in caplog.text From 6bc2b2ec0fec26d4054773e11d13723fdbc73b7b Mon Sep 17 00:00:00 2001 From: Batuhan Taskaya Date: Thu, 14 Jan 2021 10:08:16 +0300 Subject: [PATCH 34/36] Better progress bars by using .getsize() API --- dvc/cache/base.py | 7 ++++++- dvc/tree/local.py | 4 ++-- dvc/tree/s3.py | 11 ++++++++--- dvc/tree/ssh/__init__.py | 4 ++-- 4 files changed, 18 insertions(+), 8 deletions(-) diff --git a/dvc/cache/base.py b/dvc/cache/base.py index dc8c946249..c876b4407d 100644 --- a/dvc/cache/base.py +++ b/dvc/cache/base.py @@ -277,7 +277,12 @@ def _transfer_file_as_chunked(self, from_tree, from_info): # Since we don't know the hash beforehand, we'll # upload it to a temporary location and then move # it. - self.tree.upload_fobj(stream_reader, tmp_info) + self.tree.upload_fobj( + stream_reader, + tmp_info, + total=from_tree.getsize(from_info), + desc=from_info.name, + ) hash_info = stream_reader.hash_info self.tree.move(tmp_info, self.tree.hash_to_path_info(hash_info.value)) diff --git a/dvc/tree/local.py b/dvc/tree/local.py index b75255d480..d68b61cd9f 100644 --- a/dvc/tree/local.py +++ b/dvc/tree/local.py @@ -282,10 +282,10 @@ def _upload( self.chmod(tmp_file, file_mode) os.replace(tmp_file, to_info) - def upload_fobj(self, fobj, to_info, no_progress_bar=False): + def upload_fobj(self, fobj, to_info, no_progress_bar=False, **pbar_args): from dvc.progress import Tqdm - with Tqdm(bytes=True, disable=no_progress_bar) as pbar: + with Tqdm(bytes=True, disable=no_progress_bar, **pbar_args) as pbar: with pbar.wrapattr(fobj, "read") as fobj: self.copy_fobj(fobj, to_info, chunk_size=self.CHUNK_SIZE) diff --git a/dvc/tree/s3.py b/dvc/tree/s3.py index 1f634eb5e7..1aee7de6de 100644 --- a/dvc/tree/s3.py +++ b/dvc/tree/s3.py @@ -80,7 +80,10 @@ def s3(self): session = boto3.session.Session(**session_opts) return session.resource( - "s3", endpoint_url=self.endpoint_url, use_ssl=self.use_ssl + "s3", + endpoint_url=self.endpoint_url, + use_ssl=self.use_ssl, + config=boto3.session.Config(signature_version="s3v4"), ) @contextmanager @@ -362,7 +365,7 @@ def _upload( from_file, Callback=pbar.update, ExtraArgs=self.extra_args, ) - def upload_fobj(self, fobj, to_info, no_progress_bar=False): + def upload_fobj(self, fobj, to_info, no_progress_bar=False, **pbar_args): from boto3.s3.transfer import TransferConfig config = TransferConfig( @@ -372,7 +375,9 @@ def upload_fobj(self, fobj, to_info, no_progress_bar=False): use_threads=False, ) with self._get_s3() as s3: - with Tqdm(disable=no_progress_bar, bytes=True) as pbar: + with Tqdm( + disable=no_progress_bar, bytes=True, **pbar_args + ) as pbar: s3.meta.client.upload_fileobj( fobj, to_info.bucket, diff --git a/dvc/tree/ssh/__init__.py b/dvc/tree/ssh/__init__.py index d650c4b62f..0ce4088ae4 100644 --- a/dvc/tree/ssh/__init__.py +++ b/dvc/tree/ssh/__init__.py @@ -261,10 +261,10 @@ def _download(self, from_info, to_file, name=None, no_progress_bar=False): no_progress_bar=no_progress_bar, ) - def upload_fobj(self, fobj, to_info, no_progress_bar=False): + def upload_fobj(self, fobj, to_info, no_progress_bar=False, **pbar_args): from dvc.progress import Tqdm - with Tqdm(bytes=True, disable=no_progress_bar) as pbar: + with Tqdm(bytes=True, disable=no_progress_bar, **pbar_args) as pbar: with pbar.wrapattr(fobj, "read") as fobj: with self.open(to_info, mode="wb") as fdest: shutil.copyfileobj(fobj, fdest, length=self.CHUNK_SIZE) From 
d9e5385ed94e5202e691553fccfe3ab601b3c142 Mon Sep 17 00:00:00 2001
From: Batuhan Taskaya 
Date: Thu, 14 Jan 2021 11:27:39 +0300
Subject: [PATCH 35/36] For fallback upload, use the from_info.name instead of
 temporary name

---
 dvc/cache/base.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/dvc/cache/base.py b/dvc/cache/base.py
index c876b4407d..759d66a48d 100644
--- a/dvc/cache/base.py
+++ b/dvc/cache/base.py
@@ -261,7 +261,9 @@ def _transfer_file_as_whole(self, from_tree, from_info):
         hash_info = local_tree.get_file_hash(local_info)
 
         self.tree.upload(
-            local_info, self.tree.hash_to_path_info(hash_info.value),
+            local_info,
+            self.tree.hash_to_path_info(hash_info.value),
+            name=from_info.name,
         )
         return hash_info
 
From 06e703538ac44e8f9741c637e12f451a0860f258 Mon Sep 17 00:00:00 2001
From: Batuhan Taskaya 
Date: Fri, 15 Jan 2021 14:16:49 +0300
Subject: [PATCH 36/36] introduce --jobs for add/import-url

---
 dvc/command/add.py                 | 14 +++++++++++++-
 dvc/command/imp_url.py             | 12 ++++++++++++
 dvc/repo/add.py                    |  2 ++
 tests/unit/command/test_add.py     | 10 ++++++++--
 tests/unit/command/test_imp_url.py | 15 ++++++++++++++-
 5 files changed, 49 insertions(+), 4 deletions(-)

diff --git a/dvc/command/add.py b/dvc/command/add.py
index a71fea7d70..af359b6af7 100644
--- a/dvc/command/add.py
+++ b/dvc/command/add.py
@@ -29,6 +29,7 @@ def run(self):
                 out=self.args.out,
                 remote=self.args.remote,
                 to_remote=self.args.to_remote,
+                jobs=self.args.jobs,
             )
 
         except DvcException:
@@ -81,7 +82,7 @@ def add_parser(subparsers, parent_parser):
         "-o",
         "--out",
         help="Destination path to put files to.",
-        metavar="",
+        metavar="",
     )
     parser.add_argument(
         "--to-remote",
@@ -95,6 +96,17 @@ def add_parser(subparsers, parent_parser):
         help="Remote storage to download to",
         metavar="",
     )
+    parser.add_argument(
+        "-j",
+        "--jobs",
+        type=int,
+        help=(
+            "Number of jobs to run simultaneously. "
+            "The default value is 4 * cpu_count(). "
+            "For SSH remotes, the default is 4. "
+        ),
+        metavar="",
+    )
     parser.add_argument(
         "--desc",
         type=str,
diff --git a/dvc/command/imp_url.py b/dvc/command/imp_url.py
index b2404ebb0a..b11245da6c 100644
--- a/dvc/command/imp_url.py
+++ b/dvc/command/imp_url.py
@@ -19,6 +19,7 @@ def run(self):
                 remote=self.args.remote,
                 to_remote=self.args.to_remote,
                 desc=self.args.desc,
+                jobs=self.args.jobs,
             )
         except DvcException:
             logger.exception(
@@ -82,6 +83,17 @@ def add_parser(subparsers, parent_parser):
         help="Remote storage to download to",
         metavar="",
     )
+    import_parser.add_argument(
+        "-j",
+        "--jobs",
+        type=int,
+        help=(
+            "Number of jobs to run simultaneously. "
+            "The default value is 4 * cpu_count(). "
+            "For SSH remotes, the default is 4. 
" + ), + metavar="", + ) import_parser.add_argument( "--desc", type=str, diff --git a/dvc/repo/add.py b/dvc/repo/add.py index a343df7d4e..ac93f35ad9 100644 --- a/dvc/repo/add.py +++ b/dvc/repo/add.py @@ -56,6 +56,8 @@ def add( # noqa: C901 invalid_opt = "--out" elif kwargs.get("remote"): invalid_opt = "--remote" + elif kwargs.get("jobs"): + invalid_opt = "--jobs" if invalid_opt is not None: raise InvalidArgumentError(message.format(option=invalid_opt)) diff --git a/tests/unit/command/test_add.py b/tests/unit/command/test_add.py index 10e6e58dae..1c4f5266af 100644 --- a/tests/unit/command/test_add.py +++ b/tests/unit/command/test_add.py @@ -37,6 +37,7 @@ def test_add(mocker, dvc): remote=None, to_remote=False, desc="stage description", + jobs=None, ) @@ -70,6 +71,7 @@ def test_add_to_remote(mocker): remote="remote", to_remote=True, desc=None, + jobs=None, ) @@ -85,8 +87,12 @@ def test_add_to_remote_invalid_combinations(mocker, caplog): expected_msg = "multiple targets can't be used with --to-remote" assert expected_msg in caplog.text - for option in "--remote", "--out": - cli_args = parse_args(["add", "foo", option, "bar"]) + for option, value in ( + ("--remote", "remote"), + ("--out", "bar"), + ("--jobs", "4"), + ): + cli_args = parse_args(["add", "foo", option, value]) cmd = cli_args.func(cli_args) with caplog.at_level(logging.ERROR, logger="dvc"): diff --git a/tests/unit/command/test_imp_url.py b/tests/unit/command/test_imp_url.py index a816e4ea11..ef82af8c78 100644 --- a/tests/unit/command/test_imp_url.py +++ b/tests/unit/command/test_imp_url.py @@ -7,7 +7,17 @@ def test_import_url(mocker): cli_args = parse_args( - ["import-url", "src", "out", "--file", "file", "--desc", "description"] + [ + "import-url", + "src", + "out", + "--file", + "file", + "--jobs", + "4", + "--desc", + "description", + ] ) assert cli_args.func == CmdImportUrl @@ -24,6 +34,7 @@ def test_import_url(mocker): remote=None, to_remote=False, desc="description", + jobs=4, ) @@ -70,6 +81,7 @@ def test_import_url_no_exec(mocker): remote=None, to_remote=False, desc="description", + jobs=None, ) @@ -101,6 +113,7 @@ def test_import_url_to_remote(mocker): remote="remote", to_remote=True, desc="description", + jobs=None, )