
remote: separate cloud remote and cloud cache classes #4019


Merged: 11 commits, Jun 12, 2020
8 changes: 4 additions & 4 deletions dvc/cache.py

@@ -28,13 +28,13 @@ def _make_remote_property(name):
     """
 
     def getter(self):
-        from dvc.remote import Remote
+        from dvc.remote import Cache as CloudCache
 
         remote = self.config.get(name)
        if not remote:
             return None
 
-        return Remote(self.repo, name=remote)
+        return CloudCache(self.repo, name=remote)
 
     getter.__name__ = name
     return cached_property(getter)
@@ -50,7 +50,7 @@ class Cache:
     CACHE_DIR = "cache"
 
     def __init__(self, repo):
-        from dvc.remote import Remote
+        from dvc.remote import Cache as CloudCache
 
         self.repo = repo
         self.config = config = repo.config["cache"]
@@ -62,7 +62,7 @@ def __init__(self, repo):
         else:
             settings = {**config, "url": config["dir"]}
 
-        self.local = Remote(repo, **settings)
+        self.local = CloudCache(repo, **settings)
 
     s3 = _make_remote_property("s3")
     gs = _make_remote_property("gs")
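For context, a rough usage sketch of how these cached properties surface on a repo object; the `cache.s3 = myremote` config entry is an illustrative value, not something defined in this PR:

```python
from dvc.repo import Repo

repo = Repo(".")

# repo.cache.local now wraps the local cache directory in a cache class rather
# than a plain Remote. The cloud cache properties (s3, gs, ...) are built lazily
# by _make_remote_property() from config entries such as `cache.s3 = myremote`
# and return a cloud cache instance, or None when the option is unset.
local_cache = repo.cache.local
s3_cache = repo.cache.s3
```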
4 changes: 1 addition & 3 deletions dvc/dependency/repo.py

@@ -64,9 +64,7 @@ def _get_checksum(self, locked=True):
 
         # We are polluting our repo cache with some dir listing here
         if tree.isdir(path):
-            return self.repo.cache.local.get_dir_checksum(
-                path, tree=tree
-            )
+            return self.repo.cache.local.get_checksum(path, tree)
         return tree.get_file_checksum(path)
 
     def status(self):
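A minimal sketch of the idea behind this one-line change, assuming a simplified `get_checksum` (not DVC's actual implementation): the cache decides between file and directory checksums itself, so the dependency code above no longer needs to call `get_dir_checksum` directly.

```python
# Simplified sketch: a single get_checksum entry point that dispatches on the
# path type, so callers do not have to branch between file and dir checksums.
def get_checksum(self, path_info, tree):
    if tree.isdir(path_info):
        return self.get_dir_checksum(path_info, tree=tree)
    return tree.get_file_checksum(path_info)
```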
3 changes: 2 additions & 1 deletion dvc/external_repo.py

@@ -126,8 +126,9 @@ def download_update(result):
                 raise PathMissingError(path, self.url)
             save_info = self.local_cache.save(
                 path,
-                self.repo_tree,
+                None,
+                tree=self.repo_tree,
                 save_link=False,
                 download_callback=download_update,
             )
             save_infos.append(save_info)
2 changes: 1 addition & 1 deletion dvc/output/base.py

@@ -267,7 +267,7 @@ def save(self):
 
     def commit(self):
         if self.use_cache:
-            self.cache.save(self.path_info, self.info)
+            self.cache.save(self.path_info, self.cache.tree, self.info)
 
     def dumpd(self):
         ret = copy(self.info)
42 changes: 30 additions & 12 deletions dvc/remote/__init__.py

@@ -1,16 +1,25 @@
 import posixpath
 from urllib.parse import urlparse
 
-from dvc.remote.azure import AzureRemote
+from dvc.remote.azure import AzureCache, AzureRemote
 from dvc.remote.gdrive import GDriveRemote
-from dvc.remote.gs import GSRemote
-from dvc.remote.hdfs import HDFSRemote
+from dvc.remote.gs import GSCache, GSRemote
+from dvc.remote.hdfs import HDFSCache, HDFSRemote
 from dvc.remote.http import HTTPRemote
 from dvc.remote.https import HTTPSRemote
-from dvc.remote.local import LocalRemote
+from dvc.remote.local import LocalCache, LocalRemote
 from dvc.remote.oss import OSSRemote
-from dvc.remote.s3 import S3Remote
-from dvc.remote.ssh import SSHRemote
+from dvc.remote.s3 import S3Cache, S3Remote
+from dvc.remote.ssh import SSHCache, SSHRemote
+
+CACHES = [
+    AzureCache,
+    GSCache,
+    HDFSCache,
+    S3Cache,
+    SSHCache,
+    # LocalCache is the default
+]
 
 REMOTES = [
     AzureRemote,
@@ -26,21 +35,30 @@
 ]
 
 
-def _get(remote_conf):
-    for remote in REMOTES:
+def _get(remote_conf, remotes, default):
+    for remote in remotes:
         if remote.supported(remote_conf):
             return remote
-    return LocalRemote
+    return default
 
 
-def Remote(repo, **kwargs):
+def _get_conf(repo, **kwargs):
     name = kwargs.get("name")
     if name:
         remote_conf = repo.config["remote"][name.lower()]
     else:
         remote_conf = kwargs
-    remote_conf = _resolve_remote_refs(repo.config, remote_conf)
-    return _get(remote_conf)(repo, remote_conf)
+    return _resolve_remote_refs(repo.config, remote_conf)
+
+
+def Remote(repo, **kwargs):
+    remote_conf = _get_conf(repo, **kwargs)
+    return _get(remote_conf, REMOTES, LocalRemote)(repo, remote_conf)
+
+
+def Cache(repo, **kwargs):
+    remote_conf = _get_conf(repo, **kwargs)
+    return _get(remote_conf, CACHES, LocalCache)(repo, remote_conf)
 
 
 def _resolve_remote_refs(config, remote_conf):
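For reference, a rough usage sketch of the two factory functions added above; the `repo` object and the s3 URL are illustrative, and the resolved classes assume an S3-style config:

```python
from dvc.remote import Cache, Remote
from dvc.repo import Repo

repo = Repo(".")

# Remote() picks a class from REMOTES (LocalRemote as the fallback), while
# Cache() picks from CACHES (LocalCache as the fallback); both resolve the
# config through the shared _get_conf() helper.
remote = Remote(repo, url="s3://bucket/path")  # expected: an S3Remote instance
cache = Cache(repo, url="s3://bucket/path")    # expected: an S3Cache instance
```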
Inline review comment from the PR author on the new `_get(remote_conf, remotes, default)` helper:
@efiop I think cleaning this up to use tree.supported(...) will also require moving scheme and the remaining constants into the tree classes like we discussed. And then once we have single Remote and Cache classes that take a tree as a parameter, we'll be able to get rid of these helper methods.

I think it would be best to leave this as-is for now, and then handle it in the next PR.
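To illustrate the direction described in the comment above, here is a rough, hypothetical sketch (not part of this PR) of what selection via `tree.supported(...)` with single `Remote`/`Cache` classes parameterized by a tree might look like; every name other than `AzureRemoteTree`, `BaseRemote`, and `_get_conf` is a placeholder:

```python
# Hypothetical follow-up: if scheme and the related constants move onto the
# tree classes, selection can use tree.supported(...) directly and the
# separate REMOTES/CACHES lists plus _get() go away.
TREES = [AzureRemoteTree]  # ...plus the other cloud tree classes


def _get_tree_cls(remote_conf, default):
    for tree_cls in TREES:
        if tree_cls.supported(remote_conf):
            return tree_cls
    return default


def Remote(repo, **kwargs):
    remote_conf = _get_conf(repo, **kwargs)
    tree_cls = _get_tree_cls(remote_conf, default=LocalTree)  # LocalTree: placeholder
    return BaseRemote(repo, tree=tree_cls(repo, remote_conf))  # single Remote class


def Cache(repo, **kwargs):
    remote_conf = _get_conf(repo, **kwargs)
    tree_cls = _get_tree_cls(remote_conf, default=LocalTree)
    return BaseCache(repo, tree=tree_cls(repo, remote_conf))  # BaseCache: placeholder
```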

12 changes: 8 additions & 4 deletions dvc/remote/azure.py

@@ -7,7 +7,7 @@
 
 from dvc.path_info import CloudURLInfo
 from dvc.progress import Tqdm
-from dvc.remote.base import BaseRemote, BaseRemoteTree
+from dvc.remote.base import BaseRemote, BaseRemoteTree, CacheMixin
 from dvc.scheme import Schemes
 
 logger = logging.getLogger(__name__)
@@ -108,6 +108,9 @@ def remove(self, path_info):
         logger.debug(f"Removing {path_info}")
         self.blob_service.delete_blob(path_info.bucket, path_info.path)
 
+    def get_file_checksum(self, path_info):
+        return self.get_etag(path_info)
+
     def _upload(
         self, from_file, to_info, name=None, no_progress_bar=False, **_kwargs
     ):
@@ -134,10 +137,11 @@ def _download(
 class AzureRemote(BaseRemote):
     scheme = Schemes.AZURE
     REQUIRES = {"azure-storage-blob": "azure.storage.blob"}
+    TREE_CLS = AzureRemoteTree
     PARAM_CHECKSUM = "etag"
     COPY_POLL_SECONDS = 5
     LIST_OBJECT_PAGE_SIZE = 5000
-    TREE_CLS = AzureRemoteTree
 
-    def get_file_checksum(self, path_info):
-        return self.tree.get_etag(path_info)
+
+class AzureCache(AzureRemote, CacheMixin):
+    pass
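The same thin-subclass pattern presumably applies in the other cloud modules touched by this PR but not shown in this excerpt; for example, a sketch of what the S3 counterpart imported in dvc/remote/__init__.py would look like:

```python
# Sketch (assumed, not shown in this excerpt): each cloud remote gains a small
# cache subclass that mixes in the shared cache behaviour from CacheMixin.
class S3Cache(S3Remote, CacheMixin):
    pass
```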