From e75049612302f8211c736089592511d6051f5b48 Mon Sep 17 00:00:00 2001 From: Saugat Pachhai Date: Tue, 26 Nov 2019 19:10:07 +0545 Subject: [PATCH 01/16] remote/gs: support adding directories --- dvc/remote/gs.py | 15 +++- .../test_s3.py => func/test_remote_dir.py} | 82 +++++++++++++++++-- 2 files changed, 87 insertions(+), 10 deletions(-) rename tests/{unit/remote/test_s3.py => func/test_remote_dir.py} (55%) diff --git a/dvc/remote/gs.py b/dvc/remote/gs.py index 8ba47c2923..66408abc96 100644 --- a/dvc/remote/gs.py +++ b/dvc/remote/gs.py @@ -5,6 +5,7 @@ from functools import wraps import io import os.path +import posixpath from funcy import cached_property @@ -143,9 +144,19 @@ def _list_paths(self, bucket, prefix): def list_cache_paths(self): return self._list_paths(self.path_info.bucket, self.path_info.path) + def walk_files(self, path_info): + for fname in self._list_paths(path_info.bucket, path_info.path): + yield path_info / posixpath.relpath(fname, path_info.path) + + def isdir(self, path_info): + dir_path = path_info / "" + file = next(self._list_paths(path_info.bucket, dir_path.path), "") + return file.startswith(dir_path.path) + def exists(self, path_info): - paths = set(self._list_paths(path_info.bucket, path_info.path)) - return any(path_info.path == path for path in paths) + dir_path = path_info / "" + file = next(self._list_paths(path_info.bucket, path_info.path), "") + return path_info.path == file or file.startswith(dir_path.path) def _upload(self, from_file, to_info, name=None, no_progress_bar=True): bucket = self.gs.bucket(to_info.bucket) diff --git a/tests/unit/remote/test_s3.py b/tests/func/test_remote_dir.py similarity index 55% rename from tests/unit/remote/test_s3.py rename to tests/func/test_remote_dir.py index a82d005c4d..81fa7b3629 100644 --- a/tests/unit/remote/test_s3.py +++ b/tests/func/test_remote_dir.py @@ -1,12 +1,25 @@ # -*- coding: utf-8 -*- import pytest +import uuid + from moto import mock_s3 +from dvc.remote.gs import RemoteGS from dvc.remote.s3 import RemoteS3 +from tests.func.test_data_cloud import _should_test_gcp + +test_gs = pytest.mark.skipif(not _should_test_gcp(), reason="Skipping on gs.") + + +def create_object_gs(client, bucket, key, body): + bucket = client.get_bucket(bucket) + blob = bucket.blob(key) + blob.upload_from_string(body) + @pytest.fixture -def remote(): +def remote_s3(): """Returns a RemoteS3 connected to a bucket with the following structure: bucket @@ -38,7 +51,54 @@ def remote(): yield remote -def test_isdir(remote): +@pytest.fixture +def remote_gs(): + """Returns a RemoteGS connected to a bucket with the following structure: + bucket + ├── data + │ ├── alice + │ ├── alpha + │ └── subdir + │ ├── 1 + │ ├── 2 + │ └── 3 + ├── empty_dir + ├── empty_file + └── foo + """ + prefix = str(uuid.uuid4()) + REMOTE_URL = "gs://dvc-test/" + prefix + remote = RemoteGS(None, {"url": REMOTE_URL}) + teardowns = [] + + def put_object(file, content): + create_object_gs(remote.gs, "dvc-test", prefix + "/" + file, content) + teardowns.append(lambda: remote.remove(remote.path_info / file)) + + put_object("empty_dir/", "") + put_object("empty_file", "") + put_object("foo", "foo") + put_object("data/alice", "alice") + put_object("data/alpha", "alpha") + put_object("data/subdir/1", "1") + put_object("data/subdir/2", "2") + put_object("data/subdir/3", "3") + + yield remote + + for teardown in teardowns: + teardown() + + +remote_parameterized = pytest.mark.parametrize( + "remote_name", [pytest.param("remote_gs", marks=test_gs), "remote_s3"] +) + + 
+@remote_parameterized +def test_isdir(request, remote_name): + remote = request.getfixturevalue(remote_name) + test_cases = [ (True, "data"), (True, "data/"), @@ -54,7 +114,10 @@ def test_isdir(remote): assert remote.isdir(remote.path_info / path) == expected -def test_exists(remote): +@remote_parameterized +def test_exists(request, remote_name): + remote = request.getfixturevalue(remote_name) + test_cases = [ (True, "data"), (True, "data/"), @@ -72,7 +135,10 @@ def test_exists(remote): assert remote.exists(remote.path_info / path) == expected -def test_walk_files(remote): +@remote_parameterized +def test_walk_files(request, remote_name): + remote = request.getfixturevalue(remote_name) + files = [ remote.path_info / "data/alice", remote.path_info / "data/alpha", @@ -84,16 +150,16 @@ def test_walk_files(remote): assert list(remote.walk_files(remote.path_info / "data")) == files -def test_copy_preserve_etag_across_buckets(remote): - s3 = remote.s3 +def test_copy_preserve_etag_across_buckets(remote_s3): + s3 = remote_s3.s3 s3.create_bucket(Bucket="another") another = RemoteS3(None, {"url": "s3://another", "region": "us-east-1"}) - from_info = remote.path_info / "foo" + from_info = remote_s3.path_info / "foo" to_info = another.path_info / "foo" - remote.copy(from_info, to_info) + remote_s3.copy(from_info, to_info) from_etag = RemoteS3.get_etag(s3, "bucket", "foo") to_etag = RemoteS3.get_etag(s3, "another", "foo") From 5e2150d509fd7caa1f0bc6375025b4d310b436e5 Mon Sep 17 00:00:00 2001 From: Saugat Pachhai Date: Thu, 28 Nov 2019 00:07:04 +0545 Subject: [PATCH 02/16] remote/gs: set max_results to improve performance when checking exists/isdir --- dvc/remote/gs.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/dvc/remote/gs.py b/dvc/remote/gs.py index 66408abc96..aa0958bd38 100644 --- a/dvc/remote/gs.py +++ b/dvc/remote/gs.py @@ -137,12 +137,14 @@ def remove(self, path_info): blob.delete() - def _list_paths(self, bucket, prefix): - for blob in self.gs.bucket(bucket).list_blobs(prefix=prefix): + def _list_paths(self, bucket, prefix, max_items=None): + for blob in self.gs.bucket(bucket).list_blobs( + prefix=prefix, max_results=max_items + ): yield blob.name def list_cache_paths(self): - return self._list_paths(self.path_info.bucket, self.path_info.path) + return self.walk_files(self.path_info.bucket, self.path_info.path) def walk_files(self, path_info): for fname in self._list_paths(path_info.bucket, path_info.path): @@ -150,12 +152,17 @@ def walk_files(self, path_info): def isdir(self, path_info): dir_path = path_info / "" - file = next(self._list_paths(path_info.bucket, dir_path.path), "") - return file.startswith(dir_path.path) + return bool( + list( + self._list_paths(path_info.bucket, dir_path.path, max_items=1) + ) + ) def exists(self, path_info): dir_path = path_info / "" - file = next(self._list_paths(path_info.bucket, path_info.path), "") + file = next( + self._list_paths(path_info.bucket, path_info.path, max_items=1), "" + ) return path_info.path == file or file.startswith(dir_path.path) def _upload(self, from_file, to_info, name=None, no_progress_bar=True): From 0c08c09e5d35e1112de1c33bd82d487aae85fb8b Mon Sep 17 00:00:00 2001 From: Saugat Pachhai Date: Thu, 28 Nov 2019 05:45:30 +0545 Subject: [PATCH 03/16] dvc: append slash at the end for dir path_info --- dvc/remote/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dvc/remote/base.py b/dvc/remote/base.py index 011b0af4aa..f2f0edbb4b 100644 --- a/dvc/remote/base.py +++ 
b/dvc/remote/base.py @@ -312,7 +312,7 @@ def get_checksum(self, path_info): return checksum if self.isdir(path_info): - checksum = self.get_dir_checksum(path_info) + checksum = self.get_dir_checksum(path_info / "") else: checksum = self.get_file_checksum(path_info) From fe248aa130c438d63309779d499fef3791f8e709 Mon Sep 17 00:00:00 2001 From: Saugat Pachhai Date: Thu, 28 Nov 2019 05:47:04 +0545 Subject: [PATCH 04/16] dvc: use replace instead of posixpath --- dvc/remote/gs.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/dvc/remote/gs.py b/dvc/remote/gs.py index aa0958bd38..882e329369 100644 --- a/dvc/remote/gs.py +++ b/dvc/remote/gs.py @@ -5,7 +5,6 @@ from functools import wraps import io import os.path -import posixpath from funcy import cached_property @@ -148,7 +147,7 @@ def list_cache_paths(self): def walk_files(self, path_info): for fname in self._list_paths(path_info.bucket, path_info.path): - yield path_info / posixpath.relpath(fname, path_info.path) + yield path_info.replace(fname) def isdir(self, path_info): dir_path = path_info / "" @@ -160,10 +159,10 @@ def isdir(self, path_info): def exists(self, path_info): dir_path = path_info / "" - file = next( + fname = next( self._list_paths(path_info.bucket, path_info.path, max_items=1), "" ) - return path_info.path == file or file.startswith(dir_path.path) + return path_info.path == fname or fname.startswith(dir_path.path) def _upload(self, from_file, to_info, name=None, no_progress_bar=True): bucket = self.gs.bucket(to_info.bucket) From db12416eacc10dbb8eb264826f22ba3f220856ec Mon Sep 17 00:00:00 2001 From: Saugat Pachhai Date: Thu, 28 Nov 2019 06:05:30 +0545 Subject: [PATCH 05/16] remove extra args --- dvc/remote/gs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dvc/remote/gs.py b/dvc/remote/gs.py index 882e329369..e8140fc33e 100644 --- a/dvc/remote/gs.py +++ b/dvc/remote/gs.py @@ -143,7 +143,7 @@ def _list_paths(self, bucket, prefix, max_items=None): yield blob.name def list_cache_paths(self): - return self.walk_files(self.path_info.bucket, self.path_info.path) + return self.walk_files(self.path_info) def walk_files(self, path_info): for fname in self._list_paths(path_info.bucket, path_info.path): From 54dd9890dc83b7fa977ca588477852884ce68268 Mon Sep 17 00:00:00 2001 From: Saugat Pachhai Date: Thu, 28 Nov 2019 07:37:14 +0545 Subject: [PATCH 06/16] gs: return relative path in list_cache_paths --- dvc/remote/gs.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dvc/remote/gs.py b/dvc/remote/gs.py index e8140fc33e..6fdd1d4b1f 100644 --- a/dvc/remote/gs.py +++ b/dvc/remote/gs.py @@ -143,7 +143,8 @@ def _list_paths(self, bucket, prefix, max_items=None): yield blob.name def list_cache_paths(self): - return self.walk_files(self.path_info) + for cache in self.walk_files(self.path_info): + yield cache.path def walk_files(self, path_info): for fname in self._list_paths(path_info.bucket, path_info.path): From ee69ac835d9453fb804e71ed9011552a2461c1cb Mon Sep 17 00:00:00 2001 From: Saugat Pachhai Date: Fri, 29 Nov 2019 07:22:15 +0545 Subject: [PATCH 07/16] gs: make it work with empty dir --- dvc/remote/gs.py | 8 ++++++++ tests/func/test_remote_dir.py | 1 + 2 files changed, 9 insertions(+) diff --git a/dvc/remote/gs.py b/dvc/remote/gs.py index 6fdd1d4b1f..bfbd510103 100644 --- a/dvc/remote/gs.py +++ b/dvc/remote/gs.py @@ -148,8 +148,16 @@ def list_cache_paths(self): def walk_files(self, path_info): for fname in self._list_paths(path_info.bucket, path_info.path): + # skip nested 
empty directories + if fname.endswith("/"): + continue yield path_info.replace(fname) + def makedirs(self, path_info): + self.gs.bucket(path_info.bucket).blob( + (path_info / "").path + ).upload_from_string("") + def isdir(self, path_info): dir_path = path_info / "" return bool( diff --git a/tests/func/test_remote_dir.py b/tests/func/test_remote_dir.py index 81fa7b3629..b78f80041b 100644 --- a/tests/func/test_remote_dir.py +++ b/tests/func/test_remote_dir.py @@ -83,6 +83,7 @@ def put_object(file, content): put_object("data/subdir/1", "1") put_object("data/subdir/2", "2") put_object("data/subdir/3", "3") + put_object("data/subdir/4/", "") yield remote From caa67c725e1e351ed122bdad17db0f29a8e73c39 Mon Sep 17 00:00:00 2001 From: Saugat Pachhai Date: Sat, 30 Nov 2019 08:22:39 +0545 Subject: [PATCH 08/16] dvc: refactor tests --- dvc/remote/base.py | 2 +- dvc/remote/gs.py | 21 ++-- tests/func/remotes.py | 138 ++++++++++++++++++++++ tests/func/test_api.py | 54 +-------- tests/func/test_remote_dir.py | 168 --------------------------- tests/unit/remote/test_remote_dir.py | 98 ++++++++++++++++ 6 files changed, 245 insertions(+), 236 deletions(-) create mode 100644 tests/func/remotes.py delete mode 100644 tests/func/test_remote_dir.py create mode 100644 tests/unit/remote/test_remote_dir.py diff --git a/dvc/remote/base.py b/dvc/remote/base.py index f2f0edbb4b..011b0af4aa 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -312,7 +312,7 @@ def get_checksum(self, path_info): return checksum if self.isdir(path_info): - checksum = self.get_dir_checksum(path_info / "") + checksum = self.get_dir_checksum(path_info) else: checksum = self.get_file_checksum(path_info) diff --git a/dvc/remote/gs.py b/dvc/remote/gs.py index bfbd510103..f89654461b 100644 --- a/dvc/remote/gs.py +++ b/dvc/remote/gs.py @@ -136,18 +136,17 @@ def remove(self, path_info): blob.delete() - def _list_paths(self, bucket, prefix, max_items=None): - for blob in self.gs.bucket(bucket).list_blobs( - prefix=prefix, max_results=max_items + def _list_paths(self, path_info, max_items=None): + for blob in self.gs.bucket(path_info.bucket).list_blobs( + prefix=path_info.path, max_results=max_items ): yield blob.name def list_cache_paths(self): - for cache in self.walk_files(self.path_info): - yield cache.path + return self._list_paths(self.path_info) def walk_files(self, path_info): - for fname in self._list_paths(path_info.bucket, path_info.path): + for fname in self._list_paths(path_info): # skip nested empty directories if fname.endswith("/"): continue @@ -160,17 +159,11 @@ def makedirs(self, path_info): def isdir(self, path_info): dir_path = path_info / "" - return bool( - list( - self._list_paths(path_info.bucket, dir_path.path, max_items=1) - ) - ) + return bool(list(self._list_paths(dir_path, max_items=1))) def exists(self, path_info): dir_path = path_info / "" - fname = next( - self._list_paths(path_info.bucket, path_info.path, max_items=1), "" - ) + fname = next(self._list_paths(path_info, max_items=1), "") return path_info.path == fname or fname.startswith(dir_path.path) def _upload(self, from_file, to_info, name=None, no_progress_bar=True): diff --git a/tests/func/remotes.py b/tests/func/remotes.py new file mode 100644 index 0000000000..cee9cc963a --- /dev/null +++ b/tests/func/remotes.py @@ -0,0 +1,138 @@ +from .test_data_cloud import _should_test_aws +from .test_data_cloud import _should_test_azure +from .test_data_cloud import _should_test_gcp +from .test_data_cloud import _should_test_hdfs +from .test_data_cloud import 
_should_test_oss +from .test_data_cloud import _should_test_ssh +from .test_data_cloud import get_aws_url +from .test_data_cloud import get_azure_url +from .test_data_cloud import get_gcp_url +from .test_data_cloud import get_hdfs_url +from .test_data_cloud import get_local_url +from .test_data_cloud import get_oss_url +from .test_data_cloud import get_ssh_url + +from dvc.remote.gs import RemoteGS +from dvc.remote.s3 import RemoteS3 + +from contextlib import contextmanager + +from moto.s3 import mock_s3 + + +# NOTE: staticmethod is only needed in Python 2 +class Local: + should_test = staticmethod(lambda: True) + get_url = staticmethod(get_local_url) + + +class S3: + should_test = staticmethod(_should_test_aws) + get_url = staticmethod(get_aws_url) + + +class S3Mocked: + should_test = staticmethod(lambda: True) + get_url = staticmethod(get_aws_url) + + @classmethod + def remote(cls): + @contextmanager + def inner(): + with mock_s3(): + remote = RemoteS3(None, {"url": cls.get_url()}) + yield remote + + return inner() + + @classmethod + def put_objects(cls, remote, objects): + @contextmanager + def inner(): + s3 = cls.get_client(remote) + s3.create_bucket(Bucket="dvc-test") + for key, body in objects.items(): + cls.put_object(remote, key, body) + yield + + return inner() + + @staticmethod + def get_client(remote): + return remote.s3 + + @classmethod + def put_object(cls, remote, key, body): + s3 = cls.get_client(remote) + bucket = remote.path_info.bucket + + s3.put_object( + Bucket=bucket, Key=remote.path_info.path + "/" + key, Body=body + ) + + +class GCP: + should_test = staticmethod(_should_test_gcp) + get_url = staticmethod(get_gcp_url) + + @classmethod + def remote(cls): + @contextmanager + def inner(): + remote = RemoteGS(None, {"url": cls.get_url()}) + yield remote + + return inner() + + @classmethod + def put_objects(cls, remote, objects): + @contextmanager + def inner(): + for key, body in objects.items(): + cls.put_object(remote, key, body) + yield + cls.remove(remote, objects.keys()) + + return inner() + + @classmethod + def put_object(cls, remote, key, body): + client = cls.get_client(remote) + bucket = remote.path_info.bucket + + bucket = client.get_bucket(bucket) + blob = bucket.blob(remote.path_info.path + "/" + key) + blob.upload_from_string(body) + + @staticmethod + def get_client(remote): + return remote.gs + + @staticmethod + def remove(remote, files): + for fname in files: + remote.remove(remote.path_info / fname) + + +class Azure: + should_test = staticmethod(_should_test_azure) + get_url = staticmethod(get_azure_url) + + +class OSS: + should_test = staticmethod(_should_test_oss) + get_url = staticmethod(get_oss_url) + + +class SSH: + should_test = staticmethod(_should_test_ssh) + get_url = staticmethod(get_ssh_url) + + +class HDFS: + should_test = staticmethod(_should_test_hdfs) + get_url = staticmethod(get_hdfs_url) + + +remote_params = [S3, GCP, Azure, OSS, SSH, HDFS] +all_remote_params = [Local] + remote_params diff --git a/tests/func/test_api.py b/tests/func/test_api.py index 3ba71ea467..a8f2938530 100644 --- a/tests/func/test_api.py +++ b/tests/func/test_api.py @@ -3,64 +3,12 @@ import pytest -from .test_data_cloud import _should_test_aws -from .test_data_cloud import _should_test_azure -from .test_data_cloud import _should_test_gcp -from .test_data_cloud import _should_test_hdfs -from .test_data_cloud import _should_test_oss -from .test_data_cloud import _should_test_ssh -from .test_data_cloud import get_aws_url -from .test_data_cloud import get_azure_url -from 
.test_data_cloud import get_gcp_url -from .test_data_cloud import get_hdfs_url -from .test_data_cloud import get_local_url -from .test_data_cloud import get_oss_url -from .test_data_cloud import get_ssh_url from dvc import api from dvc.exceptions import FileMissingError from dvc.main import main from dvc.path_info import URLInfo from dvc.remote.config import RemoteConfig - - -# NOTE: staticmethod is only needed in Python 2 -class Local: - should_test = staticmethod(lambda: True) - get_url = staticmethod(get_local_url) - - -class S3: - should_test = staticmethod(_should_test_aws) - get_url = staticmethod(get_aws_url) - - -class GCP: - should_test = staticmethod(_should_test_gcp) - get_url = staticmethod(get_gcp_url) - - -class Azure: - should_test = staticmethod(_should_test_azure) - get_url = staticmethod(get_azure_url) - - -class OSS: - should_test = staticmethod(_should_test_oss) - get_url = staticmethod(get_oss_url) - - -class SSH: - should_test = staticmethod(_should_test_ssh) - get_url = staticmethod(get_ssh_url) - - -class HDFS: - should_test = staticmethod(_should_test_hdfs) - get_url = staticmethod(get_hdfs_url) - - -remote_params = [S3, GCP, Azure, OSS, SSH, HDFS] -all_remote_params = [Local] + remote_params +from tests.func.remotes import remote_params, all_remote_params @pytest.fixture diff --git a/tests/func/test_remote_dir.py b/tests/func/test_remote_dir.py deleted file mode 100644 index b78f80041b..0000000000 --- a/tests/func/test_remote_dir.py +++ /dev/null @@ -1,168 +0,0 @@ -# -*- coding: utf-8 -*- -import pytest -import uuid - -from moto import mock_s3 - -from dvc.remote.gs import RemoteGS -from dvc.remote.s3 import RemoteS3 - -from tests.func.test_data_cloud import _should_test_gcp - -test_gs = pytest.mark.skipif(not _should_test_gcp(), reason="Skipping on gs.") - - -def create_object_gs(client, bucket, key, body): - bucket = client.get_bucket(bucket) - blob = bucket.blob(key) - blob.upload_from_string(body) - - -@pytest.fixture -def remote_s3(): - """Returns a RemoteS3 connected to a bucket with the following structure: - - bucket - ├── data - │ ├── alice - │ ├── alpha - │ └── subdir - │ ├── 1 - │ ├── 2 - │ └── 3 - ├── empty_dir - ├── empty_file - └── foo - """ - with mock_s3(): - remote = RemoteS3(None, {"url": "s3://bucket", "region": "us-east-1"}) - s3 = remote.s3 - - s3.create_bucket(Bucket="bucket") - s3.put_object(Bucket="bucket", Key="empty_dir/") - s3.put_object(Bucket="bucket", Key="empty_file", Body=b"") - s3.put_object(Bucket="bucket", Key="foo", Body=b"foo") - s3.put_object(Bucket="bucket", Key="data/alice", Body=b"alice") - s3.put_object(Bucket="bucket", Key="data/alpha", Body=b"alpha") - s3.put_object(Bucket="bucket", Key="data/subdir/1", Body=b"1") - s3.put_object(Bucket="bucket", Key="data/subdir/2", Body=b"2") - s3.put_object(Bucket="bucket", Key="data/subdir/3", Body=b"3") - - yield remote - - -@pytest.fixture -def remote_gs(): - """Returns a RemoteGS connected to a bucket with the following structure: - bucket - ├── data - │ ├── alice - │ ├── alpha - │ └── subdir - │ ├── 1 - │ ├── 2 - │ └── 3 - ├── empty_dir - ├── empty_file - └── foo - """ - prefix = str(uuid.uuid4()) - REMOTE_URL = "gs://dvc-test/" + prefix - remote = RemoteGS(None, {"url": REMOTE_URL}) - teardowns = [] - - def put_object(file, content): - create_object_gs(remote.gs, "dvc-test", prefix + "/" + file, content) - teardowns.append(lambda: remote.remove(remote.path_info / file)) - - put_object("empty_dir/", "") - put_object("empty_file", "") - put_object("foo", "foo") - 
put_object("data/alice", "alice") - put_object("data/alpha", "alpha") - put_object("data/subdir/1", "1") - put_object("data/subdir/2", "2") - put_object("data/subdir/3", "3") - put_object("data/subdir/4/", "") - - yield remote - - for teardown in teardowns: - teardown() - - -remote_parameterized = pytest.mark.parametrize( - "remote_name", [pytest.param("remote_gs", marks=test_gs), "remote_s3"] -) - - -@remote_parameterized -def test_isdir(request, remote_name): - remote = request.getfixturevalue(remote_name) - - test_cases = [ - (True, "data"), - (True, "data/"), - (True, "data/subdir"), - (True, "empty_dir"), - (False, "foo"), - (False, "data/alice"), - (False, "data/al"), - (False, "data/subdir/1"), - ] - - for expected, path in test_cases: - assert remote.isdir(remote.path_info / path) == expected - - -@remote_parameterized -def test_exists(request, remote_name): - remote = request.getfixturevalue(remote_name) - - test_cases = [ - (True, "data"), - (True, "data/"), - (True, "data/subdir"), - (True, "empty_dir"), - (True, "empty_file"), - (True, "foo"), - (True, "data/alice"), - (True, "data/subdir/1"), - (False, "data/al"), - (False, "foo/"), - ] - - for expected, path in test_cases: - assert remote.exists(remote.path_info / path) == expected - - -@remote_parameterized -def test_walk_files(request, remote_name): - remote = request.getfixturevalue(remote_name) - - files = [ - remote.path_info / "data/alice", - remote.path_info / "data/alpha", - remote.path_info / "data/subdir/1", - remote.path_info / "data/subdir/2", - remote.path_info / "data/subdir/3", - ] - - assert list(remote.walk_files(remote.path_info / "data")) == files - - -def test_copy_preserve_etag_across_buckets(remote_s3): - s3 = remote_s3.s3 - s3.create_bucket(Bucket="another") - - another = RemoteS3(None, {"url": "s3://another", "region": "us-east-1"}) - - from_info = remote_s3.path_info / "foo" - to_info = another.path_info / "foo" - - remote_s3.copy(from_info, to_info) - - from_etag = RemoteS3.get_etag(s3, "bucket", "foo") - to_etag = RemoteS3.get_etag(s3, "another", "foo") - - assert from_etag == to_etag diff --git a/tests/unit/remote/test_remote_dir.py b/tests/unit/remote/test_remote_dir.py new file mode 100644 index 0000000000..827fcab0eb --- /dev/null +++ b/tests/unit/remote/test_remote_dir.py @@ -0,0 +1,98 @@ +# -*- coding: utf-8 -*- +import pytest + +from dvc.remote.s3 import RemoteS3 + +from tests.func.remotes import GCP, S3Mocked + +remotes = [GCP, S3Mocked] + +FILE_WITH_CONTENTS = { + "empty_dir/": "", + "empty_file": "", + "foo": "foo", + "data/alice": "alice", + "data/alpha": "alpha", + "data/subdir/1": "1", + "data/subdir/2": "2", + "data/subdir/3": "3", + "data/subdir/empty_dir/": "", + "data/subdir/empty_file": "", +} + + +@pytest.fixture +def remote(request): + if not request.param.should_test(): + raise pytest.skip() + with request.param.remote() as remote: + with request.param.put_objects(remote, FILE_WITH_CONTENTS): + yield remote + + +@pytest.mark.parametrize("remote", remotes, indirect=True) +def test_isdir(remote): + test_cases = [ + (True, "data"), + (True, "data/"), + (True, "data/subdir"), + (True, "empty_dir"), + (False, "foo"), + (False, "data/alice"), + (False, "data/al"), + (False, "data/subdir/1"), + ] + + for expected, path in test_cases: + assert remote.isdir(remote.path_info / path) == expected + + +@pytest.mark.parametrize("remote", remotes, indirect=True) +def test_exists(remote): + test_cases = [ + (True, "data"), + (True, "data/"), + (True, "data/subdir"), + (True, "empty_dir"), + 
(True, "empty_file"), + (True, "foo"), + (True, "data/alice"), + (True, "data/subdir/1"), + (False, "data/al"), + (False, "foo/"), + ] + + for expected, path in test_cases: + assert remote.exists(remote.path_info / path) == expected + + +@pytest.mark.parametrize("remote", remotes, indirect=True) +def test_walk_files(remote): + files = [ + remote.path_info / "data/alice", + remote.path_info / "data/alpha", + remote.path_info / "data/subdir/1", + remote.path_info / "data/subdir/2", + remote.path_info / "data/subdir/3", + remote.path_info / "data/subdir/empty_file", + ] + + assert list(remote.walk_files(remote.path_info / "data")) == files + + +@pytest.mark.parametrize("remote", [S3Mocked], indirect=True) +def test_copy_preserve_etag_across_buckets(remote): + s3 = remote.s3 + s3.create_bucket(Bucket="another") + + another = RemoteS3(None, {"url": "s3://another", "region": "us-east-1"}) + + from_info = remote.path_info / "foo" + to_info = another.path_info / "foo" + + remote.copy(from_info, to_info) + + from_etag = RemoteS3.get_etag(s3, "bucket", "foo") + to_etag = RemoteS3.get_etag(s3, "another", "foo") + + assert from_etag == to_etag From 63cf117ee939d187c098084f3b49aa6144bb58b7 Mon Sep 17 00:00:00 2001 From: Saugat Pachhai Date: Sat, 30 Nov 2019 08:58:43 +0545 Subject: [PATCH 09/16] Move to tests/remotes.py --- tests/func/test_api.py | 2 +- tests/{func => }/remotes.py | 0 tests/unit/remote/test_remote_dir.py | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename tests/{func => }/remotes.py (100%) diff --git a/tests/func/test_api.py b/tests/func/test_api.py index 959d9a1988..0e4375a1a6 100644 --- a/tests/func/test_api.py +++ b/tests/func/test_api.py @@ -8,7 +8,7 @@ from dvc.main import main from dvc.path_info import URLInfo from dvc.remote.config import RemoteConfig -from tests.func.remotes import remote_params, all_remote_params +from tests.remotes import remote_params, all_remote_params @pytest.fixture diff --git a/tests/func/remotes.py b/tests/remotes.py similarity index 100% rename from tests/func/remotes.py rename to tests/remotes.py diff --git a/tests/unit/remote/test_remote_dir.py b/tests/unit/remote/test_remote_dir.py index e919d40409..9213a670f3 100644 --- a/tests/unit/remote/test_remote_dir.py +++ b/tests/unit/remote/test_remote_dir.py @@ -3,7 +3,7 @@ from dvc.remote.s3 import RemoteS3 -from tests.func.remotes import GCP, S3Mocked +from tests.remotes import GCP, S3Mocked remotes = [GCP, S3Mocked] From 42991d3b75c7551649736a4a48504831b53d6457 Mon Sep 17 00:00:00 2001 From: Saugat Pachhai Date: Sat, 30 Nov 2019 10:04:45 +0545 Subject: [PATCH 10/16] Move more stuff from data_cloud to tests/remotes.py --- tests/func/test_data_cloud.py | 230 +++--------------------------- tests/func/test_remote.py | 3 +- tests/func/test_repro.py | 16 ++- tests/func/test_s3.py | 2 +- tests/remotes.py | 229 +++++++++++++++++++++++++++-- tests/unit/remote/ssh/test_ssh.py | 2 +- 6 files changed, 249 insertions(+), 233 deletions(-) diff --git a/tests/func/test_data_cloud.py b/tests/func/test_data_cloud.py index ad0d33020b..0d56734668 100644 --- a/tests/func/test_data_cloud.py +++ b/tests/func/test_data_cloud.py @@ -1,13 +1,8 @@ import copy -import getpass import logging import os -import platform import shutil import uuid -from subprocess import CalledProcessError -from subprocess import check_output -from subprocess import Popen from unittest import SkipTest import pytest @@ -29,7 +24,6 @@ from dvc.remote.base import STATUS_DELETED from dvc.remote.base import STATUS_NEW from dvc.remote.base import 
STATUS_OK -from dvc.utils import env2bool from dvc.utils import file_md5 from dvc.utils.compat import str from dvc.utils.stage import dump_stage_file @@ -37,208 +31,30 @@ from tests.basic_env import TestDvc from tests.utils import spy - -TEST_REMOTE = "upstream" -TEST_SECTION = 'remote "{}"'.format(TEST_REMOTE) -TEST_CONFIG = { - Config.SECTION_CACHE: {}, - Config.SECTION_CORE: {Config.SECTION_CORE_REMOTE: TEST_REMOTE}, - TEST_SECTION: {Config.SECTION_REMOTE_URL: ""}, -} - -TEST_AWS_REPO_BUCKET = os.environ.get("DVC_TEST_AWS_REPO_BUCKET", "dvc-test") -TEST_GCP_REPO_BUCKET = os.environ.get("DVC_TEST_GCP_REPO_BUCKET", "dvc-test") -TEST_OSS_REPO_BUCKET = "dvc-test" - -TEST_GCP_CREDS_FILE = os.path.abspath( - os.environ.get( - "GOOGLE_APPLICATION_CREDENTIALS", - os.path.join("scripts", "ci", "gcp-creds.json"), - ) -) -# Ensure that absolute path is used -os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = TEST_GCP_CREDS_FILE - -TEST_GDRIVE_CLIENT_ID = ( - "719861249063-v4an78j9grdtuuuqg3lnm0sugna6v3lh.apps.googleusercontent.com" +from tests.remotes import ( + _should_test_aws, + _should_test_azure, + _should_test_gcp, + _should_test_gdrive, + _should_test_hdfs, + _should_test_oss, + _should_test_ssh, + TEST_CONFIG, + TEST_SECTION, + TEST_GCP_CREDS_FILE, + TEST_GDRIVE_CLIENT_ID, + TEST_GDRIVE_CLIENT_SECRET, + TEST_REMOTE, + get_aws_url, + get_azure_url, + get_gcp_url, + get_gdrive_url, + get_hdfs_url, + get_local_url, + get_oss_url, + get_ssh_url, + get_ssh_url_mocked, ) -TEST_GDRIVE_CLIENT_SECRET = "2fy_HyzSwkxkGzEken7hThXb" - - -def _should_test_aws(): - do_test = env2bool("DVC_TEST_AWS", undefined=None) - if do_test is not None: - return do_test - - if os.getenv("AWS_ACCESS_KEY_ID") and os.getenv("AWS_SECRET_ACCESS_KEY"): - return True - - return False - - -def _should_test_gdrive(): - if os.getenv(RemoteGDrive.GDRIVE_USER_CREDENTIALS_DATA): - return True - - return False - - -def _should_test_gcp(): - do_test = env2bool("DVC_TEST_GCP", undefined=None) - if do_test is not None: - return do_test - - if not os.path.exists(TEST_GCP_CREDS_FILE): - return False - - try: - check_output( - [ - "gcloud", - "auth", - "activate-service-account", - "--key-file", - TEST_GCP_CREDS_FILE, - ] - ) - except (CalledProcessError, OSError): - return False - return True - - -def _should_test_azure(): - do_test = env2bool("DVC_TEST_AZURE", undefined=None) - if do_test is not None: - return do_test - - return os.getenv("AZURE_STORAGE_CONTAINER_NAME") and os.getenv( - "AZURE_STORAGE_CONNECTION_STRING" - ) - - -def _should_test_oss(): - do_test = env2bool("DVC_TEST_OSS", undefined=None) - if do_test is not None: - return do_test - - return ( - os.getenv("OSS_ENDPOINT") - and os.getenv("OSS_ACCESS_KEY_ID") - and os.getenv("OSS_ACCESS_KEY_SECRET") - ) - - -def _should_test_ssh(): - do_test = env2bool("DVC_TEST_SSH", undefined=None) - if do_test is not None: - return do_test - - # FIXME: enable on windows - if os.name == "nt": - return False - - try: - check_output(["ssh", "-o", "BatchMode=yes", "127.0.0.1", "ls"]) - except (CalledProcessError, IOError): - return False - - return True - - -def _should_test_hdfs(): - if platform.system() != "Linux": - return False - - try: - check_output( - ["hadoop", "version"], shell=True, executable=os.getenv("SHELL") - ) - except (CalledProcessError, IOError): - return False - - p = Popen( - "hadoop fs -ls hdfs://127.0.0.1/", - shell=True, - executable=os.getenv("SHELL"), - ) - p.communicate() - if p.returncode != 0: - return False - - return True - - -def get_local_storagepath(): - 
return TestDvc.mkdtemp() - - -def get_local_url(): - return get_local_storagepath() - - -def get_ssh_url(): - return "ssh://{}@127.0.0.1:22{}".format( - getpass.getuser(), get_local_storagepath() - ) - - -def get_ssh_url_mocked(user, port): - path = get_local_storagepath() - if os.name == "nt": - # NOTE: On Windows get_local_storagepath() will return an ntpath - # that looks something like `C:\some\path`, which is not compatible - # with SFTP paths [1], so we need to convert it to a proper posixpath. - # To do that, we should construct a posixpath that would be relative - # to the server's root. In our case our ssh server is running with - # `c:/` as a root, and our URL format requires absolute paths, so the - # resulting path would look like `/some/path`. - # - # [1]https://tools.ietf.org/html/draft-ietf-secsh-filexfer-13#section-6 - drive, path = os.path.splitdrive(path) - assert drive.lower() == "c:" - path = path.replace("\\", "/") - url = "ssh://{}@127.0.0.1:{}{}".format(user, port, path) - return url - - -def get_hdfs_url(): - return "hdfs://{}@127.0.0.1{}".format( - getpass.getuser(), get_local_storagepath() - ) - - -def get_aws_storagepath(): - return TEST_AWS_REPO_BUCKET + "/" + str(uuid.uuid4()) - - -def get_aws_url(): - return "s3://" + get_aws_storagepath() - - -def get_gdrive_url(): - return "gdrive://root/" + str(uuid.uuid4()) - - -def get_gcp_storagepath(): - return TEST_GCP_REPO_BUCKET + "/" + str(uuid.uuid4()) - - -def get_gcp_url(): - return "gs://" + get_gcp_storagepath() - - -def get_azure_url(): - container_name = os.getenv("AZURE_STORAGE_CONTAINER_NAME") - assert container_name is not None - return "azure://{}/{}".format(container_name, str(uuid.uuid4())) - - -def get_oss_storagepath(): - return "{}/{}".format(TEST_OSS_REPO_BUCKET, (uuid.uuid4())) - - -def get_oss_url(): - return "oss://{}".format(get_oss_storagepath()) class TestDataCloud(TestDvc): diff --git a/tests/func/test_remote.py b/tests/func/test_remote.py index 8e8f00deeb..23f318861b 100644 --- a/tests/func/test_remote.py +++ b/tests/func/test_remote.py @@ -4,14 +4,13 @@ import configobj from mock import patch -from .test_data_cloud import get_local_url from dvc.config import Config from dvc.main import main from dvc.path_info import PathInfo from dvc.remote import RemoteLOCAL from dvc.remote.base import RemoteBASE from tests.basic_env import TestDvc -from tests.func.test_data_cloud import get_local_storagepath +from tests.remotes import get_local_url, get_local_storagepath class TestRemote(TestDvc): diff --git a/tests/func/test_repro.py b/tests/func/test_repro.py index 4e46fdde90..a85995fe9e 100644 --- a/tests/func/test_repro.py +++ b/tests/func/test_repro.py @@ -36,13 +36,15 @@ from dvc.utils.stage import dump_stage_file from dvc.utils.stage import load_stage_file from tests.basic_env import TestDvc -from tests.func.test_data_cloud import _should_test_aws -from tests.func.test_data_cloud import _should_test_gcp -from tests.func.test_data_cloud import _should_test_hdfs -from tests.func.test_data_cloud import _should_test_ssh -from tests.func.test_data_cloud import get_ssh_url -from tests.func.test_data_cloud import TEST_AWS_REPO_BUCKET -from tests.func.test_data_cloud import TEST_GCP_REPO_BUCKET +from tests.remotes import ( + _should_test_aws, + _should_test_gcp, + _should_test_hdfs, + _should_test_ssh, + get_ssh_url, + TEST_AWS_REPO_BUCKET, + TEST_GCP_REPO_BUCKET, +) from tests.utils.httpd import StaticFileServer, ContentMD5Handler diff --git a/tests/func/test_s3.py b/tests/func/test_s3.py index 
7a58f9c527..a2cdb26ef3 100644 --- a/tests/func/test_s3.py +++ b/tests/func/test_s3.py @@ -5,7 +5,7 @@ from moto import mock_s3 from dvc.remote.s3 import RemoteS3 -from tests.func.test_data_cloud import get_aws_url +from tests.remotes import get_aws_url # from https://github.com/spulec/moto/blob/v1.3.5/tests/test_s3/test_s3.py#L40 diff --git a/tests/remotes.py b/tests/remotes.py index cee9cc963a..3fdfae3f87 100644 --- a/tests/remotes.py +++ b/tests/remotes.py @@ -1,25 +1,224 @@ -from .test_data_cloud import _should_test_aws -from .test_data_cloud import _should_test_azure -from .test_data_cloud import _should_test_gcp -from .test_data_cloud import _should_test_hdfs -from .test_data_cloud import _should_test_oss -from .test_data_cloud import _should_test_ssh -from .test_data_cloud import get_aws_url -from .test_data_cloud import get_azure_url -from .test_data_cloud import get_gcp_url -from .test_data_cloud import get_hdfs_url -from .test_data_cloud import get_local_url -from .test_data_cloud import get_oss_url -from .test_data_cloud import get_ssh_url +import os +import platform +import uuid +import getpass +from contextlib import contextmanager +from subprocess import CalledProcessError, check_output, Popen + +from dvc.utils import env2bool +from dvc.config import Config +from dvc.remote import RemoteGDrive from dvc.remote.gs import RemoteGS from dvc.remote.s3 import RemoteS3 - -from contextlib import contextmanager +from tests.basic_env import TestDvc from moto.s3 import mock_s3 +TEST_REMOTE = "upstream" +TEST_SECTION = 'remote "{}"'.format(TEST_REMOTE) +TEST_CONFIG = { + Config.SECTION_CACHE: {}, + Config.SECTION_CORE: {Config.SECTION_CORE_REMOTE: TEST_REMOTE}, + TEST_SECTION: {Config.SECTION_REMOTE_URL: ""}, +} + +TEST_AWS_REPO_BUCKET = os.environ.get("DVC_TEST_AWS_REPO_BUCKET", "dvc-test") +TEST_GCP_REPO_BUCKET = os.environ.get("DVC_TEST_GCP_REPO_BUCKET", "dvc-test") +TEST_OSS_REPO_BUCKET = "dvc-test" + +TEST_GCP_CREDS_FILE = os.path.abspath( + os.environ.get( + "GOOGLE_APPLICATION_CREDENTIALS", + os.path.join("scripts", "ci", "gcp-creds.json"), + ) +) +# Ensure that absolute path is used +os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = TEST_GCP_CREDS_FILE + +TEST_GDRIVE_CLIENT_ID = ( + "719861249063-v4an78j9grdtuuuqg3lnm0sugna6v3lh.apps.googleusercontent.com" +) +TEST_GDRIVE_CLIENT_SECRET = "2fy_HyzSwkxkGzEken7hThXb" + + +def _should_test_aws(): + do_test = env2bool("DVC_TEST_AWS", undefined=None) + if do_test is not None: + return do_test + + if os.getenv("AWS_ACCESS_KEY_ID") and os.getenv("AWS_SECRET_ACCESS_KEY"): + return True + + return False + + +def _should_test_gdrive(): + if os.getenv(RemoteGDrive.GDRIVE_USER_CREDENTIALS_DATA): + return True + + return False + + +def _should_test_gcp(): + do_test = env2bool("DVC_TEST_GCP", undefined=None) + if do_test is not None: + return do_test + + if not os.path.exists(TEST_GCP_CREDS_FILE): + return False + + try: + check_output( + [ + "gcloud", + "auth", + "activate-service-account", + "--key-file", + TEST_GCP_CREDS_FILE, + ] + ) + except (CalledProcessError, OSError): + return False + return True + + +def _should_test_azure(): + do_test = env2bool("DVC_TEST_AZURE", undefined=None) + if do_test is not None: + return do_test + + return os.getenv("AZURE_STORAGE_CONTAINER_NAME") and os.getenv( + "AZURE_STORAGE_CONNECTION_STRING" + ) + + +def _should_test_oss(): + do_test = env2bool("DVC_TEST_OSS", undefined=None) + if do_test is not None: + return do_test + + return ( + os.getenv("OSS_ENDPOINT") + and os.getenv("OSS_ACCESS_KEY_ID") + and 
os.getenv("OSS_ACCESS_KEY_SECRET") + ) + + +def _should_test_ssh(): + do_test = env2bool("DVC_TEST_SSH", undefined=None) + if do_test is not None: + return do_test + + # FIXME: enable on windows + if os.name == "nt": + return False + + try: + check_output(["ssh", "-o", "BatchMode=yes", "127.0.0.1", "ls"]) + except (CalledProcessError, IOError): + return False + + return True + + +def _should_test_hdfs(): + if platform.system() != "Linux": + return False + + try: + check_output( + ["hadoop", "version"], shell=True, executable=os.getenv("SHELL") + ) + except (CalledProcessError, IOError): + return False + + p = Popen( + "hadoop fs -ls hdfs://127.0.0.1/", + shell=True, + executable=os.getenv("SHELL"), + ) + p.communicate() + if p.returncode != 0: + return False + + return True + + +def get_local_storagepath(): + return TestDvc.mkdtemp() + + +def get_local_url(): + return get_local_storagepath() + + +def get_ssh_url(): + return "ssh://{}@127.0.0.1:22{}".format( + getpass.getuser(), get_local_storagepath() + ) + + +def get_ssh_url_mocked(user, port): + path = get_local_storagepath() + if os.name == "nt": + # NOTE: On Windows get_local_storagepath() will return an ntpath + # that looks something like `C:\some\path`, which is not compatible + # with SFTP paths [1], so we need to convert it to a proper posixpath. + # To do that, we should construct a posixpath that would be relative + # to the server's root. In our case our ssh server is running with + # `c:/` as a root, and our URL format requires absolute paths, so the + # resulting path would look like `/some/path`. + # + # [1]https://tools.ietf.org/html/draft-ietf-secsh-filexfer-13#section-6 + drive, path = os.path.splitdrive(path) + assert drive.lower() == "c:" + path = path.replace("\\", "/") + url = "ssh://{}@127.0.0.1:{}{}".format(user, port, path) + return url + + +def get_hdfs_url(): + return "hdfs://{}@127.0.0.1{}".format( + getpass.getuser(), get_local_storagepath() + ) + + +def get_aws_storagepath(): + return TEST_AWS_REPO_BUCKET + "/" + str(uuid.uuid4()) + + +def get_aws_url(): + return "s3://" + get_aws_storagepath() + + +def get_gdrive_url(): + return "gdrive://root/" + str(uuid.uuid4()) + + +def get_gcp_storagepath(): + return TEST_GCP_REPO_BUCKET + "/" + str(uuid.uuid4()) + + +def get_gcp_url(): + return "gs://" + get_gcp_storagepath() + + +def get_azure_url(): + container_name = os.getenv("AZURE_STORAGE_CONTAINER_NAME") + assert container_name is not None + return "azure://{}/{}".format(container_name, str(uuid.uuid4())) + + +def get_oss_storagepath(): + return "{}/{}".format(TEST_OSS_REPO_BUCKET, (uuid.uuid4())) + + +def get_oss_url(): + return "oss://{}".format(get_oss_storagepath()) + + # NOTE: staticmethod is only needed in Python 2 class Local: should_test = staticmethod(lambda: True) diff --git a/tests/unit/remote/ssh/test_ssh.py b/tests/unit/remote/ssh/test_ssh.py index 95efa231c6..630436cd45 100644 --- a/tests/unit/remote/ssh/test_ssh.py +++ b/tests/unit/remote/ssh/test_ssh.py @@ -9,7 +9,7 @@ from dvc.remote.ssh import RemoteSSH from dvc.system import System -from tests.func.test_data_cloud import get_ssh_url_mocked +from tests.remotes import get_ssh_url_mocked class TestRemoteSSH(TestCase): From 8a62e2f92bbc4a0ea54dc7607ae3c0241c5fab58 Mon Sep 17 00:00:00 2001 From: Saugat Pachhai Date: Sat, 30 Nov 2019 18:03:41 +0545 Subject: [PATCH 11/16] gs: check if the path is directory when checking for exists --- dvc/remote/gs.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/dvc/remote/gs.py 
b/dvc/remote/gs.py index 5849776363..35477e1fd2 100644 --- a/dvc/remote/gs.py +++ b/dvc/remote/gs.py @@ -148,7 +148,7 @@ def list_cache_paths(self): return self._list_paths(self.path_info) def walk_files(self, path_info): - for fname in self._list_paths(path_info): + for fname in self._list_paths(path_info / ""): # skip nested empty directories if fname.endswith("/"): continue @@ -165,8 +165,13 @@ def isdir(self, path_info): def exists(self, path_info): dir_path = path_info / "" - fname = next(self._list_paths(path_info, max_items=1), "") - return path_info.path == fname or fname.startswith(dir_path.path) + + if self.isdir(dir_path): + return True + + for fname in self._list_paths(path_info): + if path_info.path == fname: + return True def _upload(self, from_file, to_info, name=None, no_progress_bar=True): bucket = self.gs.bucket(to_info.bucket) From 433d2f8017794cb3c694cd94503d3794e3fbae3b Mon Sep 17 00:00:00 2001 From: Saugat Pachhai Date: Sat, 30 Nov 2019 19:12:35 +0545 Subject: [PATCH 12/16] test: remove nested contextmanagers, return False on exists() --- dvc/remote/gs.py | 2 ++ tests/remotes.py | 48 ++++++++++++++++++------------------------------ 2 files changed, 20 insertions(+), 30 deletions(-) diff --git a/dvc/remote/gs.py b/dvc/remote/gs.py index 35477e1fd2..2fd8b492a3 100644 --- a/dvc/remote/gs.py +++ b/dvc/remote/gs.py @@ -173,6 +173,8 @@ def exists(self, path_info): if path_info.path == fname: return True + return False + def _upload(self, from_file, to_info, name=None, no_progress_bar=True): bucket = self.gs.bucket(to_info.bucket) _upload_to_bucket( diff --git a/tests/remotes.py b/tests/remotes.py index 3fdfae3f87..b9f170d327 100644 --- a/tests/remotes.py +++ b/tests/remotes.py @@ -235,26 +235,20 @@ class S3Mocked: get_url = staticmethod(get_aws_url) @classmethod + @contextmanager def remote(cls): - @contextmanager - def inner(): - with mock_s3(): - remote = RemoteS3(None, {"url": cls.get_url()}) - yield remote - - return inner() + with mock_s3(): + remote = RemoteS3(None, {"url": cls.get_url()}) + yield remote @classmethod + @contextmanager def put_objects(cls, remote, objects): - @contextmanager - def inner(): - s3 = cls.get_client(remote) - s3.create_bucket(Bucket="dvc-test") - for key, body in objects.items(): - cls.put_object(remote, key, body) - yield - - return inner() + s3 = cls.get_client(remote) + s3.create_bucket(Bucket="dvc-test") + for key, body in objects.items(): + cls.put_object(remote, key, body) + yield @staticmethod def get_client(remote): @@ -275,24 +269,18 @@ class GCP: get_url = staticmethod(get_gcp_url) @classmethod + @contextmanager def remote(cls): - @contextmanager - def inner(): - remote = RemoteGS(None, {"url": cls.get_url()}) - yield remote - - return inner() + remote = RemoteGS(None, {"url": cls.get_url()}) + yield remote @classmethod + @contextmanager def put_objects(cls, remote, objects): - @contextmanager - def inner(): - for key, body in objects.items(): - cls.put_object(remote, key, body) - yield - cls.remove(remote, objects.keys()) - - return inner() + for key, body in objects.items(): + cls.put_object(remote, key, body) + yield + cls.remove(remote, objects.keys()) @classmethod def put_object(cls, remote, key, body): From 25f0e0765c1a644c901dbe95734245a5b78e1aee Mon Sep 17 00:00:00 2001 From: Saugat Pachhai Date: Sun, 1 Dec 2019 08:25:50 +0545 Subject: [PATCH 13/16] gs: use Blob.exists() instead of _list_paths to check for blob --- dvc/remote/gs.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git 
a/dvc/remote/gs.py b/dvc/remote/gs.py index 2fd8b492a3..d732f6045a 100644 --- a/dvc/remote/gs.py +++ b/dvc/remote/gs.py @@ -164,16 +164,13 @@ def isdir(self, path_info): return bool(list(self._list_paths(dir_path, max_items=1))) def exists(self, path_info): - dir_path = path_info / "" - - if self.isdir(dir_path): + blob = self.gs.bucket(path_info.bucket).blob(path_info.path) + if blob.exists(): return True - for fname in self._list_paths(path_info): - if path_info.path == fname: - return True - - return False + # if the blob does not exist, it could be a part of a directory path + # eg: if `data/file.txt` exists, check for `data` should return True + return self.isdir(path_info / "") def _upload(self, from_file, to_info, name=None, no_progress_bar=True): bucket = self.gs.bucket(to_info.bucket) From 7e62a7d7e534cc3d3787e1fe3656e28f4734b761 Mon Sep 17 00:00:00 2001 From: Saugat Pachhai Date: Sun, 1 Dec 2019 08:56:07 +0545 Subject: [PATCH 14/16] gs: implement isfile --- dvc/remote/gs.py | 18 ++++++++++++------ tests/unit/remote/test_remote_dir.py | 22 ++++++++++++++++++++++ 2 files changed, 34 insertions(+), 6 deletions(-) diff --git a/dvc/remote/gs.py b/dvc/remote/gs.py index d732f6045a..190c521766 100644 --- a/dvc/remote/gs.py +++ b/dvc/remote/gs.py @@ -163,14 +163,20 @@ def isdir(self, path_info): dir_path = path_info / "" return bool(list(self._list_paths(dir_path, max_items=1))) - def exists(self, path_info): + def isfile(self, path_info): + if path_info.path.endswith("/"): + return False + blob = self.gs.bucket(path_info.bucket).blob(path_info.path) - if blob.exists(): - return True + return blob.exists() + + def exists(self, path_info): + """Check if the blob exists. If it does not exist, + it could be a part of a directory path. - # if the blob does not exist, it could be a part of a directory path - # eg: if `data/file.txt` exists, check for `data` should return True - return self.isdir(path_info / "") + eg: if `data/file.txt` exists, check for `data` should return True + """ + return True if self.isfile(path_info) else self.isdir(path_info / "") def _upload(self, from_file, to_info, name=None, no_progress_bar=True): bucket = self.gs.bucket(to_info.bucket) diff --git a/tests/unit/remote/test_remote_dir.py b/tests/unit/remote/test_remote_dir.py index 9213a670f3..e0e48d1f71 100644 --- a/tests/unit/remote/test_remote_dir.py +++ b/tests/unit/remote/test_remote_dir.py @@ -106,3 +106,25 @@ def test_makedirs(remote): remote.makedirs(empty_dir) assert remote.exists(empty_dir) assert remote.isdir(empty_dir) + + +@pytest.mark.parametrize("remote", [GCP], indirect=True) +def test_isfile(remote): + test_cases = [ + (False, "empty_dir/"), + (True, "empty_file"), + (True, "foo"), + (True, "data/alice"), + (True, "data/alpha"), + (True, "data/subdir/1"), + (True, "data/subdir/2"), + (True, "data/subdir/3"), + (False, "data/subdir/empty_dir/"), + (True, "data/subdir/empty_file"), + (False, "something-that-does-not-exist"), + (False, "data/subdir/empty-file/"), + (False, "empty_dir"), + ] + + for expected, path in test_cases: + assert remote.isfile(remote.path_info / path) == expected From 22c0a644e566ee9be49a958d96a64ef4b171f987 Mon Sep 17 00:00:00 2001 From: Saugat Pachhai Date: Sun, 1 Dec 2019 16:08:53 +0545 Subject: [PATCH 15/16] address @efiop's suggestions --- dvc/remote/gs.py | 2 +- tests/func/test_api.py | 6 +++- tests/remotes.py | 51 +++++----------------------- tests/unit/remote/test_remote_dir.py | 4 +-- 4 files changed, 17 insertions(+), 46 deletions(-) diff --git a/dvc/remote/gs.py 
b/dvc/remote/gs.py index 190c521766..b71dab3d4b 100644 --- a/dvc/remote/gs.py +++ b/dvc/remote/gs.py @@ -176,7 +176,7 @@ def exists(self, path_info): eg: if `data/file.txt` exists, check for `data` should return True """ - return True if self.isfile(path_info) else self.isdir(path_info / "") + return self.isfile(path_info) or self.isdir(path_info / "") def _upload(self, from_file, to_info, name=None, no_progress_bar=True): bucket = self.gs.bucket(to_info.bucket) diff --git a/tests/func/test_api.py b/tests/func/test_api.py index de029960be..b0e0085e31 100644 --- a/tests/func/test_api.py +++ b/tests/func/test_api.py @@ -8,7 +8,11 @@ from dvc.main import main from dvc.path_info import URLInfo from dvc.remote.config import RemoteConfig -from tests.remotes import remote_params, all_remote_params +from tests.remotes import Azure, GCP, HDFS, Local, OSS, S3, SSH + + +remote_params = [S3, GCP, Azure, OSS, SSH, HDFS] +all_remote_params = [Local] + remote_params @pytest.fixture diff --git a/tests/remotes.py b/tests/remotes.py index b9f170d327..4c0d776607 100644 --- a/tests/remotes.py +++ b/tests/remotes.py @@ -242,27 +242,16 @@ def remote(cls): yield remote @classmethod - @contextmanager def put_objects(cls, remote, objects): - s3 = cls.get_client(remote) - s3.create_bucket(Bucket="dvc-test") + s3 = remote.s3 + bucket = remote.path_info.bucket + s3.create_bucket(Bucket=bucket) for key, body in objects.items(): - cls.put_object(remote, key, body) + s3.put_object( + Bucket=bucket, Key=(remote.path_info / key).path, Body=body + ) yield - @staticmethod - def get_client(remote): - return remote.s3 - - @classmethod - def put_object(cls, remote, key, body): - s3 = cls.get_client(remote) - bucket = remote.path_info.bucket - - s3.put_object( - Bucket=bucket, Key=remote.path_info.path + "/" + key, Body=body - ) - class GCP: should_test = staticmethod(_should_test_gcp) @@ -275,30 +264,12 @@ def remote(cls): yield remote @classmethod - @contextmanager def put_objects(cls, remote, objects): + client = remote.gs + bucket = client.get_bucket(remote.path_info.bucket) for key, body in objects.items(): - cls.put_object(remote, key, body) + bucket.blob((remote.path_info / key).path).upload_from_string(body) yield - cls.remove(remote, objects.keys()) - - @classmethod - def put_object(cls, remote, key, body): - client = cls.get_client(remote) - bucket = remote.path_info.bucket - - bucket = client.get_bucket(bucket) - blob = bucket.blob(remote.path_info.path + "/" + key) - blob.upload_from_string(body) - - @staticmethod - def get_client(remote): - return remote.gs - - @staticmethod - def remove(remote, files): - for fname in files: - remote.remove(remote.path_info / fname) class Azure: @@ -319,7 +290,3 @@ class SSH: class HDFS: should_test = staticmethod(_should_test_hdfs) get_url = staticmethod(get_hdfs_url) - - -remote_params = [S3, GCP, Azure, OSS, SSH, HDFS] -all_remote_params = [Local] + remote_params diff --git a/tests/unit/remote/test_remote_dir.py b/tests/unit/remote/test_remote_dir.py index e0e48d1f71..bc58c5e716 100644 --- a/tests/unit/remote/test_remote_dir.py +++ b/tests/unit/remote/test_remote_dir.py @@ -26,8 +26,8 @@ def remote(request): if not request.param.should_test(): raise pytest.skip() with request.param.remote() as remote: - with request.param.put_objects(remote, FILE_WITH_CONTENTS): - yield remote + request.param.put_objects(remote, FILE_WITH_CONTENTS) + yield remote @pytest.mark.parametrize("remote", remotes, indirect=True) From 4f011f62c76eb562677a2a1d3ab03b98203a65cc Mon Sep 17 00:00:00 2001 
From: Saugat Pachhai Date: Sun, 1 Dec 2019 16:24:22 +0545 Subject: [PATCH 16/16] use yield and staticmethod --- tests/remotes.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/remotes.py b/tests/remotes.py index 4c0d776607..679f3f93b0 100644 --- a/tests/remotes.py +++ b/tests/remotes.py @@ -241,8 +241,8 @@ def remote(cls): remote = RemoteS3(None, {"url": cls.get_url()}) yield remote - @classmethod - def put_objects(cls, remote, objects): + @staticmethod + def put_objects(remote, objects): s3 = remote.s3 bucket = remote.path_info.bucket s3.create_bucket(Bucket=bucket) @@ -250,7 +250,6 @@ def put_objects(cls, remote, objects): s3.put_object( Bucket=bucket, Key=(remote.path_info / key).path, Body=body ) - yield class GCP: @@ -263,13 +262,12 @@ def remote(cls): remote = RemoteGS(None, {"url": cls.get_url()}) yield remote - @classmethod - def put_objects(cls, remote, objects): + @staticmethod + def put_objects(remote, objects): client = remote.gs bucket = client.get_bucket(remote.path_info.bucket) for key, body in objects.items(): bucket.blob((remote.path_info / key).path).upload_from_string(body) - yield class Azure:
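
---

Taken together, the series leaves RemoteGS emulating directories on top of GCS's flat keyspace: a "directory" is any prefix that at least one key extends, and empty directories are zero-byte marker objects whose keys end in "/". Below is a condensed sketch of the helpers the gs.py diffs converge on by PATCH 15. It paraphrases the patched methods as free functions, so treat it as an illustration rather than a drop-in module: `client` stands in for the google.cloud.storage.Client the remote keeps as `self.gs`, and it assumes DVC's CloudURLInfo path objects (where `path_info / ""` yields the same path with a trailing slash); the import path for CloudURLInfo is an assumption for DVC of this era.

from dvc.path_info import CloudURLInfo  # assumed import path for DVC ~0.x
from google.cloud import storage


def _list_paths(client, path_info, max_items=None):
    # GCS has no real directories; listing keys by prefix is the only
    # way to ask "does anything live under this path?".
    for blob in client.bucket(path_info.bucket).list_blobs(
        prefix=path_info.path, max_results=max_items
    ):
        yield blob.name


def isdir(client, path_info):
    # A "directory" exists iff at least one key starts with "<path>/",
    # so fetching a single result (max_items=1) is enough to decide.
    dir_path = path_info / ""
    return bool(list(_list_paths(client, dir_path, max_items=1)))


def isfile(client, path_info):
    # Keys ending in "/" are empty-directory markers, never files.
    if path_info.path.endswith("/"):
        return False
    return client.bucket(path_info.bucket).blob(path_info.path).exists()


def exists(client, path_info):
    # A path exists either as a blob or as a prefix of other blobs:
    # with only "data/file.txt" uploaded, "data" still exists.
    return isfile(client, path_info) or isdir(client, path_info)


def walk_files(client, path_info):
    # Enumerate real files under a prefix, skipping "<dir>/" markers so
    # empty-directory placeholders never surface as files.
    for fname in _list_paths(client, path_info / ""):
        if fname.endswith("/"):
            continue
        yield path_info.replace(fname)


def makedirs(client, path_info):
    # Directories are emulated with a zero-byte "<path>/" marker object.
    client.bucket(path_info.bucket).blob(
        (path_info / "").path
    ).upload_from_string("")


if __name__ == "__main__":
    # Needs GOOGLE_APPLICATION_CREDENTIALS and an accessible bucket.
    client = storage.Client()
    info = CloudURLInfo("gs://dvc-test/data")
    print(exists(client, info), isdir(client, info), isfile(client, info))

Note the asymmetry the test cases pin down: exists("foo/") is False when "foo" is a plain blob (the isfile branch rejects the trailing slash and no key starts with "foo/"), while exists("data") is True even though no blob named "data" exists.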
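On the test side, the pattern the refactor settles on (PATCH 08 through 16) also merits a compact illustration: pytest's indirect parametrization routes a wrapper class (GCP, S3Mocked, ...) into a single `remote` fixture, and each wrapper carries `should_test()`, a `remote()` context manager, and `put_objects()`. A minimal self-contained sketch follows, with a hypothetical in-memory wrapper standing in for the real backends:

from contextlib import contextmanager

import pytest


class FakeRemote:
    """Hypothetical stand-in for the GCP / S3Mocked classes in tests/remotes.py."""

    should_test = staticmethod(lambda: True)  # real wrappers probe env vars here

    @classmethod
    @contextmanager
    def remote(cls):
        yield {}  # a plain dict plays the remote in this sketch

    @staticmethod
    def put_objects(remote, objects):
        remote.update(objects)


@pytest.fixture
def remote(request):
    # indirect=True delivers each parametrize value as request.param, so
    # one fixture serves every backend and unconfigured backends skip.
    if not request.param.should_test():
        pytest.skip("remote is not configured")
    with request.param.remote() as remote:
        request.param.put_objects(remote, {"foo": "foo"})
        yield remote


@pytest.mark.parametrize("remote", [FakeRemote], indirect=True)
def test_put_objects(remote):
    assert remote["foo"] == "foo"

The payoff, visible in tests/unit/remote/test_remote_dir.py above, is that test_isdir, test_exists, and test_walk_files are written once and run unchanged against both the moto-mocked S3 bucket and a live GCS bucket.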