Skip to content

s3,gs: don't append "/" to prefixed path when listing objects #4149

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jul 2, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions dvc/remote/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,8 @@ def _list_paths(self, bucket, prefix):
next_marker = blobs.next_marker

def walk_files(self, path_info, **kwargs):
if not kwargs.pop("prefix", False):
path_info = path_info / ""
for fname in self._list_paths(
path_info.bucket, path_info.path, **kwargs
):
Expand Down
13 changes: 10 additions & 3 deletions dvc/remote/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,12 @@ def iscopy(self, path_info):
return False # We can't be sure by default

def walk_files(self, path_info, **kwargs):
"""Return a generator with `PathInfo`s to all the files"""
"""Return a generator with `PathInfo`s to all the files.

Optional kwargs:
prefix (bool): If true `path_info` will be treated as a prefix
rather than directory path.
"""
raise NotImplementedError

def is_empty(self, path_info):
Expand Down Expand Up @@ -522,14 +527,16 @@ def list_paths(self, prefix=None, progress_callback=None):
path_info = self.path_info / prefix[:2] / prefix[2:]
else:
path_info = self.path_info / prefix[:2]
prefix = True
else:
path_info = self.path_info
prefix = False
if progress_callback:
for file_info in self.walk_files(path_info):
for file_info in self.walk_files(path_info, prefix=prefix):
progress_callback()
yield file_info.path
else:
yield from self.walk_files(path_info)
yield from self.walk_files(path_info, prefix=prefix)

def list_hashes(self, prefix=None, progress_callback=None):
"""Iterate over hashes in this tree.
Expand Down
2 changes: 1 addition & 1 deletion dvc/remote/gdrive.py
Original file line number Diff line number Diff line change
Expand Up @@ -553,7 +553,7 @@ def _list_paths(self, prefix=None):
)

def walk_files(self, path_info, **kwargs):
if path_info == self.path_info:
if path_info == self.path_info or not kwargs.pop("prefix", False):
prefix = None
else:
prefix = path_info.path
Expand Down
4 changes: 3 additions & 1 deletion dvc/remote/gs.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,9 @@ def _list_paths(self, path_info, max_items=None):
yield blob.name

def walk_files(self, path_info, **kwargs):
for fname in self._list_paths(path_info / "", **kwargs):
if not kwargs.pop("prefix", False):
path_info = path_info / ""
for fname in self._list_paths(path_info, **kwargs):
# skip nested empty directories
if fname.endswith("/"):
continue
Expand Down
2 changes: 2 additions & 0 deletions dvc/remote/oss.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@ def _list_paths(self, path_info):
yield blob.key

def walk_files(self, path_info, **kwargs):
if not kwargs.pop("prefix", False):
path_info = path_info / ""
for fname in self._list_paths(path_info):
if fname.endswith("/"):
continue
Expand Down
4 changes: 3 additions & 1 deletion dvc/remote/s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,9 @@ def _list_paths(self, path_info, max_items=None):
)

def walk_files(self, path_info, **kwargs):
for fname in self._list_paths(path_info / "", **kwargs):
if not kwargs.pop("prefix", False):
path_info = path_info / ""
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@efiop after this change we'll treat paths as directories and append the trailing slash for S3-like remote walk_files unless prefix kwarg is enabled

for fname in self._list_paths(path_info, **kwargs):
if fname.endswith("/"):
continue

Expand Down
14 changes: 14 additions & 0 deletions tests/unit/remote/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,20 @@ def test_list_hashes(dvc):
assert hashes == ["123456"]


def test_list_paths(dvc):
tree = BaseRemoteTree(dvc, {})
tree.path_info = PathInfo("foo")

with mock.patch.object(tree, "walk_files", return_value=[]) as walk_mock:
for _ in tree.list_paths():
pass
walk_mock.assert_called_with(tree.path_info, prefix=False)

for _ in tree.list_paths(prefix="000"):
pass
walk_mock.assert_called_with(tree.path_info / "00" / "0", prefix=True)


@pytest.mark.parametrize(
"hash_, result",
[(None, False), ("", False), ("3456.dir", True), ("3456", False)],
Expand Down