Skip to content

Commit 504483c

Browse files
aramesh117 and Aaditya Ramesh authored
Add more consistent ls for TarFileSystem (#914)
Add a consistent ls that lists subdirectories of directories, and add tests.

Co-authored-by: Aaditya Ramesh <[email protected]>
1 parent 732caa0 commit 504483c

File tree

2 files changed

+53
-6
lines changed

2 files changed

+53
-6
lines changed

fsspec/implementations/tar.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -88,22 +88,25 @@ def _index(self):
8888
out = {}
8989
for ti in self.tar:
9090
info = ti.get_info()
91-
info["type"] = typemap[info["type"]]
91+
info["type"] = typemap.get(info["type"], "file")
9292
name = ti.get_info()["name"].rstrip("/")
9393
out[name] = (info, ti.offset_data)
9494

9595
self.index = out
9696
# TODO: save index to self.index_store here, if set
9797

9898
def _get_dirs(self):
99-
10099
if self.dir_cache is not None:
101100
return
102101

103-
self.dir_cache = {}
102+
# This enables ls to get directories as children as well as files
103+
self.dir_cache = {
104+
dirname + "/": {"name": dirname + "/", "size": 0, "type": "directory"}
105+
for dirname in self._all_dirnames(self.tar.getnames())
106+
}
104107
for member in self.tar.getmembers():
105108
info = member.get_info()
106-
info["type"] = typemap[info["type"]]
109+
info["type"] = typemap.get(info["type"], "file")
107110
self.dir_cache[info["name"]] = info
108111

109112
def _open(self, path, mode="rb", **kwargs):

fsspec/implementations/tests/test_tar.py

Lines changed: 46 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,17 @@
11
import os
22
import shutil
3+
import tarfile
34
import tempfile
5+
from io import BytesIO
6+
from pathlib import Path
7+
from typing import Dict
48

59
import pytest
610

711
import fsspec
812
from fsspec.core import OpenFile
913
from fsspec.implementations.cached import WholeFileCacheFileSystem
14+
from fsspec.implementations.tar import TarFileSystem
1015
from fsspec.implementations.tests.test_archive import archive_data, temptar
1116

1217

@@ -171,7 +176,6 @@ def test_filesystem_cached(recipe, tmpdir):
171176
ids=["tar", "tar-gz", "tar-bz2", "tar-xz"],
172177
)
173178
def test_url_to_fs_direct(recipe, tmpdir):
174-
175179
with temptar(archive_data, mode=recipe["mode"], suffix=recipe["suffix"]) as tf:
176180
url = f"tar://inner::file://{tf}"
177181
fs, url = fsspec.core.url_to_fs(url=url)
@@ -189,8 +193,48 @@ def test_url_to_fs_direct(recipe, tmpdir):
189193
ids=["tar", "tar-gz", "tar-bz2", "tar-xz"],
190194
)
191195
def test_url_to_fs_cached(recipe, tmpdir):
192-
193196
with temptar(archive_data, mode=recipe["mode"], suffix=recipe["suffix"]) as tf:
194197
url = f"tar://inner::simplecache::file://{tf}"
195198
fs, url = fsspec.core.url_to_fs(url=url)
196199
assert fs.cat("b") == b"hello"
200+
201+
202+
@pytest.mark.parametrize(
203+
"compression", ["", "gz", "bz2", "xz"], ids=["tar", "tar-gz", "tar-bz2", "tar-xz"]
204+
)
205+
def test_ls_with_folders(compression: str, tmp_path: Path):
206+
"""
207+
Create a tar file that doesn't include the intermediate folder structure,
208+
but make sure that the reading filesystem is still able to resolve the
209+
intermediate folders, like the ZipFileSystem.
210+
"""
211+
tar_data: Dict[str, bytes] = {
212+
"a.pdf": b"Hello A!",
213+
"b/c.pdf": b"Hello C!",
214+
"d/e/f.pdf": b"Hello F!",
215+
"d/g.pdf": b"Hello G!",
216+
}
217+
if compression:
218+
temp_archive_file = tmp_path / f"test_tar_file.tar.{compression}"
219+
else:
220+
temp_archive_file = tmp_path / "test_tar_file.tar"
221+
with open(temp_archive_file, "wb") as fd:
222+
# We need to manually write the tarfile here, because temptar
223+
# creates intermediate directories which is not how tars are always created
224+
with tarfile.open(fileobj=fd, mode=f"w:{compression}") as tf:
225+
for tar_file_path, data in tar_data.items():
226+
content = data
227+
info = tarfile.TarInfo(name=tar_file_path)
228+
info.size = len(content)
229+
tf.addfile(info, BytesIO(content))
230+
with open(temp_archive_file, "rb") as fd:
231+
fs = TarFileSystem(fd)
232+
assert fs.find("/", withdirs=True) == [
233+
"a.pdf",
234+
"b/",
235+
"b/c.pdf",
236+
"d/",
237+
"d/e/",
238+
"d/e/f.pdf",
239+
"d/g.pdf",
240+
]

0 commit comments

Comments
 (0)