Skip to content

Commit 4032091

Browse files
authored
Merge pull request #243 from martindurant/caches
instance- and dir-caches
2 parents 4c66e09 + 120ec16 commit 4032091

File tree

8 files changed

+153
-16
lines changed

8 files changed

+153
-16
lines changed

docs/source/api.rst

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,10 @@ Base Classes
3232
fsspec.core.OpenFile
3333
fsspec.core.BaseCache
3434
fsspec.core.get_fs_token_paths
35+
fsspec.dircache.DirCache
3536

3637
.. autoclass:: fsspec.spec.AbstractFileSystem
38+
:members:
3739

3840
.. autoclass:: fsspec.spec.Transaction
3941
:members:
@@ -52,6 +54,9 @@ Base Classes
5254

5355
.. autofunction:: fsspec.core.get_fs_token_paths
5456

57+
.. autoclass:: fsspec.dircache.DirCache
58+
:members: __init__
59+
5560
.. _implementations:
5661

5762
Built-in Implementations
@@ -84,7 +89,7 @@ Built-in Implementations
8489
:members: __init__
8590

8691
.. autoclass:: fsspec.implementations.local.LocalFileSystem
87-
:members:
92+
:members: __init__
8893

8994
.. autoclass:: fsspec.implementations.memory.MemoryFileSystem
9095
:members: __init__
@@ -105,6 +110,7 @@ Built-in Implementations
105110
:members: __init__
106111

107112
.. autoclass:: fsspec.implementations.cached.WholeFileCacheFileSystem
113+
:members: __init__
108114

109115
Other Known Implementations
110116
---------------------------

docs/source/features.rst

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,28 @@ the instance cache may cause excessive memory usage in some situations; but norm
166166
will get ``close``d, and the data discarded. Only when there is also an unfinalised transaction or
167167
captured traceback might this be anticipated becoming a problem.
168168
169+
To disable instance caching, i.e., get a fresh instance which is not in the cache
170+
even for a cachable class, pass ``skip_instance_cache=True``.
171+
172+
Listings Caching
173+
----------------
174+
175+
For some implementations, getting file listings (i.e., ``ls`` and anything that
176+
depends on it) is expensive. These implementations use dict-like instances of
177+
:class:`fsspec.dircache.DirCache` to manage the listings.
178+
179+
The cache allows for time-based expiry of entries with the ``listings_expiry_time``
180+
parameter, or LRU expiry with the ``max_paths`` parameter. These can be
181+
set on any implementation instance that uses listings caching; or to skip the
182+
caching altogether, use ``use_listings_cache=False``. That would be appropriate
183+
when the target location is known to be volatile because it is being written
184+
to from other sources.
185+
186+
When the ``fsspec`` instance writes to the backend, the method ``invalidate_cache``
187+
is called, so that subsequent listing of the given paths will force a refresh. In
188+
addition, some methods like ``ls`` have a ``refresh`` parameter to force fetching
189+
the listing again.
190+
169191
File Buffering
170192
--------------
171193

fsspec/dircache.py

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
from functools import lru_cache
2+
import time
3+
from collections.abc import MutableMapping
4+
5+
6+
class DirCache(MutableMapping):
    """
    Caching of directory listings, in a structure like

    {"path0": [
        {"name": "path0/file0",
         "size": 123,
         "type": "file",
         ...
        },
        {"name": "path0/file1",
        },
        ...
        ],
     "path1": [...]
    }

    Parameters to this class control listing expiry or indeed turn
    caching off
    """

    def __init__(
        self,
        use_listings_cache=True,
        listings_expiry_time=None,
        max_paths=None,
        **kwargs
    ):
        """

        Parameters
        ----------
        use_listings_cache: bool
            If False, this cache never returns items, but always reports KeyError,
            and setting items has no effect
        listings_expiry_time: int (optional)
            Time in seconds that a listing is considered valid. If None,
            listings do not expire.
        max_paths: int (optional)
            The number of most recent listings that are considered valid; 'recent'
            refers to when the entry was set.
        **kwargs:
            Accepted and ignored, so that callers can pass a larger options
            dict through unchanged.
        """
        # path -> listing
        self._cache = {}
        # path -> time.time() at which the listing was stored; only written
        # when listings_expiry_time is set
        self._times = {}
        if max_paths:
            # The LRU tracks which keys are still "recent". Calling self._q(key)
            # for a key the LRU no longer remembers runs the lambda, which drops
            # any stored listing for that key; for a remembered key the call is
            # a cache hit and nothing is removed. Eviction is therefore lazy:
            # a stale listing is only discarded when its key is next touched.
            self._q = lru_cache(max_paths + 1)(lambda key: self._cache.pop(key, None))
        self.use_listings_cache = use_listings_cache
        self.listings_expiry_time = listings_expiry_time
        self.max_paths = max_paths

    def __getitem__(self, item):
        """Return the listing stored for ``item``.

        Raises
        ------
        KeyError
            If there is no listing for ``item``, or it has expired or been
            dropped by the ``max_paths`` limit.
        """
        if self.listings_expiry_time:
            if self._times.get(item, 0) - time.time() < -self.listings_expiry_time:
                # expired (or never timestamped): forget the timestamp too, so
                # that _times does not keep growing with dead keys
                self._times.pop(item, None)
                del self._cache[item]  # raises KeyError if nothing is stored
        if self.max_paths:
            self._q(item)
        return self._cache[item]  # maybe raises KeyError

    def clear(self):
        """Remove all stored listings and their timestamps."""
        self._cache.clear()
        self._times.clear()

    def __len__(self):
        """Number of stored listings, including any not yet lazily discarded."""
        return len(self._cache)

    def __contains__(self, item):
        """True if a valid (non-expired, non-evicted) listing exists for ``item``."""
        try:
            self[item]
            return True
        except KeyError:
            return False

    def __setitem__(self, key, value):
        """Store ``value`` as the listing for ``key``; no-op if caching is off."""
        if not self.use_listings_cache:
            return
        if self.max_paths:
            # register the key as recently used before storing the value
            self._q(key)
        self._cache[key] = value
        if self.listings_expiry_time:
            self._times[key] = time.time()

    def __delitem__(self, key):
        """Delete the listing for ``key``; raises KeyError if it is not stored."""
        del self._cache[key]
        self._times.pop(key, None)

    def __iter__(self):
        """Iterate over the keys that currently hold a valid listing."""
        # ``k in self`` may delete expired/evicted keys from self._cache, so
        # iterate over a snapshot; iterating the dict directly would raise
        # "RuntimeError: dictionary changed size during iteration".
        entries = list(self._cache)
        return (k for k in entries if k in self)

    def __reduce__(self):
        """Pickle support: recreate an empty cache with the same settings."""
        return (
            DirCache,
            (self.use_listings_cache, self.listings_expiry_time, self.max_paths),
        )

fsspec/implementations/ftp.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ class FTPFileSystem(AbstractFileSystem):
99

1010
root_marker = "/"
1111
cachable = False
12+
protocol = "ftp"
1213

1314
def __init__(
1415
self,
@@ -74,12 +75,6 @@ def _get_kwargs_from_urls(urlpath):
7475
out.pop("protocol", None)
7576
return out
7677

77-
def invalidate_cache(self, path=None):
78-
if path is not None:
79-
self.dircache.pop(path, None)
80-
else:
81-
self.dircache.clear()
82-
8378
def ls(self, path, detail=True):
8479
path = self._strip_protocol(path)
8580
out = []

fsspec/implementations/local.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ class LocalFileSystem(AbstractFileSystem):
2121
"""
2222

2323
root_marker = "/"
24+
protocol = "file"
2425

2526
def __init__(self, auto_mkdir=None, **kwargs):
2627
super().__init__(**kwargs)

fsspec/implementations/sftp.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,10 @@ class SFTPFileSystem(AbstractFileSystem):
1010
"""Files over SFTP/SSH
1111
1212
Peer-to-peer filesystem over SSH using paramiko.
13+
14+
Note: if using this with the ``open`` or ``open_files``, with full URLs,
15+
there is no way to tell if a path is relative, so all paths are assumed
16+
to be absolute.
1317
"""
1418

1519
protocol = "sftp", "ssh"

fsspec/spec.py

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
from .transaction import Transaction
88
from .utils import read_block, tokenize, stringify_path
9+
from .dircache import DirCache
910

1011
logger = logging.getLogger("fsspec")
1112

@@ -42,11 +43,12 @@ def __init__(cls, *args, **kwargs):
4243
cls._cache = {}
4344

4445
def __call__(cls, *args, **kwargs):
46+
skip = kwargs.get("skip_instance_cache", False)
4547
extra_tokens = tuple(
4648
getattr(cls, attr, None) for attr in cls._extra_tokenize_attributes
4749
)
4850
token = tokenize(cls, *args, *extra_tokens, **kwargs)
49-
if cls.cachable and token in cls._cache:
51+
if not skip and cls.cachable and token in cls._cache:
5052
return cls._cache[token]
5153
else:
5254
obj = super().__call__(*args, **kwargs)
@@ -55,7 +57,7 @@ def __call__(cls, *args, **kwargs):
5557
obj.storage_args = args
5658
obj.storage_options = kwargs
5759

58-
if cls.cachable:
60+
if cls.cachable and not skip:
5961
cls._cache[token] = obj
6062
return obj
6163

@@ -97,17 +99,24 @@ def __init__(self, *args, **storage_options):
9799
98100
Subclasses should call this method.
99101
100-
Magic kwargs that affect functionality here:
101-
add_docs: if True, will append docstrings from this spec to the
102-
specific implementation
102+
Parameters
103+
----------
104+
use_listings_cache, listings_expiry_time, max_paths:
105+
passed to ``DirCache``, if the implementation supports
106+
directory listing caching. Pass use_listings_cache=False
107+
to disable such caching.
108+
skip_instance_cache: bool
109+
If this is a cachable implementation, pass True here to force
110+
creating a new instance even if a matching instance exists, and prevent
111+
storing this instance.
103112
"""
104113
if self._cached:
105114
# reusing instance, don't change
106115
return
107116
self._cached = True
108117
self._intrans = False
109118
self._transaction = None
110-
self.dircache = {}
119+
self.dircache = DirCache(**storage_options)
111120

112121
if storage_options.pop("add_docs", None):
113122
warnings.warn("add_docs is no longer supported.", FutureWarning)
@@ -290,14 +299,18 @@ def _ls_from_cache(self, path):
290299
but contains nothing), None if not in cache.
291300
"""
292301
parent = self._parent(path)
293-
if path in self.dircache:
302+
try:
294303
return self.dircache[path]
295-
elif parent in self.dircache:
304+
except KeyError:
305+
pass
306+
try:
296307
files = [f for f in self.dircache[parent] if f["name"] == path]
297308
if len(files) == 0:
298309
# parent dir was listed but did not contain this file
299310
raise FileNotFoundError(path)
300311
return files
312+
except KeyError:
313+
pass
301314

302315
def walk(self, path, maxdepth=None, **kwargs):
303316
""" Return all files belows path

fsspec/tests/test_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from fsspec.utils import infer_storage_options, seek_delimiter, read_block
55

66

7-
WIN = "win" in sys.platform
7+
WIN = sys.platform.startswith("win")
88

99

1010
def test_read_block():

0 commit comments

Comments
 (0)