1
+ import contextlib
1
2
import threading
3
+ import warnings
2
4
3
5
from ..core import utils
4
6
from ..core .options import OPTIONS
7
+ from .locks import acquire
5
8
from .lru_cache import LRUCache
6
9
7
10
11
14
assert FILE_CACHE .maxsize , 'file cache must be at least size one'
12
15
13
16
17
+ REF_COUNTS = {}
18
+
14
19
_DEFAULT_MODE = utils .ReprObject ('<unused>' )
15
20
16
21
@@ -22,7 +27,7 @@ class FileManager(object):
22
27
many open files and transferring them between multiple processes.
23
28
"""
24
29
25
- def acquire (self ):
30
+ def acquire (self , needs_lock = True ):
26
31
"""Acquire the file object from this manager."""
27
32
raise NotImplementedError
28
33
@@ -62,6 +67,9 @@ class CachingFileManager(FileManager):
62
67
def __init__ (self , opener , * args , ** keywords ):
63
68
"""Initialize a FileManager.
64
69
70
+ The cache and ref_counts arguments exist solely to facilitate
71
+ dependency injection, and should only be set for tests.
72
+
65
73
Parameters
66
74
----------
67
75
opener : callable
@@ -90,13 +98,17 @@ def __init__(self, opener, *args, **keywords):
90
98
global variable and contains non-picklable file objects, an
91
99
unpickled FileManager objects will be restored with the default
92
100
cache.
101
+ ref_counts : dict, optional
102
Optional dict to use for keeping track of the number of references to
103
+ the same file.
93
104
"""
94
105
# TODO: replace with real keyword arguments when we drop Python 2
95
106
# support
96
107
mode = keywords .pop ('mode' , _DEFAULT_MODE )
97
108
kwargs = keywords .pop ('kwargs' , None )
98
109
lock = keywords .pop ('lock' , None )
99
110
cache = keywords .pop ('cache' , FILE_CACHE )
111
+ ref_counts = keywords .pop ('ref_counts' , REF_COUNTS )
100
112
if keywords :
101
113
raise TypeError ('FileManager() got unexpected keyword arguments: '
102
114
'%s' % list (keywords ))
@@ -105,34 +117,52 @@ def __init__(self, opener, *args, **keywords):
105
117
self ._args = args
106
118
self ._mode = mode
107
119
self ._kwargs = {} if kwargs is None else dict (kwargs )
120
+
108
121
self ._default_lock = lock is None or lock is False
109
122
self ._lock = threading .Lock () if self ._default_lock else lock
123
+
124
+ # cache[self._key] stores the file associated with this object.
110
125
self ._cache = cache
111
126
self ._key = self ._make_key ()
112
127
128
+ # ref_counts[self._key] stores the number of CachingFileManager objects
129
+ # in memory referencing this same file. We use this to know if we can
130
+ # close a file when the manager is deallocated.
131
+ self ._ref_counter = _RefCounter (ref_counts )
132
+ self ._ref_counter .increment (self ._key )
133
+
113
134
def _make_key(self):
    """Build the key under which this manager's file is stored in the cache.

    The key combines the opener, its positional arguments, the mode and the
    sorted keyword arguments, wrapped in a ``_HashedSequence``.
    """
    # Mode 'w' is keyed as 'a', so both modes map to the same cache entry.
    key_mode = self._mode
    if key_mode == 'w':
        key_mode = 'a'
    sorted_kwargs = tuple(sorted(self._kwargs.items()))
    return _HashedSequence(
        (self._opener, self._args, key_mode, sorted_kwargs))
120
141
121
- def acquire (self ):
142
+ @contextlib .contextmanager
143
+ def _optional_lock (self , needs_lock ):
144
+ """Context manager for optionally acquiring a lock."""
145
+ if needs_lock :
146
+ with self ._lock :
147
+ yield
148
+ else :
149
+ yield
150
+
151
+ def acquire (self , needs_lock = True ):
122
152
"""Acquiring a file object from the manager.
123
153
124
154
A new file is only opened if it has expired from the
125
155
least-recently-used cache.
126
156
127
- This method uses a reentrant lock, which ensures that it is
128
- thread-safe. You can safely acquire a file in multiple threads at the
129
- same time, as long as the underlying file object is thread-safe.
157
+ This method uses a lock, which ensures that it is thread-safe. You can
158
+ safely acquire a file in multiple threads at the same time, as long as
159
+ the underlying file object is thread-safe.
130
160
131
161
Returns
132
162
-------
133
163
An open file object, as returned by ``opener(*args, **kwargs)``.
134
164
"""
135
- with self ._lock :
165
+ with self ._optional_lock ( needs_lock ) :
136
166
try :
137
167
file = self ._cache [self ._key ]
138
168
except KeyError :
@@ -144,28 +174,53 @@ def acquire(self):
144
174
if self ._mode == 'w' :
145
175
# ensure file doesn't get overridden when opened again
146
176
self ._mode = 'a'
147
- self ._key = self ._make_key ()
148
177
self ._cache [self ._key ] = file
149
178
return file
150
179
151
- def _close (self ):
152
- default = None
153
- file = self ._cache .pop (self ._key , default )
154
- if file is not None :
155
- file .close ()
156
-
157
180
def close (self , needs_lock = True ):
158
181
"""Explicitly close any associated file object (if necessary)."""
159
182
# TODO: remove needs_lock if/when we have a reentrant lock in
160
183
# dask.distributed: https://github.com/dask/dask/issues/3832
161
- if needs_lock :
162
- with self ._lock :
163
- self ._close ()
164
- else :
165
- self ._close ()
184
+ with self ._optional_lock (needs_lock ):
185
+ default = None
186
+ file = self ._cache .pop (self ._key , default )
187
+ if file is not None :
188
+ file .close ()
189
+
190
+ def __del__ (self ):
191
+ # If we're the only CachingFileManger referencing a unclosed file, we
192
+ # should remove it from the cache upon garbage collection.
193
+ #
194
+ # Keeping our own count of file references might seem like overkill,
195
+ # but it's actually pretty common to reopen files with the same
196
+ # variable name in a notebook or command line environment, e.g., to
197
+ # fix the parameters used when opening a file:
198
+ # >>> ds = xarray.open_dataset('myfile.nc')
199
+ # >>> ds = xarray.open_dataset('myfile.nc', decode_times=False)
200
+ # This second assignment to "ds" drops CPython's ref-count on the first
201
+ # "ds" argument to zero, which can trigger garbage collections. So if
202
+ # we didn't check whether another object is referencing 'myfile.nc',
203
+ # the newly opened file would actually be immediately closed!
204
+ ref_count = self ._ref_counter .decrement (self ._key )
205
+
206
+ if not ref_count and self ._key in self ._cache :
207
+ if acquire (self ._lock , blocking = False ):
208
+ # Only close files if we can do so immediately.
209
+ try :
210
+ self .close (needs_lock = False )
211
+ finally :
212
+ self ._lock .release ()
213
+
214
+ if OPTIONS ['warn_for_unclosed_files' ]:
215
+ warnings .warn (
216
+ 'deallocating {}, but file is not already closed. '
217
+ 'This may indicate a bug.'
218
+ .format (self ), RuntimeWarning , stacklevel = 2 )
166
219
167
220
def __getstate__ (self ):
168
221
"""State for pickling."""
222
+ # cache and ref_counts are intentionally omitted: we don't want to try
223
+ # to serialize these global objects.
169
224
lock = None if self ._default_lock else self ._lock
170
225
return (self ._opener , self ._args , self ._mode , self ._kwargs , lock )
171
226
@@ -174,6 +229,34 @@ def __setstate__(self, state):
174
229
opener , args , mode , kwargs , lock = state
175
230
self .__init__ (opener , * args , mode = mode , kwargs = kwargs , lock = lock )
176
231
232
def __repr__(self):
    """Return a constructor-like description of this manager.

    The result has the form ``ClassName(opener, *args, mode=..., kwargs=...)``.
    ``mode`` is shown only when it is not the ``_DEFAULT_MODE`` sentinel.
    The pieces are joined individually, so a manager created without
    positional arguments does not produce an empty slot (``opener, , ...``).
    """
    parts = [repr(self._opener)]
    parts.extend(repr(arg) for arg in self._args)
    if self._mode is not _DEFAULT_MODE:
        parts.append('mode={!r}'.format(self._mode))
    parts.append('kwargs={}'.format(self._kwargs))
    return '{}({})'.format(type(self).__name__, ', '.join(parts))
238
+
239
+
240
class _RefCounter(object):
    """Track per-name reference counts in a caller-supplied mapping.

    NOTE(review): the lock is created per instance, so it only serializes
    calls made through this particular counter object. Separate counters
    wrapping the same mapping are not synchronized with each other by it.
    """

    def __init__(self, counts):
        # ``counts`` is used in place (not copied): name -> current count.
        self._counts = counts
        self._lock = threading.Lock()

    def increment(self, name):
        """Add one reference for ``name`` and return the new count."""
        with self._lock:
            updated = self._counts.get(name, 0) + 1
            self._counts[name] = updated
        return updated

    def decrement(self, name):
        """Drop one reference for ``name`` and return the remaining count.

        The entry is deleted once the count reaches zero. Raises ``KeyError``
        if ``name`` has no entry.
        """
        with self._lock:
            remaining = self._counts[name] - 1
            if not remaining:
                del self._counts[name]
            else:
                self._counts[name] = remaining
        return remaining
259
+
177
260
178
261
class _HashedSequence (list ):
179
262
"""Speedup repeated look-ups by caching hash values.
@@ -198,7 +281,8 @@ class DummyFileManager(FileManager):
198
281
def __init__ (self , value ):
199
282
self ._value = value
200
283
201
- def acquire (self ):
284
+ def acquire (self , needs_lock = True ):
285
+ del needs_lock # ignored
202
286
return self ._value
203
287
204
288
def close (self , needs_lock = True ):
0 commit comments