Commit 371d034

pwolfram authored and shoyer committed
Fixes open_mfdataset too many open file error (#1198)
Includes testing to demonstrate an OSError associated with opening too many files, as encountered when using open_mfdataset. Fixed for the following backends:

* netCDF4 backend
* scipy backend
* pynio backend

Following correspondence with @shoyer, open/close operations on h5netcdf appear to trigger an error inside the h5netcdf library itself. There are thus still challenges with h5netcdf, so autoclose support for the h5netcdf backend is currently disabled. Note that by default `autoclose=False` for open_mfdataset, so standard behavior is unchanged unless `autoclose=True` is passed. This default favors standard xarray performance over blanket removal of the OSError associated with opening too many files when using open_mfdataset.
1 parent b3fc6c4 commit 371d034
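
To make the new behavior concrete, here is a minimal usage sketch (not part of the commit; the glob pattern 'files/*.nc' is hypothetical):

import xarray as xr

# Default behavior is unchanged: files are kept open for best performance.
ds = xr.open_mfdataset('files/*.nc')

# With autoclose=True, each file is closed when not in use, trading some
# speed for protection against "OSError: too many open files".
ds = xr.open_mfdataset('files/*.nc', autoclose=True)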

File tree

11 files changed: +623 −205 lines changed


doc/whats-new.rst

Lines changed: 6 additions & 0 deletions
@@ -35,6 +35,12 @@ Added new method :py:meth:`~Dataset.assign_attrs` to ``DataArray`` and
   ``dict.update`` method on attrs (:issue:`1281`).
   By `Henry S. Harrison <https://hsharrison.github.io>`_.

+- It is now possible to set the ``autoclose=True`` argument to
+  :py:func:`~xarray.open_mfdataset` to explicitly close opened files when not
+  in use to prevent occurrence of an OS Error related to too many open files.
+  Note, the default is ``autoclose=False``, which is consistent with previous
+  xarray behavior. By `Phillip J. Wolfram <https://github.com/pwolfram>`_.
+
 Bug fixes
 ~~~~~~~~~
 - ``rolling`` now keeps its original dimension order (:issue:`1125`).
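
The OS Error this entry refers to is the per-process open-file limit. A hedged sketch (not part of the commit) for inspecting that limit with the Python standard library on POSIX systems:

import resource

# Query the soft and hard limits on open file descriptors; opening more
# than `soft` files at once raises "OSError: [Errno 24] Too many open files".
soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
print('open-file limit: soft=%d, hard=%d' % (soft, hard))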

xarray/backends/api.py

Lines changed: 38 additions & 28 deletions
@@ -1,7 +1,6 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-import gzip
 import os.path
 from distutils.version import LooseVersion
 from glob import glob
@@ -133,7 +132,7 @@ def _protect_dataset_variables_inplace(dataset, cache):


 def open_dataset(filename_or_obj, group=None, decode_cf=True,
-                 mask_and_scale=True, decode_times=True,
+                 mask_and_scale=True, decode_times=True, autoclose=False,
                  concat_characters=True, decode_coords=True, engine=None,
                  chunks=None, lock=None, cache=None, drop_variables=None):
     """Load and decode a dataset from a file or file-like object.
@@ -163,6 +162,10 @@ def open_dataset(filename_or_obj, group=None, decode_cf=True,
     decode_times : bool, optional
         If True, decode times encoded in the standard NetCDF datetime format
        into datetime objects. Otherwise, leave them encoded as numbers.
+    autoclose : bool, optional
+        If True, automatically close files to avoid OS Error of too many files
+        being open. However, this option doesn't work with streams, e.g.,
+        BytesIO.
     concat_characters : bool, optional
         If True, concatenate along the last dimension of character arrays to
         form string arrays. Dimensions will only be concatenated over (and
@@ -251,6 +254,12 @@ def maybe_decode_store(store, lock=False):
         else:
             ds2 = ds

+        # protect so that dataset store isn't necessarily closed, e.g.,
+        # streams like BytesIO can't be reopened
+        # datastore backend is responsible for determining this capability
+        if store._autoclose:
+            store.close()
+
         return ds2

     if isinstance(filename_or_obj, backends.AbstractDataStore):
@@ -271,33 +280,30 @@ def maybe_decode_store(store, lock=False):
         if engine is not None and engine != 'scipy':
             raise ValueError('can only read gzipped netCDF files with '
                              "default engine or engine='scipy'")
-        # if the string ends with .gz, then gunzip and open as netcdf file
-        try:
-            store = backends.ScipyDataStore(gzip.open(filename_or_obj))
-        except TypeError as e:
-            # TODO: gzipped loading only works with NetCDF3 files.
-            if 'is not a valid NetCDF 3 file' in e.message:
-                raise ValueError('gzipped file loading only supports '
-                                 'NetCDF 3 files.')
-            else:
-                raise
-    else:
-        if engine is None:
-            engine = _get_default_engine(filename_or_obj,
-                                         allow_remote=True)
-        if engine == 'netcdf4':
-            store = backends.NetCDF4DataStore(filename_or_obj, group=group)
-        elif engine == 'scipy':
-            store = backends.ScipyDataStore(filename_or_obj)
-        elif engine == 'pydap':
-            store = backends.PydapDataStore(filename_or_obj)
-        elif engine == 'h5netcdf':
-            store = backends.H5NetCDFStore(filename_or_obj, group=group)
-        elif engine == 'pynio':
-            store = backends.NioDataStore(filename_or_obj)
         else:
-            raise ValueError('unrecognized engine for open_dataset: %r'
-                             % engine)
+            engine = 'scipy'
+
+    if engine is None:
+        engine = _get_default_engine(filename_or_obj,
+                                     allow_remote=True)
+    if engine == 'netcdf4':
+        store = backends.NetCDF4DataStore(filename_or_obj, group=group,
+                                          autoclose=autoclose)
+    elif engine == 'scipy':
+        store = backends.ScipyDataStore(filename_or_obj,
+                                        autoclose=autoclose)
+    elif engine == 'pydap':
+        store = backends.PydapDataStore(filename_or_obj)
+    elif engine == 'h5netcdf':
+        store = backends.H5NetCDFStore(filename_or_obj, group=group,
+                                       autoclose=autoclose)
+    elif engine == 'pynio':
+        store = backends.NioDataStore(filename_or_obj,
+                                      autoclose=autoclose)
+    else:
+        raise ValueError('unrecognized engine for open_dataset: %r'
+                         % engine)

     if lock is None:
         lock = _default_lock(filename_or_obj, engine)
     with close_on_error(store):
@@ -479,6 +485,10 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
         Engine to use when reading files. If not provided, the default engine
         is chosen based on available dependencies, with a preference for
         'netcdf4'.
+    autoclose : bool, optional
+        If True, automatically close files to avoid OS Error of too many files
+        being open. However, this option doesn't work with streams, e.g.,
+        BytesIO.
     lock : False, True or threading.Lock, optional
         This argument is passed on to :py:func:`dask.array.from_array`. By
         default, a per-variable lock is used when reading data from netCDF
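
As the diff above shows, open_dataset now threads autoclose through to every backend constructor that supports it. A minimal sketch of calling the updated API (the path 'example.nc' is hypothetical):

import xarray as xr

# Each lazy read reopens the file and, with autoclose=True, closes it again
# afterwards, so no handle stays open between accesses.
ds = xr.open_dataset('example.nc', engine='netcdf4', autoclose=True)

# Note the caveat from the docstring: file-like objects such as io.BytesIO
# cannot be reopened once closed, so autoclose does not work with streams.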

xarray/backends/common.py

Lines changed: 37 additions & 2 deletions
@@ -5,6 +5,7 @@
 import logging
 import time
 import traceback
+import contextlib
 from collections import Mapping
 from distutils.version import LooseVersion

@@ -41,6 +42,15 @@ def _decode_variable_name(name):
     return name


+def find_root(ds):
+    """
+    Helper function to find the root of a netcdf or h5netcdf dataset.
+    """
+    while ds.parent is not None:
+        ds = ds.parent
+    return ds
+
+
 def robust_getitem(array, key, catch=Exception, max_retries=6,
                    initial_delay=500):
     """
@@ -67,6 +77,7 @@ def robust_getitem(array, key, catch=Exception, max_retries=6,


 class AbstractDataStore(Mapping):
+    _autoclose = False

     def __iter__(self):
         return iter(self.variables)
@@ -107,8 +118,8 @@ def load(self):
         This function will be called anytime variables or attributes
         are requested, so care should be taken to make sure its fast.
         """
-        variables = FrozenOrderedDict((_decode_variable_name(k), v) for k, v in
-                                      iteritems(self.get_variables()))
+        variables = FrozenOrderedDict((_decode_variable_name(k), v)
+                                      for k, v in self.get_variables().items())
         attributes = FrozenOrderedDict(self.get_attrs())
         return variables, attributes

@@ -252,3 +263,27 @@ def __getstate__(self):
     def __setstate__(self, state):
         self.__dict__.update(state)
         self.ds = self._opener(mode=self._mode)
+
+    @contextlib.contextmanager
+    def ensure_open(self, autoclose):
+        """
+        Helper function to make sure datasets are closed and opened
+        at appropriate times to avoid too many open file errors.
+
+        Use requires `autoclose=True` argument to `open_mfdataset`.
+        """
+        if self._autoclose and not self._isopen:
+            try:
+                self.ds = self._opener()
+                self._isopen = True
+                yield
+            finally:
+                if autoclose:
+                    self.close()
+        else:
+            yield
+
+    def assert_open(self):
+        if not self._isopen:
+            raise AssertionError('internal failure: file must be open '
+                                 'if `autoclose=True` is used.')
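
The ensure_open context manager added above is the heart of the fix: a store opened with autoclose reopens its file on demand and closes it once the guarded operation finishes. A self-contained sketch of the same pattern, using a hypothetical FileStore class:

import contextlib

class FileStore(object):
    """Hypothetical store illustrating the ensure_open pattern."""

    def __init__(self, opener, autoclose=False):
        self._opener = opener          # callable that (re)opens the file
        self._autoclose = autoclose
        self.ds = opener()
        self._isopen = True

    def close(self):
        self.ds.close()
        self._isopen = False

    @contextlib.contextmanager
    def ensure_open(self, autoclose):
        # Reopen only when running in autoclose mode and currently closed;
        # otherwise the already-open handle is used untouched.
        if self._autoclose and not self._isopen:
            try:
                self.ds = self._opener()
                self._isopen = True
                yield
            finally:
                if autoclose:
                    self.close()
        else:
            yield

A caller then wraps every access, e.g. `with store.ensure_open(autoclose=True): data = store.ds[...]`, so the file is open exactly as long as the read requires.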

xarray/backends/h5netcdf_.py

Lines changed: 50 additions & 28 deletions
@@ -6,14 +6,20 @@

 from .. import Variable
 from ..core import indexing
-from ..core.utils import FrozenOrderedDict, close_on_error, Frozen
+from ..core.utils import FrozenOrderedDict, close_on_error
 from ..core.pycompat import iteritems, bytes_type, unicode_type, OrderedDict

-from .common import WritableCFDataStore, DataStorePickleMixin
+from .common import WritableCFDataStore, DataStorePickleMixin, find_root
 from .netCDF4_ import (_nc4_group, _nc4_values_and_dtype,
                        _extract_nc4_variable_encoding, BaseNetCDF4Array)


+class H5NetCDFArrayWrapper(BaseNetCDF4Array):
+    def __getitem__(self, key):
+        with self.datastore.ensure_open(autoclose=True):
+            return self.get_array()[key]
+
+
 def maybe_decode_bytes(txt):
     if isinstance(txt, bytes_type):
         return txt.decode('utf-8')
@@ -49,49 +55,63 @@ class H5NetCDFStore(WritableCFDataStore, DataStorePickleMixin):
     """Store for reading and writing data via h5netcdf
     """
     def __init__(self, filename, mode='r', format=None, group=None,
-                 writer=None):
+                 writer=None, autoclose=False):
         if format not in [None, 'NETCDF4']:
             raise ValueError('invalid format for h5netcdf backend')
         opener = functools.partial(_open_h5netcdf_group, filename, mode=mode,
                                    group=group)
         self.ds = opener()
+        if autoclose:
+            raise NotImplemented('autoclose=True is not implemented '
+                                 'for the h5netcdf backend pending further '
+                                 'exploration, e.g., bug fixes (in h5netcdf?)')
+        self._autoclose = False
+        self._isopen = True
         self.format = format
         self._opener = opener
         self._filename = filename
         self._mode = mode
         super(H5NetCDFStore, self).__init__(writer)

     def open_store_variable(self, name, var):
-        dimensions = var.dimensions
-        data = indexing.LazilyIndexedArray(BaseNetCDF4Array(name, self))
-        attrs = _read_attributes(var)
-
-        # netCDF4 specific encoding
-        encoding = dict(var.filters())
-        chunking = var.chunking()
-        encoding['chunksizes'] = chunking if chunking != 'contiguous' else None
-
-        # save source so __repr__ can detect if it's local or not
-        encoding['source'] = self._filename
-        encoding['original_shape'] = var.shape
+        with self.ensure_open(autoclose=False):
+            dimensions = var.dimensions
+            data = indexing.LazilyIndexedArray(
+                H5NetCDFArrayWrapper(name, self))
+            attrs = _read_attributes(var)
+
+            # netCDF4 specific encoding
+            encoding = dict(var.filters())
+            chunking = var.chunking()
+            encoding['chunksizes'] = chunking \
+                if chunking != 'contiguous' else None
+
+            # save source so __repr__ can detect if it's local or not
+            encoding['source'] = self._filename
+            encoding['original_shape'] = var.shape

         return Variable(dimensions, data, attrs, encoding)

     def get_variables(self):
-        return FrozenOrderedDict((k, self.open_store_variable(k, v))
-                                 for k, v in iteritems(self.ds.variables))
+        with self.ensure_open(autoclose=False):
+            return FrozenOrderedDict((k, self.open_store_variable(k, v))
+                                     for k, v in iteritems(self.ds.variables))

     def get_attrs(self):
-        return Frozen(_read_attributes(self.ds))
+        with self.ensure_open(autoclose=True):
+            return FrozenOrderedDict(_read_attributes(self.ds))

     def get_dimensions(self):
-        return self.ds.dimensions
+        with self.ensure_open(autoclose=True):
+            return self.ds.dimensions

     def set_dimension(self, name, length):
-        self.ds.createDimension(name, size=length)
+        with self.ensure_open(autoclose=False):
+            self.ds.createDimension(name, size=length)

     def set_attribute(self, key, value):
-        self.ds.setncattr(key, value)
+        with self.ensure_open(autoclose=False):
+            self.ds.setncattr(key, value)

     def prepare_variable(self, name, variable, check_encoding=False,
                          unlimited_dims=None):
@@ -129,12 +149,14 @@ def prepare_variable(self, name, variable, check_encoding=False,
         return nc4_var, variable.data

     def sync(self):
-        super(H5NetCDFStore, self).sync()
-        self.ds.sync()
+        with self.ensure_open(autoclose=True):
+            super(H5NetCDFStore, self).sync()
+            self.ds.sync()

     def close(self):
-        ds = self.ds
-        # netCDF4 only allows closing the root group
-        while ds.parent is not None:
-            ds = ds.parent
-        ds.close()
+        if self._isopen:
+            # netCDF4 only allows closing the root group
+            ds = find_root(self.ds)
+            if not ds._closed:
+                ds.close()
+            self._isopen = False
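
The new H5NetCDFArrayWrapper shows how autoclose cooperates with lazy indexing: each indexing request reopens the file just long enough to read the requested slice. A minimal sketch of the same wrapper idea, with hypothetical names:

class LazyArrayWrapper(object):
    """Hypothetical wrapper: reopen the store around every read."""

    def __init__(self, name, datastore):
        self.name = name
        self.datastore = datastore

    def __getitem__(self, key):
        # ensure_open reopens the file if needed and, with autoclose=True,
        # closes it again once the slice has been materialized.
        with self.datastore.ensure_open(autoclose=True):
            return self.datastore.ds.variables[self.name][key]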
