
Commit 7776f72

scottyhq authored and pletchm committed
enable loading remote hdf5 files (pydata#2782)
* attempt at loading remote hdf5
* added a couple tests
* rewind bytes after reading header
* addressed comments for tests and error message
* fixed pep8 formatting
* created _get_engine_from_magic_number function, new tests
* added description in whats-new
* fixed test failure on windows
* same error on windows and nix
1 parent 48b7ace commit 7776f72
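For orientation, this is roughly how the new capability is used. A minimal sketch, not part of the diff: it assumes xarray at this commit together with h5netcdf>0.7 and h5py>2.9.0; the file name, bucket, and key are hypothetical, and the remote example additionally assumes the third-party s3fs package.

    import xarray as xr

    # A local netCDF4/HDF5 file passed as a file-like object instead of a path:
    with open('example.nc', 'rb') as f:                    # hypothetical file
        ds = xr.open_dataset(f, engine='h5netcdf')

    # The motivating case from the commit title: remote HDF5 read through
    # s3fs, which hands open_dataset a file-like object over S3.
    import s3fs
    fs = s3fs.S3FileSystem(anon=True)
    with fs.open('some-bucket/data.nc', 'rb') as f:        # hypothetical key
        ds = xr.open_dataset(f, engine='h5netcdf')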

File tree

4 files changed, +110 -28 lines:

  doc/whats-new.rst
  xarray/backends/api.py
  xarray/tests/__init__.py
  xarray/tests/test_backends.py

doc/whats-new.rst

Lines changed: 4 additions & 1 deletion
@@ -33,7 +33,9 @@ Breaking changes
 
 Enhancements
 ~~~~~~~~~~~~
-
+- Added ability to open netcdf4/hdf5 file-like objects with ``open_dataset``.
+  Requires (h5netcdf>0.7 and h5py>2.9.0). (:issue:`2781`)
+  By `Scott Henderson <https://github.com/scottyhq>`_
 - Internal plotting now supports ``cftime.datetime`` objects as time series.
   (:issue:`2164`)
   By `Julius Busecke <https://github.com/jbusecke>`_ and
@@ -86,6 +88,7 @@ Enhancements
 - Allow ``expand_dims`` method to support inserting/broadcasting dimensions
   with size > 1. (:issue:`2710`)
   By `Martin Pletcher <https://github.com/pletchm>`_.
+  `Spencer Clark <https://github.com/spencerkclark>`_.
 
 - Added :py:meth:`~xarray.Dataset.drop_dims` (:issue:`1949`).
   By `Kevin Squire <https://github.com/kmsquire>`_.
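The requirement noted above (h5netcdf>0.7 and h5py>2.9.0) can be verified at runtime. A small sketch, using the same LooseVersion approach as the test helpers touched later in this commit:

    from distutils.version import LooseVersion
    import h5py
    import h5netcdf

    # File-like object support needs both minimum versions.
    assert LooseVersion(h5py.__version__) > LooseVersion('2.9.0')
    assert LooseVersion(h5netcdf.__version__) > LooseVersion('0.7')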

xarray/backends/api.py

Lines changed: 52 additions & 25 deletions
@@ -75,6 +75,34 @@ def _get_default_engine_netcdf():
     return engine
 
 
+def _get_engine_from_magic_number(filename_or_obj):
+    # check byte header to determine file type
+    if isinstance(filename_or_obj, bytes):
+        magic_number = filename_or_obj[:8]
+    else:
+        if filename_or_obj.tell() != 0:
+            raise ValueError("file-like object read/write pointer not at zero "
+                             "please close and reopen, or use a context "
+                             "manager")
+        magic_number = filename_or_obj.read(8)
+        filename_or_obj.seek(0)
+
+    if magic_number.startswith(b'CDF'):
+        engine = 'scipy'
+    elif magic_number.startswith(b'\211HDF\r\n\032\n'):
+        engine = 'h5netcdf'
+        if isinstance(filename_or_obj, bytes):
+            raise ValueError("can't open netCDF4/HDF5 as bytes "
+                             "try passing a path or file-like object")
+    else:
+        if isinstance(filename_or_obj, bytes) and len(filename_or_obj) > 80:
+            filename_or_obj = filename_or_obj[:80] + b'...'
+        raise ValueError('{} is not a valid netCDF file '
+                         'did you mean to pass a string for a path instead?'
+                         .format(filename_or_obj))
+    return engine
+
+
 def _get_default_engine(path, allow_remote=False):
     if allow_remote and is_remote_uri(path):
         engine = _get_default_engine_remote_uri()
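To make the new helper concrete: it chooses between the scipy and h5netcdf backends purely from the file's first eight bytes. A standalone sketch of that sniffing logic, with the pointer-rewind and error-message details stripped; sniff_engine is an illustrative name, not xarray API:

    import io

    def sniff_engine(obj):
        # First 8 bytes from raw bytes or a file-like object.
        magic = obj[:8] if isinstance(obj, bytes) else obj.read(8)
        if magic.startswith(b'CDF'):
            return 'scipy'       # classic netCDF3 header
        if magic.startswith(b'\211HDF\r\n\032\n'):
            return 'h5netcdf'    # netCDF4/HDF5 signature
        raise ValueError('not a valid netCDF file')

    assert sniff_engine(b'CDF\x01...') == 'scipy'
    assert sniff_engine(io.BytesIO(b'\211HDF\r\n\032\n')) == 'h5netcdf'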
@@ -170,8 +198,8 @@ def open_dataset(filename_or_obj, group=None, decode_cf=True,
         Strings and Path objects are interpreted as a path to a netCDF file
         or an OpenDAP URL and opened with python-netCDF4, unless the filename
         ends with .gz, in which case the file is gunzipped and opened with
-        scipy.io.netcdf (only netCDF3 supported). File-like objects are opened
-        with scipy.io.netcdf (only netCDF3 supported).
+        scipy.io.netcdf (only netCDF3 supported). Byte-strings or file-like
+        objects are opened by scipy.io.netcdf (netCDF3) or h5py (netCDF4/HDF).
     group : str, optional
         Path to the netCDF4 group in the given file to open (only works for
         netCDF4 files).
@@ -258,6 +286,13 @@ def open_dataset(filename_or_obj, group=None, decode_cf=True,
     --------
     open_mfdataset
     """
+    engines = [None, 'netcdf4', 'scipy', 'pydap', 'h5netcdf', 'pynio',
+               'cfgrib', 'pseudonetcdf']
+    if engine not in engines:
+        raise ValueError('unrecognized engine for open_dataset: {}\n'
+                         'must be one of: {}'
+                         .format(engine, engines))
+
     if autoclose is not None:
         warnings.warn(
             'The autoclose argument is no longer used by '
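The effect of this new up-front check, sketched below: an unrecognized engine now fails immediately, before any I/O, with the same message for paths, bytes, and file-like objects (the path here is hypothetical and never touched):

    import xarray as xr

    try:
        xr.open_dataset('example.nc', engine='foobar')   # hypothetical path
    except ValueError as err:
        # 'unrecognized engine for open_dataset: foobar\nmust be one of: [...]'
        print(err)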
@@ -316,18 +351,9 @@ def maybe_decode_store(store, lock=False):
 
     if isinstance(filename_or_obj, backends.AbstractDataStore):
         store = filename_or_obj
-        ds = maybe_decode_store(store)
-    elif isinstance(filename_or_obj, str):
 
-        if (isinstance(filename_or_obj, bytes) and
-                filename_or_obj.startswith(b'\x89HDF')):
-            raise ValueError('cannot read netCDF4/HDF5 file images')
-        elif (isinstance(filename_or_obj, bytes) and
-                filename_or_obj.startswith(b'CDF')):
-            # netCDF3 file images are handled by scipy
-            pass
-        elif isinstance(filename_or_obj, str):
-            filename_or_obj = _normalize_path(filename_or_obj)
+    elif isinstance(filename_or_obj, str):
+        filename_or_obj = _normalize_path(filename_or_obj)
 
         if engine is None:
             engine = _get_default_engine(filename_or_obj,
@@ -352,18 +378,19 @@ def maybe_decode_store(store, lock=False):
         elif engine == 'cfgrib':
             store = backends.CfGribDataStore(
                 filename_or_obj, lock=lock, **backend_kwargs)
-        else:
-            raise ValueError('unrecognized engine for open_dataset: %r'
-                             % engine)
 
-        with close_on_error(store):
-            ds = maybe_decode_store(store)
     else:
-        if engine is not None and engine != 'scipy':
-            raise ValueError('can only read file-like objects with '
-                             "default engine or engine='scipy'")
-        # assume filename_or_obj is a file-like object
-        store = backends.ScipyDataStore(filename_or_obj)
+        if engine not in [None, 'scipy', 'h5netcdf']:
+            raise ValueError("can only read bytes or file-like objects "
+                             "with engine='scipy' or 'h5netcdf'")
+        engine = _get_engine_from_magic_number(filename_or_obj)
+        if engine == 'scipy':
+            store = backends.ScipyDataStore(filename_or_obj, **backend_kwargs)
+        elif engine == 'h5netcdf':
+            store = backends.H5NetCDFStore(filename_or_obj, group=group,
+                                           lock=lock, **backend_kwargs)
+
+    with close_on_error(store):
         ds = maybe_decode_store(store)
 
     # Ensure source filename always stored in dataset object (GH issue #2550)
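A quick sketch of the resulting dispatch: one open_dataset call now routes netCDF3 bytes through scipy and HDF5 file objects through h5netcdf, chosen by the magic number alone (assumes scipy is installed for the in-memory netCDF3 write):

    import io
    import xarray as xr

    ds = xr.Dataset({'x': ('t', [1, 2, 3])})
    nc3_bytes = ds.to_netcdf()            # no target: in-memory netCDF3 bytes
    reopened = xr.open_dataset(io.BytesIO(nc3_bytes))    # sniffed as 'scipy'

    # Raw HDF5 *bytes* are rejected by design ("can't open netCDF4/HDF5 as
    # bytes"); wrap them in io.BytesIO to get a file-like object instead.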
@@ -390,8 +417,8 @@ def open_dataarray(filename_or_obj, group=None, decode_cf=True,
         Strings and Paths are interpreted as a path to a netCDF file or an
         OpenDAP URL and opened with python-netCDF4, unless the filename ends
         with .gz, in which case the file is gunzipped and opened with
-        scipy.io.netcdf (only netCDF3 supported). File-like objects are opened
-        with scipy.io.netcdf (only netCDF3 supported).
+        scipy.io.netcdf (only netCDF3 supported). Byte-strings or file-like
+        objects are opened by scipy.io.netcdf (netCDF3) or h5py (netCDF4/HDF).
     group : str, optional
         Path to the netCDF4 group in the given file to open (only works for
         netCDF4 files).

xarray/tests/__init__.py

Lines changed: 6 additions & 0 deletions
@@ -77,6 +77,12 @@ def LooseVersion(vstring):
 has_cfgrib, requires_cfgrib = _importorskip('cfgrib')
 
 # some special cases
+has_h5netcdf07, requires_h5netcdf07 = _importorskip('h5netcdf',
+                                                    minversion='0.7')
+has_h5py29, requires_h5py29 = _importorskip('h5py', minversion='2.9.0')
+has_h5fileobj = has_h5netcdf07 and has_h5py29
+requires_h5fileobj = pytest.mark.skipif(
+    not has_h5fileobj, reason='requires h5py>2.9.0 & h5netcdf>0.7')
 has_scipy_or_netCDF4 = has_scipy or has_netCDF4
 requires_scipy_or_netCDF4 = pytest.mark.skipif(
     not has_scipy_or_netCDF4, reason='requires scipy or netCDF4')
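For reference, the new marker is applied like the existing requires_* markers. A sketch of a test module inside xarray's test package (the test name is hypothetical):

    from . import requires_h5fileobj

    @requires_h5fileobj
    def test_needs_file_object_support():
        ...   # runs only when h5py>2.9.0 and h5netcdf>0.7 are available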

xarray/tests/test_backends.py

Lines changed: 48 additions & 2 deletions
@@ -35,7 +35,7 @@
     requires_cftime, requires_dask, requires_h5netcdf, requires_netCDF4,
     requires_pathlib, requires_pseudonetcdf, requires_pydap, requires_pynio,
     requires_rasterio, requires_scipy, requires_scipy_or_netCDF4,
-    requires_zarr)
+    requires_zarr, requires_h5fileobj)
 from .test_coding_times import (_STANDARD_CALENDARS, _NON_STANDARD_CALENDARS,
                                 _ALL_CALENDARS)
 from .test_dataset import create_test_data
@@ -1770,7 +1770,7 @@ def test_engine(self):
             open_dataset(tmp_file, engine='foobar')
 
         netcdf_bytes = data.to_netcdf()
-        with raises_regex(ValueError, 'can only read'):
+        with raises_regex(ValueError, 'unrecognized engine'):
            open_dataset(BytesIO(netcdf_bytes), engine='foobar')
 
     def test_cross_engine_read_write_netcdf3(self):
@@ -1955,6 +1955,52 @@ def test_dump_encodings_h5py(self):
         assert actual.x.encoding['compression_opts'] is None
 
 
+@requires_h5fileobj
+class TestH5NetCDFFileObject(TestH5NetCDFData):
+    engine = 'h5netcdf'
+
+    def test_open_badbytes(self):
+        with raises_regex(ValueError, "HDF5 as bytes"):
+            with open_dataset(b'\211HDF\r\n\032\n', engine='h5netcdf'):
+                pass
+        with raises_regex(ValueError, "not a valid netCDF"):
+            with open_dataset(b'garbage'):
+                pass
+        with raises_regex(ValueError, "can only read bytes"):
+            with open_dataset(b'garbage', engine='netcdf4'):
+                pass
+        with raises_regex(ValueError, "not a valid netCDF"):
+            with open_dataset(BytesIO(b'garbage'), engine='h5netcdf'):
+                pass
+
+    def test_open_twice(self):
+        expected = create_test_data()
+        expected.attrs['foo'] = 'bar'
+        with raises_regex(ValueError, 'read/write pointer not at zero'):
+            with create_tmp_file() as tmp_file:
+                expected.to_netcdf(tmp_file, engine='h5netcdf')
+                with open(tmp_file, 'rb') as f:
+                    with open_dataset(f, engine='h5netcdf'):
+                        with open_dataset(f, engine='h5netcdf'):
+                            pass
+
+    def test_open_fileobj(self):
+        # open in-memory datasets instead of local file paths
+        expected = create_test_data().drop('dim3')
+        expected.attrs['foo'] = 'bar'
+        with create_tmp_file() as tmp_file:
+            expected.to_netcdf(tmp_file, engine='h5netcdf')
+
+            with open(tmp_file, 'rb') as f:
+                with open_dataset(f, engine='h5netcdf') as actual:
+                    assert_identical(expected, actual)
+
+                f.seek(0)
+                with BytesIO(f.read()) as bio:
+                    with open_dataset(bio, engine='h5netcdf') as actual:
+                        assert_identical(expected, actual)
+
+
 @requires_h5netcdf
 @requires_dask
 @pytest.mark.filterwarnings('ignore:deallocating CachingFileManager')
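To run just the new test class, one option is pytest's programmatic entry point (equivalent to the -k flag on the CLI; assumes a source checkout at this commit):

    import pytest

    # Select only TestH5NetCDFFileObject; it self-skips without the
    # required h5py/h5netcdf versions thanks to @requires_h5fileobj.
    pytest.main(['xarray/tests/test_backends.py', '-k', 'TestH5NetCDFFileObject'])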
