Skip to content

Autodetect the engine only when engine=None #4458

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 18 commits into from
Sep 28, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ New Features
Bug fixes
~~~~~~~~~

- Fix silently overwriting the `engine` key when passing :py:func:`open_dataset` a file object
to an incompatible netCDF (:issue:`4457`). Now incompatible combinations of files and engines raise
an exception instead. By `Alessandro Amici <https://github.com/alexamici>`_.


Documentation
~~~~~~~~~~~~~
Expand Down
50 changes: 21 additions & 29 deletions xarray/backends/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,8 +126,7 @@ def _get_engine_from_magic_number(filename_or_obj):
if filename_or_obj.tell() != 0:
raise ValueError(
"file-like object read/write pointer not at zero "
"please close and reopen, or use a context "
"manager"
"please close and reopen, or use a context manager"
)
magic_number = filename_or_obj.read(8)
filename_or_obj.seek(0)
Expand All @@ -136,17 +135,10 @@ def _get_engine_from_magic_number(filename_or_obj):
engine = "scipy"
elif magic_number.startswith(b"\211HDF\r\n\032\n"):
engine = "h5netcdf"
if isinstance(filename_or_obj, bytes):
raise ValueError(
"can't open netCDF4/HDF5 as bytes "
"try passing a path or file-like object"
)
else:
if isinstance(filename_or_obj, bytes) and len(filename_or_obj) > 80:
filename_or_obj = filename_or_obj[:80] + b"..."
raise ValueError(
"{} is not a valid netCDF file "
"did you mean to pass a string for a path instead?".format(filename_or_obj)
f"{magic_number} is not the signature of any supported file format "
"did you mean to pass a string for a path instead?"
)
return engine

Expand All @@ -163,6 +155,14 @@ def _get_default_engine(path, allow_remote=False):
return engine


def _autodetect_engine(filename_or_obj):
if isinstance(filename_or_obj, str):
engine = _get_default_engine(filename_or_obj, allow_remote=True)
else:
engine = _get_engine_from_magic_number(filename_or_obj)
return engine


def _get_backend_cls(engine):
"""Select open_dataset method based on current engine"""
try:
Expand All @@ -175,10 +175,13 @@ def _get_backend_cls(engine):


def _normalize_path(path):
if is_remote_uri(path):
return path
else:
return os.path.abspath(os.path.expanduser(path))
if isinstance(path, Path):
path = str(path)

if isinstance(path, str) and not is_remote_uri(path):
path = os.path.abspath(os.path.expanduser(path))

return path


def _validate_dataset_names(dataset):
Expand Down Expand Up @@ -532,24 +535,13 @@ def maybe_decode_store(store, chunks, lock=False):
ds2._file_obj = ds._file_obj
return ds2

if isinstance(filename_or_obj, Path):
filename_or_obj = str(filename_or_obj)
filename_or_obj = _normalize_path(filename_or_obj)

if isinstance(filename_or_obj, AbstractDataStore):
store = filename_or_obj
else:
if isinstance(filename_or_obj, str):
filename_or_obj = _normalize_path(filename_or_obj)

if engine is None:
engine = _get_default_engine(filename_or_obj, allow_remote=True)
elif engine != "zarr":
if engine not in [None, "scipy", "h5netcdf"]:
raise ValueError(
"can only read bytes or file-like objects "
"with engine='scipy' or 'h5netcdf'"
)
engine = _get_engine_from_magic_number(filename_or_obj)
if engine is None:
engine = _autodetect_engine(filename_or_obj)

if engine in ["netcdf4", "h5netcdf"]:
extra_kwargs["group"] = group
Expand Down
19 changes: 19 additions & 0 deletions xarray/backends/h5netcdf_.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,25 @@ def open(
):
import h5netcdf

if isinstance(filename, bytes):
raise ValueError(
"can't open netCDF4/HDF5 as bytes "
"try passing a path or file-like object"
)
elif hasattr(filename, "tell"):
if filename.tell() != 0:
raise ValueError(
"file-like object read/write pointer not at zero "
"please close and reopen, or use a context manager"
)
else:
magic_number = filename.read(8)
filename.seek(0)
if not magic_number.startswith(b"\211HDF\r\n\032\n"):
raise ValueError(
f"{magic_number} is not the signature of a valid netCDF file"
)

if format not in [None, "NETCDF4"]:
raise ValueError("invalid format for h5netcdf backend")

Expand Down
6 changes: 6 additions & 0 deletions xarray/backends/netCDF4_.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,6 +333,12 @@ def open(
):
import netCDF4

if not isinstance(filename, str):
raise ValueError(
"can only read bytes or file-like objects "
"with engine='scipy' or 'h5netcdf'"
)

if format is None:
format = "NETCDF4"

Expand Down
18 changes: 15 additions & 3 deletions xarray/tests/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -2224,7 +2224,7 @@ def test_engine(self):
open_dataset(tmp_file, engine="foobar")

netcdf_bytes = data.to_netcdf()
with raises_regex(ValueError, "can only read bytes or file-like"):
with raises_regex(ValueError, "unrecognized engine"):
open_dataset(BytesIO(netcdf_bytes), engine="foobar")

def test_cross_engine_read_write_netcdf3(self):
Expand Down Expand Up @@ -2494,13 +2494,13 @@ def test_open_badbytes(self):
with raises_regex(ValueError, "HDF5 as bytes"):
with open_dataset(b"\211HDF\r\n\032\n", engine="h5netcdf"):
pass
with raises_regex(ValueError, "not a valid netCDF"):
with raises_regex(ValueError, "not the signature of any supported file"):
with open_dataset(b"garbage"):
pass
with raises_regex(ValueError, "can only read bytes"):
with open_dataset(b"garbage", engine="netcdf4"):
pass
with raises_regex(ValueError, "not a valid netCDF"):
with raises_regex(ValueError, "not the signature of a valid netCDF file"):
with open_dataset(BytesIO(b"garbage"), engine="h5netcdf"):
pass

Expand All @@ -2526,11 +2526,23 @@ def test_open_fileobj(self):
with open_dataset(f, engine="h5netcdf") as actual:
assert_identical(expected, actual)

f.seek(0)
with open_dataset(f) as actual:
assert_identical(expected, actual)

f.seek(0)
with BytesIO(f.read()) as bio:
with open_dataset(bio, engine="h5netcdf") as actual:
assert_identical(expected, actual)

f.seek(0)
with raises_regex(TypeError, "not a valid NetCDF 3"):
open_dataset(f, engine="scipy")

f.seek(8)
with raises_regex(ValueError, "read/write pointer not at zero"):
open_dataset(f)


@requires_h5netcdf
@requires_dask
Expand Down