diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 5ee67efb1da..b4ef3c4c28c 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -34,6 +34,10 @@ New Features Bug fixes ~~~~~~~~~ +- Fix silently overwriting the ``engine`` key when passing :py:func:`open_dataset` a file object + to an incompatible netCDF (:issue:`4457`). Now incompatible combinations of files and engines raise + an exception instead. By `Alessandro Amici <https://github.com/alexamici>`_. + Documentation ~~~~~~~~~~~~~ diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 9ea222954f4..434ea5a1854 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -126,8 +126,7 @@ def _get_engine_from_magic_number(filename_or_obj): if filename_or_obj.tell() != 0: raise ValueError( "file-like object read/write pointer not at zero " - "please close and reopen, or use a context " - "manager" + "please close and reopen, or use a context manager" ) magic_number = filename_or_obj.read(8) filename_or_obj.seek(0) @@ -136,17 +135,10 @@ def _get_engine_from_magic_number(filename_or_obj): engine = "scipy" elif magic_number.startswith(b"\211HDF\r\n\032\n"): engine = "h5netcdf" - if isinstance(filename_or_obj, bytes): - raise ValueError( - "can't open netCDF4/HDF5 as bytes " - "try passing a path or file-like object" - ) else: - if isinstance(filename_or_obj, bytes) and len(filename_or_obj) > 80: - filename_or_obj = filename_or_obj[:80] + b"..." raise ValueError( - "{} is not a valid netCDF file " - "did you mean to pass a string for a path instead?".format(filename_or_obj) + f"{magic_number} is not the signature of any supported file format " + "did you mean to pass a string for a path instead?" 
) return engine @@ -163,6 +155,14 @@ def _get_default_engine(path, allow_remote=False): return engine +def _autodetect_engine(filename_or_obj): + if isinstance(filename_or_obj, str): + engine = _get_default_engine(filename_or_obj, allow_remote=True) + else: + engine = _get_engine_from_magic_number(filename_or_obj) + return engine + + def _get_backend_cls(engine): """Select open_dataset method based on current engine""" try: @@ -175,10 +175,13 @@ def _get_backend_cls(engine): def _normalize_path(path): - if is_remote_uri(path): - return path - else: - return os.path.abspath(os.path.expanduser(path)) + if isinstance(path, Path): + path = str(path) + + if isinstance(path, str) and not is_remote_uri(path): + path = os.path.abspath(os.path.expanduser(path)) + + return path def _validate_dataset_names(dataset): @@ -532,24 +535,13 @@ def maybe_decode_store(store, chunks, lock=False): ds2._file_obj = ds._file_obj return ds2 - if isinstance(filename_or_obj, Path): - filename_or_obj = str(filename_or_obj) + filename_or_obj = _normalize_path(filename_or_obj) if isinstance(filename_or_obj, AbstractDataStore): store = filename_or_obj else: - if isinstance(filename_or_obj, str): - filename_or_obj = _normalize_path(filename_or_obj) - - if engine is None: - engine = _get_default_engine(filename_or_obj, allow_remote=True) - elif engine != "zarr": - if engine not in [None, "scipy", "h5netcdf"]: - raise ValueError( - "can only read bytes or file-like objects " - "with engine='scipy' or 'h5netcdf'" - ) - engine = _get_engine_from_magic_number(filename_or_obj) + if engine is None: + engine = _autodetect_engine(filename_or_obj) if engine in ["netcdf4", "h5netcdf"]: extra_kwargs["group"] = group diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index f3e61eeee74..2d5292f5d10 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -121,6 +121,25 @@ def open( ): import h5netcdf + if isinstance(filename, bytes): + raise ValueError( + "can't 
open netCDF4/HDF5 as bytes " + "try passing a path or file-like object" + ) + elif hasattr(filename, "tell"): + if filename.tell() != 0: + raise ValueError( + "file-like object read/write pointer not at zero " + "please close and reopen, or use a context manager" + ) + else: + magic_number = filename.read(8) + filename.seek(0) + if not magic_number.startswith(b"\211HDF\r\n\032\n"): + raise ValueError( + f"{magic_number} is not the signature of a valid netCDF file" + ) + if format not in [None, "NETCDF4"]: raise ValueError("invalid format for h5netcdf backend") diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py index 0a917cde4d7..bd799d100cb 100644 --- a/xarray/backends/netCDF4_.py +++ b/xarray/backends/netCDF4_.py @@ -333,6 +333,12 @@ def open( ): import netCDF4 + if not isinstance(filename, str): + raise ValueError( + "can only read bytes or file-like objects " + "with engine='scipy' or 'h5netcdf'" + ) + if format is None: format = "NETCDF4" diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index c9030e31a9e..81664737330 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2224,7 +2224,7 @@ def test_engine(self): open_dataset(tmp_file, engine="foobar") netcdf_bytes = data.to_netcdf() - with raises_regex(ValueError, "can only read bytes or file-like"): + with raises_regex(ValueError, "unrecognized engine"): open_dataset(BytesIO(netcdf_bytes), engine="foobar") def test_cross_engine_read_write_netcdf3(self): @@ -2494,13 +2494,13 @@ def test_open_badbytes(self): with raises_regex(ValueError, "HDF5 as bytes"): with open_dataset(b"\211HDF\r\n\032\n", engine="h5netcdf"): pass - with raises_regex(ValueError, "not a valid netCDF"): + with raises_regex(ValueError, "not the signature of any supported file"): with open_dataset(b"garbage"): pass with raises_regex(ValueError, "can only read bytes"): with open_dataset(b"garbage", engine="netcdf4"): pass - with raises_regex(ValueError, "not a valid 
netCDF"): + with raises_regex(ValueError, "not the signature of a valid netCDF file"): with open_dataset(BytesIO(b"garbage"), engine="h5netcdf"): pass @@ -2526,11 +2526,23 @@ def test_open_fileobj(self): with open_dataset(f, engine="h5netcdf") as actual: assert_identical(expected, actual) + f.seek(0) + with open_dataset(f) as actual: + assert_identical(expected, actual) + f.seek(0) with BytesIO(f.read()) as bio: with open_dataset(bio, engine="h5netcdf") as actual: assert_identical(expected, actual) + f.seek(0) + with raises_regex(TypeError, "not a valid NetCDF 3"): + open_dataset(f, engine="scipy") + + f.seek(8) + with raises_regex(ValueError, "read/write pointer not at zero"): + open_dataset(f) + @requires_h5netcdf @requires_dask