diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 4107b3aa883..a5a41718648 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -78,6 +78,71 @@ def __repr__(self) -> str: ) +class NativeEndiannessArray(indexing.ExplicitlyIndexedNDArrayMixin): + """Decode arrays on the fly from non-native to native endianness + + This is useful for decoding arrays from netCDF3 files (which are all + big endian) into native endianness, so they can be used with Cython + functions, such as those found in bottleneck and pandas. + + >>> x = np.arange(5, dtype=">i2") + + >>> x.dtype + dtype('>i2') + + >>> NativeEndiannessArray(x).dtype + dtype('int16') + + >>> indexer = indexing.BasicIndexer((slice(None),)) + >>> NativeEndiannessArray(x)[indexer].dtype + dtype('int16') + """ + + __slots__ = ("array",) + + def __init__(self, array) -> None: + self.array = indexing.as_indexable(array) + + @property + def dtype(self) -> np.dtype: + return np.dtype(self.array.dtype.kind + str(self.array.dtype.itemsize)) + + def __getitem__(self, key) -> np.ndarray: + return np.asarray(self.array[key], dtype=self.dtype) + + +class BoolTypeArray(indexing.ExplicitlyIndexedNDArrayMixin): + """Decode arrays on the fly from integer to boolean datatype + + This is useful for decoding boolean arrays from integer typed netCDF + variables. + + >>> x = np.array([1, 0, 1, 1, 0], dtype="i1") + + >>> x.dtype + dtype('int8') + + >>> BoolTypeArray(x).dtype + dtype('bool') + + >>> indexer = indexing.BasicIndexer((slice(None),)) + >>> BoolTypeArray(x)[indexer].dtype + dtype('bool') + """ + + __slots__ = ("array",) + + def __init__(self, array) -> None: + self.array = indexing.as_indexable(array) + + @property + def dtype(self) -> np.dtype: + return np.dtype("bool") + + def __getitem__(self, key) -> np.ndarray: + return np.asarray(self.array[key], dtype=self.dtype) + + def lazy_elemwise_func(array, func: Callable, dtype: np.typing.DTypeLike): """Lazily apply an element-wise function to an array. Parameters @@ -155,34 +220,39 @@ class CFMaskCoder(VariableCoder): def encode(self, variable: Variable, name: T_Name = None): dims, data, attrs, encoding = unpack_for_encoding(variable) + # get dtype from encoding if available, otherwise use data.dtype dtype = np.dtype(encoding.get("dtype", data.dtype)) fv = encoding.get("_FillValue") mv = encoding.get("missing_value") - if ( - fv is not None - and mv is not None - and not duck_array_ops.allclose_or_equiv(fv, mv) - ): - raise ValueError( - f"Variable {name!r} has conflicting _FillValue ({fv}) and missing_value ({mv}). Cannot encode data." - ) - - if fv is not None: - # Ensure _FillValue is cast to same dtype as data's - encoding["_FillValue"] = dtype.type(fv) - fill_value = pop_to(encoding, attrs, "_FillValue", name=name) - if not pd.isnull(fill_value): - data = duck_array_ops.fillna(data, fill_value) - - if mv is not None: - # Ensure missing_value is cast to same dtype as data's - encoding["missing_value"] = dtype.type(mv) - fill_value = pop_to(encoding, attrs, "missing_value", name=name) - if not pd.isnull(fill_value) and fv is None: - data = duck_array_ops.fillna(data, fill_value) + if fv is not None or mv is not None: + if ( + fv is not None + and mv is not None + and not duck_array_ops.allclose_or_equiv(fv, mv) + ): + raise ValueError( + f"Variable {name!r} has conflicting _FillValue ({fv}) and missing_value ({mv}). Cannot encode data." + ) - return Variable(dims, data, attrs, encoding, fastpath=True) + if fv is not None: + # Ensure _FillValue is cast to same dtype as data's + encoding["_FillValue"] = dtype.type(fv) + fill_value = pop_to(encoding, attrs, "_FillValue", name=name) + if not pd.isnull(fill_value): + data = duck_array_ops.fillna(data, fill_value) + + if mv is not None: + # Use _FillValue if available to align missing_value to prevent issues + # when decoding + # Ensure missing_value is cast to same dtype as data's + encoding["missing_value"] = attrs.get("_FillValue", dtype.type(mv)) + fill_value = pop_to(encoding, attrs, "missing_value", name=name) + if not pd.isnull(fill_value) and fv is None: + data = duck_array_ops.fillna(data, fill_value) + return Variable(dims, data, attrs, encoding, fastpath=True) + else: + return variable def decode(self, variable: Variable, name: T_Name = None): dims, data, attrs, encoding = unpack_for_decoding(variable) @@ -207,7 +277,13 @@ def decode(self, variable: Variable, name: T_Name = None): stacklevel=3, ) - dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype) + if "scale_factor" not in attrs and "add_offset" not in attrs: + dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype) + else: + dtype, decoded_fill_value = ( + _choose_float_dtype(data.dtype, attrs), + np.nan, + ) if encoded_fill_values: transform = partial( @@ -232,20 +308,50 @@ def _scale_offset_decoding(data, scale_factor, add_offset, dtype: np.typing.DTyp return data -def _choose_float_dtype(dtype: np.dtype, has_offset: bool) -> type[np.floating[Any]]: - """Return a float dtype that can losslessly represent `dtype` values.""" - # Keep float32 as-is. Upcast half-precision to single-precision, +def _choose_float_dtype( + dtype: np.dtype, mapping: MutableMapping +) -> type[np.floating[Any]]: + # check scale/offset first to derive wanted float dtype + # see https://github.com/pydata/xarray/issues/5597#issuecomment-879561954 + scale_factor = mapping.get("scale_factor") + add_offset = mapping.get("add_offset") + if scale_factor is not None or add_offset is not None: + # get the type from scale_factor/add_offset to determine + # the needed floating point type + if scale_factor is not None: + scale_type = type(scale_factor) + if add_offset is not None: + offset_type = type(add_offset) + # CF conforming, both scale_factor and add-offset are given and + # of same floating point type (float32/64) + if ( + add_offset is not None + and scale_factor is not None + and offset_type == scale_type + and scale_type in [np.float32, np.float64] + ): + # in case of int32 -> we need upcast to float64 + # due to precision issues + if dtype.itemsize == 4 and np.issubdtype(dtype, np.integer): + return np.float64 + return np.dtype(scale_type).type + # Not CF conforming and add_offset given: + # A scale factor is entirely safe (vanishing into the mantissa), + # but a large integer offset could lead to loss of precision. + # Sensitivity analysis can be tricky, so we just use a float64 + # if there's any offset at all - better unoptimised than wrong! + if add_offset is not None: + return np.float64 + # return float32 in other cases where only scale_factor is given + return np.float32 + # If no scale_factor or add_offset is given, use some general rules. + # Keep float32 as-is. Upcast half-precision to single-precision, # because float16 is "intended for storage but not computation" if dtype.itemsize <= 4 and np.issubdtype(dtype, np.floating): return np.float32 # float32 can exactly represent all integers up to 24 bits if dtype.itemsize <= 2 and np.issubdtype(dtype, np.integer): - # A scale factor is entirely safe (vanishing into the mantissa), - # but a large integer offset could lead to loss of precision. - # Sensitivity analysis can be tricky, so we just use a float64 - # if there's any offset at all - better unoptimised than wrong! - if not has_offset: - return np.float32 + return np.float32 # For all other types and circumstances, we just use float64. # (safe because eg. complex numbers are not supported in NetCDF) return np.float64 @@ -260,29 +366,40 @@ class CFScaleOffsetCoder(VariableCoder): def encode(self, variable: Variable, name: T_Name = None) -> Variable: dims, data, attrs, encoding = unpack_for_encoding(variable) - - if "scale_factor" in encoding or "add_offset" in encoding: - dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding) + scale_factor = pop_to(encoding, attrs, "scale_factor", name=name) + add_offset = pop_to(encoding, attrs, "add_offset", name=name) + if scale_factor or add_offset: + # if we have a _FillValue/masked_value we do not want to cast now + # but leave that to CFMaskCoder + dtype = data.dtype + if "_FillValue" not in encoding and "missing_value" not in encoding: + dtype = _choose_float_dtype(data.dtype, attrs) + # but still we need a copy prevent changing original data data = data.astype(dtype=dtype, copy=True) - if "add_offset" in encoding: - data -= pop_to(encoding, attrs, "add_offset", name=name) - if "scale_factor" in encoding: - data /= pop_to(encoding, attrs, "scale_factor", name=name) - - return Variable(dims, data, attrs, encoding, fastpath=True) + if add_offset: + data -= add_offset + if scale_factor: + data /= scale_factor + return Variable(dims, data, attrs, encoding, fastpath=True) + else: + return variable def decode(self, variable: Variable, name: T_Name = None) -> Variable: - _attrs = variable.attrs - if "scale_factor" in _attrs or "add_offset" in _attrs: - dims, data, attrs, encoding = unpack_for_decoding(variable) - - scale_factor = pop_to(attrs, encoding, "scale_factor", name=name) - add_offset = pop_to(attrs, encoding, "add_offset", name=name) - dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding) + dims, data, attrs, encoding = unpack_for_decoding(variable) + scale_factor = pop_to(attrs, encoding, "scale_factor", name=name) + add_offset = pop_to(attrs, encoding, "add_offset", name=name) + if scale_factor or add_offset: if np.ndim(scale_factor) > 0: scale_factor = np.asarray(scale_factor).item() if np.ndim(add_offset) > 0: add_offset = np.asarray(add_offset).item() + # if we have a _FillValue/masked_value we already have the wanted + # floating point dtype here (via CFMaskCoder), so no check is necessary + # only check in other cases + dtype = data.dtype + if "_FillValue" not in encoding and "missing_value" not in encoding: + dtype = _choose_float_dtype(dtype, encoding) + transform = partial( _scale_offset_decoding, scale_factor=scale_factor, @@ -349,3 +466,100 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable: return Variable(dims, data, attrs, encoding, fastpath=True) else: return variable + + +class DefaultFillvalueCoder(VariableCoder): + """Encode default _FillValue if needed.""" + + def encode(self, variable: Variable, name: T_Name = None) -> Variable: + dims, data, attrs, encoding = unpack_for_encoding(variable) + # make NaN the fill value for float types + if ( + "_FillValue" not in attrs + and "_FillValue" not in encoding + and np.issubdtype(variable.dtype, np.floating) + ): + attrs["_FillValue"] = variable.dtype.type(np.nan) + return Variable(dims, data, attrs, encoding, fastpath=True) + else: + return variable + + def decode(self, variable: Variable, name: T_Name = None) -> Variable: + raise NotImplementedError() + + +class BooleanCoder(VariableCoder): + """Code boolean values.""" + + def encode(self, variable: Variable, name: T_Name = None) -> Variable: + if ( + (variable.dtype == bool) + and ("dtype" not in variable.encoding) + and ("dtype" not in variable.attrs) + ): + dims, data, attrs, encoding = unpack_for_encoding(variable) + attrs["dtype"] = "bool" + data = duck_array_ops.astype(data, dtype="i1", copy=True) + + return Variable(dims, data, attrs, encoding, fastpath=True) + else: + return variable + + def decode(self, variable: Variable, name: T_Name = None) -> Variable: + if variable.attrs.get("dtype", False) == "bool": + dims, data, attrs, encoding = unpack_for_decoding(variable) + # overwrite (!) encoding accordingly and remove from attrs + encoding["dtype"] = attrs.pop("dtype") + data = BoolTypeArray(data) + return Variable(dims, data, attrs, encoding, fastpath=True) + else: + return variable + + +class EndianCoder(VariableCoder): + """Decode Endianness to native.""" + + def encode(self): + raise NotImplementedError() + + def decode(self, variable: Variable, name: T_Name = None) -> Variable: + dims, data, attrs, encoding = unpack_for_decoding(variable) + if not data.dtype.isnative: + data = NativeEndiannessArray(data) + return Variable(dims, data, attrs, encoding, fastpath=True) + else: + return variable + + +class NonStringCoder(VariableCoder): + """Encode NonString variables if dtypes differ.""" + + def encode(self, variable: Variable, name: T_Name = None) -> Variable: + if "dtype" in variable.encoding and variable.encoding["dtype"] not in ( + "S1", + str, + ): + dims, data, attrs, encoding = unpack_for_encoding(variable) + dtype = np.dtype(encoding.pop("dtype")) + if dtype != variable.dtype: + if np.issubdtype(dtype, np.integer): + if ( + np.issubdtype(variable.dtype, np.floating) + and "_FillValue" not in variable.attrs + and "missing_value" not in variable.attrs + ): + warnings.warn( + f"saving variable {name} with floating " + "point data as an integer dtype without " + "any _FillValue to use for NaNs", + SerializationWarning, + stacklevel=10, + ) + data = np.around(data) + data = data.astype(dtype=dtype) + return Variable(dims, data, attrs, encoding, fastpath=True) + else: + return variable + + def decode(self): + raise NotImplementedError() diff --git a/xarray/conventions.py b/xarray/conventions.py index 780172879c6..1506efc31e8 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -10,7 +10,7 @@ from xarray.coding import strings, times, variables from xarray.coding.variables import SerializationWarning, pop_to -from xarray.core import duck_array_ops, indexing +from xarray.core import indexing from xarray.core.common import ( _contains_datetime_like_objects, contains_cftime_datetimes, @@ -48,123 +48,10 @@ T_DatasetOrAbstractstore = Union[Dataset, AbstractDataStore] -class NativeEndiannessArray(indexing.ExplicitlyIndexedNDArrayMixin): - """Decode arrays on the fly from non-native to native endianness - - This is useful for decoding arrays from netCDF3 files (which are all - big endian) into native endianness, so they can be used with Cython - functions, such as those found in bottleneck and pandas. - - >>> x = np.arange(5, dtype=">i2") - - >>> x.dtype - dtype('>i2') - - >>> NativeEndiannessArray(x).dtype - dtype('int16') - - >>> indexer = indexing.BasicIndexer((slice(None),)) - >>> NativeEndiannessArray(x)[indexer].dtype - dtype('int16') - """ - - __slots__ = ("array",) - - def __init__(self, array): - self.array = indexing.as_indexable(array) - - @property - def dtype(self): - return np.dtype(self.array.dtype.kind + str(self.array.dtype.itemsize)) - - def __getitem__(self, key): - return np.asarray(self.array[key], dtype=self.dtype) - - -class BoolTypeArray(indexing.ExplicitlyIndexedNDArrayMixin): - """Decode arrays on the fly from integer to boolean datatype - - This is useful for decoding boolean arrays from integer typed netCDF - variables. - - >>> x = np.array([1, 0, 1, 1, 0], dtype="i1") - - >>> x.dtype - dtype('int8') - - >>> BoolTypeArray(x).dtype - dtype('bool') - - >>> indexer = indexing.BasicIndexer((slice(None),)) - >>> BoolTypeArray(x)[indexer].dtype - dtype('bool') - """ - - __slots__ = ("array",) - - def __init__(self, array): - self.array = indexing.as_indexable(array) - - @property - def dtype(self): - return np.dtype("bool") - - def __getitem__(self, key): - return np.asarray(self.array[key], dtype=self.dtype) - - def _var_as_tuple(var: Variable) -> T_VarTuple: return var.dims, var.data, var.attrs.copy(), var.encoding.copy() -def maybe_encode_nonstring_dtype(var: Variable, name: T_Name = None) -> Variable: - if "dtype" in var.encoding and var.encoding["dtype"] not in ("S1", str): - dims, data, attrs, encoding = _var_as_tuple(var) - dtype = np.dtype(encoding.pop("dtype")) - if dtype != var.dtype: - if np.issubdtype(dtype, np.integer): - if ( - np.issubdtype(var.dtype, np.floating) - and "_FillValue" not in var.attrs - and "missing_value" not in var.attrs - ): - warnings.warn( - f"saving variable {name} with floating " - "point data as an integer dtype without " - "any _FillValue to use for NaNs", - SerializationWarning, - stacklevel=10, - ) - data = np.around(data) - data = data.astype(dtype=dtype) - var = Variable(dims, data, attrs, encoding, fastpath=True) - return var - - -def maybe_default_fill_value(var: Variable) -> Variable: - # make NaN the fill value for float types: - if ( - "_FillValue" not in var.attrs - and "_FillValue" not in var.encoding - and np.issubdtype(var.dtype, np.floating) - ): - var.attrs["_FillValue"] = var.dtype.type(np.nan) - return var - - -def maybe_encode_bools(var: Variable) -> Variable: - if ( - (var.dtype == bool) - and ("dtype" not in var.encoding) - and ("dtype" not in var.attrs) - ): - dims, data, attrs, encoding = _var_as_tuple(var) - attrs["dtype"] = "bool" - data = duck_array_ops.astype(data, dtype="i1", copy=True) - var = Variable(dims, data, attrs, encoding, fastpath=True) - return var - - def _infer_dtype(array, name: T_Name = None) -> np.dtype: """Given an object array with no missing values, infer its dtype from its first element @@ -292,13 +179,13 @@ def encode_cf_variable( variables.CFScaleOffsetCoder(), variables.CFMaskCoder(), variables.UnsignedIntegerCoder(), + variables.NonStringCoder(), + variables.DefaultFillvalueCoder(), + variables.BooleanCoder(), ]: var = coder.encode(var, name=name) - # TODO(shoyer): convert all of these to use coders, too: - var = maybe_encode_nonstring_dtype(var, name=name) - var = maybe_default_fill_value(var) - var = maybe_encode_bools(var) + # TODO(kmuehlbauer): check if ensure_dtype_not_object can be moved to backends: var = ensure_dtype_not_object(var, name=name) for attr_name in CF_RELATED_DATA: @@ -389,19 +276,15 @@ def decode_cf_variable( if decode_times: var = times.CFDatetimeCoder(use_cftime=use_cftime).decode(var, name=name) - dimensions, data, attributes, encoding = variables.unpack_for_decoding(var) - # TODO(shoyer): convert everything below to use coders + if decode_endianness and not var.dtype.isnative: + var = variables.EndianCoder().decode(var) + original_dtype = var.dtype - if decode_endianness and not data.dtype.isnative: - # do this last, so it's only done if we didn't already unmask/scale - data = NativeEndiannessArray(data) - original_dtype = data.dtype + var = variables.BooleanCoder().decode(var) - encoding.setdefault("dtype", original_dtype) + dimensions, data, attributes, encoding = variables.unpack_for_decoding(var) - if "dtype" in attributes and attributes["dtype"] == "bool": - del attributes["dtype"] - data = BoolTypeArray(data) + encoding.setdefault("dtype", original_dtype) if not is_duck_dask_array(data): data = indexing.LazilyIndexedArray(data) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 14b510d4c97..82c2de60753 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -17,7 +17,7 @@ from contextlib import ExitStack from io import BytesIO from pathlib import Path -from typing import TYPE_CHECKING, Any, Final, cast +from typing import TYPE_CHECKING, Any, Callable, Final, cast import numpy as np import pandas as pd @@ -138,96 +138,110 @@ def open_example_mfdataset(names, *args, **kwargs) -> Dataset: ) -def create_masked_and_scaled_data() -> Dataset: - x = np.array([np.nan, np.nan, 10, 10.1, 10.2], dtype=np.float32) +def create_masked_and_scaled_data(dtype: type[np.number] = np.float32) -> Dataset: + x = np.array([np.nan, np.nan, 10, 10.1, 10.2], dtype=dtype) encoding = { "_FillValue": -1, - "add_offset": 10, - "scale_factor": np.float32(0.1), + "add_offset": dtype(10), + "scale_factor": dtype(0.1), "dtype": "i2", } return Dataset({"x": ("t", x, {}, encoding)}) -def create_encoded_masked_and_scaled_data() -> Dataset: - attributes = {"_FillValue": -1, "add_offset": 10, "scale_factor": np.float32(0.1)} +def create_encoded_masked_and_scaled_data( + dtype: type[np.number] = np.float32, +) -> Dataset: + attributes = {"_FillValue": -1, "add_offset": dtype(10), "scale_factor": dtype(0.1)} return Dataset( {"x": ("t", np.array([-1, -1, 0, 1, 2], dtype=np.int16), attributes)} ) -def create_unsigned_masked_scaled_data() -> Dataset: +def create_unsigned_masked_scaled_data( + dtype: type[np.number] = np.float32, +) -> Dataset: encoding = { "_FillValue": 255, "_Unsigned": "true", "dtype": "i1", - "add_offset": 10, - "scale_factor": np.float32(0.1), + "add_offset": dtype(10), + "scale_factor": dtype(0.1), } - x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=np.float32) + x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=dtype) return Dataset({"x": ("t", x, {}, encoding)}) -def create_encoded_unsigned_masked_scaled_data() -> Dataset: +def create_encoded_unsigned_masked_scaled_data( + dtype: type[np.number] = np.float32, +) -> Dataset: # These are values as written to the file: the _FillValue will # be represented in the signed form. attributes = { "_FillValue": -1, "_Unsigned": "true", - "add_offset": 10, - "scale_factor": np.float32(0.1), + "add_offset": dtype(10), + "scale_factor": dtype(0.1), } # Create unsigned data corresponding to [0, 1, 127, 128, 255] unsigned sb = np.asarray([0, 1, 127, -128, -1], dtype="i1") return Dataset({"x": ("t", sb, attributes)}) -def create_bad_unsigned_masked_scaled_data() -> Dataset: +def create_bad_unsigned_masked_scaled_data( + dtype: type[np.number] = np.float32, +) -> Dataset: encoding = { "_FillValue": 255, "_Unsigned": True, "dtype": "i1", - "add_offset": 10, - "scale_factor": np.float32(0.1), + "add_offset": dtype(0), + "scale_factor": dtype(0.1), } - x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=np.float32) + x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=dtype) return Dataset({"x": ("t", x, {}, encoding)}) -def create_bad_encoded_unsigned_masked_scaled_data() -> Dataset: +def create_bad_encoded_unsigned_masked_scaled_data( + dtype: type[np.number] = np.float32, +) -> Dataset: # These are values as written to the file: the _FillValue will # be represented in the signed form. attributes = { "_FillValue": -1, "_Unsigned": True, - "add_offset": 10, - "scale_factor": np.float32(0.1), + "add_offset": dtype(10), + "scale_factor": dtype(0.1), } # Create signed data corresponding to [0, 1, 127, 128, 255] unsigned sb = np.asarray([0, 1, 127, -128, -1], dtype="i1") return Dataset({"x": ("t", sb, attributes)}) -def create_signed_masked_scaled_data() -> Dataset: +def create_signed_masked_scaled_data( + dtype: type[np.number] = np.float32, +) -> Dataset: encoding = { "_FillValue": -127, "_Unsigned": "false", "dtype": "i1", - "add_offset": 10, - "scale_factor": np.float32(0.1), + "add_offset": dtype(10), + "scale_factor": dtype(0.1), } - x = np.array([-1.0, 10.1, 22.7, np.nan], dtype=np.float32) + x = np.array([-1.0, 10.1, 22.7, np.nan], dtype=dtype) return Dataset({"x": ("t", x, {}, encoding)}) -def create_encoded_signed_masked_scaled_data() -> Dataset: +def create_encoded_signed_masked_scaled_data( + dtype: type[np.number] = np.float32, +) -> Dataset: # These are values as written to the file: the _FillValue will # be represented in the signed form. attributes = { "_FillValue": -127, "_Unsigned": "false", - "add_offset": 10, - "scale_factor": np.float32(0.1), + "add_offset": dtype(10), + "scale_factor": dtype(0.1), } # Create signed data corresponding to [0, 1, 127, 128, 255] unsigned sb = np.asarray([-110, 1, 127, -127], dtype="i1") @@ -630,6 +644,11 @@ def test_roundtrip_boolean_dtype(self) -> None: with self.roundtrip(original) as actual: assert_identical(original, actual) assert actual["x"].dtype == "bool" + # this checks for preserving dtype during second roundtrip + # see https://github.com/pydata/xarray/issues/7652#issuecomment-1476956975 + with self.roundtrip(actual) as actual2: + assert_identical(original, actual2) + assert actual2["x"].dtype == "bool" def test_orthogonal_indexing(self) -> None: in_memory = create_test_data() @@ -854,6 +873,7 @@ def test_roundtrip_string_with_fill_value_nchar(self) -> None: with self.roundtrip(original) as actual: assert_identical(expected, actual) + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @pytest.mark.parametrize( "decoded_fn, encoded_fn", [ @@ -873,9 +893,20 @@ def test_roundtrip_string_with_fill_value_nchar(self) -> None: (create_masked_and_scaled_data, create_encoded_masked_and_scaled_data), ], ) - def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn) -> None: - decoded = decoded_fn() - encoded = encoded_fn() + def test_roundtrip_mask_and_scale( + self, + decoded_fn: Callable[[type[np.number]], Dataset], + encoded_fn: Callable[[type[np.number]], Dataset], + dtype: type[np.number], + ) -> None: + if dtype == np.float32 and isinstance( + self, (TestZarrDirectoryStore, TestZarrDictStore) + ): + pytest.skip( + "zarr attributes (eg. `scale_factor` are unconditionally promoted to `float64`" + ) + decoded = decoded_fn(dtype) + encoded = encoded_fn(dtype) with self.roundtrip(decoded) as actual: for k in decoded.variables: @@ -896,7 +927,7 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn) -> None: # make sure roundtrip encoding didn't change the # original dataset. - assert_allclose(encoded, encoded_fn(), decode_bytes=False) + assert_allclose(encoded, encoded_fn(dtype), decode_bytes=False) with self.roundtrip(encoded) as actual: for k in decoded.variables: @@ -1530,31 +1561,76 @@ def test_encoding_chunksizes_unlimited(self) -> None: with self.roundtrip(ds) as actual: assert_equal(ds, actual) - def test_mask_and_scale(self) -> None: + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) + def test_mask_and_scale(self, dtype: type[np.number]) -> None: with create_tmp_file() as tmp_file: with nc4.Dataset(tmp_file, mode="w") as nc: nc.createDimension("t", 5) nc.createVariable("x", "int16", ("t",), fill_value=-1) v = nc.variables["x"] v.set_auto_maskandscale(False) - v.add_offset = 10 - v.scale_factor = 0.1 + v.add_offset = dtype(10) + v.scale_factor = dtype(0.1) v[:] = np.array([-1, -1, 0, 1, 2]) # first make sure netCDF4 reads the masked and scaled data # correctly with nc4.Dataset(tmp_file, mode="r") as nc: expected = np.ma.array( - [-1, -1, 10, 10.1, 10.2], mask=[True, True, False, False, False] + [-1, -1, 10, 10.1, 10.2], + mask=[True, True, False, False, False], + dtype=dtype, ) actual = nc.variables["x"][:] assert_array_equal(expected, actual) # now check xarray with open_dataset(tmp_file) as ds: - expected = create_masked_and_scaled_data() + expected = create_masked_and_scaled_data(dtype) assert_identical(expected, ds) + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) + @pytest.mark.parametrize("offset_cf_conforming", [True, False]) + def test_mask_and_scale_non_cf_conforming( + self, dtype: type[np.number], offset_cf_conforming: bool + ) -> None: + with create_tmp_file() as tmp_file: + with nc4.Dataset(tmp_file, mode="w") as nc: + nc.createDimension("t", 5) + nc.createVariable("x", "int16", ("t",), fill_value=-1) + v = nc.variables["x"] + v.set_auto_maskandscale(False) + if offset_cf_conforming: + v.add_offset = dtype(10) + else: + v.add_offset = 10 + v.scale_factor = dtype(0.1) + v[:] = np.array([-1, -1, 0, 1, 2]) + + # first make sure netCDF4 reads the masked and scaled data + # correctly + with nc4.Dataset(tmp_file, mode="r") as nc: + expected = np.ma.array( + [-1, -1, 10, 10.1, 10.2], + mask=[True, True, False, False, False], + dtype=dtype, + ) + actual = nc.variables["x"][:] + assert_array_equal(expected, actual) + + # now check xarray + with open_dataset(tmp_file) as ds: + if not offset_cf_conforming: + # if not conforming, this get's promoted to float64 in any case + # if add_offset is available + expected = create_masked_and_scaled_data(np.float64) + # here we have slight differences possibly + # due to using float32 first and casting to float64 later + assert_allclose(expected, ds) + else: + expected = create_masked_and_scaled_data(dtype) + assert_identical(expected, ds) + def test_0dimensional_variable(self) -> None: # This fix verifies our work-around to this netCDF4-python bug: # https://github.com/Unidata/netcdf4-python/pull/220 diff --git a/xarray/tests/test_coding.py b/xarray/tests/test_coding.py index f7579c4b488..406921a8556 100644 --- a/xarray/tests/test_coding.py +++ b/xarray/tests/test_coding.py @@ -51,9 +51,15 @@ def test_CFMaskCoder_encode_missing_fill_values_conflict(data, encoding) -> None assert encoded.dtype == encoded.attrs["missing_value"].dtype assert encoded.dtype == encoded.attrs["_FillValue"].dtype + +def test_CFMaskCoder_multiple_missing_values_conflict(): + data = np.array([0.0, -1.0, 1.0]) + attrs = dict(_FillValue=np.float64(1e20), missing_value=np.float64(1e21)) + original = xr.Variable(("x",), data, attrs=attrs) with pytest.warns(variables.SerializationWarning): - roundtripped = decode_cf_variable("foo", encoded) - assert_identical(roundtripped, original) + decoded = decode_cf_variable("foo", original) + with pytest.raises(ValueError): + encode_cf_variable(decoded) def test_CFMaskCoder_missing_value() -> None: @@ -95,10 +101,71 @@ def test_coder_roundtrip() -> None: assert_identical(original, roundtripped) -@pytest.mark.parametrize("dtype", "u1 u2 i1 i2 f2 f4".split()) -def test_scaling_converts_to_float32(dtype) -> None: +@pytest.mark.parametrize("ptype", "u1 u2 u4 i1 i2 i4".split()) +@pytest.mark.parametrize("utype", "f4 f8".split()) +def test_mask_scale_roundtrip(utype: str, ptype: str) -> None: + # this tests cf conforming packing/unpacking via + # encode_cf_variable/decode_cf_variable + # f4->i4 packing is skipped as non-conforming + if utype[1] == "4" and ptype[1] == "4": + pytest.skip("Can't pack float32 into int32/uint32") + # fillvalues according to netCDF4 + filldict = { + "i1": -127, + "u1": 255, + "i2": -32767, + "u2": 65535, + "i4": -2147483647, + "u4": 4294967295, + } + fillvalue = filldict[ptype] + unpacked_dtype = np.dtype(utype).type + packed_dtype = np.dtype(ptype).type + info = np.iinfo(packed_dtype) + + # create original "encoded" Variable + packed_data = np.array( + [info.min, fillvalue, info.max - 1, info.max], dtype=packed_dtype + ) + attrs = dict( + scale_factor=unpacked_dtype(1), + add_offset=unpacked_dtype(0), + _FillValue=packed_dtype(fillvalue), + ) + original = xr.Variable(("x",), packed_data, attrs=attrs) + + # create wanted "decoded" Variable + unpacked_data = np.array( + [info.min, fillvalue, info.max - 1, info.max], dtype=unpacked_dtype + ) + encoding = dict( + scale_factor=unpacked_dtype(1), + add_offset=unpacked_dtype(0), + _FillValue=packed_dtype(fillvalue), + ) + wanted = xr.Variable(("x"), unpacked_data, encoding=encoding) + wanted = wanted.where(wanted != fillvalue) + + # decode original and compare with wanted + decoded = decode_cf_variable("x", original) + assert wanted.dtype == decoded.dtype + xr.testing.assert_identical(wanted, decoded) + + # encode again and compare with original + encoded = encode_cf_variable(decoded) + assert original.dtype == encoded.dtype + xr.testing.assert_identical(original, encoded) + + +@pytest.mark.parametrize("unpacked_dtype", "f4 f8 i4".split()) +@pytest.mark.parametrize("packed_dtype", "u1 u2 i1 i2 f2 f4".split()) +def test_scaling_converts_to_float32(packed_dtype: str, unpacked_dtype: str) -> None: + # if scale_factor but no add_offset is given transform to float32 in any case + # this minimizes memory usage, see #1840, #1842 original = xr.Variable( - ("x",), np.arange(10, dtype=dtype), encoding=dict(scale_factor=10) + ("x",), + np.arange(10, dtype=packed_dtype), + encoding=dict(scale_factor=np.dtype(unpacked_dtype).type(10)), ) coder = variables.CFScaleOffsetCoder() encoded = coder.encode(original) @@ -108,10 +175,31 @@ def test_scaling_converts_to_float32(dtype) -> None: assert roundtripped.dtype == np.float32 +@pytest.mark.parametrize("unpacked_dtype", "f4 f8 i4".split()) +@pytest.mark.parametrize("packed_dtype", "u1 u2 i1 i2 f2 f4".split()) +def test_scaling_converts_to_float64( + packed_dtype: str, unpacked_dtype: type[np.number] +) -> None: + # if add_offset is given, but no scale_factor transform to float64 in any case + # to prevent precision issues + original = xr.Variable( + ("x",), + np.arange(10, dtype=packed_dtype), + encoding=dict(add_offset=np.dtype(unpacked_dtype).type(10)), + ) + coder = variables.CFScaleOffsetCoder() + encoded = coder.encode(original) + assert encoded.dtype == np.float64 + roundtripped = coder.decode(encoded) + assert_identical(original, roundtripped) + assert roundtripped.dtype == np.float64 + + @pytest.mark.parametrize("scale_factor", (10, [10])) @pytest.mark.parametrize("add_offset", (0.1, [0.1])) def test_scaling_offset_as_list(scale_factor, add_offset) -> None: # test for #4631 + # attention: scale_factor and add_offset are not conforming to cf specs here encoding = dict(scale_factor=scale_factor, add_offset=add_offset) original = xr.Variable(("x",), np.arange(10.0), encoding=encoding) coder = variables.CFScaleOffsetCoder() diff --git a/xarray/tests/test_conventions.py b/xarray/tests/test_conventions.py index 9485b506b89..6d219f09e0e 100644 --- a/xarray/tests/test_conventions.py +++ b/xarray/tests/test_conventions.py @@ -32,7 +32,7 @@ class TestBoolTypeArray: def test_booltype_array(self) -> None: x = np.array([1, 0, 1, 1, 0], dtype="i1") - bx = conventions.BoolTypeArray(x) + bx = coding.variables.BoolTypeArray(x) assert bx.dtype == bool assert_array_equal(bx, np.array([True, False, True, True, False], dtype=bool)) @@ -41,7 +41,7 @@ class TestNativeEndiannessArray: def test(self) -> None: x = np.arange(5, dtype=">i8") expected = np.arange(5, dtype="int64") - a = conventions.NativeEndiannessArray(x) + a = coding.variables.NativeEndiannessArray(x) assert a.dtype == expected.dtype assert a.dtype == expected[:].dtype assert_array_equal(a, expected) @@ -247,7 +247,7 @@ def test_decode_coordinates(self) -> None: def test_0d_int32_encoding(self) -> None: original = Variable((), np.int32(0), encoding={"dtype": "int64"}) expected = Variable((), np.int64(0)) - actual = conventions.maybe_encode_nonstring_dtype(original) + actual = coding.variables.NonStringCoder().encode(original) assert_identical(expected, actual) def test_decode_cf_with_multiple_missing_values(self) -> None: