diff --git a/doc/user-guide/io.rst b/doc/user-guide/io.rst index cf2b0b4b3ea..5de1e73f043 100644 --- a/doc/user-guide/io.rst +++ b/doc/user-guide/io.rst @@ -519,8 +519,9 @@ difference between values, with a reference time of the first time value. Coordinates ........... -You can control the ``coordinates`` attribute written to disk by specifying ``DataArray.encoding["coordinates"]``. -If not specified, xarray automatically sets ``DataArray.encoding["coordinates"]`` to a space-delimited list +You can control the ``coordinates`` attribute written to disk by specifying the ``coordinates`` key in the +output encoding keyword argument to `to_netcdf` or `to_zarr`. If not specified, xarray automatically sets +the ``coordinates`` attribute for each variable to a space-delimited list of names of coordinate variables that share dimensions with the ``DataArray`` being written. This allows perfect roundtripping of xarray datasets but may not be desirable. When an xarray ``Dataset`` contains non-dimensional coordinates that do not share dimensions with any of diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 681fb1f070b..f133a28ee7e 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -36,7 +36,9 @@ Breaking changes Deprecations ~~~~~~~~~~~~ - +- Setting encoding on Xarray objects directly via the ``encoding`` attribute is deprecated in favor of using the + ``encoding`` kwarg in :py:meth:`Dataset.to_netcdf` / :py:meth:`Dataset.to_zarr`. (:issue:`6323`, :pull:`7708`). + By `Joe Hamman `_. Bug fixes ~~~~~~~~~ diff --git a/xarray/backends/api.py b/xarray/backends/api.py index e5adedbb576..3acdb7596dc 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -1254,7 +1254,7 @@ def dump_to_store( for k, enc in encoding.items(): # no need to shallow copy the variable again; that already happened # in encode_dataset_coordinates - variables[k].encoding = enc + variables[k]._set_encoding_internal(enc) check_encoding.add(k) if encoder: diff --git a/xarray/backends/store.py b/xarray/backends/store.py index a507ee37470..06d829a58c1 100644 --- a/xarray/backends/store.py +++ b/xarray/backends/store.py @@ -58,7 +58,7 @@ def open_dataset( # type: ignore[override] # allow LSP violation, not supporti ds = Dataset(vars, attrs=attrs) ds = ds.set_coords(coord_names.intersection(vars)) ds.set_close(filename_or_obj.close) - ds.encoding = encoding + ds._set_encoding_internal(encoding) return ds diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 7d21c771e06..41d1a6a9e4b 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -594,7 +594,9 @@ def store( vars_with_encoding = {} for vn in existing_variable_names: vars_with_encoding[vn] = variables[vn].copy(deep=False) - vars_with_encoding[vn].encoding = existing_vars[vn].encoding + vars_with_encoding[vn]._set_encoding_internal( + existing_vars[vn].encoding + ) vars_with_encoding, _ = self.encode(vars_with_encoding, {}) variables_encoded.update(vars_with_encoding) @@ -650,7 +652,7 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No fill_value = attrs.pop("_FillValue", None) if v.encoding == {"_FillValue": None} and fill_value is None: - v.encoding = {} + v._encoding = {} if name in self.zarr_group: # existing variable diff --git a/xarray/conventions.py b/xarray/conventions.py index 780172879c6..74e011f51e8 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -694,7 +694,7 @@ def decode_cf( ds = Dataset(vars, attrs=attrs) ds = ds.set_coords(coord_names.union(extra_coords).intersection(vars)) ds.set_close(close) - ds.encoding = encoding + ds._set_encoding_internal(encoding) return ds diff --git a/xarray/core/alignment.py b/xarray/core/alignment.py index 05e2ca7eb8b..1eaf5084f53 100644 --- a/xarray/core/alignment.py +++ b/xarray/core/alignment.py @@ -548,7 +548,7 @@ def _reindex_one( self.exclude_dims, self.exclude_vars, ) - new_obj.encoding = obj.encoding + new_obj._set_encoding_internal(obj.encoding) return new_obj def reindex_all(self) -> None: diff --git a/xarray/core/concat.py b/xarray/core/concat.py index f092911948f..4e4efaf83b8 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -643,7 +643,7 @@ def get_indexes(name): f"Variables {absent_coord_names!r} are coordinates in some datasets but not others." ) result = result.set_coords(coord_names) - result.encoding = result_encoding + result._set_encoding_internal(result_encoding) result = result.drop_vars(unlabeled_dims, errors="ignore") diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index ba55837ef7e..d5f32dfa586 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -870,13 +870,23 @@ def attrs(self, value: Mapping[Any, Any]) -> None: @property def encoding(self) -> dict[Any, Any]: """Dictionary of format-specific settings for how this array should be - serialized.""" + serialized. + + .. warning:: + Setting encoding directly using the encoding property is deprecated. + Use the encoding kwarg in to_netcdf/to_zarr to set output encoding. + """ return self.variable.encoding @encoding.setter def encoding(self, value: Mapping[Any, Any]) -> None: + # deprecated (variable setter will warn) self.variable.encoding = dict(value) + def _set_encoding_internal(self, value: Mapping[Any, Any]) -> None: + """temporary method to set encoding without issuing a FutureWarning""" + self.variable._set_encoding_internal(value) + def reset_encoding(self: T_DataArray) -> T_DataArray: """Return a new DataArray without encoding on the array or any attached coords.""" diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index d3a5ddd470e..d767ab96051 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -657,14 +657,32 @@ def attrs(self, value: Mapping[Any, Any]) -> None: @property def encoding(self) -> dict[Any, Any]: - """Dictionary of global encoding attributes on this dataset""" + """Dictionary of global encoding attributes on this dataset + + .. warning:: + Setting encoding directly using the encoding property is deprecated. + Use the encoding kwarg in to_netcdf/to_zarr to set output encoding. + """ if self._encoding is None: self._encoding = {} return self._encoding @encoding.setter def encoding(self, value: Mapping[Any, Any]) -> None: - self._encoding = dict(value) + warnings.warn( + "Setting encoding directly using the encoding property is " + "deprecated. Use the encoding kwarg in to_netcdf/to_zarr to " + "set output encoding.", + category=FutureWarning, + ) + self._set_encoding_internal(value) + + def _set_encoding_internal(self, value: Mapping[Any, Any]) -> None: + """temporary method to set encoding without issuing a FutureWarning""" + try: + self._encoding = dict(value) + except ValueError: + raise ValueError("encoding must be castable to a dictionary") def reset_encoding(self: T_Dataset) -> T_Dataset: """Return a new Dataset without encoding on the dataset or any of its @@ -2821,7 +2839,7 @@ def _reindex_callback( var = self._variables.get(name) if var is not None: new_var.attrs = var.attrs - new_var.encoding = var.encoding + new_var._set_encoding_internal(var.encoding) # pass through indexes from excluded dimensions # no extra check needed for multi-coordinate indexes, potential conflicts diff --git a/xarray/core/parallel.py b/xarray/core/parallel.py index 2f8612c5a9b..d8d2edeee7a 100644 --- a/xarray/core/parallel.py +++ b/xarray/core/parallel.py @@ -553,7 +553,7 @@ def subset_dataset_to_block( for index in result._indexes: result[index].attrs = template[index].attrs - result[index].encoding = template[index].encoding + result[index]._set_encoding_internal(template[index].encoding) for name, gname_l in var_key_map.items(): dims = template[name].dims @@ -571,7 +571,7 @@ def subset_dataset_to_block( hlg, name=gname_l, chunks=var_chunks, dtype=template[name].dtype ) result[name] = (dims, data, template[name].attrs) - result[name].encoding = template[name].encoding + result[name]._set_encoding_internal(template[name].encoding) result = result.set_coords(template._coord_names) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 9fe3c953aa6..cb4fe7b265c 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -365,7 +365,7 @@ def __init__(self, dims, data, attrs=None, encoding=None, fastpath=False): if attrs is not None: self.attrs = attrs if encoding is not None: - self.encoding = encoding + self._set_encoding_internal(encoding) @property def dtype(self): @@ -969,13 +969,28 @@ def attrs(self, value: Mapping[Any, Any]) -> None: @property def encoding(self) -> dict[Any, Any]: - """Dictionary of encodings on this variable.""" + """Dictionary of encodings on this variable. + + .. warning:: + Setting encoding directly using the encoding property is deprecated. + Use the encoding kwarg in to_netcdf/to_zarr to set output encoding. + """ if self._encoding is None: self._encoding = {} return self._encoding @encoding.setter - def encoding(self, value): + def encoding(self, value: Mapping[Any, Any]) -> None: + warnings.warn( + "Setting encoding directly using the encoding property is " + "deprecated. Use the encoding kwarg in to_netcdf/to_zarr to " + "set output encoding.", + category=FutureWarning, + ) + self._set_encoding_internal(value) + + def _set_encoding_internal(self, value: Mapping[Any, Any]) -> None: + """temporary method to set encoding without issuing a FutureWarning""" try: self._encoding = dict(value) except ValueError: diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index 7e1b964ecba..919f5d0ee51 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -261,7 +261,7 @@ def create_test_data(seed: int | None = None, add_attrs: bool = True) -> Dataset "dim3", np.array([0, 1, 2, 0, 0, 1, 1, 2, 2, 3], dtype="int64"), ) - obj.encoding = {"foo": "bar"} + obj._encoding = {"foo": "bar"} assert all(obj.data.flags.writeable for obj in obj.variables.values()) return obj diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 2b1e76774ca..e18b96eadab 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -512,6 +512,7 @@ def test_roundtrip_string_data(self) -> None: with self.roundtrip(expected) as actual: assert_identical(expected, actual) + @pytest.mark.filterwarnings("ignore:Setting encoding directly.*:FutureWarning") def test_roundtrip_string_encoded_characters(self) -> None: expected = Dataset({"x": ("t", ["ab", "cdef"])}) expected["x"].encoding["dtype"] = "S1" @@ -1504,6 +1505,7 @@ def test_encoding_kwarg_compression(self) -> None: assert ds.x.encoding == {} + @pytest.mark.filterwarnings("ignore:Setting encoding directly.*:FutureWarning") def test_keep_chunksizes_if_no_original_shape(self) -> None: ds = Dataset({"x": [1, 2, 3]}) chunksizes = (2,) @@ -1515,6 +1517,7 @@ def test_keep_chunksizes_if_no_original_shape(self) -> None: ds["x"].encoding["chunksizes"], actual["x"].encoding["chunksizes"] ) + @pytest.mark.filterwarnings("ignore:Setting encoding directly.*:FutureWarning") def test_encoding_chunksizes_unlimited(self) -> None: # regression test for GH1225 ds = Dataset({"x": [1, 2, 3], "y": ("x", [2, 3, 4])}) @@ -1581,6 +1584,7 @@ def test_read_variable_len_strings(self) -> None: with open_dataset(tmp_file, **cast(dict, kwargs)) as actual: assert_identical(expected, actual) + @pytest.mark.filterwarnings("ignore:Setting encoding directly.*:FutureWarning") def test_encoding_unlimited_dims(self) -> None: ds = Dataset({"x": ("y", np.arange(10.0))}) with self.roundtrip(ds, save_kwargs=dict(unlimited_dims=["y"])) as actual: @@ -2855,6 +2859,7 @@ def test_cross_engine_read_write_netcdf3(self) -> None: for k in data.variables ] + @pytest.mark.filterwarnings("ignore:Setting encoding directly.*:FutureWarning") def test_encoding_unlimited_dims(self) -> None: ds = Dataset({"x": ("y", np.arange(10.0))}) with self.roundtrip(ds, save_kwargs=dict(unlimited_dims=["y"])) as actual: @@ -2938,6 +2943,7 @@ def test_read_byte_attrs_as_unicode(self) -> None: expected = Dataset(attrs={"foo": "bar"}) assert_identical(expected, actual) + @pytest.mark.filterwarnings("ignore:Setting encoding directly.*:FutureWarning") def test_encoding_unlimited_dims(self) -> None: ds = Dataset({"x": ("y", np.arange(10.0))}) with self.roundtrip(ds, save_kwargs=dict(unlimited_dims=["y"])) as actual: diff --git a/xarray/tests/test_coding.py b/xarray/tests/test_coding.py index f7579c4b488..16a37dde678 100644 --- a/xarray/tests/test_coding.py +++ b/xarray/tests/test_coding.py @@ -56,6 +56,7 @@ def test_CFMaskCoder_encode_missing_fill_values_conflict(data, encoding) -> None assert_identical(roundtripped, original) +@pytest.mark.filterwarnings("ignore:Setting encoding directly.*:FutureWarning") def test_CFMaskCoder_missing_value() -> None: expected = xr.DataArray( np.array([[26915, 27755, -9999, 27705], [25595, -9999, 28315, -9999]]), diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 580de878fe6..9338da914b1 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -712,6 +712,7 @@ def test_decode_cf_time_bounds() -> None: @requires_cftime +@pytest.mark.filterwarnings("ignore:Setting encoding directly.*:FutureWarning") def test_encode_time_bounds() -> None: time = pd.date_range("2000-01-16", periods=1) time_bounds = pd.date_range("2000-01-01", periods=2, freq="MS") diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index f60308f8863..84d05ba46e6 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -1015,6 +1015,7 @@ def test_concat(self) -> None: with pytest.raises(ValueError, match=r"not a valid argument"): concat([foo, bar], dim="w", data_vars="minimal") + @pytest.mark.filterwarnings("ignore:Setting encoding directly.*:FutureWarning") def test_concat_encoding(self) -> None: # Regression test for GH1297 ds = Dataset( diff --git a/xarray/tests/test_conventions.py b/xarray/tests/test_conventions.py index 9485b506b89..17db1b17b20 100644 --- a/xarray/tests/test_conventions.py +++ b/xarray/tests/test_conventions.py @@ -154,6 +154,7 @@ def test_var_with_coord_attr(self) -> None: # Should not have any global coordinates. assert "coordinates" not in attrs + @pytest.mark.filterwarnings("ignore:Setting encoding directly.*:FutureWarning") def test_do_not_overwrite_user_coordinates(self) -> None: orig = Dataset( coords={"x": [0, 1, 2], "y": ("x", [5, 6, 7]), "z": ("x", [8, 9, 10])}, diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 4ffa95e16e6..aae467acb31 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -269,6 +269,7 @@ def test_sizes(self) -> None: with pytest.raises(TypeError): array.sizes["foo"] = 5 # type: ignore + @pytest.mark.filterwarnings("ignore:Setting encoding directly.*:FutureWarning") def test_encoding(self) -> None: expected = {"foo": "bar"} self.dv.encoding["foo"] = "bar" @@ -278,6 +279,7 @@ def test_encoding(self) -> None: self.dv.encoding = expected2 assert expected2 is not self.dv.encoding + @pytest.mark.filterwarnings("ignore:Setting encoding directly.*:FutureWarning") def test_reset_encoding(self) -> None: array = self.mda encoding = {"scale_factor": 10} @@ -3324,6 +3326,7 @@ def test_series_categorical_index(self) -> None: arr = DataArray(s) assert "'a'" in repr(arr) # should not error + @pytest.mark.filterwarnings("ignore:Setting encoding directly.*:FutureWarning") @pytest.mark.parametrize("encoding", [True, False]) def test_to_and_from_dict(self, encoding) -> None: array = DataArray( @@ -6735,3 +6738,12 @@ def test_error_on_ellipsis_without_list(self) -> None: da = DataArray([[1, 2], [1, 2]], dims=("x", "y")) with pytest.raises(ValueError): da.stack(flat=...) # type: ignore + + +def test_setting_encoding_property_warns_deprecated(): + da = DataArray([0, 1, 2]) + with pytest.warns( + FutureWarning, + match=r"Setting encoding directly using the encoding property is deprecated.*", + ): + da.encoding = {"dtype": "f4"} diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 422166234aa..5b9a07a2635 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -2873,6 +2873,7 @@ def test_copy_with_data_errors(self) -> None: with pytest.raises(ValueError, match=r"contain all variables in original"): orig.copy(data={"var1": new_var1}) + @pytest.mark.filterwarnings("ignore:Setting encoding directly.*:FutureWarning") def test_reset_encoding(self) -> None: orig = create_test_data() vencoding = {"scale_factor": 10} @@ -3037,6 +3038,7 @@ def test_rename_multiindex(self) -> None: with pytest.warns(UserWarning, match="does not create an index anymore"): original.rename({"a": "b"}) + @pytest.mark.filterwarnings("ignore:Setting encoding directly.*:FutureWarning") def test_rename_perserve_attrs_encoding(self) -> None: # test propagate attrs/encoding to new variable(s) created from Index object original = Dataset(coords={"x": ("x", [0, 1, 2])}) @@ -7024,3 +7026,18 @@ def test_transpose_error() -> None: ), ): ds.transpose(["y", "x"]) # type: ignore + + +def test_setting_encoding_property_warns_deprecated(): + ds = xr.Dataset({"foo": (("x", "y"), [[21]]), "bar": (("x", "y"), [[12]])}) + with pytest.warns( + FutureWarning, + match=r"Setting encoding directly using the encoding property is deprecated.*", + ): + ds.encoding = {"unlimited_dims": {"x"}} + + with pytest.warns( + FutureWarning, + match=r"Setting encoding directly using the encoding property is deprecated.*", + ): + ds["foo"].encoding = {"dtype": "f4"} diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 2f571eb9a83..f3103ae4186 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -2963,3 +2963,12 @@ def test_pandas_two_only_timedelta_conversion_warning() -> None: var = Variable(["time"], data) assert var.dtype == np.dtype("timedelta64[ns]") + + +def test_setting_encoding_property_warns_deprecated(): + v = Variable("x", [1, 2, 3]) + with pytest.warns( + FutureWarning, + match=r"Setting encoding directly using the encoding property is deprecated.*", + ): + v.encoding = {"dtype": "f4"}