diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index bfc040eb271..0a262585a41 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -102,6 +102,10 @@ New Features
 - Added ability to save ``DataArray`` objects directly to Zarr using :py:meth:`~xarray.DataArray.to_zarr`.
   (:issue:`7692`, :pull:`7693`) .
   By `Joe Hamman `_.
+- Passing `data='array'` to both :py:meth:`xarray.Dataset.to_dict` and
+  :py:meth:`xarray.DataArray.to_dict` now returns data as the underlying array type. Python lists are returned for `data='list'` or `data=True`. Supplying `data=False` returns only the schema without data. ``encoding=True`` also returns the encoding dictionary for the underlying variables.
+  (:issue:`1599`, :pull:`7739`).
+  By `James McCreight `_.
 
 Breaking changes
 ~~~~~~~~~~~~~~~~
diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
index f26085bda58..7c2d02f7b8e 100644
--- a/xarray/core/dataarray.py
+++ b/xarray/core/dataarray.py
@@ -4174,7 +4174,9 @@ def to_zarr(
             zarr_version=zarr_version,
         )
 
-    def to_dict(self, data: bool = True, encoding: bool = False) -> dict[str, Any]:
+    def to_dict(
+        self, data: bool | Literal["list", "array"] = "list", encoding: bool = False
+    ) -> dict[str, Any]:
         """
         Convert this xarray.DataArray into a dictionary following xarray
         naming conventions.
@@ -4185,9 +4187,14 @@ def to_dict(self, data: bool = True, encoding: bool = False) -> dict[str, Any]:
 
         Parameters
         ----------
-        data : bool, default: True
+        data : bool or {"list", "array"}, default: "list"
             Whether to include the actual data in the dictionary. When set to
-            False, returns just the schema.
+            False, returns just the schema. If set to "array", returns data as
+            the underlying array type. If set to "list" (or True for backwards
+            compatibility), returns data as lists of Python data types. Note
+            that for obtaining the "list" output efficiently, use
+            `da.compute().to_dict(data="list")`.
+
         encoding : bool, default: False
             Whether to include the Dataset's encoding in the dictionary.
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index 74c961bd7a0..7d903d9432d 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -6441,7 +6441,9 @@ def to_dask_dataframe(
 
         return df
 
-    def to_dict(self, data: bool = True, encoding: bool = False) -> dict[str, Any]:
+    def to_dict(
+        self, data: bool | Literal["list", "array"] = "list", encoding: bool = False
+    ) -> dict[str, Any]:
         """
         Convert this dataset to a dictionary following xarray naming
         conventions.
@@ -6452,9 +6454,14 @@ def to_dict(self, data: bool = True, encoding: bool = False) -> dict[str, Any]:
 
         Parameters
         ----------
-        data : bool, default: True
+        data : bool or {"list", "array"}, default: "list"
             Whether to include the actual data in the dictionary. When set to
-            False, returns just the schema.
+            False, returns just the schema. If set to "array", returns data as
+            the underlying array type. If set to "list" (or True for backwards
+            compatibility), returns data as lists of Python data types. Note
+            that for obtaining the "list" output efficiently, use
+            `ds.compute().to_dict(data="list")`.
+
         encoding : bool, default: False
             Whether to include the Dataset's encoding in the dictionary.
@@ -6560,7 +6567,8 @@ def from_dict(cls: type[T_Dataset], d: Mapping[Any, Any]) -> T_Dataset:
             )
         try:
             variable_dict = {
-                k: (v["dims"], v["data"], v.get("attrs")) for k, v in variables
+                k: (v["dims"], v["data"], v.get("attrs"), v.get("encoding"))
+                for k, v in variables
             }
         except KeyError as e:
             raise ValueError(
diff --git a/xarray/core/variable.py b/xarray/core/variable.py
index 9fe3c953aa6..c19cb21cba2 100644
--- a/xarray/core/variable.py
+++ b/xarray/core/variable.py
@@ -633,11 +633,23 @@ def to_index(self) -> pd.Index:
         """Convert this variable to a pandas.Index"""
         return self.to_index_variable().to_index()
 
-    def to_dict(self, data: bool = True, encoding: bool = False) -> dict:
+    def to_dict(
+        self, data: bool | str = "list", encoding: bool = False
+    ) -> dict[str, Any]:
         """Dictionary representation of variable."""
-        item = {"dims": self.dims, "attrs": decode_numpy_dict_values(self.attrs)}
-        if data:
-            item["data"] = ensure_us_time_resolution(self.values).tolist()
+        item: dict[str, Any] = {
+            "dims": self.dims,
+            "attrs": decode_numpy_dict_values(self.attrs),
+        }
+        if data is not False:
+            if data in [True, "list"]:
+                item["data"] = ensure_us_time_resolution(self.to_numpy()).tolist()
+            elif data == "array":
+                item["data"] = ensure_us_time_resolution(self.data)
+            else:
+                msg = 'data argument must be bool, "list", or "array"'
+                raise ValueError(msg)
+
         else:
             item.update({"dtype": str(self.dtype), "shape": self.shape})
diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py
index dcbfd42c9f1..a27fb4defb9 100644
--- a/xarray/tests/test_dataarray.py
+++ b/xarray/tests/test_dataarray.py
@@ -6,7 +6,7 @@
 from collections.abc import Hashable
 from copy import deepcopy
 from textwrap import dedent
-from typing import Any, Final, cast
+from typing import Any, Final, Literal, cast
 
 import numpy as np
 import pandas as pd
@@ -3345,46 +3345,70 @@ def test_series_categorical_index(self) -> None:
         arr = DataArray(s)
         assert "'a'" in repr(arr)  # should not error
 
+    @pytest.mark.parametrize("use_dask", [True, False])
+    @pytest.mark.parametrize("data", ["list", "array", True])
     @pytest.mark.parametrize("encoding", [True, False])
-    def test_to_and_from_dict(self, encoding) -> None:
+    def test_to_and_from_dict(
+        self, encoding: bool, data: bool | Literal["list", "array"], use_dask: bool
+    ) -> None:
+        if use_dask and not has_dask:
+            pytest.skip("requires dask")
+        encoding_data = {"bar": "spam"}
         array = DataArray(
             np.random.randn(2, 3), {"x": ["a", "b"]}, ["x", "y"], name="foo"
         )
-        array.encoding = {"bar": "spam"}
-        expected = {
+        array.encoding = encoding_data
+
+        return_data = array.to_numpy()
+        coords_data = np.array(["a", "b"])
+        if data == "list" or data is True:
+            return_data = return_data.tolist()
+            coords_data = coords_data.tolist()
+
+        expected: dict[str, Any] = {
             "name": "foo",
             "dims": ("x", "y"),
-            "data": array.values.tolist(),
+            "data": return_data,
             "attrs": {},
-            "coords": {"x": {"dims": ("x",), "data": ["a", "b"], "attrs": {}}},
+            "coords": {"x": {"dims": ("x",), "data": coords_data, "attrs": {}}},
         }
         if encoding:
-            expected["encoding"] = {"bar": "spam"}
-        actual = array.to_dict(encoding=encoding)
+            expected["encoding"] = encoding_data
+
+        if use_dask:
+            da = array.chunk()
+        else:
+            da = array
+
+        if data == "array" or data is False:
+            with raise_if_dask_computes():
+                actual = da.to_dict(encoding=encoding, data=data)
+        else:
+            actual = da.to_dict(encoding=encoding, data=data)
 
         # check that they are identical
-        assert expected == actual
+        np.testing.assert_equal(expected, actual)
 
         # check roundtrip
-        assert_identical(array, DataArray.from_dict(actual))
+        assert_identical(da, DataArray.from_dict(actual))
 
         # a more bare bones representation still roundtrips
         d = {
             "name": "foo",
             "dims": ("x", "y"),
-            "data": array.values.tolist(),
+            "data": da.values.tolist(),
             "coords": {"x": {"dims": "x", "data": ["a", "b"]}},
         }
-        assert_identical(array, DataArray.from_dict(d))
+        assert_identical(da, DataArray.from_dict(d))
 
         # and the most bare bones representation still roundtrips
-        d = {"name": "foo", "dims": ("x", "y"), "data": array.values}
-        assert_identical(array.drop_vars("x"), DataArray.from_dict(d))
+        d = {"name": "foo", "dims": ("x", "y"), "data": da.values}
+        assert_identical(da.drop_vars("x"), DataArray.from_dict(d))
 
         # missing a dims in the coords
         d = {
             "dims": ("x", "y"),
-            "data": array.values,
+            "data": da.values,
             "coords": {"x": {"data": ["a", "b"]}},
         }
         with pytest.raises(
@@ -3407,7 +3431,7 @@ def test_to_and_from_dict(self, encoding) -> None:
         endiantype = "<U1" if sys.byteorder == "little" else ">U1"
         expected_no_data["coords"]["x"].update({"dtype": endiantype, "shape": (2,)})
         expected_no_data.update({"dtype": "float64", "shape": (2, 3)})
-        actual_no_data = array.to_dict(data=False, encoding=encoding)
+        actual_no_data = da.to_dict(data=False, encoding=encoding)
         assert expected_no_data == actual_no_data
 
     def test_to_and_from_dict_with_time_dim(self) -> None:
diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py
index 45286727f0a..cc9220dfe33 100644
--- a/xarray/tests/test_dataset.py
+++ b/xarray/tests/test_dataset.py
@@ -8,7 +8,7 @@
 from copy import copy, deepcopy
 from io import StringIO
 from textwrap import dedent
-from typing import Any
+from typing import Any, Literal
 
 import numpy as np
 import pandas as pd
@@ -4596,7 +4596,11 @@ def test_convert_dataframe_with_many_types_and_multiindex(self) -> None:
         expected = df.apply(np.asarray)
         assert roundtripped.equals(expected)
 
-    def test_to_and_from_dict(self) -> None:
+    @pytest.mark.parametrize("encoding", [True, False])
+    @pytest.mark.parametrize("data", [True, "list", "array"])
+    def test_to_and_from_dict(
+        self, encoding: bool, data: bool | Literal["list", "array"]
+    ) -> None:
         # <xarray.Dataset>
         # Dimensions:  (t: 10)
         # Coordinates:
@@ -4617,14 +4621,25 @@ def test_to_and_from_dict(self) -> None:
                 "b": {"dims": ("t",), "data": y.tolist(), "attrs": {}},
             },
         }
+        if encoding:
+            ds.t.encoding.update({"foo": "bar"})
+            expected["encoding"] = {}
+            expected["coords"]["t"]["encoding"] = ds.t.encoding
+            for vvs in ["a", "b"]:
+                expected["data_vars"][vvs]["encoding"] = {}
 
-        actual = ds.to_dict()
+        actual = ds.to_dict(data=data, encoding=encoding)
 
         # check that they are identical
-        assert expected == actual
+        np.testing.assert_equal(expected, actual)
 
         # check roundtrip
-        assert_identical(ds, Dataset.from_dict(actual))
+        ds_rt = Dataset.from_dict(actual)
+        assert_identical(ds, ds_rt)
+        if encoding:
+            assert set(ds_rt.variables) == set(ds.variables)
+            for vv in ds.variables:
+                np.testing.assert_equal(ds_rt[vv].encoding, ds[vv].encoding)
 
         # check the data=False option
         expected_no_data = expected.copy()
@@ -4635,14 +4650,18 @@ def test_to_and_from_dict(self) -> None:
         expected_no_data["coords"]["t"].update({"dtype": endiantype, "shape": (10,)})
         expected_no_data["data_vars"]["a"].update({"dtype": "float64", "shape": (10,)})
         expected_no_data["data_vars"]["b"].update({"dtype": "float64", "shape": (10,)})
-        actual_no_data = ds.to_dict(data=False)
+        actual_no_data = ds.to_dict(data=False, encoding=encoding)
         assert expected_no_data == actual_no_data
 
         # verify coords are included roundtrip
         expected_ds = ds.set_coords("b")
-        actual2 = Dataset.from_dict(expected_ds.to_dict())
+        actual2 = Dataset.from_dict(expected_ds.to_dict(data=data, encoding=encoding))
 
         assert_identical(expected_ds, actual2)
+        if encoding:
+            assert set(expected_ds.variables) == set(actual2.variables)
+            for vv in ds.variables:
+                np.testing.assert_equal(expected_ds[vv].encoding, actual2[vv].encoding)
 
         # test some incomplete dicts:
         # this one has no attrs field, the dims are strings, and x, y are
@@ -4690,7 +4709,10 @@ def test_to_and_from_dict_with_time_dim(self) -> None:
         roundtripped = Dataset.from_dict(ds.to_dict())
         assert_identical(ds, roundtripped)
 
-    def test_to_and_from_dict_with_nan_nat(self) -> None:
+    @pytest.mark.parametrize("data", [True, "list", "array"])
+    def test_to_and_from_dict_with_nan_nat(
+        self, data: bool | Literal["list", "array"]
+    ) -> None:
         x = np.random.randn(10, 3)
         y = np.random.randn(10, 3)
         y[2] = np.nan
@@ -4706,7 +4728,7 @@ def test_to_and_from_dict_with_nan_nat(self) -> None:
                 "lat": ("lat", lat),
             }
         )
-        roundtripped = Dataset.from_dict(ds.to_dict())
+        roundtripped = Dataset.from_dict(ds.to_dict(data=data))
         assert_identical(ds, roundtripped)
 
     def test_to_dict_with_numpy_attrs(self) -> None:
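
Usage sketch (not part of the patch): the snippet below illustrates how the
to_dict options introduced above are intended to behave. The dataset "ds" and
the encoding key "foo" are made-up values for illustration only.

    import numpy as np
    import xarray as xr

    # a small in-memory dataset with one data variable and one coordinate
    ds = xr.Dataset({"a": ("t", np.arange(3.0))}, coords={"t": ["x", "y", "z"]})
    ds["a"].encoding.update({"foo": "bar"})

    d_list = ds.to_dict()               # default data="list": values as Python lists
    d_array = ds.to_dict(data="array")  # values kept as the underlying array type
    schema = ds.to_dict(data=False)     # schema only: dtype and shape instead of data
    d_enc = ds.to_dict(encoding=True)   # each variable dict also carries its encoding

    # round-trip; per-variable encoding is restored when it was exported
    ds_rt = xr.Dataset.from_dict(d_enc)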