diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index b80ec927b1e..38f96270c99 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -36,6 +36,7 @@
 from xarray.backends.locks import _get_scheduler
 from xarray.coders import CFDatetimeCoder, CFTimedeltaCoder
 from xarray.core import indexing
+from xarray.core.coordinates import Coordinates
 from xarray.core.dataarray import DataArray
 from xarray.core.dataset import Dataset
 from xarray.core.datatree import DataTree
@@ -379,6 +380,18 @@ def _chunk_ds(
     return backend_ds._replace(variables)
 
 
+def _maybe_create_default_indexes(ds, create_default_indexes):
+    if not create_default_indexes:
+        return ds
+
+    to_index = {
+        name: coord.variable
+        for name, coord in ds.coords.items()
+        if coord.dims == (name,) and name not in ds.xindexes
+    }
+    return ds.assign_coords(Coordinates(to_index))
+
+
 def _dataset_from_backend_dataset(
     backend_ds,
     filename_or_obj,
@@ -389,6 +402,7 @@ def _dataset_from_backend_dataset(
     inline_array,
     chunked_array_type,
     from_array_kwargs,
+    create_default_indexes,
     **extra_tokens,
 ):
     if not isinstance(chunks, int | dict) and chunks not in {None, "auto"}:
@@ -397,11 +411,14 @@ def _dataset_from_backend_dataset(
         )
 
     _protect_dataset_variables_inplace(backend_ds, cache)
+
+    indexed = _maybe_create_default_indexes(backend_ds, create_default_indexes)
+
     if chunks is None:
-        ds = backend_ds
+        ds = indexed
     else:
         ds = _chunk_ds(
-            backend_ds,
+            indexed,
             filename_or_obj,
             engine,
             chunks,
@@ -434,6 +451,7 @@ def _datatree_from_backend_datatree(
     inline_array,
     chunked_array_type,
     from_array_kwargs,
+    create_default_indexes,
     **extra_tokens,
 ):
     if not isinstance(chunks, int | dict) and chunks not in {None, "auto"}:
@@ -448,7 +466,9 @@ def _datatree_from_backend_datatree(
     tree = DataTree.from_dict(
         {
             path: _chunk_ds(
-                node.dataset,
+                node.dataset.pipe(
+                    _maybe_create_default_indexes, create_default_indexes
+                ),
                 filename_or_obj,
                 engine,
                 chunks,
@@ -497,6 +517,7 @@ def open_dataset(
     concat_characters: bool | Mapping[str, bool] | None = None,
     decode_coords: Literal["coordinates", "all"] | bool | None = None,
     drop_variables: str | Iterable[str] | None = None,
+    create_default_indexes: bool = True,
     inline_array: bool = False,
     chunked_array_type: str | None = None,
     from_array_kwargs: dict[str, Any] | None = None,
@@ -610,6 +631,13 @@ def open_dataset(
         A variable or list of variables to exclude from being parsed from the
         dataset. This may be useful to drop variables with problems or
         inconsistent values.
+    create_default_indexes : bool, default: True
+        If True, create pandas indexes for :term:`dimension coordinates <dimension coordinate>`,
+        which loads the coordinate data into memory. Set it to False if you want to avoid loading
+        data into memory.
+
+        Note that backends can still choose to create other indexes. If you want to control that,
+        please refer to the backend's documentation.
     inline_array: bool, default: False
         How to include the array in the dask task graph.
         By default(``inline_array=False``) the array is included in a task by
@@ -702,6 +730,7 @@ def open_dataset(
         chunked_array_type,
         from_array_kwargs,
         drop_variables=drop_variables,
+        create_default_indexes=create_default_indexes,
         **decoders,
         **kwargs,
     )
@@ -725,6 +754,7 @@ def open_dataarray(
     concat_characters: bool | None = None,
     decode_coords: Literal["coordinates", "all"] | bool | None = None,
     drop_variables: str | Iterable[str] | None = None,
+    create_default_indexes: bool = True,
     inline_array: bool = False,
     chunked_array_type: str | None = None,
     from_array_kwargs: dict[str, Any] | None = None,
@@ -833,6 +863,13 @@ def open_dataarray(
         A variable or list of variables to exclude from being parsed from the
         dataset. This may be useful to drop variables with problems or
         inconsistent values.
+    create_default_indexes : bool, default: True
+        If True, create pandas indexes for :term:`dimension coordinates <dimension coordinate>`,
+        which loads the coordinate data into memory. Set it to False if you want to avoid loading
+        data into memory.
+
+        Note that backends can still choose to create other indexes. If you want to control that,
+        please refer to the backend's documentation.
     inline_array: bool, default: False
         How to include the array in the dask task graph.
         By default(``inline_array=False``) the array is included in a task by
@@ -890,6 +927,7 @@ def open_dataarray(
         chunks=chunks,
         cache=cache,
         drop_variables=drop_variables,
+        create_default_indexes=create_default_indexes,
         inline_array=inline_array,
         chunked_array_type=chunked_array_type,
         from_array_kwargs=from_array_kwargs,
@@ -946,6 +984,7 @@ def open_datatree(
     concat_characters: bool | Mapping[str, bool] | None = None,
     decode_coords: Literal["coordinates", "all"] | bool | None = None,
     drop_variables: str | Iterable[str] | None = None,
+    create_default_indexes: bool = True,
     inline_array: bool = False,
     chunked_array_type: str | None = None,
     from_array_kwargs: dict[str, Any] | None = None,
@@ -1055,6 +1094,13 @@ def open_datatree(
         A variable or list of variables to exclude from being parsed from the
         dataset. This may be useful to drop variables with problems or
         inconsistent values.
+    create_default_indexes : bool, default: True
+        If True, create pandas indexes for :term:`dimension coordinates <dimension coordinate>`,
+        which loads the coordinate data into memory. Set it to False if you want to avoid loading
+        data into memory.
+
+        Note that backends can still choose to create other indexes. If you want to control that,
+        please refer to the backend's documentation.
     inline_array: bool, default: False
         How to include the array in the dask task graph.
         By default(``inline_array=False``) the array is included in a task by
@@ -1148,6 +1194,7 @@ def open_datatree(
         chunked_array_type,
         from_array_kwargs,
         drop_variables=drop_variables,
+        create_default_indexes=create_default_indexes,
         **decoders,
         **kwargs,
     )
@@ -1175,6 +1222,7 @@ def open_groups(
     concat_characters: bool | Mapping[str, bool] | None = None,
     decode_coords: Literal["coordinates", "all"] | bool | None = None,
     drop_variables: str | Iterable[str] | None = None,
+    create_default_indexes: bool = True,
     inline_array: bool = False,
     chunked_array_type: str | None = None,
     from_array_kwargs: dict[str, Any] | None = None,
@@ -1286,6 +1334,13 @@ def open_groups(
         A variable or list of variables to exclude from being parsed from the
         dataset. This may be useful to drop variables with problems or
         inconsistent values.
+    create_default_indexes : bool, default: True
+        If True, create pandas indexes for :term:`dimension coordinates <dimension coordinate>`,
+        which loads the coordinate data into memory. Set it to False if you want to avoid loading
+        data into memory.
+
+        Note that backends can still choose to create other indexes. If you want to control that,
+        please refer to the backend's documentation.
     inline_array: bool, default: False
         How to include the array in the dask task graph.
         By default(``inline_array=False``) the array is included in a task by
@@ -1381,6 +1436,7 @@ def open_groups(
         chunked_array_type,
         from_array_kwargs,
         drop_variables=drop_variables,
+        create_default_indexes=create_default_indexes,
         **decoders,
         **kwargs,
     )
diff --git a/xarray/backends/store.py b/xarray/backends/store.py
index b1b3956ca8e..de52aa193ed 100644
--- a/xarray/backends/store.py
+++ b/xarray/backends/store.py
@@ -9,6 +9,7 @@
     AbstractDataStore,
     BackendEntrypoint,
 )
+from xarray.core.coordinates import Coordinates
 from xarray.core.dataset import Dataset
 
 if TYPE_CHECKING:
@@ -36,6 +37,7 @@ def open_dataset(
         concat_characters=True,
         decode_coords=True,
         drop_variables: str | Iterable[str] | None = None,
+        set_indexes: bool = True,
         use_cftime=None,
         decode_timedelta=None,
     ) -> Dataset:
@@ -56,8 +58,19 @@ def open_dataset(
             decode_timedelta=decode_timedelta,
         )
 
-        ds = Dataset(vars, attrs=attrs)
-        ds = ds.set_coords(coord_names.intersection(vars))
+        # split data and coordinate variables (promote dimension coordinates)
+        data_vars = {}
+        coord_vars = {}
+        for name, var in vars.items():
+            if name in coord_names or var.dims == (name,):
+                coord_vars[name] = var
+            else:
+                data_vars[name] = var
+
+        # explicit Coordinates object with no index passed
+        coords = Coordinates(coord_vars, indexes={})
+
+        ds = Dataset(data_vars, coords=coords, attrs=attrs)
         ds.set_close(filename_or_obj.close)
         ds.encoding = encoding
 
diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py
index fff07063964..539027832f7 100644
--- a/xarray/backends/zarr.py
+++ b/xarray/backends/zarr.py
@@ -1347,6 +1347,7 @@ def open_zarr(
     use_zarr_fill_value_as_mask=None,
     chunked_array_type: str | None = None,
     from_array_kwargs: dict[str, Any] | None = None,
+    create_default_indexes=True,
     **kwargs,
 ):
     """Load and decode a dataset from a Zarr store.
@@ -1457,6 +1458,13 @@ def open_zarr(
         chunked arrays, via whichever chunk manager is specified through the ``chunked_array_type``
         kwarg. Defaults to ``{'manager': 'dask'}``, meaning additional kwargs will be passed eventually to
         :py:func:`dask.array.from_array`. Experimental API that should not be relied upon.
+    create_default_indexes : bool, default: True
+        If True, create pandas indexes for :term:`dimension coordinates <dimension coordinate>`,
+        which loads the coordinate data into memory. Set it to False if you want to avoid loading
+        data into memory.
+
+        Note that backends can still choose to create other indexes. If you want to control that,
+        please refer to the backend's documentation.
 
     Returns
     -------
@@ -1513,6 +1521,7 @@
         engine="zarr",
         chunks=chunks,
         drop_variables=drop_variables,
+        create_default_indexes=create_default_indexes,
         chunked_array_type=chunked_array_type,
         from_array_kwargs=from_array_kwargs,
         backend_kwargs=backend_kwargs,
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index f42d2c2c17f..41cc7944d94 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -55,6 +55,7 @@
 from xarray.coding.variables import SerializationWarning
 from xarray.conventions import encode_dataset_coordinates
 from xarray.core import indexing
+from xarray.core.indexes import PandasIndex
 from xarray.core.options import set_options
 from xarray.core.utils import module_available
 from xarray.namedarray.pycompat import array_type
@@ -2050,6 +2051,26 @@ def test_encoding_enum__error_multiple_variable_with_changing_enum(self):
         with self.roundtrip(original):
             pass
 
+    @pytest.mark.parametrize("create_default_indexes", [True, False])
+    def test_create_default_indexes(self, tmp_path, create_default_indexes) -> None:
+        store_path = tmp_path / "tmp.nc"
+        original_ds = xr.Dataset(
+            {"data": ("x", np.arange(3))}, coords={"x": [-1, 0, 1]}
+        )
+        original_ds.to_netcdf(store_path, engine=self.engine, mode="w")
+
+        with open_dataset(
+            store_path,
+            engine=self.engine,
+            create_default_indexes=create_default_indexes,
+        ) as loaded_ds:
+            if create_default_indexes:
+                assert list(loaded_ds.xindexes) == ["x"] and isinstance(
+                    loaded_ds.xindexes["x"], PandasIndex
+                )
+            else:
+                assert len(loaded_ds.xindexes) == 0
+
 
 @requires_netCDF4
 class TestNetCDF4Data(NetCDF4Base):
@@ -4010,6 +4031,26 @@ def test_pickle(self) -> None:
     def test_pickle_dataarray(self) -> None:
         pass
 
+    @pytest.mark.parametrize("create_default_indexes", [True, False])
+    def test_create_default_indexes(self, tmp_path, create_default_indexes) -> None:
+        store_path = tmp_path / "tmp.nc"
+        original_ds = xr.Dataset(
+            {"data": ("x", np.arange(3))}, coords={"x": [-1, 0, 1]}
+        )
+        original_ds.to_netcdf(store_path, engine=self.engine, mode="w")
+
+        with open_dataset(
+            store_path,
+            engine=self.engine,
+            create_default_indexes=create_default_indexes,
+        ) as loaded_ds:
+            if create_default_indexes:
+                assert list(loaded_ds.xindexes) == ["x"] and isinstance(
+                    loaded_ds.xindexes["x"], PandasIndex
+                )
+            else:
+                assert len(loaded_ds.xindexes) == 0
+
 
 @requires_scipy
 class TestScipyFilePath(CFEncodedBase, NetCDF3Only):
@@ -6379,6 +6420,26 @@ def test_zarr_closing_internal_zip_store():
     assert_identical(original_da, loaded_da)
 
 
+@requires_zarr
+@pytest.mark.parametrize("create_default_indexes", [True, False])
+def test_zarr_create_default_indexes(tmp_path, create_default_indexes) -> None:
+    from xarray.core.indexes import PandasIndex
+
+    store_path = tmp_path / "tmp.zarr"
+    original_ds = xr.Dataset({"data": ("x", np.arange(3))}, coords={"x": [-1, 0, 1]})
+    original_ds.to_zarr(store_path, mode="w")
+
+    with open_dataset(
+        store_path, engine="zarr", create_default_indexes=create_default_indexes
+    ) as loaded_ds:
+        if create_default_indexes:
+            assert list(loaded_ds.xindexes) == ["x"] and isinstance(
+                loaded_ds.xindexes["x"], PandasIndex
+            )
+        else:
+            assert len(loaded_ds.xindexes) == 0
+
+
 @requires_zarr
 @pytest.mark.usefixtures("default_zarr_format")
 def test_raises_key_error_on_invalid_zarr_store(tmp_path):
diff --git a/xarray/tests/test_backends_api.py b/xarray/tests/test_backends_api.py
index 9342423b727..778e800ec67 100644
--- a/xarray/tests/test_backends_api.py
+++ b/xarray/tests/test_backends_api.py
@@ -201,3 +201,39 @@ def test_join_chunks(self, shape, pref_chunks, req_chunks):
             chunks=dict(zip(initial[self.var_name].dims, req_chunks, strict=True)),
         )
         self.check_dataset(initial, final, explicit_chunks(req_chunks, shape))
+
+    @pytest.mark.parametrize("create_default_indexes", [True, False])
+    def test_default_indexes(self, create_default_indexes):
+        """Create default indexes if the backend does not create them."""
+        coords = xr.Coordinates({"x": ("x", [0, 1]), "y": list("abc")}, indexes={})
+        initial = xr.Dataset({"a": ("x", [1, 2])}, coords=coords)
+
+        with assert_no_warnings():
+            final = xr.open_dataset(
+                initial,
+                engine=PassThroughBackendEntrypoint,
+                create_default_indexes=create_default_indexes,
+            )
+
+        if create_default_indexes:
+            assert all(name in final.xindexes for name in ["x", "y"])
+        else:
+            assert len(final.xindexes) == 0
+
+    @pytest.mark.parametrize("create_default_indexes", [True, False])
+    def test_default_indexes_passthrough(self, create_default_indexes):
+        """Allow creating indexes in the backend."""
+
+        initial = xr.Dataset(
+            {"a": (["x", "y"], [[1, 2, 3], [4, 5, 6]])},
+            coords={"x": ("x", [0, 1]), "y": ("y", list("abc"))},
+        ).stack(z=["x", "y"])
+
+        with assert_no_warnings():
+            final = xr.open_dataset(
+                initial,
+                engine=PassThroughBackendEntrypoint,
+                create_default_indexes=create_default_indexes,
+            )
+
+        assert initial.coords.equals(final.coords)
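
Usage note (not part of the patch): a minimal sketch of what the new keyword changes at the user level, assuming a netCDF file at the hypothetical path "example.nc" with a dimension coordinate "x".

    import xarray as xr

    # Default behaviour: pandas indexes are created for dimension
    # coordinates, so label-based selection works immediately (and the
    # coordinate data is loaded into memory).
    ds = xr.open_dataset("example.nc")
    ds.sel(x=0)

    # Opting out skips index creation, keeping coordinate data lazy.
    lazy = xr.open_dataset("example.nc", create_default_indexes=False)
    assert len(lazy.xindexes) == 0

    # An index can still be built later, on demand.
    indexed = lazy.set_xindex("x")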