diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index 670a0ec6d68..71f2b929051 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -403,6 +403,7 @@ def open_dataset(
     concat_characters: bool | None = None,
     decode_coords: Literal["coordinates", "all"] | bool | None = None,
     drop_variables: str | Iterable[str] | None = None,
+    set_indexes: bool = True,
     inline_array: bool = False,
     chunked_array_type: str | None = None,
     from_array_kwargs: dict[str, Any] | None = None,
@@ -494,6 +495,12 @@ def open_dataset(
         A variable or list of variables to exclude from being parsed from the
         dataset. This may be useful to drop variables with problems or
         inconsistent values.
+    set_indexes : bool, optional
+        If True (default), create new indexes from coordinates. Both the number and
+        the type(s) of those indexes depend on the backend used to open the dataset.
+        For most common backends this creates a pandas index for each
+        :term:`Dimension coordinate`, which loads the coordinate data fully in memory.
+        Set it to False if you want to avoid loading the coordinate data into memory.
     inline_array: bool, default: False
         How to include the array in the dask task graph.
         By default(``inline_array=False``) the array is included in a task by
@@ -572,6 +579,7 @@ def open_dataset(
     backend_ds = backend.open_dataset(
         filename_or_obj,
         drop_variables=drop_variables,
+        set_indexes=set_indexes,
         **decoders,
         **kwargs,
     )
@@ -606,6 +614,7 @@ def open_dataarray(
     concat_characters: bool | None = None,
     decode_coords: Literal["coordinates", "all"] | bool | None = None,
     drop_variables: str | Iterable[str] | None = None,
+    set_indexes: bool = True,
     inline_array: bool = False,
     chunked_array_type: str | None = None,
     from_array_kwargs: dict[str, Any] | None = None,
@@ -699,6 +708,12 @@ def open_dataarray(
         A variable or list of variables to exclude from being parsed from the
         dataset. This may be useful to drop variables with problems or
         inconsistent values.
+    set_indexes : bool, optional
+        If True (default), create new indexes from coordinates. Both the number and
+        the type(s) of those indexes depend on the backend used to open the dataset.
+        For most common backends this creates a pandas index for each
+        :term:`Dimension coordinate`, which loads the coordinate data fully in memory.
+        Set it to False if you want to avoid loading the coordinate data into memory.
     inline_array: bool, default: False
         How to include the array in the dask task graph.
         By default(``inline_array=False``) the array is included in a task by
@@ -756,6 +771,7 @@ def open_dataarray(
         chunks=chunks,
         cache=cache,
         drop_variables=drop_variables,
+        set_indexes=set_indexes,
         inline_array=inline_array,
         chunked_array_type=chunked_array_type,
         from_array_kwargs=from_array_kwargs,
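Intended user-facing behavior, as a sketch rather than part of the patch: it assumes a backend that implements the new keyword (in this patch only the store and zarr entrypoints do) and uses a made-up store path and a hypothetical "time" dimension coordinate. With set_indexes=False no default pandas index is built at open time, and an index can still be added afterwards with the existing Dataset.set_xindex API:

import xarray as xr

# "data.zarr" is a made-up path; the zarr backend is one of the backends
# updated by this patch to accept the new keyword.
ds = xr.open_dataset("data.zarr", engine="zarr", set_indexes=False)

# No default (pandas) indexes were created, so no coordinate data was
# loaded eagerly just to build them.
assert len(ds.xindexes) == 0

# An index can still be created explicitly where needed, e.g. for a
# hypothetical "time" dimension coordinate (this loads that coordinate).
ds = ds.set_xindex("time")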
diff --git a/xarray/backends/common.py b/xarray/backends/common.py
index 5b8f9a6840f..7ae6bb4717a 100644
--- a/xarray/backends/common.py
+++ b/xarray/backends/common.py
@@ -490,6 +490,7 @@ def open_dataset(
         filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore,
         *,
         drop_variables: str | Iterable[str] | None = None,
+        set_indexes: bool = True,
         **kwargs: Any,
     ) -> Dataset:
         """
diff --git a/xarray/backends/store.py b/xarray/backends/store.py
index a507ee37470..e15e6b08c0f 100644
--- a/xarray/backends/store.py
+++ b/xarray/backends/store.py
@@ -9,6 +9,7 @@
     AbstractDataStore,
     BackendEntrypoint,
 )
+from xarray.core.coordinates import Coordinates
 from xarray.core.dataset import Dataset

 if TYPE_CHECKING:
@@ -35,6 +36,7 @@ def open_dataset(  # type: ignore[override]  # allow LSP violation, not supporti
         concat_characters=True,
         decode_coords=True,
         drop_variables: str | Iterable[str] | None = None,
+        set_indexes: bool = True,
         use_cftime=None,
         decode_timedelta=None,
     ) -> Dataset:
@@ -55,8 +57,22 @@ def open_dataset(  # type: ignore[override]  # allow LSP violation, not supporti
             decode_timedelta=decode_timedelta,
         )

-        ds = Dataset(vars, attrs=attrs)
-        ds = ds.set_coords(coord_names.intersection(vars))
+        # split data and coordinate variables (promote dimension coordinates)
+        data_vars = {}
+        coord_vars = {}
+        for name, var in vars.items():
+            if name in coord_names or var.dims == (name,):
+                coord_vars[name] = var
+            else:
+                data_vars[name] = var
+
+        if set_indexes:
+            coords = coord_vars
+        else:
+            # explicit Coordinates object with no index passed
+            coords = Coordinates(coord_vars)
+
+        ds = Dataset(data_vars, coords=coords, attrs=attrs)
         ds.set_close(filename_or_obj.close)
         ds.encoding = encoding
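The StoreBackendEntrypoint change above is the core of the patch: decoded variables are split into data and coordinate variables (promoting any variable whose only dimension is its own name), and when set_indexes is False the coordinates are wrapped in a Coordinates object so that Dataset construction does not build default pandas indexes. A self-contained sketch of that idea against the public API follows; the variable names and values are invented, and note that on recent xarray versions building an index-free Coordinates object may require passing indexes={} explicitly:

import numpy as np
import xarray as xr

# Made-up stand-ins for the decoded variables a data store would yield.
variables = {
    "x": xr.Variable(("x",), np.arange(4)),                 # dimension coordinate
    "temperature": xr.Variable(("x",), np.random.rand(4)),  # data variable
}
coord_names = set()  # pretend CF decoding found no extra coordinate variables

# Promote dimension coordinates, mirroring the loop added in store.py.
coord_vars = {
    name: var
    for name, var in variables.items()
    if name in coord_names or var.dims == (name,)
}
data_vars = {name: var for name, var in variables.items() if name not in coord_vars}

# Passing a Coordinates object built with indexes={} skips default index creation.
coords = xr.Coordinates(coord_vars, indexes={})
ds = xr.Dataset(data_vars, coords=coords)
assert len(ds.xindexes) == 0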
diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py
index 469bbf4c339..272700f7f6f 100644
--- a/xarray/backends/zarr.py
+++ b/xarray/backends/zarr.py
@@ -803,6 +803,7 @@ def open_zarr(
     zarr_version=None,
     chunked_array_type: str | None = None,
     from_array_kwargs: dict[str, Any] | None = None,
+    set_indexes=True,
     **kwargs,
 ):
     """Load and decode a dataset from a Zarr store.
@@ -896,6 +897,10 @@ def open_zarr(
         chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg.
         Defaults to {'manager': 'dask'}, meaning additional kwargs will be passed eventually to
         :py:func:`dask.array.from_array`. Experimental API that should not be relied upon.
+    set_indexes : bool, optional
+        If True (default), create a default (pandas) index for each
+        :term:`Dimension coordinate`. Set it to False if the dataset contains
+        dimension coordinate arrays that are too large to load fully in memory.

     Returns
     -------
@@ -952,6 +957,7 @@ def open_zarr(
         engine="zarr",
         chunks=chunks,
         drop_variables=drop_variables,
+        set_indexes=set_indexes,
         chunked_array_type=chunked_array_type,
         from_array_kwargs=from_array_kwargs,
         backend_kwargs=backend_kwargs,
@@ -996,6 +1002,7 @@ def open_dataset(  # type: ignore[override]  # allow LSP violation, not supporti
         concat_characters=True,
         decode_coords=True,
         drop_variables: str | Iterable[str] | None = None,
+        set_indexes: bool = True,
         use_cftime=None,
         decode_timedelta=None,
         group=None,
@@ -1032,6 +1039,7 @@ def open_dataset(  # type: ignore[override]  # allow LSP violation, not supporti
                 drop_variables=drop_variables,
                 use_cftime=use_cftime,
                 decode_timedelta=decode_timedelta,
+                set_indexes=set_indexes,
             )
         return ds
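End-to-end sketch for the zarr path, illustrative only: the store path "example.zarr" and the "temperature"/"time" layout are invented, and writing requires the zarr package. With set_indexes=False the dataset opens without pandas indexes, positional indexing keeps working, and label-based selection can opt back in per coordinate:

import numpy as np
import pandas as pd
import xarray as xr

# Write a small example store to read back (path and contents are made up).
times = pd.date_range("2020-01-01", periods=1_000, freq="h")
xr.Dataset(
    {"temperature": ("time", np.random.rand(times.size))},
    coords={"time": times},
).to_zarr("example.zarr", mode="w")

# Reopen without building default indexes: the "time" coordinate is not
# loaded into memory up front just to create a pandas index.
ds = xr.open_zarr("example.zarr", set_indexes=False)
assert len(ds.xindexes) == 0

# Positional indexing does not need an index...
first_day = ds.isel(time=slice(0, 24))

# ...while label-based selection does, so create one only where required.
jan_first = ds.set_xindex("time").sel(time="2020-01-01")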