
Allow setting (or skipping) new indexes in open_dataset #8051


Open · wants to merge 4 commits into main
xarray/backends/api.py (16 additions, 0 deletions)
@@ -403,6 +403,7 @@ def open_dataset(
concat_characters: bool | None = None,
decode_coords: Literal["coordinates", "all"] | bool | None = None,
drop_variables: str | Iterable[str] | None = None,
+set_indexes: bool = True,
inline_array: bool = False,
chunked_array_type: str | None = None,
from_array_kwargs: dict[str, Any] | None = None,
@@ -494,6 +495,12 @@
A variable or list of variables to exclude from being parsed from the
dataset. This may be useful to drop variables with problems or
inconsistent values.
+set_indexes : bool, optional
+If True (default), create new indexes from coordinates. Both the number and
+the type(s) of those indexes depend on the backend used to open the dataset.
+For most common backends this creates a pandas index for each
+:term:`Dimension coordinate`, which loads the coordinate data fully in memory.
+Set it to False if you want to avoid loading data into memory.
inline_array: bool, default: False
How to include the array in the dask task graph.
By default (``inline_array=False``) the array is included in a task by
@@ -572,6 +579,7 @@
backend_ds = backend.open_dataset(
filename_or_obj,
drop_variables=drop_variables,
+set_indexes=set_indexes,
**decoders,
**kwargs,
)
@@ -606,6 +614,7 @@ def open_dataarray(
concat_characters: bool | None = None,
decode_coords: Literal["coordinates", "all"] | bool | None = None,
drop_variables: str | Iterable[str] | None = None,
+set_indexes: bool = True,
inline_array: bool = False,
chunked_array_type: str | None = None,
from_array_kwargs: dict[str, Any] | None = None,
@@ -699,6 +708,12 @@
A variable or list of variables to exclude from being parsed from the
dataset. This may be useful to drop variables with problems or
inconsistent values.
+set_indexes : bool, optional
+If True (default), create new indexes from coordinates. Both the number and
+the type(s) of those indexes depend on the backend used to open the dataset.
+For most common backends this creates a pandas index for each
+:term:`Dimension coordinate`, which loads the coordinate data fully in memory.
+Set it to False if you want to avoid loading data into memory.
inline_array: bool, default: False
How to include the array in the dask task graph.
By default (``inline_array=False``) the array is included in a task by
@@ -756,6 +771,7 @@ def open_dataarray(
chunks=chunks,
cache=cache,
drop_variables=drop_variables,
+set_indexes=set_indexes,
inline_array=inline_array,
chunked_array_type=chunked_array_type,
from_array_kwargs=from_array_kwargs,
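From the user side, the new keyword reads as follows. A usage sketch, not part of the PR: the file name and the "time" coordinate are hypothetical, and set_xindex is existing xarray API for opting back in to an index later.

import xarray as xr

# Skip default index creation: dimension coordinates stay as plain
# (possibly lazy) arrays instead of being loaded into pandas indexes.
ds = xr.open_dataset("example.nc", set_indexes=False)
print(ds.xindexes)  # empty: no coordinate was indexed

# Positional selection needs no index.
head = ds.isel(time=slice(0, 10))

# Opt back in selectively when label-based selection is required.
ds = ds.set_xindex("time")
january = ds.sel(time="2020-01")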
xarray/backends/common.py (1 addition, 0 deletions)
@@ -490,6 +490,7 @@ def open_dataset(
filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore,
*,
drop_variables: str | Iterable[str] | None = None,
+set_indexes: bool = True,
**kwargs: Any,
) -> Dataset:
"""
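For third-party backend authors, this signature change means the keyword reaches custom entrypoints too. A minimal sketch, not from the PR: MyEntrypoint and its _read helper are hypothetical, and the index-skipping branch passes an explicit empty indexes mapping to xr.Coordinates, which is how released xarray spells "create no default indexes".

import xarray as xr
from xarray.backends import BackendEntrypoint


class MyEntrypoint(BackendEntrypoint):
    """Hypothetical backend showing how set_indexes could be honored."""

    def open_dataset(
        self,
        filename_or_obj,
        *,
        drop_variables=None,
        set_indexes: bool = True,
        **kwargs,
    ) -> xr.Dataset:
        # _read is a stand-in for the format-specific decoding logic.
        data_vars, coord_vars, attrs = self._read(filename_or_obj, drop_variables)
        if set_indexes:
            coords = coord_vars  # Dataset() creates default pandas indexes
        else:
            coords = xr.Coordinates(coord_vars, indexes={})  # skip index creation
        return xr.Dataset(data_vars, coords=coords, attrs=attrs)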
xarray/backends/store.py (18 additions, 2 deletions)
@@ -9,6 +9,7 @@
AbstractDataStore,
BackendEntrypoint,
)
+from xarray.core.coordinates import Coordinates
from xarray.core.dataset import Dataset

if TYPE_CHECKING:
@@ -35,6 +36,7 @@ def open_dataset( # type: ignore[override] # allow LSP violation, not supporti
concat_characters=True,
decode_coords=True,
drop_variables: str | Iterable[str] | None = None,
+set_indexes: bool = True,
use_cftime=None,
decode_timedelta=None,
) -> Dataset:
@@ -55,8 +57,22 @@ def open_dataset( # type: ignore[override] # allow LSP violation, not supporti
decode_timedelta=decode_timedelta,
)

-ds = Dataset(vars, attrs=attrs)
-ds = ds.set_coords(coord_names.intersection(vars))
+# split data and coordinate variables (promote dimension coordinates)
+data_vars = {}
+coord_vars = {}
+for name, var in vars.items():
+    if name in coord_names or var.dims == (name,):
+        coord_vars[name] = var
+    else:
+        data_vars[name] = var
+
+if set_indexes:
+    coords = coord_vars
+else:
+    # explicit Coordinates object with no index passed
+    coords = Coordinates(coord_vars)
+
+ds = Dataset(data_vars, coords=coords, attrs=attrs)
ds.set_close(filename_or_obj.close)
ds.encoding = encoding

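To see what the two branches above produce, here is a small self-contained sketch, assuming released xarray where passing an empty indexes mapping to xr.Coordinates explicitly skips default index creation:

import numpy as np
import xarray as xr

var = xr.Variable("x", np.arange(4))

# set_indexes=True branch: a plain mapping lets Dataset() promote "x" to an
# indexed dimension coordinate backed by a pandas.Index.
ds_default = xr.Dataset({"a": ("x", np.zeros(4))}, coords={"x": var})
print(list(ds_default.xindexes))  # ['x']

# set_indexes=False branch: wrapping the coordinates in a Coordinates object
# with an empty indexes mapping keeps "x" unindexed, so its data would not
# need to be loaded into memory.
coords = xr.Coordinates({"x": var}, indexes={})
ds_no_index = xr.Dataset({"a": ("x", np.zeros(4))}, coords=coords)
print(list(ds_no_index.xindexes))  # []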
xarray/backends/zarr.py (8 additions, 0 deletions)
@@ -803,6 +803,7 @@ def open_zarr(
zarr_version=None,
chunked_array_type: str | None = None,
from_array_kwargs: dict[str, Any] | None = None,
+set_indexes=True,
**kwargs,
):
"""Load and decode a dataset from a Zarr store.
@@ -896,6 +897,10 @@
chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg.
Defaults to {'manager': 'dask'}, meaning additional kwargs will be passed eventually to
:py:func:`dask.array.from_array`. Experimental API that should not be relied upon.
+set_indexes : bool, optional
+If True (default), create a default (pandas) index for each
+:term:`Dimension coordinate`. Set it to False if the dataset contains
+dimension coordinate arrays that are too large to load fully in memory.

Returns
-------
@@ -952,6 +957,7 @@
engine="zarr",
chunks=chunks,
drop_variables=drop_variables,
+set_indexes=set_indexes,
chunked_array_type=chunked_array_type,
from_array_kwargs=from_array_kwargs,
backend_kwargs=backend_kwargs,
@@ -996,6 +1002,7 @@ def open_dataset( # type: ignore[override] # allow LSP violation, not supporti
concat_characters=True,
decode_coords=True,
drop_variables: str | Iterable[str] | None = None,
+set_indexes: bool = True,
use_cftime=None,
decode_timedelta=None,
group=None,
@@ -1032,6 +1039,7 @@ def open_dataset( # type: ignore[override] # allow LSP violation, not supporti
drop_variables=drop_variables,
use_cftime=use_cftime,
decode_timedelta=decode_timedelta,
+set_indexes=set_indexes,
)
return ds

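Putting the zarr-side pieces together, a usage sketch matching the docstring's motivating case (the store path is hypothetical):

import xarray as xr

# A store whose "time" dimension coordinate is too large to load eagerly.
ds = xr.open_zarr("path/to/big.zarr", set_indexes=False)

# No pandas.Index is built, so the coordinate stays a lazy chunked array
# and opening the store remains cheap.
print(ds.xindexes)  # empty

# Positional selection still works; label-based selection would first need
# an index, e.g. via ds.set_xindex("time").
recent = ds.isel(time=slice(-100, None))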