diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index cef070c4936..3dad685aaf7 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -43,6 +43,10 @@ Documentation
 Internal Changes
 ~~~~~~~~~~~~~~~~
 
+- Explicit indexes refactor: avoid ``len(index)`` in ``map_blocks`` (:pull:`5670`).
+  By `Deepak Cherian `_.
+- Explicit indexes refactor: decouple ``xarray.Index`` from ``xarray.Variable`` (:pull:`5636`).
+  By `Benoit Bovy `_.
 - Improve the performance of reprs for large datasets or dataarrays. (:pull:`5661`)
   By `Jimmy Westling `_.
 
diff --git a/xarray/core/alignment.py b/xarray/core/alignment.py
index 8f2ba2f4b97..a53ac094253 100644
--- a/xarray/core/alignment.py
+++ b/xarray/core/alignment.py
@@ -18,7 +18,7 @@
 import pandas as pd
 
 from . import dtypes
-from .indexes import Index, PandasIndex, get_indexer_nd, wrap_pandas_index
+from .indexes import Index, PandasIndex, get_indexer_nd
 from .utils import is_dict_like, is_full_slice, maybe_coerce_to_str, safe_cast_to_index
 from .variable import IndexVariable, Variable
 
@@ -53,7 +53,10 @@ def _get_joiner(join, index_cls):
 def _override_indexes(objects, all_indexes, exclude):
     for dim, dim_indexes in all_indexes.items():
         if dim not in exclude:
-            lengths = {index.size for index in dim_indexes}
+            lengths = {
+                getattr(index, "size", index.to_pandas_index().size)
+                for index in dim_indexes
+            }
             if len(lengths) != 1:
                 raise ValueError(
                     f"Indexes along dimension {dim!r} don't have the same length."
@@ -300,16 +303,14 @@ def align(
     joined_indexes = {}
     for dim, matching_indexes in all_indexes.items():
         if dim in indexes:
-            # TODO: benbovy - flexible indexes. maybe move this logic in util func
-            if isinstance(indexes[dim], Index):
-                index = indexes[dim]
-            else:
-                index = PandasIndex(safe_cast_to_index(indexes[dim]))
+            index, _ = PandasIndex.from_pandas_index(
+                safe_cast_to_index(indexes[dim]), dim
+            )
             if (
                 any(not index.equals(other) for other in matching_indexes)
                 or dim in unlabeled_dim_sizes
             ):
-                joined_indexes[dim] = index
+                joined_indexes[dim] = indexes[dim]
         else:
             if (
                 any(
@@ -323,17 +324,18 @@ def align(
                 joiner = _get_joiner(join, type(matching_indexes[0]))
                 index = joiner(matching_indexes)
                 # make sure str coords are not cast to object
-                index = maybe_coerce_to_str(index, all_coords[dim])
+                index = maybe_coerce_to_str(index.to_pandas_index(), all_coords[dim])
                 joined_indexes[dim] = index
             else:
                 index = all_coords[dim][0]
 
         if dim in unlabeled_dim_sizes:
             unlabeled_sizes = unlabeled_dim_sizes[dim]
-            # TODO: benbovy - flexible indexes: expose a size property for xarray.Index?
-            # Some indexes may not have a defined size (e.g., built from multiple coords of
-            # different sizes)
-            labeled_size = index.size
+            # TODO: benbovy - flexible indexes: https://github.com/pydata/xarray/issues/5647
+            if isinstance(index, PandasIndex):
+                labeled_size = index.to_pandas_index().size
+            else:
+                labeled_size = index.size
             if len(unlabeled_sizes | {labeled_size}) > 1:
                 raise ValueError(
                     f"arguments without labels along dimension {dim!r} cannot be "
@@ -350,7 +352,14 @@ def align(
     result = []
     for obj in objects:
-        valid_indexers = {k: v for k, v in joined_indexes.items() if k in obj.dims}
+        # TODO: benbovy - flexible indexes: https://github.com/pydata/xarray/issues/5647
+        valid_indexers = {}
+        for k, index in joined_indexes.items():
+            if k in obj.dims:
+                if isinstance(index, Index):
+                    valid_indexers[k] = index.to_pandas_index()
+                else:
+                    valid_indexers[k] = index
         if not valid_indexers:
             # fast path for no reindexing necessary
             new_obj = obj.copy(deep=copy)
@@ -471,7 +480,11 @@ def reindex_like_indexers(
     ValueError
         If any dimensions without labels have different sizes.
     """
-    indexers = {k: v for k, v in other.xindexes.items() if k in target.dims}
+    # TODO: benbovy - flexible indexes: https://github.com/pydata/xarray/issues/5647
+    # this doesn't yet support indexes other than pd.Index
+    indexers = {
+        k: v.to_pandas_index() for k, v in other.xindexes.items() if k in target.dims
+    }
 
     for dim in other.dims:
         if dim not in indexers and dim in target.dims:
@@ -560,7 +573,8 @@ def reindex_variables(
                 "from that to be indexed along {:s}".format(str(indexer.dims), dim)
             )
 
-        target = new_indexes[dim] = wrap_pandas_index(safe_cast_to_index(indexers[dim]))
+        target = safe_cast_to_index(indexers[dim])
+        new_indexes[dim] = PandasIndex(target, dim)
 
         if dim in indexes:
             # TODO (benbovy - flexible indexes): support other indexes than pd.Index?
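The alignment hunks above replace ad-hoc pandas.Index wrapping with the new two-step construction: `PandasIndex.from_pandas_index` returns both the xarray index object and the IndexVariable(s) built from it, and `align()` discards the second item. A minimal sketch of that construction, runnable against the code added in this patch (the variable names are illustrative):

    import pandas as pd
    from xarray.core.indexes import PandasIndex
    from xarray.core.utils import safe_cast_to_index

    raw = [1, 2, 3]  # anything array-like a user may pass as indexes={"x": ...}
    index, index_vars = PandasIndex.from_pandas_index(safe_cast_to_index(raw), "x")
    print(type(index).__name__, index.dim)  # PandasIndex x
    print(list(index_vars))                 # ["x"]
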
diff --git a/xarray/core/combine.py b/xarray/core/combine.py
index be9c2992832..7e1565e50de 100644
--- a/xarray/core/combine.py
+++ b/xarray/core/combine.py
@@ -77,9 +77,8 @@ def _infer_concat_order_from_coords(datasets):
                     "inferring concatenation order"
                 )
 
-            # TODO (benbovy, flexible indexes): all indexes should be Pandas.Index
-            # get pd.Index objects from Index objects
-            indexes = [index.array for index in indexes]
+            # TODO (benbovy, flexible indexes): support flexible indexes?
+            indexes = [index.to_pandas_index() for index in indexes]
 
             # If dimension coordinate values are same on every dataset then
             # should be leaving this dimension alone (it's just a "bystander")
diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
index cb2c4d30a69..900af885319 100644
--- a/xarray/core/dataarray.py
+++ b/xarray/core/dataarray.py
@@ -51,13 +51,7 @@
 )
 from .dataset import Dataset, split_indexes
 from .formatting import format_item
-from .indexes import (
-    Index,
-    Indexes,
-    default_indexes,
-    propagate_indexes,
-    wrap_pandas_index,
-)
+from .indexes import Index, Indexes, default_indexes, propagate_indexes
 from .indexing import is_fancy_indexer
 from .merge import PANDAS_TYPES, MergeError, _extract_indexes_from_coords
 from .options import OPTIONS, _get_keep_attrs
@@ -473,15 +467,14 @@ def _overwrite_indexes(self, indexes: Mapping[Hashable, Any]) -> "DataArray":
             return self
         coords = self._coords.copy()
         for name, idx in indexes.items():
-            coords[name] = IndexVariable(name, idx)
+            coords[name] = IndexVariable(name, idx.to_pandas_index())
         obj = self._replace(coords=coords)
 
         # switch from dimension to level names, if necessary
         dim_names: Dict[Any, str] = {}
         for dim, idx in indexes.items():
-            # TODO: benbovy - flexible indexes: update when MultiIndex has its own class
-            pd_idx = idx.array
-            if not isinstance(pd_idx, pd.MultiIndex) and pd_idx.name != dim:
+            pd_idx = idx.to_pandas_index()
+            if not isinstance(pd_idx, pd.MultiIndex) and pd_idx.name != dim:
                 dim_names[dim] = idx.name
         if dim_names:
             obj = obj.rename(dim_names)
@@ -1046,12 +1039,7 @@ def copy(self: T_DataArray, deep: bool = True, data: Any = None) -> T_DataArray:
         if self._indexes is None:
             indexes = self._indexes
         else:
-            # TODO: benbovy: flexible indexes: support all xarray indexes (not just pandas.Index)
-            # xarray Index needs a copy method.
-            indexes = {
-                k: wrap_pandas_index(v.to_pandas_index().copy(deep=deep))
-                for k, v in self._indexes.items()
-            }
+            indexes = {k: v.copy(deep=deep) for k, v in self._indexes.items()}
         return self._replace(variable, coords, indexes=indexes)
 
     def __copy__(self) -> "DataArray":
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index 5f5c01ad4c9..533ecadbae5 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -71,7 +71,6 @@
     propagate_indexes,
     remove_unused_levels_categories,
     roll_index,
-    wrap_pandas_index,
 )
 from .indexing import is_fancy_indexer
 from .merge import (
@@ -1184,7 +1183,7 @@ def _overwrite_indexes(self, indexes: Mapping[Any, Index]) -> "Dataset":
         variables = self._variables.copy()
         new_indexes = dict(self.xindexes)
         for name, idx in indexes.items():
-            variables[name] = IndexVariable(name, idx)
+            variables[name] = IndexVariable(name, idx.to_pandas_index())
             new_indexes[name] = idx
         obj = self._replace(variables, indexes=new_indexes)
 
@@ -2474,6 +2473,10 @@ def sel(
         pos_indexers, new_indexes = remap_label_indexers(
             self, indexers=indexers, method=method, tolerance=tolerance
         )
+        # TODO: benbovy - flexible indexes: also use variables returned by Index.query
+        # (temporary dirty fix).
+        new_indexes = {k: v[0] for k, v in new_indexes.items()}
+
         result = self.isel(indexers=pos_indexers, drop=drop)
         return result._overwrite_indexes(new_indexes)
 
@@ -3297,20 +3300,21 @@ def _rename_dims(self, name_dict):
         return {name_dict.get(k, k): v for k, v in self.dims.items()}
 
     def _rename_indexes(self, name_dict, dims_set):
+        # TODO: benbovy - flexible indexes: https://github.com/pydata/xarray/issues/5645
        if self._indexes is None:
             return None
         indexes = {}
-        for k, v in self.xindexes.items():
-            # TODO: benbovy - flexible indexes: make it compatible with any xarray Index
-            index = v.to_pandas_index()
+        for k, v in self.indexes.items():
             new_name = name_dict.get(k, k)
             if new_name not in dims_set:
                 continue
-            if isinstance(index, pd.MultiIndex):
-                new_names = [name_dict.get(k, k) for k in index.names]
-                indexes[new_name] = PandasMultiIndex(index.rename(names=new_names))
+            if isinstance(v, pd.MultiIndex):
+                new_names = [name_dict.get(k, k) for k in v.names]
+                indexes[new_name] = PandasMultiIndex(
+                    v.rename(names=new_names), new_name
+                )
             else:
-                indexes[new_name] = PandasIndex(index.rename(new_name))
+                indexes[new_name] = PandasIndex(v.rename(new_name), new_name)
         return indexes
 
     def _rename_all(self, name_dict, dims_dict):
@@ -3539,7 +3543,10 @@ def swap_dims(
                 if new_index.nlevels == 1:
                     # make sure index name matches dimension name
                     new_index = new_index.rename(k)
-                indexes[k] = wrap_pandas_index(new_index)
+                if isinstance(new_index, pd.MultiIndex):
+                    indexes[k] = PandasMultiIndex(new_index, k)
+                else:
+                    indexes[k] = PandasIndex(new_index, k)
             else:
                 var = v.to_base_variable()
                 var.dims = dims
@@ -3812,7 +3819,7 @@ def reorder_levels(
                 raise ValueError(f"coordinate {dim} has no MultiIndex")
             new_index = index.reorder_levels(order)
             variables[dim] = IndexVariable(coord.dims, new_index)
-            indexes[dim] = PandasMultiIndex(new_index)
+            indexes[dim] = PandasMultiIndex(new_index, dim)
 
         return self._replace(variables, indexes=indexes)
 
@@ -3840,7 +3847,7 @@ def _stack_once(self, dims, new_dim):
         coord_names = set(self._coord_names) - set(dims) | {new_dim}
 
         indexes = {k: v for k, v in self.xindexes.items() if k not in dims}
-        indexes[new_dim] = wrap_pandas_index(idx)
+        indexes[new_dim] = PandasMultiIndex(idx, new_dim)
 
         return self._replace_with_new_dims(
             variables, coord_names=coord_names, indexes=indexes
@@ -4029,8 +4036,9 @@ def _unstack_once(self, dim: Hashable, fill_value) -> "Dataset":
                 variables[name] = var
 
         for name, lev in zip(index.names, index.levels):
-            variables[name] = IndexVariable(name, lev)
-            indexes[name] = PandasIndex(lev)
+            idx, idx_vars = PandasIndex.from_pandas_index(lev, name)
+            variables[name] = idx_vars[name]
+            indexes[name] = idx
 
         coord_names = set(self._coord_names) - {dim} | set(index.names)
 
@@ -4068,8 +4076,9 @@ def _unstack_full_reindex(
                 variables[name] = var
 
         for name, lev in zip(new_dim_names, index.levels):
-            variables[name] = IndexVariable(name, lev)
-            indexes[name] = PandasIndex(lev)
+            idx, idx_vars = PandasIndex.from_pandas_index(lev, name)
+            variables[name] = idx_vars[name]
+            indexes[name] = idx
 
         coord_names = set(self._coord_names) - {dim} | set(new_dim_names)
 
@@ -5839,10 +5848,13 @@ def diff(self, dim, n=1, label="upper"):
 
         indexes = dict(self.xindexes)
         if dim in indexes:
-            # TODO: benbovy - flexible indexes: check slicing of xarray indexes?
-            # or only allow this for pandas indexes?
-            index = indexes[dim].to_pandas_index()
-            indexes[dim] = PandasIndex(index[kwargs_new[dim]])
+            if isinstance(indexes[dim], PandasIndex):
+                # maybe optimize? (pandas index already indexed above with var.isel)
+                new_index = indexes[dim].index[kwargs_new[dim]]
+                if isinstance(new_index, pd.MultiIndex):
+                    indexes[dim] = PandasMultiIndex(new_index, dim)
+                else:
+                    indexes[dim] = PandasIndex(new_index, dim)
 
         difference = self._replace_with_new_dims(variables, indexes=indexes)
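`Dataset.sel()` above keeps only the first item of each value returned by `remap_label_indexers` because `Index.query` now returns a positional indexer plus an optional `(new_index, new_index_variables)` pair. A hedged sketch of that return value, using the multi-index query path reworked below in indexes.py:

    import pandas as pd
    from xarray.core.indexes import PandasMultiIndex

    midx = pd.MultiIndex.from_product([["a", "b"], [1, 2]], names=("one", "two"))
    index, _ = PandasMultiIndex.from_pandas_index(midx, "x")

    # selecting one level builds a new (single-level) index plus its variables
    indexer, new = index.query({"one": "a"})
    new_index, new_vars = new  # Dataset.sel currently keeps only new_index (v[0])
    print(indexer, type(new_index).__name__, list(new_vars))
    # slice(0, 2, None) PandasIndex ['two']
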
diff --git a/xarray/core/indexes.py b/xarray/core/indexes.py
index 90d8eec6623..429c37af588 100644
--- a/xarray/core/indexes.py
+++ b/xarray/core/indexes.py
@@ -1,6 +1,4 @@
 import collections.abc
-from contextlib import suppress
-from datetime import timedelta
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -18,28 +16,26 @@
 import pandas as pd
 
 from . import formatting, utils
-from .indexing import ExplicitlyIndexedNDArrayMixin, NumpyIndexingAdapter
-from .npcompat import DTypeLike
+from .indexing import (
+    LazilyIndexedArray,
+    PandasIndexingAdapter,
+    PandasMultiIndexingAdapter,
+)
 from .utils import is_dict_like, is_scalar
 
 if TYPE_CHECKING:
-    from .variable import Variable
+    from .variable import IndexVariable, Variable
+
+IndexVars = Dict[Hashable, "IndexVariable"]
 
 
 class Index:
     """Base class inherited by all xarray-compatible indexes."""
 
-    __slots__ = ("coord_names",)
-
-    def __init__(self, coord_names: Union[Hashable, Iterable[Hashable]]):
-        if isinstance(coord_names, Hashable):
-            coord_names = (coord_names,)
-        self.coord_names = tuple(coord_names)
-
     @classmethod
     def from_variables(
-        cls, variables: Dict[Hashable, "Variable"], **kwargs
-    ):  # pragma: no cover
+        cls, variables: Mapping[Hashable, "Variable"]
+    ) -> Tuple["Index", Optional[IndexVars]]:  # pragma: no cover
         raise NotImplementedError()
 
     def to_pandas_index(self) -> pd.Index:
@@ -52,8 +48,10 @@ def to_pandas_index(self) -> pd.Index:
         """
         raise TypeError(f"{type(self)} cannot be cast to a pandas.Index object.")
 
-    def query(self, labels: Dict[Hashable, Any]):  # pragma: no cover
-        raise NotImplementedError
+    def query(
+        self, labels: Dict[Hashable, Any]
+    ) -> Tuple[Any, Optional[Tuple["Index", IndexVars]]]:  # pragma: no cover
+        raise NotImplementedError()
 
     def equals(self, other):  # pragma: no cover
         raise NotImplementedError()
@@ -64,6 +62,13 @@ def union(self, other):  # pragma: no cover
     def intersection(self, other):  # pragma: no cover
         raise NotImplementedError()
 
+    def copy(self, deep: bool = True):  # pragma: no cover
+        raise NotImplementedError()
+
+    def __getitem__(self, indexer: Any):
+        # if not implemented, index will be dropped from the Dataset or DataArray
+        raise NotImplementedError()
+
 
 def _sanitize_slice_element(x):
     from .dataarray import DataArray
@@ -138,64 +143,68 @@ def get_indexer_nd(index, labels, method=None, tolerance=None):
     return indexer
 
 
-class PandasIndex(Index, ExplicitlyIndexedNDArrayMixin):
-    """Wrap a pandas.Index to preserve dtypes and handle explicit indexing."""
+class PandasIndex(Index):
+    """Wrap a pandas.Index as an xarray compatible index."""
 
-    __slots__ = ("array", "_dtype")
+    __slots__ = ("index", "dim")
 
-    def __init__(
-        self, array: Any, dtype: DTypeLike = None, coord_name: Optional[Hashable] = None
-    ):
-        if coord_name is None:
-            coord_name = tuple()
-        super().__init__(coord_name)
+    def __init__(self, array: Any, dim: Hashable):
+        self.index = utils.safe_cast_to_index(array)
+        self.dim = dim
 
-        self.array = utils.safe_cast_to_index(array)
+    @classmethod
+    def from_variables(cls, variables: Mapping[Hashable, "Variable"]):
+        from .variable import IndexVariable
 
-        if dtype is None:
-            if isinstance(array, pd.PeriodIndex):
-                dtype_ = np.dtype("O")
-            elif hasattr(array, "categories"):
-                # category isn't a real numpy dtype
-                dtype_ = array.categories.dtype
-            elif not utils.is_valid_numpy_dtype(array.dtype):
-                dtype_ = np.dtype("O")
-            else:
-                dtype_ = array.dtype
+        if len(variables) != 1:
+            raise ValueError(
+                f"PandasIndex only accepts one variable, found {len(variables)} variables"
+            )
+
+        name, var = next(iter(variables.items()))
+
+        if var.ndim != 1:
+            raise ValueError(
+                "PandasIndex only accepts a 1-dimensional variable, "
+                f"variable {name!r} has {var.ndim} dimensions"
+            )
+
+        dim = var.dims[0]
+
+        obj = cls(var.data, dim)
+
+        data = PandasIndexingAdapter(obj.index)
+        index_var = IndexVariable(
+            dim, data, attrs=var.attrs, encoding=var.encoding, fastpath=True
+        )
+
+        return obj, {name: index_var}
+
+    @classmethod
+    def from_pandas_index(cls, index: pd.Index, dim: Hashable):
+        from .variable import IndexVariable
+
+        if index.name is None:
+            name = dim
+            index = index.copy()
+            index.name = dim
         else:
-            dtype_ = np.dtype(dtype)  # type: ignore[assignment]
-        self._dtype = dtype_
+            name = index.name
+
+        data = PandasIndexingAdapter(index)
+        index_var = IndexVariable(dim, data, fastpath=True)
+
+        return cls(index, dim), {name: index_var}
 
     def to_pandas_index(self) -> pd.Index:
-        return self.array
-
-    @property
-    def dtype(self) -> np.dtype:
-        return self._dtype
-
-    def __array__(self, dtype: DTypeLike = None) -> np.ndarray:
-        if dtype is None:
-            dtype = self.dtype
-        array = self.array
-        if isinstance(array, pd.PeriodIndex):
-            with suppress(AttributeError):
-                # this might not be public API
-                array = array.astype("object")
-        return np.asarray(array.values, dtype=dtype)
-
-    @property
-    def shape(self) -> Tuple[int]:
-        return (len(self.array),)
+        return self.index
 
-    def query(
-        self, labels, method=None, tolerance=None
-    ) -> Tuple[Any, Union["PandasIndex", None]]:
+    def query(self, labels, method=None, tolerance=None):
         assert len(labels) == 1
         coord_name, label = next(iter(labels.items()))
-        index = self.array
 
         if isinstance(label, slice):
-            indexer = _query_slice(index, label, coord_name, method, tolerance)
+            indexer = _query_slice(self.index, label, coord_name, method, tolerance)
         elif is_dict_like(label):
             raise ValueError(
                 "cannot use a dict-like object for selection on "
@@ -210,7 +219,7 @@ def query(
             if label.ndim == 0:
                 # see https://github.com/pydata/xarray/pull/4292 for details
                 label_value = label[()] if label.dtype.kind in "mM" else label.item()
-                if isinstance(index, pd.CategoricalIndex):
+                if isinstance(self.index, pd.CategoricalIndex):
                     if method is not None:
                         raise ValueError(
                             "'method' is not a valid kwarg when indexing using a CategoricalIndex."
@@ -219,115 +228,114 @@ def query(
                         raise ValueError(
                             "'tolerance' is not a valid kwarg when indexing using a CategoricalIndex."
                         )
-                    indexer = index.get_loc(label_value)
+                    indexer = self.index.get_loc(label_value)
                 else:
-                    indexer = index.get_loc(
+                    indexer = self.index.get_loc(
                         label_value, method=method, tolerance=tolerance
                     )
             elif label.dtype.kind == "b":
                 indexer = label
             else:
-                indexer = get_indexer_nd(index, label, method, tolerance)
+                indexer = get_indexer_nd(self.index, label, method, tolerance)
                 if np.any(indexer < 0):
                     raise KeyError(f"not all values found in index {coord_name!r}")
 
         return indexer, None
 
     def equals(self, other):
-        if isinstance(other, pd.Index):
-            other = type(self)(other)
-        return self.array.equals(other.array)
+        return self.index.equals(other.index)
 
     def union(self, other):
-        if isinstance(other, pd.Index):
-            other = type(self)(other)
-        return type(self)(self.array.union(other.array))
+        new_index = self.index.union(other.index)
+        return type(self)(new_index, self.dim)
 
     def intersection(self, other):
-        if isinstance(other, pd.Index):
-            other = PandasIndex(other)
-        return type(self)(self.array.intersection(other.array))
-
-    def __getitem__(
-        self, indexer
-    ) -> Union[
-        "PandasIndex",
-        NumpyIndexingAdapter,
-        np.ndarray,
-        np.datetime64,
-        np.timedelta64,
-    ]:
-        key = indexer.tuple
-        if isinstance(key, tuple) and len(key) == 1:
-            # unpack key so it can index a pandas.Index object (pandas.Index
-            # objects don't like tuples)
-            (key,) = key
-
-        if getattr(key, "ndim", 0) > 1:  # Return np-array if multidimensional
-            return NumpyIndexingAdapter(self.array.values)[indexer]
-
-        result = self.array[key]
-
-        if isinstance(result, pd.Index):
-            result = type(self)(result, dtype=self.dtype)
-        else:
-            # result is a scalar
-            if result is pd.NaT:
-                # work around the impossibility of casting NaT with asarray
-                # note: it probably would be better in general to return
-                # pd.Timestamp rather np.than datetime64 but this is easier
-                # (for now)
-                result = np.datetime64("NaT", "ns")
-            elif isinstance(result, timedelta):
-                result = np.timedelta64(getattr(result, "value", result), "ns")
-            elif isinstance(result, pd.Timestamp):
-                # Work around for GH: pydata/xarray#1932 and numpy/numpy#10668
-                # numpy fails to convert pd.Timestamp to np.datetime64[ns]
-                result = np.asarray(result.to_datetime64())
-            elif self.dtype != object:
-                result = np.asarray(result, dtype=self.dtype)
-
-            # as for numpy.ndarray indexing, we always want the result to be
-            # a NumPy array.
-            result = utils.to_0d_array(result)
-
-        return result
-
-    def transpose(self, order) -> pd.Index:
-        return self.array  # self.array should be always one-dimensional
-
-    def __repr__(self) -> str:
-        return f"{type(self).__name__}(array={self.array!r}, dtype={self.dtype!r})"
-
-    def copy(self, deep: bool = True) -> "PandasIndex":
-        # Not the same as just writing `self.array.copy(deep=deep)`, as
-        # shallow copies of the underlying numpy.ndarrays become deep ones
-        # upon pickling
-        # >>> len(pickle.dumps((self.array, self.array)))
-        # 4000281
-        # >>> len(pickle.dumps((self.array, self.array.copy(deep=False))))
-        # 8000341
-        array = self.array.copy(deep=True) if deep else self.array
-        return type(self)(array, self._dtype)
+        new_index = self.index.intersection(other.index)
+        return type(self)(new_index, self.dim)
+
+    def copy(self, deep=True):
+        return type(self)(self.index.copy(deep=deep), self.dim)
+
+    def __getitem__(self, indexer: Any):
+        return type(self)(self.index[indexer], self.dim)
+
+
+def _create_variables_from_multiindex(index, dim, level_meta=None):
+    from .variable import IndexVariable
+
+    if level_meta is None:
+        level_meta = {}
+
+    variables = {}
+
+    dim_coord_adapter = PandasMultiIndexingAdapter(index)
+    variables[dim] = IndexVariable(
+        dim, LazilyIndexedArray(dim_coord_adapter), fastpath=True
+    )
+
+    for level in index.names:
+        meta = level_meta.get(level, {})
+        data = PandasMultiIndexingAdapter(
+            index, dtype=meta.get("dtype"), level=level, adapter=dim_coord_adapter
+        )
+        variables[level] = IndexVariable(
+            dim,
+            data,
+            attrs=meta.get("attrs"),
+            encoding=meta.get("encoding"),
+            fastpath=True,
+        )
+
+    return variables
 
 
 class PandasMultiIndex(PandasIndex):
-    def query(
-        self, labels, method=None, tolerance=None
-    ) -> Tuple[Any, Union["PandasIndex", None]]:
+    @classmethod
+    def from_variables(cls, variables: Mapping[Hashable, "Variable"]):
+        if any([var.ndim != 1 for var in variables.values()]):
+            raise ValueError("PandasMultiIndex only accepts 1-dimensional variables")
+
+        dims = set([var.dims for var in variables.values()])
+        if len(dims) != 1:
+            raise ValueError(
+                "unmatched dimensions for variables "
+                + ",".join([str(k) for k in variables])
+            )
+
+        dim = next(iter(dims))[0]
+        index = pd.MultiIndex.from_arrays(
+            [var.values for var in variables.values()], names=variables.keys()
+        )
+        obj = cls(index, dim)
+
+        level_meta = {
+            name: {"dtype": var.dtype, "attrs": var.attrs, "encoding": var.encoding}
+            for name, var in variables.items()
+        }
+        index_vars = _create_variables_from_multiindex(
+            index, dim, level_meta=level_meta
+        )
+
+        return obj, index_vars
+
+    @classmethod
+    def from_pandas_index(cls, index: pd.MultiIndex, dim: Hashable):
+        index_vars = _create_variables_from_multiindex(index, dim)
+        return cls(index, dim), index_vars
+
+    def query(self, labels, method=None, tolerance=None):
         if method is not None or tolerance is not None:
             raise ValueError(
                 "multi-index does not support ``method`` and ``tolerance``"
             )
 
-        index = self.array
         new_index = None
 
         # label(s) given for multi-index level(s)
-        if all([lbl in index.names for lbl in labels]):
+        if all([lbl in self.index.names for lbl in labels]):
             is_nested_vals = _is_nested_tuple(tuple(labels.values()))
-            if len(labels) == index.nlevels and not is_nested_vals:
-                indexer = index.get_loc(tuple(labels[k] for k in index.names))
+            if len(labels) == self.index.nlevels and not is_nested_vals:
+                indexer = self.index.get_loc(tuple(labels[k] for k in self.index.names))
             else:
                 for k, v in labels.items():
                     # index should be an item (i.e. Hashable) not an array-like
@@ -336,7 +344,7 @@ def query(
                             "Vectorized selection is not "
                             f"available along coordinate {k!r} (multi-index level)"
                         )
-                indexer, new_index = index.get_loc_level(
+                indexer, new_index = self.index.get_loc_level(
                     tuple(labels.values()), level=tuple(labels.keys())
                 )
                 # GH2619. Raise a KeyError if nothing is chosen
@@ -346,16 +354,18 @@ def query(
         # assume one label value given for the multi-index "array" (dimension)
         else:
             if len(labels) > 1:
-                coord_name = next(iter(set(labels) - set(index.names)))
+                coord_name = next(iter(set(labels) - set(self.index.names)))
                 raise ValueError(
                     f"cannot provide labels for both coordinate {coord_name!r} (multi-index array) "
-                    f"and one or more coordinates among {index.names!r} (multi-index levels)"
+                    f"and one or more coordinates among {self.index.names!r} (multi-index levels)"
                 )
 
             coord_name, label = next(iter(labels.items()))
 
             if is_dict_like(label):
-                invalid_levels = [name for name in label if name not in index.names]
+                invalid_levels = [
+                    name for name in label if name not in self.index.names
+                ]
                 if invalid_levels:
                     raise ValueError(
                         f"invalid multi-index level names {invalid_levels}"
@@ -363,15 +373,15 @@ def query(
                 return self.query(label)
 
             elif isinstance(label, slice):
-                indexer = _query_slice(index, label, coord_name)
+                indexer = _query_slice(self.index, label, coord_name)
 
             elif isinstance(label, tuple):
                 if _is_nested_tuple(label):
-                    indexer = index.get_locs(label)
-                elif len(label) == index.nlevels:
-                    indexer = index.get_loc(label)
+                    indexer = self.index.get_locs(label)
+                elif len(label) == self.index.nlevels:
+                    indexer = self.index.get_loc(label)
                 else:
-                    indexer, new_index = index.get_loc_level(
+                    indexer, new_index = self.index.get_loc_level(
                         label, level=list(range(len(label)))
                     )
 
@@ -382,7 +392,7 @@ def query(
                     else _asarray_tuplesafe(label)
                 )
                 if label.ndim == 0:
-                    indexer, new_index = index.get_loc_level(label.item(), level=0)
+                    indexer, new_index = self.index.get_loc_level(label.item(), level=0)
                 elif label.dtype.kind == "b":
                     indexer = label
                 else:
@@ -391,21 +401,20 @@ def query(
                             "Vectorized selection is not available along "
                             f"coordinate {coord_name!r} with a multi-index"
                         )
-                    indexer = get_indexer_nd(index, label)
+                    indexer = get_indexer_nd(self.index, label)
                     if np.any(indexer < 0):
                         raise KeyError(f"not all values found in index {coord_name!r}")
 
         if new_index is not None:
-            new_index = PandasIndex(new_index)
-
-        return indexer, new_index
-
-
-def wrap_pandas_index(index):
-    if isinstance(index, pd.MultiIndex):
-        return PandasMultiIndex(index)
-    else:
-        return PandasIndex(index)
+            if isinstance(new_index, pd.MultiIndex):
+                new_index, new_vars = PandasMultiIndex.from_pandas_index(
+                    new_index, self.dim
+                )
+            else:
+                new_index, new_vars = PandasIndex.from_pandas_index(new_index, self.dim)
+            return indexer, (new_index, new_vars)
+        else:
+            return indexer, None
 
 
 def remove_unused_levels_categories(index: pd.Index) -> pd.Index:
@@ -492,7 +501,13 @@ def isel_variable_and_index(
     index: Index,
     indexers: Mapping[Hashable, Union[int, slice, np.ndarray, "Variable"]],
 ) -> Tuple["Variable", Optional[Index]]:
-    """Index a Variable and pandas.Index together."""
+    """Index a Variable and an Index together.
+
+    If the index cannot be indexed, return None (it will be dropped).
+
+    (note: not yet compatible with xarray flexible indexes).
+
+    """
     from .variable import Variable
 
     if not indexers:
@@ -515,8 +530,11 @@ def isel_variable_and_index(
     indexer = indexers[dim]
     if isinstance(indexer, Variable):
         indexer = indexer.data
-    pd_index = index.to_pandas_index()
-    new_index = wrap_pandas_index(pd_index[indexer])
+    try:
+        new_index = index[indexer]
+    except NotImplementedError:
+        new_index = None
+
     return new_variable, new_index
 
 
@@ -528,7 +546,7 @@ def roll_index(index: PandasIndex, count: int, axis: int = 0) -> PandasIndex:
         new_idx = pd_index[-count:].append(pd_index[:-count])
     else:
         new_idx = pd_index[:]
-    return PandasIndex(new_idx)
+    return PandasIndex(new_idx, index.dim)
 
 
 def propagate_indexes(
diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py
index 1ace4db241d..70994a36ac8 100644
--- a/xarray/core/indexing.py
+++ b/xarray/core/indexing.py
@@ -2,12 +2,15 @@
 import functools
 import operator
 from collections import defaultdict
-from typing import Any, Callable, Iterable, List, Tuple, Union
+from contextlib import suppress
+from datetime import timedelta
+from typing import Any, Callable, Iterable, List, Optional, Tuple, Union
 
 import numpy as np
 import pandas as pd
 
 from . import duck_array_ops, nputils, utils
+from .npcompat import DTypeLike
 from .pycompat import (
     dask_array_type,
     dask_version,
@@ -569,9 +572,7 @@ def as_indexable(array):
     if isinstance(array, np.ndarray):
         return NumpyIndexingAdapter(array)
     if isinstance(array, pd.Index):
-        from .indexes import PandasIndex
-
-        return PandasIndex(array)
+        return PandasIndexingAdapter(array)
     if isinstance(array, dask_array_type):
         return DaskIndexingAdapter(array)
     if hasattr(array, "__array_function__"):
@@ -1259,3 +1260,149 @@ def __setitem__(self, key, value):
 
     def transpose(self, order):
         return self.array.transpose(order)
+
+
+class PandasIndexingAdapter(ExplicitlyIndexedNDArrayMixin):
+    """Wrap a pandas.Index to preserve dtypes and handle explicit indexing."""
+
+    __slots__ = ("array", "_dtype")
+
+    def __init__(self, array: pd.Index, dtype: DTypeLike = None):
+        self.array = utils.safe_cast_to_index(array)
+
+        if dtype is None:
+            if isinstance(array, pd.PeriodIndex):
+                dtype_ = np.dtype("O")
+            elif hasattr(array, "categories"):
+                # category isn't a real numpy dtype
+                dtype_ = array.categories.dtype
+            elif not utils.is_valid_numpy_dtype(array.dtype):
+                dtype_ = np.dtype("O")
+            else:
+                dtype_ = array.dtype
+        else:
+            dtype_ = np.dtype(dtype)  # type: ignore[assignment]
+        self._dtype = dtype_
+
+    @property
+    def dtype(self) -> np.dtype:
+        return self._dtype
+
+    def __array__(self, dtype: DTypeLike = None) -> np.ndarray:
+        if dtype is None:
+            dtype = self.dtype
+        array = self.array
+        if isinstance(array, pd.PeriodIndex):
+            with suppress(AttributeError):
+                # this might not be public API
+                array = array.astype("object")
+        return np.asarray(array.values, dtype=dtype)
+
+    @property
+    def shape(self) -> Tuple[int]:
+        return (len(self.array),)
+
+    def __getitem__(
+        self, indexer
+    ) -> Union[
+        "PandasIndexingAdapter",
+        NumpyIndexingAdapter,
+        np.ndarray,
+        np.datetime64,
+        np.timedelta64,
+    ]:
+        key = indexer.tuple
+        if isinstance(key, tuple) and len(key) == 1:
+            # unpack key so it can index a pandas.Index object (pandas.Index
+            # objects don't like tuples)
+            (key,) = key
+
+        if getattr(key, "ndim", 0) > 1:  # Return np-array if multidimensional
+            return NumpyIndexingAdapter(self.array.values)[indexer]
+
+        result = self.array[key]
+
+        if isinstance(result, pd.Index):
+            result = type(self)(result, dtype=self.dtype)
+        else:
+            # result is a scalar
+            if result is pd.NaT:
+                # work around the impossibility of casting NaT with asarray
+                # note: it probably would be better in general to return
+                # pd.Timestamp rather than np.datetime64 but this is easier
+                # (for now)
+                result = np.datetime64("NaT", "ns")
+            elif isinstance(result, timedelta):
+                result = np.timedelta64(getattr(result, "value", result), "ns")
+            elif isinstance(result, pd.Timestamp):
+                # Work around for GH: pydata/xarray#1932 and numpy/numpy#10668
+                # numpy fails to convert pd.Timestamp to np.datetime64[ns]
+                result = np.asarray(result.to_datetime64())
+            elif self.dtype != object:
+                result = np.asarray(result, dtype=self.dtype)
+
+            # as for numpy.ndarray indexing, we always want the result to be
+            # a NumPy array.
+            result = utils.to_0d_array(result)
+
+        return result
+
+    def transpose(self, order) -> pd.Index:
+        return self.array  # self.array should be always one-dimensional
+
+    def __repr__(self) -> str:
+        return f"{type(self).__name__}(array={self.array!r}, dtype={self.dtype!r})"
+
+    def copy(self, deep: bool = True) -> "PandasIndexingAdapter":
+        # Not the same as just writing `self.array.copy(deep=deep)`, as
+        # shallow copies of the underlying numpy.ndarrays become deep ones
+        # upon pickling
+        # >>> len(pickle.dumps((self.array, self.array)))
+        # 4000281
+        # >>> len(pickle.dumps((self.array, self.array.copy(deep=False))))
+        # 8000341
+        array = self.array.copy(deep=True) if deep else self.array
+        return type(self)(array, self._dtype)
+
+
+class PandasMultiIndexingAdapter(PandasIndexingAdapter):
+    """Handles explicit indexing for a pandas.MultiIndex.
+
+    This allows creating one instance for each multi-index level while
+    preserving indexing efficiency (memoized + might reuse another instance with
+    the same multi-index).
+
+    """
+
+    __slots__ = ("array", "_dtype", "level", "adapter")
+
+    def __init__(
+        self,
+        array: pd.MultiIndex,
+        dtype: DTypeLike = None,
+        level: Optional[str] = None,
+        adapter: Optional[PandasIndexingAdapter] = None,
+    ):
+        super().__init__(array, dtype)
+        self.level = level
+        self.adapter = adapter
+
+    def __array__(self, dtype: DTypeLike = None) -> np.ndarray:
+        if self.level is not None:
+            return self.array.get_level_values(self.level).values
+        else:
+            return super().__array__(dtype)
+
+    @functools.lru_cache(1)
+    def __getitem__(self, indexer):
+        if self.adapter is None:
+            return super().__getitem__(indexer)
+        else:
+            return self.adapter.__getitem__(indexer)
+
+    def __repr__(self) -> str:
+        if self.level is None:
+            return super().__repr__()
+        else:
+            props = f"(array={self.array!r}, level={self.level!r}, dtype={self.dtype!r})"
+            return f"{type(self).__name__}{props}"
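The new PandasMultiIndexingAdapter above backs each multi-index level with the same underlying pandas.MultiIndex: one adapter serves the dimension coordinate and one adapter per level reuses it. A small sketch of the intended wiring (normally done by `_create_variables_from_multiindex` in indexes.py, not by hand):

    import numpy as np
    import pandas as pd
    from xarray.core.indexing import PandasMultiIndexingAdapter

    midx = pd.MultiIndex.from_product([["a", "b"], [1, 2]], names=("one", "two"))

    dim_adapter = PandasMultiIndexingAdapter(midx)
    level_one = PandasMultiIndexingAdapter(midx, level="one", adapter=dim_adapter)

    print(np.asarray(level_one))  # ['a' 'a' 'b' 'b'] via get_level_values("one")
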
diff --git a/xarray/core/merge.py b/xarray/core/merge.py
index db5b95fd415..b8b32bdaa01 100644
--- a/xarray/core/merge.py
+++ b/xarray/core/merge.py
@@ -578,7 +578,7 @@ def merge_core(
     combine_attrs: Optional[str] = "override",
     priority_arg: Optional[int] = None,
     explicit_coords: Optional[Sequence] = None,
-    indexes: Optional[Mapping[Hashable, Index]] = None,
+    indexes: Optional[Mapping[Hashable, Any]] = None,
     fill_value: object = dtypes.NA,
 ) -> _MergeResult:
     """Core logic for merging labeled objects.
@@ -601,7 +601,8 @@ def merge_core(
     explicit_coords : set, optional
         An explicit list of variables from `objects` that are coordinates.
     indexes : dict, optional
-        Dictionary with values given by pandas.Index objects.
+        Dictionary with values given by xarray.Index objects or anything that
+        may be cast to pandas.Index objects.
     fill_value : scalar, optional
         Value to use for newly missing values
 
@@ -979,8 +980,14 @@ def dataset_update_method(
                     other[key] = value.drop_vars(coord_names)
 
     # use ds.coords and not ds.indexes, else str coords are cast to object
-    # TODO: benbovy - flexible indexes: fix this (it only works with pandas indexes)
-    indexes = {key: PandasIndex(dataset.coords[key]) for key in dataset.xindexes.keys()}
+    # TODO: benbovy - flexible indexes: make it work with any xarray index
+    indexes = {}
+    for key, index in dataset.xindexes.items():
+        if isinstance(index, PandasIndex):
+            indexes[key] = dataset.coords[key]
+        else:
+            indexes[key] = index
+
     return merge_core(
         [dataset, other],
         priority_arg=1,
diff --git a/xarray/core/parallel.py b/xarray/core/parallel.py
index 795d2e48afe..2c7f4249b5e 100644
--- a/xarray/core/parallel.py
+++ b/xarray/core/parallel.py
@@ -27,8 +27,6 @@
 
 import numpy as np
 
-from xarray.core.indexes import PandasIndex
-
 from .alignment import align
 from .dataarray import DataArray
 from .dataset import Dataset
@@ -504,16 +502,10 @@ def subset_dataset_to_block(
         }
         expected["data_vars"] = set(template.data_vars.keys())  # type: ignore[assignment]
         expected["coords"] = set(template.coords.keys())  # type: ignore[assignment]
-        # TODO: benbovy - flexible indexes: clean this up
-        # for now assumes pandas index (thus can be indexed) but it won't be the case for
-        # all indexes
-        expected_indexes = {}
-        for dim in indexes:
-            idx = indexes[dim].to_pandas_index()[
-                _get_chunk_slicer(dim, chunk_index, output_chunk_bounds)
-            ]
-            expected_indexes[dim] = PandasIndex(idx)
-        expected["indexes"] = expected_indexes
+        expected["indexes"] = {
+            dim: indexes[dim][_get_chunk_slicer(dim, chunk_index, output_chunk_bounds)]
+            for dim in indexes
+        }
 
         from_wrapper = (gname,) + chunk_tuple
         graph[from_wrapper] = (_wrapper, func, blocked_args, kwargs, is_array, expected)
@@ -558,7 +550,13 @@ def subset_dataset_to_block(
         },
     )
 
-    result = Dataset(coords=indexes, attrs=template.attrs)
+    # TODO: benbovy - flexible indexes: make it work with custom indexes
+    # this will need to pass both indexes and coords to the Dataset constructor
+    result = Dataset(
+        coords={k: idx.to_pandas_index() for k, idx in indexes.items()},
+        attrs=template.attrs,
+    )
+
     for index in result.xindexes:
         result[index].attrs = template[index].attrs
         result[index].encoding = template[index].encoding
diff --git a/xarray/core/variable.py b/xarray/core/variable.py
index f69951580c7..bd89fe97494 100644
--- a/xarray/core/variable.py
+++ b/xarray/core/variable.py
@@ -25,8 +25,14 @@
 from . import common, dtypes, duck_array_ops, indexing, nputils, ops, utils
 from .arithmetic import VariableArithmetic
 from .common import AbstractArray
-from .indexes import PandasIndex, wrap_pandas_index
-from .indexing import BasicIndexer, OuterIndexer, VectorizedIndexer, as_indexable
+from .indexes import PandasIndex, PandasMultiIndex
+from .indexing import (
+    BasicIndexer,
+    OuterIndexer,
+    PandasIndexingAdapter,
+    VectorizedIndexer,
+    as_indexable,
+)
 from .options import _get_keep_attrs
 from .pycompat import (
     DuckArrayModule,
@@ -170,11 +176,11 @@ def _maybe_wrap_data(data):
     Put pandas.Index and numpy.ndarray arguments in adapter objects to ensure
     they can be indexed properly.
 
-    NumpyArrayAdapter, PandasIndex and LazilyIndexedArray should
+    NumpyArrayAdapter, PandasIndexingAdapter and LazilyIndexedArray should
     all pass through unmodified.
     """
     if isinstance(data, pd.Index):
-        return wrap_pandas_index(data)
+        return PandasIndexingAdapter(data)
     return data
 
 
@@ -331,7 +337,9 @@ def nbytes(self):
 
     @property
     def _in_memory(self):
-        return isinstance(self._data, (np.ndarray, np.number, PandasIndex)) or (
+        return isinstance(
+            self._data, (np.ndarray, np.number, PandasIndexingAdapter)
+        ) or (
             isinstance(self._data, indexing.MemoryCachedArray)
             and isinstance(self._data.array, indexing.NumpyIndexingAdapter)
         )
@@ -539,7 +547,14 @@ def to_index_variable(self):
     def _to_xindex(self):
         # temporary function used internally as a replacement of to_index()
         # returns an xarray Index instance instead of a pd.Index instance
-        return wrap_pandas_index(self.to_index())
+        index_var = self.to_index_variable()
+        index = index_var.to_index()
+        dim = index_var.dims[0]
+
+        if isinstance(index, pd.MultiIndex):
+            return PandasMultiIndex(index, dim)
+        else:
+            return PandasIndex(index, dim)
 
     def to_index(self):
         """Convert this variable to a pandas.Index"""
@@ -2571,8 +2586,8 @@ def __init__(self, dims, data, attrs=None, encoding=None, fastpath=False):
             raise ValueError(f"{type(self).__name__} objects must be 1-dimensional")
 
         # Unlike in Variable, always eagerly load values into memory
-        if not isinstance(self._data, PandasIndex):
-            self._data = PandasIndex(self._data)
+        if not isinstance(self._data, PandasIndexingAdapter):
+            self._data = PandasIndexingAdapter(self._data)
 
     def __dask_tokenize__(self):
         from dask.base import normalize_token
@@ -2907,7 +2922,7 @@ def assert_unique_multiindex_level_names(variables):
     level_names = defaultdict(list)
     all_level_names = set()
     for var_name, var in variables.items():
-        if isinstance(var._data, PandasIndex):
+        if isinstance(var._data, PandasIndexingAdapter):
             idx_level_names = var.to_index_variable().level_names
             if idx_level_names is not None:
                 for n in idx_level_names:
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index ed4b80587e5..3bbc2c93b31 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -42,7 +42,7 @@
 from xarray.backends.scipy_ import ScipyBackendEntrypoint
 from xarray.coding.variables import SerializationWarning
 from xarray.conventions import encode_dataset_coordinates
-from xarray.core import indexes, indexing
+from xarray.core import indexing
 from xarray.core.options import set_options
 from xarray.core.pycompat import dask_array_type
 from xarray.tests import LooseVersion, mock
@@ -738,7 +738,7 @@ def find_and_validate_array(obj):
                 elif isinstance(obj.array, dask_array_type):
                     assert isinstance(obj, indexing.DaskIndexingAdapter)
                 elif isinstance(obj.array, pd.Index):
-                    assert isinstance(obj, indexes.PandasIndex)
+                    assert isinstance(obj, indexing.PandasIndexingAdapter)
                 else:
                     raise TypeError(
                         "{} is wrapped by {}".format(type(obj.array), type(obj))
diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py
index 012b070f1ee..8ab8bc872da 100644
--- a/xarray/tests/test_dataarray.py
+++ b/xarray/tests/test_dataarray.py
@@ -150,7 +150,9 @@ def test_data_property(self):
     def test_indexes(self):
         array = DataArray(np.zeros((2, 3)), [("x", [0, 1]), ("y", ["a", "b", "c"])])
         expected_indexes = {"x": pd.Index([0, 1]), "y": pd.Index(["a", "b", "c"])}
-        expected_xindexes = {k: PandasIndex(idx) for k, idx in expected_indexes.items()}
+        expected_xindexes = {
+            k: PandasIndex(idx, k) for k, idx in expected_indexes.items()
+        }
         assert array.xindexes.keys() == expected_xindexes.keys()
         assert array.indexes.keys() == expected_indexes.keys()
         assert all([isinstance(idx, pd.Index) for idx in array.indexes.values()])
@@ -1473,7 +1475,7 @@ def test_coords_alignment(self):
     def test_set_coords_update_index(self):
         actual = DataArray([1, 2, 3], [("x", [1, 2, 3])])
         actual.coords["x"] = ["a", "b", "c"]
-        assert actual.xindexes["x"].equals(pd.Index(["a", "b", "c"]))
+        assert actual.xindexes["x"].to_pandas_index().equals(pd.Index(["a", "b", "c"]))
 
     def test_coords_replacement_alignment(self):
         # regression test for GH725
@@ -1637,15 +1639,6 @@ def test_init_value(self):
             DataArray(np.array(1), coords=[("x", np.arange(10))])
 
     def test_swap_dims(self):
-        array = DataArray(np.random.randn(3), {"y": ("x", list("abc"))}, "x")
-        expected = DataArray(array.values, {"y": list("abc")}, dims="y")
-        actual = array.swap_dims({"x": "y"})
-        assert_identical(expected, actual)
-        for dim_name in set().union(expected.xindexes.keys(), actual.xindexes.keys()):
-            pd.testing.assert_index_equal(
-                expected.xindexes[dim_name].array, actual.xindexes[dim_name].array
-            )
-
         array = DataArray(np.random.randn(3), {"x": list("abc")}, "x")
         expected = DataArray(array.values, {"x": ("y", list("abc"))}, dims="y")
         actual = array.swap_dims({"x": "y"})
diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py
index 02d27ade161..8e39bbdd83e 100644
--- a/xarray/tests/test_dataset.py
+++ b/xarray/tests/test_dataset.py
@@ -730,7 +730,7 @@ def test_coords_modify(self):
     def test_update_index(self):
         actual = Dataset(coords={"x": [1, 2, 3]})
         actual["x"] = ["a", "b", "c"]
-        assert actual.xindexes["x"].equals(pd.Index(["a", "b", "c"]))
+        assert actual.xindexes["x"].to_pandas_index().equals(pd.Index(["a", "b", "c"]))
 
     def test_coords_setitem_with_new_dimension(self):
         actual = Dataset()
@@ -3559,6 +3559,7 @@ def test_setitem_align_new_indexes(self):
     def test_setitem_str_dtype(self, dtype):
 
         ds = xr.Dataset(coords={"x": np.array(["x", "y"], dtype=dtype)})
+        # test Dataset update
         ds["foo"] = xr.DataArray(np.array([0, 0]), dims=["x"])
 
         assert np.issubdtype(ds.x.dtype, dtype)
diff --git a/xarray/tests/test_indexes.py b/xarray/tests/test_indexes.py
index defc6212228..c8ba72a253f 100644
--- a/xarray/tests/test_indexes.py
+++ b/xarray/tests/test_indexes.py
@@ -2,7 +2,9 @@
 import pandas as pd
 import pytest
 
+import xarray as xr
 from xarray.core.indexes import PandasIndex, PandasMultiIndex, _asarray_tuplesafe
+from xarray.core.variable import IndexVariable
 
 
 def test_asarray_tuplesafe():
@@ -18,9 +20,57 @@ def test_asarray_tuplesafe():
 
 
 class TestPandasIndex:
+    def test_constructor(self):
+        pd_idx = pd.Index([1, 2, 3])
+        index = PandasIndex(pd_idx, "x")
+
+        assert index.index is pd_idx
+        assert index.dim == "x"
+
+    def test_from_variables(self):
+        var = xr.Variable(
+            "x", [1, 2, 3], attrs={"unit": "m"}, encoding={"dtype": np.int32}
+        )
+
+        index, index_vars = PandasIndex.from_variables({"x": var})
+        xr.testing.assert_identical(var.to_index_variable(), index_vars["x"])
+        assert index.dim == "x"
+        assert index.index.equals(index_vars["x"].to_index())
+
+        var2 = xr.Variable(("x", "y"), [[1, 2, 3], [4, 5, 6]])
+        with pytest.raises(ValueError, match=r".*only accepts one variable.*"):
+            PandasIndex.from_variables({"x": var, "foo": var2})
+
+        with pytest.raises(
+            ValueError, match=r".*only accepts a 1-dimensional variable.*"
+        ):
+            PandasIndex.from_variables({"foo": var2})
+
+    def test_from_pandas_index(self):
+        pd_idx = pd.Index([1, 2, 3], name="foo")
+
+        index, index_vars = PandasIndex.from_pandas_index(pd_idx, "x")
+
+        assert index.dim == "x"
+        assert index.index is pd_idx
+        assert index.index.name == "foo"
+        xr.testing.assert_identical(index_vars["foo"], IndexVariable("x", [1, 2, 3]))
+
+        # test no name set for pd.Index
+        pd_idx.name = None
+        index, index_vars = PandasIndex.from_pandas_index(pd_idx, "x")
+        assert "x" in index_vars
+        assert index.index is not pd_idx
+        assert index.index.name == "x"
+
+    def test_to_pandas_index(self):
+        pd_idx = pd.Index([1, 2, 3], name="foo")
+        index = PandasIndex(pd_idx, "x")
+        assert index.to_pandas_index() is pd_idx
+
     def test_query(self):
         # TODO: add tests that aren't just for edge cases
-        index = PandasIndex(pd.Index([1, 2, 3]))
+        index = PandasIndex(pd.Index([1, 2, 3]), "x")
         with pytest.raises(KeyError, match=r"not all values found"):
             index.query({"x": [0]})
         with pytest.raises(KeyError):
@@ -29,7 +79,9 @@ def test_query(self):
             index.query({"x": {"one": 0}})
 
     def test_query_datetime(self):
-        index = PandasIndex(pd.to_datetime(["2000-01-01", "2001-01-01", "2002-01-01"]))
+        index = PandasIndex(
+            pd.to_datetime(["2000-01-01", "2001-01-01", "2002-01-01"]), "x"
+        )
         actual = index.query({"x": "2001-01-01"})
         expected = (1, None)
         assert actual == expected
@@ -38,18 +90,96 @@ def test_query_datetime(self):
         assert actual == expected
 
     def test_query_unsorted_datetime_index_raises(self):
-        index = PandasIndex(pd.to_datetime(["2001", "2000", "2002"]))
+        index = PandasIndex(pd.to_datetime(["2001", "2000", "2002"]), "x")
         with pytest.raises(KeyError):
             # pandas will try to convert this into an array indexer. We should
             # raise instead, so we can be sure the result of indexing with a
             # slice is always a view.
             index.query({"x": slice("2001", "2002")})
 
+    def test_equals(self):
+        index1 = PandasIndex([1, 2, 3], "x")
+        index2 = PandasIndex([1, 2, 3], "x")
+        assert index1.equals(index2) is True
+
+    def test_union(self):
+        index1 = PandasIndex([1, 2, 3], "x")
+        index2 = PandasIndex([4, 5, 6], "y")
+        actual = index1.union(index2)
+        assert actual.index.equals(pd.Index([1, 2, 3, 4, 5, 6]))
+        assert actual.dim == "x"
+
+    def test_intersection(self):
+        index1 = PandasIndex([1, 2, 3], "x")
+        index2 = PandasIndex([2, 3, 4], "y")
+        actual = index1.intersection(index2)
+        assert actual.index.equals(pd.Index([2, 3]))
+        assert actual.dim == "x"
+
+    def test_copy(self):
+        expected = PandasIndex([1, 2, 3], "x")
+        actual = expected.copy()
+
+        assert actual.index.equals(expected.index)
+        assert actual.index is not expected.index
+        assert actual.dim == expected.dim
+
+    def test_getitem(self):
+        pd_idx = pd.Index([1, 2, 3])
+        expected = PandasIndex(pd_idx, "x")
+        actual = expected[1:]
+
+        assert actual.index.equals(pd_idx[1:])
+        assert actual.dim == expected.dim
+
 
 class TestPandasMultiIndex:
+    def test_from_variables(self):
+        v_level1 = xr.Variable(
+            "x", [1, 2, 3], attrs={"unit": "m"}, encoding={"dtype": np.int32}
+        )
+        v_level2 = xr.Variable(
+            "x", ["a", "b", "c"], attrs={"unit": "m"}, encoding={"dtype": "U"}
+        )
+
+        index, index_vars = PandasMultiIndex.from_variables(
+            {"level1": v_level1, "level2": v_level2}
+        )
+
+        expected_idx = pd.MultiIndex.from_arrays([v_level1.data, v_level2.data])
+        assert index.dim == "x"
+        assert index.index.equals(expected_idx)
+
+        assert list(index_vars) == ["x", "level1", "level2"]
+        xr.testing.assert_equal(xr.IndexVariable("x", expected_idx), index_vars["x"])
+        xr.testing.assert_identical(v_level1.to_index_variable(), index_vars["level1"])
+        xr.testing.assert_identical(v_level2.to_index_variable(), index_vars["level2"])
+
+        var = xr.Variable(("x", "y"), [[1, 2, 3], [4, 5, 6]])
+        with pytest.raises(
+            ValueError, match=r".*only accepts 1-dimensional variables.*"
+        ):
+            PandasMultiIndex.from_variables({"var": var})
+
+        v_level3 = xr.Variable("y", [4, 5, 6])
+        with pytest.raises(ValueError, match=r"unmatched dimensions for variables.*"):
+            PandasMultiIndex.from_variables({"level1": v_level1, "level3": v_level3})
+
+    def test_from_pandas_index(self):
+        pd_idx = pd.MultiIndex.from_arrays([[1, 2, 3], [4, 5, 6]], names=("foo", "bar"))
+
+        index, index_vars = PandasMultiIndex.from_pandas_index(pd_idx, "x")
+
+        assert index.dim == "x"
+        assert index.index is pd_idx
+        assert index.index.names == ("foo", "bar")
+        xr.testing.assert_identical(index_vars["x"], IndexVariable("x", pd_idx))
+        xr.testing.assert_identical(index_vars["foo"], IndexVariable("x", [1, 2, 3]))
+        xr.testing.assert_identical(index_vars["bar"], IndexVariable("x", [4, 5, 6]))
+
     def test_query(self):
         index = PandasMultiIndex(
-            pd.MultiIndex.from_product([["a", "b"], [1, 2]], names=("one", "two"))
+            pd.MultiIndex.from_product([["a", "b"], [1, 2]], names=("one", "two")), "x"
        )
         # test tuples inside slice are considered as scalar indexer values
         assert index.query({"x": slice(("a", 1), ("b", 2))}) == (slice(0, 4), None)
diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py
index 1909d309cf5..6e4fd320029 100644
--- a/xarray/tests/test_indexing.py
+++ b/xarray/tests/test_indexing.py
@@ -81,9 +81,12 @@ def test_group_indexers_by_index(self):
 
     def test_remap_label_indexers(self):
         def test_indexer(data, x, expected_pos, expected_idx=None):
-            pos, idx = indexing.remap_label_indexers(data, {"x": x})
+            pos, new_idx_vars = indexing.remap_label_indexers(data, {"x": x})
+            idx, _ = new_idx_vars.get("x", (None, None))
+            if idx is not None:
+                idx = idx.to_pandas_index()
             assert_array_equal(pos.get("x"), expected_pos)
-            assert_array_equal(idx.get("x"), expected_idx)
+            assert_array_equal(idx, expected_idx)
 
         data = Dataset({"x": ("x", [1, 2, 3])})
         mindex = pd.MultiIndex.from_product(
diff --git a/xarray/tests/test_utils.py b/xarray/tests/test_utils.py
index 9c78caea4d6..ce796e9de49 100644
--- a/xarray/tests/test_utils.py
+++ b/xarray/tests/test_utils.py
@@ -7,7 +7,6 @@
 
 from xarray.coding.cftimeindex import CFTimeIndex
 from xarray.core import duck_array_ops, utils
-from xarray.core.indexes import PandasIndex
 from xarray.core.utils import either_dict_or_kwargs, iterate_nested
 
 from . import assert_array_equal, requires_cftime, requires_dask
@@ -29,13 +28,11 @@ def test_safe_cast_to_index():
     dates = pd.date_range("2000-01-01", periods=10)
     x = np.arange(5)
     td = x * np.timedelta64(1, "D")
-    midx = pd.MultiIndex.from_tuples([(0,)], names=["a"])
     for expected, array in [
         (dates, dates.values),
         (pd.Index(x, dtype=object), x.astype(object)),
         (pd.Index(td), td),
        (pd.Index(td, dtype=object), td.astype(object)),
-        (midx, PandasIndex(midx)),
     ]:
         actual = utils.safe_cast_to_index(array)
         assert_array_equal(expected, actual)
diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py
index 96072a4e1e0..487c9b34336 100644
--- a/xarray/tests/test_variable.py
+++ b/xarray/tests/test_variable.py
@@ -11,7 +11,6 @@
 from xarray import Coordinate, DataArray, Dataset, IndexVariable, Variable, set_options
 from xarray.core import dtypes, duck_array_ops, indexing
 from xarray.core.common import full_like, ones_like, zeros_like
-from xarray.core.indexes import PandasIndex
 from xarray.core.indexing import (
     BasicIndexer,
     CopyOnWriteArray,
@@ -20,6 +19,7 @@
     MemoryCachedArray,
     NumpyIndexingAdapter,
     OuterIndexer,
+    PandasIndexingAdapter,
     VectorizedIndexer,
 )
 from xarray.core.pycompat import dask_array_type
@@ -537,7 +537,7 @@ def test_copy_index(self):
         v = self.cls("x", midx)
         for deep in [True, False]:
             w = v.copy(deep=deep)
-            assert isinstance(w._data, PandasIndex)
+            assert isinstance(w._data, PandasIndexingAdapter)
             assert isinstance(w.to_index(), pd.MultiIndex)
             assert_array_equal(v._data.array, w._data.array)
@@ -2161,7 +2161,7 @@ def test_multiindex_default_level_names(self):
 
     def test_data(self):
         x = IndexVariable("x", np.arange(3.0))
-        assert isinstance(x._data, PandasIndex)
+        assert isinstance(x._data, PandasIndexingAdapter)
         assert isinstance(x.data, np.ndarray)
         assert float == x.dtype
         assert_array_equal(np.arange(3), x)
@@ -2303,7 +2303,7 @@ def test_coarsen_2d(self):
 
 class TestAsCompatibleData:
     def test_unchanged_types(self):
-        types = (np.asarray, PandasIndex, LazilyIndexedArray)
+        types = (np.asarray, PandasIndexingAdapter, LazilyIndexedArray)
         for t in types:
             for data in [
                 np.arange(3),
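
For reference, a hedged end-to-end sketch of what the refactor looks like from user-facing objects; the printed pandas type names may vary with the installed pandas version:

    import xarray as xr

    ds = xr.Dataset(coords={"x": [10, 20], "y": ["a", "b"]})
    for name, idx in ds.xindexes.items():
        # xindexes now holds PandasIndex/PandasMultiIndex objects, while
        # to_pandas_index() recovers the wrapped pd.Index
        print(name, type(idx).__name__, type(idx.to_pandas_index()).__name__)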