Skip to content

Commit 06244df

Browse files
authored
ENH: switch Dataset and DataArray to use explicit indexes (#2639)
* ENH: switch Dataset and DataArray to use explicit indexes This change switches Dataset.indexes and DataArray.indexes to be backed by explicit dictionaries of indexes, instead of being implicitly defined by the set of coordinates with names matching dimensions. There are no changes to the public interface yet: these will come later. For now, indexes are recreated from coordinates every time a new DataArray or Dataset is created. In follow-up PRs, I will refactor indexes to be propagated explicitly in xarray operations. This will facilitate future API changes, when indexes will no longer only be associated with dimensions. * Add xarray.core.indexes * Fixes per review
1 parent 28123bb commit 06244df

File tree

4 files changed

+86
-50
lines changed

4 files changed

+86
-50
lines changed

xarray/core/coordinates.py

Lines changed: 1 addition & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,7 @@ def _update_coords(self, coords):
196196
self._data._variables = variables
197197
self._data._coord_names.update(new_coord_names)
198198
self._data._dims = dict(dims)
199+
self._data._indexes = None
199200

200201
def __delitem__(self, key):
201202
if key in self:
@@ -276,44 +277,6 @@ def __iter__(self):
276277
return iter(self._data._level_coords)
277278

278279

279-
class Indexes(Mapping, formatting.ReprMixin):
280-
"""Ordered Mapping[str, pandas.Index] for xarray objects.
281-
"""
282-
283-
def __init__(self, variables, sizes):
284-
"""Not for public consumption.
285-
286-
Parameters
287-
----------
288-
variables : OrderedDict[Any, Variable]
289-
Reference to OrderedDict holding variable objects. Should be the
290-
same dictionary used by the source object.
291-
sizes : OrderedDict[Any, int]
292-
Map from dimension names to sizes.
293-
"""
294-
self._variables = variables
295-
self._sizes = sizes
296-
297-
def __iter__(self):
298-
for key in self._sizes:
299-
if key in self._variables:
300-
yield key
301-
302-
def __len__(self):
303-
return sum(key in self._variables for key in self._sizes)
304-
305-
def __contains__(self, key):
306-
return key in self._sizes and key in self._variables
307-
308-
def __getitem__(self, key):
309-
if key not in self._sizes:
310-
raise KeyError(key)
311-
return self._variables[key].to_index()
312-
313-
def __unicode__(self):
314-
return formatting.indexes_repr(self)
315-
316-
317280
def assert_coordinate_consistent(obj, coords):
318281
""" Make sure the dimension coordinate of obj is
319282
consistent with coords.

xarray/core/dataarray.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,11 @@
1313
from .alignment import align, reindex_like_indexers
1414
from .common import AbstractArray, DataWithCoords
1515
from .coordinates import (
16-
DataArrayCoordinates, Indexes, LevelCoordinatesSource,
16+
DataArrayCoordinates, LevelCoordinatesSource,
1717
assert_coordinate_consistent, remap_label_indexers)
1818
from .dataset import Dataset, merge_indexes, split_indexes
1919
from .formatting import format_item
20+
from .indexes import default_indexes, Indexes
2021
from .options import OPTIONS
2122
from .pycompat import OrderedDict, basestring, iteritems, range, zip
2223
from .utils import (
@@ -165,7 +166,7 @@ class DataArray(AbstractArray, DataWithCoords):
165166
dt = property(DatetimeAccessor)
166167

167168
def __init__(self, data, coords=None, dims=None, name=None,
168-
attrs=None, encoding=None, fastpath=False):
169+
attrs=None, encoding=None, indexes=None, fastpath=False):
169170
"""
170171
Parameters
171172
----------
@@ -237,6 +238,10 @@ def __init__(self, data, coords=None, dims=None, name=None,
237238
self._coords = coords
238239
self._name = name
239240

241+
# TODO(shoyer): document this argument, once it becomes part of the
242+
# public interface.
243+
self._indexes = indexes
244+
240245
self._file_obj = None
241246

242247
self._initialized = True
@@ -534,9 +539,11 @@ def encoding(self, value):
534539

535540
@property
536541
def indexes(self):
537-
"""OrderedDict of pandas.Index objects used for label based indexing
542+
"""Mapping of pandas.Index objects used for label based indexing
538543
"""
539-
return Indexes(self._coords, self.sizes)
544+
if self._indexes is None:
545+
self._indexes = default_indexes(self._coords, self.dims)
546+
return Indexes(self._indexes)
540547

541548
@property
542549
def coords(self):

xarray/core/dataset.py

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,16 +13,17 @@
1313
import xarray as xr
1414

1515
from . import (
16-
alignment, dtypes, duck_array_ops, formatting, groupby, indexing, ops,
17-
pdcompat, resample, rolling, utils)
16+
alignment, dtypes, duck_array_ops, formatting, groupby,
17+
indexing, ops, pdcompat, resample, rolling, utils)
1818
from ..coding.cftimeindex import _parse_array_of_cftime_strings
1919
from .alignment import align
2020
from .common import (
2121
ALL_DIMS, DataWithCoords, ImplementsDatasetReduce,
2222
_contains_datetime_like_objects)
2323
from .coordinates import (
24-
DatasetCoordinates, Indexes, LevelCoordinatesSource,
24+
DatasetCoordinates, LevelCoordinatesSource,
2525
assert_coordinate_consistent, remap_label_indexers)
26+
from .indexes import Indexes, default_indexes
2627
from .merge import (
2728
dataset_merge_method, dataset_update_method, merge_data_and_coords,
2829
merge_variables)
@@ -364,6 +365,10 @@ def __init__(self, data_vars=None, coords=None, attrs=None,
364365
coords = {}
365366
if data_vars is not None or coords is not None:
366367
self._set_init_vars_and_dims(data_vars, coords, compat)
368+
369+
# TODO(shoyer): expose indexes as a public argument in __init__
370+
self._indexes = None
371+
367372
if attrs is not None:
368373
self.attrs = attrs
369374
self._encoding = None
@@ -642,14 +647,15 @@ def persist(self, **kwargs):
642647

643648
@classmethod
644649
def _construct_direct(cls, variables, coord_names, dims=None, attrs=None,
645-
file_obj=None, encoding=None):
650+
indexes=None, file_obj=None, encoding=None):
646651
"""Shortcut around __init__ for internal use when we want to skip
647652
costly validation
648653
"""
649654
obj = object.__new__(cls)
650655
obj._variables = variables
651656
obj._coord_names = coord_names
652657
obj._dims = dims
658+
obj._indexes = indexes
653659
obj._attrs = attrs
654660
obj._file_obj = file_obj
655661
obj._encoding = encoding
@@ -664,7 +670,8 @@ def _from_vars_and_coord_names(cls, variables, coord_names, attrs=None):
664670
return cls._construct_direct(variables, coord_names, dims, attrs)
665671

666672
def _replace_vars_and_dims(self, variables, coord_names=None, dims=None,
667-
attrs=__default_attrs, inplace=False):
673+
attrs=__default_attrs, indexes=None,
674+
inplace=False):
668675
"""Fastpath constructor for internal use.
669676
670677
Preserves coord names and attributes. If not provided explicitly,
@@ -693,13 +700,15 @@ def _replace_vars_and_dims(self, variables, coord_names=None, dims=None,
693700
self._coord_names = coord_names
694701
if attrs is not self.__default_attrs:
695702
self._attrs = attrs
703+
self._indexes = indexes
696704
obj = self
697705
else:
698706
if coord_names is None:
699707
coord_names = self._coord_names.copy()
700708
if attrs is self.__default_attrs:
701709
attrs = self._attrs_copy()
702-
obj = self._construct_direct(variables, coord_names, dims, attrs)
710+
obj = self._construct_direct(
711+
variables, coord_names, dims, attrs, indexes)
703712
return obj
704713

705714
def _replace_indexes(self, indexes):
@@ -1064,9 +1073,11 @@ def identical(self, other):
10641073

10651074
@property
10661075
def indexes(self):
1067-
"""OrderedDict of pandas.Index objects used for label based indexing
1076+
"""Mapping of pandas.Index objects used for label based indexing
10681077
"""
1069-
return Indexes(self._variables, self._dims)
1078+
if self._indexes is None:
1079+
self._indexes = default_indexes(self._variables, self._dims)
1080+
return Indexes(self._indexes)
10701081

10711082
@property
10721083
def coords(self):

xarray/core/indexes.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
from __future__ import absolute_import, division, print_function
2+
try:
3+
from collections.abc import Mapping
4+
except ImportError:
5+
from collections import Mapping
6+
from collections import OrderedDict
7+
8+
from . import formatting
9+
10+
11+
class Indexes(Mapping, formatting.ReprMixin):
    """Read-only mapping view over the indexes of a Dataset or DataArray.

    Lookups, iteration and length are all delegated to the wrapped
    dictionary; no mutating methods are defined here.
    """

    def __init__(self, indexes):
        """Not for public consumption.

        Parameters
        ----------
        indexes : Dict[Any, pandas.Index]
            Indexes held by this object. The dictionary is stored by
            reference, not copied.
        """
        self._indexes = indexes

    def __getitem__(self, key):
        return self._indexes[key]

    def __contains__(self, key):
        return key in self._indexes

    def __len__(self):
        return len(self._indexes)

    def __iter__(self):
        return iter(self._indexes)

    def __unicode__(self):
        # Text representation is produced by the shared formatting helper.
        return formatting.indexes_repr(self)
37+
38+
39+
def default_indexes(coords, dims):
    """Build the default indexes for a Dataset/DataArray.

    Parameters
    ----------
    coords : Mapping[Any, xarray.Variable]
        Coordinate variables from which to draw default indexes.
    dims : iterable
        Iterable of dimension names.

    Returns
    -------
    OrderedDict
        Maps each name in ``dims`` that is also a key of ``coords`` to the
        result of calling ``to_index()`` on that coordinate, in the order
        the names appear in ``dims``. Names missing from ``coords`` are
        skipped.
    """
    indexes = OrderedDict()
    for name in dims:
        if name in coords:
            indexes[name] = coords[name].to_index()
    return indexes

0 commit comments

Comments
 (0)