Skip to content

Commit aa4361d

Browse files
dcherianpre-commit-ci[bot]Illviljanshoyer
authored
Avoid calling np.asarray on lazy indexing classes (#6874)
* Add get_array to lazy indexing array types. This returns the underlying array type instead of always casting to np.array. This is necessary for Zarr stores where the Zarr Array wraps a cupy array (for example kvikio.zarr.GDSStore). In that case, we cannot call np.asarray because __array__ is expected to always return a numpy array. We use get_array in Variable.data to make sure we don't load arrays from such GDSStores. * Rename to short_array_repr; use Variable.data instead of always casting to np.asarray * Fix Variable.load * Make get_array recursive. * Some cleanups * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add get_array to PandasIndexingAdapter * Finish short_array_repr refactoring * Rename to get_duck_array * Try without hasattr check * Return bare array from LazilyIndexedArray.get_duck_array * Add get_duck_array to AbstractArray Clean up short_array_repr. * Fix zerodim test * Fix LazilyVectorizedIndexedArray * Inherit __array__ from ExplicitlyIndexed * Fix InaccessibleArray in tests * Fix BackendArray * reprs Use .data on AbstractArray * Force netCDF and h5netCDF to return arrays * Add whats-new * Add comments; review feedback * Fix merge. * Remove another np.asarray * Avoid np.asarray on __getitem__ for BoolTypeArray, NativeEndiannessArray * [WIP] ExplicitlyIndexedBackendArray * Handle Indexing Adapter classes explicitly. * Revert "Handle Indexing Adapter classes explicitly." This reverts commit 46d98ec. * Revert "[WIP] ExplicitlyIndexedBackendArray" This reverts commit 9b727e6. * Fix pydap now that NumpyIndexingAdapter does not automatically cast to array * Update xarray/backends/pydap_.py * Add test store. * [skip-ci] Update whats-new * Fix test * fix mypy? * Fix Zarr test * test the repr too * Guard np.asarray for scalars. 
* Revert casting to arrays in backend * Wrap numpy scalars in Explicitly*Indexed*.get_duck_array * Apply suggestions from code review Co-authored-by: Illviljan <[email protected]> Co-authored-by: Stephan Hoyer <[email protected]> * Update xarray/tests/__init__.py Co-authored-by: Illviljan <[email protected]> * Update xarray/core/indexing.py * Apply suggestions from code review Co-authored-by: Illviljan <[email protected]> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Bring back the ugly check * Update whats-new * Fix pre-commit * silence mypy error * minimize diff --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Illviljan <[email protected]> Co-authored-by: Stephan Hoyer <[email protected]>
1 parent e2e1b29 commit aa4361d

File tree

10 files changed

+178
-37
lines changed

10 files changed

+178
-37
lines changed

doc/whats-new.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,11 @@ Documentation
5757
Internal Changes
5858
~~~~~~~~~~~~~~~~
5959

60+
- Don't assume that arrays read from disk will be NumPy arrays. This is a step toward
61+
enabling reads from a Zarr store using the `Kvikio <https://docs.rapids.ai/api/kvikio/stable/api.html#zarr>`_
62+
or `TensorStore <https://google.github.io/tensorstore/>`_ libraries.
63+
(:pull:`6874`). By `Deepak Cherian <https://github.com/dcherian>`_.
64+
6065
- Remove internal support for reading GRIB files through the ``cfgrib`` backend. ``cfgrib`` now uses the external
6166
backend interface, so no existing code should break.
6267
By `Deepak Cherian <https://github.com/dcherian>`_.

xarray/backends/common.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,9 +86,9 @@ def robust_getitem(array, key, catch=Exception, max_retries=6, initial_delay=500
8686
class BackendArray(NdimSizeLenMixin, indexing.ExplicitlyIndexed):
8787
__slots__ = ()
8888

89-
def __array__(self, dtype=None):
89+
def get_duck_array(self, dtype: np.typing.DTypeLike = None):
9090
key = indexing.BasicIndexer((slice(None),) * self.ndim)
91-
return np.asarray(self[key], dtype=dtype)
91+
return self[key] # type: ignore [index]
9292

9393

9494
class AbstractDataStore:

xarray/backends/pydap_.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ def _getitem(self, key):
5454
# downloading coordinate data twice
5555
array = getattr(self.array, "array", self.array)
5656
result = robust_getitem(array, key, catch=ValueError)
57+
result = np.asarray(result)
5758
# in some cases, pydap doesn't squeeze axes automatically like numpy
5859
axis = tuple(n for n, k in enumerate(key) if isinstance(k, integer_types))
5960
if result.ndim + len(axis) != array.ndim and axis:

xarray/coding/variables.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -69,8 +69,8 @@ def dtype(self) -> np.dtype:
6969
def __getitem__(self, key):
7070
return type(self)(self.array[key], self.func, self.dtype)
7171

72-
def __array__(self, dtype=None):
73-
return self.func(self.array)
72+
def get_duck_array(self):
73+
return self.func(self.array.get_duck_array())
7474

7575
def __repr__(self) -> str:
7676
return "{}({!r}, func={!r}, dtype={!r})".format(
@@ -224,7 +224,7 @@ def decode(self, variable: Variable, name: T_Name = None):
224224

225225

226226
def _scale_offset_decoding(data, scale_factor, add_offset, dtype: np.typing.DTypeLike):
227-
data = np.array(data, dtype=dtype, copy=True)
227+
data = data.astype(dtype=dtype, copy=True)
228228
if scale_factor is not None:
229229
data *= scale_factor
230230
if add_offset is not None:

xarray/core/formatting.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from pandas.errors import OutOfBoundsDatetime
1717

1818
from xarray.core.duck_array_ops import array_equiv
19-
from xarray.core.indexing import MemoryCachedArray
19+
from xarray.core.indexing import ExplicitlyIndexed, MemoryCachedArray
2020
from xarray.core.options import OPTIONS, _get_boolean_with_default
2121
from xarray.core.pycompat import array_type
2222
from xarray.core.utils import is_duck_array
@@ -557,8 +557,15 @@ def limit_lines(string: str, *, limit: int):
557557
return string
558558

559559

560-
def short_numpy_repr(array):
561-
array = np.asarray(array)
560+
def short_array_repr(array):
561+
from xarray.core.common import AbstractArray
562+
563+
if isinstance(array, ExplicitlyIndexed):
564+
array = array.get_duck_array()
565+
elif isinstance(array, AbstractArray):
566+
array = array.data
567+
if not is_duck_array(array):
568+
array = np.asarray(array)
562569

563570
# default to lower precision so a full (abbreviated) line can fit on
564571
# one line with the default display_width
@@ -582,11 +589,11 @@ def short_data_repr(array):
582589
"""Format "data" for DataArray and Variable."""
583590
internal_data = getattr(array, "variable", array)._data
584591
if isinstance(array, np.ndarray):
585-
return short_numpy_repr(array)
592+
return short_array_repr(array)
586593
elif is_duck_array(internal_data):
587594
return limit_lines(repr(array.data), limit=40)
588595
elif array._in_memory:
589-
return short_numpy_repr(array)
596+
return short_array_repr(array)
590597
else:
591598
# internal xarray array type
592599
return f"[{array.size} values with dtype={array.dtype}]"
@@ -831,7 +838,7 @@ def diff_array_repr(a, b, compat):
831838
equiv = array_equiv
832839

833840
if not equiv(a.data, b.data):
834-
temp = [wrap_indent(short_numpy_repr(obj), start=" ") for obj in (a, b)]
841+
temp = [wrap_indent(short_array_repr(obj), start=" ") for obj in (a, b)]
835842
diff_data_repr = [
836843
ab_side + "\n" + ab_data_repr
837844
for ab_side, ab_data_repr in zip(("L", "R"), temp)

xarray/core/indexing.py

Lines changed: 51 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -449,13 +449,25 @@ class ExplicitlyIndexed:
449449

450450
__slots__ = ()
451451

452+
def __array__(self, dtype: np.typing.DTypeLike = None) -> np.ndarray:
453+
# Leave casting to an array up to the underlying array type.
454+
return np.asarray(self.get_duck_array(), dtype=dtype)
455+
456+
def get_duck_array(self):
457+
return self.array
458+
452459

453460
class ExplicitlyIndexedNDArrayMixin(NDArrayMixin, ExplicitlyIndexed):
454461
__slots__ = ()
455462

456-
def __array__(self, dtype=None):
463+
def get_duck_array(self):
457464
key = BasicIndexer((slice(None),) * self.ndim)
458-
return np.asarray(self[key], dtype=dtype)
465+
return self[key]
466+
467+
def __array__(self, dtype: np.typing.DTypeLike = None) -> np.ndarray:
468+
# This is necessary because we apply the indexing key in self.get_duck_array()
469+
# Note this is the base class for all lazy indexing classes
470+
return np.asarray(self.get_duck_array(), dtype=dtype)
459471

460472

461473
class ImplicitToExplicitIndexingAdapter(NDArrayMixin):
@@ -467,8 +479,11 @@ def __init__(self, array, indexer_cls=BasicIndexer):
467479
self.array = as_indexable(array)
468480
self.indexer_cls = indexer_cls
469481

470-
def __array__(self, dtype=None):
471-
return np.asarray(self.array, dtype=dtype)
482+
def __array__(self, dtype: np.typing.DTypeLike = None) -> np.ndarray:
483+
return np.asarray(self.get_duck_array(), dtype=dtype)
484+
485+
def get_duck_array(self):
486+
return self.array.get_duck_array()
472487

473488
def __getitem__(self, key):
474489
key = expanded_indexer(key, self.ndim)
@@ -531,9 +546,15 @@ def shape(self) -> tuple[int, ...]:
531546
shape.append(k.size)
532547
return tuple(shape)
533548

534-
def __array__(self, dtype=None):
535-
array = as_indexable(self.array)
536-
return np.asarray(array[self.key], dtype=None)
549+
def get_duck_array(self):
550+
array = self.array[self.key]
551+
# self.array[self.key] is now a numpy array when
552+
# self.array is a BackendArray subclass
553+
# and self.key is BasicIndexer((slice(None, None, None),))
554+
# so we need the explicit check for ExplicitlyIndexed
555+
if isinstance(array, ExplicitlyIndexed):
556+
array = array.get_duck_array()
557+
return _wrap_numpy_scalars(array)
537558

538559
def transpose(self, order):
539560
return LazilyVectorizedIndexedArray(self.array, self.key).transpose(order)
@@ -584,8 +605,15 @@ def __init__(self, array, key):
584605
def shape(self) -> tuple[int, ...]:
585606
return np.broadcast(*self.key.tuple).shape
586607

587-
def __array__(self, dtype=None):
588-
return np.asarray(self.array[self.key], dtype=None)
608+
def get_duck_array(self):
609+
array = self.array[self.key]
610+
# self.array[self.key] is now a numpy array when
611+
# self.array is a BackendArray subclass
612+
# and self.key is BasicIndexer((slice(None, None, None),))
613+
# so we need the explicit check for ExplicitlyIndexed
614+
if isinstance(array, ExplicitlyIndexed):
615+
array = array.get_duck_array()
616+
return _wrap_numpy_scalars(array)
589617

590618
def _updated_key(self, new_key):
591619
return _combine_indexers(self.key, self.shape, new_key)
@@ -631,8 +659,8 @@ def _ensure_copied(self):
631659
self.array = as_indexable(np.array(self.array))
632660
self._copied = True
633661

634-
def __array__(self, dtype=None):
635-
return np.asarray(self.array, dtype=dtype)
662+
def get_duck_array(self):
663+
return self.array.get_duck_array()
636664

637665
def __getitem__(self, key):
638666
return type(self)(_wrap_numpy_scalars(self.array[key]))
@@ -658,12 +686,14 @@ def __init__(self, array):
658686
self.array = _wrap_numpy_scalars(as_indexable(array))
659687

660688
def _ensure_cached(self):
661-
if not isinstance(self.array, NumpyIndexingAdapter):
662-
self.array = NumpyIndexingAdapter(np.asarray(self.array))
689+
self.array = as_indexable(self.array.get_duck_array())
690+
691+
def __array__(self, dtype: np.typing.DTypeLike = None) -> np.ndarray:
692+
return np.asarray(self.get_duck_array(), dtype=dtype)
663693

664-
def __array__(self, dtype=None):
694+
def get_duck_array(self):
665695
self._ensure_cached()
666-
return np.asarray(self.array, dtype=dtype)
696+
return self.array.get_duck_array()
667697

668698
def __getitem__(self, key):
669699
return type(self)(_wrap_numpy_scalars(self.array[key]))
@@ -827,7 +857,7 @@ def explicit_indexing_adapter(
827857
result = raw_indexing_method(raw_key.tuple)
828858
if numpy_indices.tuple:
829859
# index the loaded np.ndarray
830-
result = NumpyIndexingAdapter(np.asarray(result))[numpy_indices]
860+
result = NumpyIndexingAdapter(result)[numpy_indices]
831861
return result
832862

833863

@@ -1463,6 +1493,9 @@ def __array__(self, dtype: DTypeLike = None) -> np.ndarray:
14631493
array = array.astype("object")
14641494
return np.asarray(array.values, dtype=dtype)
14651495

1496+
def get_duck_array(self) -> np.ndarray:
1497+
return np.asarray(self)
1498+
14661499
@property
14671500
def shape(self) -> tuple[int, ...]:
14681501
return (len(self.array),)
@@ -1603,9 +1636,9 @@ def _repr_inline_(self, max_width: int) -> str:
16031636
return format_array_flat(self._get_array_subset(), max_width)
16041637

16051638
def _repr_html_(self) -> str:
1606-
from xarray.core.formatting import short_numpy_repr
1639+
from xarray.core.formatting import short_array_repr
16071640

1608-
array_repr = short_numpy_repr(self._get_array_subset())
1641+
array_repr = short_array_repr(self._get_array_subset())
16091642
return f"<pre>{escape(array_repr)}</pre>"
16101643

16111644
def copy(self, deep: bool = True) -> PandasMultiIndexingAdapter:

xarray/core/variable.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -426,6 +426,8 @@ def data(self) -> Any:
426426
"""
427427
if is_duck_array(self._data):
428428
return self._data
429+
elif isinstance(self._data, indexing.ExplicitlyIndexed):
430+
return self._data.get_duck_array()
429431
else:
430432
return self.values
431433

@@ -533,6 +535,8 @@ def load(self, **kwargs):
533535
"""
534536
if is_duck_dask_array(self._data):
535537
self._data = as_compatible_data(self._data.compute(**kwargs))
538+
elif isinstance(self._data, indexing.ExplicitlyIndexed):
539+
self._data = self._data.get_duck_array()
536540
elif not is_duck_array(self._data):
537541
self._data = np.asarray(self._data)
538542
return self

xarray/tests/__init__.py

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -139,13 +139,18 @@ class UnexpectedDataAccess(Exception):
139139

140140

141141
class InaccessibleArray(utils.NDArrayMixin, ExplicitlyIndexed):
142+
"""Disallows any loading."""
143+
142144
def __init__(self, array):
143145
self.array = array
144146

145-
def __getitem__(self, key):
146-
raise UnexpectedDataAccess("Tried accessing data.")
147+
def get_duck_array(self):
148+
raise UnexpectedDataAccess("Tried accessing data")
149+
150+
def __array__(self, dtype: np.typing.DTypeLike = None):
151+
raise UnexpectedDataAccess("Tried accessing data")
147152

148-
def __array__(self):
153+
def __getitem__(self, key):
149154
raise UnexpectedDataAccess("Tried accessing data.")
150155

151156

@@ -157,6 +162,23 @@ def __getitem__(self, key):
157162
return self.array[tuple_idxr]
158163

159164

165+
class DuckArrayWrapper(utils.NDArrayMixin):
166+
"""Array-like that prevents casting to array.
167+
Modeled after cupy."""
168+
169+
def __init__(self, array: np.ndarray):
170+
self.array = array
171+
172+
def __getitem__(self, key):
173+
return type(self)(self.array[key])
174+
175+
def __array__(self, dtype: np.typing.DTypeLike = None):
176+
raise UnexpectedDataAccess("Tried accessing data")
177+
178+
def __array_namespace__(self):
179+
"""Present to satisfy is_duck_array test."""
180+
181+
160182
class ReturnItem:
161183
def __getitem__(self, key):
162184
return key

0 commit comments

Comments
 (0)