Skip to content

Commit 60a4a49

Browse files
d-v-bpre-commit-ci[bot]dcherian
authored
Cache pre-existing Zarr arrays in Zarr backend (pydata#9861)
* cache array keys on ZarrStore instances * refactor get_array_keys * add test for get_array_keys * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * lint * add type annotation for _cache * make type hint accurate * make type hint pass mypy * make type hint pass mypy, correctly * Update xarray/backends/zarr.py Co-authored-by: Deepak Cherian <[email protected]> * use roundtrip method instead of explicit alternative * cache members instead of just array keys * adjust test to members caching * refactor members cache * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * explictly revert changes to test_write_store * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * indent correctly * update instrumented tests, remove cache update functionality, and set cache default to False in test fixtures * update instrumented tests, remove cache update functionality, and set cache default to False in test fixtures * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * lint * update tests * make check_requests more permissive * update instrumented tests for zarr==2.18 * Update xarray/backends/zarr.py Co-authored-by: Deepak Cherian <[email protected]> * Update xarray/backends/zarr.py Co-authored-by: Deepak Cherian <[email protected]> * Update xarray/backends/zarr.py Co-authored-by: Deepak Cherian <[email protected]> * Update xarray/backends/zarr.py Co-authored-by: Deepak Cherian <[email protected]> * Update xarray/backends/zarr.py Co-authored-by: Deepak Cherian <[email protected]> * doc: add whats new content * fixup * update whats new * fixup * remove vestigial cache_array_keys * make cache_members default to True * tweak * Remove from public API --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Deepak Cherian <[email protected]> Co-authored-by: Deepak Cherian <[email protected]>
1 parent 9fe816e commit 60a4a49

File tree

3 files changed

+162
-62
lines changed

3 files changed

+162
-62
lines changed

doc/whats-new.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,14 @@ New Features
2424
- Better support wrapping additional array types (e.g. ``cupy`` or ``jax``) by calling generalized
2525
duck array operations throughout more xarray methods. (:issue:`7848`, :pull:`9798`).
2626
By `Sam Levang <https://github.com/slevang>`_.
27+
28+
- Better performance for reading Zarr arrays in the ``ZarrStore`` class by caching the state of Zarr
29+
storage and avoiding redundant IO operations. Usage of the cache can be controlled via the
30+
``cache_members`` parameter to ``ZarrStore``. When ``cache_members`` is ``True`` (the default), the
31+
``ZarrStore`` stores a snapshot of names and metadata of the in-scope Zarr arrays; this cache
32+
is then used when iterating over those Zarr arrays, which avoids IO operations and thereby reduces
33+
latency. (:issue:`9853`, :pull:`9861`). By `Davis Bennett <https://github.com/d-v-b>`_.
34+
2735
- Add ``unit`` - keyword argument to :py:func:`date_range` and ``microsecond`` parsing to
2836
iso8601-parser (:pull:`9885`).
2937
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.

xarray/backends/zarr.py

Lines changed: 68 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -602,9 +602,11 @@ class ZarrStore(AbstractWritableDataStore):
602602

603603
__slots__ = (
604604
"_append_dim",
605+
"_cache_members",
605606
"_close_store_on_close",
606607
"_consolidate_on_close",
607608
"_group",
609+
"_members",
608610
"_mode",
609611
"_read_only",
610612
"_safe_chunks",
@@ -633,6 +635,7 @@ def open_store(
633635
zarr_format=None,
634636
use_zarr_fill_value_as_mask=None,
635637
write_empty: bool | None = None,
638+
cache_members: bool = True,
636639
):
637640
(
638641
zarr_group,
@@ -664,6 +667,7 @@ def open_store(
664667
write_empty,
665668
close_store_on_close,
666669
use_zarr_fill_value_as_mask,
670+
cache_members=cache_members,
667671
)
668672
for group in group_paths
669673
}
@@ -686,6 +690,7 @@ def open_group(
686690
zarr_format=None,
687691
use_zarr_fill_value_as_mask=None,
688692
write_empty: bool | None = None,
693+
cache_members: bool = True,
689694
):
690695
(
691696
zarr_group,
@@ -716,6 +721,7 @@ def open_group(
716721
write_empty,
717722
close_store_on_close,
718723
use_zarr_fill_value_as_mask,
724+
cache_members,
719725
)
720726

721727
def __init__(
@@ -729,6 +735,7 @@ def __init__(
729735
write_empty: bool | None = None,
730736
close_store_on_close: bool = False,
731737
use_zarr_fill_value_as_mask=None,
738+
cache_members: bool = True,
732739
):
733740
self.zarr_group = zarr_group
734741
self._read_only = self.zarr_group.read_only
@@ -742,15 +749,66 @@ def __init__(
742749
self._write_empty = write_empty
743750
self._close_store_on_close = close_store_on_close
744751
self._use_zarr_fill_value_as_mask = use_zarr_fill_value_as_mask
752+
self._cache_members: bool = cache_members
753+
self._members: dict[str, ZarrArray | ZarrGroup] = {}
754+
755+
if self._cache_members:
756+
# initialize the cache
757+
# this cache is created here and never updated.
758+
# If the `ZarrStore` instance creates a new zarr array, or if an external process
759+
# removes an existing zarr array, then the cache will be invalid.
760+
# We use this cache only to record any pre-existing arrays when the group was opened
761+
# create a new ZarrStore instance if you want to
762+
# capture the current state of the zarr group, or create a ZarrStore with
763+
# `cache_members` set to `False` to disable this cache and instead fetch members
764+
# on demand.
765+
self._members = self._fetch_members()
766+
767+
@property
768+
def members(self) -> dict[str, ZarrArray]:
769+
"""
770+
Model the arrays and groups contained in self.zarr_group as a dict. If `self._cache_members`
771+
is true, the dict is cached. Otherwise, it is retrieved from storage.
772+
"""
773+
if not self._cache_members:
774+
return self._fetch_members()
775+
else:
776+
return self._members
777+
778+
def _fetch_members(self) -> dict[str, ZarrArray]:
779+
"""
780+
Get the arrays and groups defined in the zarr group modelled by this Store
781+
"""
782+
import zarr
783+
784+
if zarr.__version__ >= "3":
785+
return dict(self.zarr_group.members())
786+
else:
787+
return dict(self.zarr_group.items())
788+
789+
def array_keys(self) -> tuple[str, ...]:
790+
from zarr import Array as ZarrArray
791+
792+
return tuple(
793+
key for (key, node) in self.members.items() if isinstance(node, ZarrArray)
794+
)
795+
796+
def arrays(self) -> tuple[tuple[str, ZarrArray], ...]:
797+
from zarr import Array as ZarrArray
798+
799+
return tuple(
800+
(key, node)
801+
for (key, node) in self.members.items()
802+
if isinstance(node, ZarrArray)
803+
)
745804

746805
@property
747806
def ds(self):
748807
# TODO: consider deprecating this in favor of zarr_group
749808
return self.zarr_group
750809

751-
def open_store_variable(self, name, zarr_array=None):
752-
if zarr_array is None:
753-
zarr_array = self.zarr_group[name]
810+
def open_store_variable(self, name):
811+
zarr_array = self.members[name]
754812
data = indexing.LazilyIndexedArray(ZarrArrayWrapper(zarr_array))
755813
try_nczarr = self._mode == "r"
756814
dimensions, attributes = _get_zarr_dims_and_attrs(
@@ -798,9 +856,7 @@ def open_store_variable(self, name, zarr_array=None):
798856
return Variable(dimensions, data, attributes, encoding)
799857

800858
def get_variables(self):
801-
return FrozenDict(
802-
(k, self.open_store_variable(k, v)) for k, v in self.zarr_group.arrays()
803-
)
859+
return FrozenDict((k, self.open_store_variable(k)) for k in self.array_keys())
804860

805861
def get_attrs(self):
806862
return {
@@ -812,7 +868,7 @@ def get_attrs(self):
812868
def get_dimensions(self):
813869
try_nczarr = self._mode == "r"
814870
dimensions = {}
815-
for _k, v in self.zarr_group.arrays():
871+
for _k, v in self.arrays():
816872
dim_names, _ = _get_zarr_dims_and_attrs(v, DIMENSION_KEY, try_nczarr)
817873
for d, s in zip(dim_names, v.shape, strict=True):
818874
if d in dimensions and dimensions[d] != s:
@@ -881,7 +937,7 @@ def store(
881937
existing_keys = {}
882938
existing_variable_names = {}
883939
else:
884-
existing_keys = tuple(self.zarr_group.array_keys())
940+
existing_keys = self.array_keys()
885941
existing_variable_names = {
886942
vn for vn in variables if _encode_variable_name(vn) in existing_keys
887943
}
@@ -1059,7 +1115,7 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No
10591115
dimensions.
10601116
"""
10611117

1062-
existing_keys = tuple(self.zarr_group.array_keys())
1118+
existing_keys = self.array_keys()
10631119
is_zarr_v3_format = _zarr_v3() and self.zarr_group.metadata.zarr_format == 3
10641120

10651121
for vn, v in variables.items():
@@ -1107,7 +1163,6 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No
11071163
zarr_array.resize(new_shape)
11081164

11091165
zarr_shape = zarr_array.shape
1110-
11111166
region = tuple(write_region[dim] for dim in dims)
11121167

11131168
# We need to do this for both new and existing variables to ensure we're not
@@ -1249,7 +1304,7 @@ def _validate_and_autodetect_region(self, ds: Dataset) -> Dataset:
12491304

12501305
def _validate_encoding(self, encoding) -> None:
12511306
if encoding and self._mode in ["a", "a-", "r+"]:
1252-
existing_var_names = set(self.zarr_group.array_keys())
1307+
existing_var_names = self.array_keys()
12531308
for var_name in existing_var_names:
12541309
if var_name in encoding:
12551310
raise ValueError(
@@ -1503,6 +1558,7 @@ def open_dataset(
15031558
store=None,
15041559
engine=None,
15051560
use_zarr_fill_value_as_mask=None,
1561+
cache_members: bool = True,
15061562
) -> Dataset:
15071563
filename_or_obj = _normalize_path(filename_or_obj)
15081564
if not store:
@@ -1518,6 +1574,7 @@ def open_dataset(
15181574
zarr_version=zarr_version,
15191575
use_zarr_fill_value_as_mask=None,
15201576
zarr_format=zarr_format,
1577+
cache_members=cache_members,
15211578
)
15221579

15231580
store_entrypoint = StoreBackendEntrypoint()

0 commit comments

Comments
 (0)