From f4f820085308ad4bb17940aef3ca792a0e9bbb66 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 14 Oct 2024 10:06:17 -0500 Subject: [PATCH 1/7] Fixed consolidated Group getitem with multi-part key This fixes `Group.__getitem__` when indexing with a key like 'subgroup/array'. The basic idea is to rewrite the indexing operation as `group['subgroup']['array']` by splitting the key and doing each operation independently. This is fine for consolidated metadata which doesn't need to do IO. There's a complication around unconsolidated metadata, though. What if we encounter a node where `Group.getitem` returns a sub Group without consolidated metadata. Then we need to fall back to non-consolidated metadata. We've written _getitem_consolidated as a regular (non-async) function so we need to pop back up to the async caller and have *it* fall back. Closes https://github.com/zarr-developers/zarr-python/issues/2358 --- src/zarr/core/group.py | 119 ++++++++++++++++++++++++++++++++--------- tests/v3/test_group.py | 15 +++++- 2 files changed, 109 insertions(+), 25 deletions(-) diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 0b15e2f085..3e814527ab 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -97,6 +97,24 @@ def _parse_async_node( raise TypeError(f"Unknown node type, got {type(node)}") +class _MixedConsolidatedMetadataException(Exception): + """ + A custom, *internal* exception for when we encounter mixed consolidated metadata. + + Typically, Consolidated Metadata will explicitly indicate that there are no + additional children under a group with ``ConsolidatedMetadata(metadata={})``, + as opposed to ``metadata=None``. This is the behavior of ``consolidated_metadata``. + We rely on that "fact" to do I/O-free getitem: when a group's consolidated metadata + doesn't contain a child we can raise a ``KeyError`` without consulting the backing + store. + + Users can potentially get themselves in situations where there's "mixed" consolidated + metadata. For now, we'll raise this error, catch it internally, and silently fall back + to the store (which will either succeed or raise its own KeyError, slowly). We might + want to expose this in the future, in which case rename it add it to zarr.errors. + """ + + @dataclass(frozen=True) class ConsolidatedMetadata: """ @@ -571,7 +589,10 @@ def _from_bytes_v2( @classmethod def _from_bytes_v3( - cls, store_path: StorePath, zarr_json_bytes: Buffer, use_consolidated: bool | None + cls, + store_path: StorePath, + zarr_json_bytes: Buffer, + use_consolidated: bool | None, ) -> AsyncGroup: group_metadata = json.loads(zarr_json_bytes.to_bytes()) if use_consolidated and group_metadata.get("consolidated_metadata") is None: @@ -604,14 +625,22 @@ async def getitem( # Consolidated metadata lets us avoid some I/O operations so try that first. if self.metadata.consolidated_metadata is not None: - return self._getitem_consolidated(store_path, key, prefix=self.name) + try: + return self._getitem_consolidated(store_path, key, prefix=self.name) + except _MixedConsolidatedMetadataException: + logger.info( + "Mixed consolidated and unconsolidated metadata. key=%s, store_path=%s", + key, + store_path, + ) + # now fall back to the non-consolidated variant # Note: # in zarr-python v2, we first check if `key` references an Array, else if `key` references # a group,using standalone `contains_array` and `contains_group` functions. These functions # are reusable, but for v3 they would perform redundant I/O operations. # Not clear how much of that strategy we want to keep here. - elif self.metadata.zarr_format == 3: + if self.metadata.zarr_format == 3: zarr_json_bytes = await (store_path / ZARR_JSON).get() if zarr_json_bytes is None: raise KeyError(key) @@ -661,18 +690,39 @@ def _getitem_consolidated( # getitem, in the special case where we have consolidated metadata. # Note that this is a regular def (non async) function. # This shouldn't do any additional I/O. + # All callers *must* catch _MixedConsolidatedMetadataException to ensure + # that we correctly handle the case where we need to fall back to doing + # additional I/O. # the caller needs to verify this! assert self.metadata.consolidated_metadata is not None - try: - metadata = self.metadata.consolidated_metadata.metadata[key] - except KeyError as e: - # The Group Metadata has consolidated metadata, but the key - # isn't present. We trust this to mean that the key isn't in - # the hierarchy, and *don't* fall back to checking the store. - msg = f"'{key}' not found in consolidated metadata." - raise KeyError(msg) from e + # we support nested getitems like group/subgroup/array + indexers = key.split("/") + indexers.reverse() + metadata: ArrayV2Metadata | ArrayV3Metadata | GroupMetadata = self.metadata + + while indexers: + indexer = indexers.pop() + if isinstance(metadata, ArrayV2Metadata | ArrayV3Metadata): + # we've indexed into an array with group["array/subarray"]. Invalid. + raise KeyError(key) + try: + if metadata.consolidated_metadata is None: + # we've indexed into a group without consolidated metadata. + # Note that the `None` case is different from `metadata={}` + # where we explicitly know we have no children. In the None + # case we have to fall back to non-consolidated metadata. + raise _MixedConsolidatedMetadataException(key) + assert metadata.consolidated_metadata is not None + + metadata = metadata.consolidated_metadata.metadata[indexer] + except KeyError as e: + # The Group Metadata has consolidated metadata, but the key + # isn't present. We trust this to mean that the key isn't in + # the hierarchy, and *don't* fall back to checking the store. + msg = f"'{key}' not found in consolidated metadata." + raise KeyError(msg) from e # update store_path to ensure that AsyncArray/Group.name is correct if prefix != "/": @@ -1087,7 +1137,8 @@ async def members( self, max_depth: int | None = 0, ) -> AsyncGenerator[ - tuple[str, AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata] | AsyncGroup], None + tuple[str, AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata] | AsyncGroup], + None, ]: """ Returns an AsyncGenerator over the arrays and groups contained in this group. @@ -1118,15 +1169,20 @@ async def members( async def _members( self, max_depth: int | None, current_depth: int ) -> AsyncGenerator[ - tuple[str, AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata] | AsyncGroup], None + tuple[str, AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata] | AsyncGroup], + None, ]: if self.metadata.consolidated_metadata is not None: # we should be able to do members without any additional I/O - members = self._members_consolidated(max_depth, current_depth) - - for member in members: - yield member - return + try: + members = self._members_consolidated(max_depth, current_depth) + except _MixedConsolidatedMetadataException: + # we've already logged this. We'll fall back to the non-consolidated version. + pass + else: + for member in members: + yield member + return if not self.store_path.store.supports_listing: msg = ( @@ -1177,17 +1233,28 @@ async def _members( def _members_consolidated( self, max_depth: int | None, current_depth: int, prefix: str = "" ) -> Generator[ - tuple[str, AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata] | AsyncGroup], None + tuple[str, AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata] | AsyncGroup], + None, ]: consolidated_metadata = self.metadata.consolidated_metadata # we kind of just want the top-level keys. if consolidated_metadata is not None: for key in consolidated_metadata.metadata.keys(): - obj = self._getitem_consolidated( - self.store_path, key, prefix=self.name - ) # Metadata -> Group/Array - key = f"{prefix}/{key}".lstrip("/") + try: + obj = self._getitem_consolidated( + self.store_path, key, prefix=self.name + ) # Metadata -> Group/Array + key = f"{prefix}/{key}".lstrip("/") + except _MixedConsolidatedMetadataException: + logger.info( + "Mixed consolidated and unconsolidated metadata. key=%s, depth=%d, prefix=%s", + key, + current_depth, + prefix, + ) + # This isn't an async def function so we need to re-raise up one more level. + raise yield key, obj if ((max_depth is None) or (current_depth < max_depth)) and isinstance( @@ -1262,7 +1329,11 @@ async def full( self, *, name: str, shape: ChunkCoords, fill_value: Any | None, **kwargs: Any ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: return await async_api.full( - shape=shape, fill_value=fill_value, store=self.store_path, path=name, **kwargs + shape=shape, + fill_value=fill_value, + store=self.store_path, + path=name, + **kwargs, ) async def empty_like( diff --git a/tests/v3/test_group.py b/tests/v3/test_group.py index 90933abeaa..c7bf2ff0bd 100644 --- a/tests/v3/test_group.py +++ b/tests/v3/test_group.py @@ -300,18 +300,31 @@ def test_group_getitem(store: Store, zarr_format: ZarrFormat, consolidated: bool group = Group.from_store(store, zarr_format=zarr_format) subgroup = group.create_group(name="subgroup") subarray = group.create_array(name="subarray", shape=(10,), chunk_shape=(10,)) + subsubarray = subgroup.create_array(name="subarray", shape=(10,), chunk_shape=(10,)) if consolidated: group = zarr.api.synchronous.consolidate_metadata(store=store, zarr_format=zarr_format) + # we're going to assume that `group.metadata` is correct, and reuse that to focus + # on indexing in this test. Other tests verify the correctness of group.metadata object.__setattr__( - subgroup.metadata, "consolidated_metadata", ConsolidatedMetadata(metadata={}) + subgroup.metadata, + "consolidated_metadata", + ConsolidatedMetadata( + metadata={"subarray": group.metadata.consolidated_metadata.metadata["subarray"]} + ), ) assert group["subgroup"] == subgroup assert group["subarray"] == subarray + assert subgroup["subarray"] == subsubarray + # assert group["subgroup/subarray"] == subsubarray + with pytest.raises(KeyError): group["nope"] + with pytest.raises(KeyError, match="subarray/subsubarray"): + group["subarray/subsubarray"] + def test_group_get_with_default(store: Store, zarr_format: ZarrFormat) -> None: group = Group.from_store(store, zarr_format=zarr_format) From 05cf65289f4fb058d61cac64af4ab5b11ea7863a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 14 Oct 2024 10:46:17 -0500 Subject: [PATCH 2/7] test for getitem --- tests/v3/test_group.py | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/tests/v3/test_group.py b/tests/v3/test_group.py index c7bf2ff0bd..7515ac071e 100644 --- a/tests/v3/test_group.py +++ b/tests/v3/test_group.py @@ -13,7 +13,7 @@ from zarr.abc.store import Store from zarr.core.buffer import default_buffer_prototype from zarr.core.common import JSON, ZarrFormat -from zarr.core.group import ConsolidatedMetadata, GroupMetadata +from zarr.core.group import ConsolidatedMetadata, GroupMetadata, _MixedConsolidatedMetadataException from zarr.core.sync import sync from zarr.errors import ContainsArrayError, ContainsGroupError from zarr.storage import LocalStore, MemoryStore, StorePath, ZipStore @@ -316,8 +316,8 @@ def test_group_getitem(store: Store, zarr_format: ZarrFormat, consolidated: bool assert group["subgroup"] == subgroup assert group["subarray"] == subarray - assert subgroup["subarray"] == subsubarray - # assert group["subgroup/subarray"] == subsubarray + assert group["subgroup"]["subarray"] == subsubarray + assert group["subgroup/subarray"] == subsubarray with pytest.raises(KeyError): group["nope"] @@ -325,6 +325,28 @@ def test_group_getitem(store: Store, zarr_format: ZarrFormat, consolidated: bool with pytest.raises(KeyError, match="subarray/subsubarray"): group["subarray/subsubarray"] + # Now test the mixed case + if consolidated: + group = zarr.api.synchronous.consolidate_metadata(store=store, zarr_format=zarr_format) + # we're going to assume that `group.metadata` is correct, and reuse that to focus + # on indexing in this test. Other tests verify the correctness of group.metadata + object.__setattr__( + group.metadata.consolidated_metadata.metadata["subgroup"], + "consolidated_metadata", + None, + ) + + # test the implementation directly + with pytest.raises(_MixedConsolidatedMetadataException): + group._async_group._getitem_consolidated( + group.store_path, "subgroup/subarray", prefix="/" + ) + + assert group["subgroup/subarray"] == subsubarray + + with pytest.raises(KeyError, match="subarray/subsubarray"): + group["subarray/subsubarray"] + def test_group_get_with_default(store: Store, zarr_format: ZarrFormat) -> None: group = Group.from_store(store, zarr_format=zarr_format) From 488d57f0605377f9f6c0b27c6c02490e4d9efcf2 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 14 Oct 2024 11:15:36 -0500 Subject: [PATCH 3/7] fixed members --- src/zarr/core/group.py | 22 ++++++++++++++++++---- tests/v3/test_group.py | 17 ++++++++++++++--- 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 3e814527ab..eeda99207e 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -1124,7 +1124,16 @@ async def nmembers( ------- count : int """ - if self.metadata.consolidated_metadata is not None: + # check if we can use consolidated metadata, which requires that we have non-None + # consolidated metadata at all points in the hierarchy. + use_consolidated_metadata = self.metadata.consolidated_metadata is not None and all( + x.consolidated_metadata is not None + for x in self.metadata.consolidated_metadata.flattened_metadata.values() + if isinstance(x, GroupMetadata) + ) + + if use_consolidated_metadata: + assert self.metadata.consolidated_metadata is not None # helping mypy return len(self.metadata.consolidated_metadata.flattened_metadata) # TODO: consider using aioitertools.builtins.sum for this # return await aioitertools.builtins.sum((1 async for _ in self.members()), start=0) @@ -1174,13 +1183,16 @@ async def _members( ]: if self.metadata.consolidated_metadata is not None: # we should be able to do members without any additional I/O + members = self._members_consolidated(max_depth, current_depth) + try: - members = self._members_consolidated(max_depth, current_depth) + # we already have this in memory, so fine to build this list + # and catch the exception if needed. + members_ = list(members) except _MixedConsolidatedMetadataException: - # we've already logged this. We'll fall back to the non-consolidated version. pass else: - for member in members: + for member in members_: yield member return @@ -1238,6 +1250,8 @@ def _members_consolidated( ]: consolidated_metadata = self.metadata.consolidated_metadata + if consolidated_metadata is None: + raise _MixedConsolidatedMetadataException(prefix) # we kind of just want the top-level keys. if consolidated_metadata is not None: for key in consolidated_metadata.metadata.keys(): diff --git a/tests/v3/test_group.py b/tests/v3/test_group.py index 7515ac071e..bdcf1b22fb 100644 --- a/tests/v3/test_group.py +++ b/tests/v3/test_group.py @@ -327,9 +327,6 @@ def test_group_getitem(store: Store, zarr_format: ZarrFormat, consolidated: bool # Now test the mixed case if consolidated: - group = zarr.api.synchronous.consolidate_metadata(store=store, zarr_format=zarr_format) - # we're going to assume that `group.metadata` is correct, and reuse that to focus - # on indexing in this test. Other tests verify the correctness of group.metadata object.__setattr__( group.metadata.consolidated_metadata.metadata["subgroup"], "consolidated_metadata", @@ -1036,6 +1033,20 @@ async def test_group_members_async(store: Store, consolidated_metadata: bool) -> with pytest.raises(ValueError, match="max_depth"): [x async for x in group.members(max_depth=-1)] + if consolidated_metadata: + # test for mixed + object.__setattr__( + group.metadata.consolidated_metadata.metadata["g0"].consolidated_metadata.metadata[ + "g1" + ], + "consolidated_metadata", + None, + ) + all_children = sorted([x async for x in group.members(max_depth=None)], key=lambda x: x[0]) + assert len(all_children) == len(expected) + nmembers = await group.nmembers(max_depth=None) + assert nmembers == 6 + async def test_require_group(store: LocalStore | MemoryStore, zarr_format: ZarrFormat) -> None: root = await AsyncGroup.from_store(store=store, zarr_format=zarr_format) From a0c0ef621ff996bb2f9fcf97419c7efec44a0135 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 16 Oct 2024 09:32:00 -0500 Subject: [PATCH 4/7] remove assert --- src/zarr/core/group.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index f8f1f726c7..e7dc32187a 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -715,7 +715,6 @@ def _getitem_consolidated( # where we explicitly know we have no children. In the None # case we have to fall back to non-consolidated metadata. raise _MixedConsolidatedMetadataException(key) - assert metadata.consolidated_metadata is not None metadata = metadata.consolidated_metadata.metadata[indexer] except KeyError as e: From 5533ac7f891e6ff780e16377e7b814dc950b47ab Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 16 Oct 2024 11:11:16 -0500 Subject: [PATCH 5/7] trust the metadata --- src/zarr/core/group.py | 111 +++++++++-------------------------------- tests/v3/test_group.py | 9 ++-- 2 files changed, 29 insertions(+), 91 deletions(-) diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index efefe00e4c..77b35ca151 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -98,24 +98,6 @@ def _parse_async_node( raise TypeError(f"Unknown node type, got {type(node)}") -class _MixedConsolidatedMetadataException(Exception): - """ - A custom, *internal* exception for when we encounter mixed consolidated metadata. - - Typically, Consolidated Metadata will explicitly indicate that there are no - additional children under a group with ``ConsolidatedMetadata(metadata={})``, - as opposed to ``metadata=None``. This is the behavior of ``consolidated_metadata``. - We rely on that "fact" to do I/O-free getitem: when a group's consolidated metadata - doesn't contain a child we can raise a ``KeyError`` without consulting the backing - store. - - Users can potentially get themselves in situations where there's "mixed" consolidated - metadata. For now, we'll raise this error, catch it internally, and silently fall back - to the store (which will either succeed or raise its own KeyError, slowly). We might - want to expose this in the future, in which case rename it add it to zarr.errors. - """ - - @dataclass(frozen=True) class ConsolidatedMetadata: """ @@ -626,15 +608,7 @@ async def getitem( # Consolidated metadata lets us avoid some I/O operations so try that first. if self.metadata.consolidated_metadata is not None: - try: - return self._getitem_consolidated(store_path, key, prefix=self.name) - except _MixedConsolidatedMetadataException: - logger.info( - "Mixed consolidated and unconsolidated metadata. key=%s, store_path=%s", - key, - store_path, - ) - # now fall back to the non-consolidated variant + return self._getitem_consolidated(store_path, key, prefix=self.name) # Note: # in zarr-python v2, we first check if `key` references an Array, else if `key` references @@ -691,9 +665,6 @@ def _getitem_consolidated( # getitem, in the special case where we have consolidated metadata. # Note that this is a regular def (non async) function. # This shouldn't do any additional I/O. - # All callers *must* catch _MixedConsolidatedMetadataException to ensure - # that we correctly handle the case where we need to fall back to doing - # additional I/O. # the caller needs to verify this! assert self.metadata.consolidated_metadata is not None @@ -708,14 +679,16 @@ def _getitem_consolidated( if isinstance(metadata, ArrayV2Metadata | ArrayV3Metadata): # we've indexed into an array with group["array/subarray"]. Invalid. raise KeyError(key) + if metadata.consolidated_metadata is None: + # we've indexed into a group without consolidated metadata. + # This isn't normal; typically, consolidated metadata + # will include explicit markers for when there are no child + # nodes as metadata={}. + # We have some freedom in exactly how we interpret this case. + # For now, we treat None as the same as {}, i.e. we don't + # have any children. + raise KeyError(key) try: - if metadata.consolidated_metadata is None: - # we've indexed into a group without consolidated metadata. - # Note that the `None` case is different from `metadata={}` - # where we explicitly know we have no children. In the None - # case we have to fall back to non-consolidated metadata. - raise _MixedConsolidatedMetadataException(key) - metadata = metadata.consolidated_metadata.metadata[indexer] except KeyError as e: # The Group Metadata has consolidated metadata, but the key @@ -981,11 +954,7 @@ async def create_array( @deprecated("Use AsyncGroup.create_array instead.") async def create_dataset( - self, - name: str, - *, - shape: ShapeLike, - **kwargs: Any, + self, name: str, **kwargs: Any ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: """Create an array. @@ -996,8 +965,6 @@ async def create_dataset( ---------- name : str Array name. - shape : int or tuple of ints - Array shape. kwargs : dict Additional arguments passed to :func:`zarr.AsyncGroup.create_array`. @@ -1008,7 +975,7 @@ async def create_dataset( .. deprecated:: 3.0.0 The h5py compatibility methods will be removed in 3.1.0. Use `AsyncGroup.create_array` instead. """ - return await self.create_array(name, shape=shape, **kwargs) + return await self.create_array(name, **kwargs) @deprecated("Use AsyncGroup.require_array instead.") async def require_dataset( @@ -1190,17 +1157,9 @@ async def _members( if self.metadata.consolidated_metadata is not None: # we should be able to do members without any additional I/O members = self._members_consolidated(max_depth, current_depth) - - try: - # we already have this in memory, so fine to build this list - # and catch the exception if needed. - members_ = list(members) - except _MixedConsolidatedMetadataException: - pass - else: - for member in members_: - yield member - return + members_ = list(members) + for member in members_: + yield member if not self.store_path.store.supports_listing: msg = ( @@ -1258,25 +1217,13 @@ def _members_consolidated( ]: consolidated_metadata = self.metadata.consolidated_metadata - if consolidated_metadata is None: - raise _MixedConsolidatedMetadataException(prefix) # we kind of just want the top-level keys. if consolidated_metadata is not None: for key in consolidated_metadata.metadata.keys(): - try: - obj = self._getitem_consolidated( - self.store_path, key, prefix=self.name - ) # Metadata -> Group/Array - key = f"{prefix}/{key}".lstrip("/") - except _MixedConsolidatedMetadataException: - logger.info( - "Mixed consolidated and unconsolidated metadata. key=%s, depth=%d, prefix=%s", - key, - current_depth, - prefix, - ) - # This isn't an async def function so we need to re-raise up one more level. - raise + obj = self._getitem_consolidated( + self.store_path, key, prefix=self.name + ) # Metadata -> Group/Array + key = f"{prefix}/{key}".lstrip("/") yield key, obj if ((max_depth is None) or (current_depth < max_depth)) and isinstance( @@ -1711,13 +1658,7 @@ def create_dataset(self, name: str, **kwargs: Any) -> Array: return Array(self._sync(self._async_group.create_dataset(name, **kwargs))) @deprecated("Use Group.require_array instead.") - def require_dataset( - self, - name: str, - *, - shape: ShapeLike, - **kwargs: Any, - ) -> Array: + def require_dataset(self, name: str, **kwargs: Any) -> Array: """Obtain an array, creating if it doesn't exist. Arrays are known as "datasets" in HDF5 terminology. For compatibility @@ -1744,15 +1685,9 @@ def require_dataset( .. deprecated:: 3.0.0 The h5py compatibility methods will be removed in 3.1.0. Use `Group.require_array` instead. """ - return Array(self._sync(self._async_group.require_array(name, shape=shape, **kwargs))) + return Array(self._sync(self._async_group.require_array(name, **kwargs))) - def require_array( - self, - name: str, - *, - shape: ShapeLike, - **kwargs: Any, - ) -> Array: + def require_array(self, name: str, **kwargs: Any) -> Array: """Obtain an array, creating if it doesn't exist. @@ -1774,7 +1709,7 @@ def require_array( ------- a : Array """ - return Array(self._sync(self._async_group.require_array(name, shape=shape, **kwargs))) + return Array(self._sync(self._async_group.require_array(name, **kwargs))) def empty(self, *, name: str, shape: ChunkCoords, **kwargs: Any) -> Array: return Array(self._sync(self._async_group.empty(name=name, shape=shape, **kwargs))) diff --git a/tests/v3/test_group.py b/tests/v3/test_group.py index 8030cb97e6..e6cffcb480 100644 --- a/tests/v3/test_group.py +++ b/tests/v3/test_group.py @@ -15,7 +15,7 @@ from zarr.abc.store import Store from zarr.core.buffer import default_buffer_prototype from zarr.core.common import JSON, ZarrFormat -from zarr.core.group import ConsolidatedMetadata, GroupMetadata, _MixedConsolidatedMetadataException +from zarr.core.group import ConsolidatedMetadata, GroupMetadata from zarr.core.sync import sync from zarr.errors import ContainsArrayError, ContainsGroupError from zarr.storage import LocalStore, MemoryStore, StorePath, ZipStore @@ -347,12 +347,15 @@ def test_group_getitem(store: Store, zarr_format: ZarrFormat, consolidated: bool ) # test the implementation directly - with pytest.raises(_MixedConsolidatedMetadataException): + with pytest.raises(KeyError): group._async_group._getitem_consolidated( group.store_path, "subgroup/subarray", prefix="/" ) - assert group["subgroup/subarray"] == subsubarray + with pytest.raises(KeyError): + # We've chosen to trust the consolidted metadata, which doesn't + # contain this array + group["subgroup/subarray"] with pytest.raises(KeyError, match="subarray/subsubarray"): group["subarray/subsubarray"] From 259e09ef72a43bbccf62e95aff3f74e5d4260475 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 16 Oct 2024 11:14:40 -0500 Subject: [PATCH 6/7] simplify --- src/zarr/core/group.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 77b35ca151..d797ed7370 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -615,7 +615,7 @@ async def getitem( # a group,using standalone `contains_array` and `contains_group` functions. These functions # are reusable, but for v3 they would perform redundant I/O operations. # Not clear how much of that strategy we want to keep here. - if self.metadata.zarr_format == 3: + elif self.metadata.zarr_format == 3: zarr_json_bytes = await (store_path / ZARR_JSON).get() if zarr_json_bytes is None: raise KeyError(key) @@ -1099,14 +1099,7 @@ async def nmembers( """ # check if we can use consolidated metadata, which requires that we have non-None # consolidated metadata at all points in the hierarchy. - use_consolidated_metadata = self.metadata.consolidated_metadata is not None and all( - x.consolidated_metadata is not None - for x in self.metadata.consolidated_metadata.flattened_metadata.values() - if isinstance(x, GroupMetadata) - ) - - if use_consolidated_metadata: - assert self.metadata.consolidated_metadata is not None # helping mypy + if self.metadata.consolidated_metadata is not None: return len(self.metadata.consolidated_metadata.flattened_metadata) # TODO: consider using aioitertools.builtins.sum for this # return await aioitertools.builtins.sum((1 async for _ in self.members()), start=0) @@ -1157,9 +1150,9 @@ async def _members( if self.metadata.consolidated_metadata is not None: # we should be able to do members without any additional I/O members = self._members_consolidated(max_depth, current_depth) - members_ = list(members) - for member in members_: + for member in members: yield member + return if not self.store_path.store.supports_listing: msg = ( From 446c893ae4f646de2480218c0af765731b6d747f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 16 Oct 2024 11:23:25 -0500 Subject: [PATCH 7/7] fixup --- tests/v3/test_group.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/v3/test_group.py b/tests/v3/test_group.py index e6cffcb480..9fbb82dd25 100644 --- a/tests/v3/test_group.py +++ b/tests/v3/test_group.py @@ -1050,7 +1050,8 @@ async def test_group_members_async(store: Store, consolidated_metadata: bool) -> [x async for x in group.members(max_depth=-1)] if consolidated_metadata: - # test for mixed + # test for mixed known and unknown metadata. + # For now, we trust the consolidated metadata. object.__setattr__( group.metadata.consolidated_metadata.metadata["g0"].consolidated_metadata.metadata[ "g1" @@ -1059,9 +1060,9 @@ async def test_group_members_async(store: Store, consolidated_metadata: bool) -> None, ) all_children = sorted([x async for x in group.members(max_depth=None)], key=lambda x: x[0]) - assert len(all_children) == len(expected) + assert len(all_children) == 4 nmembers = await group.nmembers(max_depth=None) - assert nmembers == 6 + assert nmembers == 4 async def test_require_group(store: LocalStore | MemoryStore, zarr_format: ZarrFormat) -> None: