Skip to content

Commit 57d6ace

Browse files
d-v-b and jhamman authored
implement group.members (#1726)
* feat: functional .children method for groups
* changes necessary for correctly generating list of children
* add stand-alone test for group.children
* give type hints a glow-up
* test: use separate assert statements to avoid platform-dependent ordering issues
* test: put fixtures in conftest, add MemoryStore fixture
* docs: release notes
* test: remove prematurely-added mock s3 fixture
* fix: Rename children to members; AsyncGroup.members yields tuples of (name, AsyncArray / AsyncGroup) pairs; Group.members repackages these into a dict.
* fix: make Group.members return a tuple of str, Array | Group pairs
* fix: revert changes to synchronization code; this is churn that we need to deal with
* make mypy happy
* feat: implement member-specific iteration methods in asyncgroup
* chore: clean up some post-merge issues
* chore: remove extra directory added by test code

---------

Co-authored-by: Joseph Hamman <[email protected]>
1 parent 0f755cc commit 57d6ace

File tree

5 files changed

+215
-64
lines changed

5 files changed

+215
-64
lines changed

docs/release.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,12 @@ Release notes
1818
Unreleased (v3)
1919
---------------
2020

21+
Enhancements
22+
~~~~~~~~~~~~
23+
24+
* Implement listing of the sub-arrays and sub-groups for a V3 ``Group``.
25+
By :user:`Davis Bennett <d-v-b>` :issue:`1726`.
26+
2127
Maintenance
2228
~~~~~~~~~~~
2329

src/zarr/group.py

Lines changed: 124 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,18 @@
11
from __future__ import annotations
2+
from typing import TYPE_CHECKING
23
from dataclasses import asdict, dataclass, field, replace
34

45
import asyncio
56
import json
67
import logging
7-
from typing import Any, Dict, Literal, Optional, Union, AsyncIterator, List
8+
9+
if TYPE_CHECKING:
10+
from typing import (
11+
Any,
12+
AsyncGenerator,
13+
Literal,
14+
AsyncIterator,
15+
)
816
from zarr.abc.metadata import Metadata
917

1018
from zarr.array import AsyncArray, Array
@@ -25,7 +33,7 @@ def parse_zarr_format(data: Any) -> Literal[2, 3]:
2533

2634

2735
# todo: convert None to empty dict
28-
def parse_attributes(data: Any) -> Dict[str, Any]:
36+
def parse_attributes(data: Any) -> dict[str, Any]:
2937
if data is None:
3038
return {}
3139
elif isinstance(data, dict) and all(map(lambda v: isinstance(v, str), data.keys())):
@@ -36,12 +44,12 @@ def parse_attributes(data: Any) -> Dict[str, Any]:
3644

3745
@dataclass(frozen=True)
3846
class GroupMetadata(Metadata):
39-
attributes: Dict[str, Any] = field(default_factory=dict)
47+
attributes: dict[str, Any] = field(default_factory=dict)
4048
zarr_format: Literal[2, 3] = 3
4149
node_type: Literal["group"] = field(default="group", init=False)
4250

4351
# todo: rename this, since it doesn't return bytes
44-
def to_bytes(self) -> Dict[str, bytes]:
52+
def to_bytes(self) -> dict[str, bytes]:
4553
if self.zarr_format == 3:
4654
return {ZARR_JSON: json.dumps(self.to_dict()).encode()}
4755
else:
@@ -50,34 +58,34 @@ def to_bytes(self) -> Dict[str, bytes]:
5058
ZATTRS_JSON: json.dumps(self.attributes).encode(),
5159
}
5260

53-
def __init__(self, attributes: Optional[Dict[str, Any]] = None, zarr_format: Literal[2, 3] = 3):
61+
def __init__(self, attributes: dict[str, Any] | None = None, zarr_format: Literal[2, 3] = 3):
5462
attributes_parsed = parse_attributes(attributes)
5563
zarr_format_parsed = parse_zarr_format(zarr_format)
5664

5765
object.__setattr__(self, "attributes", attributes_parsed)
5866
object.__setattr__(self, "zarr_format", zarr_format_parsed)
5967

6068
@classmethod
61-
def from_dict(cls, data: Dict[str, Any]) -> GroupMetadata:
69+
def from_dict(cls, data: dict[str, Any]) -> GroupMetadata:
6270
assert data.pop("node_type", None) in ("group", None)
6371
return cls(**data)
6472

65-
def to_dict(self) -> Dict[str, Any]:
73+
def to_dict(self) -> dict[str, Any]:
6674
return asdict(self)
6775

6876

6977
@dataclass(frozen=True)
7078
class AsyncGroup:
7179
metadata: GroupMetadata
7280
store_path: StorePath
73-
runtime_configuration: RuntimeConfiguration
81+
runtime_configuration: RuntimeConfiguration = RuntimeConfiguration()
7482

7583
@classmethod
7684
async def create(
7785
cls,
7886
store: StoreLike,
7987
*,
80-
attributes: Optional[Dict[str, Any]] = None,
88+
attributes: dict[str, Any] = {},
8189
exists_ok: bool = False,
8290
zarr_format: Literal[2, 3] = 3,
8391
runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(),
@@ -89,7 +97,7 @@ async def create(
8997
elif zarr_format == 2:
9098
assert not await (store_path / ZGROUP_JSON).exists()
9199
group = cls(
92-
metadata=GroupMetadata(attributes=attributes or {}, zarr_format=zarr_format),
100+
metadata=GroupMetadata(attributes=attributes, zarr_format=zarr_format),
93101
store_path=store_path,
94102
runtime_configuration=runtime_configuration,
95103
)
@@ -137,7 +145,7 @@ async def open(
137145
def from_dict(
138146
cls,
139147
store_path: StorePath,
140-
data: Dict[str, Any],
148+
data: dict[str, Any],
141149
runtime_configuration: RuntimeConfiguration,
142150
) -> AsyncGroup:
143151
group = cls(
@@ -150,14 +158,24 @@ def from_dict(
150158
async def getitem(
151159
self,
152160
key: str,
153-
) -> Union[AsyncArray, AsyncGroup]:
161+
) -> AsyncArray | AsyncGroup:
154162
store_path = self.store_path / key
155163

164+
# Note:
165+
# in zarr-python v2, we first check if `key` references an Array, else if `key` references
166+
# a group, using standalone `contains_array` and `contains_group` functions. These functions
167+
# are reusable, but for v3 they would perform redundant I/O operations.
168+
# Not clear how much of that strategy we want to keep here.
169+
170+
# if `key` names an object in storage, it cannot be an array or group
171+
if await store_path.exists():
172+
raise KeyError(key)
173+
156174
if self.metadata.zarr_format == 3:
157175
zarr_json_bytes = await (store_path / ZARR_JSON).get()
158176
if zarr_json_bytes is None:
159177
# implicit group?
160-
logger.warning("group at {} is an implicit group", store_path)
178+
logger.warning("group at %s is an implicit group", store_path)
161179
zarr_json = {
162180
"zarr_format": self.metadata.zarr_format,
163181
"node_type": "group",
@@ -196,7 +214,7 @@ async def getitem(
196214
else:
197215
if zgroup_bytes is None:
198216
# implicit group?
199-
logger.warning("group at {} is an implicit group", store_path)
217+
logger.warning("group at %s is an implicit group", store_path)
200218
zgroup = (
201219
json.loads(zgroup_bytes)
202220
if zgroup_bytes is not None
@@ -248,7 +266,7 @@ async def create_array(self, path: str, **kwargs) -> AsyncArray:
248266
**kwargs,
249267
)
250268

251-
async def update_attributes(self, new_attributes: Dict[str, Any]):
269+
async def update_attributes(self, new_attributes: dict[str, Any]):
252270
# metadata.attributes is "frozen" so we simply clear and update the dict
253271
self.metadata.attributes.clear()
254272
self.metadata.attributes.update(new_attributes)
@@ -269,26 +287,68 @@ async def update_attributes(self, new_attributes: Dict[str, Any]):
269287
def __repr__(self):
270288
return f"<AsyncGroup {self.store_path}>"
271289

272-
async def nchildren(self) -> int:
273-
raise NotImplementedError
274-
275-
async def children(self) -> AsyncIterator[Union[AsyncArray, AsyncGroup]]:
276-
raise NotImplementedError
277-
278-
async def contains(self, child: str) -> bool:
290+
async def nmembers(self) -> int:
279291
raise NotImplementedError
280292

281-
async def group_keys(self) -> AsyncIterator[str]:
282-
raise NotImplementedError
293+
async def members(self) -> AsyncGenerator[tuple[str, AsyncArray | AsyncGroup], None]:
294+
"""
295+
Returns an AsyncGenerator over the arrays and groups contained in this group.
296+
This method requires that `store_path.store` supports directory listing.
297+
298+
The results are not guaranteed to be ordered.
299+
"""
300+
if not self.store_path.store.supports_listing:
301+
msg = (
302+
f"The store associated with this group ({type(self.store_path.store)}) "
303+
"does not support listing, "
304+
"specifically via the `list_dir` method. "
305+
"This function requires a store that supports listing."
306+
)
283307

284-
async def groups(self) -> AsyncIterator[AsyncGroup]:
285-
raise NotImplementedError
308+
raise ValueError(msg)
309+
subkeys = await self.store_path.store.list_dir(self.store_path.path)
310+
# would be nice to make these special keys accessible programmatically,
311+
# and scoped to specific zarr versions
312+
subkeys_filtered = filter(lambda v: v not in ("zarr.json", ".zgroup", ".zattrs"), subkeys)
313+
# is there a better way to schedule this?
314+
for subkey in subkeys_filtered:
315+
try:
316+
yield (subkey, await self.getitem(subkey))
317+
except KeyError:
318+
# KeyError is raised when `subkey` names an object (in the object storage sense),
319+
# as opposed to a prefix, in the store under the prefix associated with this group
320+
# in which case `subkey` cannot be the name of a sub-array or sub-group.
321+
logger.warning(
322+
"Object at %s is not recognized as a component of a Zarr hierarchy.", subkey
323+
)
324+
pass
286325

287-
async def array_keys(self) -> AsyncIterator[str]:
326+
async def contains(self, member: str) -> bool:
288327
raise NotImplementedError
289328

329+
# todo: decide if this method should be separate from `groups`
330+
async def group_keys(self) -> AsyncGenerator[str, None]:
331+
async for key, value in self.members():
332+
if isinstance(value, AsyncGroup):
333+
yield key
334+
335+
# todo: decide if this method should be separate from `group_keys`
336+
async def groups(self) -> AsyncGenerator[AsyncGroup, None]:
337+
async for key, value in self.members():
338+
if isinstance(value, AsyncGroup):
339+
yield value
340+
341+
# todo: decide if this method should be separate from `arrays`
342+
async def array_keys(self) -> AsyncGenerator[str, None]:
343+
async for key, value in self.members():
344+
if isinstance(value, AsyncArray):
345+
yield key
346+
347+
# todo: decide if this method should be separate from `array_keys`
290348
async def arrays(self) -> AsyncIterator[AsyncArray]:
291-
raise NotImplementedError
349+
async for key, value in self.members():
350+
if isinstance(value, AsyncArray):
351+
yield value
292352

293353
async def tree(self, expand=False, level=None) -> Any:
294354
raise NotImplementedError
@@ -331,7 +391,7 @@ def create(
331391
cls,
332392
store: StoreLike,
333393
*,
334-
attributes: Optional[Dict[str, Any]] = None,
394+
attributes: dict[str, Any] = {},
335395
exists_ok: bool = False,
336396
runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(),
337397
) -> Group:
@@ -358,7 +418,7 @@ def open(
358418
)
359419
return cls(obj)
360420

361-
def __getitem__(self, path: str) -> Union[Array, Group]:
421+
def __getitem__(self, path: str) -> Array | Group:
362422
obj = self._sync(self._async_group.getitem(path))
363423
if isinstance(obj, AsyncArray):
364424
return Array(obj)
@@ -378,7 +438,7 @@ def __setitem__(self, key, value):
378438
"""__setitem__ is not supported in v3"""
379439
raise NotImplementedError
380440

381-
async def update_attributes_async(self, new_attributes: Dict[str, Any]) -> Group:
441+
async def update_attributes_async(self, new_attributes: dict[str, Any]) -> Group:
382442
new_metadata = replace(self.metadata, attributes=new_attributes)
383443

384444
# Write new metadata
@@ -389,6 +449,10 @@ async def update_attributes_async(self, new_attributes: Dict[str, Any]) -> Group
389449
async_group = replace(self._async_group, metadata=new_metadata)
390450
return replace(self, _async_group=async_group)
391451

452+
@property
453+
def store_path(self) -> StorePath:
454+
return self._async_group.store_path
455+
392456
@property
393457
def metadata(self) -> GroupMetadata:
394458
return self._async_group.metadata
@@ -401,50 +465,54 @@ def attrs(self) -> Attributes:
401465
def info(self):
402466
return self._async_group.info
403467

404-
@property
405-
def store_path(self) -> StorePath:
406-
return self._async_group.store_path
407-
408-
def update_attributes(self, new_attributes: Dict[str, Any]):
468+
def update_attributes(self, new_attributes: dict[str, Any]):
409469
self._sync(self._async_group.update_attributes(new_attributes))
410470
return self
411471

412472
@property
413-
def nchildren(self) -> int:
414-
return self._sync(self._async_group.nchildren())
473+
def nmembers(self) -> int:
474+
return self._sync(self._async_group.nmembers())
415475

416476
@property
417-
def children(self) -> List[Union[Array, Group]]:
418-
raise NotImplementedError
419-
# Uncomment with AsyncGroup implements this method
420-
# _children: List[Union[AsyncArray, AsyncGroup]] = self._sync_iter(
421-
# self._async_group.children()
422-
# )
423-
# return [Array(obj) if isinstance(obj, AsyncArray) else Group(obj) for obj in _children]
477+
def members(self) -> tuple[tuple[str, Array | Group], ...]:
478+
"""
479+
Return the sub-arrays and sub-groups of this group as a `tuple` of (name, array | group)
480+
pairs
481+
"""
482+
_members: list[tuple[str, AsyncArray | AsyncGroup]] = self._sync_iter(
483+
self._async_group.members()
484+
)
485+
ret: list[tuple[str, Array | Group]] = []
486+
for key, value in _members:
487+
if isinstance(value, AsyncArray):
488+
ret.append((key, Array(value)))
489+
else:
490+
ret.append((key, Group(value)))
491+
return tuple(ret)
424492

425-
def __contains__(self, child) -> bool:
426-
return self._sync(self._async_group.contains(child))
493+
def __contains__(self, member) -> bool:
494+
return self._sync(self._async_group.contains(member))
427495

428-
def group_keys(self) -> List[str]:
429-
raise NotImplementedError
496+
def group_keys(self) -> list[str]:
430497
# uncomment when AsyncGroup implements this method
431498
# return self._sync_iter(self._async_group.group_keys())
499+
raise NotImplementedError
432500

433-
def groups(self) -> List[Group]:
501+
def groups(self) -> list[Group]:
434502
# TODO: in v2 this was a generator that returned key: Group
435-
raise NotImplementedError
436503
# uncomment when AsyncGroup implements this method
437504
# return [Group(obj) for obj in self._sync_iter(self._async_group.groups())]
505+
raise NotImplementedError
438506

439-
def array_keys(self) -> List[str]:
507+
def array_keys(self) -> list[str]:
440508
# uncomment when AsyncGroup implements this method
441-
# return self._sync_iter(self._async_group.array_keys())
509+
# return self._sync_iter(self._async_group.array_keys())
442510
raise NotImplementedError
443511

444-
def arrays(self) -> List[Array]:
445-
raise NotImplementedError
512+
def arrays(self) -> list[Array]:
446513
# uncomment when AsyncGroup implements this method
447-
# return [Array(obj) for obj in self._sync_iter(self._async_group.arrays())]
514+
# return [Array(obj) for obj in self._sync_iter(self._async_group.arrays())]
515+
raise NotImplementedError
448516

449517
def tree(self, expand=False, level=None) -> Any:
450518
return self._sync(self._async_group.tree(expand=expand, level=level))

tests/v2/conftest.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import pathlib
2-
32
import pytest
43

54

tests/v3/conftest.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
import pathlib
2+
import pytest
3+
4+
from zarr.store import LocalStore, StorePath, MemoryStore, RemoteStore
5+
6+
7+
@pytest.fixture(params=[str, pathlib.Path])
8+
def path_type(request):
9+
return request.param
10+
11+
12+
# todo: harmonize this with local_store fixture
13+
@pytest.fixture
14+
def store_path(tmpdir):
15+
store = LocalStore(str(tmpdir))
16+
p = StorePath(store)
17+
return p
18+
19+
20+
@pytest.fixture(scope="function")
21+
def local_store(tmpdir):
22+
return LocalStore(str(tmpdir))
23+
24+
25+
@pytest.fixture(scope="function")
26+
def remote_store():
27+
return RemoteStore()
28+
29+
30+
@pytest.fixture(scope="function")
31+
def memory_store():
32+
return MemoryStore()

0 commit comments

Comments
 (0)