Skip to content

Commit fc901eb

Browse files
committed
read zarr-v2 consolidated metadata
1 parent 79bf235 commit fc901eb

File tree

5 files changed

+194
-10
lines changed

5 files changed

+194
-10
lines changed

src/zarr/api/asynchronous.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,7 @@ async def open(
271271

272272

273273
async def open_consolidated(*args: Any, **kwargs: Any) -> AsyncGroup:
    """Open a group with consolidated-metadata reading enabled by default.

    Every positional and keyword argument is forwarded unchanged to
    ``open_group``. ``open_consolidated=True`` is injected only when the
    caller did not supply that keyword, so an explicit value always wins.
    """
    if "open_consolidated" not in kwargs:
        kwargs["open_consolidated"] = True
    group = await open_group(*args, **kwargs)
    return group
275276

276277

@@ -531,6 +532,7 @@ async def open_group(
531532
zarr_format: ZarrFormat | None = None,
532533
meta_array: Any | None = None, # not used
533534
attributes: dict[str, JSON] | None = None,
535+
open_consolidated: bool = False,
534536
) -> AsyncGroup:
535537
"""Open a group using file-mode-like semantics.
536538
@@ -590,7 +592,9 @@ async def open_group(
590592
attributes = {}
591593

592594
try:
593-
return await AsyncGroup.open(store_path, zarr_format=zarr_format)
595+
return await AsyncGroup.open(
596+
store_path, zarr_format=zarr_format, open_consolidated=open_consolidated
597+
)
594598
except (KeyError, FileNotFoundError):
595599
return await AsyncGroup.create(
596600
store_path, zarr_format=zarr_format, exists_ok=True, attributes=attributes

src/zarr/core/common.py

+1
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
ZARRAY_JSON = ".zarray"
2929
ZGROUP_JSON = ".zgroup"
3030
ZATTRS_JSON = ".zattrs"
31+
ZMETADATA_v2_JSON = ".zmetadata"
3132

3233
BytesLike = bytes | bytearray | memoryview
3334
ShapeLike = tuple[int, ...] | int

src/zarr/core/group.py

+54-4
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import asyncio
44
import json
55
import logging
6+
from collections import defaultdict
67
from dataclasses import asdict, dataclass, field, replace
78
from typing import TYPE_CHECKING, Literal, cast, overload
89

@@ -28,7 +29,7 @@
2829
parse_shapelike,
2930
)
3031
from zarr.core.config import config
31-
from zarr.core.metadata import ArrayMetadata, ArrayV3Metadata
32+
from zarr.core.metadata import ArrayMetadata, ArrayV2Metadata, ArrayV3Metadata
3233
from zarr.core.sync import SyncMixin, sync
3334
from zarr.store import StoreLike, StorePath, make_store_path
3435
from zarr.store.common import ensure_no_existing_node
@@ -126,7 +127,14 @@ def from_dict(cls, data: dict[str, JSON]) -> ConsolidatedMetadata:
126127
elif node_type == "array":
127128
metadata[k] = ArrayV3Metadata.from_dict(v)
128129
else:
129-
raise ValueError(f"Invalid node_type: '{node_type}'")
130+
# We either have V2 metadata, or invalid metadata
131+
if "shape" in v:
132+
# probably ArrayV2Metadata
133+
metadata[k] = ArrayV2Metadata.from_dict(v)
134+
else:
135+
# probably v2 Group metadata
136+
metadata[k] = GroupMetadata.from_dict(v)
137+
130138
# assert data["kind"] == "inline"
131139
if data["kind"] != "inline":
132140
raise ValueError
@@ -226,15 +234,26 @@ async def open(
226234
cls,
227235
store: StoreLike,
228236
zarr_format: Literal[2, 3, None] = 3,
237+
open_consolidated: bool = False,
229238
) -> AsyncGroup:
230239
store_path = await make_store_path(store)
231240

232241
if zarr_format == 2:
233-
zgroup_bytes, zattrs_bytes = await asyncio.gather(
234-
(store_path / ZGROUP_JSON).get(), (store_path / ZATTRS_JSON).get()
242+
paths = [store_path / ZGROUP_JSON, store_path / ZATTRS_JSON]
243+
if open_consolidated:
244+
paths.append(store_path / ".zmetadata") # todo: configurable
245+
246+
zgroup_bytes, zattrs_bytes, *rest = await asyncio.gather(
247+
*[path.get() for path in paths]
235248
)
236249
if zgroup_bytes is None:
237250
raise FileNotFoundError(store_path)
251+
252+
if open_consolidated:
253+
consolidated_metadata_bytes = rest[0]
254+
if consolidated_metadata_bytes is None:
255+
raise FileNotFoundError(paths[-1])
256+
238257
elif zarr_format == 3:
239258
zarr_json_bytes = await (store_path / ZARR_JSON).get()
240259
if zarr_json_bytes is None:
@@ -265,6 +284,37 @@ async def open(
265284
zgroup = json.loads(zgroup_bytes.to_bytes())
266285
zattrs = json.loads(zattrs_bytes.to_bytes()) if zattrs_bytes is not None else {}
267286
group_metadata = {**zgroup, "attributes": zattrs}
287+
288+
if open_consolidated:
    # The bytes were checked for None right after the store was read; this
    # assert only narrows the type for the checker.
    assert consolidated_metadata_bytes is not None

    v2_consolidated_metadata = json.loads(consolidated_metadata_bytes.to_bytes())
    v2_consolidated_metadata = v2_consolidated_metadata["metadata"]
    # The root .zattrs / .zgroup documents were already read directly from
    # the store, so the consolidated copies are discarded. A default is
    # passed because either entry may be missing (the direct read above
    # already tolerates an absent .zattrs); an unguarded pop() would raise
    # KeyError, which open_group() catches and answers by creating a group.
    v2_consolidated_metadata.pop(".zattrs", None)
    v2_consolidated_metadata.pop(".zgroup", None)

    consolidated_metadata: defaultdict[str, dict[str, Any]] = defaultdict(dict)

    # Keys look like "air/.zarray" or "air/.zattrs": split off the trailing
    # dot-file name and merge all documents of one node under its path.
    for k, v in v2_consolidated_metadata.items():
        path, separator, kind = k.rpartition("/.")
        if not separator:
            # Previously surfaced as an opaque tuple-unpacking ValueError.
            raise ValueError(f"Invalid consolidated metadata key '{k}'")

        if kind == "zarray":
            consolidated_metadata[path].update(v)
        elif kind == "zattrs":
            consolidated_metadata[path]["attributes"] = v
        elif kind == "zgroup":
            consolidated_metadata[path].update(v)
        else:
            raise ValueError(f"Invalid file type '{kind}' at path '{path}'")
    # Re-express the v2 document in the inline layout that
    # ConsolidatedMetadata.from_dict() consumes.
    group_metadata["consolidated_metadata"] = {
        "metadata": dict(consolidated_metadata),
        "kind": "inline",
        "must_understand": False,
    }
317+
268318
else:
269319
# V3 groups are comprised of a zarr.json object
270320
assert zarr_json_bytes is not None

tests/v3/conftest.py

+9
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,9 @@
2424
from zarr.core.common import ChunkCoords, MemoryOrder, ZarrFormat
2525

2626

27+
HERE = pathlib.Path(__file__).parent
28+
29+
2730
async def parse_store(
2831
store: Literal["local", "memory", "remote"], path: str
2932
) -> LocalStore | MemoryStore | RemoteStore:
@@ -91,6 +94,12 @@ async def async_group(request: pytest.FixtureRequest, tmpdir: LEGACY_PATH) -> As
9194
return agroup
9295

9396

97+
@pytest.fixture(scope="function")
async def complex_hierarchy() -> LocalStore:
    """Provide a ``LocalStore`` rooted at the bundled complex-hierarchy example.

    The store path ``examples/complex-hierarchy.zarr`` is resolved relative
    to ``HERE``, the directory containing this conftest module.
    """
    return LocalStore(root=HERE / "examples/complex-hierarchy.zarr")
101+
102+
94103
@pytest.fixture(params=["numpy", "cupy"])
95104
def xp(request: pytest.FixtureRequest) -> Iterator[ModuleType]:
96105
"""Fixture to parametrize over numpy-like libraries"""

tests/v3/test_metadata/test_v2.py

+125-5
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,22 @@
11
from __future__ import annotations
22

3+
import json
34
from typing import TYPE_CHECKING, Literal
45

5-
if TYPE_CHECKING:
6-
from typing import Any
7-
8-
from zarr.abc.codec import Codec
9-
6+
import numpy as np
107
import pytest
118

9+
import zarr.store
1210
from zarr.codecs import GzipCodec
11+
from zarr.core.buffer import cpu
12+
from zarr.core.group import ConsolidatedMetadata, GroupMetadata
1313
from zarr.core.metadata import ArrayV2Metadata, parse_zarr_format_v2
1414

15+
if TYPE_CHECKING:
16+
from typing import Any
17+
18+
from zarr.abc.codec import Codec
19+
1520

1621
def test_parse_zarr_format_valid() -> None:
1722
assert parse_zarr_format_v2(2) == 2
@@ -70,3 +75,118 @@ def test_metadata_to_dict(
7075
observed.pop("dimension_separator")
7176

7277
assert observed == expected
78+
79+
80+
async def test_read_consolidated_metadata() -> None:
    """Check that a Zarr v2 ``.zmetadata`` document is parsed on open.

    A memory store is populated with a root group, two arrays (``air`` and
    ``time``) and a nested group, plus the matching ``.zmetadata`` document.
    Opening it through ``zarr.open_consolidated`` must yield a
    ``ConsolidatedMetadata`` holding one entry per child node.
    """
    # .zgroup, .zattrs, .zmetadata
    zmetadata = {
        "metadata": {
            ".zattrs": {
                "Conventions": "COARDS",
            },
            ".zgroup": {"zarr_format": 2},
            "air/.zarray": {
                "chunks": [730],
                "compressor": {},
                "dtype": "<i2",
                "fill_value": 0,
                "filters": None,
                "order": "C",
                "shape": [730],
                "zarr_format": 2,
            },
            "air/.zattrs": {
                "_ARRAY_DIMENSIONS": ["time"],
                "dataset": "NMC Reanalysis",
            },
            "time/.zarray": {
                "chunks": [730],
                "compressor": {},
                "dtype": "<f4",
                "fill_value": "0.0",
                "filters": None,
                "order": "C",
                "shape": [730],
                "zarr_format": 2,
            },
            "time/.zattrs": {
                "_ARRAY_DIMENSIONS": ["time"],
                "calendar": "standard",
                "long_name": "Time",
                "standard_name": "time",
                "units": "hours since 1800-01-01",
            },
            # and a nested group for fun
            "nested/.zattrs": {"key": "value"},
            "nested/.zgroup": {"zarr_format": 2},
        },
        "zarr_consolidated_format": 1,
    }
    store_dict = {}
    store = zarr.store.MemoryStore(store_dict=store_dict, mode="a")

    def as_buffer(document):
        # JSON-encode a metadata document into the buffer type the store takes.
        return cpu.Buffer.from_bytes(json.dumps(document).encode())

    # Write each individual metadata document from the same dict that backs
    # .zmetadata, so the per-node files and the consolidated copy cannot drift.
    for key, document in zmetadata["metadata"].items():
        await store.set(key, as_buffer(document))
    await store.set(".zmetadata", as_buffer(zmetadata))

    group = zarr.open_consolidated(store=store, zarr_format=2)
    assert group.metadata.consolidated_metadata is not None
    expected = ConsolidatedMetadata(
        metadata={
            "air": ArrayV2Metadata(
                shape=(730,),
                fill_value=0,
                chunks=(730,),
                attributes={"_ARRAY_DIMENSIONS": ["time"], "dataset": "NMC Reanalysis"},
                dtype=np.dtype("int16"),
                order="C",
                filters=None,
                dimension_separator=".",
                compressor={},
            ),
            "time": ArrayV2Metadata(
                shape=(730,),
                fill_value=0.0,
                chunks=(730,),
                attributes={
                    "_ARRAY_DIMENSIONS": ["time"],
                    "calendar": "standard",
                    "long_name": "Time",
                    "standard_name": "time",
                    "units": "hours since 1800-01-01",
                },
                dtype=np.dtype("float32"),
                order="C",
                filters=None,
                dimension_separator=".",
                compressor={},
            ),
            "nested": GroupMetadata(attributes={"key": "value"}, zarr_format=2),
        },
        kind="inline",
        must_understand=False,
    )
    result = group.metadata.consolidated_metadata
    assert result == expected

0 commit comments

Comments
 (0)