diff --git a/src/zarr/abc/metadata.py b/src/zarr/abc/metadata.py
index 239d151c0c..291ceb459c 100644
--- a/src/zarr/abc/metadata.py
+++ b/src/zarr/abc/metadata.py
@@ -22,7 +22,6 @@ def to_dict(self) -> dict[str, JSON]:
         are instances of `Metadata`. Sequences of `Metadata` are similarly recursed into, and
         the output of that recursion is collected in a list.
         """
-        ...
         out_dict = {}
         for field in fields(self):
             key = field.name
diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py
index 19b44ec450..ec44673f9d 100644
--- a/src/zarr/core/metadata/v2.py
+++ b/src/zarr/core/metadata/v2.py
@@ -1,8 +1,9 @@
 from __future__ import annotations
 
+import base64
 from collections.abc import Iterable
 from enum import Enum
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, cast
 
 if TYPE_CHECKING:
     from typing import Any, Literal, Self
@@ -31,7 +32,7 @@ class ArrayV2Metadata(ArrayMetadata):
     shape: ChunkCoords
     chunk_grid: RegularChunkGrid
     data_type: np.dtype[Any]
-    fill_value: None | int | float = 0
+    fill_value: None | int | float | str | bytes = 0
     order: Literal["C", "F"] = "C"
     filters: tuple[numcodecs.abc.Codec, ...] | None = None
     dimension_separator: Literal[".", "/"] = "."
@@ -140,6 +141,13 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata:
         _data = data.copy()
         # check that the zarr_format attribute is correct
         _ = parse_zarr_format(_data.pop("zarr_format"))
+        dtype = parse_dtype(_data["dtype"])
+
+        if dtype.kind in "SV":
+            fill_value_encoded = _data.get("fill_value")
+            if fill_value_encoded is not None:
+                fill_value = base64.standard_b64decode(fill_value_encoded)
+                _data["fill_value"] = fill_value
 
         # zarr v2 allowed arbitrary keys here.
         # We don't want the ArrayV2Metadata constructor to fail just because someone put an
@@ -155,6 +163,14 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata:
 
     def to_dict(self) -> dict[str, JSON]:
         zarray_dict = super().to_dict()
+
+        if self.dtype.kind in "SV" and self.fill_value is not None:
+            # There's a relationship between self.dtype and self.fill_value
+            # that mypy isn't aware of. The fact that we have S or V dtype here
+            # means we should have a bytes-type fill_value.
+            fill_value = base64.standard_b64encode(cast(bytes, self.fill_value)).decode("ascii")
+            zarray_dict["fill_value"] = fill_value
+
         _ = zarray_dict.pop("chunk_grid")
         zarray_dict["chunks"] = self.chunk_grid.chunk_shape
 
diff --git a/tests/v3/test_v2.py b/tests/v3/test_v2.py
index 95a5f66602..f488782d78 100644
--- a/tests/v3/test_v2.py
+++ b/tests/v3/test_v2.py
@@ -1,3 +1,4 @@
+import json
 from collections.abc import Iterator
 
 import numpy as np
@@ -6,6 +7,9 @@
 from numcodecs.blosc import Blosc
 
 import zarr
+import zarr.core.buffer.cpu
+import zarr.core.metadata
+import zarr.storage
 from zarr import Array
 from zarr.storage import MemoryStore, StorePath
 
@@ -46,3 +50,37 @@ def test_codec_pipeline() -> None:
     result = array[:]
     expected = np.ones(1)
     np.testing.assert_array_equal(result, expected)
+
+
+@pytest.mark.parametrize("dtype", ["|S", "|V"])
+async def test_v2_encode_decode(dtype):
+    store = zarr.storage.MemoryStore(mode="w")
+    g = zarr.group(store=store, zarr_format=2)
+    g.create_array(
+        name="foo",
+        shape=(3,),
+        chunks=(3,),
+        dtype=dtype,
+        fill_value=b"X",
+    )
+
+    result = await store.get("foo/.zarray", zarr.core.buffer.default_buffer_prototype())
+    assert result is not None
+
+    serialized = json.loads(result.to_bytes())
+    expected = {
+        "chunks": [3],
+        "compressor": None,
+        "dtype": f"{dtype}0",
+        "fill_value": "WA==",
+        "filters": None,
+        "order": "C",
+        "shape": [3],
+        "zarr_format": 2,
+        "dimension_separator": ".",
+    }
+    assert serialized == expected
+
+    data = zarr.open_array(store=store, path="foo")[:]
+    expected = np.full((3,), b"X", dtype=dtype)
+    np.testing.assert_equal(data, expected)