refactor: split metadata into v2 and v3 modules (#2163)

d-v-b · web-flow · commit 52d68490b308 · 2024-09-12T20:30:27.000+02:00
* refactor: split metadata into v2 and v3 modules

* add more explicit typeguards

* port fill value normalization from v2

* remove v2 suffix from zarr format parsing

* remove v2 suffix from zarr format parsing
diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py
@@ -10,7 +10,8 @@
 from zarr.core.array import Array, AsyncArray
 from zarr.core.common import JSON, AccessModeLiteral, ChunkCoords, MemoryOrder, ZarrFormat
 from zarr.core.group import AsyncGroup
-from zarr.core.metadata import ArrayV2Metadata, ArrayV3Metadata
+from zarr.core.metadata.v2 import ArrayV2Metadata
+from zarr.core.metadata.v3 import ArrayV3Metadata
 from zarr.store import (
     StoreLike,
     make_store_path,
diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py
@@ -44,7 +44,7 @@
     get_indexer,
     morton_order_iter,
 )
-from zarr.core.metadata import parse_codecs
+from zarr.core.metadata.v3 import parse_codecs
 from zarr.registry import get_ndbuffer_class, get_pipeline_class, register_codec
 
 if TYPE_CHECKING:
diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py
@@ -55,7 +55,8 @@
     is_scalar,
     pop_fields,
 )
-from zarr.core.metadata import ArrayMetadata, ArrayV2Metadata, ArrayV3Metadata
+from zarr.core.metadata.v2 import ArrayV2Metadata
+from zarr.core.metadata.v3 import ArrayV3Metadata
 from zarr.core.sync import sync
 from zarr.registry import get_pipeline_class
 from zarr.store import StoreLike, StorePath, make_store_path
@@ -67,6 +68,7 @@
     from collections.abc import Iterable
 
     from zarr.abc.codec import Codec, CodecPipeline
+    from zarr.core.metadata.common import ArrayMetadata
 
 # Array and AsyncArray are defined in the base ``zarr`` namespace
 __all__ = ["parse_array_metadata", "create_codec_pipeline"]
diff --git a/src/zarr/core/metadata/__init__.py b/src/zarr/core/metadata/__init__.py
@@ -0,0 +1,4 @@
+from .v2 import ArrayV2Metadata
+from .v3 import ArrayV3Metadata
+
+__all__ = ["ArrayV2Metadata", "ArrayV3Metadata"]
diff --git a/src/zarr/core/metadata/common.py b/src/zarr/core/metadata/common.py
@@ -0,0 +1,67 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from typing import Any, Literal
+
+    import numpy as np
+    from typing_extensions import Self
+
+    from zarr.core.array_spec import ArraySpec
+    from zarr.core.buffer import Buffer, BufferPrototype
+    from zarr.core.chunk_grids import ChunkGrid
+    from zarr.core.common import JSON, ChunkCoords, ZarrFormat
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+
+from zarr.abc.metadata import Metadata
+
+
+@dataclass(frozen=True, kw_only=True)
+class ArrayMetadata(Metadata, ABC):
+    """
+    Abstract base class for array metadata documents.
+
+    Declares the fields and the abstract properties / methods that concrete
+    metadata classes must provide. Instances are frozen, keyword-only dataclasses.
+    """
+
+    shape: ChunkCoords
+    fill_value: Any
+    chunk_grid: ChunkGrid
+    attributes: dict[str, JSON]
+    zarr_format: ZarrFormat
+
+    @property
+    @abstractmethod
+    def dtype(self) -> np.dtype[Any]:
+        """The numpy dtype described by this metadata."""
+        pass
+
+    @property
+    @abstractmethod
+    def ndim(self) -> int:
+        """The number of dimensions, as an int."""
+        pass
+
+    @abstractmethod
+    def get_chunk_spec(
+        self, _chunk_coords: ChunkCoords, order: Literal["C", "F"], prototype: BufferPrototype
+    ) -> ArraySpec:
+        """Build the ``ArraySpec`` for a chunk from its coordinates, memory order and buffer prototype."""
+        pass
+
+    @abstractmethod
+    def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str:
+        """Encode chunk coordinates as a string key."""
+        pass
+
+    @abstractmethod
+    def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]:
+        """Serialize this metadata into a mapping from string keys to ``Buffer`` objects."""
+        pass
+
+    @abstractmethod
+    def update_shape(self, shape: ChunkCoords) -> Self:
+        """Return an instance of the same type that uses ``shape``."""
+        pass
+
+    @abstractmethod
+    def update_attributes(self, attributes: dict[str, JSON]) -> Self:
+        """Return an instance of the same type that uses ``attributes``."""
+        pass
+
+
+def parse_attributes(data: None | dict[str, JSON]) -> dict[str, JSON]:
+    """
+    Normalize user attributes: ``None`` becomes a new empty dict, anything else is
+    returned unchanged (the same object, not a copy).
+    """
+    if data is None:
+        return {}
+
+    return data
diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py
@@ -0,0 +1,235 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from typing import Any, Literal
+
+    import numpy.typing as npt
+    from typing_extensions import Self
+
+    from zarr.core.buffer import Buffer, BufferPrototype
+    from zarr.core.common import JSON, ChunkCoords
+
+import json
+from dataclasses import dataclass, field, replace
+
+import numpy as np
+
+from zarr.core.array_spec import ArraySpec
+from zarr.core.chunk_grids import RegularChunkGrid
+from zarr.core.chunk_key_encodings import parse_separator
+from zarr.core.common import ZARRAY_JSON, ZATTRS_JSON, parse_dtype, parse_shapelike
+from zarr.core.config import config, parse_indexing_order
+from zarr.core.metadata.common import ArrayMetadata, parse_attributes
+
+
+@dataclass(frozen=True, kw_only=True)
+class ArrayV2Metadata(ArrayMetadata):
+    """
+    Metadata document for a Zarr version 2 array.
+
+    The hand-written ``__init__`` accepts ``dtype`` and ``chunks`` keywords, but the
+    parsed values are stored in the ``data_type`` and ``chunk_grid`` fields. The
+    ``dtype`` and ``chunks`` properties expose them under the v2 names again.
+    """
+
+    shape: ChunkCoords
+    chunk_grid: RegularChunkGrid
+    data_type: np.dtype[Any]
+    fill_value: None | int | float = 0
+    order: Literal["C", "F"] = "C"
+    filters: list[dict[str, JSON]] | None = None
+    dimension_separator: Literal[".", "/"] = "."
+    compressor: dict[str, JSON] | None = None
+    attributes: dict[str, JSON] = field(default_factory=dict)
+    # not settable by callers: excluded from __init__ and fixed to 2
+    zarr_format: Literal[2] = field(init=False, default=2)
+
+    def __init__(
+        self,
+        *,
+        shape: ChunkCoords,
+        dtype: npt.DTypeLike,
+        chunks: ChunkCoords,
+        fill_value: Any,
+        order: Literal["C", "F"],
+        dimension_separator: Literal[".", "/"] = ".",
+        compressor: dict[str, JSON] | None = None,
+        filters: list[dict[str, JSON]] | None = None,
+        attributes: dict[str, JSON] | None = None,
+    ):
+        """
+        Metadata for a Zarr version 2 array.
+
+        Every argument is passed through its ``parse_*`` helper before being stored.
+        ``fill_value`` is parsed against the already-parsed dtype.
+
+        Raises
+        ------
+        ValueError
+            If ``shape`` and ``chunks`` differ in length (see ``parse_metadata``), or if
+            ``parse_fill_value`` rejects ``fill_value`` for the dtype.
+        """
+        shape_parsed = parse_shapelike(shape)
+        data_type_parsed = parse_dtype(dtype)
+        chunks_parsed = parse_shapelike(chunks)
+        compressor_parsed = parse_compressor(compressor)
+        order_parsed = parse_indexing_order(order)
+        dimension_separator_parsed = parse_separator(dimension_separator)
+        filters_parsed = parse_filters(filters)
+        fill_value_parsed = parse_fill_value(fill_value, dtype=data_type_parsed)
+        attributes_parsed = parse_attributes(attributes)
+
+        # the dataclass is frozen, so normal attribute assignment would raise;
+        # object.__setattr__ bypasses the frozen check during construction
+        object.__setattr__(self, "shape", shape_parsed)
+        object.__setattr__(self, "data_type", data_type_parsed)
+        object.__setattr__(self, "chunk_grid", RegularChunkGrid(chunk_shape=chunks_parsed))
+        object.__setattr__(self, "compressor", compressor_parsed)
+        object.__setattr__(self, "order", order_parsed)
+        object.__setattr__(self, "dimension_separator", dimension_separator_parsed)
+        object.__setattr__(self, "filters", filters_parsed)
+        object.__setattr__(self, "fill_value", fill_value_parsed)
+        object.__setattr__(self, "attributes", attributes_parsed)
+
+        # ensure that the metadata document is consistent
+        _ = parse_metadata(self)
+
+    @property
+    def ndim(self) -> int:
+        """Number of dimensions, i.e. the length of ``shape``."""
+        return len(self.shape)
+
+    @property
+    def dtype(self) -> np.dtype[Any]:
+        """The stored ``data_type`` field under its v2 name."""
+        return self.data_type
+
+    @property
+    def chunks(self) -> ChunkCoords:
+        """The chunk shape of the stored ``chunk_grid``."""
+        return self.chunk_grid.chunk_shape
+
+    def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]:
+        """
+        Serialize the metadata into two JSON documents, keyed by ``ZARRAY_JSON`` and
+        ``ZATTRS_JSON``, each wrapped in a buffer created from ``prototype``.
+
+        Raises
+        ------
+        TypeError
+            If ``to_dict`` does not return a dict, or if the array document contains a
+            value that ``_json_convert`` cannot convert.
+        """
+
+        def _json_convert(
+            o: Any,
+        ) -> Any:
+            # ``default=`` hook for json.dumps: only called for objects json cannot
+            # serialize natively
+            if isinstance(o, np.dtype):
+                if o.fields is None:
+                    return o.str
+                else:
+                    return o.descr
+            if np.isscalar(o):
+                # convert numpy scalar to python type, and pass
+                # python types through
+                return getattr(o, "item", lambda: o)()
+            raise TypeError
+
+        zarray_dict = self.to_dict()
+
+        # todo: remove this check when we can ensure that to_dict always returns dicts.
+        if not isinstance(zarray_dict, dict):
+            raise TypeError(f"Invalid type: got {type(zarray_dict)}, expected dict.")
+
+        # attributes are written to their own document, not into the array document
+        zattrs_dict = zarray_dict.pop("attributes", {})
+        json_indent = config.get("json_indent")
+        return {
+            ZARRAY_JSON: prototype.buffer.from_bytes(
+                json.dumps(zarray_dict, default=_json_convert, indent=json_indent).encode()
+            ),
+            ZATTRS_JSON: prototype.buffer.from_bytes(
+                json.dumps(zattrs_dict, indent=json_indent).encode()
+            ),
+        }
+
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata:
+        """
+        Build an instance from a dict of constructor keywords plus a ``zarr_format`` key.
+
+        The input dict is shallow-copied and left unmodified. A missing ``zarr_format``
+        key raises ``KeyError``; a value other than 2 raises ``ValueError``.
+        """
+        # make a copy to protect the original from modification
+        _data = data.copy()
+        # check that the zarr_format attribute is correct
+        _ = parse_zarr_format(_data.pop("zarr_format"))
+        return cls(**_data)
+
+    def to_dict(self) -> JSON:
+        """
+        Return the metadata as a dict that uses the v2 key names: the ``chunk_grid``
+        entry is replaced by ``chunks`` and the ``data_type`` entry by ``dtype``.
+        """
+        zarray_dict = super().to_dict()
+
+        # todo: remove this check when we can ensure that to_dict always returns dicts.
+        if not isinstance(zarray_dict, dict):
+            raise TypeError(f"Invalid type: got {type(zarray_dict)}, expected dict.")
+
+        _ = zarray_dict.pop("chunk_grid")
+        zarray_dict["chunks"] = self.chunk_grid.chunk_shape
+
+        _ = zarray_dict.pop("data_type")
+        # NOTE(review): ``.str`` is already a plain string, so the np.dtype branch of
+        # ``_json_convert`` in ``to_buffer_dict`` (which would emit ``descr`` for a dtype
+        # with fields) looks unreachable for this key -- confirm structured dtypes
+        # round-trip as intended.
+        zarray_dict["dtype"] = self.data_type.str
+
+        return zarray_dict
+
+    def get_chunk_spec(
+        self, _chunk_coords: ChunkCoords, order: Literal["C", "F"], prototype: BufferPrototype
+    ) -> ArraySpec:
+        """
+        Return the ``ArraySpec`` for a chunk.
+
+        ``_chunk_coords`` is not used: the spec is built from the grid's chunk shape,
+        so it is the same for every chunk.
+        """
+        return ArraySpec(
+            shape=self.chunk_grid.chunk_shape,
+            dtype=self.dtype,
+            fill_value=self.fill_value,
+            order=order,
+            prototype=prototype,
+        )
+
+    def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str:
+        """
+        Join the chunk coordinates with ``dimension_separator``. Empty coordinates
+        would give an empty string, so "0" is returned in that case.
+        """
+        chunk_identifier = self.dimension_separator.join(map(str, chunk_coords))
+        return "0" if chunk_identifier == "" else chunk_identifier
+
+    def update_shape(self, shape: ChunkCoords) -> Self:
+        """Return a copy of this metadata with ``shape`` replaced."""
+        # NOTE(review): dataclasses.replace forwards every init field by field name
+        # (including ``chunk_grid`` and ``data_type``) to ``__init__``, which only accepts
+        # ``chunks`` and ``dtype`` -- this looks like it raises TypeError; confirm.
+        return replace(self, shape=shape)
+
+    def update_attributes(self, attributes: dict[str, JSON]) -> Self:
+        """Return a copy of this metadata with ``attributes`` replaced."""
+        # NOTE(review): dataclasses.replace passes ``chunk_grid`` / ``data_type`` to an
+        # ``__init__`` that expects ``chunks`` / ``dtype`` -- confirm this does not raise.
+        return replace(self, attributes=attributes)
+
+
+def parse_zarr_format(data: Literal[2]) -> Literal[2]:
+    """
+    Return ``data`` if it compares equal to 2; otherwise raise ``ValueError``.
+    """
+    if data == 2:
+        return data
+    raise ValueError(f"Invalid value. Expected 2. Got {data}.")
+
+
+def parse_filters(data: list[dict[str, JSON]] | None) -> list[dict[str, JSON]] | None:
+    """Return ``data`` unchanged; no validation is performed."""
+    return data
+
+
+def parse_compressor(data: dict[str, JSON] | None) -> dict[str, JSON] | None:
+    """Return ``data`` unchanged; no validation is performed."""
+    return data
+
+
+def parse_metadata(data: ArrayV2Metadata) -> ArrayV2Metadata:
+    """
+    Check that a metadata object is internally consistent.
+
+    Returns ``data`` unchanged when ``chunks`` and ``shape`` have the same length;
+    raises ``ValueError`` otherwise. No other consistency check is made.
+    """
+    if (l_chunks := len(data.chunks)) != (l_shape := len(data.shape)):
+        msg = (
+            f"The `shape` and `chunks` attributes must have the same length. "
+            f"`chunks` has length {l_chunks}, but `shape` has length {l_shape}."
+        )
+        raise ValueError(msg)
+    return data
+
+
+def parse_fill_value(fill_value: Any, dtype: np.dtype[Any]) -> Any:
+    """
+    Parse a potential fill value into a value that is compatible with the provided dtype.
+
+    Parameters
+    ----------
+    fill_value: Any
+        A potential fill value.
+    dtype: np.dtype[Any]
+        A numpy dtype.
+
+    Returns
+    -------
+    Any
+        An instance of `dtype`, or `None`, or any python object (in the case of an object dtype)
+
+    Raises
+    ------
+    ValueError
+        If `dtype` is a unicode dtype and `fill_value` is not a `str`, or if numpy cannot
+        convert `fill_value` to `dtype`.
+    """
+
+    if fill_value is None or dtype.hasobject:
+        # None, and any value for a dtype containing objects, is returned unchanged
+        pass
+    elif not isinstance(fill_value, np.void) and fill_value == 0:
+        # this should be compatible across numpy versions for any array type, including
+        # structured arrays
+        fill_value = np.zeros((), dtype=dtype)[()]
+
+    elif dtype.kind == "U":
+        # special case unicode because of encoding issues on Windows if passed through numpy
+        # https://github.com/alimanfoo/zarr/pull/172#issuecomment-343782713
+
+        # a valid str is returned as-is, without conversion through numpy
+        if not isinstance(fill_value, str):
+            raise ValueError(
+                f"fill_value {fill_value!r} is not valid for dtype {dtype}; must be a unicode string"
+            )
+    else:
+        try:
+            if isinstance(fill_value, bytes) and dtype.kind == "V":
+                # special case for numpy 1.14 compatibility
+                fill_value = np.array(fill_value, dtype=dtype.str).view(dtype)[()]
+            else:
+                # indexing the 0-d array with [()] extracts the scalar of `dtype`
+                fill_value = np.array(fill_value, dtype=dtype)[()]
+
+        except Exception as e:
+            msg = f"Fill_value {fill_value} is not valid for dtype {dtype}."
+            raise ValueError(msg) from e
+
+    return fill_value
diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py
diff --git a/tests/v3/test_metadata/test_v2.py b/tests/v3/test_metadata/test_v2.py
diff --git a/tests/v3/test_metadata/test_v3.py b/tests/v3/test_metadata/test_v3.py

Original file line number	Diff line number	Diff line change
`@@ -44,7 +44,7 @@`
`44`	`44`	`get_indexer,`
`45`	`45`	`morton_order_iter,`
`46`	`46`	`)`
`47`		`-from zarr.core.metadata import parse_codecs`
	`47`	`+from zarr.core.metadata.v3 import parse_codecs`
`48`	`48`	`from zarr.registry import get_ndbuffer_class, get_pipeline_class, register_codec`
`49`	`49`
`50`	`50`	`if TYPE_CHECKING:`