zarr-developers · dcherian · Apr 4, 2025 · Feb 5, 2025 · Feb 18, 2025 · Feb 20, 2025
diff --git a/changes/2802.fix.rst b/changes/2802.fix.rst
@@ -0,0 +1 @@
+Fix `fill_value` serialization for `NaN` in `ArrayV2Metadata` and add property-based testing of round-trip serialization
diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py
@@ -3456,7 +3456,7 @@ def _build_metadata_v3(zarr_json: dict[str, JSON]) -> ArrayV3Metadata | GroupMet
 
 
 def _build_metadata_v2(
-    zarr_json: dict[str, object], attrs_json: dict[str, JSON]
+    zarr_json: dict[str, JSON], attrs_json: dict[str, JSON]
 ) -> ArrayV2Metadata | GroupMetadata:
     """
     Convert a dict representation of Zarr V2 metadata into the corresponding metadata class.

diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py
@@ -2,24 +2,25 @@
 
 import base64
 import warnings
-from collections.abc import Iterable
+from collections.abc import Iterable, Sequence
 from enum import Enum
 from functools import cached_property
-from typing import TYPE_CHECKING, TypedDict, cast
+from typing import TYPE_CHECKING, Any, TypedDict, cast
 
 import numcodecs.abc
 
 from zarr.abc.metadata import Metadata
 
 if TYPE_CHECKING:
-    from typing import Any, Literal, Self
+    from typing import Literal, Self
 
     import numpy.typing as npt
 
     from zarr.core.buffer import Buffer, BufferPrototype
     from zarr.core.common import ChunkCoords
 
 import json
+import numbers
 from dataclasses import dataclass, field, fields, replace
 
 import numcodecs
@@ -146,41 +147,39 @@ def _json_convert(
             raise TypeError
 
         zarray_dict = self.to_dict()
+        zarray_dict["fill_value"] = _serialize_fill_value(self.fill_value, self.dtype)
         zattrs_dict = zarray_dict.pop("attributes", {})
         json_indent = config.get("json_indent")
         return {
             ZARRAY_JSON: prototype.buffer.from_bytes(
-                json.dumps(zarray_dict, default=_json_convert, indent=json_indent).encode()
+                json.dumps(
+                    zarray_dict, default=_json_convert, indent=json_indent, allow_nan=False
+                ).encode()
             ),
             ZATTRS_JSON: prototype.buffer.from_bytes(
-                json.dumps(zattrs_dict, indent=json_indent).encode()
+                json.dumps(zattrs_dict, indent=json_indent, allow_nan=False).encode()
             ),
         }
 
     @classmethod
     def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata:
-        # make a copy to protect the original from modification
+        # Make a copy to protect the original from modification.
         _data = data.copy()
-        # check that the zarr_format attribute is correct
+        # Check that the zarr_format attribute is correct.
         _ = parse_zarr_format(_data.pop("zarr_format"))
-        dtype = parse_dtype(_data["dtype"])
 
-        if dtype.kind in "SV":
-            fill_value_encoded = _data.get("fill_value")
-            if fill_value_encoded is not None:
-                fill_value = base64.standard_b64decode(fill_value_encoded)
-                _data["fill_value"] = fill_value
-
-        # zarr v2 allowed arbitrary keys here.
-        # We don't want the ArrayV2Metadata constructor to fail just because someone put an
-        # extra key in the metadata.
+        # zarr v2 allowed arbitrary keys in the metadata.
+        # Filter the keys to only those expected by the constructor.
         expected = {x.name for x in fields(cls)}
-        # https://github.com/zarr-developers/zarr-python/issues/2269
-        # handle the renames
         expected |= {"dtype", "chunks"}
 
         # check if `filters` is an empty sequence; if so use None instead and raise a warning
-        if _data["filters"] is not None and len(_data["filters"]) == 0:
+        filters = _data.get("filters")
+        if (
+            isinstance(filters, Sequence)
+            and not isinstance(filters, (str, bytes))
+            and len(filters) == 0
+        ):
             msg = (
                 "Found an empty list of filters in the array metadata document. "
                 "This is contrary to the Zarr V2 specification, and will cause an error in the future. "
@@ -196,13 +195,6 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata:
     def to_dict(self) -> dict[str, JSON]:
         zarray_dict = super().to_dict()
 
-        if self.dtype.kind in "SV" and self.fill_value is not None:
-            # There's a relationship between self.dtype and self.fill_value
-            # that mypy isn't aware of. The fact that we have S or V dtype here
-            # means we should have a bytes-type fill_value.
-            fill_value = base64.standard_b64encode(cast(bytes, self.fill_value)).decode("ascii")
-            zarray_dict["fill_value"] = fill_value
-
         _ = zarray_dict.pop("dtype")
         dtype_json: JSON
         # In the case of zarr v2, the simplest i.e., '|VXX' dtype is represented as a string
@@ -300,7 +292,26 @@ def parse_metadata(data: ArrayV2Metadata) -> ArrayV2Metadata:
     return data
 
 
-def parse_fill_value(fill_value: object, dtype: np.dtype[Any]) -> Any:
+def _parse_structured_fill_value(fill_value: Any, dtype: np.dtype[Any]) -> Any:
+    """Handle structured dtype/fill value pairs"""
+    print("FILL VALUE", fill_value, "DT", dtype)
+    try:
+        if isinstance(fill_value, list):
+            return np.array([tuple(fill_value)], dtype=dtype)[0]
+        elif isinstance(fill_value, tuple):
+            return np.array([fill_value], dtype=dtype)[0]
+        elif isinstance(fill_value, bytes):
+            return np.frombuffer(fill_value, dtype=dtype)[0]
+        elif isinstance(fill_value, str):
+            decoded = base64.standard_b64decode(fill_value)
+            return np.frombuffer(decoded, dtype=dtype)[0]
+        else:
+            return np.array(fill_value, dtype=dtype)[()]
+    except Exception as e:
+        raise ValueError(f"Fill_value {fill_value} is not valid for dtype {dtype}.") from e
+
+
+def parse_fill_value(fill_value: Any, dtype: np.dtype[Any]) -> Any:
     """
     Parse a potential fill value into a value that is compatible with the provided dtype.
 
@@ -317,13 +328,16 @@ def parse_fill_value(fill_value: object, dtype: np.dtype[Any]) -> Any:
     """
 
     if fill_value is None or dtype.hasobject:
-        # no fill value
         pass
+    elif dtype.fields is not None:
+        # the dtype is structured (has multiple fields), so the fill_value might be a
+        # compound value (e.g., a tuple or dict) that needs field-wise processing.
+        # We use parse_structured_fill_value to correctly convert each component.
+        fill_value = _parse_structured_fill_value(fill_value, dtype)
     elif not isinstance(fill_value, np.void) and fill_value == 0:
         # this should be compatible across numpy versions for any array type, including
         # structured arrays
         fill_value = np.zeros((), dtype=dtype)[()]
-
     elif dtype.kind == "U":
         # special case unicode because of encoding issues on Windows if passed through numpy
         # https://github.com/alimanfoo/zarr/pull/172#issuecomment-343782713
@@ -332,6 +346,11 @@ def parse_fill_value(fill_value: object, dtype: np.dtype[Any]) -> Any:
             raise ValueError(
                 f"fill_value {fill_value!r} is not valid for dtype {dtype}; must be a unicode string"
             )
+    elif dtype.kind in "SV" and isinstance(fill_value, str):
+        fill_value = base64.standard_b64decode(fill_value)
+    elif dtype.kind == "c" and isinstance(fill_value, list) and len(fill_value) == 2:
+        complex_val = complex(float(fill_value[0]), float(fill_value[1]))
+        fill_value = np.array(complex_val, dtype=dtype)[()]
     else:
         try:
             if isinstance(fill_value, bytes) and dtype.kind == "V":
@@ -347,6 +366,39 @@ def parse_fill_value(fill_value: object, dtype: np.dtype[Any]) -> Any:
     return fill_value
 
 
+def _serialize_fill_value(fill_value: Any, dtype: np.dtype[Any]) -> JSON:
+    serialized: JSON
+
+    if fill_value is None:
+        serialized = None
+    elif dtype.kind in "SV":
+        # There's a relationship between dtype and fill_value
+        # that mypy isn't aware of. The fact that we have S or V dtype here
+        # means we should have a bytes-type fill_value.
+        serialized = base64.standard_b64encode(cast(bytes, fill_value)).decode("ascii")
+    elif isinstance(fill_value, np.datetime64):
+        serialized = np.datetime_as_string(fill_value)
+    elif isinstance(fill_value, numbers.Integral):
+        serialized = int(fill_value)
+    elif isinstance(fill_value, numbers.Real):
+        float_fv = float(fill_value)
+        if np.isnan(float_fv):
+            serialized = "NaN"
+        elif np.isinf(float_fv):
+            serialized = "Infinity" if float_fv > 0 else "-Infinity"
+        else:
+            serialized = float_fv
+    elif isinstance(fill_value, numbers.Complex):
+        serialized = [
+            _serialize_fill_value(fill_value.real, dtype),
+            _serialize_fill_value(fill_value.imag, dtype),
+        ]
+    else:
+        serialized = fill_value
+
+    return serialized
+
+
 def _default_fill_value(dtype: np.dtype[Any]) -> Any:
     """
     Get the default fill value for a type.

diff --git a/src/zarr/testing/strategies.py b/src/zarr/testing/strategies.py
@@ -5,7 +5,7 @@
 import hypothesis.extra.numpy as npst
 import hypothesis.strategies as st
 import numpy as np
-from hypothesis import event, given, settings  # noqa: F401
+from hypothesis import event
 from hypothesis.strategies import SearchStrategy
 
 import zarr
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Fix `fill_value` serialization for `NaN` in `ArrayV2Metadata` and add property-based testing of round-trip serialization