Skip to content

Fix fill_value serialization issues #2802

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changes/2802.fix.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix `fill_value` serialization for `NaN` in `ArrayV2Metadata` and add property-based testing of round-trip serialization
2 changes: 1 addition & 1 deletion src/zarr/core/group.py
Original file line number Diff line number Diff line change
Expand Up @@ -3456,7 +3456,7 @@ def _build_metadata_v3(zarr_json: dict[str, JSON]) -> ArrayV3Metadata | GroupMet


def _build_metadata_v2(
zarr_json: dict[str, object], attrs_json: dict[str, JSON]
zarr_json: dict[str, JSON], attrs_json: dict[str, JSON]
) -> ArrayV2Metadata | GroupMetadata:
"""
Convert a dict representation of Zarr V2 metadata into the corresponding metadata class.
Expand Down
112 changes: 82 additions & 30 deletions src/zarr/core/metadata/v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,25 @@

import base64
import warnings
from collections.abc import Iterable
from collections.abc import Iterable, Sequence
from enum import Enum
from functools import cached_property
from typing import TYPE_CHECKING, TypedDict, cast
from typing import TYPE_CHECKING, Any, TypedDict, cast

import numcodecs.abc

from zarr.abc.metadata import Metadata

if TYPE_CHECKING:
from typing import Any, Literal, Self
from typing import Literal, Self

import numpy.typing as npt

from zarr.core.buffer import Buffer, BufferPrototype
from zarr.core.common import ChunkCoords

import json
import numbers
from dataclasses import dataclass, field, fields, replace

import numcodecs
Expand Down Expand Up @@ -146,41 +147,39 @@ def _json_convert(
raise TypeError

zarray_dict = self.to_dict()
zarray_dict["fill_value"] = _serialize_fill_value(self.fill_value, self.dtype)
zattrs_dict = zarray_dict.pop("attributes", {})
json_indent = config.get("json_indent")
return {
ZARRAY_JSON: prototype.buffer.from_bytes(
json.dumps(zarray_dict, default=_json_convert, indent=json_indent).encode()
json.dumps(
zarray_dict, default=_json_convert, indent=json_indent, allow_nan=False
).encode()
),
ZATTRS_JSON: prototype.buffer.from_bytes(
json.dumps(zattrs_dict, indent=json_indent).encode()
json.dumps(zattrs_dict, indent=json_indent, allow_nan=False).encode()
),
}

@classmethod
def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata:
# make a copy to protect the original from modification
# Make a copy to protect the original from modification.
_data = data.copy()
# check that the zarr_format attribute is correct
# Check that the zarr_format attribute is correct.
_ = parse_zarr_format(_data.pop("zarr_format"))
dtype = parse_dtype(_data["dtype"])

if dtype.kind in "SV":
fill_value_encoded = _data.get("fill_value")
if fill_value_encoded is not None:
fill_value = base64.standard_b64decode(fill_value_encoded)
_data["fill_value"] = fill_value

# zarr v2 allowed arbitrary keys here.
# We don't want the ArrayV2Metadata constructor to fail just because someone put an
# extra key in the metadata.
# zarr v2 allowed arbitrary keys in the metadata.
# Filter the keys to only those expected by the constructor.
expected = {x.name for x in fields(cls)}
# https://github.com/zarr-developers/zarr-python/issues/2269
# handle the renames
expected |= {"dtype", "chunks"}

# check if `filters` is an empty sequence; if so use None instead and raise a warning
if _data["filters"] is not None and len(_data["filters"]) == 0:
filters = _data.get("filters")
if (
isinstance(filters, Sequence)
and not isinstance(filters, (str, bytes))
and len(filters) == 0
):
msg = (
"Found an empty list of filters in the array metadata document. "
"This is contrary to the Zarr V2 specification, and will cause an error in the future. "
Expand All @@ -196,13 +195,6 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata:
def to_dict(self) -> dict[str, JSON]:
zarray_dict = super().to_dict()

if self.dtype.kind in "SV" and self.fill_value is not None:
# There's a relationship between self.dtype and self.fill_value
# that mypy isn't aware of. The fact that we have S or V dtype here
# means we should have a bytes-type fill_value.
fill_value = base64.standard_b64encode(cast(bytes, self.fill_value)).decode("ascii")
zarray_dict["fill_value"] = fill_value

_ = zarray_dict.pop("dtype")
dtype_json: JSON
# In the case of zarr v2, the simplest i.e., '|VXX' dtype is represented as a string
Expand Down Expand Up @@ -300,7 +292,26 @@ def parse_metadata(data: ArrayV2Metadata) -> ArrayV2Metadata:
return data


def parse_fill_value(fill_value: object, dtype: np.dtype[Any]) -> Any:
def _parse_structured_fill_value(fill_value: Any, dtype: np.dtype[Any]) -> Any:
"""Handle structured dtype/fill value pairs"""
print("FILL VALUE", fill_value, "DT", dtype)
try:
if isinstance(fill_value, list):
return np.array([tuple(fill_value)], dtype=dtype)[0]
elif isinstance(fill_value, tuple):
return np.array([fill_value], dtype=dtype)[0]
elif isinstance(fill_value, bytes):
return np.frombuffer(fill_value, dtype=dtype)[0]
elif isinstance(fill_value, str):
decoded = base64.standard_b64decode(fill_value)
return np.frombuffer(decoded, dtype=dtype)[0]
else:
return np.array(fill_value, dtype=dtype)[()]
except Exception as e:
raise ValueError(f"Fill_value {fill_value} is not valid for dtype {dtype}.") from e


def parse_fill_value(fill_value: Any, dtype: np.dtype[Any]) -> Any:
"""
Parse a potential fill value into a value that is compatible with the provided dtype.

Expand All @@ -317,13 +328,16 @@ def parse_fill_value(fill_value: object, dtype: np.dtype[Any]) -> Any:
"""

if fill_value is None or dtype.hasobject:
# no fill value
pass
elif dtype.fields is not None:
# the dtype is structured (has multiple fields), so the fill_value might be a
# compound value (e.g., a tuple or dict) that needs field-wise processing.
# We use parse_structured_fill_value to correctly convert each component.
fill_value = _parse_structured_fill_value(fill_value, dtype)
elif not isinstance(fill_value, np.void) and fill_value == 0:
# this should be compatible across numpy versions for any array type, including
# structured arrays
fill_value = np.zeros((), dtype=dtype)[()]

elif dtype.kind == "U":
# special case unicode because of encoding issues on Windows if passed through numpy
# https://github.com/alimanfoo/zarr/pull/172#issuecomment-343782713
Expand All @@ -332,6 +346,11 @@ def parse_fill_value(fill_value: object, dtype: np.dtype[Any]) -> Any:
raise ValueError(
f"fill_value {fill_value!r} is not valid for dtype {dtype}; must be a unicode string"
)
elif dtype.kind in "SV" and isinstance(fill_value, str):
fill_value = base64.standard_b64decode(fill_value)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

out of scope but does anyone know why we base64 encode scalars with dtype S ? S is ascii-encoded strings, which shouldn't need any encoding / decoding.

elif dtype.kind == "c" and isinstance(fill_value, list) and len(fill_value) == 2:
complex_val = complex(float(fill_value[0]), float(fill_value[1]))
fill_value = np.array(complex_val, dtype=dtype)[()]
else:
try:
if isinstance(fill_value, bytes) and dtype.kind == "V":
Expand All @@ -347,6 +366,39 @@ def parse_fill_value(fill_value: object, dtype: np.dtype[Any]) -> Any:
return fill_value


def _serialize_fill_value(fill_value: Any, dtype: np.dtype[Any]) -> JSON:
serialized: JSON

if fill_value is None:
serialized = None
elif dtype.kind in "SV":
# There's a relationship between dtype and fill_value
# that mypy isn't aware of. The fact that we have S or V dtype here
# means we should have a bytes-type fill_value.
serialized = base64.standard_b64encode(cast(bytes, fill_value)).decode("ascii")
elif isinstance(fill_value, np.datetime64):
serialized = np.datetime_as_string(fill_value)
elif isinstance(fill_value, numbers.Integral):
serialized = int(fill_value)
elif isinstance(fill_value, numbers.Real):
float_fv = float(fill_value)
if np.isnan(float_fv):
serialized = "NaN"
elif np.isinf(float_fv):
serialized = "Infinity" if float_fv > 0 else "-Infinity"
else:
serialized = float_fv
elif isinstance(fill_value, numbers.Complex):
serialized = [
_serialize_fill_value(fill_value.real, dtype),
_serialize_fill_value(fill_value.imag, dtype),
]
else:
serialized = fill_value

return serialized


def _default_fill_value(dtype: np.dtype[Any]) -> Any:
"""
Get the default fill value for a type.
Expand Down
2 changes: 1 addition & 1 deletion src/zarr/testing/strategies.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import hypothesis.extra.numpy as npst
import hypothesis.strategies as st
import numpy as np
from hypothesis import event, given, settings # noqa: F401
from hypothesis import event
from hypothesis.strategies import SearchStrategy

import zarr
Expand Down
Loading