From f5e3f789d85507f1bf7e8b4078f07a92fc315051 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 21 Feb 2025 13:43:34 +0100 Subject: [PATCH 001/130] modernize typing --- src/zarr/core/strings.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/zarr/core/strings.py b/src/zarr/core/strings.py index ffca0c3b0c..5bc7ceece5 100644 --- a/src/zarr/core/strings.py +++ b/src/zarr/core/strings.py @@ -2,7 +2,9 @@ different versions of Numpy. """ -from typing import Any, Union, cast +from __future__ import annotations + +from typing import Any, cast from warnings import warn import numpy as np @@ -11,13 +13,13 @@ # when reading data back from Zarr. # Any valid string-like datatype should be fine for *setting* data. -_STRING_DTYPE: Union["np.dtypes.StringDType", "np.dtypes.ObjectDType"] +_STRING_DTYPE: np.dtypes.StringDType | np.dtypes.ObjectDType _NUMPY_SUPPORTS_VLEN_STRING: bool def cast_array( data: np.ndarray[Any, np.dtype[Any]], -) -> np.ndarray[Any, Union["np.dtypes.StringDType", "np.dtypes.ObjectDType"]]: +) -> np.ndarray[Any, np.dtypes.StringDType | np.dtypes.ObjectDType]: raise NotImplementedError @@ -39,14 +41,14 @@ def cast_array( def cast_array( data: np.ndarray[Any, np.dtype[Any]], - ) -> np.ndarray[Any, Union["np.dtypes.StringDType", "np.dtypes.ObjectDType"]]: + ) -> np.ndarray[Any, np.dtypes.StringDType | np.dtypes.ObjectDType]: out = data.astype(_STRING_DTYPE, copy=False) return cast(np.ndarray[Any, np.dtypes.ObjectDType], out) def cast_to_string_dtype( data: np.ndarray[Any, np.dtype[Any]], safe: bool = False -) -> np.ndarray[Any, Union["np.dtypes.StringDType", "np.dtypes.ObjectDType"]]: +) -> np.ndarray[Any, np.dtypes.StringDType | np.dtypes.ObjectDType]: """Take any data and attempt to cast to to our preferred string dtype. 
data : np.ndarray From 3c50f54942e92ae0f13d960a4e476e63ed31aa54 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 24 Feb 2025 14:41:25 +0100 Subject: [PATCH 002/130] lint --- src/zarr/core/common.py | 4 ++-- src/zarr/core/metadata/v3.py | 2 +- src/zarr/core/strings.py | 21 +++++++++++---------- tests/test_strings.py | 20 +++++++++++--------- 4 files changed, 25 insertions(+), 22 deletions(-) diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index ad3316b619..e398eff406 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -19,7 +19,7 @@ import numpy as np from zarr.core.config import config as zarr_config -from zarr.core.strings import _STRING_DTYPE +from zarr.core.strings import _VLEN_STRING_DTYPE if TYPE_CHECKING: from collections.abc import Awaitable, Callable, Iterator @@ -173,7 +173,7 @@ def parse_dtype(dtype: Any, zarr_format: ZarrFormat) -> np.dtype[Any]: # special case as object return np.dtype("object") else: - return _STRING_DTYPE + return _VLEN_STRING_DTYPE return np.dtype(dtype) diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 9154762648..649e79b7ae 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -38,7 +38,7 @@ from zarr.core.config import config from zarr.core.metadata.common import parse_attributes from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING -from zarr.core.strings import _STRING_DTYPE as STRING_NP_DTYPE +from zarr.core.strings import _VLEN_STRING_DTYPE as STRING_NP_DTYPE from zarr.errors import MetadataValidationError, NodeTypeValidationError from zarr.registry import get_codec_class diff --git a/src/zarr/core/strings.py b/src/zarr/core/strings.py index 5bc7ceece5..f14b38840d 100644 --- a/src/zarr/core/strings.py +++ b/src/zarr/core/strings.py @@ -13,42 +13,43 @@ # when reading data back from Zarr. # Any valid string-like datatype should be fine for *setting* data. 
-_STRING_DTYPE: np.dtypes.StringDType | np.dtypes.ObjectDType +VLenStringType = np.dtypes.StringDType | np.dtypes.ObjectDType +_VLEN_STRING_DTYPE: VLenStringType _NUMPY_SUPPORTS_VLEN_STRING: bool def cast_array( data: np.ndarray[Any, np.dtype[Any]], -) -> np.ndarray[Any, np.dtypes.StringDType | np.dtypes.ObjectDType]: +) -> np.ndarray[Any, VLenStringType]: raise NotImplementedError try: # this new vlen string dtype was added in NumPy 2.0 - _STRING_DTYPE = np.dtypes.StringDType() + _VLEN_STRING_DTYPE = np.dtypes.StringDType() _NUMPY_SUPPORTS_VLEN_STRING = True def cast_array( data: np.ndarray[Any, np.dtype[Any]], - ) -> np.ndarray[Any, np.dtypes.StringDType | np.dtypes.ObjectDType]: - out = data.astype(_STRING_DTYPE, copy=False) + ) -> np.ndarray[Any, VLenStringType]: + out = data.astype(_VLEN_STRING_DTYPE, copy=False) return cast(np.ndarray[Any, np.dtypes.StringDType], out) except AttributeError: # if not available, we fall back on an object array of strings, as in Zarr < 3 - _STRING_DTYPE = np.dtypes.ObjectDType() + _VLEN_STRING_DTYPE = np.dtypes.ObjectDType() _NUMPY_SUPPORTS_VLEN_STRING = False def cast_array( data: np.ndarray[Any, np.dtype[Any]], - ) -> np.ndarray[Any, np.dtypes.StringDType | np.dtypes.ObjectDType]: - out = data.astype(_STRING_DTYPE, copy=False) + ) -> np.ndarray[Any, VLenStringType]: + out = data.astype(_VLEN_STRING_DTYPE, copy=False) return cast(np.ndarray[Any, np.dtypes.ObjectDType], out) def cast_to_string_dtype( data: np.ndarray[Any, np.dtype[Any]], safe: bool = False -) -> np.ndarray[Any, np.dtypes.StringDType | np.dtypes.ObjectDType]: +) -> np.ndarray[Any, VLenStringType]: """Take any data and attempt to cast to to our preferred string dtype. 
data : np.ndarray @@ -63,7 +64,7 @@ def cast_to_string_dtype( return cast_array(data) # out = data.astype(STRING_DTYPE, copy=False) # return cast(np.ndarray[Any, np.dtypes.StringDType | np.dtypes.ObjectDType], out) - if _NUMPY_SUPPORTS_VLEN_STRING and np.issubdtype(data.dtype, _STRING_DTYPE): + if _NUMPY_SUPPORTS_VLEN_STRING and np.issubdtype(data.dtype, _VLEN_STRING_DTYPE): # already a valid string variable length string dtype return cast_array(data) if np.issubdtype(data.dtype, np.object_): diff --git a/tests/test_strings.py b/tests/test_strings.py index dca0570a25..963f2e305e 100644 --- a/tests/test_strings.py +++ b/tests/test_strings.py @@ -3,33 +3,35 @@ import numpy as np import pytest -from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING, _STRING_DTYPE, cast_to_string_dtype +from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING, _VLEN_STRING_DTYPE, cast_to_string_dtype def test_string_defaults() -> None: if _NUMPY_SUPPORTS_VLEN_STRING: - assert _STRING_DTYPE == np.dtypes.StringDType() + assert _VLEN_STRING_DTYPE == np.dtypes.StringDType() else: - assert _STRING_DTYPE == np.dtypes.ObjectDType() + assert _VLEN_STRING_DTYPE == np.dtypes.ObjectDType() def test_cast_to_string_dtype() -> None: d1 = np.array(["a", "b", "c"]) assert d1.dtype == np.dtype(" Date: Wed, 26 Feb 2025 09:35:37 +0100 Subject: [PATCH 003/130] new dtypes --- src/zarr/core/_info.py | 6 +- src/zarr/core/array.py | 20 +- src/zarr/core/dtype/__init__.py | 3 + src/zarr/core/dtype/core.py | 196 +++++++++++++++++ src/zarr/core/metadata/dtype.py | 372 ++++++++++++++++++++++++++++++++ src/zarr/core/metadata/v3.py | 5 +- src/zarr/core/strings.py | 4 +- src/zarr/registry.py | 106 ++++++++- tests/test_array.py | 2 +- tests/test_codecs/test_vlen.py | 19 +- tests/test_metadata/test_v3.py | 1 - 11 files changed, 703 insertions(+), 31 deletions(-) create mode 100644 src/zarr/core/dtype/__init__.py create mode 100644 src/zarr/core/dtype/core.py create mode 100644 src/zarr/core/metadata/dtype.py diff 
--git a/src/zarr/core/_info.py b/src/zarr/core/_info.py index 845552c8be..14eb98d6e4 100644 --- a/src/zarr/core/_info.py +++ b/src/zarr/core/_info.py @@ -7,7 +7,9 @@ from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec from zarr.core.common import ZarrFormat -from zarr.core.metadata.v3 import DataType +from zarr.core.metadata.dtype import BaseDataType + +# from zarr.core.metadata.v3 import DataType @dataclasses.dataclass(kw_only=True) @@ -78,7 +80,7 @@ class ArrayInfo: _type: Literal["Array"] = "Array" _zarr_format: ZarrFormat - _data_type: np.dtype[Any] | DataType + _data_type: np.dtype[Any] | BaseDataType _shape: tuple[int, ...] _shard_shape: tuple[int, ...] | None = None _chunk_shape: tuple[int, ...] | None = None diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 9c2f8a7260..2bb809037d 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -98,19 +98,21 @@ ArrayV3MetadataDict, T_ArrayMetadata, ) +from zarr.core.metadata.dtype import BaseDataType from zarr.core.metadata.v2 import ( _default_compressor, _default_filters, parse_compressor, parse_filters, ) -from zarr.core.metadata.v3 import DataType, parse_node_type_array +from zarr.core.metadata.v3 import parse_node_type_array from zarr.core.sync import sync from zarr.errors import MetadataValidationError from zarr.registry import ( _parse_array_array_codec, _parse_array_bytes_codec, _parse_bytes_bytes_codec, + get_data_type_from_numpy, get_pipeline_class, ) from zarr.storage._common import StorePath, ensure_no_existing_node, make_store_path @@ -1682,7 +1684,7 @@ async def info_complete(self) -> Any: def _info( self, count_chunks_initialized: int | None = None, count_bytes_stored: int | None = None ) -> Any: - _data_type: np.dtype[Any] | DataType + _data_type: np.dtype[Any] | BaseDataType if isinstance(self.metadata, ArrayV2Metadata): _data_type = self.metadata.dtype else: @@ -4203,17 +4205,11 @@ def _get_default_chunk_encoding_v3( """ Get the default 
ArrayArrayCodecs, ArrayBytesCodec, and BytesBytesCodec for a given dtype. """ - dtype = DataType.from_numpy(np_dtype) - if dtype == DataType.string: - dtype_key = "string" - elif dtype == DataType.bytes: - dtype_key = "bytes" - else: - dtype_key = "numeric" + dtype = get_data_type_from_numpy(np_dtype) - default_filters = zarr_config.get("array.v3_default_filters").get(dtype_key) - default_serializer = zarr_config.get("array.v3_default_serializer").get(dtype_key) - default_compressors = zarr_config.get("array.v3_default_compressors").get(dtype_key) + default_filters = zarr_config.get("array.v3_default_filters").get(dtype.type) + default_serializer = zarr_config.get("array.v3_default_serializer").get(dtype.type) + default_compressors = zarr_config.get("array.v3_default_compressors").get(dtype.type) filters = tuple(_parse_array_array_codec(codec_dict) for codec_dict in default_filters) serializer = _parse_array_bytes_codec(default_serializer) diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py new file mode 100644 index 0000000000..58b884ff23 --- /dev/null +++ b/src/zarr/core/dtype/__init__.py @@ -0,0 +1,3 @@ +from zarr.core.dtype.core import ZarrDType + +__all__ = ["ZarrDType"] diff --git a/src/zarr/core/dtype/core.py b/src/zarr/core/dtype/core.py new file mode 100644 index 0000000000..c6460706aa --- /dev/null +++ b/src/zarr/core/dtype/core.py @@ -0,0 +1,196 @@ +""" +# Overview + +This module provides a proof-of-concept standalone interface for managing dtypes in the zarr-python codebase. + +The `ZarrDType` class introduced in this module effectively acts as a replacement for `np.dtype` throughout the +zarr-python codebase. It attempts to encapsulate all relevant runtime information necessary for working with +dtypes in the context of the Zarr V3 specification (e.g. is this a core dtype or not, how many bytes and what +endianness is the dtype etc). 
By providing this abstraction, the module aims to: + +- Simplify dtype management within zarr-python +- Support runtime flexibility and custom extensions +- Remove unnecessary dependencies on the numpy API + +## Extensibility + +The module attempts to support user-driven extensions, allowing developers to introduce custom dtypes +without requiring immediate changes to zarr-python. Extensions can leverage the current entrypoint mechanism, +enabling integration of experimental features. Over time, widely adopted extensions may be formalized through +inclusion in zarr-python or standardized via a Zarr Enhancement Proposal (ZEP), but this is not essential. + +## Examples + +### Core `dtype` Registration + +The following example demonstrates how to register a built-in `dtype` in the core codebase: + +```python +from zarr.core.dtype import ZarrDType +from zarr.registry import register_v3dtype + +class Float16(ZarrDType): + zarr_spec_format = "3" + experimental = False + endianness = "little" + byte_count = 2 + to_numpy = np.dtype('float16') + +register_v3dtype(Float16) +``` + +### Entrypoint Extension + +The following example demonstrates how users can register a new `bfloat16` dtype for Zarr. +This approach adheres to the existing Zarr entrypoint pattern as much as possible, ensuring +consistency with other extensions. 
The code below would typically be part of a Python package +that specifies the entrypoints for the extension: + +```python +import ml_dtypes +from zarr.core.dtype import ZarrDType # User inherits from ZarrDType when creating their dtype + +class Bfloat16(ZarrDType): + zarr_spec_format = "3" + experimental = True + endianness = "little" + byte_count = 2 + to_numpy = np.dtype('bfloat16') # Enabled by importing ml_dtypes + configuration_v3 = { + "version": "example_value", + "author": "example_value", + "ml_dtypes_version": "example_value" + } +``` + +### dtype lookup + +The following examples demonstrate how to perform a lookup for the relevant ZarrDType, given +a string that matches the dtype Zarr specification ID, or a numpy dtype object: + +``` +from zarr.registry import get_v3dtype_class, get_v3dtype_class_from_numpy + +get_v3dtype_class('complex64') # returns little-endian Complex64 ZarrDType +get_v3dtype_class('not_registered_dtype') # ValueError + +get_v3dtype_class_from_numpy('>i2') # returns big-endian Int16 ZarrDType +get_v3dtype_class_from_numpy(np.dtype('float32')) # returns little-endian Float32 ZarrDType +get_v3dtype_class_from_numpy('i10') # ValueError +``` + +### String dtypes + +The following indicates one possibility for supporting variable-length strings. It is via the +entrypoint mechanism as in a previous example. 
The Apache Arrow specification does not currently +include a dtype for fixed-length strings (only for fixed-length bytes) and so I am using string +here to implicitly refer to a variable-length string data (there may be some subtleties with codecs +that means this needs to be refined further): + +```python +import numpy as np +from zarr.core.dtype import ZarrDType # User inherits from ZarrDType when creating their dtype + +try: + to_numpy = np.dtypes.StringDType() +except AttributeError: + to_numpy = np.dtypes.ObjectDType() + +class String(ZarrDType): + zarr_spec_format = "3" + experimental = True + endianness = 'little' + byte_count = None # None is defined to mean variable + to_numpy = to_numpy +``` + +### int4 dtype + +There is currently considerable interest in the AI community in 'quantising' models - storing +models at reduced precision, while minimising loss of information content. There are a number +of sub-byte dtypes that the community are using e.g. int4. Unfortunately numpy does not +currently have support for handling such sub-byte dtypes in an easy way. 
However, they can +still be held in a numpy array and then passed (in a zero-copy way) to something like pytorch +which can handle appropriately: + +```python +import numpy as np +from zarr.core.dtype import ZarrDType # User inherits from ZarrDType when creating their dtype + +class Int4(ZarrDType): + zarr_spec_format = "3" + experimental = True + endianness = 'little' + byte_count = 1 # this is ugly, but I could change this from byte_count to bit_count if there was consensus + to_numpy = np.dtype('B') # could also be np.dtype('V1'), but this would prevent bit-twiddling + configuration_v3 = { + "version": "example_value", + "author": "example_value", + } +``` +""" + +from __future__ import annotations + +from typing import Any, Literal + +import numpy as np + + +class FrozenClassVariables(type): + def __setattr__(cls, attr: str, value: object) -> None: + if hasattr(cls, attr): + raise ValueError(f"Attribute {attr} on ZarrDType class can not be changed once set.") + else: + raise AttributeError(f"'{cls}' object has no attribute '{attr}'") + + +class ZarrDType(metaclass=FrozenClassVariables): + zarr_spec_format: Literal["2", "3"] # the version of the zarr spec used + experimental: bool # is this in the core spec or not + endianness: Literal[ + "big", "little", None + ] # None indicates not defined i.e. single byte or byte strings + byte_count: int | None # None indicates variable count + to_numpy: np.dtype[Any] # may involve installing a a numpy extension e.g. 
ml_dtypes; + + configuration_v3: dict | None # TODO: understand better how this is recommended by the spec + + _zarr_spec_identifier: str # implementation detail used to map to core spec + + def __init_subclass__( # enforces all required fields are set and basic sanity checks + cls, + **kwargs, + ) -> None: + required_attrs = [ + "zarr_spec_format", + "experimental", + "endianness", + "byte_count", + "to_numpy", + ] + for attr in required_attrs: + if not hasattr(cls, attr): + raise ValueError(f"{attr} is a required attribute for a Zarr dtype.") + + if not hasattr(cls, "configuration_v3"): + cls.configuration_v3 = None + + cls._zarr_spec_identifier = ( + "big_" + cls.__qualname__.lower() + if cls.endianness == "big" + else cls.__qualname__.lower() + ) # how this dtype is identified in core spec; convention is prefix with big_ for big-endian + + cls._validate() # sanity check on basic requirements + + super().__init_subclass__(**kwargs) + + # TODO: add further checks + @classmethod + def _validate(cls): + if cls.byte_count is not None and cls.byte_count <= 0: + raise ValueError("byte_count must be a positive integer.") + + if cls.byte_count == 1 and cls.endianness is not None: + raise ValueError("Endianness must be None for single-byte types.") diff --git a/src/zarr/core/metadata/dtype.py b/src/zarr/core/metadata/dtype.py new file mode 100644 index 0000000000..ab101f2fad --- /dev/null +++ b/src/zarr/core/metadata/dtype.py @@ -0,0 +1,372 @@ +from abc import ABC +from dataclasses import dataclass, field +from typing import Any, ClassVar, Literal, Self, get_args + +import numpy as np + +from zarr.abc.metadata import Metadata +from zarr.core.common import JSON +from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING +from zarr.registry import register_data_type + +Endianness = Literal["little", "big", "native"] +DataTypeFlavor = Literal["boolean", "numeric", "string", "bytes"] + + +def endianness_to_numpy_str(endianness: Endianness | None) -> Literal[">", "<", "=", 
"|"]: + match endianness: + case "little": + return "<" + case "big": + return ">" + case "native": + return "=" + case None: + return "|" + raise ValueError( + f"Invalid endianness: {endianness}. Expected one of {get_args(endianness)} or None" + ) + + +class BaseDataType(ABC, Metadata): + name: ClassVar[str] + numpy_character_code: ClassVar[str] + item_size: ClassVar[int | None] + type: ClassVar[DataTypeFlavor] + capacity: int + + def __init_subclass__(cls, **kwargs: object) -> None: + required_attrs = [ + "name", + "numpy_character_code", + "item_size", + "type", + ] + for attr in required_attrs: + if not hasattr(cls, attr): + raise ValueError(f"{attr} is a required attribute for a Zarr dtype.") + + return super().__init_subclass__(**kwargs) + + def to_dict(self) -> dict[str, JSON]: + return {"name": self.name} + + def to_numpy(self: Self, *, endianness: Endianness | None = None) -> np.dtype[Any]: + endian_str = endianness_to_numpy_str(endianness) + return np.dtype(endian_str + self.numpy_character_code) + + +@dataclass(frozen=True, kw_only=True) +class Bool(BaseDataType): + name = "bool" + item_size = 1 + type = "boolean" + numpy_character_code = "?" 
+ capacity: int = field(default=1, init=False) + + def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.BoolDType: + return super().to_numpy(endianness=endianness) + + +register_data_type(Bool) + + +@dataclass(frozen=True, kw_only=True) +class Int8(BaseDataType): + name = "int8" + item_size = 1 + type = "numeric" + numpy_character_code = "b" + capacity: int = field(default=1, init=False) + + def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int8DType: + return super().to_numpy(endianness=endianness) + + +register_data_type(Int8) + + +@dataclass(frozen=True, kw_only=True) +class UInt8(BaseDataType): + name = "uint8" + item_size = 2 + type = "numeric" + numpy_character_code = "B" + capacity: int = field(default=1, init=False) + + def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt8DType: + return super().to_numpy(endianness=endianness) + + +register_data_type(UInt8) + + +@dataclass(frozen=True, kw_only=True) +class Int16(BaseDataType): + name = "int16" + item_size = 2 + type = "numeric" + numpy_character_code = "h" + capacity: int = field(default=1, init=False) + + def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int16DType: + return super().to_numpy(endianness=endianness) + + +register_data_type(Int16) + + +@dataclass(frozen=True, kw_only=True) +class UInt16(BaseDataType): + name = "uint16" + item_size = 2 + type = "numeric" + numpy_character_code = "H" + capacity: int = field(default=1, init=False) + + def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt16DType: + return super().to_numpy(endianness=endianness) + + +register_data_type(UInt16) + + +@dataclass(frozen=True, kw_only=True) +class Int32(BaseDataType): + name = "int32" + item_size = 4 + type = "numeric" + numpy_character_code = "i" + capacity: int = field(default=1, init=False) + + def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int32DType: + return 
super().to_numpy(endianness=endianness) + + +register_data_type(Int32) + + +@dataclass(frozen=True, kw_only=True) +class UInt32(BaseDataType): + name = "uint32" + item_size = 4 + type = "numeric" + numpy_character_code = "I" + capacity: int = field(default=1, init=False) + + def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt32DType: + return super().to_numpy(endianness=endianness) + + +register_data_type(UInt32) + + +@dataclass(frozen=True, kw_only=True) +class Int64(BaseDataType): + name = "int64" + item_size = 8 + type = "numeric" + numpy_character_code = "l" + capacity: int = field(default=1, init=False) + + def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int64DType: + return super().to_numpy(endianness=endianness) + + +register_data_type(Int64) + + +@dataclass(frozen=True, kw_only=True) +class UInt64(BaseDataType): + name = "uint64" + item_size = 8 + type = "numeric" + numpy_character_code = "L" + capacity: int = field(default=1, init=False) + + def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt64DType: + return super().to_numpy(endianness=endianness) + + +register_data_type(UInt64) + + +@dataclass(frozen=True, kw_only=True) +class Float16(BaseDataType): + name = "float16" + item_size = 2 + type = "numeric" + numpy_character_code = "e" + capacity: int = field(default=1, init=False) + + def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Float16DType: + return super().to_numpy(endianness=endianness) + + +register_data_type(Float16) + + +@dataclass(frozen=True, kw_only=True) +class Float32(BaseDataType): + name = "float32" + item_size = 4 + type = "numeric" + numpy_character_code = "f" + capacity: int = field(default=1, init=False) + + def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Float32DType: + return super().to_numpy(endianness=endianness) + + +register_data_type(Float32) + + +@dataclass(frozen=True, kw_only=True) +class 
@dataclass(frozen=True, kw_only=True)
class Float64(BaseDataType):
    name = "float64"
    item_size = 8
    type = "numeric"
    numpy_character_code = "d"
    capacity: int = field(default=1, init=False)

    def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Float64DType:
        return super().to_numpy(endianness=endianness)


register_data_type(Float64)


@dataclass(frozen=True, kw_only=True)
class Complex64(BaseDataType):
    name = "complex64"
    item_size = 16
    type = "numeric"
    numpy_character_code = "F"
    capacity: int = field(default=1, init=False)

    def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Complex64DType:
        return super().to_numpy(endianness=endianness)


register_data_type(Complex64)


@dataclass(frozen=True, kw_only=True)
class Complex128(BaseDataType):
    # Fix: was "complex64" (copy-paste from Complex64), which would make this class
    # shadow / collide with Complex64 in the name-keyed data type registry.
    name = "complex128"
    item_size = 32
    type = "numeric"
    numpy_character_code = "D"
    capacity: int = field(default=1, init=False)

    def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Complex128DType:
        return super().to_numpy(endianness=endianness)


register_data_type(Complex128)


@dataclass(frozen=True, kw_only=True)
class StaticByteString(BaseDataType):
    """Fixed-capacity byte string (numpy 'S' kind); capacity is bytes per element."""

    name = "numpy/static_byte_string"
    type = "string"
    numpy_character_code = "S"
    item_size = 1
    capacity: int

    def to_dict(self) -> dict[str, JSON]:
        return {"name": self.name, "configuration": {"capacity": self.capacity}}

    def to_numpy(self, endianness: Endianness | None = "native") -> np.dtype[np.bytes_]:
        endianness_code = endianness_to_numpy_str(endianness)
        return np.dtype(endianness_code + self.numpy_character_code + str(self.capacity))


register_data_type(StaticByteString)

if _NUMPY_SUPPORTS_VLEN_STRING:
    # NumPy >= 2.0: use the native variable-length string dtype ("T").

    @dataclass(frozen=True, kw_only=True)
    class VlenString(BaseDataType):
        name = "numpy/vlen_string"
        type = "string"
        numpy_character_code = "T"
        item_size = None
        capacity: int

        def to_dict(self) -> dict[str, JSON]:
            return {"name": self.name, "configuration": {"capacity": self.capacity}}

        def to_numpy(
            self, endianness: Endianness | None = "native"
        ) -> np.dtype[np.dtypes.StringDType]:
            endianness_code = endianness_to_numpy_str(endianness)
            return np.dtype(endianness_code + self.numpy_character_code)

else:
    # NumPy < 2.0: fall back on an object array of strings, as in Zarr < 3.

    @dataclass(frozen=True, kw_only=True)
    class VlenString(BaseDataType):
        name = "numpy/vlen_string"
        type = "string"
        numpy_character_code = "O"
        item_size = None
        capacity: int

        def to_dict(self) -> dict[str, JSON]:
            return {"name": self.name, "configuration": {"capacity": self.capacity}}

        def to_numpy(
            self, endianness: Endianness | None = "native"
        ) -> np.dtype[np.dtypes.ObjectDType]:
            endianness_code = endianness_to_numpy_str(endianness)
            return np.dtype(endianness_code + self.numpy_character_code)


register_data_type(VlenString)


@dataclass(frozen=True, kw_only=True)
class StaticUnicodeString(BaseDataType):
    """Fixed-capacity unicode string (numpy 'U' kind); capacity is code points per element."""

    name = "numpy/static_unicode_string"
    type = "string"
    numpy_character_code = "U"
    item_size = 4
    capacity: int

    def to_dict(self) -> dict[str, JSON]:
        return {"name": self.name, "configuration": {"capacity": self.capacity}}

    def to_numpy(self, endianness: Endianness | None = "native") -> np.dtype[np.str_]:
        endianness_code = endianness_to_numpy_str(endianness)
        return np.dtype(endianness_code + self.numpy_character_code + str(self.capacity))


register_data_type(StaticUnicodeString)


@dataclass(frozen=True, kw_only=True)
class StaticRawBytes(BaseDataType):
    """Fixed-size raw bytes (numpy void); spec name is r<nbits>, e.g. r16 for 2 bytes."""

    name = "r*"
    type = "bytes"
    numpy_character_code = "V"
    item_size = 1
    capacity: int

    def to_dict(self) -> dict[str, JSON]:
        return {"name": f"r{self.capacity * 8}"}

    def to_numpy(self, endianness: Endianness | None = "native") -> np.dtype[np.void]:
        endianness_code = endianness_to_numpy_str(endianness)
        return np.dtype(endianness_code + self.numpy_character_code + str(self.capacity))


# Moved next to the class it registers, consistent with every other dtype above.
register_data_type(StaticRawBytes)


# Fix: the annotation was ``npt.DtypeLike`` -- ``npt`` is never imported in this
# module and the module has no ``from __future__ import annotations``, so the
# annotation raised NameError at definition time (and the correct numpy name is
# ``DTypeLike``). A string annotation keeps it lazy.
def parse_dtype(dtype: "npt.DTypeLike | BaseDataType") -> BaseDataType:
    """Pass through *dtype* if it is already a BaseDataType, else look it up by numpy dtype.

    NOTE(review): get_data_type_from_numpy returns the dtype *class*, not an
    instance, so the declared return type looks inconsistent -- confirm intent.
    """
    # Deferred import to avoid a circular dependency with zarr.registry.
    from zarr.registry import get_data_type_from_numpy

    if isinstance(dtype, BaseDataType):
        return dtype
    return get_data_type_from_numpy(dtype)
zarr.registry import get_data_type_from_numpy + + if isinstance(dtype, BaseDataType): + return dtype + return get_data_type_from_numpy(dtype) + + +register_data_type(StaticRawBytes) diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 649e79b7ae..86503e64cd 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -5,6 +5,7 @@ from zarr.abc.metadata import Metadata from zarr.core.buffer.core import default_buffer_prototype +from zarr.core.metadata.dtype import BaseDataType if TYPE_CHECKING: from collections.abc import Callable @@ -251,7 +252,7 @@ def __init__( self, *, shape: Iterable[int], - data_type: npt.DTypeLike | DataType, + data_type: npt.DTypeLike | BaseDataType, chunk_grid: dict[str, JSON] | ChunkGrid, chunk_key_encoding: ChunkKeyEncodingLike, fill_value: Any, @@ -595,7 +596,7 @@ def default_fill_value(dtype: DataType) -> str | bytes | np.generic: _bool = bool -class DataType(Enum): +class DataTypex(Enum): bool = "bool" int8 = "int8" int16 = "int16" diff --git a/src/zarr/core/strings.py b/src/zarr/core/strings.py index f14b38840d..15c30b6f9b 100644 --- a/src/zarr/core/strings.py +++ b/src/zarr/core/strings.py @@ -4,7 +4,7 @@ from __future__ import annotations -from typing import Any, cast +from typing import Any, Union, cast from warnings import warn import numpy as np @@ -13,7 +13,7 @@ # when reading data back from Zarr. # Any valid string-like datatype should be fine for *setting* data. 
-VLenStringType = np.dtypes.StringDType | np.dtypes.ObjectDType +VLenStringType = Union["np.dtypes.StringDType", "np.dtypes.ObjectDType"] _VLEN_STRING_DTYPE: VLenStringType _NUMPY_SUPPORTS_VLEN_STRING: bool diff --git a/src/zarr/registry.py b/src/zarr/registry.py index 704db3f704..480d75d49a 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -5,11 +5,15 @@ from importlib.metadata import entry_points as get_entry_points from typing import TYPE_CHECKING, Any, Generic, TypeVar +import numpy as np + from zarr.core.config import BadConfigError, config if TYPE_CHECKING: from importlib.metadata import EntryPoint + import numpy.typing as npt + from zarr.abc.codec import ( ArrayArrayCodec, ArrayBytesCodec, @@ -19,6 +23,8 @@ ) from zarr.core.buffer import Buffer, NDBuffer from zarr.core.common import JSON + from zarr.core.dtype import ZarrDType + from zarr.core.metadata.dtype import BaseDataType __all__ = [ "Registry", @@ -26,10 +32,14 @@ "get_codec_class", "get_ndbuffer_class", "get_pipeline_class", + "get_v2dtype_class", + "get_v3dtype_class", "register_buffer", "register_codec", "register_ndbuffer", "register_pipeline", + "register_v2dtype", + "register_v3dtype", ] T = TypeVar("T") @@ -43,6 +53,7 @@ def __init__(self) -> None: def lazy_load(self) -> None: for e in self.lazy_load_list: self.register(e.load()) + self.lazy_load_list.clear() def register(self, cls: type[T]) -> None: @@ -53,17 +64,22 @@ def register(self, cls: type[T]) -> None: __pipeline_registry: Registry[CodecPipeline] = Registry() __buffer_registry: Registry[Buffer] = Registry() __ndbuffer_registry: Registry[NDBuffer] = Registry() +__data_type_registry: Registry[BaseDataType] = Registry() +__v3_dtype_registry: Registry[ZarrDType] = Registry() +__v2_dtype_registry: Registry[ZarrDType] = Registry() """ The registry module is responsible for managing implementations of codecs, pipelines, buffers and ndbuffers and collecting them from entrypoints. 
The implementation used is determined by the config. + +The registry module is also responsible for managing dtypes. """ def _collect_entrypoints() -> list[Registry[Any]]: """ - Collects codecs, pipelines, buffers and ndbuffers from entrypoints. + Collects codecs, pipelines, dtypes, buffers and ndbuffers from entrypoints. Entry points can either be single items or groups of items. Allowed syntax for entry_points.txt is e.g. @@ -86,6 +102,14 @@ def _collect_entrypoints() -> list[Registry[Any]]: __buffer_registry.lazy_load_list.extend(entry_points.select(group="zarr", name="buffer")) __ndbuffer_registry.lazy_load_list.extend(entry_points.select(group="zarr.ndbuffer")) __ndbuffer_registry.lazy_load_list.extend(entry_points.select(group="zarr", name="ndbuffer")) + + __data_type_registry.lazy_load_list.extend(entry_points.select(group="zarr.data_type")) + __data_type_registry.lazy_load_list.extend(entry_points.select(group="zarr", name="data_type")) + + __v3_dtype_registry.lazy_load_list.extend(entry_points.select(group="zarr.v3dtype")) + __v3_dtype_registry.lazy_load_list.extend(entry_points.select(group="zarr", name="v3dtype")) + __v2_dtype_registry.lazy_load_list.extend(entry_points.select(group="zarr.v2dtype")) + __v2_dtype_registry.lazy_load_list.extend(entry_points.select(group="zarr", name="v2dtype")) __pipeline_registry.lazy_load_list.extend(entry_points.select(group="zarr.codec_pipeline")) __pipeline_registry.lazy_load_list.extend( entry_points.select(group="zarr", name="codec_pipeline") @@ -131,6 +155,18 @@ def register_buffer(cls: type[Buffer]) -> None: __buffer_registry.register(cls) +def register_data_type(cls: type[BaseDataType]) -> None: + __data_type_registry.register(cls) + + +def register_v3dtype(cls: type[ZarrDType]) -> None: + __v3_dtype_registry.register(cls) + + +def register_v2dtype(cls: type[ZarrDType]) -> None: + __v2_dtype_registry.register(cls) + + def get_codec_class(key: str, reload_config: bool = False) -> type[Codec]: if reload_config: 
_reload_config() @@ -148,7 +184,8 @@ def get_codec_class(key: str, reload_config: bool = False) -> type[Codec]: if len(codec_classes) == 1: return next(iter(codec_classes.values())) warnings.warn( - f"Codec '{key}' not configured in config. Selecting any implementation.", stacklevel=2 + f"Codec '{key}' not configured in config. Selecting any implementation.", + stacklevel=2, ) return list(codec_classes.values())[-1] selected_codec_cls = codec_classes[config_entry] @@ -266,4 +303,69 @@ def get_ndbuffer_class(reload_config: bool = False) -> type[NDBuffer]: ) +def get_data_type(dtype: str) -> type[BaseDataType]: + __data_type_registry.lazy_load() + maybe_dtype_cls = __data_type_registry.get(dtype) + if maybe_dtype_cls is None: + raise ValueError(f"No data type class matching name {dtype}") + return maybe_dtype_cls + + +def get_data_type_from_numpy(dtype: npt.DTypeLike) -> type[BaseDataType]: + np_dtype = np.dtype(dtype) + __data_type_registry.lazy_load() + for val in __data_type_registry.values(): + if val.numpy_character_code == np_dtype.char: + return val + raise ValueError( + f"numpy dtype '{dtype}' does not have a corresponding Zarr dtype in: {list(__data_type_registry)}." + ) + + +# TODO: merge the get_vXdtype_class_ functions +# these can be used instead of the various parse_X functions (hopefully) +def get_v3dtype_class(dtype: str) -> type[ZarrDType]: + __v3_dtype_registry.lazy_load() + v3dtype_class = __v3_dtype_registry.get(dtype) + if v3dtype_class: + return v3dtype_class + raise ValueError( + f"ZarrDType class '{dtype}' not found in registered buffers: {list(__v3_dtype_registry)}." + ) + + +def get_v3dtype_class_from_numpy(dtype: npt.DTypeLike) -> type[ZarrDType]: + __v3_dtype_registry.lazy_load() + + dtype = np.dtype(dtype) + for val in __v3_dtype_registry.values(): + if dtype == val.to_numpy: + return val + raise ValueError( + f"numpy dtype '{dtype}' does not have a corresponding Zarr dtype in: {list(__v3_dtype_registry)}." 
+ ) + + +def get_v2dtype_class(dtype: str) -> type[ZarrDType]: + __v2_dtype_registry.lazy_load() + v2dtype_class = __v2_dtype_registry.get(dtype) + if v2dtype_class: + return v2dtype_class + raise ValueError( + f"ZarrDType class '{dtype}' not found in registered buffers: {list(__v2_dtype_registry)}." + ) + + +def get_v2dtype_class_from_numpy(dtype: npt.DTypeLike) -> type[ZarrDType]: + __v2_dtype_registry.lazy_load() + + dtype = np.dtype(dtype) + for val in __v2_dtype_registry.values(): + if dtype == val.to_numpy: + return val + raise ValueError( + f"numpy dtype '{dtype}' does not have a corresponding Zarr dtype in: {list(__v2_dtype_registry)}." + ) + + _collect_entrypoints() diff --git a/tests/test_array.py b/tests/test_array.py index efcf8a6bf9..72c1bbf1b7 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1008,7 +1008,7 @@ async def test_no_filters_compressors( assert arr.serializer == BytesCodec() @staticmethod - @pytest.mark.parametrize("dtype", ["uint8", "float32", "str"]) + @pytest.mark.parametrize("dtype", ["uint8", "float32", "str", "U3", "S4", "V1"]) @pytest.mark.parametrize( "compressors", [ diff --git a/tests/test_codecs/test_vlen.py b/tests/test_codecs/test_vlen.py index f5599f2ac0..8aeea834ce 100644 --- a/tests/test_codecs/test_vlen.py +++ b/tests/test_codecs/test_vlen.py @@ -8,17 +8,18 @@ from zarr.abc.codec import Codec from zarr.abc.store import Store from zarr.codecs import ZstdCodec -from zarr.core.metadata.v3 import ArrayV3Metadata, DataType +from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING +from zarr.registry import get_data_type_from_numpy from zarr.storage import StorePath -numpy_str_dtypes: list[type | str | None] = [None, str, "str", np.dtypes.StrDType] -expected_zarr_string_dtype: np.dtype[Any] +numpy_str_dtypes: list[type | str | None] = [None, str, "str", np.dtypes.StrDType, "S", "U"] +expected_array_string_dtype: np.dtype[Any] if _NUMPY_SUPPORTS_VLEN_STRING: 
numpy_str_dtypes.append(np.dtypes.StringDType) - expected_zarr_string_dtype = np.dtypes.StringDType() + expected_array_string_dtype = np.dtypes.StringDType() else: - expected_zarr_string_dtype = np.dtype("O") + expected_array_string_dtype = np.dtype("O") @pytest.mark.parametrize("store", ["memory", "local"], indirect=["store"]) @@ -49,15 +50,15 @@ def test_vlen_string( a[:, :] = data assert np.array_equal(data, a[:, :]) - assert a.metadata.data_type == DataType.string - assert a.dtype == expected_zarr_string_dtype + assert a.metadata.data_type == get_data_type_from_numpy(dtype) + assert a.dtype == expected_array_string_dtype # test round trip b = Array.open(sp) assert isinstance(b.metadata, ArrayV3Metadata) # needed for mypy assert np.array_equal(data, b[:, :]) - assert b.metadata.data_type == DataType.string - assert a.dtype == expected_zarr_string_dtype + assert b.metadata.data_type == get_data_type_from_numpy(dtype) + assert a.dtype == expected_array_string_dtype @pytest.mark.parametrize("store", ["memory", "local"], indirect=["store"]) diff --git a/tests/test_metadata/test_v3.py b/tests/test_metadata/test_v3.py index a47cbf43bb..4f6b2a5de6 100644 --- a/tests/test_metadata/test_v3.py +++ b/tests/test_metadata/test_v3.py @@ -14,7 +14,6 @@ from zarr.core.group import GroupMetadata, parse_node_type from zarr.core.metadata.v3 import ( ArrayV3Metadata, - DataType, default_fill_value, parse_dimension_names, parse_fill_value, From 5000dcb616aabda90a91e02a8d27bc02ce54f63d Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 26 Feb 2025 14:56:13 +0100 Subject: [PATCH 004/130] rename base dtype, change type to kind --- src/zarr/core/_info.py | 4 +- src/zarr/core/array.py | 12 +- src/zarr/core/common.py | 9 +- src/zarr/core/metadata/dtype.py | 192 +++++++++++++++++++------------- src/zarr/core/metadata/v3.py | 8 +- src/zarr/registry.py | 10 +- 6 files changed, 130 insertions(+), 105 deletions(-) diff --git a/src/zarr/core/_info.py b/src/zarr/core/_info.py index 
14eb98d6e4..d2b23d8b5f 100644 --- a/src/zarr/core/_info.py +++ b/src/zarr/core/_info.py @@ -7,7 +7,7 @@ from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec from zarr.core.common import ZarrFormat -from zarr.core.metadata.dtype import BaseDataType +from zarr.core.metadata.dtype import DtypeBase # from zarr.core.metadata.v3 import DataType @@ -80,7 +80,7 @@ class ArrayInfo: _type: Literal["Array"] = "Array" _zarr_format: ZarrFormat - _data_type: np.dtype[Any] | BaseDataType + _data_type: np.dtype[Any] | DtypeBase _shape: tuple[int, ...] _shard_shape: tuple[int, ...] | None = None _chunk_shape: tuple[int, ...] | None = None diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 2bb809037d..2e15db3790 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -98,7 +98,7 @@ ArrayV3MetadataDict, T_ArrayMetadata, ) -from zarr.core.metadata.dtype import BaseDataType +from zarr.core.metadata.dtype import DtypeBase from zarr.core.metadata.v2 import ( _default_compressor, _default_filters, @@ -679,7 +679,7 @@ def _create_metadata_v3( """ shape = parse_shapelike(shape) - codecs = list(codecs) if codecs is not None else _get_default_codecs(np.dtype(dtype)) + codecs = list(codecs) if codecs is not None else _get_default_codecs(dtype) chunk_key_encoding_parsed: ChunkKeyEncodingLike if chunk_key_encoding is None: chunk_key_encoding_parsed = {"name": "default", "separator": "/"} @@ -1684,7 +1684,7 @@ async def info_complete(self) -> Any: def _info( self, count_chunks_initialized: int | None = None, count_bytes_stored: int | None = None ) -> Any: - _data_type: np.dtype[Any] | BaseDataType + _data_type: np.dtype[Any] | DtypeBase if isinstance(self.metadata, ArrayV2Metadata): _data_type = self.metadata.dtype else: @@ -4207,9 +4207,9 @@ def _get_default_chunk_encoding_v3( """ dtype = get_data_type_from_numpy(np_dtype) - default_filters = zarr_config.get("array.v3_default_filters").get(dtype.type) - default_serializer = 
zarr_config.get("array.v3_default_serializer").get(dtype.type) - default_compressors = zarr_config.get("array.v3_default_compressors").get(dtype.type) + default_filters = zarr_config.get("array.v3_default_filters").get(dtype.kind) + default_serializer = zarr_config.get("array.v3_default_serializer").get(dtype.kind) + default_compressors = zarr_config.get("array.v3_default_compressors").get(dtype.kind) filters = tuple(_parse_array_array_codec(codec_dict) for codec_dict in default_filters) serializer = _parse_array_bytes_codec(default_serializer) diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index e398eff406..e005cceed0 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -19,7 +19,6 @@ import numpy as np from zarr.core.config import config as zarr_config -from zarr.core.strings import _VLEN_STRING_DTYPE if TYPE_CHECKING: from collections.abc import Awaitable, Callable, Iterator @@ -167,13 +166,7 @@ def parse_bool(data: Any) -> bool: raise ValueError(f"Expected bool, got {data} instead.") -def parse_dtype(dtype: Any, zarr_format: ZarrFormat) -> np.dtype[Any]: - if dtype is str or dtype == "str": - if zarr_format == 2: - # special case as object - return np.dtype("object") - else: - return _VLEN_STRING_DTYPE +def parse_dtype(dtype: Any) -> np.dtype[Any]: return np.dtype(dtype) diff --git a/src/zarr/core/metadata/dtype.py b/src/zarr/core/metadata/dtype.py index ab101f2fad..f3a571b372 100644 --- a/src/zarr/core/metadata/dtype.py +++ b/src/zarr/core/metadata/dtype.py @@ -1,8 +1,9 @@ from abc import ABC -from dataclasses import dataclass, field +from dataclasses import dataclass from typing import Any, ClassVar, Literal, Self, get_args import numpy as np +import numpy.typing as npt from zarr.abc.metadata import Metadata from zarr.core.common import JSON @@ -28,19 +29,22 @@ def endianness_to_numpy_str(endianness: Endianness | None) -> Literal[">", "<", ) -class BaseDataType(ABC, Metadata): +class Flexible: + capacity: int + + +class 
DtypeBase(ABC, Metadata): name: ClassVar[str] numpy_character_code: ClassVar[str] item_size: ClassVar[int | None] - type: ClassVar[DataTypeFlavor] - capacity: int + kind: ClassVar[DataTypeFlavor] def __init_subclass__(cls, **kwargs: object) -> None: required_attrs = [ "name", "numpy_character_code", "item_size", - "type", + "kind", ] for attr in required_attrs: if not hasattr(cls, attr): @@ -51,18 +55,43 @@ def __init_subclass__(cls, **kwargs: object) -> None: def to_dict(self) -> dict[str, JSON]: return {"name": self.name} + @classmethod + def from_numpy(cls, dtype: npt.DTypeLike) -> Self: + """ + Create an instance of this dtype from a numpy dtype. + + Parameters + ---------- + dtype : npt.DTypeLike + The numpy dtype to create an instance from. + + Returns + ------- + Self + An instance of this dtype. + + Raises + ------ + ValueError + If the provided numpy dtype does not match this class. + """ + if np.dtype(dtype).char != cls.numpy_character_code: + raise ValueError( + f"Invalid dtype {dtype}. Expected dtype with character code == {cls.numpy_character_code}." + ) + return cls() + def to_numpy(self: Self, *, endianness: Endianness | None = None) -> np.dtype[Any]: endian_str = endianness_to_numpy_str(endianness) return np.dtype(endian_str + self.numpy_character_code) @dataclass(frozen=True, kw_only=True) -class Bool(BaseDataType): +class Bool(DtypeBase): name = "bool" item_size = 1 - type = "boolean" + kind = "boolean" numpy_character_code = "?" 
- capacity: int = field(default=1, init=False) def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.BoolDType: return super().to_numpy(endianness=endianness) @@ -72,12 +101,11 @@ def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.BoolDTy @dataclass(frozen=True, kw_only=True) -class Int8(BaseDataType): +class Int8(DtypeBase): name = "int8" item_size = 1 - type = "numeric" + kind = "numeric" numpy_character_code = "b" - capacity: int = field(default=1, init=False) def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int8DType: return super().to_numpy(endianness=endianness) @@ -87,12 +115,11 @@ def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int8DTy @dataclass(frozen=True, kw_only=True) -class UInt8(BaseDataType): +class UInt8(DtypeBase): name = "uint8" item_size = 2 - type = "numeric" + kind = "numeric" numpy_character_code = "B" - capacity: int = field(default=1, init=False) def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt8DType: return super().to_numpy(endianness=endianness) @@ -102,12 +129,11 @@ def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt8DT @dataclass(frozen=True, kw_only=True) -class Int16(BaseDataType): +class Int16(DtypeBase): name = "int16" item_size = 2 - type = "numeric" + kind = "numeric" numpy_character_code = "h" - capacity: int = field(default=1, init=False) def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int16DType: return super().to_numpy(endianness=endianness) @@ -117,12 +143,11 @@ def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int16DT @dataclass(frozen=True, kw_only=True) -class UInt16(BaseDataType): +class UInt16(DtypeBase): name = "uint16" item_size = 2 - type = "numeric" + kind = "numeric" numpy_character_code = "H" - capacity: int = field(default=1, init=False) def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt16DType: 
return super().to_numpy(endianness=endianness) @@ -132,12 +157,11 @@ def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt16D @dataclass(frozen=True, kw_only=True) -class Int32(BaseDataType): +class Int32(DtypeBase): name = "int32" item_size = 4 - type = "numeric" + kind = "numeric" numpy_character_code = "i" - capacity: int = field(default=1, init=False) def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int32DType: return super().to_numpy(endianness=endianness) @@ -147,12 +171,11 @@ def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int32DT @dataclass(frozen=True, kw_only=True) -class UInt32(BaseDataType): +class UInt32(DtypeBase): name = "uint32" item_size = 4 - type = "numeric" + kind = "numeric" numpy_character_code = "I" - capacity: int = field(default=1, init=False) def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt32DType: return super().to_numpy(endianness=endianness) @@ -162,12 +185,11 @@ def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt32D @dataclass(frozen=True, kw_only=True) -class Int64(BaseDataType): +class Int64(DtypeBase): name = "int64" item_size = 8 - type = "numeric" + kind = "numeric" numpy_character_code = "l" - capacity: int = field(default=1, init=False) def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int64DType: return super().to_numpy(endianness=endianness) @@ -177,12 +199,11 @@ def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int64DT @dataclass(frozen=True, kw_only=True) -class UInt64(BaseDataType): +class UInt64(DtypeBase): name = "uint64" item_size = 8 - type = "numeric" + kind = "numeric" numpy_character_code = "L" - capacity: int = field(default=1, init=False) def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt64DType: return super().to_numpy(endianness=endianness) @@ -192,12 +213,11 @@ def to_numpy(self, *, endianness: Endianness | None = 
None) -> np.dtypes.UInt64D @dataclass(frozen=True, kw_only=True) -class Float16(BaseDataType): +class Float16(DtypeBase): name = "float16" item_size = 2 - type = "numeric" + kind = "numeric" numpy_character_code = "e" - capacity: int = field(default=1, init=False) def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Float16DType: return super().to_numpy(endianness=endianness) @@ -207,12 +227,11 @@ def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Float16 @dataclass(frozen=True, kw_only=True) -class Float32(BaseDataType): +class Float32(DtypeBase): name = "float32" item_size = 4 - type = "numeric" + kind = "numeric" numpy_character_code = "f" - capacity: int = field(default=1, init=False) def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Float32DType: return super().to_numpy(endianness=endianness) @@ -222,12 +241,11 @@ def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Float32 @dataclass(frozen=True, kw_only=True) -class Float64(BaseDataType): +class Float64(DtypeBase): name = "float64" item_size = 8 - type = "numeric" + kind = "numeric" numpy_character_code = "d" - capacity: int = field(default=1, init=False) def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Float64DType: return super().to_numpy(endianness=endianness) @@ -237,12 +255,11 @@ def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Float64 @dataclass(frozen=True, kw_only=True) -class Complex64(BaseDataType): +class Complex64(DtypeBase): name = "complex64" item_size = 16 - type = "numeric" + kind = "numeric" numpy_character_code = "F" - capacity: int = field(default=1, init=False) def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Complex64DType: return super().to_numpy(endianness=endianness) @@ -252,12 +269,11 @@ def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Complex @dataclass(frozen=True, kw_only=True) -class 
Complex128(BaseDataType): +class Complex128(DtypeBase): name = "complex64" item_size = 32 - type = "numeric" + kind = "numeric" numpy_character_code = "D" - capacity: int = field(default=1, init=False) def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Complex128DType: return super().to_numpy(endianness=endianness) @@ -267,12 +283,17 @@ def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Complex @dataclass(frozen=True, kw_only=True) -class StaticByteString(BaseDataType): +class StaticByteString(DtypeBase, Flexible): name = "numpy/static_byte_string" - type = "string" + kind = "string" numpy_character_code = "S" item_size = 1 - capacity: int + + def from_numpy(cls: type[Self], dtype: npt.DTypeLike) -> Self: + dtype = np.dtype(dtype) + if dtype.kind != cls.numpy_character_code: + raise ValueError(f"Invalid dtype {dtype}. Expected a string dtype.") + return cls(capacity=dtype.itemsize) def to_dict(self) -> dict[str, JSON]: return {"name": self.name, "configuration": {"capacity": self.capacity}} @@ -282,20 +303,42 @@ def to_numpy(self, endianness: Endianness | None = "native") -> np.dtype[np.byte return np.dtype(endianness_code + self.numpy_character_code + str(self.capacity)) +@dataclass(frozen=True, kw_only=True) +class StaticRawBytes(DtypeBase, Flexible): + name = "r*" + kind = "bytes" + numpy_character_code = "V" + item_size = 1 + + def from_numpy(cls: type[Self], dtype: npt.DTypeLike) -> Self: + dtype = np.dtype(dtype) + if dtype.kind != "V": + raise ValueError(f"Invalid dtype {dtype}. 
Expected a bytes dtype.") + return cls(capacity=dtype.itemsize) + + def to_dict(self) -> dict[str, JSON]: + return {"name": f"r{self.capacity * 8}"} + + def to_numpy(self, endianness: Endianness | None = "native") -> np.dtype[np.void]: + endianness_code = endianness_to_numpy_str(endianness) + return np.dtype(endianness_code + self.numpy_character_code + str(self.capacity)) + + register_data_type(StaticByteString) if _NUMPY_SUPPORTS_VLEN_STRING: @dataclass(frozen=True, kw_only=True) - class VlenString(BaseDataType): + class VlenString(DtypeBase): name = "numpy/vlen_string" - type = "string" + kind = "string" numpy_character_code = "T" + # this uses UTF-8, so the encoding of a code point varies between + # 1 and 4 bytes item_size = None - capacity: int def to_dict(self) -> dict[str, JSON]: - return {"name": self.name, "configuration": {"capacity": self.capacity}} + return {"name": self.name} def to_numpy( self, endianness: Endianness | None = "native" @@ -306,15 +349,14 @@ def to_numpy( else: @dataclass(frozen=True, kw_only=True) - class VlenString(BaseDataType): + class VlenString(DtypeBase): name = "numpy/vlen_string" - type = "string" + kind = "string" numpy_character_code = "O" item_size = None - capacity: int def to_dict(self) -> dict[str, JSON]: - return {"name": self.name, "configuration": {"capacity": self.capacity}} + return {"name": self.name} def to_numpy( self, endianness: Endianness | None = "native" @@ -327,12 +369,17 @@ def to_numpy( @dataclass(frozen=True, kw_only=True) -class StaticUnicodeString(BaseDataType): +class StaticUnicodeString(DtypeBase, Flexible): name = "numpy/static_unicode_string" - type = "string" + kind = "string" numpy_character_code = "U" item_size = 4 - capacity: int + + def from_numpy(cls: type[Self], dtype: npt.DTypeLike) -> Self: + dtype = np.dtype(dtype) + if dtype.kind != "U": + raise ValueError(f"Invalid dtype {dtype}. 
Expected a string dtype.") + return cls(capacity=dtype.itemsize) def to_dict(self) -> dict[str, JSON]: return {"name": self.name, "configuration": {"capacity": self.capacity}} @@ -345,28 +392,13 @@ def to_numpy(self, endianness: Endianness | None = "native") -> np.dtype[np.str_ register_data_type(StaticUnicodeString) -@dataclass(frozen=True, kw_only=True) -class StaticRawBytes(BaseDataType): - name = "r*" - type = "bytes" - numpy_character_code = "V" - item_size = 1 - capacity: int - - def to_dict(self) -> dict[str, JSON]: - return {"name": f"r{self.capacity * 8}"} - - def to_numpy(self, endianness: Endianness | None = "native") -> np.dtype[np.void]: - endianness_code = endianness_to_numpy_str(endianness) - return np.dtype(endianness_code + self.numpy_character_code + str(self.capacity)) - - -def parse_dtype(dtype: npt.DtypeLike | BaseDataType) -> BaseDataType: +def resolve_dtype(dtype: npt.DTypeLike | DtypeBase) -> DtypeBase: from zarr.registry import get_data_type_from_numpy - if isinstance(dtype, BaseDataType): + if isinstance(dtype, DtypeBase): return dtype - return get_data_type_from_numpy(dtype) + cls = get_data_type_from_numpy(dtype) + return cls.from_numpy(dtype) register_data_type(StaticRawBytes) diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 86503e64cd..839459d8e0 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -5,7 +5,7 @@ from zarr.abc.metadata import Metadata from zarr.core.buffer.core import default_buffer_prototype -from zarr.core.metadata.dtype import BaseDataType +from zarr.core.metadata.dtype import DtypeBase, resolve_dtype if TYPE_CHECKING: from collections.abc import Callable @@ -237,7 +237,7 @@ class ArrayV3MetadataDict(TypedDict): @dataclass(frozen=True, kw_only=True) class ArrayV3Metadata(Metadata): shape: ChunkCoords - data_type: DataType + data_type: DtypeBase chunk_grid: ChunkGrid chunk_key_encoding: ChunkKeyEncoding fill_value: Any @@ -252,7 +252,7 @@ def __init__( self, *, 
shape: Iterable[int], - data_type: npt.DTypeLike | BaseDataType, + data_type: npt.DTypeLike | DtypeBase, chunk_grid: dict[str, JSON] | ChunkGrid, chunk_key_encoding: ChunkKeyEncodingLike, fill_value: Any, @@ -265,7 +265,7 @@ def __init__( Because the class is a frozen dataclass, we set attributes using object.__setattr__ """ shape_parsed = parse_shapelike(shape) - data_type_parsed = DataType.parse(data_type) + data_type_parsed = resolve_dtype(data_type) chunk_grid_parsed = ChunkGrid.from_dict(chunk_grid) chunk_key_encoding_parsed = ChunkKeyEncoding.from_dict(chunk_key_encoding) dimension_names_parsed = parse_dimension_names(dimension_names) diff --git a/src/zarr/registry.py b/src/zarr/registry.py index 480d75d49a..272a72a16f 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -24,7 +24,7 @@ from zarr.core.buffer import Buffer, NDBuffer from zarr.core.common import JSON from zarr.core.dtype import ZarrDType - from zarr.core.metadata.dtype import BaseDataType + from zarr.core.metadata.dtype import DtypeBase __all__ = [ "Registry", @@ -64,7 +64,7 @@ def register(self, cls: type[T]) -> None: __pipeline_registry: Registry[CodecPipeline] = Registry() __buffer_registry: Registry[Buffer] = Registry() __ndbuffer_registry: Registry[NDBuffer] = Registry() -__data_type_registry: Registry[BaseDataType] = Registry() +__data_type_registry: Registry[DtypeBase] = Registry() __v3_dtype_registry: Registry[ZarrDType] = Registry() __v2_dtype_registry: Registry[ZarrDType] = Registry() @@ -155,7 +155,7 @@ def register_buffer(cls: type[Buffer]) -> None: __buffer_registry.register(cls) -def register_data_type(cls: type[BaseDataType]) -> None: +def register_data_type(cls: type[DtypeBase]) -> None: __data_type_registry.register(cls) @@ -303,7 +303,7 @@ def get_ndbuffer_class(reload_config: bool = False) -> type[NDBuffer]: ) -def get_data_type(dtype: str) -> type[BaseDataType]: +def get_data_type(dtype: str) -> type[DtypeBase]: __data_type_registry.lazy_load() maybe_dtype_cls = 
__data_type_registry.get(dtype) if maybe_dtype_cls is None: @@ -311,7 +311,7 @@ def get_data_type(dtype: str) -> type[BaseDataType]: return maybe_dtype_cls -def get_data_type_from_numpy(dtype: npt.DTypeLike) -> type[BaseDataType]: +def get_data_type_from_numpy(dtype: npt.DTypeLike) -> type[DtypeBase]: np_dtype = np.dtype(dtype) __data_type_registry.lazy_load() for val in __data_type_registry.values(): From 9cd5c5197b869110139b8e922b54c29bc9b5b425 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 27 Feb 2025 09:57:23 +0100 Subject: [PATCH 005/130] start working on JSON serialization --- src/zarr/core/_info.py | 4 +- src/zarr/core/array.py | 6 +- src/zarr/core/metadata/dtype.py | 363 ++++++++++++++++++++++++++------ src/zarr/core/metadata/v3.py | 207 +++--------------- src/zarr/registry.py | 10 +- 5 files changed, 343 insertions(+), 247 deletions(-) diff --git a/src/zarr/core/_info.py b/src/zarr/core/_info.py index d2b23d8b5f..2ede547600 100644 --- a/src/zarr/core/_info.py +++ b/src/zarr/core/_info.py @@ -7,7 +7,7 @@ from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec from zarr.core.common import ZarrFormat -from zarr.core.metadata.dtype import DtypeBase +from zarr.core.metadata.dtype import DTypeBase # from zarr.core.metadata.v3 import DataType @@ -80,7 +80,7 @@ class ArrayInfo: _type: Literal["Array"] = "Array" _zarr_format: ZarrFormat - _data_type: np.dtype[Any] | DtypeBase + _data_type: np.dtype[Any] | DTypeBase _shape: tuple[int, ...] _shard_shape: tuple[int, ...] | None = None _chunk_shape: tuple[int, ...] 
| None = None diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 2e15db3790..2986b27fb0 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -98,7 +98,7 @@ ArrayV3MetadataDict, T_ArrayMetadata, ) -from zarr.core.metadata.dtype import DtypeBase +from zarr.core.metadata.dtype import DTypeBase from zarr.core.metadata.v2 import ( _default_compressor, _default_filters, @@ -1684,7 +1684,7 @@ async def info_complete(self) -> Any: def _info( self, count_chunks_initialized: int | None = None, count_bytes_stored: int | None = None ) -> Any: - _data_type: np.dtype[Any] | DtypeBase + _data_type: np.dtype[Any] | DTypeBase if isinstance(self.metadata, ArrayV2Metadata): _data_type = self.metadata.dtype else: @@ -3909,7 +3909,7 @@ async def init_array( from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation - dtype_parsed = parse_dtype(dtype, zarr_format=zarr_format) + dtype_parsed = parse_dtype(dtype) shape_parsed = parse_shapelike(shape) chunk_key_encoding_parsed = _parse_chunk_key_encoding( chunk_key_encoding, zarr_format=zarr_format diff --git a/src/zarr/core/metadata/dtype.py b/src/zarr/core/metadata/dtype.py index f3a571b372..19a00343c8 100644 --- a/src/zarr/core/metadata/dtype.py +++ b/src/zarr/core/metadata/dtype.py @@ -1,17 +1,18 @@ -from abc import ABC +from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import Any, ClassVar, Literal, Self, get_args +from typing import Any, ClassVar, Literal, Self, TypeGuard, cast, get_args import numpy as np import numpy.typing as npt from zarr.abc.metadata import Metadata -from zarr.core.common import JSON +from zarr.core.common import JSON, ZarrFormat from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING from zarr.registry import register_data_type Endianness = Literal["little", "big", "native"] DataTypeFlavor = Literal["boolean", "numeric", "string", "bytes"] +JSONFloat = float | Literal["NaN", "Infinity", "-Infinity"] def 
endianness_to_numpy_str(endianness: Endianness | None) -> Literal[">", "<", "=", "|"]: @@ -29,23 +30,121 @@ def endianness_to_numpy_str(endianness: Endianness | None) -> Literal[">", "<", ) +def check_json_bool(data: JSON) -> TypeGuard[bool]: + return bool(isinstance(data, bool)) + + +def check_json_int(data: JSON) -> TypeGuard[int]: + return bool(isinstance(data, int)) + + +def check_json_float(data: JSON) -> TypeGuard[float]: + if data == "NaN" or data == "Infinity" or data == "-Infinity": + return True + else: + return bool(isinstance(data, float)) + + +def float_to_json_v2(data: float | np.floating[Any]) -> JSONFloat: + if np.isnan(data): + return "NaN" + elif np.isinf(data): + return "Infinity" if data > 0 else "-Infinity" + return float(data) + + +def float_to_json_v3(data: float | np.floating[Any]) -> JSONFloat: + # v3 can in principle handle distinct NaN values, but numpy does not represent these explicitly + # so we just re-use the v2 routine here + return float_to_json_v2(data) + + +def float_to_json(data: float | np.floating[Any], zarr_format: ZarrFormat) -> JSONFloat: + """ + convert a float to JSON as per the zarr v3 spec + """ + if zarr_format == 2: + return float_to_json_v2(data) + else: + return float_to_json_v3(data) + raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") + + +def complex_to_json_v2(data: complex | np.complex_) -> JSONFloat: + return float_to_json_v2(data) + + +def complex_to_json_v3(data: complex | np.complex_) -> tuple[JSONFloat, JSONFloat]: + return float_to_json_v3(data.real), float_to_json_v3(data.imag) + + +def complex_to_json( + data: complex | np.complex_, zarr_format: ZarrFormat +) -> tuple[JSONFloat, JSONFloat] | JSONFloat: + if zarr_format == 2: + return complex_to_json_v2(data) + else: + return complex_to_json_v3(data) + raise ValueError(f"Invalid zarr format: {zarr_format}. 
Expected 2 or 3.") + + +def float_from_json_v2(data: JSONFloat, dtype: np.floating[Any]) -> np.float_: + if data == "NaN": + _data = np.nan + elif data == "Infinity": + _data = np.inf + elif data == "-Infinity": + _data = -np.inf + else: + _data = data + return dtype.type(_data) + + +def float_from_json_v3(data: JSONFloat, dtype: Any) -> np.floating[Any]: + # todo: support the v3-specific NaN handling + return float_from_json_v2(data, dtype) + + +def float_from_json(data: JSONFloat, dtype: Any, zarr_format: ZarrFormat) -> np.floating[Any]: + if zarr_format == 2: + return float_from_json_v2(data, dtype) + else: + return float_from_json_v3(data, dtype) + raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") + + +def complex_from_json_v2(data: JSONFloat, dtype: Any) -> np.complex_: + return dtype.type(data) + + +def complex_from_json_v3(data: tuple[JSONFloat, JSONFloat], dtype: Any) -> np.complex_: + return dtype.type(data[0] + 1j * data[1]) + + +def complex_from_json( + data: tuple[JSONFloat, JSONFloat], dtype: Any, zarr_format: ZarrFormat +) -> np.complex_: + if zarr_format == 2: + return complex_from_json_v2(data, dtype) + else: + return complex_from_json_v3(data, dtype) + raise ValueError(f"Invalid zarr format: {zarr_format}. 
Expected 2 or 3.") + + +@dataclass(frozen=True, kw_only=True) class Flexible: - capacity: int + length: int -class DtypeBase(ABC, Metadata): +class DTypeBase(ABC, Metadata): name: ClassVar[str] numpy_character_code: ClassVar[str] item_size: ClassVar[int | None] kind: ClassVar[DataTypeFlavor] + default: object def __init_subclass__(cls, **kwargs: object) -> None: - required_attrs = [ - "name", - "numpy_character_code", - "item_size", - "kind", - ] + required_attrs = ["name", "numpy_character_code", "item_size", "kind", "default"] for attr in required_attrs: if not hasattr(cls, attr): raise ValueError(f"{attr} is a required attribute for a Zarr dtype.") @@ -57,223 +156,356 @@ def to_dict(self) -> dict[str, JSON]: @classmethod def from_numpy(cls, dtype: npt.DTypeLike) -> Self: - """ - Create an instance of this dtype from a numpy dtype. - - Parameters - ---------- - dtype : npt.DTypeLike - The numpy dtype to create an instance from. - - Returns - ------- - Self - An instance of this dtype. - - Raises - ------ - ValueError - If the provided numpy dtype does not match this class. - """ if np.dtype(dtype).char != cls.numpy_character_code: raise ValueError( f"Invalid dtype {dtype}. Expected dtype with character code == {cls.numpy_character_code}." ) return cls() + def default_value(self: Self, *, endianness: Endianness | None = None) -> np.generic: + return cast(np.generic, self.to_numpy(endianness=endianness).type(self.default)) + def to_numpy(self: Self, *, endianness: Endianness | None = None) -> np.dtype[Any]: endian_str = endianness_to_numpy_str(endianness) return np.dtype(endian_str + self.numpy_character_code) + @abstractmethod + def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> JSON: + """ + Convert a single value to JSON-serializable format. Depends on the zarr format. 
+ """ + raise NotImplementedError + + @abstractmethod + def from_json_value( + self: Self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None + ) -> np.generic: + """ + Read a JSON-serializable value as a numpy scalar + """ + raise NotImplementedError + @dataclass(frozen=True, kw_only=True) -class Bool(DtypeBase): +class Bool(DTypeBase): name = "bool" item_size = 1 kind = "boolean" numpy_character_code = "?" + default = False def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.BoolDType: return super().to_numpy(endianness=endianness) + def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> bool: + return bool(data) + + def from_json_value( + self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None + ) -> np.bool_: + if check_json_bool(data): + return self.to_numpy(endianness=endianness).type(data) + raise TypeError(f"Invalid type: {data}. Expected a boolean.") + register_data_type(Bool) @dataclass(frozen=True, kw_only=True) -class Int8(DtypeBase): +class Int8(DTypeBase): name = "int8" item_size = 1 kind = "numeric" numpy_character_code = "b" + default = 0 def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int8DType: return super().to_numpy(endianness=endianness) + def to_json_value(self, data: np.generic) -> int: + return int(data) + + def from_json_value( + self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None + ) -> np.int8: + if check_json_int(data): + return self.to_numpy(endianness=endianness).type(data) + raise TypeError(f"Invalid type: {data}. 
Expected an integer.") + register_data_type(Int8) @dataclass(frozen=True, kw_only=True) -class UInt8(DtypeBase): +class UInt8(DTypeBase): name = "uint8" item_size = 2 kind = "numeric" numpy_character_code = "B" + default = 0 def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt8DType: return super().to_numpy(endianness=endianness) + def to_json_value(self, data: np.generic) -> int: + return int(data) + + def from_json_value( + self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None + ) -> np.uint8: + if check_json_int(data): + return self.to_numpy(endianness=endianness).type(data) + raise TypeError(f"Invalid type: {data}. Expected an integer.") + register_data_type(UInt8) @dataclass(frozen=True, kw_only=True) -class Int16(DtypeBase): +class Int16(DTypeBase): name = "int16" item_size = 2 kind = "numeric" numpy_character_code = "h" + default = 0 def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int16DType: return super().to_numpy(endianness=endianness) + def to_json_value(self, data: np.generic) -> int: + return int(data) + + def from_json_value( + self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None + ) -> np.int16: + if check_json_int(data): + return self.to_numpy(endianness=endianness).type(data) + raise TypeError(f"Invalid type: {data}. 
Expected an integer.") + register_data_type(Int16) @dataclass(frozen=True, kw_only=True) -class UInt16(DtypeBase): +class UInt16(DTypeBase): name = "uint16" item_size = 2 kind = "numeric" numpy_character_code = "H" + default = 0 def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt16DType: return super().to_numpy(endianness=endianness) + def to_json_value(self, data: np.generic) -> int: + return int(data) + + def from_json_value( + self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None + ) -> np.uint16: + if check_json_int(data): + return self.to_numpy(endianness=endianness).type(data) + raise TypeError(f"Invalid type: {data}. Expected an integer.") + register_data_type(UInt16) @dataclass(frozen=True, kw_only=True) -class Int32(DtypeBase): +class Int32(DTypeBase): name = "int32" item_size = 4 kind = "numeric" numpy_character_code = "i" + default = 0 def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int32DType: return super().to_numpy(endianness=endianness) + def to_json_value(self, data: np.generic) -> int: + return int(data) + + def from_json_value( + self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None + ) -> np.int32: + if check_json_int(data): + return self.to_numpy(endianness=endianness).type(data) + raise TypeError(f"Invalid type: {data}. 
Expected an integer.") + register_data_type(Int32) @dataclass(frozen=True, kw_only=True) -class UInt32(DtypeBase): +class UInt32(DTypeBase): name = "uint32" item_size = 4 kind = "numeric" numpy_character_code = "I" + default = 0 def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt32DType: return super().to_numpy(endianness=endianness) + def to_json_value(self, data: np.generic) -> int: + return int(data) + + def from_json_value( + self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None + ) -> np.uint32: + if check_json_int(data): + return self.to_numpy(endianness=endianness).type(data) + raise TypeError(f"Invalid type: {data}. Expected an integer.") + register_data_type(UInt32) @dataclass(frozen=True, kw_only=True) -class Int64(DtypeBase): +class Int64(DTypeBase): name = "int64" item_size = 8 kind = "numeric" numpy_character_code = "l" + default = 0 def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int64DType: return super().to_numpy(endianness=endianness) + def to_json_value(self, data: np.generic) -> int: + return int(data) + + def from_json_value(self, data: JSON, *, endianness: Endianness | None = None) -> np.int64: + if check_json_int(data): + return self.to_numpy(endianness=endianness).type(data) + raise TypeError(f"Invalid type: {data}. Expected an integer.") + register_data_type(Int64) @dataclass(frozen=True, kw_only=True) -class UInt64(DtypeBase): +class UInt64(DTypeBase): name = "uint64" item_size = 8 kind = "numeric" numpy_character_code = "L" + default = 0 def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt64DType: return super().to_numpy(endianness=endianness) + def to_json_value(self, data: np.generic) -> int: + return int(data) + + def from_json_value(self, data: JSON, *, endianness: Endianness | None = None) -> np.uint64: + if check_json_int(data): + return self.to_numpy(endianness=endianness).type(data) + raise TypeError(f"Invalid type: {data}. 
Expected an integer.") + register_data_type(UInt64) @dataclass(frozen=True, kw_only=True) -class Float16(DtypeBase): +class Float16(DTypeBase): name = "float16" item_size = 2 kind = "numeric" numpy_character_code = "e" + default = 0.0 def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Float16DType: return super().to_numpy(endianness=endianness) + def to_json_value(self, data: np.generic) -> float: + return float(data) + + def from_json_value(self, data: JSON, *, endianness: Endianness | None = None) -> np.float16: + if check_json_float(data): + return self.to_numpy(endianness=endianness).type(data) + raise TypeError(f"Invalid type: {data}. Expected a float.") + register_data_type(Float16) @dataclass(frozen=True, kw_only=True) -class Float32(DtypeBase): +class Float32(DTypeBase): name = "float32" item_size = 4 kind = "numeric" numpy_character_code = "f" + default = 0.0 def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Float32DType: return super().to_numpy(endianness=endianness) + def to_json_value(self, data: np.generic) -> float: + return float(data) + + def from_json_value(self, data: JSON, *, endianness: Endianness | None = None) -> np.float32: + if check_json_float(data): + return self.to_numpy(endianness=endianness).type(data) + raise TypeError(f"Invalid type: {data}. 
Expected a float.") + register_data_type(Float32) @dataclass(frozen=True, kw_only=True) -class Float64(DtypeBase): +class Float64(DTypeBase): name = "float64" item_size = 8 kind = "numeric" numpy_character_code = "d" + default = 0.0 def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Float64DType: return super().to_numpy(endianness=endianness) + def to_json_value(self, data: np.generic) -> float: + return float(data) + + def from_json_value(self, data: JSON, *, endianness: Endianness | None = None) -> np.float64: + if check_json_float(data): + return float_from_json(data, dtype=self.to_numpy(endianness=endianness)) + raise TypeError(f"Invalid type: {data}. Expected a float.") + register_data_type(Float64) @dataclass(frozen=True, kw_only=True) -class Complex64(DtypeBase): +class Complex64(DTypeBase): name = "complex64" item_size = 16 kind = "numeric" numpy_character_code = "F" + default = 0.0 + 0.0j def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Complex64DType: return super().to_numpy(endianness=endianness) + def to_json_value(self, data: np.generic) -> float: + return float(data) + + def from_json_value(self, data: JSON, *, endianness: Endianness | None = None) -> np.complex64: + if check_json_float(data): + return self.to_numpy(endianness=endianness).type(data) + raise TypeError(f"Invalid type: {data}. 
Expected a float.") + register_data_type(Complex64) @dataclass(frozen=True, kw_only=True) -class Complex128(DtypeBase): +class Complex128(DTypeBase): name = "complex64" item_size = 32 kind = "numeric" numpy_character_code = "D" + default = 0.0 + 0.0j def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Complex128DType: return super().to_numpy(endianness=endianness) @@ -283,45 +515,49 @@ def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Complex @dataclass(frozen=True, kw_only=True) -class StaticByteString(DtypeBase, Flexible): +class StaticByteString(DTypeBase, Flexible): name = "numpy/static_byte_string" kind = "string" numpy_character_code = "S" item_size = 1 + default = b"" + @classmethod def from_numpy(cls: type[Self], dtype: npt.DTypeLike) -> Self: dtype = np.dtype(dtype) if dtype.kind != cls.numpy_character_code: raise ValueError(f"Invalid dtype {dtype}. Expected a string dtype.") - return cls(capacity=dtype.itemsize) + return cls(length=dtype.itemsize) def to_dict(self) -> dict[str, JSON]: - return {"name": self.name, "configuration": {"capacity": self.capacity}} + return {"name": self.name, "configuration": {"capacity": self.length}} def to_numpy(self, endianness: Endianness | None = "native") -> np.dtype[np.bytes_]: endianness_code = endianness_to_numpy_str(endianness) - return np.dtype(endianness_code + self.numpy_character_code + str(self.capacity)) + return np.dtype(endianness_code + self.numpy_character_code + str(self.length)) @dataclass(frozen=True, kw_only=True) -class StaticRawBytes(DtypeBase, Flexible): +class StaticRawBytes(DTypeBase, Flexible): name = "r*" kind = "bytes" numpy_character_code = "V" item_size = 1 + default = b"" + @classmethod def from_numpy(cls: type[Self], dtype: npt.DTypeLike) -> Self: dtype = np.dtype(dtype) if dtype.kind != "V": raise ValueError(f"Invalid dtype {dtype}. 
Expected a bytes dtype.") - return cls(capacity=dtype.itemsize) + return cls(length=dtype.itemsize) def to_dict(self) -> dict[str, JSON]: - return {"name": f"r{self.capacity * 8}"} + return {"name": f"r{self.length * 8}"} def to_numpy(self, endianness: Endianness | None = "native") -> np.dtype[np.void]: endianness_code = endianness_to_numpy_str(endianness) - return np.dtype(endianness_code + self.numpy_character_code + str(self.capacity)) + return np.dtype(endianness_code + self.numpy_character_code + str(self.length)) register_data_type(StaticByteString) @@ -329,13 +565,14 @@ def to_numpy(self, endianness: Endianness | None = "native") -> np.dtype[np.void if _NUMPY_SUPPORTS_VLEN_STRING: @dataclass(frozen=True, kw_only=True) - class VlenString(DtypeBase): + class VlenString(DTypeBase): name = "numpy/vlen_string" kind = "string" numpy_character_code = "T" # this uses UTF-8, so the encoding of a code point varies between # 1 and 4 bytes item_size = None + default = "" def to_dict(self) -> dict[str, JSON]: return {"name": self.name} @@ -349,11 +586,12 @@ def to_numpy( else: @dataclass(frozen=True, kw_only=True) - class VlenString(DtypeBase): + class VlenString(DTypeBase): name = "numpy/vlen_string" kind = "string" numpy_character_code = "O" item_size = None + default = "" def to_dict(self) -> dict[str, JSON]: return {"name": self.name} @@ -369,36 +607,43 @@ def to_numpy( @dataclass(frozen=True, kw_only=True) -class StaticUnicodeString(DtypeBase, Flexible): +class StaticUnicodeString(DTypeBase, Flexible): name = "numpy/static_unicode_string" kind = "string" numpy_character_code = "U" item_size = 4 + default = "" + @classmethod def from_numpy(cls: type[Self], dtype: npt.DTypeLike) -> Self: dtype = np.dtype(dtype) if dtype.kind != "U": raise ValueError(f"Invalid dtype {dtype}. 
Expected a string dtype.") - return cls(capacity=dtype.itemsize) + return cls(length=dtype.itemsize) def to_dict(self) -> dict[str, JSON]: - return {"name": self.name, "configuration": {"capacity": self.capacity}} + return {"name": self.name, "configuration": {"capacity": self.length}} def to_numpy(self, endianness: Endianness | None = "native") -> np.dtype[np.str_]: endianness_code = endianness_to_numpy_str(endianness) - return np.dtype(endianness_code + self.numpy_character_code + str(self.capacity)) + return np.dtype(endianness_code + self.numpy_character_code + str(self.length)) register_data_type(StaticUnicodeString) -def resolve_dtype(dtype: npt.DTypeLike | DtypeBase) -> DtypeBase: +def resolve_dtype(dtype: npt.DTypeLike | DTypeBase) -> DTypeBase: from zarr.registry import get_data_type_from_numpy - if isinstance(dtype, DtypeBase): + if isinstance(dtype, DTypeBase): return dtype cls = get_data_type_from_numpy(dtype) return cls.from_numpy(dtype) register_data_type(StaticRawBytes) + +INTEGER_DTYPE = Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64 +FLOAT_DTYPE = Float16 | Float32 | Float64 +COMPLEX_DTYPE = Complex64 | Complex128 +STRING_DTYPE = StaticUnicodeString | VlenString | StaticByteString diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 839459d8e0..87bb001164 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -5,7 +5,16 @@ from zarr.abc.metadata import Metadata from zarr.core.buffer.core import default_buffer_prototype -from zarr.core.metadata.dtype import DtypeBase, resolve_dtype +from zarr.core.metadata.dtype import ( + COMPLEX_DTYPE, + FLOAT_DTYPE, + INTEGER_DTYPE, + STRING_DTYPE, + Bool, + DTypeBase, + StaticRawBytes, + resolve_dtype, +) if TYPE_CHECKING: from collections.abc import Callable @@ -19,7 +28,7 @@ from collections.abc import Iterable, Sequence from dataclasses import dataclass, field, replace from enum import Enum -from typing import Any, Literal, cast +from typing 
import Any, Literal import numcodecs.abc import numpy as np @@ -38,8 +47,6 @@ ) from zarr.core.config import config from zarr.core.metadata.common import parse_attributes -from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING -from zarr.core.strings import _VLEN_STRING_DTYPE as STRING_NP_DTYPE from zarr.errors import MetadataValidationError, NodeTypeValidationError from zarr.registry import get_codec_class @@ -94,7 +101,7 @@ def validate_array_bytes_codec(codecs: tuple[Codec, ...]) -> ArrayBytesCodec: return abcs[0] -def validate_codecs(codecs: tuple[Codec, ...], dtype: DataType) -> None: +def validate_codecs(codecs: tuple[Codec, ...], dtype: DTypeBase) -> None: """Check that the codecs are valid for the given dtype""" from zarr.codecs.sharding import ShardingCodec @@ -107,11 +114,11 @@ def validate_codecs(codecs: tuple[Codec, ...], dtype: DataType) -> None: # we need to have special codecs if we are decoding vlen strings or bytestrings # TODO: use codec ID instead of class name codec_class_name = abc.__class__.__name__ - if dtype == DataType.string and not codec_class_name == "VLenUTF8Codec": + if dtype.kind == "string" and not codec_class_name == "VLenUTF8Codec": raise ValueError( f"For string dtype, ArrayBytesCodec must be `VLenUTF8Codec`, got `{codec_class_name}`." ) - if dtype == DataType.bytes and not codec_class_name == "VLenBytesCodec": + if dtype.kind == "bytes" and not codec_class_name == "VLenBytesCodec": raise ValueError( f"For bytes dtype, ArrayBytesCodec must be `VLenBytesCodec`, got `{codec_class_name}`." 
) @@ -237,7 +244,7 @@ class ArrayV3MetadataDict(TypedDict): @dataclass(frozen=True, kw_only=True) class ArrayV3Metadata(Metadata): shape: ChunkCoords - data_type: DtypeBase + data_type: DTypeBase chunk_grid: ChunkGrid chunk_key_encoding: ChunkKeyEncoding fill_value: Any @@ -252,10 +259,10 @@ def __init__( self, *, shape: Iterable[int], - data_type: npt.DTypeLike | DtypeBase, + data_type: npt.DTypeLike | DTypeBase, chunk_grid: dict[str, JSON] | ChunkGrid, chunk_key_encoding: ChunkKeyEncodingLike, - fill_value: Any, + fill_value: object, codecs: Iterable[Codec | dict[str, JSON]], attributes: dict[str, JSON] | None, dimension_names: Iterable[str] | None, @@ -269,12 +276,8 @@ def __init__( chunk_grid_parsed = ChunkGrid.from_dict(chunk_grid) chunk_key_encoding_parsed = ChunkKeyEncoding.from_dict(chunk_key_encoding) dimension_names_parsed = parse_dimension_names(dimension_names) - if fill_value is None: - fill_value = default_fill_value(data_type_parsed) # we pass a string here rather than an enum to make mypy happy - fill_value_parsed = parse_fill_value( - fill_value, dtype=cast(ALL_DTYPES, data_type_parsed.value) - ) + fill_value_parsed = parse_fill_value(fill_value, data_type_parsed) attributes_parsed = parse_attributes(attributes) codecs_parsed_partial = parse_codecs(codecs) storage_transformers_parsed = parse_storage_transformers(storage_transformers) @@ -433,26 +436,19 @@ def update_attributes(self, attributes: dict[str, JSON]) -> Self: # enum Literals can't be used in typing, so we have to restate all of the V3 dtypes as types # https://github.com/python/typing/issues/781 -BOOL_DTYPE = Literal["bool"] BOOL = np.bool_ -INTEGER_DTYPE = Literal["int8", "int16", "int32", "int64", "uint8", "uint16", "uint32", "uint64"] INTEGER = np.int8 | np.int16 | np.int32 | np.int64 | np.uint8 | np.uint16 | np.uint32 | np.uint64 -FLOAT_DTYPE = Literal["float16", "float32", "float64"] FLOAT = np.float16 | np.float32 | np.float64 -COMPLEX_DTYPE = Literal["complex64", "complex128"] 
COMPLEX = np.complex64 | np.complex128 -STRING_DTYPE = Literal["string"] + STRING = np.str_ -BYTES_DTYPE = Literal["bytes"] BYTES = np.bytes_ -ALL_DTYPES = BOOL_DTYPE | INTEGER_DTYPE | FLOAT_DTYPE | COMPLEX_DTYPE | STRING_DTYPE | BYTES_DTYPE - @overload def parse_fill_value( fill_value: complex | str | bytes | np.generic | Sequence[Any] | bool, - dtype: BOOL_DTYPE, + dtype: Bool, ) -> BOOL: ... @@ -487,14 +483,14 @@ def parse_fill_value( @overload def parse_fill_value( fill_value: complex | str | bytes | np.generic | Sequence[Any] | bool, - dtype: BYTES_DTYPE, + dtype: StaticRawBytes, ) -> BYTES: ... def parse_fill_value( fill_value: Any, - dtype: ALL_DTYPES, -) -> Any: + dtype: DTypeBase, +) -> np.generic: """ Parse `fill_value`, a potential fill value, into an instance of `dtype`, a data type. If `fill_value` is `None`, then this function will return the result of casting the value 0 @@ -508,26 +504,26 @@ def parse_fill_value( ---------- fill_value : Any A potential fill value. - dtype : str + dtype : DTypeBase A valid Zarr format 3 DataType. 
Returns ------- A scalar instance of `dtype` """ - data_type = DataType(dtype) if fill_value is None: raise ValueError("Fill value cannot be None") - if data_type == DataType.string: + + if dtype.kind == "string": return np.str_(fill_value) - if data_type == DataType.bytes: + if dtype.kind == "bytes": return np.bytes_(fill_value) # the rest are numeric types - np_dtype = cast(np.dtype[Any], data_type.to_numpy()) + np_dtype = dtype.to_numpy() if isinstance(fill_value, Sequence) and not isinstance(fill_value, str): - if data_type in (DataType.complex64, DataType.complex128): + if isindata_type in (DataType.complex64, DataType.complex128): if len(fill_value) == 2: decoded_fill_value = tuple( SPECIAL_FLOATS_ENCODED.get(value, value) for value in fill_value @@ -579,148 +575,3 @@ def parse_fill_value( raise ValueError(f"fill value {fill_value!r} is not valid for dtype {data_type}") return casted_value - - -def default_fill_value(dtype: DataType) -> str | bytes | np.generic: - if dtype == DataType.string: - return "" - elif dtype == DataType.bytes: - return b"" - else: - np_dtype = dtype.to_numpy() - np_dtype = cast(np.dtype[Any], np_dtype) - return np_dtype.type(0) # type: ignore[misc] - - -# For type checking -_bool = bool - - -class DataTypex(Enum): - bool = "bool" - int8 = "int8" - int16 = "int16" - int32 = "int32" - int64 = "int64" - uint8 = "uint8" - uint16 = "uint16" - uint32 = "uint32" - uint64 = "uint64" - float16 = "float16" - float32 = "float32" - float64 = "float64" - complex64 = "complex64" - complex128 = "complex128" - string = "string" - bytes = "bytes" - - @property - def byte_count(self) -> int | None: - data_type_byte_counts = { - DataType.bool: 1, - DataType.int8: 1, - DataType.int16: 2, - DataType.int32: 4, - DataType.int64: 8, - DataType.uint8: 1, - DataType.uint16: 2, - DataType.uint32: 4, - DataType.uint64: 8, - DataType.float16: 2, - DataType.float32: 4, - DataType.float64: 8, - DataType.complex64: 8, - DataType.complex128: 16, - } - try: - return 
data_type_byte_counts[self] - except KeyError: - # string and bytes have variable length - return None - - @property - def has_endianness(self) -> _bool: - return self.byte_count is not None and self.byte_count != 1 - - def to_numpy_shortname(self) -> str: - data_type_to_numpy = { - DataType.bool: "bool", - DataType.int8: "i1", - DataType.int16: "i2", - DataType.int32: "i4", - DataType.int64: "i8", - DataType.uint8: "u1", - DataType.uint16: "u2", - DataType.uint32: "u4", - DataType.uint64: "u8", - DataType.float16: "f2", - DataType.float32: "f4", - DataType.float64: "f8", - DataType.complex64: "c8", - DataType.complex128: "c16", - } - return data_type_to_numpy[self] - - def to_numpy(self) -> np.dtypes.StringDType | np.dtypes.ObjectDType | np.dtype[Any]: - # note: it is not possible to round trip DataType <-> np.dtype - # due to the fact that DataType.string and DataType.bytes both - # generally return np.dtype("O") from this function, even though - # they can originate as fixed-length types (e.g. 
" DataType: - if dtype.kind in "UT": - return DataType.string - elif dtype.kind == "S": - return DataType.bytes - elif not _NUMPY_SUPPORTS_VLEN_STRING and dtype.kind == "O": - # numpy < 2.0 does not support vlen string dtype - # so we fall back on object array of strings - return DataType.string - dtype_to_data_type = { - "|b1": "bool", - "bool": "bool", - "|i1": "int8", - " DataType: - if dtype is None: - return DataType[DEFAULT_DTYPE] - if isinstance(dtype, DataType): - return dtype - try: - return DataType(dtype) - except ValueError: - pass - try: - dtype = np.dtype(dtype) - except (ValueError, TypeError) as e: - raise ValueError(f"Invalid Zarr format 3 data_type: {dtype}") from e - # check that this is a valid v3 data_type - try: - data_type = DataType.from_numpy(dtype) - except KeyError as e: - raise ValueError(f"Invalid Zarr format 3 data_type: {dtype}") from e - return data_type diff --git a/src/zarr/registry.py b/src/zarr/registry.py index 272a72a16f..db2effaa76 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -24,7 +24,7 @@ from zarr.core.buffer import Buffer, NDBuffer from zarr.core.common import JSON from zarr.core.dtype import ZarrDType - from zarr.core.metadata.dtype import DtypeBase + from zarr.core.metadata.dtype import DTypeBase __all__ = [ "Registry", @@ -64,7 +64,7 @@ def register(self, cls: type[T]) -> None: __pipeline_registry: Registry[CodecPipeline] = Registry() __buffer_registry: Registry[Buffer] = Registry() __ndbuffer_registry: Registry[NDBuffer] = Registry() -__data_type_registry: Registry[DtypeBase] = Registry() +__data_type_registry: Registry[DTypeBase] = Registry() __v3_dtype_registry: Registry[ZarrDType] = Registry() __v2_dtype_registry: Registry[ZarrDType] = Registry() @@ -155,7 +155,7 @@ def register_buffer(cls: type[Buffer]) -> None: __buffer_registry.register(cls) -def register_data_type(cls: type[DtypeBase]) -> None: +def register_data_type(cls: type[DTypeBase]) -> None: __data_type_registry.register(cls) @@ -303,7 
+303,7 @@ def get_ndbuffer_class(reload_config: bool = False) -> type[NDBuffer]: ) -def get_data_type(dtype: str) -> type[DtypeBase]: +def get_data_type(dtype: str) -> type[DTypeBase]: __data_type_registry.lazy_load() maybe_dtype_cls = __data_type_registry.get(dtype) if maybe_dtype_cls is None: @@ -311,7 +311,7 @@ def get_data_type(dtype: str) -> type[DtypeBase]: return maybe_dtype_cls -def get_data_type_from_numpy(dtype: npt.DTypeLike) -> type[DtypeBase]: +def get_data_type_from_numpy(dtype: npt.DTypeLike) -> type[DTypeBase]: np_dtype = np.dtype(dtype) __data_type_registry.lazy_load() for val in __data_type_registry.values(): From 042fac1081b561a07c39c0089945f2a723f61694 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 27 Feb 2025 18:06:14 +0100 Subject: [PATCH 006/130] get json de/serialization largely working, and start making tests pass --- src/zarr/api/asynchronous.py | 2 +- src/zarr/codecs/sharding.py | 7 +- src/zarr/core/array.py | 18 ++- src/zarr/core/common.py | 9 +- src/zarr/core/config.py | 6 +- src/zarr/core/metadata/dtype.py | 200 ++++++++++++++++++++++++-------- src/zarr/core/metadata/v3.py | 190 ++++-------------------------- src/zarr/registry.py | 56 +++++++-- tests/test_array.py | 10 +- tests/test_metadata/test_v3.py | 147 ++++++----------------- 10 files changed, 291 insertions(+), 354 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 6059893920..792e445c9d 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -982,7 +982,7 @@ async def create( if zarr_format == 2: if chunks is None: chunks = shape - dtype = parse_dtype(dtype, zarr_format) + dtype = parse_dtype(dtype, zarr_format=zarr_format) if not filters: filters = _default_filters(dtype) if not compressor: diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 42b1313fac..09ceb538d0 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -355,9 +355,10 @@ def __init__( 
object.__setattr__(self, "index_location", index_location_parsed) # Use instance-local lru_cache to avoid memory leaks - object.__setattr__(self, "_get_chunk_spec", lru_cache()(self._get_chunk_spec)) - object.__setattr__(self, "_get_index_chunk_spec", lru_cache()(self._get_index_chunk_spec)) - object.__setattr__(self, "_get_chunks_per_shard", lru_cache()(self._get_chunks_per_shard)) + # TODO: fix these when we don't get hashability errors for certain numpy dtypes + # object.__setattr__(self, "_get_chunk_spec", lru_cache()(self._get_chunk_spec)) + # object.__setattr__(self, "_get_index_chunk_spec", lru_cache()(self._get_index_chunk_spec)) + # object.__setattr__(self, "_get_chunks_per_shard", lru_cache()(self._get_chunks_per_shard)) # todo: typedict return type def __getstate__(self) -> dict[str, Any]: diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 2986b27fb0..c4da46da92 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -580,7 +580,7 @@ async def _create( """ store_path = await make_store_path(store) - dtype_parsed = parse_dtype(dtype, zarr_format) + dtype_parsed = parse_dtype(dtype, zarr_format=zarr_format) shape = parse_shapelike(shape) if chunks is not None and chunk_shape is not None: @@ -693,13 +693,23 @@ def _create_metadata_v3( category=UserWarning, stacklevel=2, ) + + # resolve the numpy dtype into zarr v3 datatype + zarr_data_type = get_data_type_from_numpy(dtype) + + if fill_value is None: + # v3 spec will not allow a null fill value + fill_value_parsed = dtype.type(zarr_data_type.default) + else: + fill_value_parsed = fill_value + chunk_grid_parsed = RegularChunkGrid(chunk_shape=chunk_shape) return ArrayV3Metadata( shape=shape, - data_type=dtype, + data_type=zarr_data_type, chunk_grid=chunk_grid_parsed, chunk_key_encoding=chunk_key_encoding_parsed, - fill_value=fill_value, + fill_value=fill_value_parsed, codecs=codecs, dimension_names=tuple(dimension_names) if dimension_names else None, attributes=attributes or {}, @@ 
-3909,7 +3919,7 @@ async def init_array( from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation - dtype_parsed = parse_dtype(dtype) + dtype_parsed = parse_dtype(dtype, zarr_format=zarr_format) shape_parsed = parse_shapelike(shape) chunk_key_encoding_parsed = _parse_chunk_key_encoding( chunk_key_encoding, zarr_format=zarr_format diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index e005cceed0..e398eff406 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -19,6 +19,7 @@ import numpy as np from zarr.core.config import config as zarr_config +from zarr.core.strings import _VLEN_STRING_DTYPE if TYPE_CHECKING: from collections.abc import Awaitable, Callable, Iterator @@ -166,7 +167,13 @@ def parse_bool(data: Any) -> bool: raise ValueError(f"Expected bool, got {data} instead.") -def parse_dtype(dtype: Any) -> np.dtype[Any]: +def parse_dtype(dtype: Any, zarr_format: ZarrFormat) -> np.dtype[Any]: + if dtype is str or dtype == "str": + if zarr_format == 2: + # special case as object + return np.dtype("object") + else: + return _VLEN_STRING_DTYPE return np.dtype(dtype) diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index c565cb0708..98252f572c 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -88,13 +88,17 @@ def enable_gpu(self) -> ConfigSet: "bytes": [{"id": "vlen-bytes"}], "raw": None, }, - "v3_default_filters": {"numeric": [], "string": [], "bytes": []}, + "v3_default_filters": {"boolean": [], "numeric": [], "string": [], "bytes": []}, "v3_default_serializer": { + "boolean": {"name": "bytes", "configuration": {"endian": "little"}}, "numeric": {"name": "bytes", "configuration": {"endian": "little"}}, "string": {"name": "vlen-utf8"}, "bytes": {"name": "vlen-bytes"}, }, "v3_default_compressors": { + "boolean": [ + {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, + ], "numeric": [ {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, ], diff --git 
a/src/zarr/core/metadata/dtype.py b/src/zarr/core/metadata/dtype.py index 19a00343c8..8f940b0e0b 100644 --- a/src/zarr/core/metadata/dtype.py +++ b/src/zarr/core/metadata/dtype.py @@ -1,4 +1,5 @@ from abc import ABC, abstractmethod +from collections.abc import Sequence from dataclasses import dataclass from typing import Any, ClassVar, Literal, Self, TypeGuard, cast, get_args @@ -8,7 +9,7 @@ from zarr.abc.metadata import Metadata from zarr.core.common import JSON, ZarrFormat from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING -from zarr.registry import register_data_type +from zarr.registry import get_data_type_from_dict, register_data_type Endianness = Literal["little", "big", "native"] DataTypeFlavor = Literal["boolean", "numeric", "string", "bytes"] @@ -30,11 +31,11 @@ def endianness_to_numpy_str(endianness: Endianness | None) -> Literal[">", "<", ) -def check_json_bool(data: JSON) -> TypeGuard[bool]: +def check_str(data: JSON) -> TypeGuard[bool]: return bool(isinstance(data, bool)) -def check_json_int(data: JSON) -> TypeGuard[int]: +def check_int(data: JSON) -> TypeGuard[int]: return bool(isinstance(data, int)) @@ -42,7 +43,21 @@ def check_json_float(data: JSON) -> TypeGuard[float]: if data == "NaN" or data == "Infinity" or data == "-Infinity": return True else: - return bool(isinstance(data, float)) + return bool(isinstance(data, float | int)) + + +def check_json_complex_float(data: JSON) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: + return ( + not isinstance(data, str) + and isinstance(data, Sequence) + and len(data) == 2 + and check_json_float(data[0]) + and check_json_float(data[1]) + ) + + +def check_str(data: JSON) -> TypeGuard[str]: + return bool(isinstance(data, str)) def float_to_json_v2(data: float | np.floating[Any]) -> JSONFloat: @@ -70,16 +85,16 @@ def float_to_json(data: float | np.floating[Any], zarr_format: ZarrFormat) -> JS raise ValueError(f"Invalid zarr format: {zarr_format}. 
Expected 2 or 3.") -def complex_to_json_v2(data: complex | np.complex_) -> JSONFloat: +def complex_to_json_v2(data: complex | np.complexfloating) -> JSONFloat: return float_to_json_v2(data) -def complex_to_json_v3(data: complex | np.complex_) -> tuple[JSONFloat, JSONFloat]: +def complex_to_json_v3(data: complex | np.complexfloating) -> tuple[JSONFloat, JSONFloat]: return float_to_json_v3(data.real), float_to_json_v3(data.imag) def complex_to_json( - data: complex | np.complex_, zarr_format: ZarrFormat + data: complex | np.complexfloating, zarr_format: ZarrFormat ) -> tuple[JSONFloat, JSONFloat] | JSONFloat: if zarr_format == 2: return complex_to_json_v2(data) @@ -88,7 +103,7 @@ def complex_to_json( raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") -def float_from_json_v2(data: JSONFloat, dtype: np.floating[Any]) -> np.float_: +def float_from_json_v2(data: JSONFloat, dtype: np.floating[Any]) -> np.floating[Any]: if data == "NaN": _data = np.nan elif data == "Infinity": @@ -113,21 +128,24 @@ def float_from_json(data: JSONFloat, dtype: Any, zarr_format: ZarrFormat) -> np. raise ValueError(f"Invalid zarr format: {zarr_format}. 
Expected 2 or 3.") -def complex_from_json_v2(data: JSONFloat, dtype: Any) -> np.complex_: +def complex_from_json_v2(data: JSONFloat, dtype: Any) -> np.complexfloating: return dtype.type(data) -def complex_from_json_v3(data: tuple[JSONFloat, JSONFloat], dtype: Any) -> np.complex_: - return dtype.type(data[0] + 1j * data[1]) +def complex_from_json_v3(data: tuple[JSONFloat, JSONFloat], dtype: Any) -> np.complexfloating: + return dtype.type(complex(*data)) def complex_from_json( data: tuple[JSONFloat, JSONFloat], dtype: Any, zarr_format: ZarrFormat -) -> np.complex_: +) -> np.complexfloating: if zarr_format == 2: return complex_from_json_v2(data, dtype) else: - return complex_from_json_v3(data, dtype) + if check_json_complex_float(data): + return complex_from_json_v3(data, dtype) + else: + raise TypeError(f"Invalid type: {data}. Expected a sequence of two numbers.") raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") @@ -203,7 +221,7 @@ def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> bool: def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.bool_: - if check_json_bool(data): + if check_str(data): return self.to_numpy(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. Expected a boolean.") @@ -222,13 +240,13 @@ class Int8(DTypeBase): def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int8DType: return super().to_numpy(endianness=endianness) - def to_json_value(self, data: np.generic) -> int: + def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: return int(data) def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.int8: - if check_json_int(data): + if check_int(data): return self.to_numpy(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. 
Expected an integer.") @@ -247,13 +265,13 @@ class UInt8(DTypeBase): def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt8DType: return super().to_numpy(endianness=endianness) - def to_json_value(self, data: np.generic) -> int: + def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: return int(data) def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.uint8: - if check_json_int(data): + if check_int(data): return self.to_numpy(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") @@ -272,13 +290,13 @@ class Int16(DTypeBase): def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int16DType: return super().to_numpy(endianness=endianness) - def to_json_value(self, data: np.generic) -> int: + def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: return int(data) def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.int16: - if check_json_int(data): + if check_int(data): return self.to_numpy(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") @@ -297,13 +315,13 @@ class UInt16(DTypeBase): def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt16DType: return super().to_numpy(endianness=endianness) - def to_json_value(self, data: np.generic) -> int: + def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: return int(data) def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.uint16: - if check_json_int(data): + if check_int(data): return self.to_numpy(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. 
Expected an integer.") @@ -322,13 +340,13 @@ class Int32(DTypeBase): def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int32DType: return super().to_numpy(endianness=endianness) - def to_json_value(self, data: np.generic) -> int: + def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: return int(data) def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.int32: - if check_json_int(data): + if check_int(data): return self.to_numpy(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") @@ -347,13 +365,13 @@ class UInt32(DTypeBase): def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt32DType: return super().to_numpy(endianness=endianness) - def to_json_value(self, data: np.generic) -> int: + def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: return int(data) def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.uint32: - if check_json_int(data): + if check_int(data): return self.to_numpy(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") @@ -372,11 +390,13 @@ class Int64(DTypeBase): def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int64DType: return super().to_numpy(endianness=endianness) - def to_json_value(self, data: np.generic) -> int: + def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: return int(data) - def from_json_value(self, data: JSON, *, endianness: Endianness | None = None) -> np.int64: - if check_json_int(data): + def from_json_value( + self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None + ) -> np.int64: + if check_int(data): return self.to_numpy(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. 
Expected an integer.") @@ -395,11 +415,13 @@ class UInt64(DTypeBase): def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt64DType: return super().to_numpy(endianness=endianness) - def to_json_value(self, data: np.generic) -> int: + def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: return int(data) - def from_json_value(self, data: JSON, *, endianness: Endianness | None = None) -> np.uint64: - if check_json_int(data): + def from_json_value( + self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None + ) -> np.uint64: + if check_int(data): return self.to_numpy(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") @@ -418,10 +440,12 @@ class Float16(DTypeBase): def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Float16DType: return super().to_numpy(endianness=endianness) - def to_json_value(self, data: np.generic) -> float: + def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> float: return float(data) - def from_json_value(self, data: JSON, *, endianness: Endianness | None = None) -> np.float16: + def from_json_value( + self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None + ) -> np.float16: if check_json_float(data): return self.to_numpy(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. 
Expected a float.") @@ -441,10 +465,12 @@ class Float32(DTypeBase): def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Float32DType: return super().to_numpy(endianness=endianness) - def to_json_value(self, data: np.generic) -> float: + def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> float: return float(data) - def from_json_value(self, data: JSON, *, endianness: Endianness | None = None) -> np.float32: + def from_json_value( + self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None + ) -> np.float32: if check_json_float(data): return self.to_numpy(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. Expected a float.") @@ -464,10 +490,12 @@ class Float64(DTypeBase): def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Float64DType: return super().to_numpy(endianness=endianness) - def to_json_value(self, data: np.generic) -> float: + def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> float: return float(data) - def from_json_value(self, data: JSON, *, endianness: Endianness | None = None) -> np.float64: + def from_json_value( + self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None + ) -> np.float64: if check_json_float(data): return float_from_json(data, dtype=self.to_numpy(endianness=endianness)) raise TypeError(f"Invalid type: {data}. 
Expected a float.") @@ -487,13 +515,19 @@ class Complex64(DTypeBase): def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Complex64DType: return super().to_numpy(endianness=endianness) - def to_json_value(self, data: np.generic) -> float: - return float(data) + def to_json_value( + self, data: np.generic, zarr_format: ZarrFormat + ) -> tuple[JSONFloat, JSONFloat]: + return complex_to_json(data, zarr_format) - def from_json_value(self, data: JSON, *, endianness: Endianness | None = None) -> np.complex64: - if check_json_float(data): - return self.to_numpy(endianness=endianness).type(data) - raise TypeError(f"Invalid type: {data}. Expected a float.") + def from_json_value( + self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None + ) -> np.complex64: + if check_json_complex_float(data): + return complex_from_json( + data, dtype=self.to_numpy(endianness=endianness), zarr_format=zarr_format + ) + raise TypeError(f"Invalid type: {data}. Expected a complex float.") register_data_type(Complex64) @@ -501,7 +535,7 @@ def from_json_value(self, data: JSON, *, endianness: Endianness | None = None) - @dataclass(frozen=True, kw_only=True) class Complex128(DTypeBase): - name = "complex64" + name = "complex128" item_size = 32 kind = "numeric" numpy_character_code = "D" @@ -510,6 +544,20 @@ class Complex128(DTypeBase): def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Complex128DType: return super().to_numpy(endianness=endianness) + def to_json_value( + self, data: np.generic, zarr_format: ZarrFormat + ) -> tuple[JSONFloat, JSONFloat]: + return complex_to_json(data, zarr_format) + + def from_json_value( + self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None + ) -> np.complex128: + if check_json_complex_float(data): + return complex_from_json( + data, dtype=self.to_numpy(endianness=endianness), zarr_format=zarr_format + ) + raise TypeError(f"Invalid type: {data}. 
Expected a complex float.") + register_data_type(Complex128) @@ -536,6 +584,21 @@ def to_numpy(self, endianness: Endianness | None = "native") -> np.dtype[np.byte endianness_code = endianness_to_numpy_str(endianness) return np.dtype(endianness_code + self.numpy_character_code + str(self.length)) + def to_json_value( + self, data: np.generic, *, zarr_format: ZarrFormat, endianness: Endianness | None = None + ) -> str: + return data.tobytes().decode("ascii") + + def from_json_value( + self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None + ) -> np.bytes_: + if check_str(data): + return self.to_numpy(endianness=endianness).type(data.encode("ascii")) + raise TypeError(f"Invalid type: {data}. Expected a string.") + + +register_data_type(StaticByteString) + @dataclass(frozen=True, kw_only=True) class StaticRawBytes(DTypeBase, Flexible): @@ -559,8 +622,17 @@ def to_numpy(self, endianness: Endianness | None = "native") -> np.dtype[np.void endianness_code = endianness_to_numpy_str(endianness) return np.dtype(endianness_code + self.numpy_character_code + str(self.length)) + def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> tuple[int, ...]: + return tuple(*data.tobytes()) -register_data_type(StaticByteString) + def from_json_value( + self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None + ) -> np.void: + # todo: check that this is well-formed + return self.to_numpy(endianness=endianness).type(bytes(data)) + + +register_data_type(StaticRawBytes) if _NUMPY_SUPPORTS_VLEN_STRING: @@ -583,6 +655,14 @@ def to_numpy( endianness_code = endianness_to_numpy_str(endianness) return np.dtype(endianness_code + self.numpy_character_code) + def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: + return str(data) + + def from_json_value( + self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None + ) -> "np.dtypes.StringDType": + return 
self.to_numpy(endianness=endianness).type(data) + else: @dataclass(frozen=True, kw_only=True) @@ -602,6 +682,14 @@ def to_numpy( endianness_code = endianness_to_numpy_str(endianness) return np.dtype(endianness_code + self.numpy_character_code) + def to_json_value(self, data, *, zarr_format: ZarrFormat) -> str: + return str(data) + + def from_json_value( + self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None + ) -> np.dtypes.ObjectDType: + return self.to_numpy(endianness=endianness).type(data) + register_data_type(VlenString) @@ -628,20 +716,30 @@ def to_numpy(self, endianness: Endianness | None = "native") -> np.dtype[np.str_ endianness_code = endianness_to_numpy_str(endianness) return np.dtype(endianness_code + self.numpy_character_code + str(self.length)) + def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: + return str(data) + + def from_json_value( + self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None + ) -> np.str_: + if not check_str(data): + raise TypeError(f"Invalid type: {data}. 
Expected a string.") + return self.to_numpy(endianness=endianness).type(data) + register_data_type(StaticUnicodeString) -def resolve_dtype(dtype: npt.DTypeLike | DTypeBase) -> DTypeBase: +def resolve_dtype(dtype: npt.DTypeLike | DTypeBase | dict[str, JSON]) -> DTypeBase: from zarr.registry import get_data_type_from_numpy if isinstance(dtype, DTypeBase): return dtype - cls = get_data_type_from_numpy(dtype) - return cls.from_numpy(dtype) - + elif isinstance(dtype, dict): + return get_data_type_from_dict(dtype) + else: + return get_data_type_from_numpy(dtype) -register_data_type(StaticRawBytes) INTEGER_DTYPE = Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64 FLOAT_DTYPE = Float16 | Float32 | Float64 diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 87bb001164..b117e00dd0 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -1,20 +1,9 @@ from __future__ import annotations -import warnings -from typing import TYPE_CHECKING, TypedDict, overload +from typing import TYPE_CHECKING, TypedDict from zarr.abc.metadata import Metadata from zarr.core.buffer.core import default_buffer_prototype -from zarr.core.metadata.dtype import ( - COMPLEX_DTYPE, - FLOAT_DTYPE, - INTEGER_DTYPE, - STRING_DTYPE, - Bool, - DTypeBase, - StaticRawBytes, - resolve_dtype, -) if TYPE_CHECKING: from collections.abc import Callable @@ -23,16 +12,18 @@ from zarr.core.buffer import Buffer, BufferPrototype from zarr.core.chunk_grids import ChunkGrid from zarr.core.common import JSON, ChunkCoords + from zarr.core.metadata.dtype import ( + DTypeBase, + ) import json -from collections.abc import Iterable, Sequence +from collections.abc import Iterable from dataclasses import dataclass, field, replace from enum import Enum from typing import Any, Literal import numcodecs.abc import numpy as np -import numpy.typing as npt from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec from zarr.core.array_spec import 
ArrayConfig, ArraySpec @@ -48,7 +39,7 @@ from zarr.core.config import config from zarr.core.metadata.common import parse_attributes from zarr.errors import MetadataValidationError, NodeTypeValidationError -from zarr.registry import get_codec_class +from zarr.registry import get_codec_class, get_data_type_by_name, get_data_type_from_dict DEFAULT_DTYPE = "float64" @@ -259,7 +250,7 @@ def __init__( self, *, shape: Iterable[int], - data_type: npt.DTypeLike | DTypeBase, + data_type: DTypeBase, chunk_grid: dict[str, JSON] | ChunkGrid, chunk_key_encoding: ChunkKeyEncodingLike, fill_value: object, @@ -272,12 +263,12 @@ def __init__( Because the class is a frozen dataclass, we set attributes using object.__setattr__ """ shape_parsed = parse_shapelike(shape) - data_type_parsed = resolve_dtype(data_type) + data_type_parsed = data_type chunk_grid_parsed = ChunkGrid.from_dict(chunk_grid) chunk_key_encoding_parsed = ChunkKeyEncoding.from_dict(chunk_key_encoding) dimension_names_parsed = parse_dimension_names(dimension_names) # we pass a string here rather than an enum to make mypy happy - fill_value_parsed = parse_fill_value(fill_value, data_type_parsed) + fill_value_parsed = data_type_parsed.to_numpy().type(fill_value) attributes_parsed = parse_attributes(attributes) codecs_parsed_partial = parse_codecs(codecs) storage_transformers_parsed = parse_storage_transformers(storage_transformers) @@ -392,7 +383,8 @@ def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str: return self.chunk_key_encoding.encode_chunk_key(chunk_coords) def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: - d = _replace_special_floats(self.to_dict()) + d = self.to_dict() + # d = _replace_special_floats(self.to_dict()) return {ZARR_JSON: prototype.buffer.from_bytes(json.dumps(d, cls=V3JsonEncoder).encode())} @classmethod @@ -405,8 +397,13 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: # check that the node_type attribute is correct _ = 
parse_node_type_array(_data.pop("node_type")) - # check that the data_type attribute is valid - data_type = DataType.parse(_data.pop("data_type")) + data_type_json = _data.pop("data_type") + if isinstance(data_type_json, str): + # check that the data_type attribute is valid + data_type = get_data_type_by_name(data_type_json) + + else: + data_type = get_data_type_from_dict(data_type_json) # dimension_names key is optional, normalize missing to `None` _data["dimension_names"] = _data.pop("dimension_names", None) @@ -416,7 +413,9 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: def to_dict(self) -> dict[str, JSON]: out_dict = super().to_dict() - + out_dict["fill_value"] = self.data_type.to_json_value( + self.fill_value, zarr_format=self.zarr_format + ) if not isinstance(out_dict, dict): raise TypeError(f"Expected dict. Got {type(out_dict)}.") @@ -424,6 +423,9 @@ def to_dict(self) -> dict[str, JSON]: # the metadata document if out_dict["dimension_names"] is None: out_dict.pop("dimension_names") + # if data_type has no configuration, we just serialize the name + if "configuration" not in out_dict["data_type"]: + out_dict["data_type"] = out_dict["data_type"]["name"] return out_dict def update_shape(self, shape: ChunkCoords) -> Self: @@ -431,147 +433,3 @@ def update_shape(self, shape: ChunkCoords) -> Self: def update_attributes(self, attributes: dict[str, JSON]) -> Self: return replace(self, attributes=attributes) - - -# enum Literals can't be used in typing, so we have to restate all of the V3 dtypes as types -# https://github.com/python/typing/issues/781 - -BOOL = np.bool_ -INTEGER = np.int8 | np.int16 | np.int32 | np.int64 | np.uint8 | np.uint16 | np.uint32 | np.uint64 -FLOAT = np.float16 | np.float32 | np.float64 -COMPLEX = np.complex64 | np.complex128 - -STRING = np.str_ -BYTES = np.bytes_ - - -@overload -def parse_fill_value( - fill_value: complex | str | bytes | np.generic | Sequence[Any] | bool, - dtype: Bool, -) -> BOOL: ... 
- - -@overload -def parse_fill_value( - fill_value: complex | str | bytes | np.generic | Sequence[Any] | bool, - dtype: INTEGER_DTYPE, -) -> INTEGER: ... - - -@overload -def parse_fill_value( - fill_value: complex | str | bytes | np.generic | Sequence[Any] | bool, - dtype: FLOAT_DTYPE, -) -> FLOAT: ... - - -@overload -def parse_fill_value( - fill_value: complex | str | bytes | np.generic | Sequence[Any] | bool, - dtype: COMPLEX_DTYPE, -) -> COMPLEX: ... - - -@overload -def parse_fill_value( - fill_value: complex | str | bytes | np.generic | Sequence[Any] | bool, - dtype: STRING_DTYPE, -) -> STRING: ... - - -@overload -def parse_fill_value( - fill_value: complex | str | bytes | np.generic | Sequence[Any] | bool, - dtype: StaticRawBytes, -) -> BYTES: ... - - -def parse_fill_value( - fill_value: Any, - dtype: DTypeBase, -) -> np.generic: - """ - Parse `fill_value`, a potential fill value, into an instance of `dtype`, a data type. - If `fill_value` is `None`, then this function will return the result of casting the value 0 - to the provided data type. Otherwise, `fill_value` will be cast to the provided data type. - - Note that some numpy dtypes use very permissive casting rules. For example, - `np.bool_({'not remotely a bool'})` returns `True`. Thus this function should not be used for - validating that the provided fill value is a valid instance of the data type. - - Parameters - ---------- - fill_value : Any - A potential fill value. - dtype : DTypeBase - A valid Zarr format 3 DataType. 
- - Returns - ------- - A scalar instance of `dtype` - """ - if fill_value is None: - raise ValueError("Fill value cannot be None") - - if dtype.kind == "string": - return np.str_(fill_value) - if dtype.kind == "bytes": - return np.bytes_(fill_value) - - # the rest are numeric types - np_dtype = dtype.to_numpy() - - if isinstance(fill_value, Sequence) and not isinstance(fill_value, str): - if isindata_type in (DataType.complex64, DataType.complex128): - if len(fill_value) == 2: - decoded_fill_value = tuple( - SPECIAL_FLOATS_ENCODED.get(value, value) for value in fill_value - ) - # complex datatypes serialize to JSON arrays with two elements - return np_dtype.type(complex(*decoded_fill_value)) - else: - msg = ( - f"Got an invalid fill value for complex data type {data_type.value}." - f"Expected a sequence with 2 elements, but {fill_value!r} has " - f"length {len(fill_value)}." - ) - raise ValueError(msg) - msg = f"Cannot parse non-string sequence {fill_value!r} as a scalar with type {data_type.value}." - raise TypeError(msg) - - # Cast the fill_value to the given dtype - try: - # This warning filter can be removed after Zarr supports numpy>=2.0 - # The warning is saying that the future behavior of out of bounds casting will be to raise - # an OverflowError. In the meantime, we allow overflow and catch cases where - # fill_value != casted_value below. 
- with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=DeprecationWarning) - casted_value = np.dtype(np_dtype).type(fill_value) - except (ValueError, OverflowError, TypeError) as e: - raise ValueError(f"fill value {fill_value!r} is not valid for dtype {data_type}") from e - # Check if the value is still representable by the dtype - if (fill_value == "NaN" and np.isnan(casted_value)) or ( - fill_value in ["Infinity", "-Infinity"] and not np.isfinite(casted_value) - ): - pass - elif np_dtype.kind == "f": - # float comparison is not exact, especially when dtype None: self[fully_qualified_name(cls)] = cls +@dataclass(frozen=True, kw_only=True) +class DataTypeRegistry: + contents: dict[str, type[DTypeBase]] = field(default_factory=dict, init=False) + lazy_load_list: list[EntryPoint] = field(default_factory=list, init=False) + + def lazy_load(self) -> None: + for e in self.lazy_load_list: + self.register(e.load()) + + self.lazy_load_list.clear() + + def register(self: Self, cls: type[DTypeBase], clobber: bool = False) -> None: + if cls.name in self.contents and not clobber: + raise ValueError( + f"Data type {cls.name} already registered. Use clobber=True to overwrite." 
+ ) + self.contents[cls.name] = cls + + def get(self, key: str) -> type[DTypeBase]: + return self.contents[key] + + __codec_registries: dict[str, Registry[Codec]] = defaultdict(Registry) __pipeline_registry: Registry[CodecPipeline] = Registry() __buffer_registry: Registry[Buffer] = Registry() __ndbuffer_registry: Registry[NDBuffer] = Registry() -__data_type_registry: Registry[DTypeBase] = Registry() +__data_type_registry = DataTypeRegistry() __v3_dtype_registry: Registry[ZarrDType] = Registry() __v2_dtype_registry: Registry[ZarrDType] = Registry() @@ -103,8 +126,8 @@ def _collect_entrypoints() -> list[Registry[Any]]: __ndbuffer_registry.lazy_load_list.extend(entry_points.select(group="zarr.ndbuffer")) __ndbuffer_registry.lazy_load_list.extend(entry_points.select(group="zarr", name="ndbuffer")) - __data_type_registry.lazy_load_list.extend(entry_points.select(group="zarr.data_type")) - __data_type_registry.lazy_load_list.extend(entry_points.select(group="zarr", name="data_type")) + # __data_type_registry.lazy_load_list.extend(entry_points.select(group="zarr.data_type")) + # __data_type_registry.lazy_load_list.extend(entry_points.select(group="zarr", name="data_type")) __v3_dtype_registry.lazy_load_list.extend(entry_points.select(group="zarr.v3dtype")) __v3_dtype_registry.lazy_load_list.extend(entry_points.select(group="zarr", name="v3dtype")) @@ -303,22 +326,35 @@ def get_ndbuffer_class(reload_config: bool = False) -> type[NDBuffer]: ) -def get_data_type(dtype: str) -> type[DTypeBase]: +def get_data_type_by_name(dtype: str, configuration: dict[str, JSON] | None = None) -> DTypeBase: __data_type_registry.lazy_load() + if configuration is None: + _configuration = {} + else: + _configuration = configuration maybe_dtype_cls = __data_type_registry.get(dtype) if maybe_dtype_cls is None: raise ValueError(f"No data type class matching name {dtype}") - return maybe_dtype_cls + return maybe_dtype_cls.from_dict(_configuration) -def get_data_type_from_numpy(dtype: npt.DTypeLike) 
-> type[DTypeBase]: +def get_data_type_from_dict(dtype: dict[str, JSON]) -> DTypeBase: + __data_type_registry.lazy_load() + dtype_name = dtype["name"] + dtype_cls = __data_type_registry.get(dtype_name) + if dtype_cls is None: + raise ValueError(f"No data type class matching name {dtype_name}") + return dtype_cls.from_dict(dtype.get("configuration", {})) + + +def get_data_type_from_numpy(dtype: npt.DTypeLike) -> DTypeBase: np_dtype = np.dtype(dtype) __data_type_registry.lazy_load() - for val in __data_type_registry.values(): + for val in __data_type_registry.contents.values(): if val.numpy_character_code == np_dtype.char: - return val + return val.from_numpy(np_dtype) raise ValueError( - f"numpy dtype '{dtype}' does not have a corresponding Zarr dtype in: {list(__data_type_registry)}." + f"numpy dtype '{dtype}' does not have a corresponding Zarr dtype in: {list(__data_type_registry.contents)}." ) diff --git a/tests/test_array.py b/tests/test_array.py index 72c1bbf1b7..ce149d0f9a 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -43,7 +43,7 @@ from zarr.core.common import JSON, MemoryOrder, ZarrFormat from zarr.core.group import AsyncGroup from zarr.core.indexing import BasicIndexer, ceildiv -from zarr.core.metadata.v3 import ArrayV3Metadata, DataType +from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.core.sync import sync from zarr.errors import ContainsArrayError, ContainsGroupError from zarr.storage import LocalStore, MemoryStore, StorePath @@ -509,7 +509,7 @@ def test_info_v3(self, chunks: tuple[int, int], shards: tuple[int, int] | None) result = arr.info expected = ArrayInfo( _zarr_format=3, - _data_type=DataType.parse("float64"), + _data_type=arr.metadata.data_type, _shape=(8, 8), _chunk_shape=chunks, _shard_shape=shards, @@ -534,7 +534,7 @@ def test_info_complete(self, chunks: tuple[int, int], shards: tuple[int, int] | result = arr.info_complete() expected = ArrayInfo( _zarr_format=3, - _data_type=DataType.parse("float64"), + 
_data_type=arr.metadata.data_type, _shape=(8, 8), _chunk_shape=chunks, _shard_shape=shards, @@ -594,7 +594,7 @@ async def test_info_v3_async( result = arr.info expected = ArrayInfo( _zarr_format=3, - _data_type=DataType.parse("float64"), + _data_type=arr.metadata.data_type, _shape=(8, 8), _chunk_shape=chunks, _shard_shape=shards, @@ -621,7 +621,7 @@ async def test_info_complete_async( result = await arr.info_complete() expected = ArrayInfo( _zarr_format=3, - _data_type=DataType.parse("float64"), + _data_type=arr.metadata.data_type, _shape=(8, 8), _chunk_shape=chunks, _shard_shape=shards, diff --git a/tests/test_metadata/test_v3.py b/tests/test_metadata/test_v3.py index 4f6b2a5de6..74caf2ab43 100644 --- a/tests/test_metadata/test_v3.py +++ b/tests/test_metadata/test_v3.py @@ -12,14 +12,14 @@ from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding, V2ChunkKeyEncoding from zarr.core.config import config from zarr.core.group import GroupMetadata, parse_node_type +from zarr.core.metadata.dtype import complex_from_json from zarr.core.metadata.v3 import ( ArrayV3Metadata, - default_fill_value, parse_dimension_names, - parse_fill_value, parse_zarr_format, ) from zarr.errors import MetadataValidationError +from zarr.registry import get_data_type_from_numpy if TYPE_CHECKING: from collections.abc import Sequence @@ -107,90 +107,32 @@ def parse_dimension_names_valid(data: Sequence[str] | None) -> None: assert parse_dimension_names(data) == data -@pytest.mark.parametrize("dtype_str", dtypes) -def test_default_fill_value(dtype_str: str) -> None: - """ - Test that parse_fill_value(None, dtype) results in the 0 value for the given dtype. 
- """ - dtype = DataType(dtype_str) - fill_value = default_fill_value(dtype) - if dtype == DataType.string: - assert fill_value == "" - elif dtype == DataType.bytes: - assert fill_value == b"" - else: - assert fill_value == dtype.to_numpy().type(0) - - -@pytest.mark.parametrize( - ("fill_value", "dtype_str"), - [ - (True, "bool"), - (False, "bool"), - (-8, "int8"), - (0, "int16"), - (1e10, "uint64"), - (-999, "float32"), - (1e32, "float64"), - (float("NaN"), "float64"), - (np.nan, "float64"), - (np.inf, "float64"), - (-1 * np.inf, "float64"), - (0j, "complex64"), - ], -) -def test_parse_fill_value_valid(fill_value: Any, dtype_str: str) -> None: - """ - Test that parse_fill_value(fill_value, dtype) casts fill_value to the given dtype. - """ - parsed = parse_fill_value(fill_value, dtype_str) - - if np.isnan(fill_value): - assert np.isnan(parsed) - else: - assert parsed == DataType(dtype_str).to_numpy().type(fill_value) - - -@pytest.mark.parametrize("fill_value", ["not a valid value"]) -@pytest.mark.parametrize("dtype_str", [*int_dtypes, *float_dtypes, *complex_dtypes]) -def test_parse_fill_value_invalid_value(fill_value: Any, dtype_str: str) -> None: - """ - Test that parse_fill_value(fill_value, dtype) raises ValueError for invalid values. - This test excludes bool because the bool constructor takes anything. 
- """ - with pytest.raises(ValueError): - parse_fill_value(fill_value, dtype_str) - - -@pytest.mark.parametrize("fill_value", [[1.0, 0.0], [0, 1], complex(1, 1), np.complex64(0)]) +@pytest.mark.parametrize("fill_value", [[1.0, 0.0], [0, 1]]) @pytest.mark.parametrize("dtype_str", [*complex_dtypes]) -def test_parse_fill_value_complex(fill_value: Any, dtype_str: str) -> None: +def test_jsonify_fill_value_complex(fill_value: Any, dtype_str: str) -> None: """ Test that parse_fill_value(fill_value, dtype) correctly handles complex values represented as length-2 sequences """ - dtype = DataType(dtype_str) - if isinstance(fill_value, list): - expected = dtype.to_numpy().type(complex(*fill_value)) - else: - expected = dtype.to_numpy().type(fill_value) - assert expected == parse_fill_value(fill_value, dtype_str) + zarr_format = 3 + dtype = get_data_type_from_numpy(dtype_str) + expected = dtype.to_numpy().type(complex(*fill_value)) + observed = dtype.from_json_value(fill_value, zarr_format=zarr_format) + assert observed == expected + assert dtype.to_json_value(observed, zarr_format=zarr_format) == tuple(fill_value) -@pytest.mark.parametrize("fill_value", [[1.0, 0.0, 3.0], [0, 1, 3], [1]]) @pytest.mark.parametrize("dtype_str", [*complex_dtypes]) -def test_parse_fill_value_complex_invalid(fill_value: Any, dtype_str: str) -> None: +@pytest.mark.parametrize("data", [[1.0, 0.0, 3.0], [0, 1, 3], [1]]) +def test_complex_to_json_invalid(data: object, dtype_str: str) -> None: """ Test that parse_fill_value(fill_value, dtype) correctly rejects sequences with length not equal to 2 """ - match = ( - f"Got an invalid fill value for complex data type {dtype_str}." - f"Expected a sequence with 2 elements, but {fill_value} has " - f"length {len(fill_value)}." - ) - with pytest.raises(ValueError, match=re.escape(match)): - parse_fill_value(fill_value=fill_value, dtype=dtype_str) + dtype_instance = get_data_type_from_numpy(dtype_str) + match = f"Invalid type: {data}. 
Expected a sequence of two numbers." + with pytest.raises(TypeError, match=re.escape(match)): + complex_from_json(data=data, dtype=dtype_instance, zarr_format=3) @pytest.mark.parametrize("fill_value", [{"foo": 10}]) @@ -200,8 +142,9 @@ def test_parse_fill_value_invalid_type(fill_value: Any, dtype_str: str) -> None: Test that parse_fill_value(fill_value, dtype) raises TypeError for invalid non-sequential types. This test excludes bool because the bool constructor takes anything. """ - with pytest.raises(ValueError, match=r"fill value .* is not valid for dtype .*"): - parse_fill_value(fill_value, dtype_str) + dtype_instance = get_data_type_from_numpy(dtype_str) + with pytest.raises(TypeError, match=f"Invalid type: {fill_value}"): + dtype_instance.from_json_value(fill_value, zarr_format=3) @pytest.mark.parametrize( @@ -220,9 +163,9 @@ def test_parse_fill_value_invalid_type_sequence(fill_value: Any, dtype_str: str) This test excludes bool because the bool constructor takes anything, and complex because complex values can be created from length-2 sequences. 
""" - match = f"Cannot parse non-string sequence {fill_value} as a scalar with type {dtype_str}" - with pytest.raises(TypeError, match=re.escape(match)): - parse_fill_value(fill_value, dtype_str) + dtype_instance = get_data_type_from_numpy(dtype_str) + with pytest.raises(TypeError, match=re.escape(f"Invalid type: {fill_value}")): + dtype_instance.from_json_value(fill_value, zarr_format=3) @pytest.mark.parametrize("chunk_grid", ["regular"]) @@ -244,7 +187,7 @@ def test_metadata_to_dict( storage_transformers: tuple[dict[str, JSON]] | None, ) -> None: shape = (1, 2, 3) - data_type = DataType.uint8 + data_type_str = "uint8" if chunk_grid == "regular": cgrid = {"name": "regular", "configuration": {"chunk_shape": (1, 1, 1)}} @@ -268,7 +211,7 @@ def test_metadata_to_dict( "node_type": "array", "shape": shape, "chunk_grid": cgrid, - "data_type": data_type, + "data_type": data_type_str, "chunk_key_encoding": cke, "codecs": tuple(c.to_dict() for c in codecs), "fill_value": fill_value, @@ -312,46 +255,26 @@ def test_json_indent(indent: int): assert d == json.dumps(json.loads(d), indent=indent).encode() -# @pytest.mark.parametrize("fill_value", [-1, 0, 1, 2932897]) -# @pytest.mark.parametrize("precision", ["ns", "D"]) -# async def test_datetime_metadata(fill_value: int, precision: str) -> None: -# metadata_dict = { -# "zarr_format": 3, -# "node_type": "array", -# "shape": (1,), -# "chunk_grid": {"name": "regular", "configuration": {"chunk_shape": (1,)}}, -# "data_type": f" None: +@pytest.mark.xfail(reason="Data type not supported yet") +@pytest.mark.parametrize("fill_value", [-1, 0, 1, 2932897]) +@pytest.mark.parametrize("precision", ["ns", "D"]) +async def test_datetime_metadata(fill_value: int, precision: str) -> None: metadata_dict = { "zarr_format": 3, "node_type": "array", "shape": (1,), "chunk_grid": {"name": "regular", "configuration": {"chunk_shape": (1,)}}, - "data_type": " Date: Thu, 27 Feb 2025 18:11:18 +0100 Subject: [PATCH 007/130] tweak json type guards --- 
src/zarr/core/metadata/dtype.py | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/src/zarr/core/metadata/dtype.py b/src/zarr/core/metadata/dtype.py index 8f940b0e0b..542cc85e5f 100644 --- a/src/zarr/core/metadata/dtype.py +++ b/src/zarr/core/metadata/dtype.py @@ -31,14 +31,15 @@ def endianness_to_numpy_str(endianness: Endianness | None) -> Literal[">", "<", ) -def check_str(data: JSON) -> TypeGuard[bool]: +def check_json_bool(data: JSON) -> TypeGuard[bool]: return bool(isinstance(data, bool)) +def check_json_str(data: JSON) -> TypeGuard[str]: + return bool(isinstance(data, str)) -def check_int(data: JSON) -> TypeGuard[int]: +def check_json_int(data: JSON) -> TypeGuard[int]: return bool(isinstance(data, int)) - def check_json_float(data: JSON) -> TypeGuard[float]: if data == "NaN" or data == "Infinity" or data == "-Infinity": return True @@ -56,10 +57,6 @@ def check_json_complex_float(data: JSON) -> TypeGuard[tuple[JSONFloat, JSONFloat ) -def check_str(data: JSON) -> TypeGuard[str]: - return bool(isinstance(data, str)) - - def float_to_json_v2(data: float | np.floating[Any]) -> JSONFloat: if np.isnan(data): return "NaN" @@ -221,7 +218,7 @@ def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> bool: def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.bool_: - if check_str(data): + if check_json_bool(data): return self.to_numpy(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. Expected a boolean.") @@ -246,7 +243,7 @@ def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.int8: - if check_int(data): + if check_json_int(data): return self.to_numpy(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. 
Expected an integer.") @@ -271,7 +268,7 @@ def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.uint8: - if check_int(data): + if check_json_int(data): return self.to_numpy(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") @@ -296,7 +293,7 @@ def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.int16: - if check_int(data): + if check_json_int(data): return self.to_numpy(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") @@ -321,7 +318,7 @@ def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.uint16: - if check_int(data): + if check_json_int(data): return self.to_numpy(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") @@ -346,7 +343,7 @@ def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.int32: - if check_int(data): + if check_json_int(data): return self.to_numpy(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") @@ -371,7 +368,7 @@ def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.uint32: - if check_int(data): + if check_json_int(data): return self.to_numpy(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. 
Expected an integer.") @@ -396,7 +393,7 @@ def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.int64: - if check_int(data): + if check_json_int(data): return self.to_numpy(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") @@ -421,7 +418,7 @@ def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.uint64: - if check_int(data): + if check_json_int(data): return self.to_numpy(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") @@ -592,7 +589,7 @@ def to_json_value( def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.bytes_: - if check_str(data): + if check_json_bool(data): return self.to_numpy(endianness=endianness).type(data.encode("ascii")) raise TypeError(f"Invalid type: {data}. Expected a string.") @@ -722,7 +719,7 @@ def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.str_: - if not check_str(data): + if not check_json_bool(data): raise TypeError(f"Invalid type: {data}. 
Expected a string.") return self.to_numpy(endianness=endianness).type(data) From b588f7025a86b6c003887d7a538b68b5c7025a28 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 27 Feb 2025 19:55:51 +0100 Subject: [PATCH 008/130] fix dtype sizes, adjust fill value parsing in from_dict, fix tests --- src/zarr/core/metadata/dtype.py | 15 ++++++++++----- src/zarr/core/metadata/v3.py | 17 ++++++++++------- tests/test_metadata/test_v3.py | 24 ++++++++++++------------ 3 files changed, 32 insertions(+), 24 deletions(-) diff --git a/src/zarr/core/metadata/dtype.py b/src/zarr/core/metadata/dtype.py index 542cc85e5f..008751adc5 100644 --- a/src/zarr/core/metadata/dtype.py +++ b/src/zarr/core/metadata/dtype.py @@ -34,12 +34,15 @@ def endianness_to_numpy_str(endianness: Endianness | None) -> Literal[">", "<", def check_json_bool(data: JSON) -> TypeGuard[bool]: return bool(isinstance(data, bool)) + def check_json_str(data: JSON) -> TypeGuard[str]: return bool(isinstance(data, str)) + def check_json_int(data: JSON) -> TypeGuard[int]: return bool(isinstance(data, int)) + def check_json_float(data: JSON) -> TypeGuard[float]: if data == "NaN" or data == "Infinity" or data == "-Infinity": return True @@ -254,7 +257,7 @@ def from_json_value( @dataclass(frozen=True, kw_only=True) class UInt8(DTypeBase): name = "uint8" - item_size = 2 + item_size = 1 kind = "numeric" numpy_character_code = "B" default = 0 @@ -488,13 +491,15 @@ def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Float64 return super().to_numpy(endianness=endianness) def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> float: - return float(data) + return float_to_json(data, zarr_format) def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.float64: if check_json_float(data): - return float_from_json(data, dtype=self.to_numpy(endianness=endianness)) + return float_from_json( + data, 
dtype=self.to_numpy(endianness=endianness), zarr_format=zarr_format + ) raise TypeError(f"Invalid type: {data}. Expected a float.") @@ -504,7 +509,7 @@ def from_json_value( @dataclass(frozen=True, kw_only=True) class Complex64(DTypeBase): name = "complex64" - item_size = 16 + item_size = 8 kind = "numeric" numpy_character_code = "F" default = 0.0 + 0.0j @@ -533,7 +538,7 @@ def from_json_value( @dataclass(frozen=True, kw_only=True) class Complex128(DTypeBase): name = "complex128" - item_size = 32 + item_size = 16 kind = "numeric" numpy_character_code = "D" default = 0.0 + 0.0j diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index b117e00dd0..ce1a8b77fa 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -263,28 +263,26 @@ def __init__( Because the class is a frozen dataclass, we set attributes using object.__setattr__ """ shape_parsed = parse_shapelike(shape) - data_type_parsed = data_type chunk_grid_parsed = ChunkGrid.from_dict(chunk_grid) chunk_key_encoding_parsed = ChunkKeyEncoding.from_dict(chunk_key_encoding) dimension_names_parsed = parse_dimension_names(dimension_names) - # we pass a string here rather than an enum to make mypy happy - fill_value_parsed = data_type_parsed.to_numpy().type(fill_value) + fill_value_parsed = data_type.to_numpy().type(fill_value) attributes_parsed = parse_attributes(attributes) codecs_parsed_partial = parse_codecs(codecs) storage_transformers_parsed = parse_storage_transformers(storage_transformers) array_spec = ArraySpec( shape=shape_parsed, - dtype=data_type_parsed.to_numpy(), + dtype=data_type.to_numpy(), fill_value=fill_value_parsed, config=ArrayConfig.from_dict({}), # TODO: config is not needed here. prototype=default_buffer_prototype(), # TODO: prototype is not needed here. 
) codecs_parsed = tuple(c.evolve_from_array_spec(array_spec) for c in codecs_parsed_partial) - validate_codecs(codecs_parsed_partial, data_type_parsed) + validate_codecs(codecs_parsed_partial, data_type) object.__setattr__(self, "shape", shape_parsed) - object.__setattr__(self, "data_type", data_type_parsed) + object.__setattr__(self, "data_type", data_type) object.__setattr__(self, "chunk_grid", chunk_grid_parsed) object.__setattr__(self, "chunk_key_encoding", chunk_key_encoding_parsed) object.__setattr__(self, "codecs", codecs_parsed) @@ -405,11 +403,16 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: else: data_type = get_data_type_from_dict(data_type_json) + # check that the fill value is consistent with the data type + fill_value_parsed = data_type.from_json_value(_data.pop("fill_value"), zarr_format=3) + # dimension_names key is optional, normalize missing to `None` _data["dimension_names"] = _data.pop("dimension_names", None) + # attributes key is optional, normalize missing to `None` _data["attributes"] = _data.pop("attributes", None) - return cls(**_data, data_type=data_type) # type: ignore[arg-type] + + return cls(**_data, fill_value=fill_value_parsed, data_type=data_type) # type: ignore[arg-type] def to_dict(self) -> dict[str, JSON]: out_dict = super().to_dict() diff --git a/tests/test_metadata/test_v3.py b/tests/test_metadata/test_v3.py index 74caf2ab43..41d8b9a4d5 100644 --- a/tests/test_metadata/test_v3.py +++ b/tests/test_metadata/test_v3.py @@ -12,7 +12,7 @@ from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding, V2ChunkKeyEncoding from zarr.core.config import config from zarr.core.group import GroupMetadata, parse_node_type -from zarr.core.metadata.dtype import complex_from_json +from zarr.core.metadata.dtype import Flexible, complex_from_json from zarr.core.metadata.v3 import ( ArrayV3Metadata, parse_dimension_names, @@ -278,7 +278,7 @@ async def test_datetime_metadata(fill_value: int, precision: str) -> None: 
@pytest.mark.parametrize( - ("data_type", "fill_value"), [("uint8", -1), ("int32", 22.5), ("float32", "foo")] + ("data_type", "fill_value"), [("uint8", {}), ("int32", [0, 1]), ("float32", "foo")] ) async def test_invalid_fill_value_raises(data_type: str, fill_value: float) -> None: metadata_dict = { @@ -288,10 +288,11 @@ async def test_invalid_fill_value_raises(data_type: str, fill_value: float) -> N "chunk_grid": {"name": "regular", "configuration": {"chunk_shape": (1,)}}, "data_type": data_type, "chunk_key_encoding": {"name": "default", "separator": "."}, - "codecs": (), + "codecs": ({"name": "bytes"},), "fill_value": fill_value, # this is not a valid fill value for uint8 } - with pytest.raises(ValueError, match=r"fill value .* is not valid for dtype .*"): + # multiple things can go wrong here, so we don't match on the error message. + with pytest.raises(TypeError): ArrayV3Metadata.from_dict(metadata_dict) @@ -323,13 +324,12 @@ async def test_special_float_fill_values(fill_value: str) -> None: @pytest.mark.parametrize("dtype_str", dtypes) def test_dtypes(dtype_str: str) -> None: - dt = DataType(dtype_str) + dt = get_data_type_from_numpy(dtype_str) np_dtype = dt.to_numpy() - if dtype_str not in vlen_dtypes: - # we can round trip "normal" dtypes - assert dt == DataType.from_numpy(np_dtype) - assert dt.byte_count == np_dtype.itemsize - assert dt.has_endianness == (dt.byte_count > 1) + + if not isinstance(dt, Flexible): + assert dt.item_size == np_dtype.itemsize else: - # return type for vlen types may vary depending on numpy version - assert dt.byte_count is None + assert dt.length == np_dtype.itemsize + + assert dt.numpy_character_code == np_dtype.char From 4ed41c6a9b5731d336239325ebcec3321c4ff585 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sun, 2 Mar 2025 12:54:57 +0100 Subject: [PATCH 009/130] mid-refactor commit --- src/zarr/core/_info.py | 4 +- src/zarr/core/array.py | 6 +- src/zarr/core/metadata/dtype.py | 345 ++++++-------------------------- 
src/zarr/core/metadata/v3.py | 8 +- src/zarr/registry.py | 18 +- 5 files changed, 80 insertions(+), 301 deletions(-) diff --git a/src/zarr/core/_info.py b/src/zarr/core/_info.py index 2ede547600..6b594583e2 100644 --- a/src/zarr/core/_info.py +++ b/src/zarr/core/_info.py @@ -7,7 +7,7 @@ from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec from zarr.core.common import ZarrFormat -from zarr.core.metadata.dtype import DTypeBase +from zarr.core.metadata.dtype import DTypeWrapper # from zarr.core.metadata.v3 import DataType @@ -80,7 +80,7 @@ class ArrayInfo: _type: Literal["Array"] = "Array" _zarr_format: ZarrFormat - _data_type: np.dtype[Any] | DTypeBase + _data_type: np.dtype[Any] | DTypeWrapper _shape: tuple[int, ...] _shard_shape: tuple[int, ...] | None = None _chunk_shape: tuple[int, ...] | None = None diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index c4da46da92..975408a01d 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -98,7 +98,7 @@ ArrayV3MetadataDict, T_ArrayMetadata, ) -from zarr.core.metadata.dtype import DTypeBase +from zarr.core.metadata.dtype import DTypeWrapper from zarr.core.metadata.v2 import ( _default_compressor, _default_filters, @@ -699,7 +699,7 @@ def _create_metadata_v3( if fill_value is None: # v3 spec will not allow a null fill value - fill_value_parsed = dtype.type(zarr_data_type.default) + fill_value_parsed = dtype.type(zarr_data_type._default_value) else: fill_value_parsed = fill_value @@ -1694,7 +1694,7 @@ async def info_complete(self) -> Any: def _info( self, count_chunks_initialized: int | None = None, count_bytes_stored: int | None = None ) -> Any: - _data_type: np.dtype[Any] | DTypeBase + _data_type: np.dtype[Any] | DTypeWrapper if isinstance(self.metadata, ArrayV2Metadata): _data_type = self.metadata.dtype else: diff --git a/src/zarr/core/metadata/dtype.py b/src/zarr/core/metadata/dtype.py index 008751adc5..106b3088d0 100644 --- a/src/zarr/core/metadata/dtype.py +++ 
b/src/zarr/core/metadata/dtype.py @@ -1,7 +1,7 @@ from abc import ABC, abstractmethod from collections.abc import Sequence from dataclasses import dataclass -from typing import Any, ClassVar, Literal, Self, TypeGuard, cast, get_args +from typing import Any, ClassVar, Generic, Literal, Self, TypeGuard, TypeVar, cast, get_args import numpy as np import numpy.typing as npt @@ -148,44 +148,28 @@ def complex_from_json( raise TypeError(f"Invalid type: {data}. Expected a sequence of two numbers.") raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") +TDType = TypeVar("TDType", bound=np.dtype[Any]) +TScalar = TypeVar("TScalar", bound=np.generic) @dataclass(frozen=True, kw_only=True) class Flexible: length: int - -class DTypeBase(ABC, Metadata): +class DTypeWrapper(Generic[TDType, TScalar], ABC, Metadata): name: ClassVar[str] - numpy_character_code: ClassVar[str] - item_size: ClassVar[int | None] + dtype_cls: ClassVar[type[TDType]] # this class will create a numpy dtype kind: ClassVar[DataTypeFlavor] - default: object - - def __init_subclass__(cls, **kwargs: object) -> None: - required_attrs = ["name", "numpy_character_code", "item_size", "kind", "default"] - for attr in required_attrs: - if not hasattr(cls, attr): - raise ValueError(f"{attr} is a required attribute for a Zarr dtype.") - - return super().__init_subclass__(**kwargs) + _default_value: object def to_dict(self) -> dict[str, JSON]: return {"name": self.name} - @classmethod - def from_numpy(cls, dtype: npt.DTypeLike) -> Self: - if np.dtype(dtype).char != cls.numpy_character_code: - raise ValueError( - f"Invalid dtype {dtype}. Expected dtype with character code == {cls.numpy_character_code}." 
- ) - return cls() - - def default_value(self: Self, *, endianness: Endianness | None = None) -> np.generic: - return cast(np.generic, self.to_numpy(endianness=endianness).type(self.default)) + def default_value(self: Self, *, endianness: Endianness | None = None) -> TScalar: + return cast(np.generic, self.to_numpy(endianness=endianness).type(self._default_value)) - def to_numpy(self: Self, *, endianness: Endianness | None = None) -> np.dtype[Any]: + def to_numpy(self: Self, *, endianness: Endianness | None = None) -> TDType: endian_str = endianness_to_numpy_str(endianness) - return np.dtype(endian_str + self.numpy_character_code) + return self.dtype_cls().newbyteorder(endian_str) @abstractmethod def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> JSON: @@ -197,7 +181,7 @@ def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> JSON: @abstractmethod def from_json_value( self: Self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> np.generic: + ) -> TScalar: """ Read a JSON-serializable value as a numpy scalar """ @@ -205,16 +189,11 @@ def from_json_value( @dataclass(frozen=True, kw_only=True) -class Bool(DTypeBase): +class Bool(DTypeWrapper[np.dtypes.BoolDType, np.bool_]): name = "bool" - item_size = 1 kind = "boolean" - numpy_character_code = "?" 
default = False - def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.BoolDType: - return super().to_numpy(endianness=endianness) - def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> bool: return bool(data) @@ -228,295 +207,128 @@ def from_json_value( register_data_type(Bool) - -@dataclass(frozen=True, kw_only=True) -class Int8(DTypeBase): - name = "int8" - item_size = 1 +class BaseInt(DTypeWrapper[TDType, TScalar]): kind = "numeric" - numpy_character_code = "b" default = 0 - def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int8DType: - return super().to_numpy(endianness=endianness) - def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: return int(data) def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> np.int8: + ) -> TScalar: if check_json_int(data): return self.to_numpy(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") +@dataclass(frozen=True, kw_only=True) +class Int8(BaseInt[np.dtypes.Int8DType, np.int8]): + name = "int8" + + register_data_type(Int8) @dataclass(frozen=True, kw_only=True) -class UInt8(DTypeBase): +class UInt8(DTypeWrapper[np.dtypes.UInt8DType, np.uint8]): name = "uint8" - item_size = 1 - kind = "numeric" - numpy_character_code = "B" - default = 0 - - def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt8DType: - return super().to_numpy(endianness=endianness) - - def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: - return int(data) - - def from_json_value( - self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> np.uint8: - if check_json_int(data): - return self.to_numpy(endianness=endianness).type(data) - raise TypeError(f"Invalid type: {data}. 
Expected an integer.") register_data_type(UInt8) @dataclass(frozen=True, kw_only=True) -class Int16(DTypeBase): +class Int16(DTypeWrapper[np.dtypes.Int16DType, np.int16]): name = "int16" - item_size = 2 - kind = "numeric" - numpy_character_code = "h" - default = 0 - - def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int16DType: - return super().to_numpy(endianness=endianness) - - def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: - return int(data) - - def from_json_value( - self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> np.int16: - if check_json_int(data): - return self.to_numpy(endianness=endianness).type(data) - raise TypeError(f"Invalid type: {data}. Expected an integer.") register_data_type(Int16) @dataclass(frozen=True, kw_only=True) -class UInt16(DTypeBase): +class UInt16(DTypeWrapper[np.dtypes.UInt16DType, np.uint16]): name = "uint16" - item_size = 2 - kind = "numeric" - numpy_character_code = "H" - default = 0 - - def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt16DType: - return super().to_numpy(endianness=endianness) - - def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: - return int(data) - - def from_json_value( - self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> np.uint16: - if check_json_int(data): - return self.to_numpy(endianness=endianness).type(data) - raise TypeError(f"Invalid type: {data}. 
Expected an integer.") - register_data_type(UInt16) @dataclass(frozen=True, kw_only=True) -class Int32(DTypeBase): +class Int32(DTypeWrapper[np.dtypes.Int32DType, np.int32]): name = "int32" - item_size = 4 - kind = "numeric" - numpy_character_code = "i" - default = 0 - - def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int32DType: - return super().to_numpy(endianness=endianness) - - def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: - return int(data) - - def from_json_value( - self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> np.int32: - if check_json_int(data): - return self.to_numpy(endianness=endianness).type(data) - raise TypeError(f"Invalid type: {data}. Expected an integer.") register_data_type(Int32) @dataclass(frozen=True, kw_only=True) -class UInt32(DTypeBase): +class UInt32(DTypeWrapper[np.dtypes.UInt32DType, np.uint32]): name = "uint32" - item_size = 4 - kind = "numeric" - numpy_character_code = "I" - default = 0 - - def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt32DType: - return super().to_numpy(endianness=endianness) - - def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: - return int(data) - - def from_json_value( - self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> np.uint32: - if check_json_int(data): - return self.to_numpy(endianness=endianness).type(data) - raise TypeError(f"Invalid type: {data}. 
Expected an integer.") - register_data_type(UInt32) @dataclass(frozen=True, kw_only=True) -class Int64(DTypeBase): +class Int64(DTypeWrapper[np.dtypes.Int64DType, np.int64]): name = "int64" - item_size = 8 - kind = "numeric" - numpy_character_code = "l" - default = 0 - - def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int64DType: - return super().to_numpy(endianness=endianness) - - def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: - return int(data) - - def from_json_value( - self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> np.int64: - if check_json_int(data): - return self.to_numpy(endianness=endianness).type(data) - raise TypeError(f"Invalid type: {data}. Expected an integer.") register_data_type(Int64) @dataclass(frozen=True, kw_only=True) -class UInt64(DTypeBase): +class UInt64(DTypeWrapper[np.dtypes.UInt64DType, np.uint64]): name = "uint64" - item_size = 8 - kind = "numeric" - numpy_character_code = "L" - default = 0 - def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt64DType: - return super().to_numpy(endianness=endianness) - - def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: - return int(data) - - def from_json_value( - self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> np.uint64: - if check_json_int(data): - return self.to_numpy(endianness=endianness).type(data) - raise TypeError(f"Invalid type: {data}. 
Expected an integer.") register_data_type(UInt64) -@dataclass(frozen=True, kw_only=True) -class Float16(DTypeBase): - name = "float16" - item_size = 2 +class FloatBase(DTypeWrapper[TDType, TScalar]): kind = "numeric" - numpy_character_code = "e" default = 0.0 - def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Float16DType: - return super().to_numpy(endianness=endianness) - - def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> float: - return float(data) + def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> JSONFloat: + return float_to_json(data, zarr_format) def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> np.float16: + ) -> TScalar: if check_json_float(data): - return self.to_numpy(endianness=endianness).type(data) + return self.to_numpy(endianness=endianness).type(float_from_json)(data, zarr_format) raise TypeError(f"Invalid type: {data}. Expected a float.") +@dataclass(frozen=True, kw_only=True) +class Float16(DTypeWrapper[np.dtypes.Float16DType, np.float16]): + name = "float16" + register_data_type(Float16) @dataclass(frozen=True, kw_only=True) -class Float32(DTypeBase): +class Float32(DTypeWrapper[np.dtypes.Float32DType, np.float32]): name = "float32" - item_size = 4 - kind = "numeric" - numpy_character_code = "f" - default = 0.0 - - def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Float32DType: - return super().to_numpy(endianness=endianness) - - def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> float: - return float(data) - - def from_json_value( - self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> np.float32: - if check_json_float(data): - return self.to_numpy(endianness=endianness).type(data) - raise TypeError(f"Invalid type: {data}. 
Expected a float.") - + register_data_type(Float32) @dataclass(frozen=True, kw_only=True) -class Float64(DTypeBase): +class Float64(DTypeWrapper[np.dtypes.Float64DType, np.float64]): name = "float64" - item_size = 8 - kind = "numeric" - numpy_character_code = "d" - default = 0.0 - - def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Float64DType: - return super().to_numpy(endianness=endianness) - - def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> float: - return float_to_json(data, zarr_format) - - def from_json_value( - self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> np.float64: - if check_json_float(data): - return float_from_json( - data, dtype=self.to_numpy(endianness=endianness), zarr_format=zarr_format - ) - raise TypeError(f"Invalid type: {data}. Expected a float.") register_data_type(Float64) @dataclass(frozen=True, kw_only=True) -class Complex64(DTypeBase): +class Complex64(DTypeWrapper[np.dtypes.Complex64DType, np.complex64]): name = "complex64" - item_size = 8 kind = "numeric" - numpy_character_code = "F" default = 0.0 + 0.0j - def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Complex64DType: - return super().to_numpy(endianness=endianness) - def to_json_value( self, data: np.generic, zarr_format: ZarrFormat ) -> tuple[JSONFloat, JSONFloat]: @@ -536,16 +348,12 @@ def from_json_value( @dataclass(frozen=True, kw_only=True) -class Complex128(DTypeBase): +class Complex128(DTypeWrapper[np.dtypes.Complex128DType, np.complex128]): name = "complex128" - item_size = 16 kind = "numeric" - numpy_character_code = "D" + dtype_cls = np.dtypes.Complex128DType default = 0.0 + 0.0j - def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Complex128DType: - return super().to_numpy(endianness=endianness) - def to_json_value( self, data: np.generic, zarr_format: ZarrFormat ) -> tuple[JSONFloat, JSONFloat]: @@ -565,26 +373,17 @@ def from_json_value( 
@dataclass(frozen=True, kw_only=True) -class StaticByteString(DTypeBase, Flexible): +class StaticByteString(DTypeWrapper[np.dtypes.BytesDType, np.bytes_], Flexible): name = "numpy/static_byte_string" kind = "string" - numpy_character_code = "S" - item_size = 1 default = b"" - @classmethod - def from_numpy(cls: type[Self], dtype: npt.DTypeLike) -> Self: - dtype = np.dtype(dtype) - if dtype.kind != cls.numpy_character_code: - raise ValueError(f"Invalid dtype {dtype}. Expected a string dtype.") - return cls(length=dtype.itemsize) - def to_dict(self) -> dict[str, JSON]: return {"name": self.name, "configuration": {"capacity": self.length}} - def to_numpy(self, endianness: Endianness | None = "native") -> np.dtype[np.bytes_]: + def to_numpy(self, endianness: Endianness | None = "native") -> np.dtypes.BytesDType: endianness_code = endianness_to_numpy_str(endianness) - return np.dtype(endianness_code + self.numpy_character_code + str(self.length)) + return self.dtype_cls(self.length).newbyteorder(endianness_code) def to_json_value( self, data: np.generic, *, zarr_format: ZarrFormat, endianness: Endianness | None = None @@ -603,26 +402,20 @@ def from_json_value( @dataclass(frozen=True, kw_only=True) -class StaticRawBytes(DTypeBase, Flexible): +class StaticRawBytes(DTypeWrapper[np.dtypes.VoidDType, np.void], Flexible): name = "r*" kind = "bytes" - numpy_character_code = "V" - item_size = 1 default = b"" - @classmethod - def from_numpy(cls: type[Self], dtype: npt.DTypeLike) -> Self: - dtype = np.dtype(dtype) - if dtype.kind != "V": - raise ValueError(f"Invalid dtype {dtype}. 
Expected a bytes dtype.") - return cls(length=dtype.itemsize) def to_dict(self) -> dict[str, JSON]: return {"name": f"r{self.length * 8}"} - def to_numpy(self, endianness: Endianness | None = "native") -> np.dtype[np.void]: + def to_numpy(self, endianness: Endianness | None = "native") -> np.dtypes.VoidDType: + # this needs to be overridden because numpy does not allow creating a void type + # by invoking np.dtypes.VoidDType directly endianness_code = endianness_to_numpy_str(endianness) - return np.dtype(endianness_code + self.numpy_character_code + str(self.length)) + return np.dtype(f'{endianness_code}V{self.length}') def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> tuple[int, ...]: return tuple(*data.tobytes()) @@ -639,13 +432,10 @@ def from_json_value( if _NUMPY_SUPPORTS_VLEN_STRING: @dataclass(frozen=True, kw_only=True) - class VlenString(DTypeBase): + class VlenString(DTypeWrapper[np.dtypes.StringDType, str]): name = "numpy/vlen_string" kind = "string" - numpy_character_code = "T" - # this uses UTF-8, so the encoding of a code point varies between - # 1 and 4 bytes - item_size = None + dtype_cls = np.dtypes.StringDType default = "" def to_dict(self) -> dict[str, JSON]: @@ -653,7 +443,7 @@ def to_dict(self) -> dict[str, JSON]: def to_numpy( self, endianness: Endianness | None = "native" - ) -> np.dtype[np.dtypes.StringDType]: + ) -> np.dtypes.StringDType: endianness_code = endianness_to_numpy_str(endianness) return np.dtype(endianness_code + self.numpy_character_code) @@ -662,34 +452,32 @@ def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> "np.dtypes.StringDType": + ) -> str: return self.to_numpy(endianness=endianness).type(data) else: @dataclass(frozen=True, kw_only=True) - class VlenString(DTypeBase): + class VlenString(DTypeWrapper[np.dtypes.ObjectDType, str]): name = "numpy/vlen_string" kind = 
"string" - numpy_character_code = "O" - item_size = None + dtype_cls = np.dtypes.ObjectDType default = "" def to_dict(self) -> dict[str, JSON]: return {"name": self.name} def to_numpy( - self, endianness: Endianness | None = "native" + self, endianness: Endianness | None = None ) -> np.dtype[np.dtypes.ObjectDType]: - endianness_code = endianness_to_numpy_str(endianness) - return np.dtype(endianness_code + self.numpy_character_code) + return super().to_numpy(endianness=endianness) def to_json_value(self, data, *, zarr_format: ZarrFormat) -> str: return str(data) def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> np.dtypes.ObjectDType: + ) -> str: return self.to_numpy(endianness=endianness).type(data) @@ -697,24 +485,15 @@ def from_json_value( @dataclass(frozen=True, kw_only=True) -class StaticUnicodeString(DTypeBase, Flexible): +class StaticUnicodeString(DTypeWrapper[np.dtypes.StrDType, np.str_], Flexible): name = "numpy/static_unicode_string" kind = "string" - numpy_character_code = "U" - item_size = 4 default = "" - @classmethod - def from_numpy(cls: type[Self], dtype: npt.DTypeLike) -> Self: - dtype = np.dtype(dtype) - if dtype.kind != "U": - raise ValueError(f"Invalid dtype {dtype}. 
Expected a string dtype.") - return cls(length=dtype.itemsize) - def to_dict(self) -> dict[str, JSON]: return {"name": self.name, "configuration": {"capacity": self.length}} - def to_numpy(self, endianness: Endianness | None = "native") -> np.dtype[np.str_]: + def to_numpy(self, endianness: Endianness | None = "native") -> np.dtypes.StrDType: endianness_code = endianness_to_numpy_str(endianness) return np.dtype(endianness_code + self.numpy_character_code + str(self.length)) @@ -732,10 +511,10 @@ def from_json_value( register_data_type(StaticUnicodeString) -def resolve_dtype(dtype: npt.DTypeLike | DTypeBase | dict[str, JSON]) -> DTypeBase: +def resolve_dtype(dtype: npt.DTypeLike | DTypeWrapper | dict[str, JSON]) -> DTypeWrapper: from zarr.registry import get_data_type_from_numpy - if isinstance(dtype, DTypeBase): + if isinstance(dtype, DTypeWrapper): return dtype elif isinstance(dtype, dict): return get_data_type_from_dict(dtype) diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index ce1a8b77fa..6d2f8d35e7 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -13,7 +13,7 @@ from zarr.core.chunk_grids import ChunkGrid from zarr.core.common import JSON, ChunkCoords from zarr.core.metadata.dtype import ( - DTypeBase, + DTypeWrapper, ) import json @@ -92,7 +92,7 @@ def validate_array_bytes_codec(codecs: tuple[Codec, ...]) -> ArrayBytesCodec: return abcs[0] -def validate_codecs(codecs: tuple[Codec, ...], dtype: DTypeBase) -> None: +def validate_codecs(codecs: tuple[Codec, ...], dtype: DTypeWrapper) -> None: """Check that the codecs are valid for the given dtype""" from zarr.codecs.sharding import ShardingCodec @@ -235,7 +235,7 @@ class ArrayV3MetadataDict(TypedDict): @dataclass(frozen=True, kw_only=True) class ArrayV3Metadata(Metadata): shape: ChunkCoords - data_type: DTypeBase + data_type: DTypeWrapper chunk_grid: ChunkGrid chunk_key_encoding: ChunkKeyEncoding fill_value: Any @@ -250,7 +250,7 @@ def __init__( self, *, 
shape: Iterable[int], - data_type: DTypeBase, + data_type: DTypeWrapper, chunk_grid: dict[str, JSON] | ChunkGrid, chunk_key_encoding: ChunkKeyEncodingLike, fill_value: object, diff --git a/src/zarr/registry.py b/src/zarr/registry.py index 7ad688a61a..1b8ecc7a92 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -25,7 +25,7 @@ from zarr.core.buffer import Buffer, NDBuffer from zarr.core.common import JSON from zarr.core.dtype import ZarrDType - from zarr.core.metadata.dtype import DTypeBase + from zarr.core.metadata.dtype import DTypeWrapper __all__ = [ "Registry", @@ -63,7 +63,7 @@ def register(self, cls: type[T]) -> None: @dataclass(frozen=True, kw_only=True) class DataTypeRegistry: - contents: dict[str, type[DTypeBase]] = field(default_factory=dict, init=False) + contents: dict[str, type[DTypeWrapper]] = field(default_factory=dict, init=False) lazy_load_list: list[EntryPoint] = field(default_factory=list, init=False) def lazy_load(self) -> None: @@ -72,14 +72,14 @@ def lazy_load(self) -> None: self.lazy_load_list.clear() - def register(self: Self, cls: type[DTypeBase], clobber: bool = False) -> None: + def register(self: Self, cls: type[DTypeWrapper], clobber: bool = False) -> None: if cls.name in self.contents and not clobber: raise ValueError( f"Data type {cls.name} already registered. Use clobber=True to overwrite." 
) self.contents[cls.name] = cls - def get(self, key: str) -> type[DTypeBase]: + def get(self, key: str) -> type[DTypeWrapper]: return self.contents[key] @@ -178,7 +178,7 @@ def register_buffer(cls: type[Buffer]) -> None: __buffer_registry.register(cls) -def register_data_type(cls: type[DTypeBase]) -> None: +def register_data_type(cls: type[DTypeWrapper]) -> None: __data_type_registry.register(cls) @@ -326,7 +326,7 @@ def get_ndbuffer_class(reload_config: bool = False) -> type[NDBuffer]: ) -def get_data_type_by_name(dtype: str, configuration: dict[str, JSON] | None = None) -> DTypeBase: +def get_data_type_by_name(dtype: str, configuration: dict[str, JSON] | None = None) -> DTypeWrapper: __data_type_registry.lazy_load() if configuration is None: _configuration = {} @@ -338,7 +338,7 @@ def get_data_type_by_name(dtype: str, configuration: dict[str, JSON] | None = No return maybe_dtype_cls.from_dict(_configuration) -def get_data_type_from_dict(dtype: dict[str, JSON]) -> DTypeBase: +def get_data_type_from_dict(dtype: dict[str, JSON]) -> DTypeWrapper: __data_type_registry.lazy_load() dtype_name = dtype["name"] dtype_cls = __data_type_registry.get(dtype_name) @@ -347,12 +347,12 @@ def get_data_type_from_dict(dtype: dict[str, JSON]) -> DTypeBase: return dtype_cls.from_dict(dtype.get("configuration", {})) -def get_data_type_from_numpy(dtype: npt.DTypeLike) -> DTypeBase: +def get_data_type_from_numpy(dtype: npt.DTypeLike) -> DTypeWrapper: np_dtype = np.dtype(dtype) __data_type_registry.lazy_load() for val in __data_type_registry.contents.values(): if val.numpy_character_code == np_dtype.char: - return val.from_numpy(np_dtype) + return val.from_str(np_dtype) raise ValueError( f"numpy dtype '{dtype}' does not have a corresponding Zarr dtype in: {list(__data_type_registry.contents)}." 
) From 1b2c773fca1f92caef8b33f41865a31df4e8fa26 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sun, 2 Mar 2025 19:44:43 +0100 Subject: [PATCH 010/130] working form for dtype classes --- src/zarr/core/array.py | 2 +- src/zarr/core/metadata/dtype.py | 366 ++++++++++++++++++-------------- src/zarr/core/metadata/v3.py | 8 +- src/zarr/registry.py | 14 +- tests/test_codecs/test_vlen.py | 8 +- tests/test_metadata/test_v3.py | 23 +- 6 files changed, 234 insertions(+), 187 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 975408a01d..7edd467a54 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -699,7 +699,7 @@ def _create_metadata_v3( if fill_value is None: # v3 spec will not allow a null fill value - fill_value_parsed = dtype.type(zarr_data_type._default_value) + fill_value_parsed = zarr_data_type.default_value else: fill_value_parsed = fill_value diff --git a/src/zarr/core/metadata/dtype.py b/src/zarr/core/metadata/dtype.py index 106b3088d0..1b57831943 100644 --- a/src/zarr/core/metadata/dtype.py +++ b/src/zarr/core/metadata/dtype.py @@ -5,11 +5,12 @@ import numpy as np import numpy.typing as npt +from typing_extensions import get_original_bases from zarr.abc.metadata import Metadata from zarr.core.common import JSON, ZarrFormat from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING -from zarr.registry import get_data_type_from_dict, register_data_type +from zarr.registry import register_data_type Endianness = Literal["little", "big", "native"] DataTypeFlavor = Literal["boolean", "numeric", "string", "bytes"] @@ -32,34 +33,80 @@ def endianness_to_numpy_str(endianness: Endianness | None) -> Literal[">", "<", def check_json_bool(data: JSON) -> TypeGuard[bool]: + """ + Check if a JSON value represents a boolean. + """ return bool(isinstance(data, bool)) def check_json_str(data: JSON) -> TypeGuard[str]: + """ + Check if a JSON value represents a string. 
+ """ return bool(isinstance(data, str)) def check_json_int(data: JSON) -> TypeGuard[int]: + """ + Check if a JSON value represents an integer. + """ return bool(isinstance(data, int)) -def check_json_float(data: JSON) -> TypeGuard[float]: +def check_json_float_v2(data: JSON) -> TypeGuard[float]: if data == "NaN" or data == "Infinity" or data == "-Infinity": return True else: return bool(isinstance(data, float | int)) -def check_json_complex_float(data: JSON) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: +def check_json_float_v3(data: JSON) -> TypeGuard[float]: + # TODO: handle the special JSON serialization of different NaN values + return check_json_float_v2(data) + + +def check_json_float(data: JSON, zarr_format: ZarrFormat) -> TypeGuard[float]: + if zarr_format == 2: + return check_json_float_v2(data) + else: + return check_json_float_v3(data) + + +def check_json_complex_float_v3(data: JSON) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: + """ + Check if a JSON value represents a complex float, as per the zarr v3 spec + """ return ( not isinstance(data, str) and isinstance(data, Sequence) and len(data) == 2 - and check_json_float(data[0]) - and check_json_float(data[1]) + and check_json_float_v3(data[0]) + and check_json_float_v3(data[1]) ) +def check_json_complex_float_v2(data: JSON) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: + """ + Check if a JSON value represents a complex float, as per the behavior of zarr-python 2.x + """ + return ( + not isinstance(data, str) + and isinstance(data, Sequence) + and len(data) == 2 + and check_json_float_v2(data[0]) + and check_json_float_v2(data[1]) + ) + + +def check_json_complex_float( + data: JSON, zarr_format: ZarrFormat +) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: + if zarr_format == 2: + return check_json_complex_float_v2(data) + else: + return check_json_complex_float_v3(data) + + def float_to_json_v2(data: float | np.floating[Any]) -> JSONFloat: if np.isnan(data): return "NaN" @@ -103,29 +150,28 @@ def complex_to_json( 
raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") -def float_from_json_v2(data: JSONFloat, dtype: np.floating[Any]) -> np.floating[Any]: - if data == "NaN": - _data = np.nan - elif data == "Infinity": - _data = np.inf - elif data == "-Infinity": - _data = -np.inf - else: - _data = data - return dtype.type(_data) +def float_from_json_v2(data: JSONFloat) -> float: + match data: + case "NaN": + return float("nan") + case "Infinity": + return float("inf") + case "-Infinity": + return float("-inf") + case _: + return float(data) -def float_from_json_v3(data: JSONFloat, dtype: Any) -> np.floating[Any]: +def float_from_json_v3(data: JSONFloat) -> float: # todo: support the v3-specific NaN handling - return float_from_json_v2(data, dtype) + return float_from_json_v2(data) -def float_from_json(data: JSONFloat, dtype: Any, zarr_format: ZarrFormat) -> np.floating[Any]: +def float_from_json(data: JSONFloat, zarr_format: ZarrFormat) -> float: if zarr_format == 2: - return float_from_json_v2(data, dtype) + return float_from_json_v2(data) else: - return float_from_json_v3(data, dtype) - raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") + return float_from_json_v3(data) def complex_from_json_v2(data: JSONFloat, dtype: Any) -> np.complexfloating: @@ -142,32 +188,42 @@ def complex_from_json( if zarr_format == 2: return complex_from_json_v2(data, dtype) else: - if check_json_complex_float(data): + if check_json_complex_float_v3(data): return complex_from_json_v3(data, dtype) else: raise TypeError(f"Invalid type: {data}. Expected a sequence of two numbers.") raise ValueError(f"Invalid zarr format: {zarr_format}. 
Expected 2 or 3.") + TDType = TypeVar("TDType", bound=np.dtype[Any]) TScalar = TypeVar("TScalar", bound=np.generic) -@dataclass(frozen=True, kw_only=True) -class Flexible: - length: int class DTypeWrapper(Generic[TDType, TScalar], ABC, Metadata): name: ClassVar[str] - dtype_cls: ClassVar[type[TDType]] # this class will create a numpy dtype + dtype_cls: ClassVar[type[TDType]] # this class will create a numpy dtype kind: ClassVar[DataTypeFlavor] - _default_value: object + default_value: TScalar + + def __init_subclass__(cls) -> None: + # Subclasses will bind the first generic type parameter to an attribute of the class + # TODO: wrap this in some *very informative* error handling + generic_args = get_args(get_original_bases(cls)[0]) + cls.dtype_cls = generic_args[0] + return super().__init_subclass__() def to_dict(self) -> dict[str, JSON]: return {"name": self.name} - def default_value(self: Self, *, endianness: Endianness | None = None) -> TScalar: - return cast(np.generic, self.to_numpy(endianness=endianness).type(self._default_value)) + def cast_value(self: Self, value: object, *, endianness: Endianness | None = None) -> TScalar: + return cast(np.generic, self.to_dtype(endianness=endianness).type(value)) - def to_numpy(self: Self, *, endianness: Endianness | None = None) -> TDType: + @classmethod + @abstractmethod + def from_dtype(cls: type[Self], dtype: TDType) -> Self: + raise NotImplementedError + + def to_dtype(self: Self, *, endianness: Endianness | None = None) -> TDType: endian_str = endianness_to_numpy_str(endianness) return self.dtype_cls().newbyteorder(endian_str) @@ -192,7 +248,11 @@ def from_json_value( class Bool(DTypeWrapper[np.dtypes.BoolDType, np.bool_]): name = "bool" kind = "boolean" - default = False + default_value = np.False_ + + @classmethod + def from_dtype(cls, dtype: np.dtypes.BoolDType) -> Self: + return cls() def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> bool: return bool(data) @@ -201,15 +261,16 @@ def 
from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.bool_: if check_json_bool(data): - return self.to_numpy(endianness=endianness).type(data) + return self.to_dtype(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. Expected a boolean.") -register_data_type(Bool) - -class BaseInt(DTypeWrapper[TDType, TScalar]): +class IntWrapperBase(DTypeWrapper[TDType, TScalar]): kind = "numeric" - default = 0 + + @classmethod + def from_dtype(cls, dtype: TDType) -> Self: + return cls() def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: return int(data) @@ -218,76 +279,64 @@ def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> TScalar: if check_json_int(data): - return self.to_numpy(endianness=endianness).type(data) + return self.to_dtype(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") @dataclass(frozen=True, kw_only=True) -class Int8(BaseInt[np.dtypes.Int8DType, np.int8]): +class Int8(IntWrapperBase[np.dtypes.Int8DType, np.int8]): name = "int8" - - -register_data_type(Int8) + default_value = np.int8(0) @dataclass(frozen=True, kw_only=True) -class UInt8(DTypeWrapper[np.dtypes.UInt8DType, np.uint8]): +class UInt8(IntWrapperBase[np.dtypes.UInt8DType, np.uint8]): name = "uint8" - - -register_data_type(UInt8) + default_value = np.uint8(0) @dataclass(frozen=True, kw_only=True) -class Int16(DTypeWrapper[np.dtypes.Int16DType, np.int16]): +class Int16(IntWrapperBase[np.dtypes.Int16DType, np.int16]): name = "int16" - - -register_data_type(Int16) + default_value = np.int16(0) @dataclass(frozen=True, kw_only=True) -class UInt16(DTypeWrapper[np.dtypes.UInt16DType, np.uint16]): +class UInt16(IntWrapperBase[np.dtypes.UInt16DType, np.uint16]): name = "uint16" - -register_data_type(UInt16) + default_value = np.uint16(0) @dataclass(frozen=True, kw_only=True) -class 
Int32(DTypeWrapper[np.dtypes.Int32DType, np.int32]): +class Int32(IntWrapperBase[np.dtypes.Int32DType, np.int32]): name = "int32" - - -register_data_type(Int32) + default_value = np.int32(0) @dataclass(frozen=True, kw_only=True) -class UInt32(DTypeWrapper[np.dtypes.UInt32DType, np.uint32]): +class UInt32(IntWrapperBase[np.dtypes.UInt32DType, np.uint32]): name = "uint32" - -register_data_type(UInt32) + default_value = np.uint32(0) @dataclass(frozen=True, kw_only=True) -class Int64(DTypeWrapper[np.dtypes.Int64DType, np.int64]): +class Int64(IntWrapperBase[np.dtypes.Int64DType, np.int64]): name = "int64" - - -register_data_type(Int64) + default_value = np.int64(0) @dataclass(frozen=True, kw_only=True) -class UInt64(DTypeWrapper[np.dtypes.UInt64DType, np.uint64]): +class UInt64(IntWrapperBase[np.dtypes.UInt64DType, np.uint64]): name = "uint64" + default_value = np.uint64(0) - -register_data_type(UInt64) - - -class FloatBase(DTypeWrapper[TDType, TScalar]): +class FloatWrapperBase(DTypeWrapper[TDType, TScalar]): kind = "numeric" - default = 0.0 + + @classmethod + def from_dtype(cls, dtype: TDType) -> Self: + return cls() def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> JSONFloat: return float_to_json(data, zarr_format) @@ -295,39 +344,38 @@ def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> JSONFloat: def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> TScalar: - if check_json_float(data): - return self.to_numpy(endianness=endianness).type(float_from_json)(data, zarr_format) + if check_json_float_v2(data): + return self.to_dtype(endianness=endianness).type(float_from_json(data, zarr_format)) raise TypeError(f"Invalid type: {data}. 
Expected a float.") + @dataclass(frozen=True, kw_only=True) -class Float16(DTypeWrapper[np.dtypes.Float16DType, np.float16]): +class Float16(FloatWrapperBase[np.dtypes.Float16DType, np.float16]): name = "float16" - - -register_data_type(Float16) + default_value = np.float16(0) @dataclass(frozen=True, kw_only=True) -class Float32(DTypeWrapper[np.dtypes.Float32DType, np.float32]): +class Float32(FloatWrapperBase[np.dtypes.Float32DType, np.float32]): name = "float32" - - -register_data_type(Float32) + default_value = np.float32(0) @dataclass(frozen=True, kw_only=True) -class Float64(DTypeWrapper[np.dtypes.Float64DType, np.float64]): +class Float64(FloatWrapperBase[np.dtypes.Float64DType, np.float64]): name = "float64" - - -register_data_type(Float64) + default_value = np.float64(0) @dataclass(frozen=True, kw_only=True) class Complex64(DTypeWrapper[np.dtypes.Complex64DType, np.complex64]): name = "complex64" kind = "numeric" - default = 0.0 + 0.0j + default_value = np.complex64(0) + + @classmethod + def from_dtype(cls, dtype: np.dtypes.Complex64DType) -> Self: + return cls() def to_json_value( self, data: np.generic, zarr_format: ZarrFormat @@ -337,22 +385,22 @@ def to_json_value( def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.complex64: - if check_json_complex_float(data): + if check_json_complex_float_v3(data): return complex_from_json( - data, dtype=self.to_numpy(endianness=endianness), zarr_format=zarr_format + data, dtype=self.to_dtype(endianness=endianness), zarr_format=zarr_format ) raise TypeError(f"Invalid type: {data}. 
Expected a complex float.") -register_data_type(Complex64) - - @dataclass(frozen=True, kw_only=True) class Complex128(DTypeWrapper[np.dtypes.Complex128DType, np.complex128]): name = "complex128" kind = "numeric" - dtype_cls = np.dtypes.Complex128DType - default = 0.0 + 0.0j + default_value = np.complex128(0) + + @classmethod + def from_dtype(cls, dtype: np.dtypes.Complex128DType) -> Self: + return cls() def to_json_value( self, data: np.generic, zarr_format: ZarrFormat @@ -362,28 +410,36 @@ def to_json_value( def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.complex128: - if check_json_complex_float(data): + if check_json_complex_float_v3(data): return complex_from_json( - data, dtype=self.to_numpy(endianness=endianness), zarr_format=zarr_format + data, dtype=self.to_dtype(endianness=endianness), zarr_format=zarr_format ) raise TypeError(f"Invalid type: {data}. Expected a complex float.") -register_data_type(Complex128) +@dataclass(frozen=True, kw_only=True) +class FlexibleWrapperBase(DTypeWrapper[TDType, TScalar]): + item_size_bits: ClassVar[int] + length: int + + @classmethod + def from_dtype(cls, dtype: TDType) -> Self: + return cls(length=dtype.itemsize // (cls.item_size_bits // 8)) + + def to_dtype(self, endianness: Endianness | None = None) -> TDType: + endianness_code = endianness_to_numpy_str(endianness) + return self.dtype_cls(self.length).newbyteorder(endianness_code) @dataclass(frozen=True, kw_only=True) -class StaticByteString(DTypeWrapper[np.dtypes.BytesDType, np.bytes_], Flexible): +class StaticByteString(FlexibleWrapperBase[np.dtypes.BytesDType, np.bytes_]): name = "numpy/static_byte_string" kind = "string" - default = b"" + default_value = b"" + item_size_bits = 8 def to_dict(self) -> dict[str, JSON]: - return {"name": self.name, "configuration": {"capacity": self.length}} - - def to_numpy(self, endianness: Endianness | None = "native") -> np.dtypes.BytesDType: - endianness_code = 
endianness_to_numpy_str(endianness) - return self.dtype_cls(self.length).newbyteorder(endianness_code) + return {"name": self.name, "configuration": {"length": self.length}} def to_json_value( self, data: np.generic, *, zarr_format: ZarrFormat, endianness: Endianness | None = None @@ -394,28 +450,25 @@ def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.bytes_: if check_json_bool(data): - return self.to_numpy(endianness=endianness).type(data.encode("ascii")) + return self.to_dtype(endianness=endianness).type(data.encode("ascii")) raise TypeError(f"Invalid type: {data}. Expected a string.") -register_data_type(StaticByteString) - - @dataclass(frozen=True, kw_only=True) -class StaticRawBytes(DTypeWrapper[np.dtypes.VoidDType, np.void], Flexible): +class StaticRawBytes(FlexibleWrapperBase[np.dtypes.VoidDType, np.void]): name = "r*" kind = "bytes" - default = b"" - + default_value = np.void(b"") + item_size_bits = 8 def to_dict(self) -> dict[str, JSON]: - return {"name": f"r{self.length * 8}"} + return {"name": f"r{self.length * self.item_size_bits}"} - def to_numpy(self, endianness: Endianness | None = "native") -> np.dtypes.VoidDType: + def to_dtype(self, endianness: Endianness | None = None) -> np.dtypes.VoidDType: # this needs to be overridden because numpy does not allow creating a void type # by invoking np.dtypes.VoidDType directly endianness_code = endianness_to_numpy_str(endianness) - return np.dtype(f'{endianness_code}V{self.length}') + return np.dtype(f"{endianness_code}V{self.length}") def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> tuple[int, ...]: return tuple(*data.tobytes()) @@ -424,10 +477,29 @@ def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.void: # todo: check that this is well-formed - return self.to_numpy(endianness=endianness).type(bytes(data)) + return 
self.to_dtype(endianness=endianness).type(bytes(data)) + + +@dataclass(frozen=True, kw_only=True) +class StaticUnicodeString(FlexibleWrapperBase[np.dtypes.StrDType, np.str_]): + name = "numpy/static_unicode_string" + kind = "string" + default_value = np.str_("") + item_size_bits = 32 # UCS4 is 32 bits per code point + def to_dict(self) -> dict[str, JSON]: + return {"name": self.name, "configuration": {"length": self.length}} + + def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: + return str(data) + + def from_json_value( + self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None + ) -> np.str_: + if not check_json_str(data): + raise TypeError(f"Invalid type: {data}. Expected a string.") + return self.to_dtype(endianness=endianness).type(data) -register_data_type(StaticRawBytes) if _NUMPY_SUPPORTS_VLEN_STRING: @@ -435,15 +507,16 @@ def from_json_value( class VlenString(DTypeWrapper[np.dtypes.StringDType, str]): name = "numpy/vlen_string" kind = "string" - dtype_cls = np.dtypes.StringDType - default = "" + default_value = "" + + @classmethod + def from_dtype(cls, dtype: np.dtypes.StringDType) -> Self: + return cls() def to_dict(self) -> dict[str, JSON]: return {"name": self.name} - def to_numpy( - self, endianness: Endianness | None = "native" - ) -> np.dtypes.StringDType: + def to_dtype(self, endianness: Endianness | None = None) -> np.dtypes.StringDType: endianness_code = endianness_to_numpy_str(endianness) return np.dtype(endianness_code + self.numpy_character_code) @@ -453,7 +526,7 @@ def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> str: - return self.to_numpy(endianness=endianness).type(data) + return self.to_dtype(endianness=endianness).type(data) else: @@ -461,58 +534,29 @@ def from_json_value( class VlenString(DTypeWrapper[np.dtypes.ObjectDType, str]): name = 
"numpy/vlen_string" kind = "string" - dtype_cls = np.dtypes.ObjectDType - default = "" + default_value = np.object_("") def to_dict(self) -> dict[str, JSON]: return {"name": self.name} - def to_numpy( - self, endianness: Endianness | None = None - ) -> np.dtype[np.dtypes.ObjectDType]: - return super().to_numpy(endianness=endianness) + @classmethod + def from_dtype(cls, dtype: np.dtypes.ObjectDType) -> Self: + return cls() + + def to_dtype(self, endianness: Endianness | None = None) -> np.dtype[np.dtypes.ObjectDType]: + return super().to_dtype(endianness=endianness) - def to_json_value(self, data, *, zarr_format: ZarrFormat) -> str: + def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: return str(data) def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> str: - return self.to_numpy(endianness=endianness).type(data) - - -register_data_type(VlenString) - - -@dataclass(frozen=True, kw_only=True) -class StaticUnicodeString(DTypeWrapper[np.dtypes.StrDType, np.str_], Flexible): - name = "numpy/static_unicode_string" - kind = "string" - default = "" - - def to_dict(self) -> dict[str, JSON]: - return {"name": self.name, "configuration": {"capacity": self.length}} - - def to_numpy(self, endianness: Endianness | None = "native") -> np.dtypes.StrDType: - endianness_code = endianness_to_numpy_str(endianness) - return np.dtype(endianness_code + self.numpy_character_code + str(self.length)) - - def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: - return str(data) - - def from_json_value( - self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> np.str_: - if not check_json_bool(data): - raise TypeError(f"Invalid type: {data}. 
Expected a string.") - return self.to_numpy(endianness=endianness).type(data) - - -register_data_type(StaticUnicodeString) + return self.to_dtype(endianness=endianness).type(data) def resolve_dtype(dtype: npt.DTypeLike | DTypeWrapper | dict[str, JSON]) -> DTypeWrapper: - from zarr.registry import get_data_type_from_numpy + from zarr.registry import get_data_type_from_dict, get_data_type_from_numpy if isinstance(dtype, DTypeWrapper): return dtype @@ -526,3 +570,7 @@ def resolve_dtype(dtype: npt.DTypeLike | DTypeWrapper | dict[str, JSON]) -> DTyp FLOAT_DTYPE = Float16 | Float32 | Float64 COMPLEX_DTYPE = Complex64 | Complex128 STRING_DTYPE = StaticUnicodeString | VlenString | StaticByteString +for dtype in get_args( + Bool | INTEGER_DTYPE | FLOAT_DTYPE | COMPLEX_DTYPE | STRING_DTYPE | StaticRawBytes +): + register_data_type(dtype) diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 6d2f8d35e7..3966b0d72c 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -266,14 +266,14 @@ def __init__( chunk_grid_parsed = ChunkGrid.from_dict(chunk_grid) chunk_key_encoding_parsed = ChunkKeyEncoding.from_dict(chunk_key_encoding) dimension_names_parsed = parse_dimension_names(dimension_names) - fill_value_parsed = data_type.to_numpy().type(fill_value) + fill_value_parsed = data_type.to_dtype().type(fill_value) attributes_parsed = parse_attributes(attributes) codecs_parsed_partial = parse_codecs(codecs) storage_transformers_parsed = parse_storage_transformers(storage_transformers) array_spec = ArraySpec( shape=shape_parsed, - dtype=data_type.to_numpy(), + dtype=data_type.to_dtype(), fill_value=fill_value_parsed, config=ArrayConfig.from_dict({}), # TODO: config is not needed here. prototype=default_buffer_prototype(), # TODO: prototype is not needed here. 
@@ -308,13 +308,13 @@ def _validate_metadata(self) -> None: raise ValueError("`fill_value` is required.") for codec in self.codecs: codec.validate( - shape=self.shape, dtype=self.data_type.to_numpy(), chunk_grid=self.chunk_grid + shape=self.shape, dtype=self.data_type.to_dtype(), chunk_grid=self.chunk_grid ) @property def dtype(self) -> np.dtype[Any]: """Interpret Zarr dtype as NumPy dtype""" - return self.data_type.to_numpy() + return self.data_type.to_dtype() @property def ndim(self) -> int: diff --git a/src/zarr/registry.py b/src/zarr/registry.py index 1b8ecc7a92..997174de77 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -72,12 +72,10 @@ def lazy_load(self) -> None: self.lazy_load_list.clear() - def register(self: Self, cls: type[DTypeWrapper], clobber: bool = False) -> None: - if cls.name in self.contents and not clobber: - raise ValueError( - f"Data type {cls.name} already registered. Use clobber=True to overwrite." - ) - self.contents[cls.name] = cls + def register(self: Self, cls: type[DTypeWrapper]) -> None: + # don't register the same dtype twice + if cls.name not in self.contents or self.contents[cls.name] != cls: + self.contents[cls.name] = cls def get(self, key: str) -> type[DTypeWrapper]: return self.contents[key] @@ -351,8 +349,8 @@ def get_data_type_from_numpy(dtype: npt.DTypeLike) -> DTypeWrapper: np_dtype = np.dtype(dtype) __data_type_registry.lazy_load() for val in __data_type_registry.contents.values(): - if val.numpy_character_code == np_dtype.char: - return val.from_str(np_dtype) + if val.dtype_cls is type(np_dtype): + return val.from_dtype(np_dtype) raise ValueError( f"numpy dtype '{dtype}' does not have a corresponding Zarr dtype in: {list(__data_type_registry.contents)}." 
) diff --git a/tests/test_codecs/test_vlen.py b/tests/test_codecs/test_vlen.py index 8aeea834ce..f73b5e1969 100644 --- a/tests/test_codecs/test_vlen.py +++ b/tests/test_codecs/test_vlen.py @@ -50,15 +50,15 @@ def test_vlen_string( a[:, :] = data assert np.array_equal(data, a[:, :]) - assert a.metadata.data_type == get_data_type_from_numpy(dtype) - assert a.dtype == expected_array_string_dtype + assert a.metadata.data_type == get_data_type_from_numpy(data.dtype) + assert a.dtype == data.dtype # test round trip b = Array.open(sp) assert isinstance(b.metadata, ArrayV3Metadata) # needed for mypy assert np.array_equal(data, b[:, :]) - assert b.metadata.data_type == get_data_type_from_numpy(dtype) - assert a.dtype == expected_array_string_dtype + assert b.metadata.data_type == get_data_type_from_numpy(data.dtype) + assert a.dtype == data.dtype @pytest.mark.parametrize("store", ["memory", "local"], indirect=["store"]) diff --git a/tests/test_metadata/test_v3.py b/tests/test_metadata/test_v3.py index 41d8b9a4d5..b5ca92c568 100644 --- a/tests/test_metadata/test_v3.py +++ b/tests/test_metadata/test_v3.py @@ -12,12 +12,13 @@ from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding, V2ChunkKeyEncoding from zarr.core.config import config from zarr.core.group import GroupMetadata, parse_node_type -from zarr.core.metadata.dtype import Flexible, complex_from_json +from zarr.core.metadata.dtype import FlexibleWrapperBase, complex_from_json from zarr.core.metadata.v3 import ( ArrayV3Metadata, parse_dimension_names, parse_zarr_format, ) +from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING from zarr.errors import MetadataValidationError from zarr.registry import get_data_type_from_numpy @@ -53,9 +54,13 @@ ) complex_dtypes = ("complex64", "complex128") -vlen_dtypes = ("string", "bytes") +flexible_dtypes = ("str", "bytes", 'void') +if _NUMPY_SUPPORTS_VLEN_STRING: + vlen_string_dtypes = ("T","O") +else: + vlen_string_dtypes = ("O") -dtypes = (*bool_dtypes, *int_dtypes, 
*float_dtypes, *complex_dtypes, *vlen_dtypes) +dtypes = (*bool_dtypes, *int_dtypes, *float_dtypes, *complex_dtypes, *flexible_dtypes, *vlen_string_dtypes) @pytest.mark.parametrize("data", [None, 1, 2, 4, 5, "3"]) @@ -116,7 +121,7 @@ def test_jsonify_fill_value_complex(fill_value: Any, dtype_str: str) -> None: """ zarr_format = 3 dtype = get_data_type_from_numpy(dtype_str) - expected = dtype.to_numpy().type(complex(*fill_value)) + expected = dtype.to_dtype().type(complex(*fill_value)) observed = dtype.from_json_value(fill_value, zarr_format=zarr_format) assert observed == expected assert dtype.to_json_value(observed, zarr_format=zarr_format) == tuple(fill_value) @@ -325,11 +330,7 @@ async def test_special_float_fill_values(fill_value: str) -> None: @pytest.mark.parametrize("dtype_str", dtypes) def test_dtypes(dtype_str: str) -> None: dt = get_data_type_from_numpy(dtype_str) - np_dtype = dt.to_numpy() + np_dtype = dt.to_dtype() + assert isinstance(np_dtype, dt.dtype_cls) + assert np_dtype.type(0) == dt.cast_value(0) - if not isinstance(dt, Flexible): - assert dt.item_size == np_dtype.itemsize - else: - assert dt.length == np_dtype.itemsize - - assert dt.numpy_character_code == np_dtype.char From 24930b330bebaba1263d3daa33581566dc02e4c8 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sun, 2 Mar 2025 21:55:27 +0100 Subject: [PATCH 011/130] remove unused code --- src/zarr/core/dtype/__init__.py | 3 - src/zarr/core/dtype/core.py | 196 -------------------------------- src/zarr/registry.py | 69 +---------- 3 files changed, 2 insertions(+), 266 deletions(-) delete mode 100644 src/zarr/core/dtype/__init__.py delete mode 100644 src/zarr/core/dtype/core.py diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py deleted file mode 100644 index 58b884ff23..0000000000 --- a/src/zarr/core/dtype/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from zarr.core.dtype.core import ZarrDType - -__all__ = ["ZarrDType"] diff --git a/src/zarr/core/dtype/core.py 
b/src/zarr/core/dtype/core.py deleted file mode 100644 index c6460706aa..0000000000 --- a/src/zarr/core/dtype/core.py +++ /dev/null @@ -1,196 +0,0 @@ -""" -# Overview - -This module provides a proof-of-concept standalone interface for managing dtypes in the zarr-python codebase. - -The `ZarrDType` class introduced in this module effectively acts as a replacement for `np.dtype` throughout the -zarr-python codebase. It attempts to encapsulate all relevant runtime information necessary for working with -dtypes in the context of the Zarr V3 specification (e.g. is this a core dtype or not, how many bytes and what -endianness is the dtype etc). By providing this abstraction, the module aims to: - -- Simplify dtype management within zarr-python -- Support runtime flexibility and custom extensions -- Remove unnecessary dependencies on the numpy API - -## Extensibility - -The module attempts to support user-driven extensions, allowing developers to introduce custom dtypes -without requiring immediate changes to zarr-python. Extensions can leverage the current entrypoint mechanism, -enabling integration of experimental features. Over time, widely adopted extensions may be formalized through -inclusion in zarr-python or standardized via a Zarr Enhancement Proposal (ZEP), but this is not essential. - -## Examples - -### Core `dtype` Registration - -The following example demonstrates how to register a built-in `dtype` in the core codebase: - -```python -from zarr.core.dtype import ZarrDType -from zarr.registry import register_v3dtype - -class Float16(ZarrDType): - zarr_spec_format = "3" - experimental = False - endianness = "little" - byte_count = 2 - to_numpy = np.dtype('float16') - -register_v3dtype(Float16) -``` - -### Entrypoint Extension - -The following example demonstrates how users can register a new `bfloat16` dtype for Zarr. -This approach adheres to the existing Zarr entrypoint pattern as much as possible, ensuring -consistency with other extensions. 
The code below would typically be part of a Python package -that specifies the entrypoints for the extension: - -```python -import ml_dtypes -from zarr.core.dtype import ZarrDType # User inherits from ZarrDType when creating their dtype - -class Bfloat16(ZarrDType): - zarr_spec_format = "3" - experimental = True - endianness = "little" - byte_count = 2 - to_numpy = np.dtype('bfloat16') # Enabled by importing ml_dtypes - configuration_v3 = { - "version": "example_value", - "author": "example_value", - "ml_dtypes_version": "example_value" - } -``` - -### dtype lookup - -The following examples demonstrate how to perform a lookup for the relevant ZarrDType, given -a string that matches the dtype Zarr specification ID, or a numpy dtype object: - -``` -from zarr.registry import get_v3dtype_class, get_v3dtype_class_from_numpy - -get_v3dtype_class('complex64') # returns little-endian Complex64 ZarrDType -get_v3dtype_class('not_registered_dtype') # ValueError - -get_v3dtype_class_from_numpy('>i2') # returns big-endian Int16 ZarrDType -get_v3dtype_class_from_numpy(np.dtype('float32')) # returns little-endian Float32 ZarrDType -get_v3dtype_class_from_numpy('i10') # ValueError -``` - -### String dtypes - -The following indicates one possibility for supporting variable-length strings. It is via the -entrypoint mechanism as in a previous example. 
The Apache Arrow specification does not currently -include a dtype for fixed-length strings (only for fixed-length bytes) and so I am using string -here to implicitly refer to a variable-length string data (there may be some subtleties with codecs -that means this needs to be refined further): - -```python -import numpy as np -from zarr.core.dtype import ZarrDType # User inherits from ZarrDType when creating their dtype - -try: - to_numpy = np.dtypes.StringDType() -except AttributeError: - to_numpy = np.dtypes.ObjectDType() - -class String(ZarrDType): - zarr_spec_format = "3" - experimental = True - endianness = 'little' - byte_count = None # None is defined to mean variable - to_numpy = to_numpy -``` - -### int4 dtype - -There is currently considerable interest in the AI community in 'quantising' models - storing -models at reduced precision, while minimising loss of information content. There are a number -of sub-byte dtypes that the community are using e.g. int4. Unfortunately numpy does not -currently have support for handling such sub-byte dtypes in an easy way. 
However, they can -still be held in a numpy array and then passed (in a zero-copy way) to something like pytorch -which can handle appropriately: - -```python -import numpy as np -from zarr.core.dtype import ZarrDType # User inherits from ZarrDType when creating their dtype - -class Int4(ZarrDType): - zarr_spec_format = "3" - experimental = True - endianness = 'little' - byte_count = 1 # this is ugly, but I could change this from byte_count to bit_count if there was consensus - to_numpy = np.dtype('B') # could also be np.dtype('V1'), but this would prevent bit-twiddling - configuration_v3 = { - "version": "example_value", - "author": "example_value", - } -``` -""" - -from __future__ import annotations - -from typing import Any, Literal - -import numpy as np - - -class FrozenClassVariables(type): - def __setattr__(cls, attr: str, value: object) -> None: - if hasattr(cls, attr): - raise ValueError(f"Attribute {attr} on ZarrDType class can not be changed once set.") - else: - raise AttributeError(f"'{cls}' object has no attribute '{attr}'") - - -class ZarrDType(metaclass=FrozenClassVariables): - zarr_spec_format: Literal["2", "3"] # the version of the zarr spec used - experimental: bool # is this in the core spec or not - endianness: Literal[ - "big", "little", None - ] # None indicates not defined i.e. single byte or byte strings - byte_count: int | None # None indicates variable count - to_numpy: np.dtype[Any] # may involve installing a a numpy extension e.g. 
ml_dtypes; - - configuration_v3: dict | None # TODO: understand better how this is recommended by the spec - - _zarr_spec_identifier: str # implementation detail used to map to core spec - - def __init_subclass__( # enforces all required fields are set and basic sanity checks - cls, - **kwargs, - ) -> None: - required_attrs = [ - "zarr_spec_format", - "experimental", - "endianness", - "byte_count", - "to_numpy", - ] - for attr in required_attrs: - if not hasattr(cls, attr): - raise ValueError(f"{attr} is a required attribute for a Zarr dtype.") - - if not hasattr(cls, "configuration_v3"): - cls.configuration_v3 = None - - cls._zarr_spec_identifier = ( - "big_" + cls.__qualname__.lower() - if cls.endianness == "big" - else cls.__qualname__.lower() - ) # how this dtype is identified in core spec; convention is prefix with big_ for big-endian - - cls._validate() # sanity check on basic requirements - - super().__init_subclass__(**kwargs) - - # TODO: add further checks - @classmethod - def _validate(cls): - if cls.byte_count is not None and cls.byte_count <= 0: - raise ValueError("byte_count must be a positive integer.") - - if cls.byte_count == 1 and cls.endianness is not None: - raise ValueError("Endianness must be None for single-byte types.") diff --git a/src/zarr/registry.py b/src/zarr/registry.py index 997174de77..373e118e78 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -24,7 +24,6 @@ ) from zarr.core.buffer import Buffer, NDBuffer from zarr.core.common import JSON - from zarr.core.dtype import ZarrDType from zarr.core.metadata.dtype import DTypeWrapper __all__ = [ @@ -33,14 +32,10 @@ "get_codec_class", "get_ndbuffer_class", "get_pipeline_class", - "get_v2dtype_class", - "get_v3dtype_class", "register_buffer", "register_codec", "register_ndbuffer", "register_pipeline", - "register_v2dtype", - "register_v3dtype", ] T = TypeVar("T") @@ -86,8 +81,6 @@ def get(self, key: str) -> type[DTypeWrapper]: __buffer_registry: Registry[Buffer] = Registry() 
__ndbuffer_registry: Registry[NDBuffer] = Registry() __data_type_registry = DataTypeRegistry() -__v3_dtype_registry: Registry[ZarrDType] = Registry() -__v2_dtype_registry: Registry[ZarrDType] = Registry() """ The registry module is responsible for managing implementations of codecs, @@ -124,13 +117,9 @@ def _collect_entrypoints() -> list[Registry[Any]]: __ndbuffer_registry.lazy_load_list.extend(entry_points.select(group="zarr.ndbuffer")) __ndbuffer_registry.lazy_load_list.extend(entry_points.select(group="zarr", name="ndbuffer")) - # __data_type_registry.lazy_load_list.extend(entry_points.select(group="zarr.data_type")) - # __data_type_registry.lazy_load_list.extend(entry_points.select(group="zarr", name="data_type")) + __data_type_registry.lazy_load_list.extend(entry_points.select(group="zarr.data_type")) + __data_type_registry.lazy_load_list.extend(entry_points.select(group="zarr", name="data_type")) - __v3_dtype_registry.lazy_load_list.extend(entry_points.select(group="zarr.v3dtype")) - __v3_dtype_registry.lazy_load_list.extend(entry_points.select(group="zarr", name="v3dtype")) - __v2_dtype_registry.lazy_load_list.extend(entry_points.select(group="zarr.v2dtype")) - __v2_dtype_registry.lazy_load_list.extend(entry_points.select(group="zarr", name="v2dtype")) __pipeline_registry.lazy_load_list.extend(entry_points.select(group="zarr.codec_pipeline")) __pipeline_registry.lazy_load_list.extend( entry_points.select(group="zarr", name="codec_pipeline") @@ -180,14 +169,6 @@ def register_data_type(cls: type[DTypeWrapper]) -> None: __data_type_registry.register(cls) -def register_v3dtype(cls: type[ZarrDType]) -> None: - __v3_dtype_registry.register(cls) - - -def register_v2dtype(cls: type[ZarrDType]) -> None: - __v2_dtype_registry.register(cls) - - def get_codec_class(key: str, reload_config: bool = False) -> type[Codec]: if reload_config: _reload_config() @@ -356,50 +337,4 @@ def get_data_type_from_numpy(dtype: npt.DTypeLike) -> DTypeWrapper: ) -# TODO: merge the 
get_vXdtype_class_ functions -# these can be used instead of the various parse_X functions (hopefully) -def get_v3dtype_class(dtype: str) -> type[ZarrDType]: - __v3_dtype_registry.lazy_load() - v3dtype_class = __v3_dtype_registry.get(dtype) - if v3dtype_class: - return v3dtype_class - raise ValueError( - f"ZarrDType class '{dtype}' not found in registered buffers: {list(__v3_dtype_registry)}." - ) - - -def get_v3dtype_class_from_numpy(dtype: npt.DTypeLike) -> type[ZarrDType]: - __v3_dtype_registry.lazy_load() - - dtype = np.dtype(dtype) - for val in __v3_dtype_registry.values(): - if dtype == val.to_numpy: - return val - raise ValueError( - f"numpy dtype '{dtype}' does not have a corresponding Zarr dtype in: {list(__v3_dtype_registry)}." - ) - - -def get_v2dtype_class(dtype: str) -> type[ZarrDType]: - __v2_dtype_registry.lazy_load() - v2dtype_class = __v2_dtype_registry.get(dtype) - if v2dtype_class: - return v2dtype_class - raise ValueError( - f"ZarrDType class '{dtype}' not found in registered buffers: {list(__v2_dtype_registry)}." - ) - - -def get_v2dtype_class_from_numpy(dtype: npt.DTypeLike) -> type[ZarrDType]: - __v2_dtype_registry.lazy_load() - - dtype = np.dtype(dtype) - for val in __v2_dtype_registry.values(): - if dtype == val.to_numpy: - return val - raise ValueError( - f"numpy dtype '{dtype}' does not have a corresponding Zarr dtype in: {list(__v2_dtype_registry)}." 
- ) - - _collect_entrypoints() From 703e0e16e96fb0ffb9a934b959f00d0e764be4df Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sun, 2 Mar 2025 23:31:31 +0100 Subject: [PATCH 012/130] use wrap / unwrap instead of to_dtype / from_dtype; push into v2 codebase --- src/zarr/api/asynchronous.py | 13 +++---- src/zarr/codecs/_v2.py | 6 ++-- src/zarr/codecs/bytes.py | 2 +- src/zarr/core/array.py | 63 ++++++++++++++++----------------- src/zarr/core/array_spec.py | 16 ++++++--- src/zarr/core/buffer/cpu.py | 9 +++-- src/zarr/core/chunk_grids.py | 5 ++- src/zarr/core/common.py | 13 +++---- src/zarr/core/metadata/dtype.py | 58 +++++++++++++++--------------- src/zarr/core/metadata/v2.py | 59 ++++++++++-------------------- src/zarr/core/metadata/v3.py | 8 ++--- src/zarr/registry.py | 2 +- tests/conftest.py | 5 ++- tests/test_array.py | 12 +++++-- tests/test_metadata/test_v3.py | 24 ++++++++----- 15 files changed, 147 insertions(+), 148 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 792e445c9d..d8462b72ef 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -32,6 +32,7 @@ from zarr.core.metadata import ArrayMetadataDict, ArrayV2Metadata, ArrayV3Metadata from zarr.core.metadata.v2 import _default_compressor, _default_filters from zarr.errors import NodeTypeValidationError +from zarr.registry import get_data_type_from_numpy from zarr.storage._common import make_store_path if TYPE_CHECKING: @@ -428,11 +429,12 @@ async def save_array( shape = arr.shape chunks = getattr(arr, "chunks", None) # for array-likes with chunks attribute overwrite = kwargs.pop("overwrite", None) or _infer_overwrite(mode) + zarr_dtype = get_data_type_from_numpy(arr.dtype) new = await AsyncArray._create( store_path, zarr_format=zarr_format, shape=shape, - dtype=arr.dtype, + dtype=zarr_dtype, chunks=chunks, overwrite=overwrite, **kwargs, @@ -978,15 +980,14 @@ async def create( _handle_zarr_version_or_format(zarr_version=zarr_version, 
zarr_format=zarr_format) or _default_zarr_format() ) - + dtype_wrapped = parse_dtype(dtype, zarr_format=zarr_format) if zarr_format == 2: if chunks is None: chunks = shape - dtype = parse_dtype(dtype, zarr_format=zarr_format) if not filters: - filters = _default_filters(dtype) + filters = _default_filters(dtype_wrapped) if not compressor: - compressor = _default_compressor(dtype) + compressor = _default_compressor(dtype_wrapped) elif zarr_format == 3 and chunk_shape is None: # type: ignore[redundant-expr] if chunks is not None: chunk_shape = chunks @@ -1051,7 +1052,7 @@ async def create( store_path, shape=shape, chunks=chunks, - dtype=dtype, + dtype=dtype_wrapped, compressor=compressor, fill_value=fill_value, overwrite=overwrite, diff --git a/src/zarr/codecs/_v2.py b/src/zarr/codecs/_v2.py index 53edc1f4a1..e2f228f509 100644 --- a/src/zarr/codecs/_v2.py +++ b/src/zarr/codecs/_v2.py @@ -48,7 +48,7 @@ async def _decode_single( # segfaults and other bad things happening if chunk_spec.dtype != object: try: - chunk = chunk.view(chunk_spec.dtype) + chunk = chunk.view(chunk_spec.dtype.unwrap()) except TypeError: # this will happen if the dtype of the chunk # does not match the dtype of the array spec i.g. if @@ -56,7 +56,7 @@ async def _decode_single( # is an object array. In this case, we need to convert the object # array to the correct dtype. - chunk = np.array(chunk).astype(chunk_spec.dtype) + chunk = np.array(chunk).astype(chunk_spec.dtype.unwrap()) elif chunk.dtype != object: # If we end up here, someone must have hacked around with the filters. 
@@ -80,7 +80,7 @@ async def _encode_single( chunk = chunk_array.as_ndarray_like() # ensure contiguous and correct order - chunk = chunk.astype(chunk_spec.dtype, order=chunk_spec.order, copy=False) + chunk = chunk.astype(chunk_spec.dtype.unwrap(), order=chunk_spec.order, copy=False) # apply filters if self.filters: diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py index 78c7b22fbc..4875d8e8d8 100644 --- a/src/zarr/codecs/bytes.py +++ b/src/zarr/codecs/bytes.py @@ -56,7 +56,7 @@ def to_dict(self) -> dict[str, JSON]: return {"name": "bytes", "configuration": {"endian": self.endian.value}} def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: - if array_spec.dtype.itemsize == 0: + if array_spec.dtype.unwrap().itemsize == 0: if self.endian is not None: return replace(self, endian=None) elif self.endian is None: diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 7edd467a54..3b1e6a973f 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -98,7 +98,7 @@ ArrayV3MetadataDict, T_ArrayMetadata, ) -from zarr.core.metadata.dtype import DTypeWrapper +from zarr.core.metadata.dtype import DTypeWrapper, VariableLengthString from zarr.core.metadata.v2 import ( _default_compressor, _default_filters, @@ -549,7 +549,7 @@ async def _create( *, # v2 and v3 shape: ShapeLike, - dtype: npt.DTypeLike, + dtype: npt.DTypeLike[Any], zarr_format: ZarrFormat = 3, fill_value: Any | None = None, attributes: dict[str, JSON] | None = None, @@ -578,18 +578,22 @@ async def _create( See :func:`AsyncArray.create` for more details. Deprecated in favor of :func:`zarr.api.asynchronous.create_array`. 
""" + # TODO: delete this and be more strict about where parsing occurs + if not isinstance(dtype, DTypeWrapper): + dtype_parsed = get_data_type_from_numpy(np.dtype(dtype)) + else: + dtype_parsed = dtype store_path = await make_store_path(store) - dtype_parsed = parse_dtype(dtype, zarr_format=zarr_format) shape = parse_shapelike(shape) if chunks is not None and chunk_shape is not None: raise ValueError("Only one of chunk_shape or chunks can be provided.") if chunks: - _chunks = normalize_chunks(chunks, shape, dtype_parsed.itemsize) + _chunks = normalize_chunks(chunks, shape, dtype_parsed.unwrap().itemsize) else: - _chunks = normalize_chunks(chunk_shape, shape, dtype_parsed.itemsize) + _chunks = normalize_chunks(chunk_shape, shape, dtype_parsed.unwrap().itemsize) config_parsed = parse_array_config(config) result: AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata] @@ -666,7 +670,7 @@ async def _create( @staticmethod def _create_metadata_v3( shape: ShapeLike, - dtype: np.dtype[Any], + dtype: DTypeWrapper[Any, Any], chunk_shape: ChunkCoords, fill_value: Any | None = None, chunk_key_encoding: ChunkKeyEncodingLike | None = None, @@ -694,19 +698,16 @@ def _create_metadata_v3( stacklevel=2, ) - # resolve the numpy dtype into zarr v3 datatype - zarr_data_type = get_data_type_from_numpy(dtype) - if fill_value is None: # v3 spec will not allow a null fill value - fill_value_parsed = zarr_data_type.default_value + fill_value_parsed = dtype.default_value else: fill_value_parsed = fill_value chunk_grid_parsed = RegularChunkGrid(chunk_shape=chunk_shape) return ArrayV3Metadata( shape=shape, - data_type=zarr_data_type, + data_type=dtype, chunk_grid=chunk_grid_parsed, chunk_key_encoding=chunk_key_encoding_parsed, fill_value=fill_value_parsed, @@ -769,7 +770,7 @@ async def _create_v3( @staticmethod def _create_metadata_v2( shape: ChunkCoords, - dtype: np.dtype[Any], + dtype: DTypeWrapper[Any, Any], chunks: ChunkCoords, order: MemoryOrder, dimension_separator: Literal[".", "/"] 
| None = None, @@ -781,10 +782,8 @@ def _create_metadata_v2( if dimension_separator is None: dimension_separator = "." - dtype = parse_dtype(dtype, zarr_format=2) - # inject VLenUTF8 for str dtype if not already present - if np.issubdtype(dtype, np.str_): + if isinstance(dtype, VariableLengthString): filters = filters or [] from numcodecs.vlen import VLenUTF8 @@ -793,7 +792,7 @@ def _create_metadata_v2( return ArrayV2Metadata( shape=shape, - dtype=np.dtype(dtype), + dtype=dtype, chunks=chunks, order=order, dimension_separator=dimension_separator, @@ -2046,7 +2045,7 @@ def dtype(self) -> np.dtype[Any]: np.dtype The NumPy data type. """ - return self._async_array.dtype + return self._async_array.dtype.unwrap() @property def attrs(self) -> Attributes: @@ -3919,7 +3918,7 @@ async def init_array( from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation - dtype_parsed = parse_dtype(dtype, zarr_format=zarr_format) + dtype_wrapped = parse_dtype(dtype, zarr_format=zarr_format) shape_parsed = parse_shapelike(shape) chunk_key_encoding_parsed = _parse_chunk_key_encoding( chunk_key_encoding, zarr_format=zarr_format @@ -3934,7 +3933,10 @@ async def init_array( await ensure_no_existing_node(store_path, zarr_format=zarr_format) shard_shape_parsed, chunk_shape_parsed = _auto_partition( - array_shape=shape_parsed, shard_shape=shards, chunk_shape=chunks, dtype=dtype_parsed + array_shape=shape_parsed, + shard_shape=shards, + chunk_shape=chunks, + item_size=dtype_wrapped.unwrap().itemsize, ) chunks_out: tuple[int, ...] 
meta: ArrayV2Metadata | ArrayV3Metadata @@ -3950,9 +3952,8 @@ async def init_array( raise ValueError("Zarr format 2 arrays do not support `serializer`.") filters_parsed, compressor_parsed = _parse_chunk_encoding_v2( - compressor=compressors, filters=filters, dtype=np.dtype(dtype) + compressor=compressors, filters=filters, dtype=dtype_wrapped ) - if dimension_names is not None: raise ValueError("Zarr format 2 arrays do not support dimension names.") if order is None: @@ -3962,7 +3963,7 @@ async def init_array( meta = AsyncArray._create_metadata_v2( shape=shape_parsed, - dtype=dtype_parsed, + dtype=dtype_wrapped, chunks=chunk_shape_parsed, dimension_separator=chunk_key_encoding_parsed.separator, fill_value=fill_value, @@ -3976,7 +3977,7 @@ async def init_array( compressors=compressors, filters=filters, serializer=serializer, - dtype=dtype_parsed, + dtype=dtype_wrapped, ) sub_codecs = cast(tuple[Codec, ...], (*array_array, array_bytes, *bytes_bytes)) codecs_out: tuple[Codec, ...] @@ -3991,7 +3992,7 @@ async def init_array( ) sharding_codec.validate( shape=chunk_shape_parsed, - dtype=dtype_parsed, + dtype=dtype_wrapped, chunk_grid=RegularChunkGrid(chunk_shape=shard_shape_parsed), ) codecs_out = (sharding_codec,) @@ -4002,7 +4003,7 @@ async def init_array( meta = AsyncArray._create_metadata_v3( shape=shape_parsed, - dtype=dtype_parsed, + dtype=dtype_wrapped, fill_value=fill_value, chunk_shape=chunks_out, chunk_key_encoding=chunk_key_encoding_parsed, @@ -4210,12 +4211,11 @@ def _parse_chunk_key_encoding( def _get_default_chunk_encoding_v3( - np_dtype: np.dtype[Any], + dtype: DTypeWrapper[Any, Any], ) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]: """ Get the default ArrayArrayCodecs, ArrayBytesCodec, and BytesBytesCodec for a given dtype. 
""" - dtype = get_data_type_from_numpy(np_dtype) default_filters = zarr_config.get("array.v3_default_filters").get(dtype.kind) default_serializer = zarr_config.get("array.v3_default_serializer").get(dtype.kind) @@ -4229,14 +4229,14 @@ def _get_default_chunk_encoding_v3( def _get_default_chunk_encoding_v2( - np_dtype: np.dtype[Any], + dtype: DTypeWrapper[Any, Any], ) -> tuple[tuple[numcodecs.abc.Codec, ...] | None, numcodecs.abc.Codec | None]: """ Get the default chunk encoding for Zarr format 2 arrays, given a dtype """ - compressor_dict = _default_compressor(np_dtype) - filter_dicts = _default_filters(np_dtype) + compressor_dict = _default_compressor(dtype) + filter_dicts = _default_filters(dtype) compressor = None if compressor_dict is not None: @@ -4253,13 +4253,12 @@ def _parse_chunk_encoding_v2( *, compressor: CompressorsLike, filters: FiltersLike, - dtype: np.dtype[Any], + dtype: DTypeWrapper[Any, Any], ) -> tuple[tuple[numcodecs.abc.Codec, ...] | None, numcodecs.abc.Codec | None]: """ Generate chunk encoding classes for Zarr format 2 arrays with optional defaults. """ default_filters, default_compressor = _get_default_chunk_encoding_v2(dtype) - _filters: tuple[numcodecs.abc.Codec, ...] 
| None _compressor: numcodecs.abc.Codec | None diff --git a/src/zarr/core/array_spec.py b/src/zarr/core/array_spec.py index 59d3cc6b40..cf92f11050 100644 --- a/src/zarr/core/array_spec.py +++ b/src/zarr/core/array_spec.py @@ -3,8 +3,6 @@ from dataclasses import dataclass, fields from typing import TYPE_CHECKING, Any, Literal, Self, TypedDict, cast -import numpy as np - from zarr.core.common import ( MemoryOrder, parse_bool, @@ -13,10 +11,14 @@ parse_shapelike, ) from zarr.core.config import config as zarr_config +from zarr.core.metadata.dtype import DTypeWrapper +from zarr.registry import get_data_type_from_numpy if TYPE_CHECKING: from typing import NotRequired + import numpy.typing as npt + from zarr.core.buffer import BufferPrototype from zarr.core.common import ChunkCoords @@ -90,7 +92,7 @@ def parse_array_config(data: ArrayConfigLike | None) -> ArrayConfig: @dataclass(frozen=True) class ArraySpec: shape: ChunkCoords - dtype: np.dtype[Any] + dtype: DTypeWrapper[Any, Any] fill_value: Any config: ArrayConfig prototype: BufferPrototype @@ -98,13 +100,17 @@ class ArraySpec: def __init__( self, shape: ChunkCoords, - dtype: np.dtype[Any], + dtype: npt.DtypeLike | DTypeWrapper[Any, Any], fill_value: Any, config: ArrayConfig, prototype: BufferPrototype, ) -> None: shape_parsed = parse_shapelike(shape) - dtype_parsed = np.dtype(dtype) + if not isinstance(dtype, DTypeWrapper): + dtype_parsed = get_data_type_from_numpy(dtype) + else: + dtype_parsed = dtype + fill_value_parsed = parse_fill_value(fill_value) object.__setattr__(self, "shape", shape_parsed) diff --git a/src/zarr/core/buffer/cpu.py b/src/zarr/core/buffer/cpu.py index 225adb6f5c..b83f710747 100644 --- a/src/zarr/core/buffer/cpu.py +++ b/src/zarr/core/buffer/cpu.py @@ -10,6 +10,7 @@ import numpy.typing as npt from zarr.core.buffer import core +from zarr.core.metadata.dtype import DTypeWrapper from zarr.registry import ( register_buffer, register_ndbuffer, @@ -150,14 +151,18 @@ def create( cls, *, shape: 
Iterable[int], - dtype: npt.DTypeLike, + dtype: DTypeWrapper[Any, Any], order: Literal["C", "F"] = "C", fill_value: Any | None = None, ) -> Self: if fill_value is None: return cls(np.zeros(shape=tuple(shape), dtype=dtype, order=order)) else: - return cls(np.full(shape=tuple(shape), fill_value=fill_value, dtype=dtype, order=order)) + return cls( + np.full( + shape=tuple(shape), fill_value=fill_value, dtype=dtype.unwrap(), order=order + ) + ) @classmethod def from_numpy_array(cls, array_like: npt.ArrayLike) -> Self: diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index d3e40c26ed..74bf9b6ba8 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -63,7 +63,7 @@ def _guess_chunks( """ if isinstance(shape, int): shape = (shape,) - + typesize = max(typesize, 8) ndims = len(shape) # require chunks to have non-zero length for all dimensions chunks = np.maximum(np.array(shape, dtype="=f8"), 1) @@ -204,7 +204,7 @@ def _auto_partition( array_shape: tuple[int, ...], chunk_shape: tuple[int, ...] | Literal["auto"], shard_shape: ShardsLike | None, - dtype: np.dtype[Any], + item_size: int, ) -> tuple[tuple[int, ...] | None, tuple[int, ...]]: """ Automatically determine the shard shape and chunk shape for an array, given the shape and dtype of the array. @@ -214,7 +214,6 @@ def _auto_partition( of the array; if the `chunk_shape` is also "auto", then the chunks will be set heuristically as well, given the dtype and shard shape. Otherwise, the chunks will be returned as-is. """ - item_size = dtype.itemsize if shard_shape is None: _shards_out: None | tuple[int, ...] 
= None if chunk_shape == "auto": diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index e398eff406..85dadc2b53 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -19,7 +19,6 @@ import numpy as np from zarr.core.config import config as zarr_config -from zarr.core.strings import _VLEN_STRING_DTYPE if TYPE_CHECKING: from collections.abc import Awaitable, Callable, Iterator @@ -167,14 +166,10 @@ def parse_bool(data: Any) -> bool: raise ValueError(f"Expected bool, got {data} instead.") -def parse_dtype(dtype: Any, zarr_format: ZarrFormat) -> np.dtype[Any]: - if dtype is str or dtype == "str": - if zarr_format == 2: - # special case as object - return np.dtype("object") - else: - return _VLEN_STRING_DTYPE - return np.dtype(dtype) +def parse_dtype(dtype: Any, zarr_format: ZarrFormat) -> DTypeWrapper[Any, Any]: + from zarr.registry import get_data_type_from_numpy + + return get_data_type_from_numpy(np.dtype(dtype)) def _warn_write_empty_chunks_kwarg() -> None: diff --git a/src/zarr/core/metadata/dtype.py b/src/zarr/core/metadata/dtype.py index 1b57831943..5d382076b4 100644 --- a/src/zarr/core/metadata/dtype.py +++ b/src/zarr/core/metadata/dtype.py @@ -216,14 +216,14 @@ def to_dict(self) -> dict[str, JSON]: return {"name": self.name} def cast_value(self: Self, value: object, *, endianness: Endianness | None = None) -> TScalar: - return cast(np.generic, self.to_dtype(endianness=endianness).type(value)) + return cast(np.generic, self.unwrap(endianness=endianness).type(value)) @classmethod @abstractmethod - def from_dtype(cls: type[Self], dtype: TDType) -> Self: + def wrap(cls: type[Self], dtype: TDType) -> Self: raise NotImplementedError - def to_dtype(self: Self, *, endianness: Endianness | None = None) -> TDType: + def unwrap(self: Self, *, endianness: Endianness | None = None) -> TDType: endian_str = endianness_to_numpy_str(endianness) return self.dtype_cls().newbyteorder(endian_str) @@ -251,7 +251,7 @@ class 
Bool(DTypeWrapper[np.dtypes.BoolDType, np.bool_]): default_value = np.False_ @classmethod - def from_dtype(cls, dtype: np.dtypes.BoolDType) -> Self: + def wrap(cls, dtype: np.dtypes.BoolDType) -> Self: return cls() def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> bool: @@ -261,7 +261,7 @@ def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.bool_: if check_json_bool(data): - return self.to_dtype(endianness=endianness).type(data) + return self.unwrap(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. Expected a boolean.") @@ -269,7 +269,7 @@ class IntWrapperBase(DTypeWrapper[TDType, TScalar]): kind = "numeric" @classmethod - def from_dtype(cls, dtype: TDType) -> Self: + def wrap(cls, dtype: TDType) -> Self: return cls() def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: @@ -279,7 +279,7 @@ def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> TScalar: if check_json_int(data): - return self.to_dtype(endianness=endianness).type(data) + return self.unwrap(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") @@ -335,7 +335,7 @@ class FloatWrapperBase(DTypeWrapper[TDType, TScalar]): kind = "numeric" @classmethod - def from_dtype(cls, dtype: TDType) -> Self: + def wrap(cls, dtype: TDType) -> Self: return cls() def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> JSONFloat: @@ -345,7 +345,7 @@ def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> TScalar: if check_json_float_v2(data): - return self.to_dtype(endianness=endianness).type(float_from_json(data, zarr_format)) + return self.unwrap(endianness=endianness).type(float_from_json(data, zarr_format)) raise TypeError(f"Invalid type: {data}. 
Expected a float.") @@ -374,7 +374,7 @@ class Complex64(DTypeWrapper[np.dtypes.Complex64DType, np.complex64]): default_value = np.complex64(0) @classmethod - def from_dtype(cls, dtype: np.dtypes.Complex64DType) -> Self: + def wrap(cls, dtype: np.dtypes.Complex64DType) -> Self: return cls() def to_json_value( @@ -387,7 +387,7 @@ def from_json_value( ) -> np.complex64: if check_json_complex_float_v3(data): return complex_from_json( - data, dtype=self.to_dtype(endianness=endianness), zarr_format=zarr_format + data, dtype=self.unwrap(endianness=endianness), zarr_format=zarr_format ) raise TypeError(f"Invalid type: {data}. Expected a complex float.") @@ -399,7 +399,7 @@ class Complex128(DTypeWrapper[np.dtypes.Complex128DType, np.complex128]): default_value = np.complex128(0) @classmethod - def from_dtype(cls, dtype: np.dtypes.Complex128DType) -> Self: + def wrap(cls, dtype: np.dtypes.Complex128DType) -> Self: return cls() def to_json_value( @@ -412,7 +412,7 @@ def from_json_value( ) -> np.complex128: if check_json_complex_float_v3(data): return complex_from_json( - data, dtype=self.to_dtype(endianness=endianness), zarr_format=zarr_format + data, dtype=self.unwrap(endianness=endianness), zarr_format=zarr_format ) raise TypeError(f"Invalid type: {data}. 
Expected a complex float.") @@ -423,10 +423,10 @@ class FlexibleWrapperBase(DTypeWrapper[TDType, TScalar]): length: int @classmethod - def from_dtype(cls, dtype: TDType) -> Self: + def wrap(cls, dtype: TDType) -> Self: return cls(length=dtype.itemsize // (cls.item_size_bits // 8)) - def to_dtype(self, endianness: Endianness | None = None) -> TDType: + def unwrap(self, endianness: Endianness | None = None) -> TDType: endianness_code = endianness_to_numpy_str(endianness) return self.dtype_cls(self.length).newbyteorder(endianness_code) @@ -450,7 +450,7 @@ def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.bytes_: if check_json_bool(data): - return self.to_dtype(endianness=endianness).type(data.encode("ascii")) + return self.unwrap(endianness=endianness).type(data.encode("ascii")) raise TypeError(f"Invalid type: {data}. Expected a string.") @@ -464,7 +464,7 @@ class StaticRawBytes(FlexibleWrapperBase[np.dtypes.VoidDType, np.void]): def to_dict(self) -> dict[str, JSON]: return {"name": f"r{self.length * self.item_size_bits}"} - def to_dtype(self, endianness: Endianness | None = None) -> np.dtypes.VoidDType: + def unwrap(self, endianness: Endianness | None = None) -> np.dtypes.VoidDType: # this needs to be overridden because numpy does not allow creating a void type # by invoking np.dtypes.VoidDType directly endianness_code = endianness_to_numpy_str(endianness) @@ -477,7 +477,7 @@ def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.void: # todo: check that this is well-formed - return self.to_dtype(endianness=endianness).type(bytes(data)) + return self.unwrap(endianness=endianness).type(bytes(data)) @dataclass(frozen=True, kw_only=True) @@ -498,25 +498,25 @@ def from_json_value( ) -> np.str_: if not check_json_str(data): raise TypeError(f"Invalid type: {data}. 
Expected a string.") - return self.to_dtype(endianness=endianness).type(data) + return self.unwrap(endianness=endianness).type(data) if _NUMPY_SUPPORTS_VLEN_STRING: @dataclass(frozen=True, kw_only=True) - class VlenString(DTypeWrapper[np.dtypes.StringDType, str]): + class VariableLengthString(DTypeWrapper[np.dtypes.StringDType, str]): name = "numpy/vlen_string" kind = "string" default_value = "" @classmethod - def from_dtype(cls, dtype: np.dtypes.StringDType) -> Self: + def wrap(cls, dtype: np.dtypes.StringDType) -> Self: return cls() def to_dict(self) -> dict[str, JSON]: return {"name": self.name} - def to_dtype(self, endianness: Endianness | None = None) -> np.dtypes.StringDType: + def unwrap(self, endianness: Endianness | None = None) -> np.dtypes.StringDType: endianness_code = endianness_to_numpy_str(endianness) return np.dtype(endianness_code + self.numpy_character_code) @@ -526,12 +526,12 @@ def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> str: - return self.to_dtype(endianness=endianness).type(data) + return self.unwrap(endianness=endianness).type(data) else: @dataclass(frozen=True, kw_only=True) - class VlenString(DTypeWrapper[np.dtypes.ObjectDType, str]): + class VariableLengthString(DTypeWrapper[np.dtypes.ObjectDType, str]): name = "numpy/vlen_string" kind = "string" default_value = np.object_("") @@ -540,11 +540,11 @@ def to_dict(self) -> dict[str, JSON]: return {"name": self.name} @classmethod - def from_dtype(cls, dtype: np.dtypes.ObjectDType) -> Self: + def wrap(cls, dtype: np.dtypes.ObjectDType) -> Self: return cls() - def to_dtype(self, endianness: Endianness | None = None) -> np.dtype[np.dtypes.ObjectDType]: - return super().to_dtype(endianness=endianness) + def unwrap(self, endianness: Endianness | None = None) -> np.dtype[np.dtypes.ObjectDType]: + return super().unwrap(endianness=endianness) def 
to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: return str(data) @@ -552,7 +552,7 @@ def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> str: - return self.to_dtype(endianness=endianness).type(data) + return self.unwrap(endianness=endianness).type(data) def resolve_dtype(dtype: npt.DTypeLike | DTypeWrapper | dict[str, JSON]) -> DTypeWrapper: @@ -569,7 +569,7 @@ def resolve_dtype(dtype: npt.DTypeLike | DTypeWrapper | dict[str, JSON]) -> DTyp INTEGER_DTYPE = Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64 FLOAT_DTYPE = Float16 | Float32 | Float64 COMPLEX_DTYPE = Complex64 | Complex128 -STRING_DTYPE = StaticUnicodeString | VlenString | StaticByteString +STRING_DTYPE = StaticUnicodeString | VariableLengthString | StaticByteString for dtype in get_args( Bool | INTEGER_DTYPE | FLOAT_DTYPE | COMPLEX_DTYPE | STRING_DTYPE | StaticRawBytes ): diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 823944e067..8012aac02d 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -10,6 +10,8 @@ import numcodecs.abc from zarr.abc.metadata import Metadata +from zarr.core.metadata.dtype import DTypeWrapper +from zarr.registry import get_data_type_from_numpy if TYPE_CHECKING: from typing import Any, Literal, Self @@ -46,7 +48,7 @@ class ArrayV2MetadataDict(TypedDict): class ArrayV2Metadata(Metadata): shape: ChunkCoords chunks: ChunkCoords - dtype: np.dtype[Any] + dtype: DTypeWrapper[Any, Any] fill_value: int | float | str | bytes | None = 0 order: MemoryOrder = "C" filters: tuple[numcodecs.abc.Codec, ...] | None = None @@ -59,7 +61,7 @@ def __init__( self, *, shape: ChunkCoords, - dtype: npt.DTypeLike, + dtype: DTypeWrapper[Any, Any], chunks: ChunkCoords, fill_value: Any, order: MemoryOrder, @@ -72,18 +74,17 @@ def __init__( Metadata for a Zarr format 2 array. 
""" shape_parsed = parse_shapelike(shape) - dtype_parsed = parse_dtype(dtype) chunks_parsed = parse_shapelike(chunks) compressor_parsed = parse_compressor(compressor) order_parsed = parse_indexing_order(order) dimension_separator_parsed = parse_separator(dimension_separator) filters_parsed = parse_filters(filters) - fill_value_parsed = parse_fill_value(fill_value, dtype=dtype_parsed) + fill_value_parsed = parse_fill_value(fill_value, dtype=dtype.unwrap()) attributes_parsed = parse_attributes(attributes) object.__setattr__(self, "shape", shape_parsed) - object.__setattr__(self, "dtype", dtype_parsed) + object.__setattr__(self, "dtype", dtype) object.__setattr__(self, "chunks", chunks_parsed) object.__setattr__(self, "compressor", compressor_parsed) object.__setattr__(self, "order", order_parsed) @@ -163,9 +164,9 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: _data = data.copy() # check that the zarr_format attribute is correct _ = parse_zarr_format(_data.pop("zarr_format")) - dtype = parse_dtype(_data["dtype"]) - - if dtype.kind in "SV": + dtype = get_data_type_from_numpy(parse_dtype(_data["dtype"])) + _data["dtype"] = dtype + if dtype.unwrap().kind in "SV": fill_value_encoded = _data.get("fill_value") if fill_value_encoded is not None: fill_value = base64.standard_b64decode(fill_value_encoded) @@ -205,12 +206,13 @@ def to_dict(self) -> dict[str, JSON]: _ = zarray_dict.pop("dtype") dtype_json: JSON + # TODO: Replace this with per-dtype method # In the case of zarr v2, the simplest i.e., '|VXX' dtype is represented as a string - dtype_descr = self.dtype.descr - if self.dtype.kind == "V" and dtype_descr[0][0] != "" and len(dtype_descr) != 0: - dtype_json = tuple(self.dtype.descr) + dtype_descr = self.dtype.unwrap().descr + if self.dtype.unwrap().kind == "V" and dtype_descr[0][0] != "" and len(dtype_descr) != 0: + dtype_json = tuple(self.dtype.unwrap().descr) else: - dtype_json = self.dtype.str + dtype_json = self.dtype.unwrap().str 
zarray_dict["dtype"] = dtype_json return zarray_dict @@ -377,42 +379,19 @@ def _default_fill_value(dtype: np.dtype[Any]) -> Any: def _default_compressor( - dtype: np.dtype[Any], + dtype: DTypeWrapper[Any, Any], ) -> dict[str, JSON] | None: """Get the default filters and compressor for a dtype. https://numpy.org/doc/2.1/reference/generated/numpy.dtype.kind.html """ default_compressor = config.get("array.v2_default_compressor") - if dtype.kind in "biufcmM": - dtype_key = "numeric" - elif dtype.kind in "U": - dtype_key = "string" - elif dtype.kind in "OSV": - dtype_key = "bytes" - else: - raise ValueError(f"Unsupported dtype kind {dtype.kind}") - - return cast(dict[str, JSON] | None, default_compressor.get(dtype_key, None)) + return cast(dict[str, JSON] | None, default_compressor.get(dtype.kind, None)) def _default_filters( - dtype: np.dtype[Any], + dtype: DTypeWrapper, ) -> list[dict[str, JSON]] | None: - """Get the default filters and compressor for a dtype. - - https://numpy.org/doc/2.1/reference/generated/numpy.dtype.kind.html - """ + """Get the default filters and compressor for a dtype.""" default_filters = config.get("array.v2_default_filters") - if dtype.kind in "biufcmM": - dtype_key = "numeric" - elif dtype.kind in "U": - dtype_key = "string" - elif dtype.kind in "OS": - dtype_key = "bytes" - elif dtype.kind == "V": - dtype_key = "raw" - else: - raise ValueError(f"Unsupported dtype kind {dtype.kind}") - - return cast(list[dict[str, JSON]] | None, default_filters.get(dtype_key, None)) + return cast(list[dict[str, JSON]] | None, default_filters.get(dtype.kind, None)) diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 3966b0d72c..f70cbb3cf2 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -266,14 +266,14 @@ def __init__( chunk_grid_parsed = ChunkGrid.from_dict(chunk_grid) chunk_key_encoding_parsed = ChunkKeyEncoding.from_dict(chunk_key_encoding) dimension_names_parsed = 
parse_dimension_names(dimension_names) - fill_value_parsed = data_type.to_dtype().type(fill_value) + fill_value_parsed = data_type.unwrap().type(fill_value) attributes_parsed = parse_attributes(attributes) codecs_parsed_partial = parse_codecs(codecs) storage_transformers_parsed = parse_storage_transformers(storage_transformers) array_spec = ArraySpec( shape=shape_parsed, - dtype=data_type.to_dtype(), + dtype=data_type.unwrap(), fill_value=fill_value_parsed, config=ArrayConfig.from_dict({}), # TODO: config is not needed here. prototype=default_buffer_prototype(), # TODO: prototype is not needed here. @@ -308,13 +308,13 @@ def _validate_metadata(self) -> None: raise ValueError("`fill_value` is required.") for codec in self.codecs: codec.validate( - shape=self.shape, dtype=self.data_type.to_dtype(), chunk_grid=self.chunk_grid + shape=self.shape, dtype=self.data_type.unwrap(), chunk_grid=self.chunk_grid ) @property def dtype(self) -> np.dtype[Any]: """Interpret Zarr dtype as NumPy dtype""" - return self.data_type.to_dtype() + return self.data_type.unwrap() @property def ndim(self) -> int: diff --git a/src/zarr/registry.py b/src/zarr/registry.py index 373e118e78..7760c599fd 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -331,7 +331,7 @@ def get_data_type_from_numpy(dtype: npt.DTypeLike) -> DTypeWrapper: __data_type_registry.lazy_load() for val in __data_type_registry.contents.values(): if val.dtype_cls is type(np_dtype): - return val.from_dtype(np_dtype) + return val.wrap(np_dtype) raise ValueError( f"numpy dtype '{dtype}' does not have a corresponding Zarr dtype in: {list(__data_type_registry.contents)}." 
) diff --git a/tests/conftest.py b/tests/conftest.py index 04034cb5b8..fb7b7977a7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -259,7 +259,10 @@ def create_array_metadata( ) shard_shape_parsed, chunk_shape_parsed = _auto_partition( - array_shape=shape_parsed, shard_shape=shards, chunk_shape=chunks, dtype=dtype_parsed + array_shape=shape_parsed, + shard_shape=shards, + chunk_shape=chunks, + dtype=dtype_parsed.unwrap().itemsize, ) if order is None: diff --git a/tests/test_array.py b/tests/test_array.py index ce149d0f9a..959cf02055 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -927,7 +927,10 @@ def test_auto_partition_auto_shards( expected_shards += (cs,) auto_shards, _ = _auto_partition( - array_shape=array_shape, chunk_shape=chunk_shape, shard_shape="auto", dtype=dtype + array_shape=array_shape, + chunk_shape=chunk_shape, + shard_shape="auto", + item_size=dtype.itemsize, ) assert auto_shards == expected_shards @@ -1079,7 +1082,10 @@ async def test_v3_chunk_encoding( compressors=compressors, ) filters_expected, _, compressors_expected = _parse_chunk_encoding_v3( - filters=filters, compressors=compressors, serializer="auto", dtype=np.dtype(dtype) + filters=filters, + compressors=compressors, + serializer="auto", + dtype=arr.metadata.data_type, ) assert arr.filters == filters_expected assert arr.compressors == compressors_expected @@ -1145,7 +1151,7 @@ async def test_default_filters_compressors( elif zarr_format == 2: default_filters, default_compressors = _get_default_chunk_encoding_v2( - np_dtype=np.dtype(dtype) + dtype=np.dtype(dtype) ) if default_filters is None: expected_filters = () diff --git a/tests/test_metadata/test_v3.py b/tests/test_metadata/test_v3.py index b5ca92c568..54e077f1a6 100644 --- a/tests/test_metadata/test_v3.py +++ b/tests/test_metadata/test_v3.py @@ -12,7 +12,7 @@ from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding, V2ChunkKeyEncoding from zarr.core.config import config from zarr.core.group import 
GroupMetadata, parse_node_type -from zarr.core.metadata.dtype import FlexibleWrapperBase, complex_from_json +from zarr.core.metadata.dtype import complex_from_json from zarr.core.metadata.v3 import ( ArrayV3Metadata, parse_dimension_names, @@ -54,13 +54,20 @@ ) complex_dtypes = ("complex64", "complex128") -flexible_dtypes = ("str", "bytes", 'void') +flexible_dtypes = ("str", "bytes", "void") if _NUMPY_SUPPORTS_VLEN_STRING: - vlen_string_dtypes = ("T","O") + vlen_string_dtypes = ("T", "O") else: - vlen_string_dtypes = ("O") - -dtypes = (*bool_dtypes, *int_dtypes, *float_dtypes, *complex_dtypes, *flexible_dtypes, *vlen_string_dtypes) + vlen_string_dtypes = "O" + +dtypes = ( + *bool_dtypes, + *int_dtypes, + *float_dtypes, + *complex_dtypes, + *flexible_dtypes, + *vlen_string_dtypes, +) @pytest.mark.parametrize("data", [None, 1, 2, 4, 5, "3"]) @@ -121,7 +128,7 @@ def test_jsonify_fill_value_complex(fill_value: Any, dtype_str: str) -> None: """ zarr_format = 3 dtype = get_data_type_from_numpy(dtype_str) - expected = dtype.to_dtype().type(complex(*fill_value)) + expected = dtype.unwrap().type(complex(*fill_value)) observed = dtype.from_json_value(fill_value, zarr_format=zarr_format) assert observed == expected assert dtype.to_json_value(observed, zarr_format=zarr_format) == tuple(fill_value) @@ -330,7 +337,6 @@ async def test_special_float_fill_values(fill_value: str) -> None: @pytest.mark.parametrize("dtype_str", dtypes) def test_dtypes(dtype_str: str) -> None: dt = get_data_type_from_numpy(dtype_str) - np_dtype = dt.to_dtype() + np_dtype = dt.unwrap() assert isinstance(np_dtype, dt.dtype_cls) assert np_dtype.type(0) == dt.cast_value(0) - From 3c232a406264716fd14cb6a1dab9a91fc6a22632 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 3 Mar 2025 10:44:34 +0100 Subject: [PATCH 013/130] push into v2 --- src/zarr/api/asynchronous.py | 3 +-- src/zarr/core/array.py | 5 ++--- src/zarr/core/buffer/cpu.py | 4 ++-- src/zarr/core/common.py | 6 ------ 
src/zarr/core/metadata/v2.py | 4 +++- src/zarr/core/metadata/v3.py | 10 ++++++---- tests/conftest.py | 5 +++-- 7 files changed, 17 insertions(+), 20 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index d8462b72ef..f1131003fc 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -21,7 +21,6 @@ _default_zarr_format, _warn_order_kwarg, _warn_write_empty_chunks_kwarg, - parse_dtype, ) from zarr.core.group import ( AsyncGroup, @@ -980,7 +979,7 @@ async def create( _handle_zarr_version_or_format(zarr_version=zarr_version, zarr_format=zarr_format) or _default_zarr_format() ) - dtype_wrapped = parse_dtype(dtype, zarr_format=zarr_format) + dtype_wrapped = get_data_type_from_numpy(dtype) if zarr_format == 2: if chunks is None: chunks = shape diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 3b1e6a973f..9abb330d59 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -58,7 +58,6 @@ _default_zarr_format, _warn_order_kwarg, concurrent_map, - parse_dtype, parse_order, parse_shapelike, product, @@ -1034,7 +1033,7 @@ def compressors(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[BytesBytesCodec ) @property - def dtype(self) -> np.dtype[Any]: + def dtype(self) -> DTypeWrapper[Any, Any]: """Returns the data type of the array. 
Returns @@ -3918,7 +3917,7 @@ async def init_array( from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation - dtype_wrapped = parse_dtype(dtype, zarr_format=zarr_format) + dtype_wrapped = get_data_type_from_numpy(dtype) shape_parsed = parse_shapelike(shape) chunk_key_encoding_parsed = _parse_chunk_key_encoding( chunk_key_encoding, zarr_format=zarr_format diff --git a/src/zarr/core/buffer/cpu.py b/src/zarr/core/buffer/cpu.py index b83f710747..00444a6f76 100644 --- a/src/zarr/core/buffer/cpu.py +++ b/src/zarr/core/buffer/cpu.py @@ -151,7 +151,7 @@ def create( cls, *, shape: Iterable[int], - dtype: DTypeWrapper[Any, Any], + dtype: np.dtype[Any], order: Literal["C", "F"] = "C", fill_value: Any | None = None, ) -> Self: @@ -160,7 +160,7 @@ def create( else: return cls( np.full( - shape=tuple(shape), fill_value=fill_value, dtype=dtype.unwrap(), order=order + shape=tuple(shape), fill_value=fill_value, dtype=dtype, order=order ) ) diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index 85dadc2b53..5543fa9086 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -166,12 +166,6 @@ def parse_bool(data: Any) -> bool: raise ValueError(f"Expected bool, got {data} instead.") -def parse_dtype(dtype: Any, zarr_format: ZarrFormat) -> DTypeWrapper[Any, Any]: - from zarr.registry import get_data_type_from_numpy - - return get_data_type_from_numpy(np.dtype(dtype)) - - def _warn_write_empty_chunks_kwarg() -> None: # TODO: link to docs page on array configuration in this message msg = ( diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 8012aac02d..2ba2ac5c45 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -75,7 +75,9 @@ def __init__( """ shape_parsed = parse_shapelike(shape) chunks_parsed = parse_shapelike(chunks) - + # TODO: remove this + if not isinstance(dtype, DTypeWrapper): + raise TypeError compressor_parsed = parse_compressor(compressor) order_parsed = 
parse_indexing_order(order) dimension_separator_parsed = parse_separator(dimension_separator) diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index f70cbb3cf2..8bf20899c3 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -4,7 +4,7 @@ from zarr.abc.metadata import Metadata from zarr.core.buffer.core import default_buffer_prototype - +from zarr.core.metadata.dtype import DTypeWrapper if TYPE_CHECKING: from collections.abc import Callable from typing import Self @@ -12,9 +12,7 @@ from zarr.core.buffer import Buffer, BufferPrototype from zarr.core.chunk_grids import ChunkGrid from zarr.core.common import JSON, ChunkCoords - from zarr.core.metadata.dtype import ( - DTypeWrapper, - ) + import json from collections.abc import Iterable @@ -262,6 +260,10 @@ def __init__( """ Because the class is a frozen dataclass, we set attributes using object.__setattr__ """ + + # TODO: remove this + if not isinstance(data_type, DTypeWrapper): + raise TypeError shape_parsed = parse_shapelike(shape) chunk_grid_parsed = ChunkGrid.from_dict(chunk_grid) chunk_key_encoding_parsed = ChunkKeyEncoding.from_dict(chunk_key_encoding) diff --git a/tests/conftest.py b/tests/conftest.py index fb7b7977a7..a650accc51 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -18,11 +18,12 @@ _parse_chunk_key_encoding, ) from zarr.core.chunk_grids import RegularChunkGrid, _auto_partition -from zarr.core.common import JSON, parse_dtype, parse_shapelike +from zarr.core.common import JSON, parse_shapelike from zarr.core.config import config as zarr_config from zarr.core.metadata.v2 import ArrayV2Metadata from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.core.sync import sync +from zarr.registry import get_data_type_from_numpy from zarr.storage import FsspecStore, LocalStore, MemoryStore, StorePath, ZipStore if TYPE_CHECKING: @@ -252,7 +253,7 @@ def create_array_metadata( """ Create array metadata """ - dtype_parsed = parse_dtype(dtype, 
zarr_format=zarr_format) + dtype_parsed = get_data_type_from_numpy(dtype) shape_parsed = parse_shapelike(shape) chunk_key_encoding_parsed = _parse_chunk_key_encoding( chunk_key_encoding, zarr_format=zarr_format From b7fe98640a548e17f39fdf21e6b0a93186d4cabf Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 3 Mar 2025 15:03:01 +0100 Subject: [PATCH 014/130] remove endianness kwarg to methods, make it an instance variable instead --- src/zarr/core/metadata/dtype.py | 107 +++++++++++++------------------- 1 file changed, 42 insertions(+), 65 deletions(-) diff --git a/src/zarr/core/metadata/dtype.py b/src/zarr/core/metadata/dtype.py index 5d382076b4..f88683e1e7 100644 --- a/src/zarr/core/metadata/dtype.py +++ b/src/zarr/core/metadata/dtype.py @@ -1,6 +1,6 @@ from abc import ABC, abstractmethod from collections.abc import Sequence -from dataclasses import dataclass +from dataclasses import dataclass, replace from typing import Any, ClassVar, Generic, Literal, Self, TypeGuard, TypeVar, cast, get_args import numpy as np @@ -199,11 +199,13 @@ def complex_from_json( TScalar = TypeVar("TScalar", bound=np.generic) +@dataclass(frozen=True, kw_only=True) class DTypeWrapper(Generic[TDType, TScalar], ABC, Metadata): name: ClassVar[str] dtype_cls: ClassVar[type[TDType]] # this class will create a numpy dtype kind: ClassVar[DataTypeFlavor] - default_value: TScalar + default_value: ClassVar[TScalar] + endianness: Endianness = "native" def __init_subclass__(cls) -> None: # Subclasses will bind the first generic type parameter to an attribute of the class @@ -215,18 +217,21 @@ def __init_subclass__(cls) -> None: def to_dict(self) -> dict[str, JSON]: return {"name": self.name} - def cast_value(self: Self, value: object, *, endianness: Endianness | None = None) -> TScalar: - return cast(np.generic, self.unwrap(endianness=endianness).type(value)) + def cast_value(self: Self, value: object) -> TScalar: + return cast(np.generic, self.unwrap().type(value)) @classmethod 
@abstractmethod def wrap(cls: type[Self], dtype: TDType) -> Self: raise NotImplementedError - def unwrap(self: Self, *, endianness: Endianness | None = None) -> TDType: - endian_str = endianness_to_numpy_str(endianness) + def unwrap(self: Self) -> TDType: + endian_str = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(endian_str) + def with_endianness(self: Self, endianness: Endianness) -> Self: + return replace(self, endianness=endianness) + @abstractmethod def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> JSON: """ @@ -235,9 +240,7 @@ def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> JSON: raise NotImplementedError @abstractmethod - def from_json_value( - self: Self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> TScalar: + def from_json_value(self: Self, data: JSON, *, zarr_format: ZarrFormat) -> TScalar: """ Read a JSON-serializable value as a numpy scalar """ @@ -257,11 +260,9 @@ def wrap(cls, dtype: np.dtypes.BoolDType) -> Self: def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> bool: return bool(data) - def from_json_value( - self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> np.bool_: + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bool_: if check_json_bool(data): - return self.unwrap(endianness=endianness).type(data) + return self.unwrap().type(data) raise TypeError(f"Invalid type: {data}. 
Expected a boolean.") @@ -275,11 +276,9 @@ def wrap(cls, dtype: TDType) -> Self: def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: return int(data) - def from_json_value( - self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> TScalar: + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TScalar: if check_json_int(data): - return self.unwrap(endianness=endianness).type(data) + return self.unwrap().type(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") @@ -341,11 +340,9 @@ def wrap(cls, dtype: TDType) -> Self: def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> JSONFloat: return float_to_json(data, zarr_format) - def from_json_value( - self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> TScalar: + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TScalar: if check_json_float_v2(data): - return self.unwrap(endianness=endianness).type(float_from_json(data, zarr_format)) + return self.unwrap().type(float_from_json(data, zarr_format)) raise TypeError(f"Invalid type: {data}. Expected a float.") @@ -382,13 +379,9 @@ def to_json_value( ) -> tuple[JSONFloat, JSONFloat]: return complex_to_json(data, zarr_format) - def from_json_value( - self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> np.complex64: + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.complex64: if check_json_complex_float_v3(data): - return complex_from_json( - data, dtype=self.unwrap(endianness=endianness), zarr_format=zarr_format - ) + return complex_from_json(data, dtype=self.unwrap(), zarr_format=zarr_format) raise TypeError(f"Invalid type: {data}. 
Expected a complex float.") @@ -407,13 +400,9 @@ def to_json_value( ) -> tuple[JSONFloat, JSONFloat]: return complex_to_json(data, zarr_format) - def from_json_value( - self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> np.complex128: + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.complex128: if check_json_complex_float_v3(data): - return complex_from_json( - data, dtype=self.unwrap(endianness=endianness), zarr_format=zarr_format - ) + return complex_from_json(data, dtype=self.unwrap(), zarr_format=zarr_format) raise TypeError(f"Invalid type: {data}. Expected a complex float.") @@ -426,8 +415,8 @@ class FlexibleWrapperBase(DTypeWrapper[TDType, TScalar]): def wrap(cls, dtype: TDType) -> Self: return cls(length=dtype.itemsize // (cls.item_size_bits // 8)) - def unwrap(self, endianness: Endianness | None = None) -> TDType: - endianness_code = endianness_to_numpy_str(endianness) + def unwrap(self) -> TDType: + endianness_code = endianness_to_numpy_str(self.endianness) return self.dtype_cls(self.length).newbyteorder(endianness_code) @@ -435,22 +424,18 @@ def unwrap(self, endianness: Endianness | None = None) -> TDType: class StaticByteString(FlexibleWrapperBase[np.dtypes.BytesDType, np.bytes_]): name = "numpy/static_byte_string" kind = "string" - default_value = b"" + default_value = np.bytes_(0) item_size_bits = 8 def to_dict(self) -> dict[str, JSON]: return {"name": self.name, "configuration": {"length": self.length}} - def to_json_value( - self, data: np.generic, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> str: + def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: return data.tobytes().decode("ascii") - def from_json_value( - self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> np.bytes_: + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: if check_json_bool(data): - return 
self.unwrap(endianness=endianness).type(data.encode("ascii")) + return self.unwrap().type(data.encode("ascii")) raise TypeError(f"Invalid type: {data}. Expected a string.") @@ -464,20 +449,18 @@ class StaticRawBytes(FlexibleWrapperBase[np.dtypes.VoidDType, np.void]): def to_dict(self) -> dict[str, JSON]: return {"name": f"r{self.length * self.item_size_bits}"} - def unwrap(self, endianness: Endianness | None = None) -> np.dtypes.VoidDType: + def unwrap(self) -> np.dtypes.VoidDType: # this needs to be overridden because numpy does not allow creating a void type # by invoking np.dtypes.VoidDType directly - endianness_code = endianness_to_numpy_str(endianness) + endianness_code = endianness_to_numpy_str(self.endianness) return np.dtype(f"{endianness_code}V{self.length}") def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> tuple[int, ...]: return tuple(*data.tobytes()) - def from_json_value( - self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> np.void: + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: # todo: check that this is well-formed - return self.unwrap(endianness=endianness).type(bytes(data)) + return self.unwrap().type(bytes(data)) @dataclass(frozen=True, kw_only=True) @@ -493,12 +476,10 @@ def to_dict(self) -> dict[str, JSON]: def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: return str(data) - def from_json_value( - self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> np.str_: + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_: if not check_json_str(data): raise TypeError(f"Invalid type: {data}. 
Expected a string.") - return self.unwrap(endianness=endianness).type(data) + return self.unwrap().type(data) if _NUMPY_SUPPORTS_VLEN_STRING: @@ -516,17 +497,15 @@ def wrap(cls, dtype: np.dtypes.StringDType) -> Self: def to_dict(self) -> dict[str, JSON]: return {"name": self.name} - def unwrap(self, endianness: Endianness | None = None) -> np.dtypes.StringDType: - endianness_code = endianness_to_numpy_str(endianness) + def unwrap(self) -> np.dtypes.StringDType: + endianness_code = endianness_to_numpy_str(self.endianness) return np.dtype(endianness_code + self.numpy_character_code) def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: return str(data) - def from_json_value( - self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> str: - return self.unwrap(endianness=endianness).type(data) + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: + return self.unwrap().type(data) else: @@ -543,16 +522,14 @@ def to_dict(self) -> dict[str, JSON]: def wrap(cls, dtype: np.dtypes.ObjectDType) -> Self: return cls() - def unwrap(self, endianness: Endianness | None = None) -> np.dtype[np.dtypes.ObjectDType]: - return super().unwrap(endianness=endianness) + def unwrap(self) -> np.dtype[np.dtypes.ObjectDType]: + return super().unwrap() def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: return str(data) - def from_json_value( - self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> str: - return self.unwrap(endianness=endianness).type(data) + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: + return self.unwrap().type(data) def resolve_dtype(dtype: npt.DTypeLike | DTypeWrapper | dict[str, JSON]) -> DTypeWrapper: From d9b44b4cb5973714b23cf2ae25727235f5de8e0b Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 4 Mar 2025 18:10:20 +0100 Subject: [PATCH 015/130] make wrapping safe by default --- 
src/zarr/api/asynchronous.py | 20 ++- src/zarr/codecs/blosc.py | 8 +- src/zarr/codecs/bytes.py | 8 +- src/zarr/codecs/sharding.py | 10 +- src/zarr/core/array.py | 181 ++++++++++--------- src/zarr/core/array_spec.py | 5 +- src/zarr/core/codec_pipeline.py | 5 +- src/zarr/core/metadata/dtype.py | 215 ++++++++++++++++++++--- src/zarr/core/metadata/v2.py | 50 ++---- src/zarr/core/metadata/v3.py | 37 ++-- src/zarr/registry.py | 70 +------- tests/conftest.py | 12 +- tests/test_array.py | 83 ++------- tests/test_codecs/test_vlen.py | 44 +---- tests/test_group.py | 2 +- tests/test_metadata/test_consolidated.py | 3 +- tests/test_metadata/test_v2.py | 10 +- tests/test_metadata/test_v3.py | 9 +- tests/test_v2.py | 77 ++------ 19 files changed, 405 insertions(+), 444 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index f1131003fc..d882b1d7cc 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -9,7 +9,13 @@ import numpy.typing as npt from typing_extensions import deprecated -from zarr.core.array import Array, AsyncArray, create_array, get_array_metadata +from zarr.core.array import ( + Array, + AsyncArray, + _get_default_chunk_encoding_v2, + create_array, + get_array_metadata, +) from zarr.core.array_spec import ArrayConfig, ArrayConfigLike, ArrayConfigParams from zarr.core.buffer import NDArrayLike from zarr.core.common import ( @@ -29,9 +35,8 @@ create_hierarchy, ) from zarr.core.metadata import ArrayMetadataDict, ArrayV2Metadata, ArrayV3Metadata -from zarr.core.metadata.v2 import _default_compressor, _default_filters +from zarr.core.metadata.dtype import get_data_type_from_numpy from zarr.errors import NodeTypeValidationError -from zarr.registry import get_data_type_from_numpy from zarr.storage._common import make_store_path if TYPE_CHECKING: @@ -983,10 +988,11 @@ async def create( if zarr_format == 2: if chunks is None: chunks = shape - if not filters: - filters = _default_filters(dtype_wrapped) - if not 
compressor: - compressor = _default_compressor(dtype_wrapped) + default_filters, default_compressor = _get_default_chunk_encoding_v2(dtype_wrapped) + if filters is None: + filters = default_filters + if compressor is None: + compressor = default_compressor elif zarr_format == 3 and chunk_shape is None: # type: ignore[redundant-expr] if chunks is not None: chunk_shape = chunks diff --git a/src/zarr/codecs/blosc.py b/src/zarr/codecs/blosc.py index 54a23c9c57..0db9e830f1 100644 --- a/src/zarr/codecs/blosc.py +++ b/src/zarr/codecs/blosc.py @@ -139,11 +139,15 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: dtype = array_spec.dtype new_codec = self if new_codec.typesize is None: - new_codec = replace(new_codec, typesize=dtype.itemsize) + new_codec = replace(new_codec, typesize=dtype.unwrap().itemsize) if new_codec.shuffle is None: new_codec = replace( new_codec, - shuffle=(BloscShuffle.bitshuffle if dtype.itemsize == 1 else BloscShuffle.shuffle), + shuffle=( + BloscShuffle.bitshuffle + if dtype.unwrap().itemsize == 1 + else BloscShuffle.shuffle + ), ) return new_codec diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py index 4875d8e8d8..1da497ea72 100644 --- a/src/zarr/codecs/bytes.py +++ b/src/zarr/codecs/bytes.py @@ -56,7 +56,7 @@ def to_dict(self) -> dict[str, JSON]: return {"name": "bytes", "configuration": {"endian": self.endian.value}} def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: - if array_spec.dtype.unwrap().itemsize == 0: + if array_spec.dtype.unwrap().itemsize == 1: if self.endian is not None: return replace(self, endian=None) elif self.endian is None: @@ -71,14 +71,14 @@ async def _decode_single( chunk_spec: ArraySpec, ) -> NDBuffer: assert isinstance(chunk_bytes, Buffer) - if chunk_spec.dtype.itemsize > 0: + if chunk_spec.dtype.unwrap().itemsize > 0: if self.endian == Endian.little: prefix = "<" else: prefix = ">" - dtype = np.dtype(f"{prefix}{chunk_spec.dtype.str[1:]}") + dtype = 
np.dtype(f"{prefix}{chunk_spec.dtype.unwrap().str[1:]}") else: - dtype = np.dtype(f"|{chunk_spec.dtype.str[1:]}") + dtype = np.dtype(f"|{chunk_spec.dtype.unwrap().str[1:]}") as_array_like = chunk_bytes.as_array_like() if isinstance(as_array_like, NDArrayLike): diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 09ceb538d0..7163a5fd7f 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -50,6 +50,7 @@ get_indexer, morton_order_iter, ) +from zarr.core.metadata.dtype import DTypeWrapper from zarr.core.metadata.v3 import parse_codecs from zarr.registry import get_ndbuffer_class, get_pipeline_class, register_codec @@ -403,7 +404,9 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: return replace(self, codecs=evolved_codecs) return self - def validate(self, *, shape: ChunkCoords, dtype: np.dtype[Any], chunk_grid: ChunkGrid) -> None: + def validate( + self, *, shape: ChunkCoords, dtype: DTypeWrapper[Any, Any], chunk_grid: ChunkGrid + ) -> None: if len(self.chunk_shape) != len(shape): raise ValueError( "The shard's `chunk_shape` and array's `shape` need to have the same number of dimensions." 
@@ -484,7 +487,10 @@ async def _decode_partial_single( # setup output array out = shard_spec.prototype.nd_buffer.create( - shape=indexer.shape, dtype=shard_spec.dtype, order=shard_spec.order, fill_value=0 + shape=indexer.shape, + dtype=shard_spec.dtype.unwrap(), + order=shard_spec.order, + fill_value=0, ) indexed_chunks = list(indexer) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 9abb330d59..f8c6fced9f 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -5,6 +5,7 @@ from asyncio import gather from collections.abc import Iterable from dataclasses import dataclass, field, replace +from functools import cached_property from itertools import starmap from logging import getLogger from typing import ( @@ -29,8 +30,11 @@ from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec from zarr.abc.store import Store, set_or_delete from zarr.codecs._v2 import V2Codec +from zarr.codecs.bytes import BytesCodec +from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec +from zarr.codecs.zstd import ZstdCodec from zarr.core._info import ArrayInfo -from zarr.core.array_spec import ArrayConfig, ArrayConfigLike, parse_array_config +from zarr.core.array_spec import ArrayConfig, ArrayConfigLike, ArraySpec, parse_array_config from zarr.core.attributes import Attributes from zarr.core.buffer import ( BufferPrototype, @@ -97,10 +101,13 @@ ArrayV3MetadataDict, T_ArrayMetadata, ) -from zarr.core.metadata.dtype import DTypeWrapper, VariableLengthString +from zarr.core.metadata.dtype import ( + DTypeWrapper, + StaticByteString, + VariableLengthString, + get_data_type_from_numpy, +) from zarr.core.metadata.v2 import ( - _default_compressor, - _default_filters, parse_compressor, parse_filters, ) @@ -111,7 +118,6 @@ _parse_array_array_codec, _parse_array_bytes_codec, _parse_bytes_bytes_codec, - get_data_type_from_numpy, get_pipeline_class, ) from zarr.storage._common import StorePath, ensure_no_existing_node, make_store_path @@ 
-548,7 +554,7 @@ async def _create( *, # v2 and v3 shape: ShapeLike, - dtype: npt.DTypeLike[Any], + dtype: npt.DTypeLike[Any] | DTypeWrapper[Any, Any], zarr_format: ZarrFormat = 3, fill_value: Any | None = None, attributes: dict[str, JSON] | None = None, @@ -682,14 +688,19 @@ def _create_metadata_v3( """ shape = parse_shapelike(shape) - codecs = list(codecs) if codecs is not None else _get_default_codecs(dtype) + if codecs is None: + filters, serializer, compressors = _get_default_chunk_encoding_v3(dtype) + codecs_parsed = (*filters, serializer, *compressors) + else: + codecs_parsed = tuple(codecs) + chunk_key_encoding_parsed: ChunkKeyEncodingLike if chunk_key_encoding is None: chunk_key_encoding_parsed = {"name": "default", "separator": "/"} else: chunk_key_encoding_parsed = chunk_key_encoding - if dtype.kind in "UTS": + if dtype.unwrap().kind in ("U", "T", "S"): warn( f"The dtype `{dtype}` is currently not part in the Zarr format 3 specification. It " "may not be supported by other zarr implementations and may change in the future.", @@ -710,7 +721,7 @@ def _create_metadata_v3( chunk_grid=chunk_grid_parsed, chunk_key_encoding=chunk_key_encoding_parsed, fill_value=fill_value_parsed, - codecs=codecs, + codecs=codecs_parsed, dimension_names=tuple(dimension_names) if dimension_names else None, attributes=attributes or {}, ) @@ -721,7 +732,7 @@ async def _create_v3( store_path: StorePath, *, shape: ShapeLike, - dtype: np.dtype[Any], + dtype: DTypeWrapper[Any, Any], chunk_shape: ChunkCoords, config: ArrayConfig, fill_value: Any | None = None, @@ -781,14 +792,6 @@ def _create_metadata_v2( if dimension_separator is None: dimension_separator = "." 
- # inject VLenUTF8 for str dtype if not already present - if isinstance(dtype, VariableLengthString): - filters = filters or [] - from numcodecs.vlen import VLenUTF8 - - if not any(isinstance(x, VLenUTF8) or x["id"] == "vlen-utf8" for x in filters): - filters = list(filters) + [VLenUTF8()] - return ArrayV2Metadata( shape=shape, dtype=dtype, @@ -807,7 +810,7 @@ async def _create_v2( store_path: StorePath, *, shape: ChunkCoords, - dtype: np.dtype[Any], + dtype: DTypeWrapper[Any, Any], chunks: ChunkCoords, order: MemoryOrder, config: ArrayConfig, @@ -949,6 +952,13 @@ def chunks(self) -> ChunkCoords: """ return self.metadata.chunks + @cached_property + def chunk_grid(self) -> RegularChunkGrid: + if self.metadata.zarr_format == 2: + return RegularChunkGrid(chunk_shape=self.chunks) + else: + return self.metadata.chunk_grid + @property def shards(self) -> ChunkCoords | None: """Returns the shard shape of the Array. @@ -1033,7 +1043,7 @@ def compressors(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[BytesBytesCodec ) @property - def dtype(self) -> DTypeWrapper[Any, Any]: + def dtype(self) -> np.dtype[Any]: """Returns the data type of the array. 
Returns @@ -1041,7 +1051,10 @@ def dtype(self) -> DTypeWrapper[Any, Any]: np.dtype Data type of the array """ - return self.metadata.dtype + if self.metadata.zarr_format == 2: + return self.metadata.dtype.unwrap() + else: + return self.metadata.data_type.unwrap() @property def order(self) -> MemoryOrder: @@ -1259,6 +1272,20 @@ def nbytes(self) -> int: """ return self.size * self.dtype.itemsize + def get_chunk_spec( + self, _chunk_coords: ChunkCoords, array_config: ArrayConfig, prototype: BufferPrototype + ) -> ArraySpec: + assert isinstance(self.chunk_grid, RegularChunkGrid), ( + "Currently, only regular chunk grid is supported" + ) + return ArraySpec( + shape=self.chunk_grid.chunk_shape, + dtype=self.dtype, + fill_value=self.metadata.fill_value, + config=array_config, + prototype=prototype, + ) + async def _get_selection( self, indexer: Indexer, @@ -1298,7 +1325,7 @@ async def _get_selection( [ ( self.store_path / self.metadata.encode_chunk_key(chunk_coords), - self.metadata.get_chunk_spec(chunk_coords, _config, prototype=prototype), + self.get_chunk_spec(chunk_coords, _config, prototype=prototype), chunk_selection, out_selection, is_complete_chunk, @@ -1351,7 +1378,7 @@ async def getitem( indexer = BasicIndexer( selection, shape=self.metadata.shape, - chunk_grid=self.metadata.chunk_grid, + chunk_grid=self.chunk_grid, ) return await self._get_selection(indexer, prototype=prototype) @@ -1397,19 +1424,19 @@ async def _set_selection( # TODO: need to handle array types that don't support __array_function__ # like PyTorch and JAX array_like_ = cast(np._typing._SupportsArrayFunc, array_like) - value = np.asanyarray(value, dtype=self.metadata.dtype, like=array_like_) + value = np.asanyarray(value, dtype=self.dtype, like=array_like_) else: if not hasattr(value, "shape"): - value = np.asarray(value, self.metadata.dtype) + value = np.asarray(value, self.dtype) # assert ( # value.shape == indexer.shape # ), f"shape of value doesn't match indexer shape. 
Expected {indexer.shape}, got {value.shape}" - if not hasattr(value, "dtype") or value.dtype.name != self.metadata.dtype.name: + if not hasattr(value, "dtype") or value.dtype.name != self.dtype.name: if hasattr(value, "astype"): # Handle things that are already NDArrayLike more efficiently - value = value.astype(dtype=self.metadata.dtype, order="A") + value = value.astype(dtype=self.dtype, order="A") else: - value = np.array(value, dtype=self.metadata.dtype, order="A") + value = np.array(value, dtype=self.dtype, order="A") value = cast(NDArrayLike, value) # We accept any ndarray like object from the user and convert it # to a NDBuffer (or subclass). From this point onwards, we only pass @@ -1426,7 +1453,7 @@ async def _set_selection( [ ( self.store_path / self.metadata.encode_chunk_key(chunk_coords), - self.metadata.get_chunk_spec(chunk_coords, _config, prototype), + self.get_chunk_spec(chunk_coords, _config, prototype), chunk_selection, out_selection, is_complete_chunk, @@ -1481,7 +1508,7 @@ async def setitem( indexer = BasicIndexer( selection, shape=self.metadata.shape, - chunk_grid=self.metadata.chunk_grid, + chunk_grid=self.chunk_grid, ) return await self._set_selection(indexer, value, prototype=prototype) @@ -1518,8 +1545,8 @@ async def resize(self, new_shape: ShapeLike, delete_outside_chunks: bool = True) if delete_outside_chunks: # Remove all chunks outside of the new shape - old_chunk_coords = set(self.metadata.chunk_grid.all_chunk_coords(self.metadata.shape)) - new_chunk_coords = set(self.metadata.chunk_grid.all_chunk_coords(new_shape)) + old_chunk_coords = set(self.chunk_grid.all_chunk_coords(self.metadata.shape)) + new_chunk_coords = set(self.chunk_grid.all_chunk_coords(new_shape)) async def _delete_key(key: str) -> None: await (self.store_path / key).delete() @@ -1692,15 +1719,9 @@ async def info_complete(self) -> Any: def _info( self, count_chunks_initialized: int | None = None, count_bytes_stored: int | None = None ) -> Any: - _data_type: 
np.dtype[Any] | DTypeWrapper - if isinstance(self.metadata, ArrayV2Metadata): - _data_type = self.metadata.dtype - else: - _data_type = self.metadata.data_type - return ArrayInfo( _zarr_format=self.metadata.zarr_format, - _data_type=_data_type, + _data_type=self.dtype, _shape=self.shape, _order=self.order, _shard_shape=self.shards, @@ -2044,7 +2065,7 @@ def dtype(self) -> np.dtype[Any]: np.dtype The NumPy data type. """ - return self._async_array.dtype.unwrap() + return self._async_array.dtype @property def attrs(self) -> Attributes: @@ -2654,7 +2675,7 @@ def get_basic_selection( prototype = default_buffer_prototype() return sync( self._async_array._get_selection( - BasicIndexer(selection, self.shape, self.metadata.chunk_grid), + BasicIndexer(selection, self.shape, self._async_array.chunk_grid), out=out, fields=fields, prototype=prototype, @@ -2754,7 +2775,7 @@ def set_basic_selection( """ if prototype is None: prototype = default_buffer_prototype() - indexer = BasicIndexer(selection, self.shape, self.metadata.chunk_grid) + indexer = BasicIndexer(selection, self.shape, self._async_array.chunk_grid) sync(self._async_array._set_selection(indexer, value, fields=fields, prototype=prototype)) @_deprecate_positional_args @@ -2875,7 +2896,7 @@ def get_orthogonal_selection( """ if prototype is None: prototype = default_buffer_prototype() - indexer = OrthogonalIndexer(selection, self.shape, self.metadata.chunk_grid) + indexer = OrthogonalIndexer(selection, self.shape, self._async_array.chunk_grid) return sync( self._async_array._get_selection( indexer=indexer, out=out, fields=fields, prototype=prototype @@ -2988,7 +3009,7 @@ def set_orthogonal_selection( """ if prototype is None: prototype = default_buffer_prototype() - indexer = OrthogonalIndexer(selection, self.shape, self.metadata.chunk_grid) + indexer = OrthogonalIndexer(selection, self.shape, self._async_array.chunk_grid) return sync( self._async_array._set_selection(indexer, value, fields=fields, prototype=prototype) 
) @@ -3069,7 +3090,7 @@ def get_mask_selection( if prototype is None: prototype = default_buffer_prototype() - indexer = MaskIndexer(mask, self.shape, self.metadata.chunk_grid) + indexer = MaskIndexer(mask, self.shape, self._async_array.chunk_grid) return sync( self._async_array._get_selection( indexer=indexer, out=out, fields=fields, prototype=prototype @@ -3152,7 +3173,7 @@ def set_mask_selection( """ if prototype is None: prototype = default_buffer_prototype() - indexer = MaskIndexer(mask, self.shape, self.metadata.chunk_grid) + indexer = MaskIndexer(mask, self.shape, self._async_array.chunk_grid) sync(self._async_array._set_selection(indexer, value, fields=fields, prototype=prototype)) @_deprecate_positional_args @@ -3233,7 +3254,7 @@ def get_coordinate_selection( """ if prototype is None: prototype = default_buffer_prototype() - indexer = CoordinateIndexer(selection, self.shape, self.metadata.chunk_grid) + indexer = CoordinateIndexer(selection, self.shape, self._async_array.chunk_grid) out_array = sync( self._async_array._get_selection( indexer=indexer, out=out, fields=fields, prototype=prototype @@ -3319,7 +3340,7 @@ def set_coordinate_selection( if prototype is None: prototype = default_buffer_prototype() # setup indexer - indexer = CoordinateIndexer(selection, self.shape, self.metadata.chunk_grid) + indexer = CoordinateIndexer(selection, self.shape, self._async_array.chunk_grid) # handle value - need ndarray-like flatten value if not is_scalar(value, self.dtype): @@ -3435,7 +3456,7 @@ def get_block_selection( """ if prototype is None: prototype = default_buffer_prototype() - indexer = BlockIndexer(selection, self.shape, self.metadata.chunk_grid) + indexer = BlockIndexer(selection, self.shape, self._async_array.chunk_grid) return sync( self._async_array._get_selection( indexer=indexer, out=out, fields=fields, prototype=prototype @@ -3529,7 +3550,7 @@ def set_block_selection( """ if prototype is None: prototype = default_buffer_prototype() - indexer = 
BlockIndexer(selection, self.shape, self.metadata.chunk_grid) + indexer = BlockIndexer(selection, self.shape, self._async_array.chunk_grid) sync(self._async_array._set_selection(indexer, value, fields=fields, prototype=prototype)) @property @@ -3771,13 +3792,6 @@ def _build_parents( return parents -def _get_default_codecs( - np_dtype: np.dtype[Any], -) -> tuple[Codec, ...]: - filters, serializer, compressors = _get_default_chunk_encoding_v3(np_dtype) - return filters + (serializer,) + compressors - - FiltersLike: TypeAlias = ( Iterable[dict[str, JSON] | ArrayArrayCodec | numcodecs.abc.Codec] | ArrayArrayCodec @@ -3917,7 +3931,10 @@ async def init_array( from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation - dtype_wrapped = get_data_type_from_numpy(dtype) + if not isinstance(dtype, DTypeWrapper): + dtype_wrapped = get_data_type_from_numpy(dtype) + else: + dtype_wrapped = dtype shape_parsed = parse_shapelike(shape) chunk_key_encoding_parsed = _parse_chunk_key_encoding( chunk_key_encoding, zarr_format=zarr_format @@ -4178,7 +4195,7 @@ async def create_array( if write_data is True and data_parsed is not None: await result._set_selection( - BasicIndexer(..., shape=result.shape, chunk_grid=result.metadata.chunk_grid), + BasicIndexer(..., shape=result.shape, chunk_grid=result.chunk_grid), data_parsed, prototype=default_buffer_prototype(), ) @@ -4215,15 +4232,20 @@ def _get_default_chunk_encoding_v3( """ Get the default ArrayArrayCodecs, ArrayBytesCodec, and BytesBytesCodec for a given dtype. 
""" - - default_filters = zarr_config.get("array.v3_default_filters").get(dtype.kind) - default_serializer = zarr_config.get("array.v3_default_serializer").get(dtype.kind) - default_compressors = zarr_config.get("array.v3_default_compressors").get(dtype.kind) - - filters = tuple(_parse_array_array_codec(codec_dict) for codec_dict in default_filters) - serializer = _parse_array_bytes_codec(default_serializer) - compressors = tuple(_parse_bytes_bytes_codec(codec_dict) for codec_dict in default_compressors) - + filters = () + compressors = (ZstdCodec(level=0, checksum=False),) + # TODO: find a registry-style solution for this that isn't bloated + # We need to associate specific dtypes with specific encoding schemes + + if isinstance(dtype, VariableLengthString): + serializer = VLenUTF8Codec() + elif isinstance(dtype, StaticByteString): + serializer = VLenBytesCodec() + else: + if dtype.unwrap().itemsize == 1: + serializer = BytesCodec(endian=None) + else: + serializer = BytesCodec() return filters, serializer, compressors @@ -4233,17 +4255,18 @@ def _get_default_chunk_encoding_v2( """ Get the default chunk encoding for Zarr format 2 arrays, given a dtype """ + from numcodecs import VLenBytes as numcodecs_VLenBytes + from numcodecs import VLenUTF8 as numcodecs_VLenUTF8 + from numcodecs import Zstd as numcodecs_zstd + + if isinstance(dtype, VariableLengthString): + filters = (numcodecs_VLenUTF8(),) + elif isinstance(dtype, StaticByteString): + filters = (numcodecs_VLenBytes(),) + else: + filters = None - compressor_dict = _default_compressor(dtype) - filter_dicts = _default_filters(dtype) - - compressor = None - if compressor_dict is not None: - compressor = numcodecs.get_codec(compressor_dict) - - filters = None - if filter_dicts is not None: - filters = tuple(numcodecs.get_codec(f) for f in filter_dicts) + compressor = numcodecs_zstd(level=0, checksum=False) return filters, compressor @@ -4296,7 +4319,7 @@ def _parse_chunk_encoding_v3( compressors: CompressorsLike, 
filters: FiltersLike, serializer: SerializerLike, - dtype: np.dtype[Any], + dtype: DTypeWrapper[Any, Any], ) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]: """ Generate chunk encoding classes for v3 arrays with optional defaults. diff --git a/src/zarr/core/array_spec.py b/src/zarr/core/array_spec.py index cf92f11050..f5a060cf95 100644 --- a/src/zarr/core/array_spec.py +++ b/src/zarr/core/array_spec.py @@ -11,8 +11,7 @@ parse_shapelike, ) from zarr.core.config import config as zarr_config -from zarr.core.metadata.dtype import DTypeWrapper -from zarr.registry import get_data_type_from_numpy +from zarr.core.metadata.dtype import DTypeWrapper, get_data_type_from_numpy if TYPE_CHECKING: from typing import NotRequired @@ -100,7 +99,7 @@ class ArraySpec: def __init__( self, shape: ChunkCoords, - dtype: npt.DtypeLike | DTypeWrapper[Any, Any], + dtype: npt.DTypeLike | DTypeWrapper[Any, Any], fill_value: Any, config: ArrayConfig, prototype: BufferPrototype, diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index 628a7e0487..315dbb77a9 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -17,7 +17,6 @@ from zarr.core.common import ChunkCoords, concurrent_map from zarr.core.config import config from zarr.core.indexing import SelectorTuple, is_scalar -from zarr.core.metadata.v2 import _default_fill_value from zarr.registry import register_pipeline if TYPE_CHECKING: @@ -64,7 +63,7 @@ def fill_value_or_default(chunk_spec: ArraySpec) -> Any: # validated when decoding the metadata, but we support reading # Zarr V2 data and need to support the case where fill_value # is None. 
- return _default_fill_value(dtype=chunk_spec.dtype) + return chunk_spec.dtype.default_value else: return fill_value @@ -317,7 +316,7 @@ def _merge_chunk_array( if existing_chunk_array is None: chunk_array = chunk_spec.prototype.nd_buffer.create( shape=chunk_spec.shape, - dtype=chunk_spec.dtype, + dtype=chunk_spec.dtype.unwrap(), order=chunk_spec.order, fill_value=fill_value_or_default(chunk_spec), ) diff --git a/src/zarr/core/metadata/dtype.py b/src/zarr/core/metadata/dtype.py index f88683e1e7..a573794730 100644 --- a/src/zarr/core/metadata/dtype.py +++ b/src/zarr/core/metadata/dtype.py @@ -1,16 +1,32 @@ +from __future__ import annotations + +import base64 from abc import ABC, abstractmethod from collections.abc import Sequence -from dataclasses import dataclass, replace -from typing import Any, ClassVar, Generic, Literal, Self, TypeGuard, TypeVar, cast, get_args +from dataclasses import dataclass, field, replace +from importlib.metadata import EntryPoint +from typing import ( + TYPE_CHECKING, + Any, + ClassVar, + Generic, + Literal, + Self, + TypeGuard, + TypeVar, + cast, + get_args, +) import numpy as np import numpy.typing as npt from typing_extensions import get_original_bases from zarr.abc.metadata import Metadata -from zarr.core.common import JSON, ZarrFormat from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING -from zarr.registry import register_data_type + +if TYPE_CHECKING: + from zarr.core.common import JSON, ZarrFormat Endianness = Literal["little", "big", "native"] DataTypeFlavor = Literal["boolean", "numeric", "string", "bytes"] @@ -132,16 +148,16 @@ def float_to_json(data: float | np.floating[Any], zarr_format: ZarrFormat) -> JS raise ValueError(f"Invalid zarr format: {zarr_format}. 
Expected 2 or 3.") -def complex_to_json_v2(data: complex | np.complexfloating) -> JSONFloat: - return float_to_json_v2(data) +def complex_to_json_v2(data: complex | np.complexfloating[Any]) -> tuple[JSONFloat, JSONFloat]: + return float_to_json_v2(data.real), float_to_json_v2(data.imag) -def complex_to_json_v3(data: complex | np.complexfloating) -> tuple[JSONFloat, JSONFloat]: +def complex_to_json_v3(data: complex | np.complexfloating[Any]) -> tuple[JSONFloat, JSONFloat]: return float_to_json_v3(data.real), float_to_json_v3(data.imag) def complex_to_json( - data: complex | np.complexfloating, zarr_format: ZarrFormat + data: complex | np.complexfloating[Any], zarr_format: ZarrFormat ) -> tuple[JSONFloat, JSONFloat] | JSONFloat: if zarr_format == 2: return complex_to_json_v2(data) @@ -150,6 +166,18 @@ def complex_to_json( raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") +def structured_scalar_to_json(data: bytes, zarr_format: ZarrFormat) -> str: + if zarr_format == 2: + return base64.b64encode(data).decode("ascii") + raise NotImplementedError(f"Invalid zarr format: {zarr_format}. Expected 2.") + + +def structured_scalar_from_json(data: JSON, zarr_format: ZarrFormat) -> bytes: + if zarr_format == 2: + return base64.b64decode(data.encode("ascii")) + raise NotImplementedError(f"Invalid zarr format: {zarr_format}. 
Expected 2.") + + def float_from_json_v2(data: JSONFloat) -> float: match data: case "NaN": @@ -196,7 +224,7 @@ def complex_from_json( TDType = TypeVar("TDType", bound=np.dtype[Any]) -TScalar = TypeVar("TScalar", bound=np.generic) +TScalar = TypeVar("TScalar", bound=np.generic | str) @dataclass(frozen=True, kw_only=True) @@ -205,7 +233,7 @@ class DTypeWrapper(Generic[TDType, TScalar], ABC, Metadata): dtype_cls: ClassVar[type[TDType]] # this class will create a numpy dtype kind: ClassVar[DataTypeFlavor] default_value: ClassVar[TScalar] - endianness: Endianness = "native" + endianness: Endianness | None = "native" def __init_subclass__(cls) -> None: # Subclasses will bind the first generic type parameter to an attribute of the class @@ -221,8 +249,21 @@ def cast_value(self: Self, value: object) -> TScalar: return cast(np.generic, self.unwrap().type(value)) @classmethod - @abstractmethod + def check_dtype(cls: type[Self], dtype: TDType) -> TypeGuard[TDType]: + """ + Check that a dtype matches the dtype_cls class attribute + """ + return type(dtype) is cls.dtype_cls + + @classmethod def wrap(cls: type[Self], dtype: TDType) -> Self: + if cls.check_dtype(dtype): + return cls._wrap_unsafe(dtype) + raise TypeError(f"Invalid dtype: {dtype}. 
Expected an instance of {cls.dtype_cls}.") + + @classmethod + @abstractmethod + def _wrap_unsafe(cls: type[Self], dtype: TDType) -> Self: raise NotImplementedError def unwrap(self: Self) -> TDType: @@ -254,7 +295,7 @@ class Bool(DTypeWrapper[np.dtypes.BoolDType, np.bool_]): default_value = np.False_ @classmethod - def wrap(cls, dtype: np.dtypes.BoolDType) -> Self: + def _wrap_unsafe(cls, dtype: np.dtypes.BoolDType) -> Self: return cls() def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> bool: @@ -270,7 +311,7 @@ class IntWrapperBase(DTypeWrapper[TDType, TScalar]): kind = "numeric" @classmethod - def wrap(cls, dtype: TDType) -> Self: + def _wrap_unsafe(cls, dtype: TDType) -> Self: return cls() def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: @@ -334,7 +375,7 @@ class FloatWrapperBase(DTypeWrapper[TDType, TScalar]): kind = "numeric" @classmethod - def wrap(cls, dtype: TDType) -> Self: + def _wrap_unsafe(cls, dtype: TDType) -> Self: return cls() def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> JSONFloat: @@ -371,7 +412,7 @@ class Complex64(DTypeWrapper[np.dtypes.Complex64DType, np.complex64]): default_value = np.complex64(0) @classmethod - def wrap(cls, dtype: np.dtypes.Complex64DType) -> Self: + def _wrap_unsafe(cls, dtype: np.dtypes.Complex64DType) -> Self: return cls() def to_json_value( @@ -392,7 +433,7 @@ class Complex128(DTypeWrapper[np.dtypes.Complex128DType, np.complex128]): default_value = np.complex128(0) @classmethod - def wrap(cls, dtype: np.dtypes.Complex128DType) -> Self: + def _wrap_unsafe(cls, dtype: np.dtypes.Complex128DType) -> Self: return cls() def to_json_value( @@ -412,7 +453,7 @@ class FlexibleWrapperBase(DTypeWrapper[TDType, TScalar]): length: int @classmethod - def wrap(cls, dtype: TDType) -> Self: + def _wrap_unsafe(cls, dtype: TDType) -> Self: return cls(length=dtype.itemsize // (cls.item_size_bits // 8)) def unwrap(self) -> TDType: @@ -431,10 +472,10 @@ def to_dict(self) -> 
dict[str, JSON]: return {"name": self.name, "configuration": {"length": self.length}} def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: - return data.tobytes().decode("ascii") + return base64.standard_b64encode(data).decode("ascii") def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: - if check_json_bool(data): + if check_json_str(data): return self.unwrap().type(data.encode("ascii")) raise TypeError(f"Invalid type: {data}. Expected a string.") @@ -456,7 +497,7 @@ def unwrap(self) -> np.dtypes.VoidDType: return np.dtype(f"{endianness_code}V{self.length}") def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> tuple[int, ...]: - return tuple(*data.tobytes()) + return base64.standard_b64encode(data).decode("ascii") def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: # todo: check that this is well-formed @@ -491,20 +532,22 @@ class VariableLengthString(DTypeWrapper[np.dtypes.StringDType, str]): default_value = "" @classmethod - def wrap(cls, dtype: np.dtypes.StringDType) -> Self: + def _wrap_unsafe(cls, dtype: np.dtypes.StringDType) -> Self: return cls() def to_dict(self) -> dict[str, JSON]: return {"name": self.name} def unwrap(self) -> np.dtypes.StringDType: - endianness_code = endianness_to_numpy_str(self.endianness) - return np.dtype(endianness_code + self.numpy_character_code) + # StringDType does not have endianness, so we ignore it here + return self.dtype_cls() def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: return str(data) def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: + if not check_json_str(data): + raise TypeError(f"Invalid type: {data}. 
Expected a string.") return self.unwrap().type(data) else: @@ -514,27 +557,96 @@ class VariableLengthString(DTypeWrapper[np.dtypes.ObjectDType, str]): name = "numpy/vlen_string" kind = "string" default_value = np.object_("") + endianness: Endianness = field(default=None) + + def __post_init__(self) -> None: + if self.endianness is not None: + raise ValueError("VariableLengthString does not support endianness.") def to_dict(self) -> dict[str, JSON]: return {"name": self.name} @classmethod - def wrap(cls, dtype: np.dtypes.ObjectDType) -> Self: + def _wrap_unsafe(cls, dtype: np.dtypes.ObjectDType) -> Self: return cls() - def unwrap(self) -> np.dtype[np.dtypes.ObjectDType]: + def unwrap(self) -> np.dtypes.ObjectDType: return super().unwrap() def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: return str(data) def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: + if not check_json_str(data): + raise TypeError(f"Invalid type: {data}. Expected a string.") return self.unwrap().type(data) -def resolve_dtype(dtype: npt.DTypeLike | DTypeWrapper | dict[str, JSON]) -> DTypeWrapper: - from zarr.registry import get_data_type_from_dict, get_data_type_from_numpy +@dataclass(frozen=True, kw_only=True) +class StructuredDtype(DTypeWrapper[np.dtypes.VoidDType, np.void]): + name = "numpy/struct" + kind = "struct" + fields: tuple[tuple[str, DTypeWrapper[Any, Any], int], ...] 
+ + @classmethod + def check_dtype(cls, dtype: np.dtypes.DTypeLike) -> TypeGuard[np.dtypes.VoidDType]: + """ + Check that this dtype is a numpy structured dtype + """ + return super().check_dtype(dtype) and dtype.fields is not None + + @classmethod + def _wrap_unsafe(cls, dtype: np.dtypes.VoidDType) -> Self: + fields: list[tuple[str, DTypeWrapper[Any, Any], int]] = [] + + if dtype.fields is None: + raise ValueError("numpy dtype has no fields") + + for key, (dtype_instance, offset) in dtype.fields.items(): + dtype_wrapped = data_type_registry.match_dtype(dtype_instance) + fields.append((key, dtype_wrapped, offset)) + + return cls(fields=tuple(fields)) + def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: + return structured_scalar_to_json(data.tobytes(), zarr_format) + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: + if not check_json_str(data): + raise TypeError(f"Invalid type: {data}. Expected a string.") + as_bytes = structured_scalar_from_json(data, zarr_format=zarr_format) + dtype = self.unwrap() + return np.array([as_bytes], dtype=dtype.str).view(dtype)[0] + + +def get_data_type_from_numpy(dtype: npt.DTypeLike) -> DTypeWrapper: + if dtype in (str, "str"): + if _NUMPY_SUPPORTS_VLEN_STRING: + np_dtype = np.dtype("T") + else: + np_dtype = np.dtype("O") + else: + np_dtype = np.dtype(dtype) + data_type_registry.lazy_load() + for val in data_type_registry.contents.values(): + return val.wrap(np_dtype) + raise ValueError( + f"numpy dtype '{dtype}' does not have a corresponding Zarr dtype in: {list(data_type_registry.contents)}." 
+ ) + + +def get_data_type_from_dict(dtype: dict[str, JSON]) -> DTypeWrapper: + data_type_registry.lazy_load() + dtype_name = dtype["name"] + dtype_cls = data_type_registry.get(dtype_name) + if dtype_cls is None: + raise ValueError(f"No data type class matching name {dtype_name}") + return dtype_cls.from_dict(dtype.get("configuration", {})) + + +def resolve_dtype( + dtype: npt.DTypeLike | DTypeWrapper[Any, Any] | dict[str, JSON], +) -> DTypeWrapper[Any, Any]: if isinstance(dtype, DTypeWrapper): return dtype elif isinstance(dtype, dict): @@ -543,6 +655,55 @@ def resolve_dtype(dtype: npt.DTypeLike | DTypeWrapper | dict[str, JSON]) -> DTyp return get_data_type_from_numpy(dtype) +def get_data_type_by_name( + dtype: str, configuration: dict[str, JSON] | None = None +) -> DTypeWrapper[Any, Any]: + data_type_registry.lazy_load() + if configuration is None: + _configuration = {} + else: + _configuration = configuration + maybe_dtype_cls = data_type_registry.get(dtype) + if maybe_dtype_cls is None: + raise ValueError(f"No data type class matching name {dtype}") + return maybe_dtype_cls.from_dict(_configuration) + + +@dataclass(frozen=True, kw_only=True) +class DataTypeRegistry: + contents: dict[str, type[DTypeWrapper[Any, Any]]] = field(default_factory=dict, init=False) + lazy_load_list: list[EntryPoint] = field(default_factory=list, init=False) + + def lazy_load(self) -> None: + for e in self.lazy_load_list: + self.register(e.load()) + + self.lazy_load_list.clear() + + def register(self: Self, cls: type[DTypeWrapper[Any, Any]]) -> None: + # don't register the same dtype twice + if cls.name not in self.contents or self.contents[cls.name] != cls: + self.contents[cls.name] = cls + + def get(self, key: str) -> type[DTypeWrapper[Any, Any]]: + return self.contents[key] + + def match_dtype(self, dtype: npt.DTypeLike) -> DTypeWrapper[Any, Any]: + data_type_registry.lazy_load() + for val in data_type_registry.contents.values(): + try: + return val._wrap_unsafe(dtype) + except 
ValueError: + pass + raise ValueError(f"No data type wrapper found that matches {dtype}") + + +def register_data_type(cls: type[DTypeWrapper[Any, Any]]) -> None: + data_type_registry.register(cls) + + +data_type_registry = DataTypeRegistry() + INTEGER_DTYPE = Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64 FLOAT_DTYPE = Float16 | Float32 | Float64 COMPLEX_DTYPE = Complex64 | Complex128 diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 2ba2ac5c45..ebf174eff3 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -4,14 +4,17 @@ import warnings from collections.abc import Iterable from enum import Enum -from functools import cached_property from typing import TYPE_CHECKING, TypedDict, cast import numcodecs.abc from zarr.abc.metadata import Metadata -from zarr.core.metadata.dtype import DTypeWrapper -from zarr.registry import get_data_type_from_numpy +from zarr.core.metadata.dtype import ( + DTypeWrapper, + StaticByteString, + StaticRawBytes, + get_data_type_from_numpy, +) if TYPE_CHECKING: from typing import Any, Literal, Self @@ -28,7 +31,6 @@ import numpy as np from zarr.core.array_spec import ArrayConfig, ArraySpec -from zarr.core.chunk_grids import RegularChunkGrid from zarr.core.chunk_key_encodings import parse_separator from zarr.core.common import JSON, ZARRAY_JSON, ZATTRS_JSON, MemoryOrder, parse_shapelike from zarr.core.config import config, parse_indexing_order @@ -102,10 +104,6 @@ def __init__( def ndim(self) -> int: return len(self.shape) - @cached_property - def chunk_grid(self) -> RegularChunkGrid: - return RegularChunkGrid(chunk_shape=self.chunks) - @property def shards(self) -> ChunkCoords | None: return None @@ -199,11 +197,14 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: def to_dict(self) -> dict[str, JSON]: zarray_dict = super().to_dict() - if self.dtype.kind in "SV" and self.fill_value is not None: + if ( + isinstance(self.dtype, StaticByteString | 
StaticRawBytes) + and self.fill_value is not None + ): # There's a relationship between self.dtype and self.fill_value # that mypy isn't aware of. The fact that we have S or V dtype here # means we should have a bytes-type fill_value. - fill_value = base64.standard_b64encode(cast(bytes, self.fill_value)).decode("ascii") + fill_value = self.dtype.to_json_value(self.fill_value, zarr_format=2) zarray_dict["fill_value"] = fill_value _ = zarray_dict.pop("dtype") @@ -351,35 +352,6 @@ def parse_fill_value(fill_value: object, dtype: np.dtype[Any]) -> Any: return fill_value -def _default_fill_value(dtype: np.dtype[Any]) -> Any: - """ - Get the default fill value for a type. - - Notes - ----- - This differs from :func:`parse_fill_value`, which parses a fill value - stored in the Array metadata into an in-memory value. This only gives - the default fill value for some type. - - This is useful for reading Zarr format 2 arrays, which allow the fill - value to be unspecified. - """ - if dtype.kind == "S": - return b"" - elif dtype.kind in "UO": - return "" - elif dtype.kind in "Mm": - return dtype.type("nat") - elif dtype.kind == "V": - if dtype.fields is not None: - default = tuple(_default_fill_value(field[0]) for field in dtype.fields.values()) - return np.array([default], dtype=dtype) - else: - return np.zeros(1, dtype=dtype) - else: - return dtype.type(0) - - def _default_compressor( dtype: DTypeWrapper[Any, Any], ) -> dict[str, JSON] | None: diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 8bf20899c3..e285490bfd 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -4,7 +4,13 @@ from zarr.abc.metadata import Metadata from zarr.core.buffer.core import default_buffer_prototype -from zarr.core.metadata.dtype import DTypeWrapper +from zarr.core.metadata.dtype import ( + DTypeWrapper, + VariableLengthString, + get_data_type_by_name, + get_data_type_from_dict, +) + if TYPE_CHECKING: from collections.abc import Callable from 
typing import Self @@ -12,7 +18,7 @@ from zarr.core.buffer import Buffer, BufferPrototype from zarr.core.chunk_grids import ChunkGrid from zarr.core.common import JSON, ChunkCoords - + import json from collections.abc import Iterable @@ -37,7 +43,7 @@ from zarr.core.config import config from zarr.core.metadata.common import parse_attributes from zarr.errors import MetadataValidationError, NodeTypeValidationError -from zarr.registry import get_codec_class, get_data_type_by_name, get_data_type_from_dict +from zarr.registry import get_codec_class DEFAULT_DTYPE = "float64" @@ -103,14 +109,10 @@ def validate_codecs(codecs: tuple[Codec, ...], dtype: DTypeWrapper) -> None: # we need to have special codecs if we are decoding vlen strings or bytestrings # TODO: use codec ID instead of class name codec_class_name = abc.__class__.__name__ - if dtype.kind == "string" and not codec_class_name == "VLenUTF8Codec": + if isinstance(dtype, VariableLengthString) and not codec_class_name == "VLenUTF8Codec": raise ValueError( f"For string dtype, ArrayBytesCodec must be `VLenUTF8Codec`, got `{codec_class_name}`." ) - if dtype.kind == "bytes" and not codec_class_name == "VLenBytesCodec": - raise ValueError( - f"For bytes dtype, ArrayBytesCodec must be `VLenBytesCodec`, got `{codec_class_name}`." - ) def parse_dimension_names(data: object) -> tuple[str | None, ...] 
| None: @@ -313,11 +315,6 @@ def _validate_metadata(self) -> None: shape=self.shape, dtype=self.data_type.unwrap(), chunk_grid=self.chunk_grid ) - @property - def dtype(self) -> np.dtype[Any]: - """Interpret Zarr dtype as NumPy dtype""" - return self.data_type.unwrap() - @property def ndim(self) -> int: return len(self.shape) @@ -365,20 +362,6 @@ def inner_codecs(self) -> tuple[Codec, ...]: return self.codecs[0].codecs return self.codecs - def get_chunk_spec( - self, _chunk_coords: ChunkCoords, array_config: ArrayConfig, prototype: BufferPrototype - ) -> ArraySpec: - assert isinstance(self.chunk_grid, RegularChunkGrid), ( - "Currently, only regular chunk grid is supported" - ) - return ArraySpec( - shape=self.chunk_grid.chunk_shape, - dtype=self.dtype, - fill_value=self.fill_value, - config=array_config, - prototype=prototype, - ) - def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str: return self.chunk_key_encoding.encode_chunk_key(chunk_coords) diff --git a/src/zarr/registry.py b/src/zarr/registry.py index 7760c599fd..8830cdb1a9 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -2,19 +2,15 @@ import warnings from collections import defaultdict -from dataclasses import dataclass, field from importlib.metadata import entry_points as get_entry_points -from typing import TYPE_CHECKING, Any, Generic, Self, TypeVar - -import numpy as np +from typing import TYPE_CHECKING, Any, Generic, TypeVar from zarr.core.config import BadConfigError, config +from zarr.core.metadata.dtype import data_type_registry if TYPE_CHECKING: from importlib.metadata import EntryPoint - import numpy.typing as npt - from zarr.abc.codec import ( ArrayArrayCodec, ArrayBytesCodec, @@ -24,7 +20,6 @@ ) from zarr.core.buffer import Buffer, NDBuffer from zarr.core.common import JSON - from zarr.core.metadata.dtype import DTypeWrapper __all__ = [ "Registry", @@ -56,31 +51,10 @@ def register(self, cls: type[T]) -> None: self[fully_qualified_name(cls)] = cls -@dataclass(frozen=True, 
kw_only=True) -class DataTypeRegistry: - contents: dict[str, type[DTypeWrapper]] = field(default_factory=dict, init=False) - lazy_load_list: list[EntryPoint] = field(default_factory=list, init=False) - - def lazy_load(self) -> None: - for e in self.lazy_load_list: - self.register(e.load()) - - self.lazy_load_list.clear() - - def register(self: Self, cls: type[DTypeWrapper]) -> None: - # don't register the same dtype twice - if cls.name not in self.contents or self.contents[cls.name] != cls: - self.contents[cls.name] = cls - - def get(self, key: str) -> type[DTypeWrapper]: - return self.contents[key] - - __codec_registries: dict[str, Registry[Codec]] = defaultdict(Registry) __pipeline_registry: Registry[CodecPipeline] = Registry() __buffer_registry: Registry[Buffer] = Registry() __ndbuffer_registry: Registry[NDBuffer] = Registry() -__data_type_registry = DataTypeRegistry() """ The registry module is responsible for managing implementations of codecs, @@ -117,8 +91,8 @@ def _collect_entrypoints() -> list[Registry[Any]]: __ndbuffer_registry.lazy_load_list.extend(entry_points.select(group="zarr.ndbuffer")) __ndbuffer_registry.lazy_load_list.extend(entry_points.select(group="zarr", name="ndbuffer")) - __data_type_registry.lazy_load_list.extend(entry_points.select(group="zarr.data_type")) - __data_type_registry.lazy_load_list.extend(entry_points.select(group="zarr", name="data_type")) + data_type_registry.lazy_load_list.extend(entry_points.select(group="zarr.data_type")) + data_type_registry.lazy_load_list.extend(entry_points.select(group="zarr", name="data_type")) __pipeline_registry.lazy_load_list.extend(entry_points.select(group="zarr.codec_pipeline")) __pipeline_registry.lazy_load_list.extend( @@ -165,10 +139,6 @@ def register_buffer(cls: type[Buffer]) -> None: __buffer_registry.register(cls) -def register_data_type(cls: type[DTypeWrapper]) -> None: - __data_type_registry.register(cls) - - def get_codec_class(key: str, reload_config: bool = False) -> type[Codec]: if 
reload_config: _reload_config() @@ -305,36 +275,4 @@ def get_ndbuffer_class(reload_config: bool = False) -> type[NDBuffer]: ) -def get_data_type_by_name(dtype: str, configuration: dict[str, JSON] | None = None) -> DTypeWrapper: - __data_type_registry.lazy_load() - if configuration is None: - _configuration = {} - else: - _configuration = configuration - maybe_dtype_cls = __data_type_registry.get(dtype) - if maybe_dtype_cls is None: - raise ValueError(f"No data type class matching name {dtype}") - return maybe_dtype_cls.from_dict(_configuration) - - -def get_data_type_from_dict(dtype: dict[str, JSON]) -> DTypeWrapper: - __data_type_registry.lazy_load() - dtype_name = dtype["name"] - dtype_cls = __data_type_registry.get(dtype_name) - if dtype_cls is None: - raise ValueError(f"No data type class matching name {dtype_name}") - return dtype_cls.from_dict(dtype.get("configuration", {})) - - -def get_data_type_from_numpy(dtype: npt.DTypeLike) -> DTypeWrapper: - np_dtype = np.dtype(dtype) - __data_type_registry.lazy_load() - for val in __data_type_registry.contents.values(): - if val.dtype_cls is type(np_dtype): - return val.wrap(np_dtype) - raise ValueError( - f"numpy dtype '{dtype}' does not have a corresponding Zarr dtype in: {list(__data_type_registry.contents)}." 
- ) - - _collect_entrypoints() diff --git a/tests/conftest.py b/tests/conftest.py index a650accc51..6ff1c4596f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -20,10 +20,10 @@ from zarr.core.chunk_grids import RegularChunkGrid, _auto_partition from zarr.core.common import JSON, parse_shapelike from zarr.core.config import config as zarr_config +from zarr.core.metadata.dtype import get_data_type_from_numpy from zarr.core.metadata.v2 import ArrayV2Metadata from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.core.sync import sync -from zarr.registry import get_data_type_from_numpy from zarr.storage import FsspecStore, LocalStore, MemoryStore, StorePath, ZipStore if TYPE_CHECKING: @@ -243,7 +243,7 @@ def create_array_metadata( filters: FiltersLike = "auto", compressors: CompressorsLike = "auto", serializer: SerializerLike = "auto", - fill_value: Any | None = None, + fill_value: Any = 0, order: MemoryOrder | None = None, zarr_format: ZarrFormat, attributes: dict[str, JSON] | None = None, @@ -263,7 +263,7 @@ def create_array_metadata( array_shape=shape_parsed, shard_shape=shards, chunk_shape=chunks, - dtype=dtype_parsed.unwrap().itemsize, + item_size=dtype_parsed.unwrap().itemsize, ) if order is None: @@ -274,11 +274,11 @@ def create_array_metadata( if zarr_format == 2: filters_parsed, compressor_parsed = _parse_chunk_encoding_v2( - compressor=compressors, filters=filters, dtype=np.dtype(dtype) + compressor=compressors, filters=filters, dtype=dtype_parsed ) return ArrayV2Metadata( shape=shape_parsed, - dtype=np.dtype(dtype), + dtype=dtype_parsed, chunks=chunk_shape_parsed, order=order_parsed, dimension_separator=chunk_key_encoding_parsed.separator, @@ -379,7 +379,7 @@ def meta_from_array( filters: FiltersLike = "auto", compressors: CompressorsLike = "auto", serializer: SerializerLike = "auto", - fill_value: Any | None = None, + fill_value: Any = 0, order: MemoryOrder | None = None, zarr_format: ZarrFormat = 3, attributes: dict[str, JSON] | None = None, 
diff --git a/tests/test_array.py b/tests/test_array.py index 959cf02055..d54001b54e 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -22,8 +22,6 @@ BytesCodec, GzipCodec, TransposeCodec, - VLenBytesCodec, - VLenUTF8Codec, ZstdCodec, ) from zarr.core._info import ArrayInfo @@ -43,6 +41,7 @@ from zarr.core.common import JSON, MemoryOrder, ZarrFormat from zarr.core.group import AsyncGroup from zarr.core.indexing import BasicIndexer, ceildiv +from zarr.core.metadata.dtype import get_data_type_from_numpy from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.core.sync import sync from zarr.errors import ContainsArrayError, ContainsGroupError @@ -451,24 +450,7 @@ def test_vlen_errors() -> None: ValueError, match="For string dtype, ArrayBytesCodec must be `VLenUTF8Codec`, got `BytesCodec`.", ): - Array.create(MemoryStore(), shape=5, chunks=5, dtype=" None: - """ - Test that the default ``filters`` and ``compressors`` are removed when ``create_array`` is invoked. - """ - - arr = await create_array( - store=store, - dtype=dtype, - shape=(10,), - zarr_format=zarr_format, - compressors=empty_value, - filters=empty_value, - ) - # Test metadata explicitly - if zarr_format == 2: - assert arr.metadata.zarr_format == 2 # guard for mypy - # v2 spec requires that filters be either a collection with at least one filter, or None - assert arr.metadata.filters is None - # Compressor is a single element in v2 metadata; the absence of a compressor is encoded - # as None - assert arr.metadata.compressor is None - - assert arr.filters == () - assert arr.compressors == () - else: - assert arr.metadata.zarr_format == 3 # guard for mypy - if dtype == "str": - assert arr.metadata.codecs == (VLenUTF8Codec(),) - assert arr.serializer == VLenUTF8Codec() - else: - assert arr.metadata.codecs == (BytesCodec(),) - assert arr.serializer == BytesCodec() - @staticmethod @pytest.mark.parametrize("dtype", ["uint8", "float32", "str", "U3", "S4", "V1"]) @pytest.mark.parametrize( @@ -1131,28 
+1075,27 @@ async def test_v2_chunk_encoding( assert arr.filters == filters_expected @staticmethod - @pytest.mark.parametrize("dtype", ["uint8", "float32", "str"]) + @pytest.mark.parametrize("dtype_str", ["uint8", "float32", "str"]) async def test_default_filters_compressors( - store: MemoryStore, dtype: str, zarr_format: ZarrFormat + store: MemoryStore, dtype_str: str, zarr_format: ZarrFormat ) -> None: """ Test that the default ``filters`` and ``compressors`` are used when ``create_array`` is invoked with ``filters`` and ``compressors`` unspecified. """ + zdtype = get_data_type_from_numpy(dtype_str) arr = await create_array( store=store, - dtype=dtype, + dtype=dtype_str, shape=(10,), zarr_format=zarr_format, ) if zarr_format == 3: expected_filters, expected_serializer, expected_compressors = ( - _get_default_chunk_encoding_v3(np_dtype=np.dtype(dtype)) + _get_default_chunk_encoding_v3(dtype=zdtype) ) elif zarr_format == 2: - default_filters, default_compressors = _get_default_chunk_encoding_v2( - dtype=np.dtype(dtype) - ) + default_filters, default_compressors = _get_default_chunk_encoding_v2(dtype=zdtype) if default_filters is None: expected_filters = () else: diff --git a/tests/test_codecs/test_vlen.py b/tests/test_codecs/test_vlen.py index f73b5e1969..a6c01153ff 100644 --- a/tests/test_codecs/test_vlen.py +++ b/tests/test_codecs/test_vlen.py @@ -8,9 +8,9 @@ from zarr.abc.codec import Codec from zarr.abc.store import Store from zarr.codecs import ZstdCodec +from zarr.core.metadata.dtype import get_data_type_from_numpy from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING -from zarr.registry import get_data_type_from_numpy from zarr.storage import StorePath numpy_str_dtypes: list[type | str | None] = [None, str, "str", np.dtypes.StrDType, "S", "U"] @@ -46,9 +46,11 @@ def test_vlen_string( # should also work if input array is an object array, provided we explicitly specified # a stringlike dtype when creating 
the Array if as_object_array: - data = data.astype("O") + data_obj = data.astype("O") - a[:, :] = data + a[:, :] = data_obj + else: + a[:, :] = data assert np.array_equal(data, a[:, :]) assert a.metadata.data_type == get_data_type_from_numpy(data.dtype) assert a.dtype == data.dtype @@ -59,39 +61,3 @@ def test_vlen_string( assert np.array_equal(data, b[:, :]) assert b.metadata.data_type == get_data_type_from_numpy(data.dtype) assert a.dtype == data.dtype - - -@pytest.mark.parametrize("store", ["memory", "local"], indirect=["store"]) -@pytest.mark.parametrize("as_object_array", [False, True]) -@pytest.mark.parametrize("compressor", [None, ZstdCodec()]) -def test_vlen_bytes(store: Store, as_object_array: bool, compressor: Codec | None) -> None: - bstrings = [b"hello", b"world", b"this", b"is", b"a", b"test"] - data = np.array(bstrings).reshape((2, 3)) - assert data.dtype == "|S5" - - sp = StorePath(store, path="string") - a = zarr.create_array( - sp, - shape=data.shape, - chunks=data.shape, - dtype=data.dtype, - fill_value=b"", - compressors=compressor, - ) - assert isinstance(a.metadata, ArrayV3Metadata) # needed for mypy - - # should also work if input array is an object array, provided we explicitly specified - # a bytesting-like dtype when creating the Array - if as_object_array: - data = data.astype("O") - a[:, :] = data - assert np.array_equal(data, a[:, :]) - assert a.metadata.data_type == DataType.bytes - assert a.dtype == "O" - - # test round trip - b = Array.open(sp) - assert isinstance(b.metadata, ArrayV3Metadata) # needed for mypy - assert np.array_equal(data, b[:, :]) - assert b.metadata.data_type == DataType.bytes - assert a.dtype == "O" diff --git a/tests/test_group.py b/tests/test_group.py index 521819ea0e..378e65d26a 100644 --- a/tests/test_group.py +++ b/tests/test_group.py @@ -993,7 +993,7 @@ async def test_asyncgroup_create_array( assert subnode.dtype == dtype # todo: fix the type annotation of array.metadata.chunk_grid so that we get some 
autocomplete # here. - assert subnode.metadata.chunk_grid.chunk_shape == chunk_shape + assert subnode.chunk_grid.chunk_shape == chunk_shape assert subnode.metadata.zarr_format == zarr_format diff --git a/tests/test_metadata/test_consolidated.py b/tests/test_metadata/test_consolidated.py index c1ff2e130a..a81625b7eb 100644 --- a/tests/test_metadata/test_consolidated.py +++ b/tests/test_metadata/test_consolidated.py @@ -20,6 +20,7 @@ from zarr.core.buffer import cpu, default_buffer_prototype from zarr.core.group import ConsolidatedMetadata, GroupMetadata from zarr.core.metadata import ArrayV3Metadata +from zarr.core.metadata.dtype import get_data_type_from_numpy from zarr.core.metadata.v2 import ArrayV2Metadata from zarr.storage import StorePath @@ -503,7 +504,7 @@ async def test_consolidated_metadata_backwards_compatibility( async def test_consolidated_metadata_v2(self): store = zarr.storage.MemoryStore() g = await AsyncGroup.from_store(store, attributes={"key": "root"}, zarr_format=2) - dtype = "uint8" + dtype = get_data_type_from_numpy("uint8") await g.create_array(name="a", shape=(1,), attributes={"key": "a"}, dtype=dtype) g1 = await g.create_group(name="g1", attributes={"key": "g1"}) await g1.create_group(name="g2", attributes={"key": "g2"}) diff --git a/tests/test_metadata/test_v2.py b/tests/test_metadata/test_v2.py index 4600a977d4..2637224f93 100644 --- a/tests/test_metadata/test_v2.py +++ b/tests/test_metadata/test_v2.py @@ -3,7 +3,6 @@ import json from typing import TYPE_CHECKING, Literal -import numpy as np import pytest import zarr.api.asynchronous @@ -12,6 +11,7 @@ from zarr.core.buffer.core import default_buffer_prototype from zarr.core.group import ConsolidatedMetadata, GroupMetadata from zarr.core.metadata import ArrayV2Metadata +from zarr.core.metadata.dtype import Float32, Float64, Int16 from zarr.core.metadata.v2 import parse_zarr_format if TYPE_CHECKING: @@ -219,7 +219,7 @@ async def test_read_consolidated_metadata( fill_value=0, chunks=(730,), 
attributes={"_ARRAY_DIMENSIONS": ["time"], "dataset": "NMC Reanalysis"}, - dtype=np.dtype("int16"), + dtype=Int16(), order="C", filters=None, dimension_separator=".", @@ -236,7 +236,7 @@ async def test_read_consolidated_metadata( "standard_name": "time", "units": "hours since 1800-01-01", }, - dtype=np.dtype("float32"), + dtype=Float32(), order="C", filters=None, dimension_separator=".", @@ -254,7 +254,7 @@ async def test_read_consolidated_metadata( attributes={ "calendar": "standard", }, - dtype=np.dtype("float32"), + dtype=Float32(), order="C", filters=None, dimension_separator=".", @@ -295,7 +295,7 @@ def test_from_dict_extra_fields() -> None: expected = ArrayV2Metadata( attributes={"key": "value"}, shape=(8,), - dtype="float64", + dtype=Float64(), chunks=(8,), fill_value=0.0, order="C", diff --git a/tests/test_metadata/test_v3.py b/tests/test_metadata/test_v3.py index 54e077f1a6..37d8704b50 100644 --- a/tests/test_metadata/test_v3.py +++ b/tests/test_metadata/test_v3.py @@ -12,7 +12,7 @@ from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding, V2ChunkKeyEncoding from zarr.core.config import config from zarr.core.group import GroupMetadata, parse_node_type -from zarr.core.metadata.dtype import complex_from_json +from zarr.core.metadata.dtype import complex_from_json, get_data_type_from_numpy from zarr.core.metadata.v3 import ( ArrayV3Metadata, parse_dimension_names, @@ -20,7 +20,6 @@ ) from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING from zarr.errors import MetadataValidationError -from zarr.registry import get_data_type_from_numpy if TYPE_CHECKING: from collections.abc import Sequence @@ -56,9 +55,9 @@ complex_dtypes = ("complex64", "complex128") flexible_dtypes = ("str", "bytes", "void") if _NUMPY_SUPPORTS_VLEN_STRING: - vlen_string_dtypes = ("T", "O") + vlen_string_dtypes = ("T",) else: - vlen_string_dtypes = "O" + vlen_string_dtypes = ("O",) dtypes = ( *bool_dtypes, @@ -182,7 +181,7 @@ def 
test_parse_fill_value_invalid_type_sequence(fill_value: Any, dtype_str: str) @pytest.mark.parametrize("chunk_grid", ["regular"]) @pytest.mark.parametrize("attributes", [None, {"foo": "bar"}]) -@pytest.mark.parametrize("codecs", [[BytesCodec()]]) +@pytest.mark.parametrize("codecs", [[BytesCodec(endian=None)]]) @pytest.mark.parametrize("fill_value", [0, 1]) @pytest.mark.parametrize("chunk_key_encoding", ["v2", "default"]) @pytest.mark.parametrize("dimension_separator", [".", "/", None]) diff --git a/tests/test_v2.py b/tests/test_v2.py index 0a4487cfcc..c5ed39472f 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -85,14 +85,14 @@ def test_codec_pipeline() -> None: @pytest.mark.parametrize( - ("dtype", "expected_dtype", "fill_value", "fill_value_encoding"), + ("dtype", "expected_dtype", "fill_value", "fill_value_json"), [ ("|S", "|S0", b"X", "WA=="), ("|V", "|V0", b"X", "WA=="), ("|V10", "|V10", b"X", "WAAAAAAAAAAAAA=="), ], ) -async def test_v2_encode_decode(dtype, expected_dtype, fill_value, fill_value_encoding) -> None: +async def test_v2_encode_decode(dtype, expected_dtype, fill_value, fill_value_json) -> None: with config.set( { "array.v2_default_filters.bytes": [{"id": "vlen-bytes"}], @@ -113,7 +113,7 @@ async def test_v2_encode_decode(dtype, expected_dtype, fill_value, fill_value_en "chunks": [3], "compressor": None, "dtype": expected_dtype, - "fill_value": fill_value_encoding, + "fill_value": fill_value_json, "filters": [{"id": "vlen-bytes"}] if dtype == "|S" else None, "order": "C", "shape": [3], @@ -127,37 +127,30 @@ async def test_v2_encode_decode(dtype, expected_dtype, fill_value, fill_value_en np.testing.assert_equal(data, expected) -@pytest.mark.parametrize("dtype_value", [["|S", b"Y"], ["|U", "Y"], ["O", b"Y"]]) -def test_v2_encode_decode_with_data(dtype_value): - dtype, value = dtype_value - with config.set( - { - "array.v2_default_filters": { - "string": [{"id": "vlen-utf8"}], - "bytes": [{"id": "vlen-bytes"}], - }, - } - ): - expected = 
np.full((3,), value, dtype=dtype) - a = zarr.create( - shape=(3,), - zarr_format=2, - dtype=dtype, - ) - a[:] = expected - data = a[:] - np.testing.assert_equal(data, expected) +@pytest.mark.parametrize(("dtype", "value"), [("|S1", b"Y"), ("|U1", "Y"), ("O", "Y")]) +def test_v2_encode_decode_with_data(dtype, value): + dtype, value = dtype, value + expected = np.full((3,), value, dtype=dtype) + a = zarr.create( + shape=(3,), + zarr_format=2, + dtype=dtype, + ) + a[:] = expected + data = a[:] + np.testing.assert_equal(data, expected) @pytest.mark.parametrize("dtype", [str, "str"]) async def test_create_dtype_str(dtype: Any) -> None: + data = ["a", "bb", "ccc"] arr = zarr.create(shape=3, dtype=dtype, zarr_format=2) assert arr.dtype.kind == "O" assert arr.metadata.to_dict()["dtype"] == "|O" - assert arr.metadata.filters == (numcodecs.vlen.VLenBytes(),) - arr[:] = [b"a", b"bb", b"ccc"] + assert arr.metadata.filters == (numcodecs.vlen.VLenUTF8(),) + arr[:] = data result = arr[:] - np.testing.assert_array_equal(result, np.array([b"a", b"bb", b"ccc"], dtype="object")) + np.testing.assert_array_equal(result, np.array(data, dtype="object")) @pytest.mark.parametrize("filters", [[], [numcodecs.Delta(dtype=" None: - with config.set( - { - "array.v2_default_compressor": { - "numeric": {"id": "zstd", "level": "0"}, - "string": {"id": "zstd", "level": "0"}, - "bytes": {"id": "zstd", "level": "0"}, - }, - "array.v2_default_filters": { - "numeric": [], - "string": [{"id": "vlen-utf8"}], - "bytes": [{"id": "vlen-bytes"}], - }, - } - ): - dtype, expected_compressor, expected_filter = dtype_expected - arr = zarr.create(shape=(3,), path="foo", store={}, zarr_format=2, dtype=dtype) - assert arr.metadata.compressor.codec_id == expected_compressor - if expected_filter is not None: - assert arr.metadata.filters[0].codec_id == expected_filter - - @pytest.mark.parametrize("fill_value", [None, (b"", 0, 0.0)], ids=["no_fill", "fill"]) def test_structured_dtype_roundtrip(fill_value, tmp_path) -> 
None: a = np.array( From c1a85663f7f4a939a84188d81d025f0561cf4a73 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 4 Mar 2025 23:08:15 +0100 Subject: [PATCH 016/130] dtype-specific tests --- src/zarr/core/metadata/dtype.py | 161 ++++++++++++++++-------- tests/test_metadata/test_dtype.py | 203 ++++++++++++++++++++++++++++++ 2 files changed, 312 insertions(+), 52 deletions(-) create mode 100644 tests/test_metadata/test_dtype.py diff --git a/src/zarr/core/metadata/dtype.py b/src/zarr/core/metadata/dtype.py index a573794730..590ab7df67 100644 --- a/src/zarr/core/metadata/dtype.py +++ b/src/zarr/core/metadata/dtype.py @@ -172,7 +172,7 @@ def structured_scalar_to_json(data: bytes, zarr_format: ZarrFormat) -> str: raise NotImplementedError(f"Invalid zarr format: {zarr_format}. Expected 2.") -def structured_scalar_from_json(data: JSON, zarr_format: ZarrFormat) -> bytes: +def structured_scalar_from_json(data: str, zarr_format: ZarrFormat) -> bytes: if zarr_format == 2: return base64.b64decode(data.encode("ascii")) raise NotImplementedError(f"Invalid zarr format: {zarr_format}. Expected 2.") @@ -202,11 +202,13 @@ def float_from_json(data: JSONFloat, zarr_format: ZarrFormat) -> float: return float_from_json_v3(data) -def complex_from_json_v2(data: JSONFloat, dtype: Any) -> np.complexfloating: - return dtype.type(data) +def complex_from_json_v2(data: JSONFloat, dtype: Any) -> np.complexfloating[Any, Any]: + return dtype.type(complex(*data)) -def complex_from_json_v3(data: tuple[JSONFloat, JSONFloat], dtype: Any) -> np.complexfloating: +def complex_from_json_v3( + data: tuple[JSONFloat, JSONFloat], dtype: Any +) -> np.complexfloating[Any, Any]: return dtype.type(complex(*data)) @@ -223,6 +225,14 @@ def complex_from_json( raise ValueError(f"Invalid zarr format: {zarr_format}. 
Expected 2 or 3.") +def datetime_to_json(data: np.datetime64[Any]) -> int: + return data.view("int").item() + + +def datetime_from_json(data: int, unit: DateUnit | TimeUnit) -> np.datetime64[Any]: + return np.int64(data).view(f"datetime64[{unit}]") + + TDType = TypeVar("TDType", bound=np.dtype[Any]) TScalar = TypeVar("TScalar", bound=np.generic | str) @@ -231,8 +241,6 @@ def complex_from_json( class DTypeWrapper(Generic[TDType, TScalar], ABC, Metadata): name: ClassVar[str] dtype_cls: ClassVar[type[TDType]] # this class will create a numpy dtype - kind: ClassVar[DataTypeFlavor] - default_value: ClassVar[TScalar] endianness: Endianness | None = "native" def __init_subclass__(cls) -> None: @@ -248,6 +256,9 @@ def to_dict(self) -> dict[str, JSON]: def cast_value(self: Self, value: object) -> TScalar: return cast(np.generic, self.unwrap().type(value)) + @abstractmethod + def default_value(self) -> TScalar: ... + @classmethod def check_dtype(cls: type[Self], dtype: TDType) -> TypeGuard[TDType]: """ @@ -291,8 +302,9 @@ def from_json_value(self: Self, data: JSON, *, zarr_format: ZarrFormat) -> TScal @dataclass(frozen=True, kw_only=True) class Bool(DTypeWrapper[np.dtypes.BoolDType, np.bool_]): name = "bool" - kind = "boolean" - default_value = np.False_ + + def default_value(self) -> np.bool_: + return np.False_ @classmethod def _wrap_unsafe(cls, dtype: np.dtypes.BoolDType) -> Self: @@ -308,7 +320,8 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bool_: class IntWrapperBase(DTypeWrapper[TDType, TScalar]): - kind = "numeric" + def default_value(self) -> TScalar: + return self.unwrap().type(0) @classmethod def _wrap_unsafe(cls, dtype: TDType) -> Self: @@ -326,53 +339,46 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TScalar: @dataclass(frozen=True, kw_only=True) class Int8(IntWrapperBase[np.dtypes.Int8DType, np.int8]): name = "int8" - default_value = np.int8(0) @dataclass(frozen=True, kw_only=True) class 
UInt8(IntWrapperBase[np.dtypes.UInt8DType, np.uint8]): name = "uint8" - default_value = np.uint8(0) @dataclass(frozen=True, kw_only=True) class Int16(IntWrapperBase[np.dtypes.Int16DType, np.int16]): name = "int16" - default_value = np.int16(0) @dataclass(frozen=True, kw_only=True) class UInt16(IntWrapperBase[np.dtypes.UInt16DType, np.uint16]): name = "uint16" - default_value = np.uint16(0) @dataclass(frozen=True, kw_only=True) class Int32(IntWrapperBase[np.dtypes.Int32DType, np.int32]): name = "int32" - default_value = np.int32(0) @dataclass(frozen=True, kw_only=True) class UInt32(IntWrapperBase[np.dtypes.UInt32DType, np.uint32]): name = "uint32" - default_value = np.uint32(0) @dataclass(frozen=True, kw_only=True) class Int64(IntWrapperBase[np.dtypes.Int64DType, np.int64]): name = "int64" - default_value = np.int64(0) @dataclass(frozen=True, kw_only=True) class UInt64(IntWrapperBase[np.dtypes.UInt64DType, np.uint64]): name = "uint64" - default_value = np.uint64(0) class FloatWrapperBase(DTypeWrapper[TDType, TScalar]): - kind = "numeric" + def default_value(self) -> TScalar: + return self.unwrap().type(0.0) @classmethod def _wrap_unsafe(cls, dtype: TDType) -> Self: @@ -390,26 +396,24 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TScalar: @dataclass(frozen=True, kw_only=True) class Float16(FloatWrapperBase[np.dtypes.Float16DType, np.float16]): name = "float16" - default_value = np.float16(0) @dataclass(frozen=True, kw_only=True) class Float32(FloatWrapperBase[np.dtypes.Float32DType, np.float32]): name = "float32" - default_value = np.float32(0) @dataclass(frozen=True, kw_only=True) class Float64(FloatWrapperBase[np.dtypes.Float64DType, np.float64]): name = "float64" - default_value = np.float64(0) @dataclass(frozen=True, kw_only=True) class Complex64(DTypeWrapper[np.dtypes.Complex64DType, np.complex64]): name = "complex64" - kind = "numeric" - default_value = np.complex64(0) + + def default_value(self) -> np.complex64: + return 
np.complex64(0.0) @classmethod def _wrap_unsafe(cls, dtype: np.dtypes.Complex64DType) -> Self: @@ -429,8 +433,9 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.complex6 @dataclass(frozen=True, kw_only=True) class Complex128(DTypeWrapper[np.dtypes.Complex128DType, np.complex128]): name = "complex128" - kind = "numeric" - default_value = np.complex128(0) + + def default_value(self) -> np.complex128: + return np.complex128(0.0) @classmethod def _wrap_unsafe(cls, dtype: np.dtypes.Complex128DType) -> Self: @@ -464,10 +469,11 @@ def unwrap(self) -> TDType: @dataclass(frozen=True, kw_only=True) class StaticByteString(FlexibleWrapperBase[np.dtypes.BytesDType, np.bytes_]): name = "numpy/static_byte_string" - kind = "string" - default_value = np.bytes_(0) item_size_bits = 8 + def default_value(self) -> np.bytes_: + return np.bytes_(b"") + def to_dict(self) -> dict[str, JSON]: return {"name": self.name, "configuration": {"length": self.length}} @@ -476,17 +482,18 @@ def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: if check_json_str(data): - return self.unwrap().type(data.encode("ascii")) + return self.unwrap().type(base64.standard_b64decode(data.encode("ascii"))) raise TypeError(f"Invalid type: {data}. 
Expected a string.") @dataclass(frozen=True, kw_only=True) class StaticRawBytes(FlexibleWrapperBase[np.dtypes.VoidDType, np.void]): name = "r*" - kind = "bytes" - default_value = np.void(b"") item_size_bits = 8 + def default_value(self) -> np.void: + return np.void(b"") + def to_dict(self) -> dict[str, JSON]: return {"name": f"r{self.length * self.item_size_bits}"} @@ -496,21 +503,22 @@ def unwrap(self) -> np.dtypes.VoidDType: endianness_code = endianness_to_numpy_str(self.endianness) return np.dtype(f"{endianness_code}V{self.length}") - def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> tuple[int, ...]: + def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: return base64.standard_b64encode(data).decode("ascii") def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: # todo: check that this is well-formed - return self.unwrap().type(bytes(data)) + return self.unwrap().type(base64.standard_b64decode(data)) @dataclass(frozen=True, kw_only=True) class StaticUnicodeString(FlexibleWrapperBase[np.dtypes.StrDType, np.str_]): name = "numpy/static_unicode_string" - kind = "string" - default_value = np.str_("") item_size_bits = 32 # UCS4 is 32 bits per code point + def default_value(self) -> np.str_: + return np.str_("") + def to_dict(self) -> dict[str, JSON]: return {"name": self.name, "configuration": {"length": self.length}} @@ -528,8 +536,9 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_: @dataclass(frozen=True, kw_only=True) class VariableLengthString(DTypeWrapper[np.dtypes.StringDType, str]): name = "numpy/vlen_string" - kind = "string" - default_value = "" + + def default_value(self) -> str: + return "" @classmethod def _wrap_unsafe(cls, dtype: np.dtypes.StringDType) -> Self: @@ -555,10 +564,11 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: @dataclass(frozen=True, kw_only=True) class VariableLengthString(DTypeWrapper[np.dtypes.ObjectDType, 
str]): name = "numpy/vlen_string" - kind = "string" - default_value = np.object_("") endianness: Endianness = field(default=None) + def default_value(self) -> str: + return "" + def __post_init__(self) -> None: if self.endianness is not None: raise ValueError("VariableLengthString does not support endianness.") @@ -570,24 +580,57 @@ def to_dict(self) -> dict[str, JSON]: def _wrap_unsafe(cls, dtype: np.dtypes.ObjectDType) -> Self: return cls() - def unwrap(self) -> np.dtypes.ObjectDType: - return super().unwrap() - def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: return str(data) def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: + """ + String literals pass through + """ if not check_json_str(data): raise TypeError(f"Invalid type: {data}. Expected a string.") - return self.unwrap().type(data) + return data + + +DateUnit = Literal["Y", "M", "W", "D"] +TimeUnit = Literal["h", "m", "s", "ms", "us", "μs", "ns", "ps", "fs", "as"] @dataclass(frozen=True, kw_only=True) -class StructuredDtype(DTypeWrapper[np.dtypes.VoidDType, np.void]): +class DateTime64(DTypeWrapper[np.dtypes.DateTime64DType, np.datetime64]): + name = "numpy/datetime64" + unit: DateUnit | TimeUnit + + def default_value(self) -> np.datetime64: + return np.datetime64("NaT") + + @classmethod + def _wrap_unsafe(cls, dtype: np.dtypes.DateTime64DType) -> Self: + unit = dtype.name[dtype.name.rfind("[") + 1 : dtype.name.rfind("]")] + return cls(unit=unit) + + def unwrap(self) -> np.dtypes.DateTime64DType: + return np.dtype(f"datetime64[{self.unit}]").newbyteorder( + endianness_to_numpy_str(self.endianness) + ) + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.datetime64: + if check_json_int(data): + return datetime_from_json(data, self.unit) + raise TypeError(f"Invalid type: {data}. 
Expected an integer.") + + def to_json_value(self, data: np.datetime64, *, zarr_format: ZarrFormat) -> int: + return datetime_to_json(data) + + +@dataclass(frozen=True, kw_only=True) +class Structured(DTypeWrapper[np.dtypes.VoidDType, np.void]): name = "numpy/struct" - kind = "struct" fields: tuple[tuple[str, DTypeWrapper[Any, Any], int], ...] + def default_value(self) -> np.void: + return np.array([0], dtype=self.unwrap())[0] + @classmethod def check_dtype(cls, dtype: np.dtypes.DTypeLike) -> TypeGuard[np.dtypes.VoidDType]: """ @@ -608,6 +651,9 @@ def _wrap_unsafe(cls, dtype: np.dtypes.VoidDType) -> Self: return cls(fields=tuple(fields)) + def unwrap(self) -> np.dtypes.VoidDType: + return np.dtype([(key, dtype.unwrap()) for (key, dtype, _) in self.fields]) + def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: return structured_scalar_to_json(data.tobytes(), zarr_format) @@ -629,7 +675,10 @@ def get_data_type_from_numpy(dtype: npt.DTypeLike) -> DTypeWrapper: np_dtype = np.dtype(dtype) data_type_registry.lazy_load() for val in data_type_registry.contents.values(): - return val.wrap(np_dtype) + try: + return val.wrap(np_dtype) + except TypeError: + pass raise ValueError( f"numpy dtype '{dtype}' does not have a corresponding Zarr dtype in: {list(data_type_registry.contents)}." 
) @@ -689,11 +738,11 @@ def get(self, key: str) -> type[DTypeWrapper[Any, Any]]: return self.contents[key] def match_dtype(self, dtype: npt.DTypeLike) -> DTypeWrapper[Any, Any]: - data_type_registry.lazy_load() - for val in data_type_registry.contents.values(): + self.lazy_load() + for val in self.contents.values(): try: - return val._wrap_unsafe(dtype) - except ValueError: + return val.wrap(dtype) + except TypeError: pass raise ValueError(f"No data type wrapper found that matches {dtype}") @@ -708,7 +757,15 @@ def register_data_type(cls: type[DTypeWrapper[Any, Any]]) -> None: FLOAT_DTYPE = Float16 | Float32 | Float64 COMPLEX_DTYPE = Complex64 | Complex128 STRING_DTYPE = StaticUnicodeString | VariableLengthString | StaticByteString -for dtype in get_args( - Bool | INTEGER_DTYPE | FLOAT_DTYPE | COMPLEX_DTYPE | STRING_DTYPE | StaticRawBytes -): +DTYPE = ( + Bool + | INTEGER_DTYPE + | FLOAT_DTYPE + | COMPLEX_DTYPE + | STRING_DTYPE + | StaticRawBytes + | Structured + | DateTime64 +) +for dtype in get_args(DTYPE): register_data_type(dtype) diff --git a/tests/test_metadata/test_dtype.py b/tests/test_metadata/test_dtype.py new file mode 100644 index 0000000000..a3f29a34f5 --- /dev/null +++ b/tests/test_metadata/test_dtype.py @@ -0,0 +1,203 @@ +from __future__ import annotations + +from typing import Any + +import numpy as np +import pytest + +from zarr.core.metadata.dtype import ( + Bool, + Complex64, + Complex128, + DataTypeRegistry, + DateTime64, + DTypeWrapper, + Float16, + Float32, + Float64, + Int8, + Int16, + Int32, + Int64, + StaticByteString, + StaticRawBytes, + StaticUnicodeString, + Structured, + UInt8, + UInt16, + UInt32, + UInt64, + VariableLengthString, +) + +_NUMPY_SUPPORTS_VLEN_STRING = hasattr(np.dtypes, "StringDType") +if _NUMPY_SUPPORTS_VLEN_STRING: + VLEN_STRING_DTYPE = np.dtypes.StringDType() + VLEN_STRING_CODE = "T" +else: + VLEN_STRING_DTYPE = np.dtypes.ObjectDType() + VLEN_STRING_CODE = "O" + + +@pytest.mark.parametrize( + ("wrapper_cls", 
"np_dtype"), + [ + (Bool, "bool"), + (Int8, "int8"), + (Int16, "int16"), + (Int32, "int32"), + (Int64, "int64"), + (UInt8, "uint8"), + (UInt16, "uint16"), + (UInt32, "uint32"), + (UInt64, "uint64"), + (Float32, "float32"), + (Float64, "float64"), + (Complex64, "complex64"), + (Complex128, "complex128"), + (StaticUnicodeString, "U"), + (StaticByteString, "S"), + (StaticRawBytes, "V"), + (VariableLengthString, VLEN_STRING_CODE), + (Structured, np.dtype([("a", np.float64), ("b", np.int8)])), + (DateTime64, "datetime64[s]"), + ], +) +def test_wrap(wrapper_cls: type[DTypeWrapper[Any, Any]], np_dtype: np.dtype | str) -> None: + """ + Test that the wrapper class has the correct dtype class bound to the dtype_cls variable + Test that the ``wrap`` method produces an instance of the wrapper class + Test that the ``unwrap`` method returns the original dtype + """ + dt = np.dtype(np_dtype) + assert wrapper_cls.dtype_cls is type(dt) + wrapped = wrapper_cls.wrap(dt) + + with pytest.raises(TypeError, match="Invalid dtype"): + wrapper_cls.wrap("not a dtype") + + assert isinstance(wrapped, wrapper_cls) + assert wrapped.unwrap() == dt + + +def test_registry_match() -> None: + """ + Test that registering a dtype in a data type registry works + Test that match_dtype resolves a numpy dtype into the stored dtype + Test that match_dtype raises an error if the dtype is not registered + """ + local_registry = DataTypeRegistry() + local_registry.register(Bool) + assert isinstance(local_registry.match_dtype(np.dtype("bool")), Bool) + outside_dtype = "int8" + with pytest.raises( + ValueError, match=f"No data type wrapper found that matches {outside_dtype}" + ): + local_registry.match_dtype(np.dtype(outside_dtype)) + + +# start writing new tests here + + +@pytest.mark.parametrize( + ("wrapper", "expected_default"), + [ + (Bool(), np.False_), + (Int8(), np.int8(0)), + (UInt8(), np.uint8(0)), + (Int16(), np.int16(0)), + (UInt16(), np.uint16(0)), + (Int32(), np.int32(0)), + (UInt32(), 
np.uint32(0)), + (Int64(), np.int64(0)), + (UInt64(), np.uint64(0)), + (Float16(), np.float16(0)), + (Float32(), np.float32(0)), + (Float64(), np.float64(0)), + (Complex64(), np.complex64(0)), + (Complex128(), np.complex128(0)), + (StaticByteString(length=3), np.bytes_(b"")), + (StaticRawBytes(length=3), np.void(b"")), + (StaticUnicodeString(length=3), np.str_("")), + ( + Structured(fields=(("a", Float64(), 0), ("b", Int8(), 8))), + np.array([0], dtype=[("a", np.float64), ("b", np.int8)])[0], + ), + (VariableLengthString(), ""), + (DateTime64(unit="s"), np.datetime64("NaT")), + ], +) +def test_default_value(wrapper: type[DTypeWrapper[Any, Any]], expected_default: Any) -> None: + """ + Test that the default_value method is correctly set for each dtype wrapper. + """ + if isinstance(wrapper, DateTime64): + assert np.isnan(wrapper.default_value()) + else: + assert wrapper.default_value() == expected_default + + +@pytest.mark.parametrize( + ("wrapper", "input_value", "expected_json"), + [ + (Bool(), np.bool_(True), True), + (Int8(), np.int8(42), 42), + (UInt8(), np.uint8(42), 42), + (Int16(), np.int16(42), 42), + (UInt16(), np.uint16(42), 42), + (Int32(), np.int32(42), 42), + (UInt32(), np.uint32(42), 42), + (Int64(), np.int64(42), 42), + (UInt64(), np.uint64(42), 42), + (Float16(), np.float16(42.0), 42.0), + (Float32(), np.float32(42.0), 42.0), + (Float64(), np.float64(42.0), 42.0), + (Complex64(), np.complex64(42.0 + 1.0j), (42.0, 1.0)), + (Complex128(), np.complex128(42.0 + 1.0j), (42.0, 1.0)), + (StaticByteString(length=4), np.bytes_(b"test"), "dGVzdA=="), + (StaticRawBytes(length=4), np.void(b"test"), "dGVzdA=="), + (StaticUnicodeString(length=4), np.str_("test"), "test"), + (VariableLengthString(), "test", "test"), + (DateTime64(unit="s"), np.datetime64("2021-01-01T00:00:00", "s"), 1609459200), + ], +) +def test_to_json_value_v2( + wrapper: type[DTypeWrapper[Any, Any]], input_value: Any, expected_json: Any +) -> None: + """ + Test the to_json_value method for 
each dtype wrapper for zarr v2 + """ + assert wrapper.to_json_value(input_value, zarr_format=2) == expected_json + + +@pytest.mark.parametrize( + ("wrapper", "json_value", "expected_value"), + [ + (Bool(), True, np.bool_(True)), + (Int8(), 42, np.int8(42)), + (UInt8(), 42, np.uint8(42)), + (Int16(), 42, np.int16(42)), + (UInt16(), 42, np.uint16(42)), + (Int32(), 42, np.int32(42)), + (UInt32(), 42, np.uint32(42)), + (Int64(), 42, np.int64(42)), + (UInt64(), 42, np.uint64(42)), + (Float16(), 42.0, np.float16(42.0)), + (Float32(), 42.0, np.float32(42.0)), + (Float64(), 42.0, np.float64(42.0)), + (Complex64(), (42.0, 1.0), np.complex64(42.0 + 1.0j)), + (Complex128(), (42.0, 1.0), np.complex128(42.0 + 1.0j)), + (StaticByteString(length=4), "dGVzdA==", np.bytes_(b"test")), + (StaticRawBytes(length=4), "dGVzdA==", np.void(b"test")), + (StaticUnicodeString(length=4), "test", np.str_("test")), + (VariableLengthString(), "test", "test"), + (DateTime64(unit="s"), 1609459200, np.datetime64("2021-01-01T00:00:00", "s")), + ], +) +def test_from_json_value( + wrapper: type[DTypeWrapper[Any, Any]], json_value: Any, expected_value: Any +) -> None: + """ + Test the from_json_value method for each dtype wrapper. 
+ """ + assert wrapper.from_json_value(json_value, zarr_format=2) == expected_value From 2868994b07a610121d707742ee025e4ba43f78e0 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 5 Mar 2025 16:57:23 +0100 Subject: [PATCH 017/130] more tests, fix void type default value logic --- src/zarr/core/array.py | 11 +-- src/zarr/core/buffer/core.py | 4 +- src/zarr/core/codec_pipeline.py | 2 +- src/zarr/core/metadata/dtype.py | 87 +++++++++++++++------- src/zarr/core/metadata/v2.py | 64 ++++------------ tests/test_array.py | 27 +------ tests/test_metadata/test_dtype.py | 120 +++++++++++++++++++++++------- tests/test_metadata/test_v3.py | 18 ++--- tests/test_v2.py | 41 +--------- 9 files changed, 185 insertions(+), 189 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index abd862f023..7718aa505f 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -103,7 +103,8 @@ ) from zarr.core.metadata.dtype import ( DTypeWrapper, - StaticByteString, + FixedLengthAsciiString, + FixedLengthUnicodeString, VariableLengthString, get_data_type_from_numpy, ) @@ -710,7 +711,7 @@ def _create_metadata_v3( if fill_value is None: # v3 spec will not allow a null fill value - fill_value_parsed = dtype.default_value + fill_value_parsed = dtype.default_value() else: fill_value_parsed = fill_value @@ -4237,7 +4238,7 @@ def _get_default_chunk_encoding_v3( if isinstance(dtype, VariableLengthString): serializer = VLenUTF8Codec() - elif isinstance(dtype, StaticByteString): + elif isinstance(dtype, FixedLengthAsciiString): serializer = VLenBytesCodec() else: if dtype.unwrap().itemsize == 1: @@ -4257,9 +4258,9 @@ def _get_default_chunk_encoding_v2( from numcodecs import VLenUTF8 as numcodecs_VLenUTF8 from numcodecs import Zstd as numcodecs_zstd - if isinstance(dtype, VariableLengthString): + if isinstance(dtype, VariableLengthString | FixedLengthUnicodeString): filters = (numcodecs_VLenUTF8(),) - elif isinstance(dtype, StaticByteString): + elif 
isinstance(dtype, FixedLengthAsciiString): filters = (numcodecs_VLenBytes(),) else: filters = None diff --git a/src/zarr/core/buffer/core.py b/src/zarr/core/buffer/core.py index ccab103e0f..23ac5d3a69 100644 --- a/src/zarr/core/buffer/core.py +++ b/src/zarr/core/buffer/core.py @@ -472,7 +472,9 @@ def all_equal(self, other: Any, equal_nan: bool = True) -> bool: return np.array_equal( self._data, other, - equal_nan=equal_nan if self._data.dtype.kind not in "USTOV" else False, + equal_nan=equal_nan + if self._data.dtype.kind not in ("U", "S", "T", "O", "V") + else False, ) def fill(self, value: Any) -> None: diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index 315dbb77a9..5ee4f03799 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -63,7 +63,7 @@ def fill_value_or_default(chunk_spec: ArraySpec) -> Any: # validated when decoding the metadata, but we support reading # Zarr V2 data and need to support the case where fill_value # is None. - return chunk_spec.dtype.default_value + return chunk_spec.dtype.default_value() else: return fill_value diff --git a/src/zarr/core/metadata/dtype.py b/src/zarr/core/metadata/dtype.py index 590ab7df67..17e67fbb05 100644 --- a/src/zarr/core/metadata/dtype.py +++ b/src/zarr/core/metadata/dtype.py @@ -16,6 +16,7 @@ TypeVar, cast, get_args, + get_origin, ) import numpy as np @@ -133,7 +134,7 @@ def float_to_json_v2(data: float | np.floating[Any]) -> JSONFloat: def float_to_json_v3(data: float | np.floating[Any]) -> JSONFloat: # v3 can in principle handle distinct NaN values, but numpy does not represent these explicitly - # so we just re-use the v2 routine here + # so we just reuse the v2 routine here return float_to_json_v2(data) @@ -148,11 +149,11 @@ def float_to_json(data: float | np.floating[Any], zarr_format: ZarrFormat) -> JS raise ValueError(f"Invalid zarr format: {zarr_format}. 
Expected 2 or 3.") -def complex_to_json_v2(data: complex | np.complexfloating[Any]) -> tuple[JSONFloat, JSONFloat]: +def complex_to_json_v2(data: complex | np.complexfloating[Any, Any]) -> tuple[JSONFloat, JSONFloat]: return float_to_json_v2(data.real), float_to_json_v2(data.imag) -def complex_to_json_v3(data: complex | np.complexfloating[Any]) -> tuple[JSONFloat, JSONFloat]: +def complex_to_json_v3(data: complex | np.complexfloating[Any, Any]) -> tuple[JSONFloat, JSONFloat]: return float_to_json_v3(data.real), float_to_json_v3(data.imag) @@ -226,15 +227,16 @@ def complex_from_json( def datetime_to_json(data: np.datetime64[Any]) -> int: - return data.view("int").item() + return data.view(np.int64).item() def datetime_from_json(data: int, unit: DateUnit | TimeUnit) -> np.datetime64[Any]: return np.int64(data).view(f"datetime64[{unit}]") +TScalar = TypeVar("TScalar", bound=np.generic | str, covariant=True) +# TODO: figure out an interface or protocol that non-numpy dtypes can TDType = TypeVar("TDType", bound=np.dtype[Any]) -TScalar = TypeVar("TScalar", bound=np.generic | str) @dataclass(frozen=True, kw_only=True) @@ -244,17 +246,27 @@ class DTypeWrapper(Generic[TDType, TScalar], ABC, Metadata): endianness: Endianness | None = "native" def __init_subclass__(cls) -> None: - # Subclasses will bind the first generic type parameter to an attribute of the class # TODO: wrap this in some *very informative* error handling generic_args = get_args(get_original_bases(cls)[0]) - cls.dtype_cls = generic_args[0] + # the logic here is that if a subclass was created with generic type parameters + # specified explicitly, then we bind that type parameter to the dtype_cls attribute + if len(generic_args) > 0: + cls.dtype_cls = generic_args[0] + else: + # but if the subclass was created without generic type parameters specified explicitly, + # then we check the parent DTypeWrapper classes and retrieve their generic type parameters + for base in cls.__orig_bases__: + if get_origin(base) 
is DTypeWrapper: + generic_args = get_args(base) + cls.dtype_cls = generic_args[0] + break return super().__init_subclass__() def to_dict(self) -> dict[str, JSON]: return {"name": self.name} def cast_value(self: Self, value: object) -> TScalar: - return cast(np.generic, self.unwrap().type(value)) + return cast(TScalar, self.unwrap().type(value)) @abstractmethod def default_value(self) -> TScalar: ... @@ -455,7 +467,7 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.complex1 @dataclass(frozen=True, kw_only=True) class FlexibleWrapperBase(DTypeWrapper[TDType, TScalar]): item_size_bits: ClassVar[int] - length: int + length: int = 0 @classmethod def _wrap_unsafe(cls, dtype: TDType) -> Self: @@ -467,7 +479,7 @@ def unwrap(self) -> TDType: @dataclass(frozen=True, kw_only=True) -class StaticByteString(FlexibleWrapperBase[np.dtypes.BytesDType, np.bytes_]): +class FixedLengthAsciiString(FlexibleWrapperBase[np.dtypes.BytesDType, np.bytes_]): name = "numpy/static_byte_string" item_size_bits = 8 @@ -492,11 +504,18 @@ class StaticRawBytes(FlexibleWrapperBase[np.dtypes.VoidDType, np.void]): item_size_bits = 8 def default_value(self) -> np.void: - return np.void(b"") + return self.cast_value(("\x00" * self.length).encode("ascii")) def to_dict(self) -> dict[str, JSON]: return {"name": f"r{self.length * self.item_size_bits}"} + @classmethod + def check_dtype(cls: type[Self], dtype: TDType) -> TypeGuard[TDType]: + """ + Reject structured dtypes by ensuring that dtype.fields is None + """ + return type(dtype) is cls.dtype_cls and dtype.fields is None + def unwrap(self) -> np.dtypes.VoidDType: # this needs to be overridden because numpy does not allow creating a void type # by invoking np.dtypes.VoidDType directly @@ -512,7 +531,7 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: @dataclass(frozen=True, kw_only=True) -class StaticUnicodeString(FlexibleWrapperBase[np.dtypes.StrDType, np.str_]): +class 
FixedLengthUnicodeString(FlexibleWrapperBase[np.dtypes.StrDType, np.str_]): name = "numpy/static_unicode_string" item_size_bits = 32 # UCS4 is 32 bits per code point @@ -599,7 +618,7 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: @dataclass(frozen=True, kw_only=True) class DateTime64(DTypeWrapper[np.dtypes.DateTime64DType, np.datetime64]): name = "numpy/datetime64" - unit: DateUnit | TimeUnit + unit: DateUnit | TimeUnit = "s" def default_value(self) -> np.datetime64: return np.datetime64("NaT") @@ -609,6 +628,9 @@ def _wrap_unsafe(cls, dtype: np.dtypes.DateTime64DType) -> Self: unit = dtype.name[dtype.name.rfind("[") + 1 : dtype.name.rfind("]")] return cls(unit=unit) + def cast_value(self, value: object) -> np.datetime64: + return self.unwrap().type(value, self.unit) + def unwrap(self) -> np.dtypes.DateTime64DType: return np.dtype(f"datetime64[{self.unit}]").newbyteorder( endianness_to_numpy_str(self.endianness) @@ -651,6 +673,26 @@ def _wrap_unsafe(cls, dtype: np.dtypes.VoidDType) -> Self: return cls(fields=tuple(fields)) + def to_dict(self) -> dict[str, JSON]: + base_dict = super().to_dict() + if base_dict.get("configuration", {}) != {}: + raise ValueError( + "This data type wrapper cannot inherit from a data type wrapper that defines a configuration for its dict serialization" + ) + field_configs = [ + (f_name, f_dtype.to_dict(), f_offset) for f_name, f_dtype, f_offset in self.fields + ] + base_dict["configuration"] = {"fields": field_configs} + return base_dict + + @classmethod + def from_dict(cls, data: dict[str, JSON]) -> Self: + fields = tuple( + (f_name, get_data_type_from_dict(f_dtype), f_offset) + for f_name, f_dtype, f_offset in data["fields"] + ) + return cls(fields=fields) + def unwrap(self) -> np.dtypes.VoidDType: return np.dtype([(key, dtype.unwrap()) for (key, dtype, _) in self.fields]) @@ -665,7 +707,7 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: return np.array([as_bytes], 
dtype=dtype.str).view(dtype)[0] -def get_data_type_from_numpy(dtype: npt.DTypeLike) -> DTypeWrapper: +def get_data_type_from_numpy(dtype: npt.DTypeLike) -> DTypeWrapper[Any, Any]: if dtype in (str, "str"): if _NUMPY_SUPPORTS_VLEN_STRING: np_dtype = np.dtype("T") @@ -674,17 +716,10 @@ def get_data_type_from_numpy(dtype: npt.DTypeLike) -> DTypeWrapper: else: np_dtype = np.dtype(dtype) data_type_registry.lazy_load() - for val in data_type_registry.contents.values(): - try: - return val.wrap(np_dtype) - except TypeError: - pass - raise ValueError( - f"numpy dtype '{dtype}' does not have a corresponding Zarr dtype in: {list(data_type_registry.contents)}." - ) + return data_type_registry.match_dtype(np_dtype) -def get_data_type_from_dict(dtype: dict[str, JSON]) -> DTypeWrapper: +def get_data_type_from_dict(dtype: dict[str, JSON]) -> DTypeWrapper[Any.Any]: data_type_registry.lazy_load() dtype_name = dtype["name"] dtype_cls = data_type_registry.get(dtype_name) @@ -737,14 +772,14 @@ def register(self: Self, cls: type[DTypeWrapper[Any, Any]]) -> None: def get(self, key: str) -> type[DTypeWrapper[Any, Any]]: return self.contents[key] - def match_dtype(self, dtype: npt.DTypeLike) -> DTypeWrapper[Any, Any]: + def match_dtype(self, dtype: TDType) -> DTypeWrapper[Any, Any]: self.lazy_load() for val in self.contents.values(): try: return val.wrap(dtype) except TypeError: pass - raise ValueError(f"No data type wrapper found that matches {dtype}") + raise ValueError(f"No data type wrapper found that matches dtype '{dtype}'") def register_data_type(cls: type[DTypeWrapper[Any, Any]]) -> None: @@ -756,7 +791,7 @@ def register_data_type(cls: type[DTypeWrapper[Any, Any]]) -> None: INTEGER_DTYPE = Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64 FLOAT_DTYPE = Float16 | Float32 | Float64 COMPLEX_DTYPE = Complex64 | Complex128 -STRING_DTYPE = StaticUnicodeString | VariableLengthString | StaticByteString +STRING_DTYPE = FixedLengthUnicodeString | VariableLengthString | 
FixedLengthAsciiString DTYPE = ( Bool | INTEGER_DTYPE diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index ebf174eff3..cb09a35bec 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -3,7 +3,6 @@ import base64 import warnings from collections.abc import Iterable -from enum import Enum from typing import TYPE_CHECKING, TypedDict, cast import numcodecs.abc @@ -11,8 +10,7 @@ from zarr.abc.metadata import Metadata from zarr.core.metadata.dtype import ( DTypeWrapper, - StaticByteString, - StaticRawBytes, + Structured, get_data_type_from_numpy, ) @@ -109,49 +107,12 @@ def shards(self) -> ChunkCoords | None: return None def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: - def _json_convert( - o: Any, - ) -> Any: - if isinstance(o, np.dtype): - if o.fields is None: - return o.str - else: - return o.descr - if isinstance(o, numcodecs.abc.Codec): - codec_config = o.get_config() - - # Hotfix for https://github.com/zarr-developers/zarr-python/issues/2647 - if codec_config["id"] == "zstd" and not codec_config.get("checksum", False): - codec_config.pop("checksum", None) - - return codec_config - if np.isscalar(o): - out: Any - if hasattr(o, "dtype") and o.dtype.kind == "M" and hasattr(o, "view"): - # https://github.com/zarr-developers/zarr-python/issues/2119 - # `.item()` on a datetime type might or might not return an - # integer, depending on the value. 
- # Explicitly cast to an int first, and then grab .item() - out = o.view("i8").item() - else: - # convert numpy scalar to python type, and pass - # python types through - out = getattr(o, "item", lambda: o)() - if isinstance(out, complex): - # python complex types are not JSON serializable, so we use the - # serialization defined in the zarr v3 spec - return [out.real, out.imag] - return out - if isinstance(o, Enum): - return o.name - raise TypeError - zarray_dict = self.to_dict() zattrs_dict = zarray_dict.pop("attributes", {}) json_indent = config.get("json_indent") return { ZARRAY_JSON: prototype.buffer.from_bytes( - json.dumps(zarray_dict, default=_json_convert, indent=json_indent).encode() + json.dumps(zarray_dict, indent=json_indent).encode() ), ZATTRS_JSON: prototype.buffer.from_bytes( json.dumps(zattrs_dict, indent=json_indent).encode() @@ -196,11 +157,19 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: def to_dict(self) -> dict[str, JSON]: zarray_dict = super().to_dict() + if isinstance(zarray_dict["compressor"], numcodecs.abc.Codec): + zarray_dict["compressor"] = zarray_dict["compressor"].get_config() + if zarray_dict["filters"] is not None: + raw_filters = zarray_dict["filters"] + new_filters = [] + for f in raw_filters: + if isinstance(f, numcodecs.abc.Codec): + new_filters.append(f.get_config()) + else: + new_filters.append(f) + zarray_dict["filters"] = new_filters - if ( - isinstance(self.dtype, StaticByteString | StaticRawBytes) - and self.fill_value is not None - ): + if self.fill_value is not None: # There's a relationship between self.dtype and self.fill_value # that mypy isn't aware of. The fact that we have S or V dtype here # means we should have a bytes-type fill_value. 
@@ -209,10 +178,7 @@ def to_dict(self) -> dict[str, JSON]: _ = zarray_dict.pop("dtype") dtype_json: JSON - # TODO: Replace this with per-dtype method - # In the case of zarr v2, the simplest i.e., '|VXX' dtype is represented as a string - dtype_descr = self.dtype.unwrap().descr - if self.dtype.unwrap().kind == "V" and dtype_descr[0][0] != "" and len(dtype_descr) != 0: + if isinstance(self.dtype, Structured): dtype_json = tuple(self.dtype.unwrap().descr) else: dtype_json = self.dtype.unwrap().str diff --git a/tests/test_array.py b/tests/test_array.py index d54001b54e..5c58b3d3be 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -428,31 +428,6 @@ async def test_nbytes_stored_async() -> None: assert result == 902 # the size with all chunks filled. -def test_default_fill_values() -> None: - a = zarr.Array.create(MemoryStore(), shape=5, chunk_shape=5, dtype=" None: - with pytest.raises(ValueError, match="At least one ArrayBytesCodec is required."): - Array.create(MemoryStore(), shape=5, chunks=5, dtype=" None: # regression test for https://github.com/zarr-developers/zarr-python/issues/2328 @@ -1061,7 +1036,7 @@ async def test_v2_chunk_encoding( filters=filters, ) filters_expected, compressor_expected = _parse_chunk_encoding_v2( - filters=filters, compressor=compressors, dtype=np.dtype(dtype) + filters=filters, compressor=compressors, dtype=get_data_type_from_numpy(dtype) ) assert arr.metadata.zarr_format == 2 # guard for mypy assert arr.metadata.compressor == compressor_expected diff --git a/tests/test_metadata/test_dtype.py b/tests/test_metadata/test_dtype.py index a3f29a34f5..d0a0243a9f 100644 --- a/tests/test_metadata/test_dtype.py +++ b/tests/test_metadata/test_dtype.py @@ -1,17 +1,20 @@ from __future__ import annotations -from typing import Any +from typing import Any, get_args import numpy as np import pytest from zarr.core.metadata.dtype import ( + DTYPE, Bool, Complex64, Complex128, DataTypeRegistry, DateTime64, DTypeWrapper, + 
FixedLengthAsciiString, + FixedLengthUnicodeString, Float16, Float32, Float64, @@ -19,17 +22,22 @@ Int16, Int32, Int64, - StaticByteString, StaticRawBytes, - StaticUnicodeString, Structured, UInt8, UInt16, UInt32, UInt64, VariableLengthString, + data_type_registry, ) + +@pytest.fixture +def dtype_registry() -> DataTypeRegistry: + return DataTypeRegistry() + + _NUMPY_SUPPORTS_VLEN_STRING = hasattr(np.dtypes, "StringDType") if _NUMPY_SUPPORTS_VLEN_STRING: VLEN_STRING_DTYPE = np.dtypes.StringDType() @@ -55,8 +63,8 @@ (Float64, "float64"), (Complex64, "complex64"), (Complex128, "complex128"), - (StaticUnicodeString, "U"), - (StaticByteString, "S"), + (FixedLengthUnicodeString, "U"), + (FixedLengthAsciiString, "S"), (StaticRawBytes, "V"), (VariableLengthString, VLEN_STRING_CODE), (Structured, np.dtype([("a", np.float64), ("b", np.int8)])), @@ -80,23 +88,14 @@ def test_wrap(wrapper_cls: type[DTypeWrapper[Any, Any]], np_dtype: np.dtype | st assert wrapped.unwrap() == dt -def test_registry_match() -> None: - """ - Test that registering a dtype in a data type registry works - Test that match_dtype resolves a numpy dtype into the stored dtype - Test that match_dtype raises an error if the dtype is not registered - """ - local_registry = DataTypeRegistry() - local_registry.register(Bool) - assert isinstance(local_registry.match_dtype(np.dtype("bool")), Bool) - outside_dtype = "int8" - with pytest.raises( - ValueError, match=f"No data type wrapper found that matches {outside_dtype}" - ): - local_registry.match_dtype(np.dtype(outside_dtype)) - - -# start writing new tests here +@pytest.mark.parametrize("wrapper_cls", get_args(DTYPE)) +def test_dict_serialization(wrapper_cls: DTYPE) -> None: + if issubclass(wrapper_cls, Structured): + instance = wrapper_cls(fields=((("a", Bool(), 0),))) + else: + instance = wrapper_cls() + as_dict = instance.to_dict() + assert wrapper_cls.from_dict(data=as_dict.get("configuration", {})) == instance @pytest.mark.parametrize( @@ -116,9 +115,9 @@ 
def test_registry_match() -> None: (Float64(), np.float64(0)), (Complex64(), np.complex64(0)), (Complex128(), np.complex128(0)), - (StaticByteString(length=3), np.bytes_(b"")), + (FixedLengthAsciiString(length=3), np.bytes_(b"")), (StaticRawBytes(length=3), np.void(b"")), - (StaticUnicodeString(length=3), np.str_("")), + (FixedLengthUnicodeString(length=3), np.str_("")), ( Structured(fields=(("a", Float64(), 0), ("b", Int8(), 8))), np.array([0], dtype=[("a", np.float64), ("b", np.int8)])[0], @@ -154,9 +153,9 @@ def test_default_value(wrapper: type[DTypeWrapper[Any, Any]], expected_default: (Float64(), np.float64(42.0), 42.0), (Complex64(), np.complex64(42.0 + 1.0j), (42.0, 1.0)), (Complex128(), np.complex128(42.0 + 1.0j), (42.0, 1.0)), - (StaticByteString(length=4), np.bytes_(b"test"), "dGVzdA=="), + (FixedLengthAsciiString(length=4), np.bytes_(b"test"), "dGVzdA=="), (StaticRawBytes(length=4), np.void(b"test"), "dGVzdA=="), - (StaticUnicodeString(length=4), np.str_("test"), "test"), + (FixedLengthUnicodeString(length=4), np.str_("test"), "test"), (VariableLengthString(), "test", "test"), (DateTime64(unit="s"), np.datetime64("2021-01-01T00:00:00", "s"), 1609459200), ], @@ -187,9 +186,9 @@ def test_to_json_value_v2( (Float64(), 42.0, np.float64(42.0)), (Complex64(), (42.0, 1.0), np.complex64(42.0 + 1.0j)), (Complex128(), (42.0, 1.0), np.complex128(42.0 + 1.0j)), - (StaticByteString(length=4), "dGVzdA==", np.bytes_(b"test")), + (FixedLengthAsciiString(length=4), "dGVzdA==", np.bytes_(b"test")), (StaticRawBytes(length=4), "dGVzdA==", np.void(b"test")), - (StaticUnicodeString(length=4), "test", np.str_("test")), + (FixedLengthUnicodeString(length=4), "test", np.str_("test")), (VariableLengthString(), "test", "test"), (DateTime64(unit="s"), 1609459200, np.datetime64("2021-01-01T00:00:00", "s")), ], @@ -201,3 +200,68 @@ def test_from_json_value( Test the from_json_value method for each dtype wrapper. 
""" assert wrapper.from_json_value(json_value, zarr_format=2) == expected_value + + +class TestRegistry: + @staticmethod + def test_register(dtype_registry: DataTypeRegistry) -> None: + """ + Test that registering a dtype in a data type registry works. + """ + dtype_registry.register(Bool) + assert dtype_registry.get("bool") == Bool + assert isinstance(dtype_registry.match_dtype(np.dtype("bool")), Bool) + + @staticmethod + def test_override(dtype_registry: DataTypeRegistry) -> None: + """ + Test that registering a new dtype with the same name works (overriding the previous one). + """ + dtype_registry.register(Bool) + + class NewBool(Bool): + def default_value(self) -> np.bool_: + return np.True_ + + dtype_registry.register(NewBool) + assert isinstance(dtype_registry.match_dtype(np.dtype("bool")), NewBool) + + @staticmethod + @pytest.mark.parametrize( + ("wrapper_cls", "dtype_str"), [(Bool, "bool"), (FixedLengthUnicodeString, "|U4")] + ) + def test_match_dtype( + dtype_registry: DataTypeRegistry, wrapper_cls: type[DTypeWrapper[Any, Any]], dtype_str: str + ) -> None: + """ + Test that match_dtype resolves a numpy dtype into an instance of the correspond wrapper for that dtype. + """ + dtype_registry.register(wrapper_cls) + assert isinstance(dtype_registry.match_dtype(np.dtype(dtype_str)), wrapper_cls) + + @staticmethod + def test_unregistered_dtype(dtype_registry: DataTypeRegistry) -> None: + """ + Test that match_dtype raises an error if the dtype is not registered. + """ + outside_dtype = "int8" + with pytest.raises( + ValueError, match=f"No data type wrapper found that matches dtype '{outside_dtype}'" + ): + dtype_registry.match_dtype(np.dtype(outside_dtype)) + + with pytest.raises(KeyError): + dtype_registry.get(outside_dtype) + + @staticmethod + @pytest.mark.parametrize("wrapper_cls", get_args(DTYPE)) + def test_registered_dtypes(wrapper_cls: DTypeWrapper[Any, Any]) -> None: + """ + Test that the registered dtypes can be retrieved from the registry. 
+ """ + if issubclass(wrapper_cls, Structured): + instance = wrapper_cls(fields=((("a", Bool(), 0),))) + else: + instance = wrapper_cls() + + assert data_type_registry.match_dtype(instance.unwrap()) == instance diff --git a/tests/test_metadata/test_v3.py b/tests/test_metadata/test_v3.py index 37d8704b50..ea59496280 100644 --- a/tests/test_metadata/test_v3.py +++ b/tests/test_metadata/test_v3.py @@ -12,7 +12,7 @@ from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding, V2ChunkKeyEncoding from zarr.core.config import config from zarr.core.group import GroupMetadata, parse_node_type -from zarr.core.metadata.dtype import complex_from_json, get_data_type_from_numpy +from zarr.core.metadata.dtype import DateTime64, complex_from_json, get_data_type_from_numpy from zarr.core.metadata.v3 import ( ArrayV3Metadata, parse_dimension_names, @@ -266,19 +266,19 @@ def test_json_indent(indent: int): assert d == json.dumps(json.loads(d), indent=indent).encode() -@pytest.mark.xfail(reason="Data type not supported yet") @pytest.mark.parametrize("fill_value", [-1, 0, 1, 2932897]) @pytest.mark.parametrize("precision", ["ns", "D"]) async def test_datetime_metadata(fill_value: int, precision: str) -> None: + dtype = DateTime64(unit=precision) metadata_dict = { "zarr_format": 3, "node_type": "array", "shape": (1,), "chunk_grid": {"name": "regular", "configuration": {"chunk_shape": (1,)}}, - "data_type": f" None: elif fill_value == "-Infinity": assert np.isneginf(m.fill_value) assert d["fill_value"] == "-Infinity" - - -@pytest.mark.parametrize("dtype_str", dtypes) -def test_dtypes(dtype_str: str) -> None: - dt = get_data_type_from_numpy(dtype_str) - np_dtype = dt.unwrap() - assert isinstance(np_dtype, dt.dtype_cls) - assert np_dtype.type(0) == dt.cast_value(0) diff --git a/tests/test_v2.py b/tests/test_v2.py index c5ed39472f..f3dec247b7 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -40,33 +40,6 @@ def test_simple(store: StorePath) -> None: assert np.array_equal(data, 
a[:, :]) -@pytest.mark.parametrize("store", ["memory"], indirect=True) -@pytest.mark.parametrize( - ("dtype", "fill_value"), - [ - ("bool", False), - ("int64", 0), - ("float64", 0.0), - ("|S1", b""), - ("|U1", ""), - ("object", ""), - (str, ""), - ], -) -def test_implicit_fill_value(store: MemoryStore, dtype: str, fill_value: Any) -> None: - arr = zarr.create(store=store, shape=(4,), fill_value=None, zarr_format=2, dtype=dtype) - assert arr.metadata.fill_value is None - assert arr.metadata.to_dict()["fill_value"] is None - result = arr[:] - if dtype is str: - # special case - numpy_dtype = np.dtype(object) - else: - numpy_dtype = np.dtype(dtype) - expected = np.full(arr.shape, fill_value, dtype=numpy_dtype) - np.testing.assert_array_equal(result, expected) - - def test_codec_pipeline() -> None: # https://github.com/zarr-developers/zarr-python/issues/2243 store = MemoryStore() @@ -127,7 +100,7 @@ async def test_v2_encode_decode(dtype, expected_dtype, fill_value, fill_value_js np.testing.assert_equal(data, expected) -@pytest.mark.parametrize(("dtype", "value"), [("|S1", b"Y"), ("|U1", "Y"), ("O", "Y")]) +@pytest.mark.parametrize(("dtype", "value"), [("|S1", b"Y"), ("|U1", "Y"), (str, "Y")]) def test_v2_encode_decode_with_data(dtype, value): dtype, value = dtype, value expected = np.full((3,), value, dtype=dtype) @@ -141,18 +114,6 @@ def test_v2_encode_decode_with_data(dtype, value): np.testing.assert_equal(data, expected) -@pytest.mark.parametrize("dtype", [str, "str"]) -async def test_create_dtype_str(dtype: Any) -> None: - data = ["a", "bb", "ccc"] - arr = zarr.create(shape=3, dtype=dtype, zarr_format=2) - assert arr.dtype.kind == "O" - assert arr.metadata.to_dict()["dtype"] == "|O" - assert arr.metadata.filters == (numcodecs.vlen.VLenUTF8(),) - arr[:] = data - result = arr[:] - np.testing.assert_array_equal(result, np.array(data, dtype="object")) - - @pytest.mark.parametrize("filters", [[], [numcodecs.Delta(dtype=" None: From 
9ab0b1ee8c43e67436fa053444b1fa59d1052ed8 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 5 Mar 2025 19:50:54 +0100 Subject: [PATCH 018/130] fix dtype mechanics in bytescodec --- src/zarr/codecs/bytes.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py index 1da497ea72..cd9e6d89e9 100644 --- a/src/zarr/codecs/bytes.py +++ b/src/zarr/codecs/bytes.py @@ -71,14 +71,8 @@ async def _decode_single( chunk_spec: ArraySpec, ) -> NDBuffer: assert isinstance(chunk_bytes, Buffer) - if chunk_spec.dtype.unwrap().itemsize > 0: - if self.endian == Endian.little: - prefix = "<" - else: - prefix = ">" - dtype = np.dtype(f"{prefix}{chunk_spec.dtype.unwrap().str[1:]}") - else: - dtype = np.dtype(f"|{chunk_spec.dtype.unwrap().str[1:]}") + + dtype = chunk_spec.dtype.with_endianness(self.endian).unwrap() as_array_like = chunk_bytes.as_array_like() if isinstance(as_array_like, NDArrayLike): From e14279d66868a4d2f78c162612ee1a34db553358 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 7 Mar 2025 23:14:22 +0100 Subject: [PATCH 019/130] remove __post_init__ magic in favor of more explicit declaration --- src/zarr/codecs/bytes.py | 5 ++-- src/zarr/core/metadata/dtype.py | 48 +++++++++++++++++---------------- 2 files changed, 28 insertions(+), 25 deletions(-) diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py index cd9e6d89e9..9a5a217abf 100644 --- a/src/zarr/codecs/bytes.py +++ b/src/zarr/codecs/bytes.py @@ -71,8 +71,9 @@ async def _decode_single( chunk_spec: ArraySpec, ) -> NDBuffer: assert isinstance(chunk_bytes, Buffer) - - dtype = chunk_spec.dtype.with_endianness(self.endian).unwrap() + # TODO: remove endianness enum in favor of literal union + endian_str = self.endian.value if self.endian is not None else None + dtype = chunk_spec.dtype.with_endianness(endian_str).unwrap() as_array_like = chunk_bytes.as_array_like() if isinstance(as_array_like, NDArrayLike): diff --git 
a/src/zarr/core/metadata/dtype.py b/src/zarr/core/metadata/dtype.py index 17e67fbb05..33aa22b398 100644 --- a/src/zarr/core/metadata/dtype.py +++ b/src/zarr/core/metadata/dtype.py @@ -16,12 +16,10 @@ TypeVar, cast, get_args, - get_origin, ) import numpy as np import numpy.typing as npt -from typing_extensions import get_original_bases from zarr.abc.metadata import Metadata from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING @@ -245,23 +243,6 @@ class DTypeWrapper(Generic[TDType, TScalar], ABC, Metadata): dtype_cls: ClassVar[type[TDType]] # this class will create a numpy dtype endianness: Endianness | None = "native" - def __init_subclass__(cls) -> None: - # TODO: wrap this in some *very informative* error handling - generic_args = get_args(get_original_bases(cls)[0]) - # the logic here is that if a subclass was created with generic type parameters - # specified explicitly, then we bind that type parameter to the dtype_cls attribute - if len(generic_args) > 0: - cls.dtype_cls = generic_args[0] - else: - # but if the subclass was created without generic type parameters specified explicitly, - # then we check the parent DTypeWrapper classes and retrieve their generic type parameters - for base in cls.__orig_bases__: - if get_origin(base) is DTypeWrapper: - generic_args = get_args(base) - cls.dtype_cls = generic_args[0] - break - return super().__init_subclass__() - def to_dict(self) -> dict[str, JSON]: return {"name": self.name} @@ -314,6 +295,7 @@ def from_json_value(self: Self, data: JSON, *, zarr_format: ZarrFormat) -> TScal @dataclass(frozen=True, kw_only=True) class Bool(DTypeWrapper[np.dtypes.BoolDType, np.bool_]): name = "bool" + dtype_cls: ClassVar[type[np.dtypes.BoolDType]] = np.dtypes.BoolDType def default_value(self) -> np.bool_: return np.False_ @@ -350,41 +332,49 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TScalar: @dataclass(frozen=True, kw_only=True) class Int8(IntWrapperBase[np.dtypes.Int8DType, np.int8]): + dtype_cls 
= np.dtypes.Int8DType name = "int8" @dataclass(frozen=True, kw_only=True) class UInt8(IntWrapperBase[np.dtypes.UInt8DType, np.uint8]): + dtype_cls = np.dtypes.UInt8DType name = "uint8" @dataclass(frozen=True, kw_only=True) class Int16(IntWrapperBase[np.dtypes.Int16DType, np.int16]): + dtype_cls = np.dtypes.Int16DType name = "int16" @dataclass(frozen=True, kw_only=True) class UInt16(IntWrapperBase[np.dtypes.UInt16DType, np.uint16]): + dtype_cls = np.dtypes.UInt16DType name = "uint16" @dataclass(frozen=True, kw_only=True) class Int32(IntWrapperBase[np.dtypes.Int32DType, np.int32]): + dtype_cls = np.dtypes.Int32DType name = "int32" @dataclass(frozen=True, kw_only=True) class UInt32(IntWrapperBase[np.dtypes.UInt32DType, np.uint32]): + dtype_cls = np.dtypes.UInt32DType name = "uint32" @dataclass(frozen=True, kw_only=True) class Int64(IntWrapperBase[np.dtypes.Int64DType, np.int64]): + dtype_cls = np.dtypes.Int64DType name = "int64" @dataclass(frozen=True, kw_only=True) class UInt64(IntWrapperBase[np.dtypes.UInt64DType, np.uint64]): + dtype_cls = np.dtypes.UInt64DType name = "uint64" @@ -407,21 +397,25 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TScalar: @dataclass(frozen=True, kw_only=True) class Float16(FloatWrapperBase[np.dtypes.Float16DType, np.float16]): + dtype_cls = np.dtypes.Float16DType name = "float16" @dataclass(frozen=True, kw_only=True) class Float32(FloatWrapperBase[np.dtypes.Float32DType, np.float32]): + dtype_cls = np.dtypes.Float32DType name = "float32" @dataclass(frozen=True, kw_only=True) class Float64(FloatWrapperBase[np.dtypes.Float64DType, np.float64]): + dtype_cls = np.dtypes.Float64DType name = "float64" @dataclass(frozen=True, kw_only=True) class Complex64(DTypeWrapper[np.dtypes.Complex64DType, np.complex64]): + dtype_cls = np.dtypes.Complex64DType name = "complex64" def default_value(self) -> np.complex64: @@ -444,6 +438,7 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.complex6 
@dataclass(frozen=True, kw_only=True) class Complex128(DTypeWrapper[np.dtypes.Complex128DType, np.complex128]): + dtype_cls = np.dtypes.Complex128DType name = "complex128" def default_value(self) -> np.complex128: @@ -480,7 +475,8 @@ def unwrap(self) -> TDType: @dataclass(frozen=True, kw_only=True) class FixedLengthAsciiString(FlexibleWrapperBase[np.dtypes.BytesDType, np.bytes_]): - name = "numpy/static_byte_string" + dtype_cls = np.dtypes.BytesDType + name = "numpy.static_byte_string" item_size_bits = 8 def default_value(self) -> np.bytes_: @@ -500,6 +496,7 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: @dataclass(frozen=True, kw_only=True) class StaticRawBytes(FlexibleWrapperBase[np.dtypes.VoidDType, np.void]): + dtype_cls = np.dtypes.VoidDType name = "r*" item_size_bits = 8 @@ -532,7 +529,8 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: @dataclass(frozen=True, kw_only=True) class FixedLengthUnicodeString(FlexibleWrapperBase[np.dtypes.StrDType, np.str_]): - name = "numpy/static_unicode_string" + dtype_cls = np.dtypes.StrDType + name = "numpy.static_unicode_string" item_size_bits = 32 # UCS4 is 32 bits per code point def default_value(self) -> np.str_: @@ -554,7 +552,8 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_: @dataclass(frozen=True, kw_only=True) class VariableLengthString(DTypeWrapper[np.dtypes.StringDType, str]): - name = "numpy/vlen_string" + dtype_cls = np.dtypes.StringDType + name = "numpy.vlen_string" def default_value(self) -> str: return "" @@ -582,7 +581,8 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: @dataclass(frozen=True, kw_only=True) class VariableLengthString(DTypeWrapper[np.dtypes.ObjectDType, str]): - name = "numpy/vlen_string" + dtype_cls = np.dtypes.ObjectDType + name = "numpy.vlen_string" endianness: Endianness = field(default=None) def default_value(self) -> str: @@ -617,6 +617,7 @@ def 
from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: @dataclass(frozen=True, kw_only=True) class DateTime64(DTypeWrapper[np.dtypes.DateTime64DType, np.datetime64]): + dtype_cls = np.dtypes.DateTime64DType name = "numpy/datetime64" unit: DateUnit | TimeUnit = "s" @@ -647,6 +648,7 @@ def to_json_value(self, data: np.datetime64, *, zarr_format: ZarrFormat) -> int: @dataclass(frozen=True, kw_only=True) class Structured(DTypeWrapper[np.dtypes.VoidDType, np.void]): + dtype_cls = np.dtypes.VoidDType name = "numpy/struct" fields: tuple[tuple[str, DTypeWrapper[Any, Any], int], ...] From 381a26436872be2db0217d9bd5046c5c9d8ae082 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sun, 9 Mar 2025 12:53:32 +0100 Subject: [PATCH 020/130] fix tests --- src/zarr/core/metadata/v2.py | 7 ++++++- tests/test_metadata/test_dtype.py | 2 +- tests/test_metadata/test_v2.py | 6 ++---- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index cb09a35bec..3883a998c1 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -158,7 +158,12 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: def to_dict(self) -> dict[str, JSON]: zarray_dict = super().to_dict() if isinstance(zarray_dict["compressor"], numcodecs.abc.Codec): - zarray_dict["compressor"] = zarray_dict["compressor"].get_config() + codec_config = zarray_dict["compressor"].get_config() + # Hotfix for https://github.com/zarr-developers/zarr-python/issues/2647 + if codec_config["id"] == "zstd" and not codec_config.get("checksum", False): + codec_config.pop("checksum") + zarray_dict["compressor"] = codec_config + if zarray_dict["filters"] is not None: raw_filters = zarray_dict["filters"] new_filters = [] diff --git a/tests/test_metadata/test_dtype.py b/tests/test_metadata/test_dtype.py index d0a0243a9f..8a1bcdedd1 100644 --- a/tests/test_metadata/test_dtype.py +++ b/tests/test_metadata/test_dtype.py @@ -116,7 +116,7 
@@ def test_dict_serialization(wrapper_cls: DTYPE) -> None: (Complex64(), np.complex64(0)), (Complex128(), np.complex128(0)), (FixedLengthAsciiString(length=3), np.bytes_(b"")), - (StaticRawBytes(length=3), np.void(b"")), + (StaticRawBytes(length=3), np.void(b"\x00\x00\x00")), (FixedLengthUnicodeString(length=3), np.str_("")), ( Structured(fields=(("a", Float64(), 0), ("b", Int8(), 8))), diff --git a/tests/test_metadata/test_v2.py b/tests/test_metadata/test_v2.py index 2637224f93..1c5ddd6f9a 100644 --- a/tests/test_metadata/test_v2.py +++ b/tests/test_metadata/test_v2.py @@ -19,8 +19,6 @@ from zarr.abc.codec import Codec -import numcodecs - def test_parse_zarr_format_valid() -> None: assert parse_zarr_format(2) == 2 @@ -33,8 +31,8 @@ def test_parse_zarr_format_invalid(data: Any) -> None: @pytest.mark.parametrize("attributes", [None, {"foo": "bar"}]) -@pytest.mark.parametrize("filters", [None, (numcodecs.GZip(),)]) -@pytest.mark.parametrize("compressor", [None, numcodecs.GZip()]) +@pytest.mark.parametrize("filters", [None, [{"id": "gzip", "level": 1}]]) +@pytest.mark.parametrize("compressor", [None, {"id": "gzip", "level": 1}]) @pytest.mark.parametrize("fill_value", [None, 0, 1]) @pytest.mark.parametrize("order", ["C", "F"]) @pytest.mark.parametrize("dimension_separator", [".", "/", None]) From 6a7857b15ae360825f92c0c47d2aa5863e481531 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 12 Mar 2025 10:46:28 +0100 Subject: [PATCH 021/130] refactor data types --- src/zarr/api/asynchronous.py | 2 +- src/zarr/codecs/_v2.py | 6 +- src/zarr/codecs/blosc.py | 4 +- src/zarr/codecs/bytes.py | 5 +- src/zarr/codecs/sharding.py | 4 +- src/zarr/core/_info.py | 2 +- src/zarr/core/array.py | 42 +- src/zarr/core/array_spec.py | 8 +- src/zarr/core/buffer/cpu.py | 7 +- src/zarr/core/codec_pipeline.py | 2 +- src/zarr/core/dtype/__init__.py | 115 ++++ src/zarr/core/dtype/_numpy.py | 821 +++++++++++++++++++++++ src/zarr/core/dtype/common.py | 602 +++++++++++++++++ 
src/zarr/core/dtype/registry.py | 50 ++ src/zarr/core/dtype/wrapper.py | 279 ++++++++ src/zarr/core/metadata/dtype.py | 808 ---------------------- src/zarr/core/metadata/v2.py | 21 +- src/zarr/core/metadata/v3.py | 19 +- src/zarr/registry.py | 2 +- src/zarr/testing/strategies.py | 6 +- tests/conftest.py | 4 +- tests/test_array.py | 8 +- tests/test_codecs/test_vlen.py | 2 +- tests/test_metadata/test_consolidated.py | 4 +- tests/test_metadata/test_dtype.py | 51 +- tests/test_metadata/test_v2.py | 2 +- tests/test_metadata/test_v3.py | 6 +- 27 files changed, 1962 insertions(+), 920 deletions(-) create mode 100644 src/zarr/core/dtype/__init__.py create mode 100644 src/zarr/core/dtype/_numpy.py create mode 100644 src/zarr/core/dtype/common.py create mode 100644 src/zarr/core/dtype/registry.py create mode 100644 src/zarr/core/dtype/wrapper.py diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index d882b1d7cc..d3e88ae7d3 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -28,6 +28,7 @@ _warn_order_kwarg, _warn_write_empty_chunks_kwarg, ) +from zarr.core.dtype import get_data_type_from_numpy from zarr.core.group import ( AsyncGroup, ConsolidatedMetadata, @@ -35,7 +36,6 @@ create_hierarchy, ) from zarr.core.metadata import ArrayMetadataDict, ArrayV2Metadata, ArrayV3Metadata -from zarr.core.metadata.dtype import get_data_type_from_numpy from zarr.errors import NodeTypeValidationError from zarr.storage._common import make_store_path diff --git a/src/zarr/codecs/_v2.py b/src/zarr/codecs/_v2.py index e2f228f509..a89d1f5fa4 100644 --- a/src/zarr/codecs/_v2.py +++ b/src/zarr/codecs/_v2.py @@ -48,7 +48,7 @@ async def _decode_single( # segfaults and other bad things happening if chunk_spec.dtype != object: try: - chunk = chunk.view(chunk_spec.dtype.unwrap()) + chunk = chunk.view(chunk_spec.dtype.to_dtype()) except TypeError: # this will happen if the dtype of the chunk # does not match the dtype of the array spec i.g. 
if @@ -56,7 +56,7 @@ async def _decode_single( # is an object array. In this case, we need to convert the object # array to the correct dtype. - chunk = np.array(chunk).astype(chunk_spec.dtype.unwrap()) + chunk = np.array(chunk).astype(chunk_spec.dtype.to_dtype()) elif chunk.dtype != object: # If we end up here, someone must have hacked around with the filters. @@ -80,7 +80,7 @@ async def _encode_single( chunk = chunk_array.as_ndarray_like() # ensure contiguous and correct order - chunk = chunk.astype(chunk_spec.dtype.unwrap(), order=chunk_spec.order, copy=False) + chunk = chunk.astype(chunk_spec.dtype.to_dtype(), order=chunk_spec.order, copy=False) # apply filters if self.filters: diff --git a/src/zarr/codecs/blosc.py b/src/zarr/codecs/blosc.py index d7cd1f0113..79be926ad8 100644 --- a/src/zarr/codecs/blosc.py +++ b/src/zarr/codecs/blosc.py @@ -139,13 +139,13 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: dtype = array_spec.dtype new_codec = self if new_codec.typesize is None: - new_codec = replace(new_codec, typesize=dtype.unwrap().itemsize) + new_codec = replace(new_codec, typesize=dtype.to_dtype().itemsize) if new_codec.shuffle is None: new_codec = replace( new_codec, shuffle=( BloscShuffle.bitshuffle - if dtype.unwrap().itemsize == 1 + if dtype.to_dtype().itemsize == 1 else BloscShuffle.shuffle ), ) diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py index 9a5a217abf..e7b57ab9b3 100644 --- a/src/zarr/codecs/bytes.py +++ b/src/zarr/codecs/bytes.py @@ -10,6 +10,7 @@ from zarr.abc.codec import ArrayBytesCodec from zarr.core.buffer import Buffer, NDArrayLike, NDBuffer from zarr.core.common import JSON, parse_enum, parse_named_configuration +from zarr.core.dtype.common import endianness_to_numpy_str from zarr.registry import register_codec if TYPE_CHECKING: @@ -56,7 +57,7 @@ def to_dict(self) -> dict[str, JSON]: return {"name": "bytes", "configuration": {"endian": self.endian.value}} def evolve_from_array_spec(self, array_spec: 
ArraySpec) -> Self: - if array_spec.dtype.unwrap().itemsize == 1: + if array_spec.dtype.to_dtype().itemsize == 1: if self.endian is not None: return replace(self, endian=None) elif self.endian is None: @@ -73,7 +74,7 @@ async def _decode_single( assert isinstance(chunk_bytes, Buffer) # TODO: remove endianness enum in favor of literal union endian_str = self.endian.value if self.endian is not None else None - dtype = chunk_spec.dtype.with_endianness(endian_str).unwrap() + dtype = chunk_spec.dtype.to_dtype().newbyteorder(endianness_to_numpy_str(endian_str)) as_array_like = chunk_bytes.as_array_like() if isinstance(as_array_like, NDArrayLike): diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 7163a5fd7f..c501346980 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -50,7 +50,6 @@ get_indexer, morton_order_iter, ) -from zarr.core.metadata.dtype import DTypeWrapper from zarr.core.metadata.v3 import parse_codecs from zarr.registry import get_ndbuffer_class, get_pipeline_class, register_codec @@ -59,6 +58,7 @@ from typing import Self from zarr.core.common import JSON + from zarr.core.dtype.wrapper import DTypeWrapper MAX_UINT_64 = 2**64 - 1 ShardMapping = Mapping[ChunkCoords, Buffer] @@ -488,7 +488,7 @@ async def _decode_partial_single( # setup output array out = shard_spec.prototype.nd_buffer.create( shape=indexer.shape, - dtype=shard_spec.dtype.unwrap(), + dtype=shard_spec.dtype.to_dtype(), order=shard_spec.order, fill_value=0, ) diff --git a/src/zarr/core/_info.py b/src/zarr/core/_info.py index 6b594583e2..a632b8c602 100644 --- a/src/zarr/core/_info.py +++ b/src/zarr/core/_info.py @@ -7,7 +7,7 @@ from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec from zarr.core.common import ZarrFormat -from zarr.core.metadata.dtype import DTypeWrapper +from zarr.core.dtype.wrapper import DTypeWrapper # from zarr.core.metadata.v3 import DataType diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py 
index 7718aa505f..a060bcbfae 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -67,6 +67,13 @@ product, ) from zarr.core.config import config as zarr_config +from zarr.core.dtype import ( + DTypeWrapper, + FixedLengthAsciiString, + FixedLengthUnicodeString, + VariableLengthString, + parse_data_type, +) from zarr.core.indexing import ( BasicIndexer, BasicSelection, @@ -101,13 +108,6 @@ ArrayV3MetadataDict, T_ArrayMetadata, ) -from zarr.core.metadata.dtype import ( - DTypeWrapper, - FixedLengthAsciiString, - FixedLengthUnicodeString, - VariableLengthString, - get_data_type_from_numpy, -) from zarr.core.metadata.v2 import ( parse_compressor, parse_filters, @@ -555,7 +555,7 @@ async def _create( *, # v2 and v3 shape: ShapeLike, - dtype: npt.DTypeLike[Any] | DTypeWrapper[Any, Any], + dtype: npt.DTypeLike | DTypeWrapper[Any, Any], zarr_format: ZarrFormat = 3, fill_value: Any | None = None, attributes: dict[str, JSON] | None = None, @@ -584,11 +584,8 @@ async def _create( See :func:`AsyncArray.create` for more details. Deprecated in favor of :func:`zarr.api.asynchronous.create_array`. 
""" - # TODO: delete this and be more strict about where parsing occurs - if not isinstance(dtype, DTypeWrapper): - dtype_parsed = get_data_type_from_numpy(np.dtype(dtype)) - else: - dtype_parsed = dtype + + dtype_parsed = parse_data_type(dtype) store_path = await make_store_path(store) shape = parse_shapelike(shape) @@ -597,9 +594,9 @@ async def _create( raise ValueError("Only one of chunk_shape or chunks can be provided.") if chunks: - _chunks = normalize_chunks(chunks, shape, dtype_parsed.unwrap().itemsize) + _chunks = normalize_chunks(chunks, shape, dtype_parsed.to_dtype().itemsize) else: - _chunks = normalize_chunks(chunk_shape, shape, dtype_parsed.unwrap().itemsize) + _chunks = normalize_chunks(chunk_shape, shape, dtype_parsed.to_dtype().itemsize) config_parsed = parse_array_config(config) result: AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata] @@ -701,7 +698,7 @@ def _create_metadata_v3( else: chunk_key_encoding_parsed = chunk_key_encoding - if dtype.unwrap().kind in ("U", "T", "S"): + if dtype.to_dtype().kind in ("U", "T", "S"): warn( f"The dtype `{dtype}` is currently not part in the Zarr format 3 specification. 
It " "may not be supported by other zarr implementations and may change in the future.", @@ -1053,9 +1050,9 @@ def dtype(self) -> np.dtype[Any]: Data type of the array """ if self.metadata.zarr_format == 2: - return self.metadata.dtype.unwrap() + return self.metadata.dtype.to_dtype() else: - return self.metadata.data_type.unwrap() + return self.metadata.data_type.to_dtype() @property def order(self) -> MemoryOrder: @@ -3930,10 +3927,7 @@ async def init_array( from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation - if not isinstance(dtype, DTypeWrapper): - dtype_wrapped = get_data_type_from_numpy(dtype) - else: - dtype_wrapped = dtype + dtype_wrapped = parse_data_type(dtype) shape_parsed = parse_shapelike(shape) chunk_key_encoding_parsed = _parse_chunk_key_encoding( chunk_key_encoding, zarr_format=zarr_format @@ -3951,7 +3945,7 @@ async def init_array( array_shape=shape_parsed, shard_shape=shards, chunk_shape=chunks, - item_size=dtype_wrapped.unwrap().itemsize, + item_size=dtype_wrapped.to_dtype().itemsize, ) chunks_out: tuple[int, ...] 
meta: ArrayV2Metadata | ArrayV3Metadata @@ -4241,7 +4235,7 @@ def _get_default_chunk_encoding_v3( elif isinstance(dtype, FixedLengthAsciiString): serializer = VLenBytesCodec() else: - if dtype.unwrap().itemsize == 1: + if dtype.to_dtype().itemsize == 1: serializer = BytesCodec(endian=None) else: serializer = BytesCodec() diff --git a/src/zarr/core/array_spec.py b/src/zarr/core/array_spec.py index f5a060cf95..f297fafa24 100644 --- a/src/zarr/core/array_spec.py +++ b/src/zarr/core/array_spec.py @@ -11,7 +11,7 @@ parse_shapelike, ) from zarr.core.config import config as zarr_config -from zarr.core.metadata.dtype import DTypeWrapper, get_data_type_from_numpy +from zarr.core.dtype import parse_data_type if TYPE_CHECKING: from typing import NotRequired @@ -20,6 +20,7 @@ from zarr.core.buffer import BufferPrototype from zarr.core.common import ChunkCoords + from zarr.core.dtype.wrapper import DTypeWrapper class ArrayConfigParams(TypedDict): @@ -105,10 +106,7 @@ def __init__( prototype: BufferPrototype, ) -> None: shape_parsed = parse_shapelike(shape) - if not isinstance(dtype, DTypeWrapper): - dtype_parsed = get_data_type_from_numpy(dtype) - else: - dtype_parsed = dtype + dtype_parsed = parse_data_type(dtype) fill_value_parsed = parse_fill_value(fill_value) diff --git a/src/zarr/core/buffer/cpu.py b/src/zarr/core/buffer/cpu.py index 00444a6f76..9894fced51 100644 --- a/src/zarr/core/buffer/cpu.py +++ b/src/zarr/core/buffer/cpu.py @@ -10,7 +10,6 @@ import numpy.typing as npt from zarr.core.buffer import core -from zarr.core.metadata.dtype import DTypeWrapper from zarr.registry import ( register_buffer, register_ndbuffer, @@ -158,11 +157,7 @@ def create( if fill_value is None: return cls(np.zeros(shape=tuple(shape), dtype=dtype, order=order)) else: - return cls( - np.full( - shape=tuple(shape), fill_value=fill_value, dtype=dtype, order=order - ) - ) + return cls(np.full(shape=tuple(shape), fill_value=fill_value, dtype=dtype, order=order)) @classmethod def 
from_numpy_array(cls, array_like: npt.ArrayLike) -> Self: diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index 5ee4f03799..222e97ce74 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -316,7 +316,7 @@ def _merge_chunk_array( if existing_chunk_array is None: chunk_array = chunk_spec.prototype.nd_buffer.create( shape=chunk_spec.shape, - dtype=chunk_spec.dtype.unwrap(), + dtype=chunk_spec.dtype.to_dtype(), order=chunk_spec.order, fill_value=fill_value_or_default(chunk_spec), ) diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py new file mode 100644 index 0000000000..432eabf2ce --- /dev/null +++ b/src/zarr/core/dtype/__init__.py @@ -0,0 +1,115 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, get_args + +import numpy as np + +from zarr.core.dtype.common import _NUMPY_SUPPORTS_VLEN_STRING + +if TYPE_CHECKING: + import numpy.typing as npt + + from zarr.core.common import JSON + +from zarr.core.dtype._numpy import ( + Bool, + Complex64, + Complex128, + DateTime64, + FixedLengthAsciiString, + FixedLengthBytes, + FixedLengthUnicodeString, + Float16, + Float32, + Float64, + Int8, + Int16, + Int32, + Int64, + Structured, + UInt8, + UInt16, + UInt32, + UInt64, + VariableLengthString, +) +from zarr.core.dtype.registry import DataTypeRegistry +from zarr.core.dtype.wrapper import DTypeWrapper + +__all__ = [ + "Complex64", + "Complex128", + "DTypeWrapper", + "DateTime64", + "FixedLengthAsciiString", + "FixedLengthBytes", + "FixedLengthUnicodeString", + "Float16", + "Float32", + "Float64", + "Int8", + "Int16", + "Int32", + "Int64", + "Structured", + "UInt8", + "UInt16", + "UInt32", + "UInt64", + "VariableLengthString", + "data_type_registry", + "parse_data_type", +] + +data_type_registry = DataTypeRegistry() + +INTEGER_DTYPE = Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64 +FLOAT_DTYPE = Float16 | Float32 | Float64 +COMPLEX_DTYPE = Complex64 
| Complex128 +STRING_DTYPE = FixedLengthUnicodeString | VariableLengthString | FixedLengthAsciiString +DTYPE = ( + Bool + | INTEGER_DTYPE + | FLOAT_DTYPE + | COMPLEX_DTYPE + | STRING_DTYPE + | FixedLengthBytes + | Structured + | DateTime64 +) + +for dtype in get_args(DTYPE): + data_type_registry.register(dtype._zarr_v3_name, dtype) + + +def get_data_type_from_numpy(dtype: npt.DTypeLike) -> DTypeWrapper[Any, Any]: + data_type_registry.lazy_load() + if not isinstance(dtype, np.dtype): + if dtype in (str, "str"): + if _NUMPY_SUPPORTS_VLEN_STRING: + np_dtype = np.dtype("T") + else: + np_dtype = np.dtype("O") + elif isinstance(dtype, list): + # this is a valid _VoidDTypeLike check + np_dtype = np.dtype([tuple(d) for d in dtype]) + else: + np_dtype = np.dtype(dtype) + else: + np_dtype = dtype + return data_type_registry.match_dtype(np_dtype) + + +def get_data_type_from_dict(dtype: dict[str, JSON]) -> DTypeWrapper[Any, Any]: + return data_type_registry.match_json(dtype) + + +def parse_data_type( + dtype: npt.DTypeLike | DTypeWrapper[Any, Any] | dict[str, JSON], +) -> DTypeWrapper[Any, Any]: + if isinstance(dtype, DTypeWrapper): + return dtype + elif isinstance(dtype, dict): + return get_data_type_from_dict(dtype) + else: + return get_data_type_from_numpy(dtype) diff --git a/src/zarr/core/dtype/_numpy.py b/src/zarr/core/dtype/_numpy.py new file mode 100644 index 0000000000..b98cc100e3 --- /dev/null +++ b/src/zarr/core/dtype/_numpy.py @@ -0,0 +1,821 @@ +from __future__ import annotations + +import base64 +import re +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, ClassVar, Literal, Self, TypeGuard, cast, get_args + +import numpy as np + +from zarr.core.dtype.common import ( + _NUMPY_SUPPORTS_VLEN_STRING, + DataTypeValidationError, + Endianness, + JSONFloat, + bytes_from_json, + bytes_to_json, + check_json_bool, + check_json_complex_float, + check_json_complex_float_v3, + check_json_float_v2, + check_json_int, + check_json_str, + complex_from_json, 
+ complex_to_json, + datetime_from_json, + datetime_to_json, + endianness_from_numpy_str, + endianness_to_numpy_str, + float_from_json, + float_to_json, +) +from zarr.core.dtype.wrapper import DTypeWrapper, TDType + +if TYPE_CHECKING: + from zarr.core.common import JSON, ZarrFormat + + +@dataclass(frozen=True, kw_only=True) +class Bool(DTypeWrapper[np.dtypes.BoolDType, np.bool_]): + """ + Wrapper for numpy boolean dtype. + + Attributes + ---------- + name : str + The name of the dtype. + dtype_cls : ClassVar[type[np.dtypes.BoolDType]] + The numpy dtype class. + """ + + _zarr_v3_name = "bool" + dtype_cls: ClassVar[type[np.dtypes.BoolDType]] = np.dtypes.BoolDType + + def to_dict(self) -> dict[str, JSON]: + return {"name": self.get_name(zarr_format=3)} + + def default_value(self) -> np.bool_: + """ + Get the default value for the boolean dtype. + + Returns + ------- + np.bool_ + The default value. + """ + return np.False_ + + @classmethod + def _from_dtype_unsafe(cls, dtype: np.dtypes.BoolDType) -> Self: + """ + Wrap a numpy boolean dtype without checking. + + Parameters + ---------- + dtype : np.dtypes.BoolDType + The numpy dtype to wrap. + + Returns + ------- + Self + The wrapped dtype. + """ + return cls() + + def to_dtype(self) -> np.dtypes.BoolDType: + return self.dtype_cls() + + def to_json_value(self, data: np.bool_, zarr_format: ZarrFormat) -> bool: + """ + Convert a boolean value to JSON-serializable format. + + Parameters + ---------- + data : object + The value to convert. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + bool + The JSON-serializable format. + """ + return bool(data) + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bool_: + """ + Read a JSON-serializable value as a numpy boolean scalar. + + Parameters + ---------- + data : JSON + The JSON-serializable value. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + np.bool_ + The numpy boolean scalar. 
+ """ + if check_json_bool(data): + return self.cast_value(data) + raise TypeError(f"Invalid type: {data}. Expected a boolean.") + + +@dataclass(frozen=True, kw_only=True) +class Int8(DTypeWrapper[np.dtypes.Int8DType, np.int8]): + dtype_cls = np.dtypes.Int8DType + _zarr_v3_name = "int8" + + @classmethod + def _from_dtype_unsafe(cls, dtype: np.dtypes.Int8DType) -> Self: + return cls() + + def to_dtype(self) -> np.dtypes.Int8DType: + return self.dtype_cls() + + def to_dict(self) -> dict[str, JSON]: + return {"name": self.get_name(zarr_format=3)} + + def default_value(self) -> np.int8: + return self.to_dtype().type(0) + + def to_json_value(self, data: np.int8, zarr_format: ZarrFormat) -> int: + return int(data) + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.int8: + if check_json_int(data): + return self.cast_value(data) + raise TypeError(f"Invalid type: {data}. Expected an integer.") + + +@dataclass(frozen=True, kw_only=True) +class UInt8(DTypeWrapper[np.dtypes.UInt8DType, np.uint8]): + dtype_cls = np.dtypes.UInt8DType + _zarr_v3_name = "uint8" + + @classmethod + def _from_dtype_unsafe(cls, dtype: np.dtypes.UInt8DType) -> Self: + return cls() + + def to_dtype(self) -> np.dtypes.UInt8DType: + return self.dtype_cls() + + def to_dict(self) -> dict[str, JSON]: + return {"name": self.get_name(zarr_format=3)} + + def default_value(self) -> np.uint8: + return self.to_dtype().type(0) + + def to_json_value(self, data: np.uint8, zarr_format: ZarrFormat) -> int: + return int(data) + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.uint8: + if check_json_int(data): + return self.cast_value(data) + raise TypeError(f"Invalid type: {data}. 
Expected an integer.") + + +@dataclass(frozen=True, kw_only=True) +class Int16(DTypeWrapper[np.dtypes.Int16DType, np.int16]): + dtype_cls = np.dtypes.Int16DType + _zarr_v3_name = "int16" + endianness: Endianness | None = "native" + + @classmethod + def _from_dtype_unsafe(cls, dtype: np.dtypes.Int16DType) -> Self: + return cls(endianness=endianness_from_numpy_str(dtype.byteorder)) + + def to_dtype(self) -> np.dtypes.Int16DType: + return self.dtype_cls().newbyteorder(endianness_to_numpy_str(self.endianness)) + + def to_dict(self) -> dict[str, JSON]: + return {"name": self.get_name(zarr_format=3)} + + def default_value(self) -> np.int16: + return self.cast_value(0) + + def to_json_value(self, data: np.int16, zarr_format: ZarrFormat) -> int: + return int(data) + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.int16: + if check_json_int(data): + return self.cast_value(data) + raise TypeError(f"Invalid type: {data}. Expected an integer.") + + +@dataclass(frozen=True, kw_only=True) +class UInt16(DTypeWrapper[np.dtypes.UInt16DType, np.uint16]): + dtype_cls = np.dtypes.UInt16DType + _zarr_v3_name = "uint16" + endianness: Endianness | None = "native" + + @classmethod + def _from_dtype_unsafe(cls, dtype: np.dtypes.UInt16DType) -> Self: + return cls(endianness=endianness_from_numpy_str(dtype.byteorder)) + + def to_dtype(self) -> np.dtypes.UInt16DType: + return self.dtype_cls().newbyteorder(endianness_to_numpy_str(self.endianness)) + + def to_dict(self) -> dict[str, JSON]: + return {"name": self.get_name(zarr_format=3)} + + def default_value(self) -> np.uint16: + return self.cast_value(0) + + def to_json_value(self, data: np.uint16, zarr_format: ZarrFormat) -> int: + return int(data) + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.uint16: + if check_json_int(data): + return self.cast_value(data) + raise TypeError(f"Invalid type: {data}. 
Expected an integer.") + + +@dataclass(frozen=True, kw_only=True) +class Int32(DTypeWrapper[np.dtypes.Int32DType, np.int32]): + dtype_cls = np.dtypes.Int32DType + _zarr_v3_name = "int32" + endianness: Endianness | None = "native" + + @classmethod + def _from_dtype_unsafe(cls, dtype: np.dtypes.Int32DType) -> Self: + return cls(endianness=endianness_from_numpy_str(dtype.byteorder)) + + def to_dtype(self) -> np.dtypes.Int32DType: + return self.dtype_cls().newbyteorder(endianness_to_numpy_str(self.endianness)) + + def to_dict(self) -> dict[str, JSON]: + return {"name": self.get_name(zarr_format=3)} + + def default_value(self) -> np.int32: + return self.cast_value(0) + + def to_json_value(self, data: np.int32, zarr_format: ZarrFormat) -> int: + return int(data) + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.int32: + if check_json_int(data): + return self.cast_value(data) + raise TypeError(f"Invalid type: {data}. Expected an integer.") + + +@dataclass(frozen=True, kw_only=True) +class UInt32(DTypeWrapper[np.dtypes.UInt32DType, np.uint32]): + dtype_cls = np.dtypes.UInt32DType + _zarr_v3_name = "uint32" + endianness: Endianness | None = "native" + + @classmethod + def _from_dtype_unsafe(cls, dtype: np.dtypes.UInt32DType) -> Self: + return cls(endianness=endianness_from_numpy_str(dtype.byteorder)) + + def to_dtype(self) -> np.dtypes.UInt32DType: + return self.dtype_cls().newbyteorder(endianness_to_numpy_str(self.endianness)) + + def to_dict(self) -> dict[str, JSON]: + return {"name": self.get_name(zarr_format=3)} + + def default_value(self) -> np.uint32: + return self.cast_value(0) + + def to_json_value(self, data: np.uint32, zarr_format: ZarrFormat) -> int: + return int(data) + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.uint32: + if check_json_int(data): + return self.cast_value(data) + raise TypeError(f"Invalid type: {data}. 
Expected an integer.") + + +@dataclass(frozen=True, kw_only=True) +class Int64(DTypeWrapper[np.dtypes.Int64DType, np.int64]): + dtype_cls = np.dtypes.Int64DType + _zarr_v3_name = "int64" + endianness: Endianness | None = "native" + + @classmethod + def _from_dtype_unsafe(cls, dtype: np.dtypes.Int64DType) -> Self: + return cls(endianness=endianness_from_numpy_str(dtype.byteorder)) + + def to_dtype(self) -> np.dtypes.Int64DType: + return self.dtype_cls().newbyteorder(endianness_to_numpy_str(self.endianness)) + + def to_dict(self) -> dict[str, JSON]: + return {"name": self.get_name(zarr_format=3)} + + def default_value(self) -> np.int64: + return self.cast_value(0) + + def to_json_value(self, data: np.int64, zarr_format: ZarrFormat) -> int: + return int(data) + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.int64: + if check_json_int(data): + return self.cast_value(data) + raise TypeError(f"Invalid type: {data}. Expected an integer.") + + +@dataclass(frozen=True, kw_only=True) +class UInt64(DTypeWrapper[np.dtypes.UInt64DType, np.uint64]): + dtype_cls = np.dtypes.UInt64DType + _zarr_v3_name = "uint64" + endianness: Endianness | None = "native" + + @classmethod + def _from_dtype_unsafe(cls, dtype: np.dtypes.UInt64DType) -> Self: + return cls(endianness=endianness_from_numpy_str(dtype.byteorder)) + + def to_dtype(self) -> np.dtypes.UInt64DType: + return self.dtype_cls().newbyteorder(endianness_to_numpy_str(self.endianness)) + + def to_dict(self) -> dict[str, JSON]: + return {"name": self.get_name(zarr_format=3)} + + def default_value(self) -> np.uint64: + return self.cast_value(0) + + def to_json_value(self, data: np.uint64, zarr_format: ZarrFormat) -> int: + return int(data) + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.uint64: + if check_json_int(data): + return self.cast_value(data) + raise TypeError(f"Invalid type: {data}. 
Expected an integer.") + + +@dataclass(frozen=True, kw_only=True) +class Float16(DTypeWrapper[np.dtypes.Float16DType, np.float16]): + dtype_cls = np.dtypes.Float16DType + _zarr_v3_name = "float16" + endianness: Endianness | None = "native" + + @classmethod + def _from_dtype_unsafe(cls, dtype: np.dtypes.Float16DType) -> Self: + return cls(endianness=endianness_from_numpy_str(dtype.byteorder)) + + def to_dtype(self) -> np.dtypes.Float16DType: + return self.dtype_cls().newbyteorder(endianness_to_numpy_str(self.endianness)) + + def to_dict(self) -> dict[str, JSON]: + return {"name": self.get_name(zarr_format=3)} + + def default_value(self) -> np.float16: + return self.to_dtype().type(0.0) + + def to_json_value(self, data: np.float16, zarr_format: ZarrFormat) -> JSONFloat: + return float_to_json(data, zarr_format) + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.float16: + if check_json_float_v2(data): + return self.to_dtype().type(float_from_json(data, zarr_format)) + raise TypeError(f"Invalid type: {data}. 
Expected a float.") + + +@dataclass(frozen=True, kw_only=True) +class Float32(DTypeWrapper[np.dtypes.Float32DType, np.float32]): + dtype_cls = np.dtypes.Float32DType + _zarr_v3_name = "float32" + endianness: Endianness | None = "native" + + @classmethod + def _from_dtype_unsafe(cls, dtype: np.dtypes.Float32DType) -> Self: + return cls(endianness=endianness_from_numpy_str(dtype.byteorder)) + + def to_dtype(self) -> np.dtypes.Float32DType: + return self.dtype_cls().newbyteorder(endianness_to_numpy_str(self.endianness)) + + def to_dict(self) -> dict[str, JSON]: + return {"name": self.get_name(zarr_format=3)} + + def cast_value(self, value: object) -> np.float32: + return self.to_dtype().type(value) + + def default_value(self) -> np.float32: + return self.to_dtype().type(0.0) + + def to_json_value(self, data: np.float32, zarr_format: ZarrFormat) -> JSONFloat: + return float_to_json(data, zarr_format) + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.float32: + if check_json_float_v2(data): + return self.to_dtype().type(float_from_json(data, zarr_format)) + raise TypeError(f"Invalid type: {data}. 
Expected a float.") + + +@dataclass(frozen=True, kw_only=True) +class Float64(DTypeWrapper[np.dtypes.Float64DType, np.float64]): + dtype_cls = np.dtypes.Float64DType + _zarr_v3_name = "float64" + endianness: Endianness | None = "native" + + @classmethod + def _from_dtype_unsafe(cls, dtype: np.dtypes.Float64DType) -> Self: + return cls(endianness=endianness_from_numpy_str(dtype.byteorder)) + + def to_dtype(self) -> np.dtypes.Float64DType: + return self.dtype_cls().newbyteorder(endianness_to_numpy_str(self.endianness)) + + def to_dict(self) -> dict[str, JSON]: + return {"name": self.get_name(zarr_format=3)} + + def default_value(self) -> np.float64: + return self.to_dtype().type(0.0) + + def to_json_value(self, data: np.float64, zarr_format: ZarrFormat) -> JSONFloat: + return float_to_json(data, zarr_format) + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.float64: + if check_json_float_v2(data): + return self.to_dtype().type(float_from_json(data, zarr_format)) + raise TypeError(f"Invalid type: {data}. 
Expected a float.") + + +@dataclass(frozen=True, kw_only=True) +class Complex64(DTypeWrapper[np.dtypes.Complex64DType, np.complex64]): + dtype_cls = np.dtypes.Complex64DType + _zarr_v3_name = "complex64" + endianness: Endianness | None = "native" + + @classmethod + def _from_dtype_unsafe(cls, dtype: np.dtypes.Complex64DType) -> Self: + return cls() + + def to_dtype(self) -> np.dtypes.Complex64DType: + return self.dtype_cls().newbyteorder(endianness_to_numpy_str(self.endianness)) + + def to_dict(self) -> dict[str, JSON]: + return {"name": self.get_name(zarr_format=3)} + + def default_value(self) -> np.complex64: + return np.complex64(0.0) + + def to_json_value( + self, data: np.complex64, zarr_format: ZarrFormat + ) -> tuple[JSONFloat, JSONFloat]: + return complex_to_json(data, zarr_format) + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.complex64: + if check_json_complex_float(data, zarr_format=zarr_format): + return complex_from_json(data, dtype=self.to_dtype(), zarr_format=zarr_format) + raise TypeError(f"Invalid type: {data}. 
Expected a complex float.") + + +@dataclass(frozen=True, kw_only=True) +class Complex128(DTypeWrapper[np.dtypes.Complex128DType, np.complex128]): + dtype_cls = np.dtypes.Complex128DType + _zarr_v3_name = "complex128" + endianness: Endianness | None = "native" + + @classmethod + def _from_dtype_unsafe(cls, dtype: np.dtypes.Complex128DType) -> Self: + return cls(endianness=endianness_from_numpy_str(dtype.byteorder)) + + def to_dtype(self) -> np.dtypes.Complex128DType: + return self.dtype_cls().newbyteorder(endianness_to_numpy_str(self.endianness)) + + def to_dict(self) -> dict[str, JSON]: + return {"name": self.get_name(zarr_format=3)} + + def default_value(self) -> np.complex128: + return np.complex128(0.0) + + def to_json_value( + self, data: np.complex128, zarr_format: ZarrFormat + ) -> tuple[JSONFloat, JSONFloat]: + return complex_to_json(data, zarr_format) + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.complex128: + if check_json_complex_float_v3(data): + return complex_from_json(data, dtype=self.to_dtype(), zarr_format=zarr_format) + raise TypeError(f"Invalid type: {data}. 
Expected a complex float.") + + +@dataclass(frozen=True, kw_only=True) +class FixedLengthAsciiString(DTypeWrapper[np.dtypes.BytesDType[Any], np.bytes_]): + dtype_cls = np.dtypes.BytesDType + _zarr_v3_name = "numpy.static_byte_string" + item_size_bits: ClassVar[int] = 8 + length: int = 1 + + @classmethod + def _from_dtype_unsafe(cls, dtype: np.dtypes.BytesDType) -> Self: + return cls(length=dtype.itemsize // (cls.item_size_bits // 8)) + + def to_dtype(self) -> np.dtypes.BytesDType: + return self.dtype_cls(self.length) + + def default_value(self) -> np.bytes_: + return np.bytes_(b"") + + def to_dict(self) -> dict[str, JSON]: + return {"name": self.get_name(zarr_format=3), "configuration": {"length": self.length}} + + def to_json_value(self, data: np.bytes_, *, zarr_format: ZarrFormat) -> str: + return base64.standard_b64encode(data).decode("ascii") + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: + if check_json_str(data): + return self.to_dtype().type(base64.standard_b64decode(data.encode("ascii"))) + raise TypeError(f"Invalid type: {data}. 
Expected a string.") + + +@dataclass(frozen=True, kw_only=True) +class FixedLengthBytes(DTypeWrapper[np.dtypes.VoidDType[Any], np.void]): + dtype_cls = np.dtypes.VoidDType[Any] + _zarr_v3_name = "r*" + item_size_bits: ClassVar[int] = 8 + length: int = 1 + + @classmethod + def _from_dtype_unsafe(cls, dtype: np.dtypes.VoidDType[Any]) -> Self: + return cls(length=dtype.itemsize // (cls.item_size_bits // 8)) + + def default_value(self) -> np.void: + return self.cast_value(("\x00" * self.length).encode("ascii")) + + def to_dtype(self) -> np.dtypes.VoidDType[Any]: + # Numpy does not allow creating a void type + # by invoking np.dtypes.VoidDType directly + return np.dtype(f"V{self.length}") + + def get_name(self, zarr_format: ZarrFormat) -> str: + if zarr_format == 2: + return super().get_name(zarr_format=zarr_format) + # note that we don't return self._zarr_v3_name + # because the name is parametrized by the length + return f"r{self.length * self.item_size_bits}" + + def to_dict(self) -> dict[str, JSON]: + return {"name": self.get_name(zarr_format=3)} + + @classmethod + def check_dtype(cls: type[Self], dtype: TDType) -> TypeGuard[np.dtypes.VoidDType[Any]]: + """ + Reject structured dtypes by ensuring that dtype.fields is None + + Parameters + ---------- + dtype : TDType + The dtype to check. + + Returns + ------- + Bool + True if the dtype matches, False otherwise. 
+ """ + return super().check_dtype(dtype) and dtype.fields is None + + @classmethod + def check_json(cls, data: dict[str, JSON]) -> TypeGuard[dict[str, JSON]]: + # Overriding the base class implementation because the r* dtype + # does not have a name that will can appear in array metadata + # Instead, array metadata will contain names like "r8", "r16", etc + return ( + isinstance(data, dict) + and "name" in data + and isinstance(data["name"], str) + and re.match(r"^r\d+$", data["name"]) + ) + + def to_json_value(self, data: np.void, *, zarr_format: ZarrFormat) -> str: + return base64.standard_b64encode(data.tobytes()).decode("ascii") + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: + if check_json_str(data): + return self.to_dtype().type(base64.standard_b64decode(data)) + raise DataTypeValidationError(f"Invalid type: {data}. Expected a string.") + + +@dataclass(frozen=True, kw_only=True) +class FixedLengthUnicodeString(DTypeWrapper[np.dtypes.StrDType[int], np.str_]): + dtype_cls = np.dtypes.StrDType[int] + _zarr_v3_name = "numpy.static_unicode_string" + item_size_bits: ClassVar[int] = 32 # UCS4 is 32 bits per code point + endianness: Endianness | None = "native" + length: int = 1 + + @classmethod + def _from_dtype_unsafe(cls, dtype: np.dtypes.StrDType[int]) -> Self: + return cls( + length=dtype.itemsize // (cls.item_size_bits // 8), + endianness=endianness_from_numpy_str(dtype.byteorder), + ) + + def to_dtype(self) -> np.dtypes.StrDType[int]: + return self.dtype_cls(self.length).newbyteorder(endianness_to_numpy_str(self.endianness)) + + def default_value(self) -> np.str_: + return np.str_("") + + def to_dict(self) -> dict[str, JSON]: + return {"name": self.get_name(zarr_format=3), "configuration": {"length": self.length}} + + def to_json_value(self, data: np.str_, *, zarr_format: ZarrFormat) -> str: + return str(data) + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_: + if not check_json_str(data): + 
raise TypeError(f"Invalid type: {data}. Expected a string.") + return self.cast_value(data) + + +if _NUMPY_SUPPORTS_VLEN_STRING: + + @dataclass(frozen=True, kw_only=True) + class VariableLengthString(DTypeWrapper[np.dtypes.StringDType, str]): + dtype_cls = np.dtypes.StringDType + _zarr_v3_name = "numpy.vlen_string" + + @classmethod + def _from_dtype_unsafe(cls, dtype: np.dtypes.StringDType) -> Self: + return cls() + + def default_value(self) -> str: + return "" + + def cast_value(self, value: object) -> str: + return str(value) + + def to_dict(self) -> dict[str, JSON]: + return {"name": self.get_name(zarr_format=3)} + + def to_dtype(self) -> np.dtypes.StringDType: + return self.dtype_cls() + + def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: + return str(data) + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: + if not check_json_str(data): + raise TypeError(f"Invalid type: {data}. Expected a string.") + return self.cast_value(data) + +else: + + @dataclass(frozen=True, kw_only=True) + class VariableLengthString(DTypeWrapper[np.dtypes.ObjectDType, str]): + dtype_cls = np.dtypes.ObjectDType + _zarr_v3_name = "numpy.vlen_string" + + @classmethod + def _from_dtype_unsafe(cls, dtype: np.dtypes.ObjectDType) -> Self: + return cls() + + def to_dtype(self) -> np.dtypes.ObjectDType: + return self.dtype_cls() + + def cast_value(self, value: object) -> str: + return str(value) + + def default_value(self) -> str: + return "" + + def to_dict(self) -> dict[str, JSON]: + return {"name": self.get_name(zarr_format=3)} + + def to_json_value(self, data: str, *, zarr_format: ZarrFormat) -> str: + return data + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: + """ + String literals pass through + """ + if not check_json_str(data): + raise TypeError(f"Invalid type: {data}. 
Expected a string.") + return data + + +DateUnit = Literal["Y", "M", "W", "D"] +TimeUnit = Literal["h", "m", "s", "ms", "us", "μs", "ns", "ps", "fs", "as"] + + +@dataclass(frozen=True, kw_only=True) +class DateTime64(DTypeWrapper[np.dtypes.DateTime64DType, np.datetime64]): + dtype_cls = np.dtypes.DateTime64DType + _zarr_v3_name = "numpy.datetime64" + unit: DateUnit | TimeUnit = "s" + endianness: Endianness = "native" + + def default_value(self) -> np.datetime64: + return np.datetime64("NaT") + + def to_dict(self) -> dict[str, JSON]: + return {"name": self.get_name(zarr_format=3), "configuration": {"unit": self.unit}} + + @classmethod + def _from_dtype_unsafe(cls, dtype: np.dtypes.DateTime64DType) -> Self: + unit = dtype.name[dtype.name.rfind("[") + 1 : dtype.name.rfind("]")] + if unit not in get_args(DateUnit | TimeUnit): + raise DataTypeValidationError('Invalid unit for "numpy.datetime64"') + return cls(unit=unit, endianness=endianness_from_numpy_str(dtype.byteorder)) + + def cast_value(self, value: object) -> np.datetime64: + return self.to_dtype().type(value, self.unit) + + def to_dtype(self) -> np.dtypes.DateTime64DType: + # Numpy does not allow creating datetime64 via + # np.dtypes.DateTime64Dtype() + return np.dtype(f"datetime64[{self.unit}]").newbyteorder( + endianness_to_numpy_str(self.endianness) + ) + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.datetime64: + if check_json_int(data): + return datetime_from_json(data, self.unit) + raise TypeError(f"Invalid type: {data}. Expected an integer.") + + def to_json_value(self, data: np.datetime64, *, zarr_format: ZarrFormat) -> int: + return datetime_to_json(data) + + +@dataclass(frozen=True, kw_only=True) +class Structured(DTypeWrapper[np.dtypes.VoidDType, np.void]): + dtype_cls = np.dtypes.VoidDType + _zarr_v3_name = "numpy.structured" + fields: tuple[tuple[str, DTypeWrapper[Any, Any]], ...] 
+ + def default_value(self) -> np.void: + return self.cast_value(0) + + def cast_value(self, value: object) -> np.void: + return np.array([value], dtype=self.to_dtype())[0] + + @classmethod + def check_dtype(cls, dtype: np.dtypes.DTypeLike) -> TypeGuard[np.dtypes.VoidDType]: + """ + Check that this dtype is a numpy structured dtype + + Parameters + ---------- + dtype : np.dtypes.DTypeLike + The dtype to check. + + Returns + ------- + TypeGuard[np.dtypes.VoidDType] + True if the dtype matches, False otherwise. + """ + return super().check_dtype(dtype) and dtype.fields is not None + + @classmethod + def _from_dtype_unsafe(cls, dtype: np.dtypes.VoidDType) -> Self: + from zarr.core.dtype import get_data_type_from_numpy + + fields: list[tuple[str, DTypeWrapper[Any, Any]]] = [] + + if dtype.fields is None: + raise ValueError("numpy dtype has no fields") + + for key, (dtype_instance, _) in dtype.fields.items(): + dtype_wrapped = get_data_type_from_numpy(dtype_instance) + fields.append((key, dtype_wrapped)) + + return cls(fields=tuple(fields)) + + def get_name(self, zarr_format: ZarrFormat) -> str | list[tuple[str, str]]: + if zarr_format == 2: + return [[k, d.get_name(zarr_format=2)] for k, d in self.fields] + return self._zarr_v3_name + + def to_dict(self) -> dict[str, JSON]: + base_dict = {"name": self.get_name(zarr_format=3)} + field_configs = [(f_name, f_dtype.to_dict()) for f_name, f_dtype in self.fields] + base_dict["configuration"] = {"fields": field_configs} + return base_dict + + @classmethod + def check_json(cls, data: JSON) -> bool: + return ( + isinstance(data, dict) + and "name" in data + and "configuration" in data + and "fields" in data["configuration"] + ) + + @classmethod + def from_dict(cls, data: dict[str, JSON]) -> Self: + if cls.check_json(data): + from zarr.core.dtype import get_data_type_from_dict + + fields = tuple( + (f_name, get_data_type_from_dict(f_dtype)) + for f_name, f_dtype in data["configuration"]["fields"] + ) + return cls(fields=fields) 
+ raise DataTypeValidationError(f"Invalid JSON representation of data type {cls}.") + + def to_dtype(self) -> np.dtypes.VoidDType: + return cast(np.void, np.dtype([(key, dtype.to_dtype()) for (key, dtype) in self.fields])) + + def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: + return bytes_to_json(data.tobytes(), zarr_format) + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: + if not check_json_str(data): + raise TypeError(f"Invalid type: {data}. Expected a string.") + as_bytes = bytes_from_json(data, zarr_format=zarr_format) + dtype = self.to_dtype() + return cast(np.void, np.array([as_bytes], dtype=dtype.str).view(dtype)[0]) diff --git a/src/zarr/core/dtype/common.py b/src/zarr/core/dtype/common.py new file mode 100644 index 0000000000..1dbf22c3c2 --- /dev/null +++ b/src/zarr/core/dtype/common.py @@ -0,0 +1,602 @@ +from __future__ import annotations + +import base64 +from collections.abc import Sequence +from typing import TYPE_CHECKING, Any, Literal, TypeGuard, cast, get_args + +import numpy as np + +if TYPE_CHECKING: + from zarr.core.common import JSON, ZarrFormat + from zarr.core.dtype._numpy import DateUnit, TimeUnit + +Endianness = Literal["little", "big", "native"] +EndiannessNumpy = Literal[">", "<", "=", "|"] +JSONFloat = float | Literal["NaN", "Infinity", "-Infinity"] + +_NUMPY_SUPPORTS_VLEN_STRING = hasattr(np.dtypes, "StringDType") + + +class DataTypeValidationError(ValueError): ... + + +def endianness_to_numpy_str(endianness: Endianness | None) -> EndiannessNumpy: + """ + Convert an endianness literal to its numpy string representation. + + Parameters + ---------- + endianness : Endianness or None + The endianness to convert. + + Returns + ------- + Literal[">", "<", "=", "|"] + The numpy string representation of the endianness. + + Raises + ------ + ValueError + If the endianness is invalid. 
+ """ + match endianness: + case "little": + return "<" + case "big": + return ">" + case "native": + return "=" + case None: + return "|" + raise ValueError( + f"Invalid endianness: {endianness}. Expected one of {get_args(Endianness)} or None" + ) + + +def endianness_from_numpy_str(endianness: EndiannessNumpy) -> Endianness | None: + """ + Convert a numpy endianness string literal to a human-readable literal value. + + Parameters + ---------- + endianness : Literal[">", "<", "=", "|"] + The numpy string representation of the endianness. + + Returns + ------- + Endianness or None + The human-readable representation of the endianness. + + Raises + ------ + ValueError + If the endianness is invalid. + """ + match endianness: + case "<": + return "little" + case ">": + return "big" + case "=": + return "native" + case "|": + return None + raise ValueError( + f"Invalid endianness: {endianness}. Expected one of {get_args(EndiannessNumpy)}" + ) + + +def check_json_bool(data: JSON) -> TypeGuard[bool]: + """ + Check if a JSON value is a boolean. + + Parameters + ---------- + data : JSON + The JSON value to check. + + Returns + ------- + Bool + True if the data is a boolean, False otherwise. + """ + return bool(isinstance(data, bool)) + + +def check_json_str(data: JSON) -> TypeGuard[str]: + """ + Check if a JSON value is a string. + + Parameters + ---------- + data : JSON + The JSON value to check. + + Returns + ------- + Bool + True if the data is a string, False otherwise. + """ + return bool(isinstance(data, str)) + + +def check_json_int(data: JSON) -> TypeGuard[int]: + """ + Check if a JSON value is an integer. + + Parameters + ---------- + data : JSON + The JSON value to check. + + Returns + ------- + Bool + True if the data is an integer, False otherwise. + """ + return bool(isinstance(data, int)) + + +def check_json_float_v2(data: JSON) -> TypeGuard[JSONFloat]: + """ + Check if a JSON value represents a float (v2). 
+ + Parameters + ---------- + data : JSON + The JSON value to check. + + Returns + ------- + Bool + True if the data is a float, False otherwise. + """ + if data == "NaN" or data == "Infinity" or data == "-Infinity": + return True + return isinstance(data, float | int) + + +def check_json_float_v3(data: JSON) -> TypeGuard[JSONFloat]: + """ + Check if a JSON value represents a float (v3). + + Parameters + ---------- + data : JSON + The JSON value to check. + + Returns + ------- + Bool + True if the data is a float, False otherwise. + """ + # TODO: handle the special JSON serialization of different NaN values + return check_json_float_v2(data) + + +def check_json_float(data: JSON, zarr_format: ZarrFormat) -> TypeGuard[float]: + """ + Check if a JSON value represents a float based on zarr format. + + Parameters + ---------- + data : JSON + The JSON value to check. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + Bool + True if the data is a float, False otherwise. + """ + if zarr_format == 2: + return check_json_float_v2(data) + else: + return check_json_float_v3(data) + + +def check_json_complex_float_v3(data: JSON) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: + """ + Check if a JSON value represents a complex float, as per the zarr v3 spec + + Parameters + ---------- + data : JSON + The JSON value to check. + + Returns + ------- + Bool + True if the data is a complex float, False otherwise. + """ + return ( + not isinstance(data, str) + and isinstance(data, Sequence) + and len(data) == 2 + and check_json_float_v3(data[0]) + and check_json_float_v3(data[1]) + ) + + +def check_json_complex_float_v2(data: JSON) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: + """ + Check if a JSON value represents a complex float, as per the behavior of zarr-python 2.x + + Parameters + ---------- + data : JSON + The JSON value to check. + + Returns + ------- + Bool + True if the data is a complex float, False otherwise. 
+ """ + return ( + not isinstance(data, str) + and isinstance(data, Sequence) + and len(data) == 2 + and check_json_float_v2(data[0]) + and check_json_float_v2(data[1]) + ) + + +def check_json_complex_float( + data: JSON, zarr_format: ZarrFormat +) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: + """ + Check if a JSON value represents a complex float based on zarr format. + + Parameters + ---------- + data : JSON + The JSON value to check. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + Bool + True if the data represents a complex float, False otherwise. + """ + if zarr_format == 2: + return check_json_complex_float_v2(data) + return check_json_complex_float_v3(data) + + +def float_to_json_v2(data: float | np.floating[Any]) -> JSONFloat: + """ + Convert a float to JSON (v2). + + Parameters + ---------- + data : float or np.floating + The float value to convert. + + Returns + ------- + JSONFloat + The JSON representation of the float. + """ + if np.isnan(data): + return "NaN" + elif np.isinf(data): + return "Infinity" if data > 0 else "-Infinity" + return float(data) + + +def float_to_json_v3(data: float | np.floating[Any]) -> JSONFloat: + """ + Convert a float to JSON (v3). + + Parameters + ---------- + data : float or np.floating + The float value to convert. + + Returns + ------- + JSONFloat + The JSON representation of the float. + """ + # v3 can in principle handle distinct NaN values, but numpy does not represent these explicitly + # so we just reuse the v2 routine here + return float_to_json_v2(data) + + +def float_to_json(data: float | np.floating[Any], zarr_format: ZarrFormat) -> JSONFloat: + """ + Convert a float to JSON, parametrized by the zarr format version. + + Parameters + ---------- + data : float or np.floating + The float value to convert. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + JSONFloat + The JSON representation of the float. 
+ """ + if zarr_format == 2: + return float_to_json_v2(data) + else: + return float_to_json_v3(data) + raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") + + +def complex_to_json_v2(data: complex | np.complexfloating[Any, Any]) -> tuple[JSONFloat, JSONFloat]: + """ + Convert a complex number to JSON (v2). + + Parameters + ---------- + data : complex or np.complexfloating + The complex value to convert. + + Returns + ------- + tuple[JSONFloat, JSONFloat] + The JSON representation of the complex number. + """ + return float_to_json_v2(data.real), float_to_json_v2(data.imag) + + +def complex_to_json_v3(data: complex | np.complexfloating[Any, Any]) -> tuple[JSONFloat, JSONFloat]: + """ + Convert a complex number to JSON (v3). + + Parameters + ---------- + data : complex or np.complexfloating + The complex value to convert. + + Returns + ------- + tuple[JSONFloat, JSONFloat] + The JSON representation of the complex number. + """ + return float_to_json_v3(data.real), float_to_json_v3(data.imag) + + +def complex_to_json( + data: complex | np.complexfloating[Any, Any], zarr_format: ZarrFormat +) -> tuple[JSONFloat, JSONFloat]: + """ + Convert a complex number to JSON, parametrized by the zarr format version. + + Parameters + ---------- + data : complex or np.complexfloating + The complex value to convert. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + tuple[JSONFloat, JSONFloat] or JSONFloat + The JSON representation of the complex number. + """ + if zarr_format == 2: + return complex_to_json_v2(data) + else: + return complex_to_json_v3(data) + raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") + + +def bytes_to_json(data: bytes, zarr_format: ZarrFormat) -> str: + """ + Convert bytes to JSON. + + Parameters + ---------- + data : bytes + The structured scalar value to convert. + zarr_format : ZarrFormat + The zarr format version. 
+ + Returns + ------- + str + The bytes encoded as ascii using the base64 alphabet. + """ + if zarr_format == 2: + return base64.b64encode(data).decode("ascii") + raise NotImplementedError(f"Invalid zarr format: {zarr_format}. Expected 2.") + + +def bytes_from_json(data: str, zarr_format: ZarrFormat) -> bytes: + """ + Convert a JSON string to bytes + + Parameters + ---------- + data : str + The JSON string to convert. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + bytes + The bytes. + """ + if zarr_format == 2: + return base64.b64decode(data.encode("ascii")) + raise NotImplementedError(f"Invalid zarr format: {zarr_format}. Expected 2.") + + +def float_from_json_v2(data: JSONFloat) -> float: + """ + Convert a JSON float to a float (Zarr v2). + + Parameters + ---------- + data : JSONFloat + The JSON float to convert. + + Returns + ------- + float + The float value. + """ + match data: + case "NaN": + return float("nan") + case "Infinity": + return float("inf") + case "-Infinity": + return float("-inf") + case _: + return float(data) + + +def float_from_json_v3(data: JSONFloat) -> float: + """ + Convert a JSON float to a float (v3). + + Parameters + ---------- + data : JSONFloat + The JSON float to convert. + + Returns + ------- + float + The float value. + """ + # todo: support the v3-specific NaN handling + return float_from_json_v2(data) + + +def float_from_json(data: JSONFloat, zarr_format: ZarrFormat) -> float: + """ + Convert a JSON float to a float based on zarr format. + + Parameters + ---------- + data : JSONFloat + The JSON float to convert. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + float + The float value. 
+ """ + if zarr_format == 2: + return float_from_json_v2(data) + else: + return float_from_json_v3(data) + + +def complex_from_json_v2( + data: tuple[JSONFloat, JSONFloat], dtype: np.dtypes.Complex64DType | np.dtypes.Complex128DType +) -> np.complexfloating[Any, Any]: + """ + Convert a JSON complex float to a complex number (v2). + + Parameters + ---------- + data : tuple[JSONFloat, JSONFloat] + The JSON complex float to convert. + dtype : Any + The numpy dtype. + + Returns + ------- + np.complexfloating + The complex number. + """ + return dtype.type(complex(float_from_json_v2(data[0]), float_from_json_v2(data[1]))) + + +def complex_from_json_v3( + data: tuple[JSONFloat, JSONFloat], dtype: np.dtypes.Complex64DType | np.dtypes.Complex128DType +) -> np.complexfloating[Any, Any]: + """ + Convert a JSON complex float to a complex number (v3). + + Parameters + ---------- + data : tuple[JSONFloat, JSONFloat] + The JSON complex float to convert. + dtype : Any + The numpy dtype. + + Returns + ------- + np.complexfloating + The complex number. + """ + return dtype.type(complex(float_from_json_v3(data[0]), float_from_json_v3(data[1]))) + + +def complex_from_json( + data: tuple[JSONFloat, JSONFloat], dtype: Any, zarr_format: ZarrFormat +) -> np.complexfloating[Any, Any]: + """ + Convert a JSON complex float to a complex number based on zarr format. + + Parameters + ---------- + data : tuple[JSONFloat, JSONFloat] + The JSON complex float to convert. + dtype : Any + The numpy dtype. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + np.complexfloating + The complex number. + """ + if zarr_format == 2: + return complex_from_json_v2(data, dtype) + else: + if check_json_complex_float_v3(data): + return complex_from_json_v3(data, dtype) + else: + raise TypeError(f"Invalid type: {data}. Expected a sequence of two numbers.") + raise ValueError(f"Invalid zarr format: {zarr_format}. 
Expected 2 or 3.") + + +def datetime_to_json(data: np.datetime64) -> int: + """ + Convert a datetime64 to a JSON integer. + + Parameters + ---------- + data : np.datetime64 + The datetime64 value to convert. + + Returns + ------- + int + The JSON representation of the datetime64. + """ + return data.view(np.int64).item() + + +def datetime_from_json(data: int, unit: DateUnit | TimeUnit) -> np.datetime64: + """ + Convert a JSON integer to a datetime64. + + Parameters + ---------- + data : int + The JSON integer to convert. + unit : DateUnit or TimeUnit + The unit of the datetime64. + + Returns + ------- + np.datetime64 + The datetime64 value. + """ + return cast(np.datetime64, np.int64(data).view(f"datetime64[{unit}]")) diff --git a/src/zarr/core/dtype/registry.py b/src/zarr/core/dtype/registry.py new file mode 100644 index 0000000000..d4f1f03258 --- /dev/null +++ b/src/zarr/core/dtype/registry.py @@ -0,0 +1,50 @@ +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, Any, Self + +from zarr.core.dtype.common import DataTypeValidationError + +if TYPE_CHECKING: + from importlib.metadata import EntryPoint + + from zarr.core.common import JSON + from zarr.core.dtype.wrapper import DTypeWrapper, TDType + + +@dataclass(frozen=True, kw_only=True) +class DataTypeRegistry: + contents: dict[str, type[DTypeWrapper[Any, Any]]] = field(default_factory=dict, init=False) + lazy_load_list: list[EntryPoint] = field(default_factory=list, init=False) + + def lazy_load(self) -> None: + for e in self.lazy_load_list: + self.register(e.name, e.load()) + + self.lazy_load_list.clear() + + def register(self: Self, key: str, cls: type[DTypeWrapper[Any, Any]]) -> None: + # don't register the same dtype twice + if key not in self.contents or self.contents[key] != cls: + self.contents[key] = cls + + def get(self, key: str) -> type[DTypeWrapper[Any, Any]]: + return self.contents[key] + + def match_dtype(self, dtype: TDType) -> 
DTypeWrapper[Any, Any]: + self.lazy_load() + for val in self.contents.values(): + try: + return val.from_dtype(dtype) + except DataTypeValidationError: + pass + raise ValueError(f"No data type wrapper found that matches dtype '{dtype}'") + + def match_json(self, data: JSON) -> DTypeWrapper[Any, Any]: + self.lazy_load() + for val in self.contents.values(): + try: + return val.from_dict(data) + except DataTypeValidationError: + pass + raise ValueError(f"No data type wrapper found that matches {data}") diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py new file mode 100644 index 0000000000..002bd100e9 --- /dev/null +++ b/src/zarr/core/dtype/wrapper.py @@ -0,0 +1,279 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, ClassVar, Generic, Self, TypeGuard, TypeVar, cast + +import numpy as np + +from zarr.abc.metadata import Metadata +from zarr.core.dtype.common import DataTypeValidationError + +if TYPE_CHECKING: + from zarr.core.common import JSON, ZarrFormat + +TScalar = TypeVar("TScalar", bound=np.generic | str) +# TODO: figure out an interface or protocol that non-numpy dtypes can use +TDType = TypeVar("TDType", bound=np.dtype[Any]) + + +@dataclass(frozen=True, kw_only=True) +class DTypeWrapper(Generic[TDType, TScalar], ABC, Metadata): + """ + Abstract base class for wrapping numpy dtypes. + + Attributes + ---------- + dtype_cls : ClassVar[type[TDType]] + The numpy dtype class. This is a class variable. Instances of this class cannot set it. + _zarr_v3_name : ClassVar[str] + The name given to the wrapped data type by a zarr v3 data type specification. Note that this + is not necessarily the same name that will appear in metadata documents, as some data types + have names that depend on their configuration. 
+ """ + + # this class will create a numpy dtype + # mypy currently disallows class variables to contain type parameters + # but it seems like it should be OK for us to use it here: + # https://github.com/python/typing/discussions/1424#discussioncomment-7989934 + dtype_cls: ClassVar[type[TDType]] # type: ignore[misc] + _zarr_v3_name: ClassVar[str] + + @classmethod + @abstractmethod + def _from_dtype_unsafe(cls: type[Self], dtype: TDType) -> Self: + """ + Wrap a native dtype without checking. + + Parameters + ---------- + dtype : TDType + The native dtype to wrap. + + Returns + ------- + Self + The wrapped dtype. + """ + raise NotImplementedError + + @classmethod + def from_dtype(cls: type[Self], dtype: TDType) -> Self: + """ + Wrap a dtype object. + + Parameters + ---------- + dtype : TDType + The dtype object to wrap. + + Returns + ------- + Self + The wrapped dtype. + + Raises + ------ + TypeError + If the dtype does not match the dtype_cls class attribute. + """ + if cls.check_dtype(dtype): + return cls._from_dtype_unsafe(dtype) + raise DataTypeValidationError( + f"Invalid dtype: {dtype}. Expected an instance of {cls.dtype_cls}." + ) + + @abstractmethod + def to_dtype(self: Self) -> TDType: + """ + Return an instance of the wrapped dtype. + + Returns + ------- + TDType + The unwrapped dtype. + """ + raise NotImplementedError + + @abstractmethod + def to_dict(self) -> dict[str, JSON]: + """ + Convert the wrapped data type to a dictionary. + + Returns + ------- + dict[str, JSON] + The dictionary representation of the wrapped data type + """ + raise NotImplementedError + + def cast_value(self: Self, value: object) -> TScalar: + """ + Cast a value to an instance of the scalar type. + This implementation assumes a numpy-style dtype class that has a + ``type`` method for casting scalars. Non-numpy dtypes will need to + override this method. + + Parameters + ---------- + value : object + The value to cast. + + Returns + ------- + TScalar + The cast value. 
+ """ + return cast(TScalar, self.to_dtype().type(value)) + + @abstractmethod + def default_value(self) -> TScalar: + """ + Get the default value for the wrapped data type. This is a method, rather than an attribute, + because the default value for some data types may depend on parameters that are not known + until a concrete data type is wrapped. + + Returns + ------- + TScalar + The default value for this data type. + """ + ... + + @classmethod + def check_dtype(cls: type[Self], dtype: TDType) -> TypeGuard[TDType]: + """ + Check that a data type matches the dtype_cls class attribute. Used as a type guard. + + Parameters + ---------- + dtype : TDType + The dtype to check. + + Returns + ------- + Bool + True if the dtype matches, False otherwise. + """ + return type(dtype) is cls.dtype_cls + + @classmethod + def check_json(cls: type[Self], data: dict[str, JSON]) -> TypeGuard[dict[str, JSON]]: + """ + Check that a JSON representation of a data type matches the dtype_cls class attribute. Used + as a type guard. This base implementation checks that the input is a dictionary, + that the key "name" is in that dictionary, and that the value of "name" + matches the _zarr_v3_name class attribute. + + Parameters + ---------- + data : JSON + The JSON representation of the data type. + + Returns + ------- + Bool + True if the JSON representation matches, False otherwise. + """ + return "name" in data and data["name"] == cls._zarr_v3_name + + @classmethod + def from_dict(cls: type[Self], data: dict[str, JSON]) -> Self: + """ + Wrap a JSON representation of a data type. + + Parameters + ---------- + data : dict[str, JSON] + The JSON representation of the data type. + + Returns + ------- + Self + The wrapped data type. 
+ """ + if cls.check_json(data): + return cls._from_json_unsafe(data) + raise DataTypeValidationError(f"Invalid JSON representation of data type {cls}.") + + @classmethod + def _from_json_unsafe(cls: type[Self], data: dict[str, JSON]) -> Self: + """ + Wrap a JSON representation of a data type. + + Parameters + ---------- + data : dict[str, JSON] + The JSON representation of the data type. + + Returns + ------- + Self + The wrapped data type. + """ + config = data.get("configuration", {}) + return cls(**config) + + def get_name(self, zarr_format: ZarrFormat) -> str: + """ + Return the name of the wrapped data type. + + Parameters + ---------- + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + str + The name of the wrapped data type. + + Notes + ----- + This is a method, rather than an attribute, because the name of the data type may depend on + parameters that are not known until a concrete data type is wrapped. + + As the names of data types vary between zarr versions, this method takes a ``zarr_format`` + parameter + """ + if zarr_format == 2: + return self.to_dtype().str + return self._zarr_v3_name + + @abstractmethod + def to_json_value(self, data: TScalar, *, zarr_format: ZarrFormat) -> JSON: + """ + Convert a single value to JSON-serializable format. + + Parameters + ---------- + data : object + The value to convert. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + JSON + The JSON-serializable format. + """ + raise NotImplementedError + + @abstractmethod + def from_json_value(self: Self, data: JSON, *, zarr_format: ZarrFormat) -> TScalar: + """ + Read a JSON-serializable value as a scalar. + + Parameters + ---------- + data : JSON + The JSON-serializable value. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + TScalar + The numpy scalar. 
+ """ + raise NotImplementedError diff --git a/src/zarr/core/metadata/dtype.py b/src/zarr/core/metadata/dtype.py index 33aa22b398..e69de29bb2 100644 --- a/src/zarr/core/metadata/dtype.py +++ b/src/zarr/core/metadata/dtype.py @@ -1,808 +0,0 @@ -from __future__ import annotations - -import base64 -from abc import ABC, abstractmethod -from collections.abc import Sequence -from dataclasses import dataclass, field, replace -from importlib.metadata import EntryPoint -from typing import ( - TYPE_CHECKING, - Any, - ClassVar, - Generic, - Literal, - Self, - TypeGuard, - TypeVar, - cast, - get_args, -) - -import numpy as np -import numpy.typing as npt - -from zarr.abc.metadata import Metadata -from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING - -if TYPE_CHECKING: - from zarr.core.common import JSON, ZarrFormat - -Endianness = Literal["little", "big", "native"] -DataTypeFlavor = Literal["boolean", "numeric", "string", "bytes"] -JSONFloat = float | Literal["NaN", "Infinity", "-Infinity"] - - -def endianness_to_numpy_str(endianness: Endianness | None) -> Literal[">", "<", "=", "|"]: - match endianness: - case "little": - return "<" - case "big": - return ">" - case "native": - return "=" - case None: - return "|" - raise ValueError( - f"Invalid endianness: {endianness}. Expected one of {get_args(endianness)} or None" - ) - - -def check_json_bool(data: JSON) -> TypeGuard[bool]: - """ - Check if a JSON value represents a boolean. - """ - return bool(isinstance(data, bool)) - - -def check_json_str(data: JSON) -> TypeGuard[str]: - """ - Check if a JSON value represents a string. - """ - return bool(isinstance(data, str)) - - -def check_json_int(data: JSON) -> TypeGuard[int]: - """ - Check if a JSON value represents an integer. 
- """ - return bool(isinstance(data, int)) - - -def check_json_float_v2(data: JSON) -> TypeGuard[float]: - if data == "NaN" or data == "Infinity" or data == "-Infinity": - return True - else: - return bool(isinstance(data, float | int)) - - -def check_json_float_v3(data: JSON) -> TypeGuard[float]: - # TODO: handle the special JSON serialization of different NaN values - return check_json_float_v2(data) - - -def check_json_float(data: JSON, zarr_format: ZarrFormat) -> TypeGuard[float]: - if zarr_format == 2: - return check_json_float_v2(data) - else: - return check_json_float_v3(data) - - -def check_json_complex_float_v3(data: JSON) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: - """ - Check if a JSON value represents a complex float, as per the zarr v3 spec - """ - return ( - not isinstance(data, str) - and isinstance(data, Sequence) - and len(data) == 2 - and check_json_float_v3(data[0]) - and check_json_float_v3(data[1]) - ) - - -def check_json_complex_float_v2(data: JSON) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: - """ - Check if a JSON value represents a complex float, as per the behavior of zarr-python 2.x - """ - return ( - not isinstance(data, str) - and isinstance(data, Sequence) - and len(data) == 2 - and check_json_float_v2(data[0]) - and check_json_float_v2(data[1]) - ) - - -def check_json_complex_float( - data: JSON, zarr_format: ZarrFormat -) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: - if zarr_format == 2: - return check_json_complex_float_v2(data) - else: - return check_json_complex_float_v3(data) - - -def float_to_json_v2(data: float | np.floating[Any]) -> JSONFloat: - if np.isnan(data): - return "NaN" - elif np.isinf(data): - return "Infinity" if data > 0 else "-Infinity" - return float(data) - - -def float_to_json_v3(data: float | np.floating[Any]) -> JSONFloat: - # v3 can in principle handle distinct NaN values, but numpy does not represent these explicitly - # so we just reuse the v2 routine here - return float_to_json_v2(data) - - -def 
float_to_json(data: float | np.floating[Any], zarr_format: ZarrFormat) -> JSONFloat: - """ - convert a float to JSON as per the zarr v3 spec - """ - if zarr_format == 2: - return float_to_json_v2(data) - else: - return float_to_json_v3(data) - raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") - - -def complex_to_json_v2(data: complex | np.complexfloating[Any, Any]) -> tuple[JSONFloat, JSONFloat]: - return float_to_json_v2(data.real), float_to_json_v2(data.imag) - - -def complex_to_json_v3(data: complex | np.complexfloating[Any, Any]) -> tuple[JSONFloat, JSONFloat]: - return float_to_json_v3(data.real), float_to_json_v3(data.imag) - - -def complex_to_json( - data: complex | np.complexfloating[Any], zarr_format: ZarrFormat -) -> tuple[JSONFloat, JSONFloat] | JSONFloat: - if zarr_format == 2: - return complex_to_json_v2(data) - else: - return complex_to_json_v3(data) - raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") - - -def structured_scalar_to_json(data: bytes, zarr_format: ZarrFormat) -> str: - if zarr_format == 2: - return base64.b64encode(data).decode("ascii") - raise NotImplementedError(f"Invalid zarr format: {zarr_format}. Expected 2.") - - -def structured_scalar_from_json(data: str, zarr_format: ZarrFormat) -> bytes: - if zarr_format == 2: - return base64.b64decode(data.encode("ascii")) - raise NotImplementedError(f"Invalid zarr format: {zarr_format}. 
Expected 2.") - - -def float_from_json_v2(data: JSONFloat) -> float: - match data: - case "NaN": - return float("nan") - case "Infinity": - return float("inf") - case "-Infinity": - return float("-inf") - case _: - return float(data) - - -def float_from_json_v3(data: JSONFloat) -> float: - # todo: support the v3-specific NaN handling - return float_from_json_v2(data) - - -def float_from_json(data: JSONFloat, zarr_format: ZarrFormat) -> float: - if zarr_format == 2: - return float_from_json_v2(data) - else: - return float_from_json_v3(data) - - -def complex_from_json_v2(data: JSONFloat, dtype: Any) -> np.complexfloating[Any, Any]: - return dtype.type(complex(*data)) - - -def complex_from_json_v3( - data: tuple[JSONFloat, JSONFloat], dtype: Any -) -> np.complexfloating[Any, Any]: - return dtype.type(complex(*data)) - - -def complex_from_json( - data: tuple[JSONFloat, JSONFloat], dtype: Any, zarr_format: ZarrFormat -) -> np.complexfloating: - if zarr_format == 2: - return complex_from_json_v2(data, dtype) - else: - if check_json_complex_float_v3(data): - return complex_from_json_v3(data, dtype) - else: - raise TypeError(f"Invalid type: {data}. Expected a sequence of two numbers.") - raise ValueError(f"Invalid zarr format: {zarr_format}. 
Expected 2 or 3.") - - -def datetime_to_json(data: np.datetime64[Any]) -> int: - return data.view(np.int64).item() - - -def datetime_from_json(data: int, unit: DateUnit | TimeUnit) -> np.datetime64[Any]: - return np.int64(data).view(f"datetime64[{unit}]") - - -TScalar = TypeVar("TScalar", bound=np.generic | str, covariant=True) -# TODO: figure out an interface or protocol that non-numpy dtypes can -TDType = TypeVar("TDType", bound=np.dtype[Any]) - - -@dataclass(frozen=True, kw_only=True) -class DTypeWrapper(Generic[TDType, TScalar], ABC, Metadata): - name: ClassVar[str] - dtype_cls: ClassVar[type[TDType]] # this class will create a numpy dtype - endianness: Endianness | None = "native" - - def to_dict(self) -> dict[str, JSON]: - return {"name": self.name} - - def cast_value(self: Self, value: object) -> TScalar: - return cast(TScalar, self.unwrap().type(value)) - - @abstractmethod - def default_value(self) -> TScalar: ... - - @classmethod - def check_dtype(cls: type[Self], dtype: TDType) -> TypeGuard[TDType]: - """ - Check that a dtype matches the dtype_cls class attribute - """ - return type(dtype) is cls.dtype_cls - - @classmethod - def wrap(cls: type[Self], dtype: TDType) -> Self: - if cls.check_dtype(dtype): - return cls._wrap_unsafe(dtype) - raise TypeError(f"Invalid dtype: {dtype}. Expected an instance of {cls.dtype_cls}.") - - @classmethod - @abstractmethod - def _wrap_unsafe(cls: type[Self], dtype: TDType) -> Self: - raise NotImplementedError - - def unwrap(self: Self) -> TDType: - endian_str = endianness_to_numpy_str(self.endianness) - return self.dtype_cls().newbyteorder(endian_str) - - def with_endianness(self: Self, endianness: Endianness) -> Self: - return replace(self, endianness=endianness) - - @abstractmethod - def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> JSON: - """ - Convert a single value to JSON-serializable format. Depends on the zarr format. 
- """ - raise NotImplementedError - - @abstractmethod - def from_json_value(self: Self, data: JSON, *, zarr_format: ZarrFormat) -> TScalar: - """ - Read a JSON-serializable value as a numpy scalar - """ - raise NotImplementedError - - -@dataclass(frozen=True, kw_only=True) -class Bool(DTypeWrapper[np.dtypes.BoolDType, np.bool_]): - name = "bool" - dtype_cls: ClassVar[type[np.dtypes.BoolDType]] = np.dtypes.BoolDType - - def default_value(self) -> np.bool_: - return np.False_ - - @classmethod - def _wrap_unsafe(cls, dtype: np.dtypes.BoolDType) -> Self: - return cls() - - def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> bool: - return bool(data) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bool_: - if check_json_bool(data): - return self.unwrap().type(data) - raise TypeError(f"Invalid type: {data}. Expected a boolean.") - - -class IntWrapperBase(DTypeWrapper[TDType, TScalar]): - def default_value(self) -> TScalar: - return self.unwrap().type(0) - - @classmethod - def _wrap_unsafe(cls, dtype: TDType) -> Self: - return cls() - - def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: - return int(data) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TScalar: - if check_json_int(data): - return self.unwrap().type(data) - raise TypeError(f"Invalid type: {data}. 
Expected an integer.") - - -@dataclass(frozen=True, kw_only=True) -class Int8(IntWrapperBase[np.dtypes.Int8DType, np.int8]): - dtype_cls = np.dtypes.Int8DType - name = "int8" - - -@dataclass(frozen=True, kw_only=True) -class UInt8(IntWrapperBase[np.dtypes.UInt8DType, np.uint8]): - dtype_cls = np.dtypes.UInt8DType - name = "uint8" - - -@dataclass(frozen=True, kw_only=True) -class Int16(IntWrapperBase[np.dtypes.Int16DType, np.int16]): - dtype_cls = np.dtypes.Int16DType - name = "int16" - - -@dataclass(frozen=True, kw_only=True) -class UInt16(IntWrapperBase[np.dtypes.UInt16DType, np.uint16]): - dtype_cls = np.dtypes.UInt16DType - name = "uint16" - - -@dataclass(frozen=True, kw_only=True) -class Int32(IntWrapperBase[np.dtypes.Int32DType, np.int32]): - dtype_cls = np.dtypes.Int32DType - name = "int32" - - -@dataclass(frozen=True, kw_only=True) -class UInt32(IntWrapperBase[np.dtypes.UInt32DType, np.uint32]): - dtype_cls = np.dtypes.UInt32DType - name = "uint32" - - -@dataclass(frozen=True, kw_only=True) -class Int64(IntWrapperBase[np.dtypes.Int64DType, np.int64]): - dtype_cls = np.dtypes.Int64DType - name = "int64" - - -@dataclass(frozen=True, kw_only=True) -class UInt64(IntWrapperBase[np.dtypes.UInt64DType, np.uint64]): - dtype_cls = np.dtypes.UInt64DType - name = "uint64" - - -class FloatWrapperBase(DTypeWrapper[TDType, TScalar]): - def default_value(self) -> TScalar: - return self.unwrap().type(0.0) - - @classmethod - def _wrap_unsafe(cls, dtype: TDType) -> Self: - return cls() - - def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> JSONFloat: - return float_to_json(data, zarr_format) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TScalar: - if check_json_float_v2(data): - return self.unwrap().type(float_from_json(data, zarr_format)) - raise TypeError(f"Invalid type: {data}. 
Expected a float.") - - -@dataclass(frozen=True, kw_only=True) -class Float16(FloatWrapperBase[np.dtypes.Float16DType, np.float16]): - dtype_cls = np.dtypes.Float16DType - name = "float16" - - -@dataclass(frozen=True, kw_only=True) -class Float32(FloatWrapperBase[np.dtypes.Float32DType, np.float32]): - dtype_cls = np.dtypes.Float32DType - name = "float32" - - -@dataclass(frozen=True, kw_only=True) -class Float64(FloatWrapperBase[np.dtypes.Float64DType, np.float64]): - dtype_cls = np.dtypes.Float64DType - name = "float64" - - -@dataclass(frozen=True, kw_only=True) -class Complex64(DTypeWrapper[np.dtypes.Complex64DType, np.complex64]): - dtype_cls = np.dtypes.Complex64DType - name = "complex64" - - def default_value(self) -> np.complex64: - return np.complex64(0.0) - - @classmethod - def _wrap_unsafe(cls, dtype: np.dtypes.Complex64DType) -> Self: - return cls() - - def to_json_value( - self, data: np.generic, zarr_format: ZarrFormat - ) -> tuple[JSONFloat, JSONFloat]: - return complex_to_json(data, zarr_format) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.complex64: - if check_json_complex_float_v3(data): - return complex_from_json(data, dtype=self.unwrap(), zarr_format=zarr_format) - raise TypeError(f"Invalid type: {data}. 
Expected a complex float.") - - -@dataclass(frozen=True, kw_only=True) -class Complex128(DTypeWrapper[np.dtypes.Complex128DType, np.complex128]): - dtype_cls = np.dtypes.Complex128DType - name = "complex128" - - def default_value(self) -> np.complex128: - return np.complex128(0.0) - - @classmethod - def _wrap_unsafe(cls, dtype: np.dtypes.Complex128DType) -> Self: - return cls() - - def to_json_value( - self, data: np.generic, zarr_format: ZarrFormat - ) -> tuple[JSONFloat, JSONFloat]: - return complex_to_json(data, zarr_format) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.complex128: - if check_json_complex_float_v3(data): - return complex_from_json(data, dtype=self.unwrap(), zarr_format=zarr_format) - raise TypeError(f"Invalid type: {data}. Expected a complex float.") - - -@dataclass(frozen=True, kw_only=True) -class FlexibleWrapperBase(DTypeWrapper[TDType, TScalar]): - item_size_bits: ClassVar[int] - length: int = 0 - - @classmethod - def _wrap_unsafe(cls, dtype: TDType) -> Self: - return cls(length=dtype.itemsize // (cls.item_size_bits // 8)) - - def unwrap(self) -> TDType: - endianness_code = endianness_to_numpy_str(self.endianness) - return self.dtype_cls(self.length).newbyteorder(endianness_code) - - -@dataclass(frozen=True, kw_only=True) -class FixedLengthAsciiString(FlexibleWrapperBase[np.dtypes.BytesDType, np.bytes_]): - dtype_cls = np.dtypes.BytesDType - name = "numpy.static_byte_string" - item_size_bits = 8 - - def default_value(self) -> np.bytes_: - return np.bytes_(b"") - - def to_dict(self) -> dict[str, JSON]: - return {"name": self.name, "configuration": {"length": self.length}} - - def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: - return base64.standard_b64encode(data).decode("ascii") - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: - if check_json_str(data): - return self.unwrap().type(base64.standard_b64decode(data.encode("ascii"))) - raise 
TypeError(f"Invalid type: {data}. Expected a string.") - - -@dataclass(frozen=True, kw_only=True) -class StaticRawBytes(FlexibleWrapperBase[np.dtypes.VoidDType, np.void]): - dtype_cls = np.dtypes.VoidDType - name = "r*" - item_size_bits = 8 - - def default_value(self) -> np.void: - return self.cast_value(("\x00" * self.length).encode("ascii")) - - def to_dict(self) -> dict[str, JSON]: - return {"name": f"r{self.length * self.item_size_bits}"} - - @classmethod - def check_dtype(cls: type[Self], dtype: TDType) -> TypeGuard[TDType]: - """ - Reject structured dtypes by ensuring that dtype.fields is None - """ - return type(dtype) is cls.dtype_cls and dtype.fields is None - - def unwrap(self) -> np.dtypes.VoidDType: - # this needs to be overridden because numpy does not allow creating a void type - # by invoking np.dtypes.VoidDType directly - endianness_code = endianness_to_numpy_str(self.endianness) - return np.dtype(f"{endianness_code}V{self.length}") - - def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: - return base64.standard_b64encode(data).decode("ascii") - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: - # todo: check that this is well-formed - return self.unwrap().type(base64.standard_b64decode(data)) - - -@dataclass(frozen=True, kw_only=True) -class FixedLengthUnicodeString(FlexibleWrapperBase[np.dtypes.StrDType, np.str_]): - dtype_cls = np.dtypes.StrDType - name = "numpy.static_unicode_string" - item_size_bits = 32 # UCS4 is 32 bits per code point - - def default_value(self) -> np.str_: - return np.str_("") - - def to_dict(self) -> dict[str, JSON]: - return {"name": self.name, "configuration": {"length": self.length}} - - def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: - return str(data) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_: - if not check_json_str(data): - raise TypeError(f"Invalid type: {data}. 
Expected a string.") - return self.unwrap().type(data) - - -if _NUMPY_SUPPORTS_VLEN_STRING: - - @dataclass(frozen=True, kw_only=True) - class VariableLengthString(DTypeWrapper[np.dtypes.StringDType, str]): - dtype_cls = np.dtypes.StringDType - name = "numpy.vlen_string" - - def default_value(self) -> str: - return "" - - @classmethod - def _wrap_unsafe(cls, dtype: np.dtypes.StringDType) -> Self: - return cls() - - def to_dict(self) -> dict[str, JSON]: - return {"name": self.name} - - def unwrap(self) -> np.dtypes.StringDType: - # StringDType does not have endianness, so we ignore it here - return self.dtype_cls() - - def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: - return str(data) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: - if not check_json_str(data): - raise TypeError(f"Invalid type: {data}. Expected a string.") - return self.unwrap().type(data) - -else: - - @dataclass(frozen=True, kw_only=True) - class VariableLengthString(DTypeWrapper[np.dtypes.ObjectDType, str]): - dtype_cls = np.dtypes.ObjectDType - name = "numpy.vlen_string" - endianness: Endianness = field(default=None) - - def default_value(self) -> str: - return "" - - def __post_init__(self) -> None: - if self.endianness is not None: - raise ValueError("VariableLengthString does not support endianness.") - - def to_dict(self) -> dict[str, JSON]: - return {"name": self.name} - - @classmethod - def _wrap_unsafe(cls, dtype: np.dtypes.ObjectDType) -> Self: - return cls() - - def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: - return str(data) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: - """ - String literals pass through - """ - if not check_json_str(data): - raise TypeError(f"Invalid type: {data}. 
Expected a string.") - return data - - -DateUnit = Literal["Y", "M", "W", "D"] -TimeUnit = Literal["h", "m", "s", "ms", "us", "μs", "ns", "ps", "fs", "as"] - - -@dataclass(frozen=True, kw_only=True) -class DateTime64(DTypeWrapper[np.dtypes.DateTime64DType, np.datetime64]): - dtype_cls = np.dtypes.DateTime64DType - name = "numpy/datetime64" - unit: DateUnit | TimeUnit = "s" - - def default_value(self) -> np.datetime64: - return np.datetime64("NaT") - - @classmethod - def _wrap_unsafe(cls, dtype: np.dtypes.DateTime64DType) -> Self: - unit = dtype.name[dtype.name.rfind("[") + 1 : dtype.name.rfind("]")] - return cls(unit=unit) - - def cast_value(self, value: object) -> np.datetime64: - return self.unwrap().type(value, self.unit) - - def unwrap(self) -> np.dtypes.DateTime64DType: - return np.dtype(f"datetime64[{self.unit}]").newbyteorder( - endianness_to_numpy_str(self.endianness) - ) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.datetime64: - if check_json_int(data): - return datetime_from_json(data, self.unit) - raise TypeError(f"Invalid type: {data}. Expected an integer.") - - def to_json_value(self, data: np.datetime64, *, zarr_format: ZarrFormat) -> int: - return datetime_to_json(data) - - -@dataclass(frozen=True, kw_only=True) -class Structured(DTypeWrapper[np.dtypes.VoidDType, np.void]): - dtype_cls = np.dtypes.VoidDType - name = "numpy/struct" - fields: tuple[tuple[str, DTypeWrapper[Any, Any], int], ...] 
- - def default_value(self) -> np.void: - return np.array([0], dtype=self.unwrap())[0] - - @classmethod - def check_dtype(cls, dtype: np.dtypes.DTypeLike) -> TypeGuard[np.dtypes.VoidDType]: - """ - Check that this dtype is a numpy structured dtype - """ - return super().check_dtype(dtype) and dtype.fields is not None - - @classmethod - def _wrap_unsafe(cls, dtype: np.dtypes.VoidDType) -> Self: - fields: list[tuple[str, DTypeWrapper[Any, Any], int]] = [] - - if dtype.fields is None: - raise ValueError("numpy dtype has no fields") - - for key, (dtype_instance, offset) in dtype.fields.items(): - dtype_wrapped = data_type_registry.match_dtype(dtype_instance) - fields.append((key, dtype_wrapped, offset)) - - return cls(fields=tuple(fields)) - - def to_dict(self) -> dict[str, JSON]: - base_dict = super().to_dict() - if base_dict.get("configuration", {}) != {}: - raise ValueError( - "This data type wrapper cannot inherit from a data type wrapper that defines a configuration for its dict serialization" - ) - field_configs = [ - (f_name, f_dtype.to_dict(), f_offset) for f_name, f_dtype, f_offset in self.fields - ] - base_dict["configuration"] = {"fields": field_configs} - return base_dict - - @classmethod - def from_dict(cls, data: dict[str, JSON]) -> Self: - fields = tuple( - (f_name, get_data_type_from_dict(f_dtype), f_offset) - for f_name, f_dtype, f_offset in data["fields"] - ) - return cls(fields=fields) - - def unwrap(self) -> np.dtypes.VoidDType: - return np.dtype([(key, dtype.unwrap()) for (key, dtype, _) in self.fields]) - - def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: - return structured_scalar_to_json(data.tobytes(), zarr_format) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: - if not check_json_str(data): - raise TypeError(f"Invalid type: {data}. 
Expected a string.") - as_bytes = structured_scalar_from_json(data, zarr_format=zarr_format) - dtype = self.unwrap() - return np.array([as_bytes], dtype=dtype.str).view(dtype)[0] - - -def get_data_type_from_numpy(dtype: npt.DTypeLike) -> DTypeWrapper[Any, Any]: - if dtype in (str, "str"): - if _NUMPY_SUPPORTS_VLEN_STRING: - np_dtype = np.dtype("T") - else: - np_dtype = np.dtype("O") - else: - np_dtype = np.dtype(dtype) - data_type_registry.lazy_load() - return data_type_registry.match_dtype(np_dtype) - - -def get_data_type_from_dict(dtype: dict[str, JSON]) -> DTypeWrapper[Any.Any]: - data_type_registry.lazy_load() - dtype_name = dtype["name"] - dtype_cls = data_type_registry.get(dtype_name) - if dtype_cls is None: - raise ValueError(f"No data type class matching name {dtype_name}") - return dtype_cls.from_dict(dtype.get("configuration", {})) - - -def resolve_dtype( - dtype: npt.DTypeLike | DTypeWrapper[Any, Any] | dict[str, JSON], -) -> DTypeWrapper[Any, Any]: - if isinstance(dtype, DTypeWrapper): - return dtype - elif isinstance(dtype, dict): - return get_data_type_from_dict(dtype) - else: - return get_data_type_from_numpy(dtype) - - -def get_data_type_by_name( - dtype: str, configuration: dict[str, JSON] | None = None -) -> DTypeWrapper[Any, Any]: - data_type_registry.lazy_load() - if configuration is None: - _configuration = {} - else: - _configuration = configuration - maybe_dtype_cls = data_type_registry.get(dtype) - if maybe_dtype_cls is None: - raise ValueError(f"No data type class matching name {dtype}") - return maybe_dtype_cls.from_dict(_configuration) - - -@dataclass(frozen=True, kw_only=True) -class DataTypeRegistry: - contents: dict[str, type[DTypeWrapper[Any, Any]]] = field(default_factory=dict, init=False) - lazy_load_list: list[EntryPoint] = field(default_factory=list, init=False) - - def lazy_load(self) -> None: - for e in self.lazy_load_list: - self.register(e.load()) - - self.lazy_load_list.clear() - - def register(self: Self, cls: 
type[DTypeWrapper[Any, Any]]) -> None: - # don't register the same dtype twice - if cls.name not in self.contents or self.contents[cls.name] != cls: - self.contents[cls.name] = cls - - def get(self, key: str) -> type[DTypeWrapper[Any, Any]]: - return self.contents[key] - - def match_dtype(self, dtype: TDType) -> DTypeWrapper[Any, Any]: - self.lazy_load() - for val in self.contents.values(): - try: - return val.wrap(dtype) - except TypeError: - pass - raise ValueError(f"No data type wrapper found that matches dtype '{dtype}'") - - -def register_data_type(cls: type[DTypeWrapper[Any, Any]]) -> None: - data_type_registry.register(cls) - - -data_type_registry = DataTypeRegistry() - -INTEGER_DTYPE = Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64 -FLOAT_DTYPE = Float16 | Float32 | Float64 -COMPLEX_DTYPE = Complex64 | Complex128 -STRING_DTYPE = FixedLengthUnicodeString | VariableLengthString | FixedLengthAsciiString -DTYPE = ( - Bool - | INTEGER_DTYPE - | FLOAT_DTYPE - | COMPLEX_DTYPE - | STRING_DTYPE - | StaticRawBytes - | Structured - | DateTime64 -) -for dtype in get_args(DTYPE): - register_data_type(dtype) diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 3883a998c1..94c69602af 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -8,11 +8,8 @@ import numcodecs.abc from zarr.abc.metadata import Metadata -from zarr.core.metadata.dtype import ( - DTypeWrapper, - Structured, - get_data_type_from_numpy, -) +from zarr.core.dtype import get_data_type_from_numpy +from zarr.core.dtype.wrapper import DTypeWrapper if TYPE_CHECKING: from typing import Any, Literal, Self @@ -82,7 +79,7 @@ def __init__( order_parsed = parse_indexing_order(order) dimension_separator_parsed = parse_separator(dimension_separator) filters_parsed = parse_filters(filters) - fill_value_parsed = parse_fill_value(fill_value, dtype=dtype.unwrap()) + fill_value_parsed = parse_fill_value(fill_value, dtype=dtype.to_dtype()) 
attributes_parsed = parse_attributes(attributes) object.__setattr__(self, "shape", shape_parsed) @@ -125,9 +122,9 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: _data = data.copy() # check that the zarr_format attribute is correct _ = parse_zarr_format(_data.pop("zarr_format")) - dtype = get_data_type_from_numpy(parse_dtype(_data["dtype"])) + dtype = get_data_type_from_numpy(_data["dtype"]) _data["dtype"] = dtype - if dtype.unwrap().kind in "SV": + if dtype.to_dtype().kind in "SV": fill_value_encoded = _data.get("fill_value") if fill_value_encoded is not None: fill_value = base64.standard_b64decode(fill_value_encoded) @@ -181,13 +178,7 @@ def to_dict(self) -> dict[str, JSON]: fill_value = self.dtype.to_json_value(self.fill_value, zarr_format=2) zarray_dict["fill_value"] = fill_value - _ = zarray_dict.pop("dtype") - dtype_json: JSON - if isinstance(self.dtype, Structured): - dtype_json = tuple(self.dtype.unwrap().descr) - else: - dtype_json = self.dtype.unwrap().str - zarray_dict["dtype"] = dtype_json + zarray_dict["dtype"] = self.dtype.get_name(zarr_format=2) return zarray_dict diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index e285490bfd..2c6e65037e 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -4,10 +4,9 @@ from zarr.abc.metadata import Metadata from zarr.core.buffer.core import default_buffer_prototype -from zarr.core.metadata.dtype import ( +from zarr.core.dtype import ( DTypeWrapper, VariableLengthString, - get_data_type_by_name, get_data_type_from_dict, ) @@ -96,7 +95,7 @@ def validate_array_bytes_codec(codecs: tuple[Codec, ...]) -> ArrayBytesCodec: return abcs[0] -def validate_codecs(codecs: tuple[Codec, ...], dtype: DTypeWrapper) -> None: +def validate_codecs(codecs: tuple[Codec, ...], dtype: DTypeWrapper[Any, Any]) -> None: """Check that the codecs are valid for the given dtype""" from zarr.codecs.sharding import ShardingCodec @@ -235,7 +234,7 @@ class 
ArrayV3MetadataDict(TypedDict): @dataclass(frozen=True, kw_only=True) class ArrayV3Metadata(Metadata): shape: ChunkCoords - data_type: DTypeWrapper + data_type: DTypeWrapper[Any, Any] chunk_grid: ChunkGrid chunk_key_encoding: ChunkKeyEncoding fill_value: Any @@ -250,7 +249,7 @@ def __init__( self, *, shape: Iterable[int], - data_type: DTypeWrapper, + data_type: DTypeWrapper[Any, Any], chunk_grid: dict[str, JSON] | ChunkGrid, chunk_key_encoding: ChunkKeyEncodingLike, fill_value: object, @@ -270,14 +269,14 @@ def __init__( chunk_grid_parsed = ChunkGrid.from_dict(chunk_grid) chunk_key_encoding_parsed = ChunkKeyEncoding.from_dict(chunk_key_encoding) dimension_names_parsed = parse_dimension_names(dimension_names) - fill_value_parsed = data_type.unwrap().type(fill_value) + fill_value_parsed = data_type.to_dtype().type(fill_value) attributes_parsed = parse_attributes(attributes) codecs_parsed_partial = parse_codecs(codecs) storage_transformers_parsed = parse_storage_transformers(storage_transformers) array_spec = ArraySpec( shape=shape_parsed, - dtype=data_type.unwrap(), + dtype=data_type.to_dtype(), fill_value=fill_value_parsed, config=ArrayConfig.from_dict({}), # TODO: config is not needed here. prototype=default_buffer_prototype(), # TODO: prototype is not needed here. 
@@ -312,7 +311,7 @@ def _validate_metadata(self) -> None: raise ValueError("`fill_value` is required.") for codec in self.codecs: codec.validate( - shape=self.shape, dtype=self.data_type.unwrap(), chunk_grid=self.chunk_grid + shape=self.shape, dtype=self.data_type.to_dtype(), chunk_grid=self.chunk_grid ) @property @@ -382,9 +381,7 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: data_type_json = _data.pop("data_type") if isinstance(data_type_json, str): - # check that the data_type attribute is valid - data_type = get_data_type_by_name(data_type_json) - + data_type = get_data_type_from_dict({"name": data_type_json}) else: data_type = get_data_type_from_dict(data_type_json) diff --git a/src/zarr/registry.py b/src/zarr/registry.py index 8830cdb1a9..d1fe1d181c 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Any, Generic, TypeVar from zarr.core.config import BadConfigError, config -from zarr.core.metadata.dtype import data_type_registry +from zarr.core.dtype import data_type_registry if TYPE_CHECKING: from importlib.metadata import EntryPoint diff --git a/src/zarr/testing/strategies.py b/src/zarr/testing/strategies.py index aa42329be7..2eef703448 100644 --- a/src/zarr/testing/strategies.py +++ b/src/zarr/testing/strategies.py @@ -15,6 +15,7 @@ from zarr.core.chunk_grids import RegularChunkGrid from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding from zarr.core.common import ZarrFormat +from zarr.core.dtype import parse_data_type from zarr.core.metadata import ArrayV2Metadata, ArrayV3Metadata from zarr.core.sync import sync from zarr.storage import MemoryStore, StoreLike @@ -133,8 +134,9 @@ def array_metadata( shape = draw(array_shapes()) ndim = len(shape) chunk_shape = draw(array_shapes(min_dims=ndim, max_dims=ndim)) - dtype = draw(v3_dtypes()) - fill_value = draw(npst.from_dtype(dtype)) + np_dtype = draw(v3_dtypes()) + dtype = parse_data_type(np_dtype) + fill_value = 
draw(npst.from_dtype(np_dtype)) if zarr_format == 2: return ArrayV2Metadata( shape=shape, diff --git a/tests/conftest.py b/tests/conftest.py index 6ff1c4596f..5e17c82a37 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -20,7 +20,7 @@ from zarr.core.chunk_grids import RegularChunkGrid, _auto_partition from zarr.core.common import JSON, parse_shapelike from zarr.core.config import config as zarr_config -from zarr.core.metadata.dtype import get_data_type_from_numpy +from zarr.core.dtype import get_data_type_from_numpy from zarr.core.metadata.v2 import ArrayV2Metadata from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.core.sync import sync @@ -263,7 +263,7 @@ def create_array_metadata( array_shape=shape_parsed, shard_shape=shards, chunk_shape=chunks, - item_size=dtype_parsed.unwrap().itemsize, + item_size=dtype_parsed.to_dtype().itemsize, ) if order is None: diff --git a/tests/test_array.py b/tests/test_array.py index 5c58b3d3be..f8880c86c0 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -39,10 +39,9 @@ from zarr.core.buffer.cpu import NDBuffer from zarr.core.chunk_grids import _auto_partition from zarr.core.common import JSON, MemoryOrder, ZarrFormat +from zarr.core.dtype import get_data_type_from_numpy from zarr.core.group import AsyncGroup from zarr.core.indexing import BasicIndexer, ceildiv -from zarr.core.metadata.dtype import get_data_type_from_numpy -from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.core.sync import sync from zarr.errors import ContainsArrayError, ContainsGroupError from zarr.storage import LocalStore, MemoryStore, StorePath @@ -50,6 +49,7 @@ if TYPE_CHECKING: from zarr.core.array_spec import ArrayConfigLike from zarr.core.metadata.v2 import ArrayV2Metadata + from zarr.core.metadata.v3 import ArrayV3Metadata @pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=["store"]) @@ -1004,7 +1004,7 @@ async def test_v3_chunk_encoding( filters=filters, compressors=compressors, serializer="auto", 
- dtype=arr.metadata.data_type, + dtype=arr.metadata.data_type, # type: ignore[union-attr] ) assert arr.filters == filters_expected assert arr.compressors == compressors_expected @@ -1119,7 +1119,7 @@ async def test_with_data(impl: Literal["sync", "async"], store: Store) -> None: elif impl == "async": arr = await create_array(store, name=name, data=data, zarr_format=3) stored = await arr._get_selection( - BasicIndexer(..., shape=arr.shape, chunk_grid=arr.metadata.chunk_grid), + BasicIndexer(..., shape=arr.shape, chunk_grid=arr.chunk_grid), prototype=default_buffer_prototype(), ) else: diff --git a/tests/test_codecs/test_vlen.py b/tests/test_codecs/test_vlen.py index a6c01153ff..ee3415a501 100644 --- a/tests/test_codecs/test_vlen.py +++ b/tests/test_codecs/test_vlen.py @@ -8,7 +8,7 @@ from zarr.abc.codec import Codec from zarr.abc.store import Store from zarr.codecs import ZstdCodec -from zarr.core.metadata.dtype import get_data_type_from_numpy +from zarr.core.dtype import get_data_type_from_numpy from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING from zarr.storage import StorePath diff --git a/tests/test_metadata/test_consolidated.py b/tests/test_metadata/test_consolidated.py index a81625b7eb..508519e696 100644 --- a/tests/test_metadata/test_consolidated.py +++ b/tests/test_metadata/test_consolidated.py @@ -18,9 +18,9 @@ open_consolidated, ) from zarr.core.buffer import cpu, default_buffer_prototype +from zarr.core.dtype import parse_data_type from zarr.core.group import ConsolidatedMetadata, GroupMetadata from zarr.core.metadata import ArrayV3Metadata -from zarr.core.metadata.dtype import get_data_type_from_numpy from zarr.core.metadata.v2 import ArrayV2Metadata from zarr.storage import StorePath @@ -504,7 +504,7 @@ async def test_consolidated_metadata_backwards_compatibility( async def test_consolidated_metadata_v2(self): store = zarr.storage.MemoryStore() g = await AsyncGroup.from_store(store, 
attributes={"key": "root"}, zarr_format=2) - dtype = get_data_type_from_numpy("uint8") + dtype = parse_data_type("uint8") await g.create_array(name="a", shape=(1,), attributes={"key": "a"}, dtype=dtype) g1 = await g.create_group(name="g1", attributes={"key": "g1"}) await g1.create_group(name="g2", attributes={"key": "g2"}) diff --git a/tests/test_metadata/test_dtype.py b/tests/test_metadata/test_dtype.py index 8a1bcdedd1..ee19cdf845 100644 --- a/tests/test_metadata/test_dtype.py +++ b/tests/test_metadata/test_dtype.py @@ -5,15 +5,19 @@ import numpy as np import pytest -from zarr.core.metadata.dtype import ( +from zarr.core.dtype import ( DTYPE, + DTypeWrapper, + VariableLengthString, + data_type_registry, +) +from zarr.core.dtype._numpy import ( Bool, Complex64, Complex128, - DataTypeRegistry, DateTime64, - DTypeWrapper, FixedLengthAsciiString, + FixedLengthBytes, FixedLengthUnicodeString, Float16, Float32, @@ -22,15 +26,14 @@ Int16, Int32, Int64, - StaticRawBytes, Structured, UInt8, UInt16, UInt32, UInt64, - VariableLengthString, - data_type_registry, ) +from zarr.core.dtype.common import DataTypeValidationError +from zarr.core.dtype.registry import DataTypeRegistry @pytest.fixture @@ -65,7 +68,7 @@ def dtype_registry() -> DataTypeRegistry: (Complex128, "complex128"), (FixedLengthUnicodeString, "U"), (FixedLengthAsciiString, "S"), - (StaticRawBytes, "V"), + (FixedLengthBytes, "V"), (VariableLengthString, VLEN_STRING_CODE), (Structured, np.dtype([("a", np.float64), ("b", np.int8)])), (DateTime64, "datetime64[s]"), @@ -79,23 +82,23 @@ def test_wrap(wrapper_cls: type[DTypeWrapper[Any, Any]], np_dtype: np.dtype | st """ dt = np.dtype(np_dtype) assert wrapper_cls.dtype_cls is type(dt) - wrapped = wrapper_cls.wrap(dt) + wrapped = wrapper_cls.from_dtype(dt) - with pytest.raises(TypeError, match="Invalid dtype"): - wrapper_cls.wrap("not a dtype") + with pytest.raises(DataTypeValidationError, match="Invalid dtype"): + wrapper_cls.from_dtype("not a dtype") assert 
isinstance(wrapped, wrapper_cls) - assert wrapped.unwrap() == dt + assert wrapped.to_dtype() == dt @pytest.mark.parametrize("wrapper_cls", get_args(DTYPE)) def test_dict_serialization(wrapper_cls: DTYPE) -> None: if issubclass(wrapper_cls, Structured): - instance = wrapper_cls(fields=((("a", Bool(), 0),))) + instance = wrapper_cls(fields=((("a", Bool()),))) else: instance = wrapper_cls() as_dict = instance.to_dict() - assert wrapper_cls.from_dict(data=as_dict.get("configuration", {})) == instance + assert wrapper_cls.from_dict(as_dict) == instance @pytest.mark.parametrize( @@ -116,10 +119,10 @@ def test_dict_serialization(wrapper_cls: DTYPE) -> None: (Complex64(), np.complex64(0)), (Complex128(), np.complex128(0)), (FixedLengthAsciiString(length=3), np.bytes_(b"")), - (StaticRawBytes(length=3), np.void(b"\x00\x00\x00")), + (FixedLengthBytes(length=3), np.void(b"\x00\x00\x00")), (FixedLengthUnicodeString(length=3), np.str_("")), ( - Structured(fields=(("a", Float64(), 0), ("b", Int8(), 8))), + Structured(fields=(("a", Float64()), ("b", Int8()))), np.array([0], dtype=[("a", np.float64), ("b", np.int8)])[0], ), (VariableLengthString(), ""), @@ -154,7 +157,7 @@ def test_default_value(wrapper: type[DTypeWrapper[Any, Any]], expected_default: (Complex64(), np.complex64(42.0 + 1.0j), (42.0, 1.0)), (Complex128(), np.complex128(42.0 + 1.0j), (42.0, 1.0)), (FixedLengthAsciiString(length=4), np.bytes_(b"test"), "dGVzdA=="), - (StaticRawBytes(length=4), np.void(b"test"), "dGVzdA=="), + (FixedLengthBytes(length=4), np.void(b"test"), "dGVzdA=="), (FixedLengthUnicodeString(length=4), np.str_("test"), "test"), (VariableLengthString(), "test", "test"), (DateTime64(unit="s"), np.datetime64("2021-01-01T00:00:00", "s"), 1609459200), @@ -187,7 +190,7 @@ def test_to_json_value_v2( (Complex64(), (42.0, 1.0), np.complex64(42.0 + 1.0j)), (Complex128(), (42.0, 1.0), np.complex128(42.0 + 1.0j)), (FixedLengthAsciiString(length=4), "dGVzdA==", np.bytes_(b"test")), - (StaticRawBytes(length=4), 
"dGVzdA==", np.void(b"test")), + (FixedLengthBytes(length=4), "dGVzdA==", np.void(b"test")), (FixedLengthUnicodeString(length=4), "test", np.str_("test")), (VariableLengthString(), "test", "test"), (DateTime64(unit="s"), 1609459200, np.datetime64("2021-01-01T00:00:00", "s")), @@ -208,8 +211,8 @@ def test_register(dtype_registry: DataTypeRegistry) -> None: """ Test that registering a dtype in a data type registry works. """ - dtype_registry.register(Bool) - assert dtype_registry.get("bool") == Bool + dtype_registry.register(Bool._zarr_v3_name, Bool) + assert dtype_registry.get(Bool._zarr_v3_name) == Bool assert isinstance(dtype_registry.match_dtype(np.dtype("bool")), Bool) @staticmethod @@ -217,13 +220,13 @@ def test_override(dtype_registry: DataTypeRegistry) -> None: """ Test that registering a new dtype with the same name works (overriding the previous one). """ - dtype_registry.register(Bool) + dtype_registry.register(Bool._zarr_v3_name, Bool) class NewBool(Bool): def default_value(self) -> np.bool_: return np.True_ - dtype_registry.register(NewBool) + dtype_registry.register(NewBool._zarr_v3_name, NewBool) assert isinstance(dtype_registry.match_dtype(np.dtype("bool")), NewBool) @staticmethod @@ -236,7 +239,7 @@ def test_match_dtype( """ Test that match_dtype resolves a numpy dtype into an instance of the correspond wrapper for that dtype. """ - dtype_registry.register(wrapper_cls) + dtype_registry.register(wrapper_cls._zarr_v3_name, wrapper_cls) assert isinstance(dtype_registry.match_dtype(np.dtype(dtype_str)), wrapper_cls) @staticmethod @@ -260,8 +263,8 @@ def test_registered_dtypes(wrapper_cls: DTypeWrapper[Any, Any]) -> None: Test that the registered dtypes can be retrieved from the registry. 
""" if issubclass(wrapper_cls, Structured): - instance = wrapper_cls(fields=((("a", Bool(), 0),))) + instance = wrapper_cls(fields=((("a", Bool()),))) else: instance = wrapper_cls() - assert data_type_registry.match_dtype(instance.unwrap()) == instance + assert data_type_registry.match_dtype(instance.to_dtype()) == instance diff --git a/tests/test_metadata/test_v2.py b/tests/test_metadata/test_v2.py index 1c5ddd6f9a..2eec9a6c74 100644 --- a/tests/test_metadata/test_v2.py +++ b/tests/test_metadata/test_v2.py @@ -9,9 +9,9 @@ import zarr.storage from zarr.core.buffer import cpu from zarr.core.buffer.core import default_buffer_prototype +from zarr.core.dtype._numpy import Float32, Float64, Int16 from zarr.core.group import ConsolidatedMetadata, GroupMetadata from zarr.core.metadata import ArrayV2Metadata -from zarr.core.metadata.dtype import Float32, Float64, Int16 from zarr.core.metadata.v2 import parse_zarr_format if TYPE_CHECKING: diff --git a/tests/test_metadata/test_v3.py b/tests/test_metadata/test_v3.py index ea59496280..23f28ab097 100644 --- a/tests/test_metadata/test_v3.py +++ b/tests/test_metadata/test_v3.py @@ -11,8 +11,10 @@ from zarr.core.buffer import default_buffer_prototype from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding, V2ChunkKeyEncoding from zarr.core.config import config +from zarr.core.dtype import get_data_type_from_numpy +from zarr.core.dtype._numpy import DateTime64 +from zarr.core.dtype.common import complex_from_json from zarr.core.group import GroupMetadata, parse_node_type -from zarr.core.metadata.dtype import DateTime64, complex_from_json, get_data_type_from_numpy from zarr.core.metadata.v3 import ( ArrayV3Metadata, parse_dimension_names, @@ -127,7 +129,7 @@ def test_jsonify_fill_value_complex(fill_value: Any, dtype_str: str) -> None: """ zarr_format = 3 dtype = get_data_type_from_numpy(dtype_str) - expected = dtype.unwrap().type(complex(*fill_value)) + expected = dtype.to_dtype().type(complex(*fill_value)) observed = 
dtype.from_json_value(fill_value, zarr_format=zarr_format) assert observed == expected assert dtype.to_json_value(observed, zarr_format=zarr_format) == tuple(fill_value) From e8fd72cbf40ff51c937b81d95321e8de8e57230d Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 13 Mar 2025 14:10:22 +0100 Subject: [PATCH 022/130] start design doc --- docs/user-guide/data_types.rst | 156 +++++++++++++++++++++++++++++++++ docs/user-guide/index.rst | 1 + src/zarr/core/dtype/_numpy.py | 6 +- src/zarr/core/dtype/wrapper.py | 41 ++++----- 4 files changed, 181 insertions(+), 23 deletions(-) create mode 100644 docs/user-guide/data_types.rst diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst new file mode 100644 index 0000000000..19095e1851 --- /dev/null +++ b/docs/user-guide/data_types.rst @@ -0,0 +1,156 @@ +Data types +========== + +Zarr's data type model +---------------------- + +Every Zarr array has a "data type", which defines the meaning and physical layout of the +array's elements. Zarr is heavily influenced by `NumPy `_, and +Zarr arrays can use many of the same data types as numpy arrays:: + >>> import zarr + >>> import numpy as np + >>> zarr.create_array(store={}, shape=(10,), dtype=np.dtype('uint8')) + >>> z + + +But Zarr data types and Numpy data types are also very different in one key respect: +Zarr arrays are designed to be persisted to storage and later read, possibly by Zarr implementations in different programming languages. +So in addition to defining a memory layout for array elements, each Zarr data type defines a procedure for +reading and writing that data type to Zarr array metadata, and also reading and writing **instances** of that data type to +array metadata. + +Data types in Zarr version 2 +----------------------------- + +Version 2 of the Zarr format defined its data types relative to `Numpy's data types `_, and added a few non-Numpy data types as well. 
+Thus the JSON identifer for a Numpy-compatible data type is just the Numpy ``str`` attribute of that dtype: + + >>> import zarr + >>> import numpy as np + >>> import json + >>> np_dtype = np.dtype('int64') + >>> z = zarr.create_array(shape=(1,), dtype=np_dtype, zarr_format=2) + >>> dtype_meta = json.loads(store['.zarray'].to_bytes())["dtype"] + >>> assert dtype_meta == np_dtype.str # True + >>> dtype_meta + , "configuration": {...}}`` + +Data types in Zarr-Python +------------------------- + +Zarr-Python supports two different Zarr formats, and those two formats specify data types in rather different ways: +data types in Zarr version 2 are encoded as Numpy-compatible strings, while data types in Zarr version 3 are encoded as either strings or ``JSON`` objects, +and the Zarr V3 data types don't have any associated endianness information, unlike Zarr V2 data types. + +If that wasn't enough, we want Zarr-Python to support data types beyond what's available in Numpy. So it's crucial that we have a +model of array data types that can adapt to the differences between Zarr V2 and V3 and doesn't over-fit to Numpy. + +Here are the operations we need to perform on data types in Zarr-Python: + +* Round-trip native data types to fields in array metadata documents. + For example, the Numpy data type ``np.dtype('>i2')`` should be saved as ``{..., "dtype" : ">i2"}`` in Zarr V2 metadata. + + In Zarr V3 metadata, the same Numpy data type would be saved as ``{..., "data_type": "int16", "codecs": [..., {"name": "bytes", "configuration": {"endian": "big"}, ...]}`` + +* Define a default fill value. This is not mandated by the Zarr specifications, but it's convenient for users + to have a useful default. For numeric types like integers and floats the default can be statically set to 0, but for + parametric data types like fixed-length strings the default can only be generated after the data type has been parametrized at runtime. 
+ +* Round-trip scalars to the ``fill_value`` field in Zarr V2 and V3 array metadata documents. The Zarr V2 and V3 specifications + define how scalars of each data type should be stored as JSON in array metadata documents, and in principle each data type + can define this encoding separately. + +* Do all of the above for *user-defined data types*. Zarr-Python should support data types added as extensions,so we cannot + hard-code the list of data types. We need to ensure that users can easily (or easily enough) define a python object + that models their custom data type and register this object with Zarr-Python, so that the above operations all succeed for their + custom data type. + +To achieve these goals, Zarr Python uses a class called :class:`zarr.core.dtype.DTypeWrapper` to wrap native data types. Each data type +supported by Zarr Python is modeled by a subclass of `DTypeWrapper`, which has the following structure: + +(attribute) ``dtype_cls`` +^^^^^^^^^^^^^ +The ``dtype_cls`` attribute is a **class variable** that is bound to a class that can produce +an instance of a native data type. For example, on the ``DTypeWrapper`` used to model the boolean +data type, the ``dtype_cls`` attribute is bound to the numpy bool data type class: ``np.dtypes.BoolDType``. +This attribute is used when we need to create an instance of the native data type, for example when +defining a Numpy array that will contain Zarr data. + +It might seem odd that ``DTypeWrapper.dtype_cls`` binds to a *class* that produces a native data type instead of an instance of that native data type -- +why not have a ``DTypeWrapper.dtype`` attribute that binds to ``np.dtypes.BoolDType()``? The reason why ``DTypeWrapper`` +doesn't wrap a concrete data type instance is because data type instances may have endianness information, but Zarr V3 +data types do not. 
To model Zarr V3 data types, we need endianness to be an **instance variable** which is +defined when creating an instance of the ```DTypeWrapper``. Subclasses of ``DTypeWrapper`` that model data types with +byte order semantics thus have ``endianness`` as an instance variable, and this value can be set when creating an instance of the wrapper. + + +(attribute) ``_zarr_v3_name`` +^^^^^^^^^^^^^ +The ``_zarr_v3_name`` attribute encodes the canonical name for a data type for Zarr V3. For many data types these names +are defined in the `Zarr V3 specification https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#data-types`_ For nearly all of the +data types defined in Zarr V3, this name can be used to uniquely specify a data type. The one exception is the ``r*`` data type, +which is parametrized by a number of bits, and so may take the form ``r8``, ``r16``, ... etc. + +(class method) ``from_dtype(cls, dtype) -> Self`` +^^^^^^^^^ +This method defines a procedure for safely converting a native dtype instance into an instance of ``DTypeWrapper``. It should perform +validation of its input to ensure that the native dtype is an instance of the ``dtype_cls`` class attribute, for example. For some +data types, additional checks are needed -- in Numpy "structured" data types and "void" data types use the same class, with different properties. +A ``DTypeWrapper`` that wraps Numpy structured data types must do additional checks to ensure that the input ``dtype`` is actually a structured data type. +If input validation succeeds, this method will call ``_from_dtype_unsafe``. + +(class method) ``_from_dtype_unsafe(cls, dtype) -> Self`` +^^^^^^^^^^ +This method defines the procedure for converting a native data type instance, like ``np.dtype('uint8')``, +into a wrapper class instance. The ``unsafe`` prefix on the method name denotes that this method should not +perform any input validation. Input validation should be done by the routine that calls this method. 
+ +For many data types, creating the wrapper class takes no arguments and so this method can just return ``cls()``. +But for data types with runtime attributes like endianness or length (for fixed-size strings), this ``_from_dtype_unsafe`` +ensures that those attributes of ``dtype`` are mapped on to the correct parameters in the ``DTypeWrapper`` class constructor. + +(method) ``to_dtype(self) -> dtype`` +^^^^^^^ +This method produces a native data type consistent with the properties of the ``DTypeWrapper``. Together +with ``from_dtype``, this method allows round-trip conversion of a native data type in to a wrapper class and then out again. + +That is, for some ``DTypeWrapper`` class ``FooWrapper`` that wraps a native data type called ``foo``, ``FooWrapper.from_dtype(instance_of_foo).to_dtype() == instance_of_foo`` should be true. + +(method) ``to_dict(self) -> dict`` +^^^^^ +This method generates a JSON-serialiazable representation of the wrapped data type which can be stored in +Zarr metadata. + +(method) ``cast_value(self, value: object) -> scalar`` +^^^^^ +Cast a python object to an instance of the wrapped data type. This is used for generating the default +value associated with this data type. + + +(method) ``default_value(self) -> scalar`` +^^^^ +Return the default value for the wrapped data type. Zarr-Python uses this method to generate a default fill value +for an array when a user has not requested one. + +Why is this a method and not a static attribute? Although some data types +can have a static default value, parametrized data types like fixed-length strings or structured data types cannot. For these data types, +a default value must be calculated based on the attributes of the wrapped data type. 
+ +(method) `` + + + diff --git a/docs/user-guide/index.rst b/docs/user-guide/index.rst index c50713332b..ea34ac2561 100644 --- a/docs/user-guide/index.rst +++ b/docs/user-guide/index.rst @@ -8,6 +8,7 @@ User guide installation arrays + data_types groups attributes storage diff --git a/src/zarr/core/dtype/_numpy.py b/src/zarr/core/dtype/_numpy.py index b98cc100e3..362f7f361c 100644 --- a/src/zarr/core/dtype/_numpy.py +++ b/src/zarr/core/dtype/_numpy.py @@ -569,7 +569,7 @@ def check_dtype(cls: type[Self], dtype: TDType) -> TypeGuard[np.dtypes.VoidDType return super().check_dtype(dtype) and dtype.fields is None @classmethod - def check_json(cls, data: dict[str, JSON]) -> TypeGuard[dict[str, JSON]]: + def check_dict(cls, data: dict[str, JSON]) -> TypeGuard[dict[str, JSON]]: # Overriding the base class implementation because the r* dtype # does not have a name that will can appear in array metadata # Instead, array metadata will contain names like "r8", "r16", etc @@ -787,7 +787,7 @@ def to_dict(self) -> dict[str, JSON]: return base_dict @classmethod - def check_json(cls, data: JSON) -> bool: + def check_dict(cls, data: JSON) -> bool: return ( isinstance(data, dict) and "name" in data @@ -797,7 +797,7 @@ def check_json(cls, data: JSON) -> bool: @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: - if cls.check_json(data): + if cls.check_dict(data): from zarr.core.dtype import get_data_type_from_dict fields = tuple( diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index 002bd100e9..eecb1f2562 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -39,24 +39,6 @@ class DTypeWrapper(Generic[TDType, TScalar], ABC, Metadata): dtype_cls: ClassVar[type[TDType]] # type: ignore[misc] _zarr_v3_name: ClassVar[str] - @classmethod - @abstractmethod - def _from_dtype_unsafe(cls: type[Self], dtype: TDType) -> Self: - """ - Wrap a native dtype without checking. 
- - Parameters - ---------- - dtype : TDType - The native dtype to wrap. - - Returns - ------- - Self - The wrapped dtype. - """ - raise NotImplementedError - @classmethod def from_dtype(cls: type[Self], dtype: TDType) -> Self: """ @@ -83,6 +65,25 @@ def from_dtype(cls: type[Self], dtype: TDType) -> Self: f"Invalid dtype: {dtype}. Expected an instance of {cls.dtype_cls}." ) + + @classmethod + @abstractmethod + def _from_dtype_unsafe(cls: type[Self], dtype: TDType) -> Self: + """ + Wrap a native dtype without checking. + + Parameters + ---------- + dtype : TDType + The native dtype to wrap. + + Returns + ------- + Self + The wrapped dtype. + """ + raise NotImplementedError + @abstractmethod def to_dtype(self: Self) -> TDType: """ @@ -158,7 +159,7 @@ def check_dtype(cls: type[Self], dtype: TDType) -> TypeGuard[TDType]: return type(dtype) is cls.dtype_cls @classmethod - def check_json(cls: type[Self], data: dict[str, JSON]) -> TypeGuard[dict[str, JSON]]: + def check_dict(cls: type[Self], data: dict[str, JSON]) -> TypeGuard[dict[str, JSON]]: """ Check that a JSON representation of a data type matches the dtype_cls class attribute. Used as a type guard. This base implementation checks that the input is a dictionary, @@ -192,7 +193,7 @@ def from_dict(cls: type[Self], data: dict[str, JSON]) -> Self: Self The wrapped data type. 
""" - if cls.check_json(data): + if cls.check_dict(data): return cls._from_json_unsafe(data) raise DataTypeValidationError(f"Invalid JSON representation of data type {cls}.") From b22f324bfa787e336e8afd05834fe691939f2a91 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 13 Mar 2025 14:11:03 +0100 Subject: [PATCH 023/130] more design doc --- docs/user-guide/data_types.rst | 90 +++++++++++++++++----------------- 1 file changed, 45 insertions(+), 45 deletions(-) diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index 19095e1851..7a5825bf2f 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -4,19 +4,19 @@ Data types Zarr's data type model ---------------------- -Every Zarr array has a "data type", which defines the meaning and physical layout of the +Every Zarr array has a "data type", which defines the meaning and physical layout of the array's elements. Zarr is heavily influenced by `NumPy `_, and Zarr arrays can use many of the same data types as numpy arrays:: >>> import zarr >>> import numpy as np >>> zarr.create_array(store={}, shape=(10,), dtype=np.dtype('uint8')) >>> z - + -But Zarr data types and Numpy data types are also very different in one key respect: -Zarr arrays are designed to be persisted to storage and later read, possibly by Zarr implementations in different programming languages. -So in addition to defining a memory layout for array elements, each Zarr data type defines a procedure for -reading and writing that data type to Zarr array metadata, and also reading and writing **instances** of that data type to +But Zarr data types and Numpy data types are also very different in one key respect: +Zarr arrays are designed to be persisted to storage and later read, possibly by Zarr implementations in different programming languages. 
+So in addition to defining a memory layout for array elements, each Zarr data type defines a procedure for +reading and writing that data type to Zarr array metadata, and also reading and writing **instances** of that data type to array metadata. Data types in Zarr version 2 @@ -35,11 +35,11 @@ Thus the JSON identifer for a Numpy-compatible data type is just the Numpy ``str >>> dtype_meta i2')`` should be saved as ``{..., "dtype" : ">i2"}`` in Zarr V2 metadata. - + For example, the Numpy data type ``np.dtype('>i2')`` should be saved as ``{..., "dtype" : ">i2"}`` in Zarr V2 metadata. + In Zarr V3 metadata, the same Numpy data type would be saved as ``{..., "data_type": "int16", "codecs": [..., {"name": "bytes", "configuration": {"endian": "big"}, ...]}`` -* Define a default fill value. This is not mandated by the Zarr specifications, but it's convenient for users - to have a useful default. For numeric types like integers and floats the default can be statically set to 0, but for +* Define a default fill value. This is not mandated by the Zarr specifications, but it's convenient for users + to have a useful default. For numeric types like integers and floats the default can be statically set to 0, but for parametric data types like fixed-length strings the default can only be generated after the data type has been parametrized at runtime. * Round-trip scalars to the ``fill_value`` field in Zarr V2 and V3 array metadata documents. The Zarr V2 and V3 specifications define how scalars of each data type should be stored as JSON in array metadata documents, and in principle each data type can define this encoding separately. -* Do all of the above for *user-defined data types*. Zarr-Python should support data types added as extensions,so we cannot - hard-code the list of data types. 
We need to ensure that users can easily (or easily enough) define a python object - that models their custom data type and register this object with Zarr-Python, so that the above operations all succeed for their +* Do all of the above for *user-defined data types*. Zarr-Python should support data types added as extensions,so we cannot + hard-code the list of data types. We need to ensure that users can easily (or easily enough) define a python object + that models their custom data type and register this object with Zarr-Python, so that the above operations all succeed for their custom data type. -To achieve these goals, Zarr Python uses a class called :class:`zarr.core.dtype.DTypeWrapper` to wrap native data types. Each data type -supported by Zarr Python is modeled by a subclass of `DTypeWrapper`, which has the following structure: +To achieve these goals, Zarr Python uses a class called :class:`zarr.core.dtype.DTypeWrapper` to wrap native data types. Each data type +supported by Zarr Python is modeled by a subclass of `DTypeWrapper`, which has the following structure: (attribute) ``dtype_cls`` ^^^^^^^^^^^^^ The ``dtype_cls`` attribute is a **class variable** that is bound to a class that can produce -an instance of a native data type. For example, on the ``DTypeWrapper`` used to model the boolean -data type, the ``dtype_cls`` attribute is bound to the numpy bool data type class: ``np.dtypes.BoolDType``. -This attribute is used when we need to create an instance of the native data type, for example when -defining a Numpy array that will contain Zarr data. +an instance of a native data type. For example, on the ``DTypeWrapper`` used to model the boolean +data type, the ``dtype_cls`` attribute is bound to the numpy bool data type class: ``np.dtypes.BoolDType``. +This attribute is used when we need to create an instance of the native data type, for example when +defining a Numpy array that will contain Zarr data. 
-It might seem odd that ``DTypeWrapper.dtype_cls`` binds to a *class* that produces a native data type instead of an instance of that native data type -- +It might seem odd that ``DTypeWrapper.dtype_cls`` binds to a *class* that produces a native data type instead of an instance of that native data type -- why not have a ``DTypeWrapper.dtype`` attribute that binds to ``np.dtypes.BoolDType()``? The reason why ``DTypeWrapper`` -doesn't wrap a concrete data type instance is because data type instances may have endianness information, but Zarr V3 -data types do not. To model Zarr V3 data types, we need endianness to be an **instance variable** which is -defined when creating an instance of the ```DTypeWrapper``. Subclasses of ``DTypeWrapper`` that model data types with +doesn't wrap a concrete data type instance is because data type instances may have endianness information, but Zarr V3 +data types do not. To model Zarr V3 data types, we need endianness to be an **instance variable** which is +defined when creating an instance of the ```DTypeWrapper``. Subclasses of ``DTypeWrapper`` that model data types with byte order semantics thus have ``endianness`` as an instance variable, and this value can be set when creating an instance of the wrapper. (attribute) ``_zarr_v3_name`` ^^^^^^^^^^^^^ -The ``_zarr_v3_name`` attribute encodes the canonical name for a data type for Zarr V3. For many data types these names +The ``_zarr_v3_name`` attribute encodes the canonical name for a data type for Zarr V3. For many data types these names are defined in the `Zarr V3 specification https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#data-types`_ For nearly all of the data types defined in Zarr V3, this name can be used to uniquely specify a data type. The one exception is the ``r*`` data type, -which is parametrized by a number of bits, and so may take the form ``r8``, ``r16``, ... etc. 
+which is parametrized by a number of bits, and so may take the form ``r8``, ``r16``, ... etc. (class method) ``from_dtype(cls, dtype) -> Self`` ^^^^^^^^^ This method defines a procedure for safely converting a native dtype instance into an instance of ``DTypeWrapper``. It should perform -validation of its input to ensure that the native dtype is an instance of the ``dtype_cls`` class attribute, for example. For some -data types, additional checks are needed -- in Numpy "structured" data types and "void" data types use the same class, with different properties. +validation of its input to ensure that the native dtype is an instance of the ``dtype_cls`` class attribute, for example. For some +data types, additional checks are needed -- in Numpy "structured" data types and "void" data types use the same class, with different properties. A ``DTypeWrapper`` that wraps Numpy structured data types must do additional checks to ensure that the input ``dtype`` is actually a structured data type. -If input validation succeeds, this method will call ``_from_dtype_unsafe``. +If input validation succeeds, this method will call ``_from_dtype_unsafe``. (class method) ``_from_dtype_unsafe(cls, dtype) -> Self`` ^^^^^^^^^^ This method defines the procedure for converting a native data type instance, like ``np.dtype('uint8')``, -into a wrapper class instance. The ``unsafe`` prefix on the method name denotes that this method should not -perform any input validation. Input validation should be done by the routine that calls this method. +into a wrapper class instance. The ``unsafe`` prefix on the method name denotes that this method should not +perform any input validation. Input validation should be done by the routine that calls this method. For many data types, creating the wrapper class takes no arguments and so this method can just return ``cls()``. 
-But for data types with runtime attributes like endianness or length (for fixed-size strings), this ``_from_dtype_unsafe`` +But for data types with runtime attributes like endianness or length (for fixed-size strings), this ``_from_dtype_unsafe`` ensures that those attributes of ``dtype`` are mapped on to the correct parameters in the ``DTypeWrapper`` class constructor. (method) ``to_dtype(self) -> dtype`` ^^^^^^^ -This method produces a native data type consistent with the properties of the ``DTypeWrapper``. Together +This method produces a native data type consistent with the properties of the ``DTypeWrapper``. Together with ``from_dtype``, this method allows round-trip conversion of a native data type in to a wrapper class and then out again. That is, for some ``DTypeWrapper`` class ``FooWrapper`` that wraps a native data type called ``foo``, ``FooWrapper.from_dtype(instance_of_foo).to_dtype() == instance_of_foo`` should be true. -(method) ``to_dict(self) -> dict`` +(method) ``to_dict(self) -> dict`` ^^^^^ -This method generates a JSON-serialiazable representation of the wrapped data type which can be stored in +This method generates a JSON-serialiazable representation of the wrapped data type which can be stored in Zarr metadata. (method) ``cast_value(self, value: object) -> scalar`` ^^^^^ -Cast a python object to an instance of the wrapped data type. This is used for generating the default +Cast a python object to an instance of the wrapped data type. This is used for generating the default value associated with this data type. (method) ``default_value(self) -> scalar`` ^^^^ -Return the default value for the wrapped data type. Zarr-Python uses this method to generate a default fill value -for an array when a user has not requested one. +Return the default value for the wrapped data type. Zarr-Python uses this method to generate a default fill value +for an array when a user has not requested one. -Why is this a method and not a static attribute? 
Although some data types +Why is this a method and not a static attribute? Although some data types can have a static default value, parametrized data types like fixed-length strings or structured data types cannot. For these data types, a default value must be calculated based on the attributes of the wrapped data type. -(method) `` +(method) ``check_dtype(cls, dtype)`` From b7a231e08c978ab0f229957fc6a52cec8aca11a2 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 13 Mar 2025 16:27:06 +0100 Subject: [PATCH 024/130] update docs --- docs/user-guide/data_types.rst | 64 ++++++++++++++++++++++++---------- src/zarr/core/dtype/wrapper.py | 29 ++++++++------- 2 files changed, 59 insertions(+), 34 deletions(-) diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index 7a5825bf2f..83b9870755 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -6,24 +6,24 @@ Zarr's data type model Every Zarr array has a "data type", which defines the meaning and physical layout of the array's elements. Zarr is heavily influenced by `NumPy `_, and -Zarr arrays can use many of the same data types as numpy arrays:: +Zarr-Python supports creating arrays with Numpy data types:: >>> import zarr >>> import numpy as np >>> zarr.create_array(store={}, shape=(10,), dtype=np.dtype('uint8')) >>> z -But Zarr data types and Numpy data types are also very different in one key respect: -Zarr arrays are designed to be persisted to storage and later read, possibly by Zarr implementations in different programming languages. -So in addition to defining a memory layout for array elements, each Zarr data type defines a procedure for +But Zarr data types and Numpy data types are also very different: +Unlike Numpy arrays, Zarr arrays are designed to be persisted to storage and read by Zarr implementations in different programming languages. 
+To ensure that the data type can be interpreted correctly when reading an array, each Zarr data type defines a procedure for reading and writing that data type to Zarr array metadata, and also reading and writing **instances** of that data type to -array metadata. +array metadata, and these serialization procedures depend on the Zarr format. Data types in Zarr version 2 ----------------------------- Version 2 of the Zarr format defined its data types relative to `Numpy's data types `_, and added a few non-Numpy data types as well. -Thus the JSON identifer for a Numpy-compatible data type is just the Numpy ``str`` attribute of that dtype: +Thus the JSON identifier for a Numpy-compatible data type is just the Numpy ``str`` attribute of that dtype: >>> import zarr >>> import numpy as np @@ -113,16 +113,6 @@ data types, additional checks are needed -- in Numpy "structured" data types and A ``DTypeWrapper`` that wraps Numpy structured data types must do additional checks to ensure that the input ``dtype`` is actually a structured data type. If input validation succeeds, this method will call ``_from_dtype_unsafe``. -(class method) ``_from_dtype_unsafe(cls, dtype) -> Self`` -^^^^^^^^^^ -This method defines the procedure for converting a native data type instance, like ``np.dtype('uint8')``, -into a wrapper class instance. The ``unsafe`` prefix on the method name denotes that this method should not -perform any input validation. Input validation should be done by the routine that calls this method. - -For many data types, creating the wrapper class takes no arguments and so this method can just return ``cls()``. -But for data types with runtime attributes like endianness or length (for fixed-size strings), this ``_from_dtype_unsafe`` -ensures that those attributes of ``dtype`` are mapped on to the correct parameters in the ``DTypeWrapper`` class constructor. 
- (method) ``to_dtype(self) -> dtype`` ^^^^^^^ This method produces a native data type consistent with the properties of the ``DTypeWrapper``. Together @@ -137,20 +127,56 @@ Zarr metadata. (method) ``cast_value(self, value: object) -> scalar`` ^^^^^ -Cast a python object to an instance of the wrapped data type. This is used for generating the default +This method converts a python object to an instance of the wrapped data type. It is used for generating the default value associated with this data type. (method) ``default_value(self) -> scalar`` ^^^^ -Return the default value for the wrapped data type. Zarr-Python uses this method to generate a default fill value +This method returns the default value for the wrapped data type. Zarr-Python uses this method to generate a default fill value for an array when a user has not requested one. Why is this a method and not a static attribute? Although some data types can have a static default value, parametrized data types like fixed-length strings or structured data types cannot. For these data types, a default value must be calculated based on the attributes of the wrapped data type. -(method) ``check_dtype(cls, dtype)`` +(class method) ``check_dtype(cls, dtype) -> bool`` +^^^^^ +This class method checks if a native dtype is compatible with the ``DTypeWrapper`` class. It returns ``True`` +if ``dtype`` is compatible with the wrapper class, and ``False`` otherwise. For many data types, this check is as simple +as checking that ``cls.dtype_cls`` matches ``type(dtype)``, i.e. checking that the data type class wrapped +by the ``DTypeWrapper`` is the same as the class of ``dtype``. But there are some data types where this check alone is not sufficient, +in which case this method is overridden so that additional properties of ``dtype`` can be inspected and compared with +the expectations of ``cls``. 
+ +(class method) ``from_dict(cls, dtype) -> Self`` +^^^^ +This class method creates a ``DTypeWrapper`` from an appropriately structured dictionary. The default +implementation first checks that the dictionary has the correct structure, and then uses its data +to instantiate the ``DTypeWrapper`` instance. + +(method) ``to_dict(self) -> dict[str, JSON]`` +^^^ +Returns a dictionary form of the wrapped data type. This is used prior to writing array metadata. +(class method) ``get_name(self, zarr_format: Literal[2, 3]) -> str`` +^^^^ +This method generates a name for the wrapped data type, depending on the Zarr format. If ``zarr_format`` is +2 and the wrapped data type is a Numpy data type, then the Numpy string representation of that data type is returned. +If ``zarr_format`` is 3, then the Zarr V3 name for the wrapped data type is returned. For most data types +the Zarr V3 name will be stored as the ``_zarr_v3_name`` class attribute, but for parametric data types the +name must be computed at runtime based on the parameters of the data type. + + +(method) ``to_json_value(self, data: scalar, zarr_format: Literal[2, 3]) -> JSON`` +^^^ +This method converts a scalar instance of the data type into a JSON-serialiazable value. +For some data types like bool and integers this conversion is simple -- just return a JSON boolean +or number -- but other data types define a JSON serialization for scalars that is a bit more involved. +And this JSON serialization depends on the Zarr format. + +(method) ``from_json_value(self, data: JSON, zarr_format: Literal[2, 3]) -> scalar`` +^^^ +Convert a JSON-serialiazed scalar to a native scalar. This inverts the operation of ``to_json_value``. diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index eecb1f2562..dc3a0cc5d2 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -65,7 +65,6 @@ def from_dtype(cls: type[Self], dtype: TDType) -> Self: f"Invalid dtype: {dtype}. 
Expected an instance of {cls.dtype_cls}." ) - @classmethod @abstractmethod def _from_dtype_unsafe(cls: type[Self], dtype: TDType) -> Self: @@ -96,18 +95,6 @@ def to_dtype(self: Self) -> TDType: """ raise NotImplementedError - @abstractmethod - def to_dict(self) -> dict[str, JSON]: - """ - Convert the wrapped data type to a dictionary. - - Returns - ------- - dict[str, JSON] - The dictionary representation of the wrapped data type - """ - raise NotImplementedError - def cast_value(self: Self, value: object) -> TScalar: """ Cast a value to an instance of the scalar type. @@ -178,6 +165,18 @@ def check_dict(cls: type[Self], data: dict[str, JSON]) -> TypeGuard[dict[str, JS """ return "name" in data and data["name"] == cls._zarr_v3_name + @abstractmethod + def to_dict(self) -> dict[str, JSON]: + """ + Convert the wrapped data type to a dictionary. + + Returns + ------- + dict[str, JSON] + The dictionary representation of the wrapped data type + """ + raise NotImplementedError + @classmethod def from_dict(cls: type[Self], data: dict[str, JSON]) -> Self: """ @@ -194,11 +193,11 @@ def from_dict(cls: type[Self], data: dict[str, JSON]) -> Self: The wrapped data type. """ if cls.check_dict(data): - return cls._from_json_unsafe(data) + return cls._from_dict_unsafe(data) raise DataTypeValidationError(f"Invalid JSON representation of data type {cls}.") @classmethod - def _from_json_unsafe(cls: type[Self], data: dict[str, JSON]) -> Self: + def _from_dict_unsafe(cls: type[Self], data: dict[str, JSON]) -> Self: """ Wrap a JSON representation of a data type. 
From 7dfcd0f6b6334f4e87a9769bc3e950791d498c70 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 13 Mar 2025 16:34:40 +0100 Subject: [PATCH 025/130] fix sphinx warnings --- docs/user-guide/data_types.rst | 42 +++++++++++++++++----------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index 83b9870755..6132eb2376 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -7,11 +7,11 @@ Zarr's data type model Every Zarr array has a "data type", which defines the meaning and physical layout of the array's elements. Zarr is heavily influenced by `NumPy `_, and Zarr-Python supports creating arrays with Numpy data types:: - >>> import zarr - >>> import numpy as np - >>> zarr.create_array(store={}, shape=(10,), dtype=np.dtype('uint8')) - >>> z - +>>> import zarr +>>> import numpy as np +>>> zarr.create_array(store={}, shape=(10,), dtype=np.dtype('uint8')) +>>> z + But Zarr data types and Numpy data types are also very different: Unlike Numpy arrays, Zarr arrays are designed to be persisted to storage and read by Zarr implementations in different programming languages. @@ -36,8 +36,8 @@ Thus the JSON identifier for a Numpy-compatible data type is just the Numpy ``st `_, or "byte order", of the data type. Following Numpy's example, + Zarr version 2 data types associate each data type with an endianness where applicable. Zarr version 3 data types do not store endianness information. In addition to defining a representation of the data type itself (which in the example above was just a simple string ``"`_ For nearly all of the data types defined in Zarr V3, this name can be used to uniquely specify a data type. The one exception is the ``r*`` data type, which is parametrized by a number of bits, and so may take the form ``r8``, ``r16``, ... etc. 
(class method) ``from_dtype(cls, dtype) -> Self`` -^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ This method defines a procedure for safely converting a native dtype instance into an instance of ``DTypeWrapper``. It should perform validation of its input to ensure that the native dtype is an instance of the ``dtype_cls`` class attribute, for example. For some data types, additional checks are needed -- in Numpy "structured" data types and "void" data types use the same class, with different properties. @@ -114,25 +114,25 @@ A ``DTypeWrapper`` that wraps Numpy structured data types must do additional che If input validation succeeds, this method will call ``_from_dtype_unsafe``. (method) ``to_dtype(self) -> dtype`` -^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ This method produces a native data type consistent with the properties of the ``DTypeWrapper``. Together with ``from_dtype``, this method allows round-trip conversion of a native data type in to a wrapper class and then out again. That is, for some ``DTypeWrapper`` class ``FooWrapper`` that wraps a native data type called ``foo``, ``FooWrapper.from_dtype(instance_of_foo).to_dtype() == instance_of_foo`` should be true. (method) ``to_dict(self) -> dict`` -^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ This method generates a JSON-serialiazable representation of the wrapped data type which can be stored in Zarr metadata. (method) ``cast_value(self, value: object) -> scalar`` -^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ This method converts a python object to an instance of the wrapped data type. It is used for generating the default value associated with this data type. (method) ``default_value(self) -> scalar`` -^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ This method returns the default value for the wrapped data type. Zarr-Python uses this method to generate a default fill value for an array when a user has not requested one. 
@@ -141,7 +141,7 @@ can have a static default value, parametrized data types like fixed-length strin a default value must be calculated based on the attributes of the wrapped data type. (class method) ``check_dtype(cls, dtype) -> bool`` -^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ This class method checks if a native dtype is compatible with the ``DTypeWrapper`` class. It returns ``True`` if ``dtype`` is compatible with the wrapper class, and ``False`` otherwise. For many data types, this check is as simple as checking that ``cls.dtype_cls`` matches ``type(dtype)``, i.e. checking that the data type class wrapped @@ -150,17 +150,17 @@ in which case this method is overridden so that additional properties of ``dtype the expectations of ``cls``. (class method) ``from_dict(cls, dtype) -> Self`` -^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ This class method creates a ``DTypeWrapper`` from an appropriately structured dictionary. The default implementation first checks that the dictionary has the correct structure, and then uses its data to instantiate the ``DTypeWrapper`` instance. (method) ``to_dict(self) -> dict[str, JSON]`` -^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Returns a dictionary form of the wrapped data type. This is used prior to writing array metadata. (class method) ``get_name(self, zarr_format: Literal[2, 3]) -> str`` -^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ This method generates a name for the wrapped data type, depending on the Zarr format. If ``zarr_format`` is 2 and the wrapped data type is a Numpy data type, then the Numpy string representation of that data type is returned. If ``zarr_format`` is 3, then the Zarr V3 name for the wrapped data type is returned. For most data types @@ -169,14 +169,14 @@ name must be computed at runtime based on the parameters of the data type. 
(method) ``to_json_value(self, data: scalar, zarr_format: Literal[2, 3]) -> JSON`` -^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ This method converts a scalar instance of the data type into a JSON-serialiazable value. For some data types like bool and integers this conversion is simple -- just return a JSON boolean or number -- but other data types define a JSON serialization for scalars that is a bit more involved. And this JSON serialization depends on the Zarr format. (method) ``from_json_value(self, data: JSON, zarr_format: Literal[2, 3]) -> scalar`` -^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Convert a JSON-serialiazed scalar to a native scalar. This inverts the operation of ``to_json_value``. From 706e6b636cb2428aa13a773b2099b0d0ed405c0c Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 13 Mar 2025 16:59:52 +0100 Subject: [PATCH 026/130] tweak docs --- docs/user-guide/data_types.rst | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index 6132eb2376..94e05de62d 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -7,17 +7,17 @@ Zarr's data type model Every Zarr array has a "data type", which defines the meaning and physical layout of the array's elements. Zarr is heavily influenced by `NumPy `_, and Zarr-Python supports creating arrays with Numpy data types:: ->>> import zarr ->>> import numpy as np ->>> zarr.create_array(store={}, shape=(10,), dtype=np.dtype('uint8')) ->>> z - -But Zarr data types and Numpy data types are also very different: + >>> import zarr + >>> import numpy as np + >>> zarr.create_array(store={}, shape=(10,), dtype=np.dtype('uint8')) + >>> z + + Unlike Numpy arrays, Zarr arrays are designed to be persisted to storage and read by Zarr implementations in different programming languages. 
-To ensure that the data type can be interpreted correctly when reading an array, each Zarr data type defines a procedure for -reading and writing that data type to Zarr array metadata, and also reading and writing **instances** of that data type to -array metadata, and these serialization procedures depend on the Zarr format. +This means Zarr data types must be interpreted correctly when clients read an array. So each Zarr data type defines a procedure for +encoding / decoding that data type to / from Zarr array metadata, and also encoding / decoding **instances** of that data type to / from +array metadata. These serialization procedures depend on the Zarr format. Data types in Zarr version 2 ----------------------------- @@ -56,7 +56,7 @@ Zarr-Python supports two different Zarr formats, and those two formats specify d data types in Zarr version 2 are encoded as Numpy-compatible strings, while data types in Zarr version 3 are encoded as either strings or ``JSON`` objects, and the Zarr V3 data types don't have any associated endianness information, unlike Zarr V2 data types. -If that wasn't enough, we want Zarr-Python to support data types beyond what's available in Numpy. So it's crucial that we have a +We also want Zarr-Python to support data types beyond what's available in Numpy. So it's crucial that we have a model of array data types that can adapt to the differences between Zarr V2 and V3 and doesn't over-fit to Numpy. 
Here are the operations we need to perform on data types in Zarr-Python: From 8fbf67347d41d584226652637eccc0e1cd000333 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 13 Mar 2025 17:05:49 +0100 Subject: [PATCH 027/130] info about v3 data types --- docs/user-guide/data_types.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index 94e05de62d..2c6a98753c 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -46,8 +46,9 @@ as are floats, with the caveat that `NaN`, positive infinity, and negative infin Data types in Zarr version 3 ---------------------------- +* Data type names are different -- Zarr V2 represented the 16 bit unsigned integer data type as ``>i2``; Zarr V3 represents the same data type as ``int16``. * No endianness -* Data type can be encoded as a string or a ``JSON`` object with the structure ``{"name": , "configuration": {...}}`` +* A data type can be encoded in metadata as a string or a ``JSON`` object with the structure ``{"name": , "configuration": {...}}`` Data types in Zarr-Python ------------------------- From e9aff64055aafb6b833b126bdce5dfabed5ed69f Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 13 Mar 2025 17:12:45 +0100 Subject: [PATCH 028/130] adjust note --- docs/user-guide/data_types.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index 2c6a98753c..8fcfaac794 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -37,7 +37,7 @@ Thus the JSON identifier for a Numpy-compatible data type is just the Numpy ``st .. note:: The ``<`` character in the data type metadata encodes the `endianness `_, or "byte order", of the data type. Following Numpy's example, - Zarr version 2 data types associate each data type with an endianness where applicable. 
Zarr version 3 data types do not store endianness information. + in Zarr version 2 each data type has an endianness where applicable. However, Zarr version 3 data types do not store endianness information. In addition to defining a representation of the data type itself (which in the example above was just a simple string ``" Date: Thu, 13 Mar 2025 17:41:56 +0100 Subject: [PATCH 029/130] fix: use unparametrized types in direct assignment --- src/zarr/core/dtype/_numpy.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/zarr/core/dtype/_numpy.py b/src/zarr/core/dtype/_numpy.py index 362f7f361c..caf46bb216 100644 --- a/src/zarr/core/dtype/_numpy.py +++ b/src/zarr/core/dtype/_numpy.py @@ -524,7 +524,7 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: @dataclass(frozen=True, kw_only=True) class FixedLengthBytes(DTypeWrapper[np.dtypes.VoidDType[Any], np.void]): - dtype_cls = np.dtypes.VoidDType[Any] + dtype_cls = np.dtypes.VoidDType _zarr_v3_name = "r*" item_size_bits: ClassVar[int] = 8 length: int = 1 @@ -591,8 +591,8 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: @dataclass(frozen=True, kw_only=True) class FixedLengthUnicodeString(DTypeWrapper[np.dtypes.StrDType[int], np.str_]): - dtype_cls = np.dtypes.StrDType[int] - _zarr_v3_name = "numpy.static_unicode_string" + dtype_cls = np.dtypes.StrDType + _zarr_v3_name = "numpy.fixed_length_unicode_string" item_size_bits: ClassVar[int] = 32 # UCS4 is 32 bits per code point endianness: Endianness | None = "native" length: int = 1 From 60cac0496b353244194e483d661af7934c059fdd Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 17 Mar 2025 10:04:13 +0100 Subject: [PATCH 030/130] start fixing config --- src/zarr/core/array.py | 19 +++++++---------- src/zarr/core/config.py | 39 +++++++++++++---------------------- src/zarr/core/dtype/_numpy.py | 4 ++-- 3 files changed, 23 insertions(+), 39 deletions(-) diff --git 
a/src/zarr/core/array.py b/src/zarr/core/array.py index a060bcbfae..465a2b6cc8 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -70,7 +70,6 @@ from zarr.core.dtype import ( DTypeWrapper, FixedLengthAsciiString, - FixedLengthUnicodeString, VariableLengthString, parse_data_type, ) @@ -4248,19 +4247,15 @@ def _get_default_chunk_encoding_v2( """ Get the default chunk encoding for Zarr format 2 arrays, given a dtype """ - from numcodecs import VLenBytes as numcodecs_VLenBytes - from numcodecs import VLenUTF8 as numcodecs_VLenUTF8 - from numcodecs import Zstd as numcodecs_zstd - - if isinstance(dtype, VariableLengthString | FixedLengthUnicodeString): - filters = (numcodecs_VLenUTF8(),) - elif isinstance(dtype, FixedLengthAsciiString): - filters = (numcodecs_VLenBytes(),) + if dtype._zarr_v3_name in zarr_config.get("array.v2_default_filters"): + filters = zarr_config.get(f"array.v2_default_filters.{dtype._zarr_v3_name}") else: - filters = None - - compressor = numcodecs_zstd(level=0, checksum=False) + filters = zarr_config.get("array.v2_default_filters.default") + if dtype._zarr_v3_name in zarr_config.get("array.v2_default_compressor"): + compressor = zarr_config.get(f"array.v2_default_compressor.{dtype._zarr_v3_name}") + else: + compressor = zarr_config.get("array.v2_default_compressor.default") return filters, compressor diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 98252f572c..71c311d7d5 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -36,6 +36,8 @@ if TYPE_CHECKING: from donfig.config_obj import ConfigSet +from collections import defaultdict + class BadConfigError(ValueError): _msg = "bad Config: %r" @@ -77,37 +79,24 @@ def enable_gpu(self) -> ConfigSet: "array": { "order": "C", "write_empty_chunks": False, - "v2_default_compressor": { - "numeric": {"id": "zstd", "level": 0, "checksum": False}, - "string": {"id": "zstd", "level": 0, "checksum": False}, - "bytes": {"id": "zstd", "level": 0, "checksum": 
False}, - }, + "v2_default_compressor": {"default": {"id": "zstd", "level": 0, "checksum": False}}, "v2_default_filters": { - "numeric": None, - "string": [{"id": "vlen-utf8"}], - "bytes": [{"id": "vlen-bytes"}], - "raw": None, + "default": None, + "numpy.variable_length_unicode_string": [{"id": "vlen-utf8"}], + "numpy.fixed_length_unicode_string": [{"id": "vlen-utf8"}], + "r*": [{"id": "vlen-bytes"}], }, - "v3_default_filters": {"boolean": [], "numeric": [], "string": [], "bytes": []}, + "v3_default_filters": defaultdict(list), "v3_default_serializer": { - "boolean": {"name": "bytes", "configuration": {"endian": "little"}}, - "numeric": {"name": "bytes", "configuration": {"endian": "little"}}, - "string": {"name": "vlen-utf8"}, - "bytes": {"name": "vlen-bytes"}, + "default": {"name": "bytes", "configuration": {"endian": "little"}}, + "numpy.variable_length_unicode_string": [{"name": "vlen-utf8"}], + "numpy.fixed_length_unicode_string": [{"name": "vlen-utf8"}], + "r*": {"name": "vlen-bytes"}, }, "v3_default_compressors": { - "boolean": [ - {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, - ], - "numeric": [ - {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, - ], - "string": [ - {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, - ], - "bytes": [ + "default": [ {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, - ], + ] }, }, "async": {"concurrency": 10, "timeout": None}, diff --git a/src/zarr/core/dtype/_numpy.py b/src/zarr/core/dtype/_numpy.py index caf46bb216..d61fedd4ab 100644 --- a/src/zarr/core/dtype/_numpy.py +++ b/src/zarr/core/dtype/_numpy.py @@ -627,7 +627,7 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_: @dataclass(frozen=True, kw_only=True) class VariableLengthString(DTypeWrapper[np.dtypes.StringDType, str]): dtype_cls = np.dtypes.StringDType - _zarr_v3_name = "numpy.vlen_string" + _zarr_v3_name = "numpy.variable_length_string" @classmethod def 
_from_dtype_unsafe(cls, dtype: np.dtypes.StringDType) -> Self: @@ -658,7 +658,7 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: @dataclass(frozen=True, kw_only=True) class VariableLengthString(DTypeWrapper[np.dtypes.ObjectDType, str]): dtype_cls = np.dtypes.ObjectDType - _zarr_v3_name = "numpy.vlen_string" + _zarr_v3_name = "numpy.variable_length_string" @classmethod def _from_dtype_unsafe(cls, dtype: np.dtypes.ObjectDType) -> Self: From 120df57d6aa633c2d290db7013ce94a85c79622d Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Mon, 17 Mar 2025 10:12:38 +0100 Subject: [PATCH 031/130] Update src/zarr/core/_info.py Co-authored-by: Joe Hamman --- src/zarr/core/_info.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/zarr/core/_info.py b/src/zarr/core/_info.py index a632b8c602..3a3a3a5714 100644 --- a/src/zarr/core/_info.py +++ b/src/zarr/core/_info.py @@ -9,8 +9,6 @@ from zarr.core.common import ZarrFormat from zarr.core.dtype.wrapper import DTypeWrapper -# from zarr.core.metadata.v3 import DataType - @dataclasses.dataclass(kw_only=True) class GroupInfo: From 0d9922b5bb71be891764888d223684b8ff8f63e5 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 17 Mar 2025 12:20:39 +0100 Subject: [PATCH 032/130] add placeholder disclaimer to v3 data types summary --- docs/user-guide/data_types.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index 8fcfaac794..91cbeb1d7f 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -45,7 +45,7 @@ as are floats, with the caveat that `NaN`, positive infinity, and negative infin Data types in Zarr version 3 ---------------------------- - +(note: placeholder text) * Data type names are different -- Zarr V2 represented the 16 bit unsigned integer data type as ``>i2``; Zarr V3 represents the same data type as ``int16``. 
* No endianness * A data type can be encoded in metadata as a string or a ``JSON`` object with the structure ``{"name": , "configuration": {...}}`` From 207595251f6ba9881972411b9943a01c0f7311e8 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 17 Mar 2025 12:22:53 +0100 Subject: [PATCH 033/130] make example runnable --- docs/user-guide/data_types.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index 91cbeb1d7f..7039d1850a 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -28,8 +28,9 @@ Thus the JSON identifier for a Numpy-compatible data type is just the Numpy ``st >>> import zarr >>> import numpy as np >>> import json + >>> store = {} >>> np_dtype = np.dtype('int64') - >>> z = zarr.create_array(shape=(1,), dtype=np_dtype, zarr_format=2) + >>> z = zarr.create_array(store=store, shape=(1,), dtype=np_dtype, zarr_format=2) >>> dtype_meta = json.loads(store['.zarray'].to_bytes())["dtype"] >>> assert dtype_meta == np_dtype.str # True >>> dtype_meta From 44369d68b5ca0d647dbff497c297b426cbaa3108 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 17 Mar 2025 12:25:31 +0100 Subject: [PATCH 034/130] placeholder section for adding a custom dtype --- docs/user-guide/data_types.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index 7039d1850a..352e967c87 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -181,4 +181,7 @@ And this JSON serialization depends on the Zarr format. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Convert a JSON-serialiazed scalar to a native scalar. This inverts the operation of ``to_json_value``. 
+Using a custom data type +------------------------ +TODO \ No newline at end of file From 4f3381f12d2ed72bdf2a6b4449d6dece5e656989 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 17 Mar 2025 12:38:46 +0100 Subject: [PATCH 035/130] define native data type and native scalar --- docs/user-guide/data_types.rst | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index 352e967c87..fffd622209 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -58,21 +58,23 @@ Zarr-Python supports two different Zarr formats, and those two formats specify d data types in Zarr version 2 are encoded as Numpy-compatible strings, while data types in Zarr version 3 are encoded as either strings or ``JSON`` objects, and the Zarr V3 data types don't have any associated endianness information, unlike Zarr V2 data types. -We also want Zarr-Python to support data types beyond what's available in Numpy. So it's crucial that we have a -model of array data types that can adapt to the differences between Zarr V2 and V3 and doesn't over-fit to Numpy. +We aspire for Zarr-Python to eventually be array-library-agnostic. +In the context of data types, this means that we should not design an API that overfits to Numpy's data types. +We will use the term "native data type" to refer to a data type used by any external array library (including Numpy), e.g. ``np.dtypes.Float64DType()``. +We will also use the term "native scalar" or "native scalar type" to refer to a scalar value of a native data type. For example, ``np.float64(0)`` generates a scalar with the data type ``np.dtypes.Float64DType``. -Here are the operations we need to perform on data types in Zarr-Python: +Zarr-Python needs to support the following operations on native data types: * Round-trip native data types to fields in array metadata documents.
For example, the Numpy data type ``np.dtype('>i2')`` should be saved as ``{..., "dtype" : ">i2"}`` in Zarr V2 metadata. In Zarr V3 metadata, the same Numpy data type would be saved as ``{..., "data_type": "int16", "codecs": [..., {"name": "bytes", "configuration": {"endian": "big"}, ...]}`` -* Define a default fill value. This is not mandated by the Zarr specifications, but it's convenient for users +* Associate a default fill value with a native data type. This is not mandated by the Zarr specifications, but it's convenient for users to have a useful default. For numeric types like integers and floats the default can be statically set to 0, but for parametric data types like fixed-length strings the default can only be generated after the data type has been parametrized at runtime. -* Round-trip scalars to the ``fill_value`` field in Zarr V2 and V3 array metadata documents. The Zarr V2 and V3 specifications +* Round-trip native scalars to the ``fill_value`` field in Zarr V2 and V3 array metadata documents. The Zarr V2 and V3 specifications define how scalars of each data type should be stored as JSON in array metadata documents, and in principle each data type can define this encoding separately. 
From c8d76800a7fb5742b8d02f5ba143df620dd66c35 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 17 Mar 2025 14:32:12 +0100 Subject: [PATCH 036/130] update data type names --- src/zarr/core/array.py | 41 ++++++++++++++++++++--------------- src/zarr/core/config.py | 14 +++++------- src/zarr/core/dtype/_numpy.py | 33 +++++++++++++++------------- 3 files changed, 48 insertions(+), 40 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 465a2b6cc8..7e2d65f5bc 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -30,9 +30,6 @@ from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec from zarr.abc.store import Store, set_or_delete from zarr.codecs._v2 import V2Codec -from zarr.codecs.bytes import BytesCodec -from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec -from zarr.codecs.zstd import ZstdCodec from zarr.core._info import ArrayInfo from zarr.core.array_spec import ArrayConfig, ArrayConfigLike, ArraySpec, parse_array_config from zarr.core.attributes import Attributes @@ -69,8 +66,6 @@ from zarr.core.config import config as zarr_config from zarr.core.dtype import ( DTypeWrapper, - FixedLengthAsciiString, - VariableLengthString, parse_data_type, ) from zarr.core.indexing import ( @@ -4224,21 +4219,29 @@ def _get_default_chunk_encoding_v3( """ Get the default ArrayArrayCodecs, ArrayBytesCodec, and BytesBytesCodec for a given dtype. 
""" - filters = () - compressors = (ZstdCodec(level=0, checksum=False),) # TODO: find a registry-style solution for this that isn't bloated # We need to associate specific dtypes with specific encoding schemes - if isinstance(dtype, VariableLengthString): - serializer = VLenUTF8Codec() - elif isinstance(dtype, FixedLengthAsciiString): - serializer = VLenBytesCodec() + if dtype._zarr_v3_name in zarr_config.get("array.v3_default_filters"): + filters = zarr_config.get(f"array.v3_default_filters.{dtype._zarr_v3_name}") else: - if dtype.to_dtype().itemsize == 1: - serializer = BytesCodec(endian=None) - else: - serializer = BytesCodec() - return filters, serializer, compressors + filters = zarr_config.get("array.v3_default_filters.default") + + if dtype._zarr_v3_name in zarr_config.get("array.v3_default_compressors"): + compressors = zarr_config.get(f"array.v3_default_compressors.{dtype._zarr_v3_name}") + else: + compressors = zarr_config.get("array.v3_default_compressors.default") + + if dtype._zarr_v3_name in zarr_config.get("array.v3_default_serializer"): + serializer = zarr_config.get(f"array.v3_default_serializer.{dtype._zarr_v3_name}") + else: + serializer = zarr_config.get("array.v3_default_serializer.default") + + return ( + tuple(_parse_array_array_codec(f) for f in filters), + _parse_array_bytes_codec(serializer), + tuple(_parse_bytes_bytes_codec(c) for c in compressors), + ) def _get_default_chunk_encoding_v2( @@ -4256,7 +4259,11 @@ def _get_default_chunk_encoding_v2( compressor = zarr_config.get(f"array.v2_default_compressor.{dtype._zarr_v3_name}") else: compressor = zarr_config.get("array.v2_default_compressor.default") - return filters, compressor + + if filters is not None: + filters = tuple(numcodecs.get_codec(f) for f in filters) + + return filters, numcodecs.get_codec(compressor) def _parse_chunk_encoding_v2( diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 71c311d7d5..aa4dde049e 100644 --- a/src/zarr/core/config.py +++ 
b/src/zarr/core/config.py @@ -36,8 +36,6 @@ if TYPE_CHECKING: from donfig.config_obj import ConfigSet -from collections import defaultdict - class BadConfigError(ValueError): _msg = "bad Config: %r" @@ -82,15 +80,15 @@ def enable_gpu(self) -> ConfigSet: "v2_default_compressor": {"default": {"id": "zstd", "level": 0, "checksum": False}}, "v2_default_filters": { "default": None, - "numpy.variable_length_unicode_string": [{"id": "vlen-utf8"}], - "numpy.fixed_length_unicode_string": [{"id": "vlen-utf8"}], - "r*": [{"id": "vlen-bytes"}], + "variable_length_utf8": [{"id": "vlen-utf8"}], + "fixed_length_ucs4": [{"id": "vlen-utf8"}], + "fixed_length_ascii": [{"id": "vlen-bytes"}], }, - "v3_default_filters": defaultdict(list), + "v3_default_filters": {"default": ()}, "v3_default_serializer": { "default": {"name": "bytes", "configuration": {"endian": "little"}}, - "numpy.variable_length_unicode_string": [{"name": "vlen-utf8"}], - "numpy.fixed_length_unicode_string": [{"name": "vlen-utf8"}], + "variable_length_utf8": {"name": "vlen-utf8"}, + "fixed_length_ucs4": {"name": "vlen-utf8"}, "r*": {"name": "vlen-bytes"}, }, "v3_default_compressors": { diff --git a/src/zarr/core/dtype/_numpy.py b/src/zarr/core/dtype/_numpy.py index d61fedd4ab..fa97503795 100644 --- a/src/zarr/core/dtype/_numpy.py +++ b/src/zarr/core/dtype/_numpy.py @@ -496,7 +496,7 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.complex1 @dataclass(frozen=True, kw_only=True) class FixedLengthAsciiString(DTypeWrapper[np.dtypes.BytesDType[Any], np.bytes_]): dtype_cls = np.dtypes.BytesDType - _zarr_v3_name = "numpy.static_byte_string" + _zarr_v3_name = "fixed_length_ascii" item_size_bits: ClassVar[int] = 8 length: int = 1 @@ -523,20 +523,20 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: @dataclass(frozen=True, kw_only=True) -class FixedLengthBytes(DTypeWrapper[np.dtypes.VoidDType[Any], np.void]): +class FixedLengthBytes(DTypeWrapper[np.dtypes.VoidDType, 
np.void]): dtype_cls = np.dtypes.VoidDType _zarr_v3_name = "r*" item_size_bits: ClassVar[int] = 8 length: int = 1 @classmethod - def _from_dtype_unsafe(cls, dtype: np.dtypes.VoidDType[Any]) -> Self: + def _from_dtype_unsafe(cls, dtype: np.dtypes.VoidDType) -> Self: return cls(length=dtype.itemsize // (cls.item_size_bits // 8)) def default_value(self) -> np.void: return self.cast_value(("\x00" * self.length).encode("ascii")) - def to_dtype(self) -> np.dtypes.VoidDType[Any]: + def to_dtype(self) -> np.dtypes.VoidDType: # Numpy does not allow creating a void type # by invoking np.dtypes.VoidDType directly return np.dtype(f"V{self.length}") @@ -577,7 +577,7 @@ def check_dict(cls, data: dict[str, JSON]) -> TypeGuard[dict[str, JSON]]: isinstance(data, dict) and "name" in data and isinstance(data["name"], str) - and re.match(r"^r\d+$", data["name"]) + and (re.match(r"^r\d+$", data["name"]) is not None) ) def to_json_value(self, data: np.void, *, zarr_format: ZarrFormat) -> str: @@ -592,7 +592,7 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: @dataclass(frozen=True, kw_only=True) class FixedLengthUnicodeString(DTypeWrapper[np.dtypes.StrDType[int], np.str_]): dtype_cls = np.dtypes.StrDType - _zarr_v3_name = "numpy.fixed_length_unicode_string" + _zarr_v3_name = "fixed_length_ucs4" item_size_bits: ClassVar[int] = 32 # UCS4 is 32 bits per code point endianness: Endianness | None = "native" length: int = 1 @@ -605,7 +605,10 @@ def _from_dtype_unsafe(cls, dtype: np.dtypes.StrDType[int]) -> Self: ) def to_dtype(self) -> np.dtypes.StrDType[int]: - return self.dtype_cls(self.length).newbyteorder(endianness_to_numpy_str(self.endianness)) + return cast( + np.dtypes.StrDType[int], + self.dtype_cls(self.length).newbyteorder(endianness_to_numpy_str(self.endianness)), + ) def default_value(self) -> np.str_: return np.str_("") @@ -627,7 +630,7 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_: @dataclass(frozen=True, 
kw_only=True) class VariableLengthString(DTypeWrapper[np.dtypes.StringDType, str]): dtype_cls = np.dtypes.StringDType - _zarr_v3_name = "numpy.variable_length_string" + _zarr_v3_name = "variable_length_utf8" @classmethod def _from_dtype_unsafe(cls, dtype: np.dtypes.StringDType) -> Self: @@ -658,14 +661,14 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: @dataclass(frozen=True, kw_only=True) class VariableLengthString(DTypeWrapper[np.dtypes.ObjectDType, str]): dtype_cls = np.dtypes.ObjectDType - _zarr_v3_name = "numpy.variable_length_string" + _zarr_v3_name = "variable_length_utf8" @classmethod def _from_dtype_unsafe(cls, dtype: np.dtypes.ObjectDType) -> Self: return cls() def to_dtype(self) -> np.dtypes.ObjectDType: - return self.dtype_cls() + return cast(np.dtypes.ObjectDType, self.dtype_cls()) def cast_value(self, value: object) -> str: return str(value) @@ -695,7 +698,7 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: @dataclass(frozen=True, kw_only=True) class DateTime64(DTypeWrapper[np.dtypes.DateTime64DType, np.datetime64]): dtype_cls = np.dtypes.DateTime64DType - _zarr_v3_name = "numpy.datetime64" + _zarr_v3_name = "datetime64" unit: DateUnit | TimeUnit = "s" endianness: Endianness = "native" @@ -713,7 +716,7 @@ def _from_dtype_unsafe(cls, dtype: np.dtypes.DateTime64DType) -> Self: return cls(unit=unit, endianness=endianness_from_numpy_str(dtype.byteorder)) def cast_value(self, value: object) -> np.datetime64: - return self.to_dtype().type(value, self.unit) + return cast(np.datetime64, self.to_dtype().type(value, self.unit)) def to_dtype(self) -> np.dtypes.DateTime64DType: # Numpy does not allow creating datetime64 via @@ -734,14 +737,14 @@ def to_json_value(self, data: np.datetime64, *, zarr_format: ZarrFormat) -> int: @dataclass(frozen=True, kw_only=True) class Structured(DTypeWrapper[np.dtypes.VoidDType, np.void]): dtype_cls = np.dtypes.VoidDType - _zarr_v3_name = "numpy.structured" + _zarr_v3_name = 
"structured" fields: tuple[tuple[str, DTypeWrapper[Any, Any]], ...] def default_value(self) -> np.void: return self.cast_value(0) def cast_value(self, value: object) -> np.void: - return np.array([value], dtype=self.to_dtype())[0] + return cast(np.void, np.array([value], dtype=self.to_dtype())[0]) @classmethod def check_dtype(cls, dtype: np.dtypes.DTypeLike) -> TypeGuard[np.dtypes.VoidDType]: @@ -787,7 +790,7 @@ def to_dict(self) -> dict[str, JSON]: return base_dict @classmethod - def check_dict(cls, data: JSON) -> bool: + def check_dict(cls, data: JSON) -> TypeGuard[JSON]: return ( isinstance(data, dict) and "name" in data From 2a7b5a8cead0dc5c74525248b7a27058846f091b Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 17 Mar 2025 16:11:26 +0100 Subject: [PATCH 037/130] fix config test failures --- src/zarr/core/array.py | 10 +++++- src/zarr/core/config.py | 2 +- src/zarr/core/dtype/_numpy.py | 2 +- tests/test_array.py | 19 +++++++--- tests/test_config.py | 67 +++++++++++++++-------------------- 5 files changed, 53 insertions(+), 47 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 7e2d65f5bc..0fa25c3695 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -30,6 +30,7 @@ from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec from zarr.abc.store import Store, set_or_delete from zarr.codecs._v2 import V2Codec +from zarr.codecs.bytes import BytesCodec from zarr.core._info import ArrayInfo from zarr.core.array_spec import ArrayConfig, ArrayConfigLike, ArraySpec, parse_array_config from zarr.core.attributes import Attributes @@ -4231,7 +4232,6 @@ def _get_default_chunk_encoding_v3( compressors = zarr_config.get(f"array.v3_default_compressors.{dtype._zarr_v3_name}") else: compressors = zarr_config.get("array.v3_default_compressors.default") - if dtype._zarr_v3_name in zarr_config.get("array.v3_default_serializer"): serializer = 
zarr_config.get(f"array.v3_default_serializer.{dtype._zarr_v3_name}") else: @@ -4353,6 +4353,14 @@ def _parse_chunk_encoding_v3( out_bytes_bytes = tuple(_parse_bytes_bytes_codec(c) for c in maybe_bytes_bytes) + # specialize codecs as needed given the dtype + + # TODO: refactor so that the config only contains the name of the codec, and we use the dtype + # to create the codec instance, instead of storing a dict representation of a full codec. + + if isinstance(out_array_bytes, BytesCodec) and dtype.to_dtype().itemsize == 1: + # The default endianness in the bytescodec might not be None, so we need to replace it + out_array_bytes = replace(out_array_bytes, endian=None) return out_array_array, out_array_bytes, out_bytes_bytes diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index aa4dde049e..054316fd37 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -84,7 +84,7 @@ def enable_gpu(self) -> ConfigSet: "fixed_length_ucs4": [{"id": "vlen-utf8"}], "fixed_length_ascii": [{"id": "vlen-bytes"}], }, - "v3_default_filters": {"default": ()}, + "v3_default_filters": {"default": []}, "v3_default_serializer": { "default": {"name": "bytes", "configuration": {"endian": "little"}}, "variable_length_utf8": {"name": "vlen-utf8"}, diff --git a/src/zarr/core/dtype/_numpy.py b/src/zarr/core/dtype/_numpy.py index fa97503795..c562f0a593 100644 --- a/src/zarr/core/dtype/_numpy.py +++ b/src/zarr/core/dtype/_numpy.py @@ -711,7 +711,7 @@ def to_dict(self) -> dict[str, JSON]: @classmethod def _from_dtype_unsafe(cls, dtype: np.dtypes.DateTime64DType) -> Self: unit = dtype.name[dtype.name.rfind("[") + 1 : dtype.name.rfind("]")] - if unit not in get_args(DateUnit | TimeUnit): + if unit not in get_args(DateUnit) and unit not in get_args(TimeUnit): raise DataTypeValidationError('Invalid unit for "numpy.datetime64"') return cls(unit=unit, endianness=endianness_from_numpy_str(dtype.byteorder)) diff --git a/tests/test_array.py b/tests/test_array.py index 
f8880c86c0..b2f21d6562 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1,4 +1,5 @@ import dataclasses +import inspect import json import math import multiprocessing as mp @@ -28,8 +29,6 @@ from zarr.core.array import ( CompressorsLike, FiltersLike, - _get_default_chunk_encoding_v2, - _get_default_chunk_encoding_v3, _parse_chunk_encoding_v2, _parse_chunk_encoding_v3, chunks_initialized, @@ -1064,13 +1063,23 @@ async def test_default_filters_compressors( shape=(10,), zarr_format=zarr_format, ) + + sig = inspect.signature(create_array) + if zarr_format == 3: - expected_filters, expected_serializer, expected_compressors = ( - _get_default_chunk_encoding_v3(dtype=zdtype) + expected_filters, expected_serializer, expected_compressors = _parse_chunk_encoding_v3( + compressors=sig.parameters["compressors"].default, + filters=sig.parameters["filters"].default, + serializer=sig.parameters["serializer"].default, + dtype=zdtype, ) elif zarr_format == 2: - default_filters, default_compressors = _get_default_chunk_encoding_v2(dtype=zdtype) + default_filters, default_compressors = _parse_chunk_encoding_v2( + compressor=sig.parameters["compressors"].default, + filters=sig.parameters["filters"].default, + dtype=zdtype, + ) if default_filters is None: expected_filters = () else: diff --git a/tests/test_config.py b/tests/test_config.py index 1a2453d646..34ecfdc119 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -19,10 +19,12 @@ GzipCodec, ShardingCodec, ) +from zarr.core.array import create_array from zarr.core.array_spec import ArraySpec from zarr.core.buffer import NDBuffer from zarr.core.codec_pipeline import BatchedCodecPipeline from zarr.core.config import BadConfigError, config +from zarr.core.dtype import get_data_type_from_numpy from zarr.core.indexing import SelectorTuple from zarr.registry import ( fully_qualified_name, @@ -52,33 +54,24 @@ def test_config_defaults_set() -> None: "array": { "order": "C", "write_empty_chunks": False, - 
"v2_default_compressor": { - "numeric": {"id": "zstd", "level": 0, "checksum": False}, - "string": {"id": "zstd", "level": 0, "checksum": False}, - "bytes": {"id": "zstd", "level": 0, "checksum": False}, - }, + "v2_default_compressor": {"default": {"id": "zstd", "level": 0, "checksum": False}}, "v2_default_filters": { - "numeric": None, - "string": [{"id": "vlen-utf8"}], - "bytes": [{"id": "vlen-bytes"}], - "raw": None, + "default": None, + "variable_length_utf8": [{"id": "vlen-utf8"}], + "fixed_length_ucs4": [{"id": "vlen-utf8"}], + "fixed_length_ascii": [{"id": "vlen-bytes"}], }, - "v3_default_filters": {"numeric": [], "string": [], "bytes": []}, + "v3_default_filters": {"default": []}, "v3_default_serializer": { - "numeric": {"name": "bytes", "configuration": {"endian": "little"}}, - "string": {"name": "vlen-utf8"}, - "bytes": {"name": "vlen-bytes"}, + "default": {"name": "bytes", "configuration": {"endian": "little"}}, + "variable_length_utf8": {"name": "vlen-utf8"}, + "fixed_length_ucs4": {"name": "vlen-utf8"}, + "r*": {"name": "vlen-bytes"}, }, "v3_default_compressors": { - "numeric": [ - {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, - ], - "string": [ - {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, - ], - "bytes": [ + "default": [ {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, - ], + ] }, }, "async": {"concurrency": 10, "timeout": None}, @@ -306,26 +299,22 @@ class NewCodec2(BytesCodec): @pytest.mark.parametrize("dtype", ["int", "bytes", "str"]) async def test_default_codecs(dtype: str) -> None: - with config.set( - { - "array.v3_default_compressors": { # test setting non-standard codecs - "numeric": [ - {"name": "gzip", "configuration": {"level": 5}}, - ], - "string": [ - {"name": "gzip", "configuration": {"level": 5}}, - ], - "bytes": [ - {"name": "gzip", "configuration": {"level": 5}}, - ], - } - } - ): - arr = await zarr.api.asynchronous.create_array( + """ + Test that the default 
compressors are sensitive to the current setting of the config. + """ + zdtype = get_data_type_from_numpy(dtype) + expected_compressors = (GzipCodec(),) + new_conf = { + f"array.v3_default_compressors.{zdtype._zarr_v3_name}": [ + c.to_dict() for c in expected_compressors + ] + } + with config.set(new_conf): + arr = await create_array( shape=(100,), chunks=(100,), - dtype=np.dtype(dtype), + dtype=dtype, zarr_format=3, store=MemoryStore(), ) - assert arr.compressors == (GzipCodec(),) + assert arr.compressors == expected_compressors From e855e54d3ac757af32f231ec26f65d9e38fcb809 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 17 Mar 2025 16:27:10 +0100 Subject: [PATCH 038/130] call to_dtype once in blosc evolve_from_array_spec --- src/zarr/codecs/blosc.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/zarr/codecs/blosc.py b/src/zarr/codecs/blosc.py index 79be926ad8..4cee49f56d 100644 --- a/src/zarr/codecs/blosc.py +++ b/src/zarr/codecs/blosc.py @@ -136,18 +136,14 @@ def to_dict(self) -> dict[str, JSON]: } def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: - dtype = array_spec.dtype + dtype = array_spec.dtype.to_dtype() new_codec = self if new_codec.typesize is None: - new_codec = replace(new_codec, typesize=dtype.to_dtype().itemsize) + new_codec = replace(new_codec, typesize=dtype.itemsize) if new_codec.shuffle is None: new_codec = replace( new_codec, - shuffle=( - BloscShuffle.bitshuffle - if dtype.to_dtype().itemsize == 1 - else BloscShuffle.shuffle - ), + shuffle=(BloscShuffle.bitshuffle if dtype.itemsize == 1 else BloscShuffle.shuffle), ) return new_codec From a2da99add279049ae3827537e5c63f2652cc8aa2 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 19 Mar 2025 21:18:47 +0100 Subject: [PATCH 039/130] refactor dtypewrapper -> zdtype --- src/zarr/abc/codec.py | 17 +- src/zarr/api/asynchronous.py | 6 +- src/zarr/codecs/bytes.py | 9 +- src/zarr/codecs/sharding.py | 16 +- 
src/zarr/codecs/transpose.py | 10 +- src/zarr/core/_info.py | 17 +- src/zarr/core/array.py | 85 +- src/zarr/core/array_spec.py | 15 +- src/zarr/core/buffer/cpu.py | 2 +- src/zarr/core/codec_pipeline.py | 9 +- src/zarr/core/common.py | 2 - src/zarr/core/config.py | 10 +- src/zarr/core/dtype/__init__.py | 53 +- src/zarr/core/dtype/_numpy.py | 1190 +++++++++++++++++----- src/zarr/core/dtype/common.py | 71 +- src/zarr/core/dtype/registry.py | 20 +- src/zarr/core/dtype/wrapper.py | 157 ++- src/zarr/core/metadata/v2.py | 44 +- src/zarr/core/metadata/v3.py | 36 +- src/zarr/testing/strategies.py | 4 +- tests/conftest.py | 4 +- tests/test_array.py | 6 +- tests/test_codecs/test_vlen.py | 6 +- tests/test_config.py | 16 +- tests/test_metadata/test_consolidated.py | 2 +- tests/test_metadata/test_dtype.py | 120 ++- tests/test_metadata/test_v3.py | 16 +- 27 files changed, 1312 insertions(+), 631 deletions(-) diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index 16400f5f4b..31cb44d84e 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -1,7 +1,7 @@ from __future__ import annotations from abc import abstractmethod -from typing import TYPE_CHECKING, Any, Generic, TypeVar +from typing import TYPE_CHECKING, Generic, TypeVar from zarr.abc.metadata import Metadata from zarr.core.buffer import Buffer, NDBuffer @@ -12,11 +12,10 @@ from collections.abc import Awaitable, Callable, Iterable from typing import Self - import numpy as np - from zarr.abc.store import ByteGetter, ByteSetter from zarr.core.array_spec import ArraySpec from zarr.core.chunk_grids import ChunkGrid + from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar from zarr.core.indexing import SelectorTuple __all__ = [ @@ -93,7 +92,13 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: """ return self - def validate(self, *, shape: ChunkCoords, dtype: np.dtype[Any], chunk_grid: ChunkGrid) -> None: + def validate( + self, + *, + shape: ChunkCoords, + dtype: ZDType[_BaseDType, 
_BaseScalar], + chunk_grid: ChunkGrid, + ) -> None: """Validates that the codec configuration is compatible with the array metadata. Raises errors when the codec configuration is not compatible. @@ -285,7 +290,9 @@ def supports_partial_decode(self) -> bool: ... def supports_partial_encode(self) -> bool: ... @abstractmethod - def validate(self, *, shape: ChunkCoords, dtype: np.dtype[Any], chunk_grid: ChunkGrid) -> None: + def validate( + self, *, shape: ChunkCoords, dtype: ZDType[_BaseDType, _BaseScalar], chunk_grid: ChunkGrid + ) -> None: """Validates that all codec configurations are compatible with the array metadata. Raises errors when a codec configuration is not compatible. diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index d3e88ae7d3..72a12f9acb 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -28,7 +28,7 @@ _warn_order_kwarg, _warn_write_empty_chunks_kwarg, ) -from zarr.core.dtype import get_data_type_from_numpy +from zarr.core.dtype import get_data_type_from_native_dtype from zarr.core.group import ( AsyncGroup, ConsolidatedMetadata, @@ -433,7 +433,7 @@ async def save_array( shape = arr.shape chunks = getattr(arr, "chunks", None) # for array-likes with chunks attribute overwrite = kwargs.pop("overwrite", None) or _infer_overwrite(mode) - zarr_dtype = get_data_type_from_numpy(arr.dtype) + zarr_dtype = get_data_type_from_native_dtype(arr.dtype) new = await AsyncArray._create( store_path, zarr_format=zarr_format, @@ -984,7 +984,7 @@ async def create( _handle_zarr_version_or_format(zarr_version=zarr_version, zarr_format=zarr_format) or _default_zarr_format() ) - dtype_wrapped = get_data_type_from_numpy(dtype) + dtype_wrapped = get_data_type_from_native_dtype(dtype) if zarr_format == 2: if chunks is None: chunks = shape diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py index e7b57ab9b3..c86705c8ea 100644 --- a/src/zarr/codecs/bytes.py +++ b/src/zarr/codecs/bytes.py @@ -3,20 +3,21 @@ import 
sys from dataclasses import dataclass, replace from enum import Enum -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, cast import numpy as np from zarr.abc.codec import ArrayBytesCodec from zarr.core.buffer import Buffer, NDArrayLike, NDBuffer from zarr.core.common import JSON, parse_enum, parse_named_configuration -from zarr.core.dtype.common import endianness_to_numpy_str +from zarr.core.dtype._numpy import endianness_to_numpy_str from zarr.registry import register_codec if TYPE_CHECKING: from typing import Self from zarr.core.array_spec import ArraySpec + from zarr.core.dtype.common import Endianness class Endian(Enum): @@ -73,7 +74,9 @@ async def _decode_single( ) -> NDBuffer: assert isinstance(chunk_bytes, Buffer) # TODO: remove endianness enum in favor of literal union - endian_str = self.endian.value if self.endian is not None else None + endian_str = cast( + "Endianness | None", self.endian.value if self.endian is not None else None + ) dtype = chunk_spec.dtype.to_dtype().newbyteorder(endianness_to_numpy_str(endian_str)) as_array_like = chunk_bytes.as_array_like() diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index c501346980..e8a23e20c4 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -43,6 +43,7 @@ parse_shapelike, product, ) +from zarr.core.dtype._numpy import UInt64 from zarr.core.indexing import ( BasicIndexer, SelectorTuple, @@ -58,7 +59,7 @@ from typing import Self from zarr.core.common import JSON - from zarr.core.dtype.wrapper import DTypeWrapper + from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar MAX_UINT_64 = 2**64 - 1 ShardMapping = Mapping[ChunkCoords, Buffer] @@ -405,7 +406,11 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: return self def validate( - self, *, shape: ChunkCoords, dtype: DTypeWrapper[Any, Any], chunk_grid: ChunkGrid + self, + *, + shape: ChunkCoords, + dtype: ZDType[_BaseDType, _BaseScalar], + chunk_grid: ChunkGrid, ) 
-> None: if len(self.chunk_shape) != len(shape): raise ValueError( @@ -443,7 +448,10 @@ async def _decode_single( # setup output array out = chunk_spec.prototype.nd_buffer.create( - shape=shard_shape, dtype=shard_spec.dtype, order=shard_spec.order, fill_value=0 + shape=shard_shape, + dtype=shard_spec.dtype.to_dtype(), + order=shard_spec.order, + fill_value=0, ) shard_dict = await _ShardReader.from_bytes(shard_bytes, self, chunks_per_shard) @@ -685,7 +693,7 @@ def _shard_index_size(self, chunks_per_shard: ChunkCoords) -> int: def _get_index_chunk_spec(self, chunks_per_shard: ChunkCoords) -> ArraySpec: return ArraySpec( shape=chunks_per_shard + (2,), - dtype=np.dtype(" tuple[int, ...]: @@ -45,7 +46,12 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: def to_dict(self) -> dict[str, JSON]: return {"name": "transpose", "configuration": {"order": tuple(self.order)}} - def validate(self, shape: tuple[int, ...], dtype: np.dtype[Any], chunk_grid: ChunkGrid) -> None: + def validate( + self, + shape: tuple[int, ...], + dtype: ZDType[_BaseDType, _BaseScalar], + chunk_grid: ChunkGrid, + ) -> None: if len(self.order) != len(shape): raise ValueError( f"The `order` tuple needs have as many entries as there are dimensions in the array. Got {self.order}." 
diff --git a/src/zarr/core/_info.py b/src/zarr/core/_info.py index 3a3a3a5714..c9637b156a 100644 --- a/src/zarr/core/_info.py +++ b/src/zarr/core/_info.py @@ -1,13 +1,16 @@ +from __future__ import annotations + import dataclasses import textwrap -from typing import Any, Literal +from typing import TYPE_CHECKING, Any, Literal -import numcodecs.abc -import numpy as np +if TYPE_CHECKING: + import numcodecs.abc + import numpy as np -from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec -from zarr.core.common import ZarrFormat -from zarr.core.dtype.wrapper import DTypeWrapper + from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec + from zarr.core.common import ZarrFormat + from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar @dataclasses.dataclass(kw_only=True) @@ -78,7 +81,7 @@ class ArrayInfo: _type: Literal["Array"] = "Array" _zarr_format: ZarrFormat - _data_type: np.dtype[Any] | DTypeWrapper + _data_type: np.dtype[Any] | ZDType[_BaseDType, _BaseScalar] _shape: tuple[int, ...] _shard_shape: tuple[int, ...] | None = None _chunk_shape: tuple[int, ...] 
| None = None diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 0fa25c3695..7b6eb455fc 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -41,7 +41,7 @@ default_buffer_prototype, ) from zarr.core.buffer.cpu import buffer_prototype as cpu_buffer_prototype -from zarr.core.chunk_grids import RegularChunkGrid, _auto_partition, normalize_chunks +from zarr.core.chunk_grids import ChunkGrid, RegularChunkGrid, _auto_partition, normalize_chunks from zarr.core.chunk_key_encodings import ( ChunkKeyEncoding, ChunkKeyEncodingLike, @@ -66,7 +66,7 @@ ) from zarr.core.config import config as zarr_config from zarr.core.dtype import ( - DTypeWrapper, + ZDType, parse_data_type, ) from zarr.core.indexing import ( @@ -124,6 +124,7 @@ from zarr.abc.codec import CodecPipeline from zarr.codecs.sharding import ShardingCodecIndexLocation + from zarr.core.dtype.wrapper import _BaseDType, _BaseScalar from zarr.core.group import AsyncGroup from zarr.storage import StoreLike @@ -550,7 +551,7 @@ async def _create( *, # v2 and v3 shape: ShapeLike, - dtype: npt.DTypeLike | DTypeWrapper[Any, Any], + dtype: npt.DTypeLike | ZDType[_BaseDType, _BaseScalar], zarr_format: ZarrFormat = 3, fill_value: Any | None = None, attributes: dict[str, JSON] | None = None, @@ -580,7 +581,7 @@ async def _create( Deprecated in favor of :func:`zarr.api.asynchronous.create_array`. 
""" - dtype_parsed = parse_data_type(dtype) + dtype_parsed = parse_data_type(dtype, zarr_format=zarr_format) store_path = await make_store_path(store) shape = parse_shapelike(shape) @@ -668,7 +669,7 @@ async def _create( @staticmethod def _create_metadata_v3( shape: ShapeLike, - dtype: DTypeWrapper[Any, Any], + dtype: ZDType[_BaseDType, _BaseScalar], chunk_shape: ChunkCoords, fill_value: Any | None = None, chunk_key_encoding: ChunkKeyEncodingLike | None = None, @@ -693,14 +694,6 @@ def _create_metadata_v3( else: chunk_key_encoding_parsed = chunk_key_encoding - if dtype.to_dtype().kind in ("U", "T", "S"): - warn( - f"The dtype `{dtype}` is currently not part in the Zarr format 3 specification. It " - "may not be supported by other zarr implementations and may change in the future.", - category=UserWarning, - stacklevel=2, - ) - if fill_value is None: # v3 spec will not allow a null fill value fill_value_parsed = dtype.default_value() @@ -725,7 +718,7 @@ async def _create_v3( store_path: StorePath, *, shape: ShapeLike, - dtype: DTypeWrapper[Any, Any], + dtype: ZDType[_BaseDType, _BaseScalar], chunk_shape: ChunkCoords, config: ArrayConfig, fill_value: Any | None = None, @@ -773,7 +766,7 @@ async def _create_v3( @staticmethod def _create_metadata_v2( shape: ChunkCoords, - dtype: DTypeWrapper[Any, Any], + dtype: ZDType[_BaseDType, _BaseScalar], chunks: ChunkCoords, order: MemoryOrder, dimension_separator: Literal[".", "/"] | None = None, @@ -803,7 +796,7 @@ async def _create_v2( store_path: StorePath, *, shape: ChunkCoords, - dtype: DTypeWrapper[Any, Any], + dtype: ZDType[_BaseDType, _BaseScalar], chunks: ChunkCoords, order: MemoryOrder, config: ArrayConfig, @@ -946,7 +939,7 @@ def chunks(self) -> ChunkCoords: return self.metadata.chunks @cached_property - def chunk_grid(self) -> RegularChunkGrid: + def chunk_grid(self) -> ChunkGrid: if self.metadata.zarr_format == 2: return RegularChunkGrid(chunk_shape=self.chunks) else: @@ -1036,7 +1029,17 @@ def compressors(self) -> 
tuple[numcodecs.abc.Codec, ...] | tuple[BytesBytesCodec ) @property - def dtype(self) -> np.dtype[Any]: + def _zdtype(self) -> ZDType[_BaseDType, _BaseScalar]: + """ + The zarr-specific representation of the array data type + """ + if self.metadata.zarr_format == 2: + return self.metadata.dtype + else: + return self.metadata.data_type + + @property + def dtype(self) -> _BaseDType: """Returns the data type of the array. Returns @@ -1044,10 +1047,7 @@ def dtype(self) -> np.dtype[Any]: np.dtype Data type of the array """ - if self.metadata.zarr_format == 2: - return self.metadata.dtype.to_dtype() - else: - return self.metadata.data_type.to_dtype() + return self._zdtype.to_dtype() @property def order(self) -> MemoryOrder: @@ -1273,7 +1273,7 @@ def get_chunk_spec( ) return ArraySpec( shape=self.chunk_grid.chunk_shape, - dtype=self.dtype, + dtype=self._zdtype, fill_value=self.metadata.fill_value, config=array_config, prototype=prototype, @@ -3922,7 +3922,7 @@ async def init_array( from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation - dtype_wrapped = parse_data_type(dtype) + dtype_wrapped = parse_data_type(dtype, zarr_format=zarr_format) shape_parsed = parse_shapelike(shape) chunk_key_encoding_parsed = _parse_chunk_key_encoding( chunk_key_encoding, zarr_format=zarr_format @@ -4215,25 +4215,30 @@ def _parse_chunk_key_encoding( def _get_default_chunk_encoding_v3( - dtype: DTypeWrapper[Any, Any], + dtype: ZDType[_BaseDType, _BaseScalar], ) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]: """ Get the default ArrayArrayCodecs, ArrayBytesCodec, and BytesBytesCodec for a given dtype. """ + # the config will not allow keys to have "." characters in them + # so we will access the config by transforming "." 
to "__" + + dtype_name_conf = dtype._zarr_v3_name.replace(".", "__") + # TODO: find a registry-style solution for this that isn't bloated # We need to associate specific dtypes with specific encoding schemes - if dtype._zarr_v3_name in zarr_config.get("array.v3_default_filters"): - filters = zarr_config.get(f"array.v3_default_filters.{dtype._zarr_v3_name}") + if dtype_name_conf in zarr_config.get("array.v3_default_filters"): + filters = zarr_config.get(f"array.v3_default_filters.{dtype_name_conf}") else: filters = zarr_config.get("array.v3_default_filters.default") - if dtype._zarr_v3_name in zarr_config.get("array.v3_default_compressors"): - compressors = zarr_config.get(f"array.v3_default_compressors.{dtype._zarr_v3_name}") + if dtype_name_conf in zarr_config.get("array.v3_default_compressors"): + compressors = zarr_config.get(f"array.v3_default_compressors.{dtype_name_conf}") else: compressors = zarr_config.get("array.v3_default_compressors.default") - if dtype._zarr_v3_name in zarr_config.get("array.v3_default_serializer"): - serializer = zarr_config.get(f"array.v3_default_serializer.{dtype._zarr_v3_name}") + if dtype_name_conf in zarr_config.get("array.v3_default_serializer"): + serializer = zarr_config.get(f"array.v3_default_serializer.{dtype_name_conf}") else: serializer = zarr_config.get("array.v3_default_serializer.default") @@ -4245,18 +4250,22 @@ def _get_default_chunk_encoding_v3( def _get_default_chunk_encoding_v2( - dtype: DTypeWrapper[Any, Any], + dtype: ZDType[_BaseDType, _BaseScalar], ) -> tuple[tuple[numcodecs.abc.Codec, ...] | None, numcodecs.abc.Codec | None]: """ Get the default chunk encoding for Zarr format 2 arrays, given a dtype """ - if dtype._zarr_v3_name in zarr_config.get("array.v2_default_filters"): - filters = zarr_config.get(f"array.v2_default_filters.{dtype._zarr_v3_name}") + # the config will not allow keys to have "." characters in them + # so we will access the config by transforming "." 
to "__" + dtype_name_conf = dtype._zarr_v3_name.replace(".", "__") + + if dtype_name_conf in zarr_config.get("array.v2_default_filters"): + filters = zarr_config.get(f"array.v2_default_filters.{dtype_name_conf}") else: filters = zarr_config.get("array.v2_default_filters.default") - if dtype._zarr_v3_name in zarr_config.get("array.v2_default_compressor"): - compressor = zarr_config.get(f"array.v2_default_compressor.{dtype._zarr_v3_name}") + if dtype_name_conf in zarr_config.get("array.v2_default_compressor"): + compressor = zarr_config.get(f"array.v2_default_compressor.{dtype_name_conf}") else: compressor = zarr_config.get("array.v2_default_compressor.default") @@ -4270,7 +4279,7 @@ def _parse_chunk_encoding_v2( *, compressor: CompressorsLike, filters: FiltersLike, - dtype: DTypeWrapper[Any, Any], + dtype: ZDType[_BaseDType, _BaseScalar], ) -> tuple[tuple[numcodecs.abc.Codec, ...] | None, numcodecs.abc.Codec | None]: """ Generate chunk encoding classes for Zarr format 2 arrays with optional defaults. @@ -4314,7 +4323,7 @@ def _parse_chunk_encoding_v3( compressors: CompressorsLike, filters: FiltersLike, serializer: SerializerLike, - dtype: DTypeWrapper[Any, Any], + dtype: ZDType[_BaseDType, _BaseScalar], ) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]: """ Generate chunk encoding classes for v3 arrays with optional defaults. 
diff --git a/src/zarr/core/array_spec.py b/src/zarr/core/array_spec.py index f297fafa24..e8e451944f 100644 --- a/src/zarr/core/array_spec.py +++ b/src/zarr/core/array_spec.py @@ -11,16 +11,13 @@ parse_shapelike, ) from zarr.core.config import config as zarr_config -from zarr.core.dtype import parse_data_type if TYPE_CHECKING: from typing import NotRequired - import numpy.typing as npt - from zarr.core.buffer import BufferPrototype from zarr.core.common import ChunkCoords - from zarr.core.dtype.wrapper import DTypeWrapper + from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar class ArrayConfigParams(TypedDict): @@ -66,7 +63,7 @@ def from_dict(cls, data: ArrayConfigParams) -> Self: """ kwargs_out: ArrayConfigParams = {} for f in fields(ArrayConfig): - field_name = cast(Literal["order", "write_empty_chunks"], f.name) + field_name = cast("Literal['order', 'write_empty_chunks']", f.name) if field_name not in data: kwargs_out[field_name] = zarr_config.get(f"array.{field_name}") else: @@ -92,7 +89,7 @@ def parse_array_config(data: ArrayConfigLike | None) -> ArrayConfig: @dataclass(frozen=True) class ArraySpec: shape: ChunkCoords - dtype: DTypeWrapper[Any, Any] + dtype: ZDType[_BaseDType, _BaseScalar] fill_value: Any config: ArrayConfig prototype: BufferPrototype @@ -100,18 +97,16 @@ class ArraySpec: def __init__( self, shape: ChunkCoords, - dtype: npt.DTypeLike | DTypeWrapper[Any, Any], + dtype: ZDType[_BaseDType, _BaseScalar], fill_value: Any, config: ArrayConfig, prototype: BufferPrototype, ) -> None: shape_parsed = parse_shapelike(shape) - dtype_parsed = parse_data_type(dtype) - fill_value_parsed = parse_fill_value(fill_value) object.__setattr__(self, "shape", shape_parsed) - object.__setattr__(self, "dtype", dtype_parsed) + object.__setattr__(self, "dtype", dtype) object.__setattr__(self, "fill_value", fill_value_parsed) object.__setattr__(self, "config", config) object.__setattr__(self, "prototype", prototype) diff --git a/src/zarr/core/buffer/cpu.py 
b/src/zarr/core/buffer/cpu.py index 9894fced51..225adb6f5c 100644 --- a/src/zarr/core/buffer/cpu.py +++ b/src/zarr/core/buffer/cpu.py @@ -150,7 +150,7 @@ def create( cls, *, shape: Iterable[int], - dtype: np.dtype[Any], + dtype: npt.DTypeLike, order: Literal["C", "F"] = "C", fill_value: Any | None = None, ) -> Self: diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index 222e97ce74..71600fee90 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -23,12 +23,11 @@ from collections.abc import Iterable, Iterator from typing import Self - import numpy as np - from zarr.abc.store import ByteGetter, ByteSetter from zarr.core.array_spec import ArraySpec from zarr.core.buffer import Buffer, BufferPrototype, NDBuffer from zarr.core.chunk_grids import ChunkGrid + from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar T = TypeVar("T") U = TypeVar("U") @@ -133,7 +132,9 @@ def __iter__(self) -> Iterator[Codec]: yield self.array_bytes_codec yield from self.bytes_bytes_codecs - def validate(self, *, shape: ChunkCoords, dtype: np.dtype[Any], chunk_grid: ChunkGrid) -> None: + def validate( + self, *, shape: ChunkCoords, dtype: ZDType[_BaseDType, _BaseScalar], chunk_grid: ChunkGrid + ) -> None: for codec in self: codec.validate(shape=shape, dtype=dtype, chunk_grid=chunk_grid) @@ -295,7 +296,7 @@ def _merge_chunk_array( is_complete_chunk: bool, drop_axes: tuple[int, ...], ) -> NDBuffer: - if chunk_selection == () or is_scalar(value.as_ndarray_like(), chunk_spec.dtype): + if chunk_selection == () or is_scalar(value.as_ndarray_like(), chunk_spec.dtype.to_dtype()): chunk_value = value else: chunk_value = value[out_selection] diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index d06236f793..4cb59f7a87 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -16,8 +16,6 @@ overload, ) -import numpy as np - from zarr.core.config import config as zarr_config if TYPE_CHECKING: diff --git 
a/src/zarr/core/config.py b/src/zarr/core/config.py index 054316fd37..8f87910daa 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -80,15 +80,15 @@ def enable_gpu(self) -> ConfigSet: "v2_default_compressor": {"default": {"id": "zstd", "level": 0, "checksum": False}}, "v2_default_filters": { "default": None, - "variable_length_utf8": [{"id": "vlen-utf8"}], - "fixed_length_ucs4": [{"id": "vlen-utf8"}], - "fixed_length_ascii": [{"id": "vlen-bytes"}], + "numpy__variable_length_utf8": [{"id": "vlen-utf8"}], + "numpy__fixed_length_ucs4": [{"id": "vlen-utf8"}], + "numpy__fixed_length_ascii": [{"id": "vlen-bytes"}], }, "v3_default_filters": {"default": []}, "v3_default_serializer": { "default": {"name": "bytes", "configuration": {"endian": "little"}}, - "variable_length_utf8": {"name": "vlen-utf8"}, - "fixed_length_ucs4": {"name": "vlen-utf8"}, + "numpy__variable_length_utf8": {"name": "vlen-utf8"}, + "numpy__fixed_length_ucs4": {"name": "vlen-utf8"}, "r*": {"name": "vlen-bytes"}, }, "v3_default_compressors": { diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py index 432eabf2ce..4e594f8796 100644 --- a/src/zarr/core/dtype/__init__.py +++ b/src/zarr/core/dtype/__init__.py @@ -4,21 +4,23 @@ import numpy as np -from zarr.core.dtype.common import _NUMPY_SUPPORTS_VLEN_STRING +from zarr.core.dtype._numpy import _NUMPY_SUPPORTS_VLEN_STRING +from zarr.core.dtype.wrapper import _BaseDType, _BaseScalar if TYPE_CHECKING: import numpy.typing as npt - from zarr.core.common import JSON + from zarr.core.common import JSON, ZarrFormat + from zarr.core.dtype._numpy import ( Bool, Complex64, Complex128, DateTime64, - FixedLengthAsciiString, + FixedLengthAscii, FixedLengthBytes, - FixedLengthUnicodeString, + FixedLengthUnicode, Float16, Float32, Float64, @@ -34,16 +36,15 @@ VariableLengthString, ) from zarr.core.dtype.registry import DataTypeRegistry -from zarr.core.dtype.wrapper import DTypeWrapper +from zarr.core.dtype.wrapper import ZDType 
__all__ = [ "Complex64", "Complex128", - "DTypeWrapper", "DateTime64", - "FixedLengthAsciiString", + "FixedLengthAscii", "FixedLengthBytes", - "FixedLengthUnicodeString", + "FixedLengthUnicode", "Float16", "Float32", "Float64", @@ -57,6 +58,7 @@ "UInt32", "UInt64", "VariableLengthString", + "ZDType", "data_type_registry", "parse_data_type", ] @@ -66,7 +68,7 @@ INTEGER_DTYPE = Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64 FLOAT_DTYPE = Float16 | Float32 | Float64 COMPLEX_DTYPE = Complex64 | Complex128 -STRING_DTYPE = FixedLengthUnicodeString | VariableLengthString | FixedLengthAsciiString +STRING_DTYPE = FixedLengthUnicode | VariableLengthString | FixedLengthAscii DTYPE = ( Bool | INTEGER_DTYPE @@ -82,34 +84,39 @@ data_type_registry.register(dtype._zarr_v3_name, dtype) -def get_data_type_from_numpy(dtype: npt.DTypeLike) -> DTypeWrapper[Any, Any]: +def get_data_type_from_native_dtype(dtype: npt.DTypeLike) -> ZDType[_BaseDType, _BaseScalar]: data_type_registry.lazy_load() if not isinstance(dtype, np.dtype): if dtype in (str, "str"): if _NUMPY_SUPPORTS_VLEN_STRING: - np_dtype = np.dtype("T") + na_dtype = np.dtype("T") else: - np_dtype = np.dtype("O") + na_dtype = np.dtype("O") elif isinstance(dtype, list): # this is a valid _VoidDTypeLike check - np_dtype = np.dtype([tuple(d) for d in dtype]) + na_dtype = np.dtype([tuple(d) for d in dtype]) else: - np_dtype = np.dtype(dtype) + na_dtype = np.dtype(dtype) else: - np_dtype = dtype - return data_type_registry.match_dtype(np_dtype) + na_dtype = dtype + return data_type_registry.match_dtype(na_dtype) -def get_data_type_from_dict(dtype: dict[str, JSON]) -> DTypeWrapper[Any, Any]: - return data_type_registry.match_json(dtype) +def get_data_type_from_json( + dtype: JSON, zarr_format: ZarrFormat +) -> ZDType[_BaseDType, _BaseScalar]: + return data_type_registry.match_json(dtype, zarr_format=zarr_format) def parse_data_type( - dtype: npt.DTypeLike | DTypeWrapper[Any, Any] | dict[str, JSON], -) -> 
DTypeWrapper[Any, Any]: - if isinstance(dtype, DTypeWrapper): + dtype: npt.DTypeLike | ZDType[Any, Any] | dict[str, JSON], zarr_format: ZarrFormat +) -> ZDType[Any, Any]: + if isinstance(dtype, ZDType): return dtype elif isinstance(dtype, dict): - return get_data_type_from_dict(dtype) + # This branch assumes that the data type has been specified in the JSON form + # but it's also possible for numpy data types to be specified as dictionaries, which will + # cause an error in the `get_data_type_from_json`, but that's ok for now + return get_data_type_from_json(dtype, zarr_format=zarr_format) # type: ignore[arg-type] else: - return get_data_type_from_numpy(dtype) + return get_data_type_from_native_dtype(dtype) diff --git a/src/zarr/core/dtype/_numpy.py b/src/zarr/core/dtype/_numpy.py index c562f0a593..a8bd2b5951 100644 --- a/src/zarr/core/dtype/_numpy.py +++ b/src/zarr/core/dtype/_numpy.py @@ -2,13 +2,22 @@ import base64 import re +from collections.abc import Sequence from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, ClassVar, Literal, Self, TypeGuard, cast, get_args +from typing import ( + TYPE_CHECKING, + Any, + ClassVar, + Literal, + Self, + TypeGuard, + cast, + get_args, +) import numpy as np from zarr.core.dtype.common import ( - _NUMPY_SUPPORTS_VLEN_STRING, DataTypeValidationError, Endianness, JSONFloat, @@ -16,27 +25,26 @@ bytes_to_json, check_json_bool, check_json_complex_float, - check_json_complex_float_v3, - check_json_float_v2, + check_json_float, check_json_int, check_json_str, complex_from_json, complex_to_json, datetime_from_json, datetime_to_json, - endianness_from_numpy_str, - endianness_to_numpy_str, float_from_json, float_to_json, ) -from zarr.core.dtype.wrapper import DTypeWrapper, TDType +from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat +EndiannessNumpy = Literal[">", "<", "=", "|"] + @dataclass(frozen=True, kw_only=True) -class 
Bool(DTypeWrapper[np.dtypes.BoolDType, np.bool_]): +class Bool(ZDType[np.dtypes.BoolDType, np.bool_]): """ Wrapper for numpy boolean dtype. @@ -49,10 +57,37 @@ class Bool(DTypeWrapper[np.dtypes.BoolDType, np.bool_]): """ _zarr_v3_name = "bool" - dtype_cls: ClassVar[type[np.dtypes.BoolDType]] = np.dtypes.BoolDType + _zarr_v2_names: ClassVar[tuple[str,...]] = ("|b1",) + dtype_cls = np.dtypes.BoolDType + + @classmethod + def _from_dtype_unsafe(cls, dtype: np.dtypes.BoolDType) -> Self: + return cls() + + def to_dtype(self: Self) -> np.dtypes.BoolDType: + return self.dtype_cls() + + @classmethod + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[Literal["bool", "|b1"]]: + """ + Check that the input is a valid JSON representation of a bool. + """ + if zarr_format == 2: + return data in cls._zarr_v2_names + elif zarr_format == 3: + return data == cls._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + def to_json(self, zarr_format: ZarrFormat) -> str: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") - def to_dict(self) -> dict[str, JSON]: - return {"name": self.get_name(zarr_format=3)} + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + return cls() def default_value(self) -> np.bool_: """ @@ -65,26 +100,6 @@ def default_value(self) -> np.bool_: """ return np.False_ - @classmethod - def _from_dtype_unsafe(cls, dtype: np.dtypes.BoolDType) -> Self: - """ - Wrap a numpy boolean dtype without checking. - - Parameters - ---------- - dtype : np.dtypes.BoolDType - The numpy dtype to wrap. - - Returns - ------- - Self - The wrapped dtype. - """ - return cls() - - def to_dtype(self) -> np.dtypes.BoolDType: - return self.dtype_cls() - def to_json_value(self, data: np.bool_, zarr_format: ZarrFormat) -> bool: """ Convert a boolean value to JSON-serializable format. 
@@ -120,337 +135,730 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bool_: The numpy boolean scalar. """ if check_json_bool(data): - return self.cast_value(data) + return np.bool_(data) raise TypeError(f"Invalid type: {data}. Expected a boolean.") @dataclass(frozen=True, kw_only=True) -class Int8(DTypeWrapper[np.dtypes.Int8DType, np.int8]): +class Int8(ZDType[np.dtypes.Int8DType, np.int8]): dtype_cls = np.dtypes.Int8DType _zarr_v3_name = "int8" + _zarr_v2_names: ClassVar[tuple[str, ...]] = ("|i1",) @classmethod def _from_dtype_unsafe(cls, dtype: np.dtypes.Int8DType) -> Self: return cls() - def to_dtype(self) -> np.dtypes.Int8DType: + def to_dtype(self: Self) -> np.dtypes.Int8DType: return self.dtype_cls() - def to_dict(self) -> dict[str, JSON]: - return {"name": self.get_name(zarr_format=3)} + @classmethod + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[Literal["int8", "|i1"]]: + """ + Check that the input is a valid JSON representation of a 8-bit integer. + """ + if zarr_format == 2: + return data in cls._zarr_v2_names + elif zarr_format == 3: + return data == cls._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + def to_json(self, zarr_format: ZarrFormat) -> str: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + return cls() def default_value(self) -> np.int8: - return self.to_dtype().type(0) + """ + Get the default value. + + Returns + ------- + np.int8 + The default value. + """ + return np.int8(0) def to_json_value(self, data: np.int8, zarr_format: ZarrFormat) -> int: + """ + Convert a numpy 8-bit int to JSON-serializable format. + + Parameters + ---------- + data : np.int8 + The value to convert. + zarr_format : ZarrFormat + The zarr format version. 
+ + Returns + ------- + int + The JSON-serializable form of the scalar. + """ return int(data) def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.int8: + """ + Read a JSON-serializable value as a numpy int8 scalar. + + Parameters + ---------- + data : JSON + The JSON-serializable value. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + np.bool_ + The numpy boolean scalar. + """ if check_json_int(data): - return self.cast_value(data) + return np.int8(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") @dataclass(frozen=True, kw_only=True) -class UInt8(DTypeWrapper[np.dtypes.UInt8DType, np.uint8]): +class UInt8(ZDType[np.dtypes.UInt8DType, np.uint8]): dtype_cls = np.dtypes.UInt8DType _zarr_v3_name = "uint8" + _zarr_v2_names: ClassVar[tuple[str, ...]] = ("|u1",) @classmethod def _from_dtype_unsafe(cls, dtype: np.dtypes.UInt8DType) -> Self: return cls() - def to_dtype(self) -> np.dtypes.UInt8DType: + def to_dtype(self: Self) -> np.dtypes.UInt8DType: return self.dtype_cls() - def to_dict(self) -> dict[str, JSON]: - return {"name": self.get_name(zarr_format=3)} + @classmethod + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[Literal["uint8", "|u1"]]: + """ + Check that the input is a valid JSON representation of an unsigned 8-bit integer. + """ + if zarr_format == 2: + return data in cls._zarr_v2_names + elif zarr_format == 3: + return data == cls._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + def to_json(self, zarr_format: ZarrFormat) -> str: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + return cls() def default_value(self) -> np.uint8: - return self.to_dtype().type(0) + """ + Get the default value for this data type. 
+ + Returns + ------- + np.uint8 + The default value. + """ + return np.uint8(0) def to_json_value(self, data: np.uint8, zarr_format: ZarrFormat) -> int: + """ + Convert a numpy unsigned 8-bit integer to JSON-serializable format. + + Parameters + ---------- + data : np.uint8 + The value to convert. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + int + The JSON-serializable form of the scalar. + """ return int(data) def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.uint8: + """ + Read a JSON-serializable value as a numpy boolean scalar. + + Parameters + ---------- + data : JSON + The JSON-serializable value. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + np.bool_ + The numpy boolean scalar. + """ if check_json_int(data): - return self.cast_value(data) + return np.uint8(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") @dataclass(frozen=True, kw_only=True) -class Int16(DTypeWrapper[np.dtypes.Int16DType, np.int16]): +class Int16(ZDType[np.dtypes.Int16DType, np.int16]): dtype_cls = np.dtypes.Int16DType _zarr_v3_name = "int16" + _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i2", " Self: - return cls(endianness=endianness_from_numpy_str(dtype.byteorder)) + byte_order = cast("EndiannessNumpy", dtype.byteorder) + return cls(endianness=endianness_from_numpy_str(byte_order)) def to_dtype(self) -> np.dtypes.Int16DType: - return self.dtype_cls().newbyteorder(endianness_to_numpy_str(self.endianness)) + byte_order = endianness_to_numpy_str(self.endianness) + return self.dtype_cls().newbyteorder(byte_order) - def to_dict(self) -> dict[str, JSON]: - return {"name": self.get_name(zarr_format=3)} + @classmethod + def check_json( + cls, data: JSON, zarr_format: ZarrFormat + ) -> TypeGuard[Literal["int16", ">i2", " str: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got 
{zarr_format}") + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls() + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") def default_value(self) -> np.int16: - return self.cast_value(0) + return self.to_dtype().type(0) def to_json_value(self, data: np.int16, zarr_format: ZarrFormat) -> int: return int(data) def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.int16: if check_json_int(data): - return self.cast_value(data) + return self.to_dtype().type(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") @dataclass(frozen=True, kw_only=True) -class UInt16(DTypeWrapper[np.dtypes.UInt16DType, np.uint16]): +class UInt16(ZDType[np.dtypes.UInt16DType, np.uint16]): dtype_cls = np.dtypes.UInt16DType _zarr_v3_name = "uint16" + _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u2", " Self: - return cls(endianness=endianness_from_numpy_str(dtype.byteorder)) + byte_order = cast("EndiannessNumpy", dtype.byteorder) + return cls(endianness=endianness_from_numpy_str(byte_order)) def to_dtype(self) -> np.dtypes.UInt16DType: - return self.dtype_cls().newbyteorder(endianness_to_numpy_str(self.endianness)) + byte_order = endianness_to_numpy_str(self.endianness) + return self.dtype_cls().newbyteorder(byte_order) + + @classmethod + def check_json( + cls, data: JSON, zarr_format: ZarrFormat + ) -> TypeGuard[Literal["uint16", ">u2", " dict[str, JSON]: - return {"name": self.get_name(zarr_format=3)} + def to_json(self, zarr_format: ZarrFormat) -> str: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # 
type: ignore[arg-type] + elif zarr_format == 3: + return cls() + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") def default_value(self) -> np.uint16: - return self.cast_value(0) + return self.to_dtype().type(0) def to_json_value(self, data: np.uint16, zarr_format: ZarrFormat) -> int: return int(data) def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.uint16: if check_json_int(data): - return self.cast_value(data) + return self.to_dtype().type(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") @dataclass(frozen=True, kw_only=True) -class Int32(DTypeWrapper[np.dtypes.Int32DType, np.int32]): +class Int32(ZDType[np.dtypes.Int32DType, np.int32]): dtype_cls = np.dtypes.Int32DType _zarr_v3_name = "int32" + _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i4", " Self: - return cls(endianness=endianness_from_numpy_str(dtype.byteorder)) + byte_order = cast("EndiannessNumpy", dtype.byteorder) + return cls(endianness=endianness_from_numpy_str(byte_order)) def to_dtype(self) -> np.dtypes.Int32DType: - return self.dtype_cls().newbyteorder(endianness_to_numpy_str(self.endianness)) + byte_order = endianness_to_numpy_str(self.endianness) + return self.dtype_cls().newbyteorder(byte_order) + + @classmethod + def check_json( + cls, data: JSON, zarr_format: ZarrFormat + ) -> TypeGuard[Literal["int32", ">i4", " dict[str, JSON]: - return {"name": self.get_name(zarr_format=3)} + def to_json(self, zarr_format: ZarrFormat) -> str: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls() + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") def default_value(self) -> np.int32: - return 
self.cast_value(0) + return self.to_dtype().type(0) def to_json_value(self, data: np.int32, zarr_format: ZarrFormat) -> int: return int(data) def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.int32: if check_json_int(data): - return self.cast_value(data) + return self.to_dtype().type(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") @dataclass(frozen=True, kw_only=True) -class UInt32(DTypeWrapper[np.dtypes.UInt32DType, np.uint32]): +class UInt32(ZDType[np.dtypes.UInt32DType, np.uint32]): dtype_cls = np.dtypes.UInt32DType _zarr_v3_name = "uint32" + _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u4", " Self: - return cls(endianness=endianness_from_numpy_str(dtype.byteorder)) + byte_order = cast("EndiannessNumpy", dtype.byteorder) + return cls(endianness=endianness_from_numpy_str(byte_order)) def to_dtype(self) -> np.dtypes.UInt32DType: - return self.dtype_cls().newbyteorder(endianness_to_numpy_str(self.endianness)) + byte_order = endianness_to_numpy_str(self.endianness) + return self.dtype_cls().newbyteorder(byte_order) - def to_dict(self) -> dict[str, JSON]: - return {"name": self.get_name(zarr_format=3)} + @classmethod + def check_json( + cls, data: JSON, zarr_format: ZarrFormat + ) -> TypeGuard[Literal["uint32", ">u4", " str: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls() + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") def default_value(self) -> np.uint32: - return self.cast_value(0) + return self.to_dtype().type(0) def to_json_value(self, data: np.uint32, zarr_format: ZarrFormat) -> int: return int(data) def from_json_value(self, data: JSON, *, zarr_format: 
ZarrFormat) -> np.uint32: if check_json_int(data): - return self.cast_value(data) + return self.to_dtype().type(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") @dataclass(frozen=True, kw_only=True) -class Int64(DTypeWrapper[np.dtypes.Int64DType, np.int64]): +class Int64(ZDType[np.dtypes.Int64DType, np.int64]): dtype_cls = np.dtypes.Int64DType _zarr_v3_name = "int64" + _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i8", " Self: - return cls(endianness=endianness_from_numpy_str(dtype.byteorder)) + byte_order = cast("EndiannessNumpy", dtype.byteorder) + return cls(endianness=endianness_from_numpy_str(byte_order)) def to_dtype(self) -> np.dtypes.Int64DType: - return self.dtype_cls().newbyteorder(endianness_to_numpy_str(self.endianness)) + byte_order = endianness_to_numpy_str(self.endianness) + return self.dtype_cls().newbyteorder(byte_order) - def to_dict(self) -> dict[str, JSON]: - return {"name": self.get_name(zarr_format=3)} + @classmethod + def check_json( + cls, data: JSON, zarr_format: ZarrFormat + ) -> TypeGuard[Literal["int64", ">i8", " str: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls() + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") def default_value(self) -> np.int64: - return self.cast_value(0) + return self.to_dtype().type(0) def to_json_value(self, data: np.int64, zarr_format: ZarrFormat) -> int: return int(data) def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.int64: if check_json_int(data): - return self.cast_value(data) + return self.to_dtype().type(data) raise TypeError(f"Invalid type: {data}. 
Expected an integer.") @dataclass(frozen=True, kw_only=True) -class UInt64(DTypeWrapper[np.dtypes.UInt64DType, np.uint64]): +class UInt64(ZDType[np.dtypes.UInt64DType, np.uint64]): dtype_cls = np.dtypes.UInt64DType _zarr_v3_name = "uint64" + _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u8", " Self: - return cls(endianness=endianness_from_numpy_str(dtype.byteorder)) + byte_order = cast("EndiannessNumpy", dtype.byteorder) + return cls(endianness=endianness_from_numpy_str(byte_order)) def to_dtype(self) -> np.dtypes.UInt64DType: - return self.dtype_cls().newbyteorder(endianness_to_numpy_str(self.endianness)) + byte_order = endianness_to_numpy_str(self.endianness) + return self.dtype_cls().newbyteorder(byte_order) - def to_dict(self) -> dict[str, JSON]: - return {"name": self.get_name(zarr_format=3)} + @classmethod + def check_json( + cls, data: JSON, zarr_format: ZarrFormat + ) -> TypeGuard[Literal["uint64", ">u8", " str: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls() + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") def default_value(self) -> np.uint64: - return self.cast_value(0) + return self.to_dtype().type(0) def to_json_value(self, data: np.uint64, zarr_format: ZarrFormat) -> int: return int(data) def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.uint64: if check_json_int(data): - return self.cast_value(data) + return self.to_dtype().type(data) raise TypeError(f"Invalid type: {data}. 
Expected an integer.") @dataclass(frozen=True, kw_only=True) -class Float16(DTypeWrapper[np.dtypes.Float16DType, np.float16]): +class Float16(ZDType[np.dtypes.Float16DType, np.float16]): dtype_cls = np.dtypes.Float16DType _zarr_v3_name = "float16" + _zarr_v2_names: ClassVar[tuple[str, ...]] = (">f2", " Self: - return cls(endianness=endianness_from_numpy_str(dtype.byteorder)) + byte_order = cast("EndiannessNumpy", dtype.byteorder) + return cls(endianness=endianness_from_numpy_str(byte_order)) def to_dtype(self) -> np.dtypes.Float16DType: - return self.dtype_cls().newbyteorder(endianness_to_numpy_str(self.endianness)) + byte_order = endianness_to_numpy_str(self.endianness) + return self.dtype_cls().newbyteorder(byte_order) - def to_dict(self) -> dict[str, JSON]: - return {"name": self.get_name(zarr_format=3)} + @classmethod + def check_json( + cls, data: JSON, zarr_format: ZarrFormat + ) -> TypeGuard[Literal["float", ">f2", " str: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls() + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") def default_value(self) -> np.float16: - return self.to_dtype().type(0.0) + return self.to_dtype().type(0) def to_json_value(self, data: np.float16, zarr_format: ZarrFormat) -> JSONFloat: return float_to_json(data, zarr_format) def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.float16: - if check_json_float_v2(data): + if check_json_float(data, zarr_format=zarr_format): return self.to_dtype().type(float_from_json(data, zarr_format)) raise TypeError(f"Invalid type: {data}. 
Expected a float.") @dataclass(frozen=True, kw_only=True) -class Float32(DTypeWrapper[np.dtypes.Float32DType, np.float32]): +class Float32(ZDType[np.dtypes.Float32DType, np.float32]): dtype_cls = np.dtypes.Float32DType _zarr_v3_name = "float32" + _zarr_v2_names: ClassVar[tuple[str, ...]] = (">f4", " Self: - return cls(endianness=endianness_from_numpy_str(dtype.byteorder)) + byte_order = cast("EndiannessNumpy", dtype.byteorder) + return cls(endianness=endianness_from_numpy_str(byte_order)) def to_dtype(self) -> np.dtypes.Float32DType: - return self.dtype_cls().newbyteorder(endianness_to_numpy_str(self.endianness)) + byte_order = endianness_to_numpy_str(self.endianness) + return self.dtype_cls().newbyteorder(byte_order) - def to_dict(self) -> dict[str, JSON]: - return {"name": self.get_name(zarr_format=3)} + @classmethod + def check_json( + cls, data: JSON, zarr_format: ZarrFormat + ) -> TypeGuard[Literal["float32", ">f4", " np.float32: - return self.to_dtype().type(value) + def to_json(self, zarr_format: ZarrFormat) -> str: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls() + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") def default_value(self) -> np.float32: - return self.to_dtype().type(0.0) + return self.to_dtype().type(0) def to_json_value(self, data: np.float32, zarr_format: ZarrFormat) -> JSONFloat: return float_to_json(data, zarr_format) def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.float32: - if check_json_float_v2(data): + if check_json_float(data, zarr_format=zarr_format): return self.to_dtype().type(float_from_json(data, zarr_format)) raise TypeError(f"Invalid type: 
{data}. Expected a float.") @dataclass(frozen=True, kw_only=True) -class Float64(DTypeWrapper[np.dtypes.Float64DType, np.float64]): +class Float64(ZDType[np.dtypes.Float64DType, np.float64]): dtype_cls = np.dtypes.Float64DType _zarr_v3_name = "float64" + _zarr_v2_names: ClassVar[tuple[str, ...]] = (">f8", " Self: - return cls(endianness=endianness_from_numpy_str(dtype.byteorder)) + byte_order = cast("EndiannessNumpy", dtype.byteorder) + return cls(endianness=endianness_from_numpy_str(byte_order)) def to_dtype(self) -> np.dtypes.Float64DType: - return self.dtype_cls().newbyteorder(endianness_to_numpy_str(self.endianness)) + byte_order = endianness_to_numpy_str(self.endianness) + return self.dtype_cls().newbyteorder(byte_order) + + @classmethod + def check_json( + cls, data: JSON, zarr_format: ZarrFormat + ) -> TypeGuard[Literal["float64", ">f8", " dict[str, JSON]: - return {"name": self.get_name(zarr_format=3)} + def to_json(self, zarr_format: ZarrFormat) -> str: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls() + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") def default_value(self) -> np.float64: - return self.to_dtype().type(0.0) + return self.to_dtype().type(0) def to_json_value(self, data: np.float64, zarr_format: ZarrFormat) -> JSONFloat: return float_to_json(data, zarr_format) def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.float64: - if check_json_float_v2(data): + if check_json_float(data, zarr_format=zarr_format): return self.to_dtype().type(float_from_json(data, zarr_format)) raise TypeError(f"Invalid type: {data}. 
Expected a float.") @dataclass(frozen=True, kw_only=True) -class Complex64(DTypeWrapper[np.dtypes.Complex64DType, np.complex64]): +class Complex64(ZDType[np.dtypes.Complex64DType, np.complex64]): dtype_cls = np.dtypes.Complex64DType _zarr_v3_name = "complex64" + _zarr_v2_names: ClassVar[tuple[str, ...]] = (">c8", " Self: - return cls() + byte_order = cast("EndiannessNumpy", dtype.byteorder) + return cls(endianness=endianness_from_numpy_str(byte_order)) def to_dtype(self) -> np.dtypes.Complex64DType: - return self.dtype_cls().newbyteorder(endianness_to_numpy_str(self.endianness)) + byte_order = endianness_to_numpy_str(self.endianness) + return self.dtype_cls().newbyteorder(byte_order) + + @classmethod + def check_json( + cls, data: JSON, zarr_format: ZarrFormat + ) -> TypeGuard[Literal["complex64", ">c8", " str: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") - def to_dict(self) -> dict[str, JSON]: - return {"name": self.get_name(zarr_format=3)} + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls() + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") def default_value(self) -> np.complex64: - return np.complex64(0.0) + return self.to_dtype().type(0) def to_json_value( self, data: np.complex64, zarr_format: ZarrFormat @@ -464,23 +872,51 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.complex6 @dataclass(frozen=True, kw_only=True) -class Complex128(DTypeWrapper[np.dtypes.Complex128DType, np.complex128]): +class Complex128(ZDType[np.dtypes.Complex128DType, np.complex128]): dtype_cls = np.dtypes.Complex128DType _zarr_v3_name = "complex128" + _zarr_v2_names: ClassVar[tuple[str, ...]] = (">c16", " Self: - return 
cls(endianness=endianness_from_numpy_str(dtype.byteorder)) + byte_order = cast("EndiannessNumpy", dtype.byteorder) + return cls(endianness=endianness_from_numpy_str(byte_order)) def to_dtype(self) -> np.dtypes.Complex128DType: - return self.dtype_cls().newbyteorder(endianness_to_numpy_str(self.endianness)) + byte_order = endianness_to_numpy_str(self.endianness) + return self.dtype_cls().newbyteorder(byte_order) - def to_dict(self) -> dict[str, JSON]: - return {"name": self.get_name(zarr_format=3)} + @classmethod + def check_json( + cls, data: JSON, zarr_format: ZarrFormat + ) -> TypeGuard[Literal["complex128", ">c16", " str: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls() + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") def default_value(self) -> np.complex128: - return np.complex128(0.0) + return self.to_dtype().type(0) def to_json_value( self, data: np.complex128, zarr_format: ZarrFormat @@ -488,31 +924,66 @@ def to_json_value( return complex_to_json(data, zarr_format) def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.complex128: - if check_json_complex_float_v3(data): + if check_json_complex_float(data, zarr_format=zarr_format): return complex_from_json(data, dtype=self.to_dtype(), zarr_format=zarr_format) raise TypeError(f"Invalid type: {data}. 
Expected a complex float.") @dataclass(frozen=True, kw_only=True) -class FixedLengthAsciiString(DTypeWrapper[np.dtypes.BytesDType[Any], np.bytes_]): +class FixedLengthAscii(ZDType[np.dtypes.BytesDType[int], np.bytes_]): dtype_cls = np.dtypes.BytesDType - _zarr_v3_name = "fixed_length_ascii" + _zarr_v3_name = "numpy.fixed_length_ascii" item_size_bits: ClassVar[int] = 8 length: int = 1 @classmethod - def _from_dtype_unsafe(cls, dtype: np.dtypes.BytesDType) -> Self: + def _from_dtype_unsafe(cls, dtype: np.dtypes.BytesDType[int]) -> Self: return cls(length=dtype.itemsize // (cls.item_size_bits // 8)) - def to_dtype(self) -> np.dtypes.BytesDType: + def to_dtype(self) -> np.dtypes.BytesDType[int]: return self.dtype_cls(self.length) + @classmethod + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: + """ + Check that the input is a valid JSON representation of a numpy S dtype. + """ + if zarr_format == 2: + # match |S1, |S2, etc + return isinstance(data, str) and re.match(r"^\|S\d+$", data) is not None + elif zarr_format == 3: + return ( + isinstance(data, dict) + and "name" in data + and data["name"] == cls._zarr_v3_name + and "configuration" in data + and isinstance(data["configuration"], dict) + and "length_bits" in data["configuration"] + and isinstance(data["configuration"]["length_bits"], int) + ) + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + def to_json(self, zarr_format: ZarrFormat) -> JSON: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return { + "name": self._zarr_v3_name, + "configuration": {"length_bits": self.length * self.item_size_bits}, + } + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls(length=data["configuration"]["length_bits"] // 
cls.item_size_bits) # type: ignore[arg-type, index, call-overload, operator] + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + def default_value(self) -> np.bytes_: return np.bytes_(b"") - def to_dict(self) -> dict[str, JSON]: - return {"name": self.get_name(zarr_format=3), "configuration": {"length": self.length}} - def to_json_value(self, data: np.bytes_, *, zarr_format: ZarrFormat) -> str: return base64.standard_b64encode(data).decode("ascii") @@ -523,38 +994,61 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: @dataclass(frozen=True, kw_only=True) -class FixedLengthBytes(DTypeWrapper[np.dtypes.VoidDType, np.void]): - dtype_cls = np.dtypes.VoidDType - _zarr_v3_name = "r*" +class FixedLengthBytes(ZDType[np.dtypes.VoidDType[int], np.void]): + # np.dtypes.VoidDType is specified in an odd way in numpy + # it cannot be used to create instances of the dtype + # so we have to tell mypy to ignore this here + dtype_cls = np.dtypes.VoidDType # type: ignore[assignment] + _zarr_v3_name = "numpy.void" item_size_bits: ClassVar[int] = 8 length: int = 1 @classmethod - def _from_dtype_unsafe(cls, dtype: np.dtypes.VoidDType) -> Self: + def _from_dtype_unsafe(cls, dtype: np.dtypes.VoidDType[int]) -> Self: return cls(length=dtype.itemsize // (cls.item_size_bits // 8)) - def default_value(self) -> np.void: - return self.cast_value(("\x00" * self.length).encode("ascii")) - - def to_dtype(self) -> np.dtypes.VoidDType: + def to_dtype(self) -> np.dtypes.VoidDType[int]: # Numpy does not allow creating a void type # by invoking np.dtypes.VoidDType directly - return np.dtype(f"V{self.length}") + return cast("np.dtypes.VoidDType[int]", np.dtype(f"V{self.length}")) - def get_name(self, zarr_format: ZarrFormat) -> str: + @classmethod + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: if zarr_format == 2: - return super().get_name(zarr_format=zarr_format) - # note that we don't return self._zarr_v3_name - # 
because the name is parametrized by the length - return f"r{self.length * self.item_size_bits}" + # Check that the dtype is |V1, |V2, ... + return isinstance(data, str) and re.match(r"^\|V\d+$", data) is not None + elif zarr_format == 3: + return ( + isinstance(data, dict) + and "name" in data + and isinstance(data["name"], str) + and (re.match(r"^r\d+$", data["name"]) is not None) + ) + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") - def to_dict(self) -> dict[str, JSON]: - return {"name": self.get_name(zarr_format=3)} + def to_json(self, zarr_format: ZarrFormat) -> JSON: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return {"name": f"r{self.length * self.item_size_bits}"} + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls(length=int(data["name"][1:]) // cls.item_size_bits) # type: ignore[arg-type, index, call-overload] + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") @classmethod - def check_dtype(cls: type[Self], dtype: TDType) -> TypeGuard[np.dtypes.VoidDType[Any]]: + def check_dtype(cls: type[Self], dtype: _BaseDType) -> TypeGuard[np.dtypes.VoidDType[Any]]: """ - Reject structured dtypes by ensuring that dtype.fields is None + Numpy void dtype comes in two forms: + * If the ``fields`` attribute is ``None``, then the dtype represents N raw bytes. + * If the ``fields`` attribute is not ``None``, then the dtype represents a structured dtype, + + In this check we ensure that ``fields`` is ``None``. Parameters ---------- @@ -566,19 +1060,10 @@ def check_dtype(cls: type[Self], dtype: TDType) -> TypeGuard[np.dtypes.VoidDType Bool True if the dtype matches, False otherwise. 
""" - return super().check_dtype(dtype) and dtype.fields is None + return cls.dtype_cls is type(dtype) and dtype.fields is None # type: ignore[has-type] - @classmethod - def check_dict(cls, data: dict[str, JSON]) -> TypeGuard[dict[str, JSON]]: - # Overriding the base class implementation because the r* dtype - # does not have a name that will can appear in array metadata - # Instead, array metadata will contain names like "r8", "r16", etc - return ( - isinstance(data, dict) - and "name" in data - and isinstance(data["name"], str) - and (re.match(r"^r\d+$", data["name"]) is not None) - ) + def default_value(self) -> np.void: + return self.to_dtype().type(("\x00" * self.length).encode("ascii")) def to_json_value(self, data: np.void, *, zarr_format: ZarrFormat) -> str: return base64.standard_b64encode(data.tobytes()).decode("ascii") @@ -590,63 +1075,123 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: @dataclass(frozen=True, kw_only=True) -class FixedLengthUnicodeString(DTypeWrapper[np.dtypes.StrDType[int], np.str_]): +class FixedLengthUnicode(ZDType[np.dtypes.StrDType[int], np.str_]): dtype_cls = np.dtypes.StrDType - _zarr_v3_name = "fixed_length_ucs4" + _zarr_v3_name = "numpy.fixed_length_ucs4" item_size_bits: ClassVar[int] = 32 # UCS4 is 32 bits per code point endianness: Endianness | None = "native" length: int = 1 @classmethod def _from_dtype_unsafe(cls, dtype: np.dtypes.StrDType[int]) -> Self: + byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls( length=dtype.itemsize // (cls.item_size_bits // 8), - endianness=endianness_from_numpy_str(dtype.byteorder), + endianness=endianness_from_numpy_str(byte_order), ) def to_dtype(self) -> np.dtypes.StrDType[int]: - return cast( - np.dtypes.StrDType[int], - self.dtype_cls(self.length).newbyteorder(endianness_to_numpy_str(self.endianness)), - ) + byte_order = endianness_to_numpy_str(self.endianness) + return self.dtype_cls(self.length).newbyteorder(byte_order) + + @classmethod + 
def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: + """ + Check that the input is a valid JSON representation of a numpy S dtype. + """ + if zarr_format == 2: + # match >U1, <]U\d+$", data) is not None + elif zarr_format == 3: + return ( + isinstance(data, dict) + and "name" in data + and data["name"] == cls._zarr_v3_name + and "configuration" in data + and isinstance(data["configuration"], dict) + and "length_bits" in data["configuration"] + and isinstance(data["configuration"]["length_bits"], int) + ) + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + def to_json(self, zarr_format: ZarrFormat) -> JSON: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return { + "name": self._zarr_v3_name, + "configuration": {"length_bits": self.length * self.item_size_bits}, + } + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls(length=data["configuration"]["length_bits"] // cls.item_size_bits) # type: ignore[arg-type, index, call-overload, operator] + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") def default_value(self) -> np.str_: return np.str_("") - def to_dict(self) -> dict[str, JSON]: - return {"name": self.get_name(zarr_format=3), "configuration": {"length": self.length}} - def to_json_value(self, data: np.str_, *, zarr_format: ZarrFormat) -> str: return str(data) def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_: if not check_json_str(data): raise TypeError(f"Invalid type: {data}. 
Expected a string.") - return self.cast_value(data) + return self.to_dtype().type(data) + + +_NUMPY_SUPPORTS_VLEN_STRING = hasattr(np.dtypes, "StringDType") if _NUMPY_SUPPORTS_VLEN_STRING: @dataclass(frozen=True, kw_only=True) - class VariableLengthString(DTypeWrapper[np.dtypes.StringDType, str]): + class VariableLengthString(ZDType[np.dtypes.StringDType, str]): # type: ignore[type-var] dtype_cls = np.dtypes.StringDType - _zarr_v3_name = "variable_length_utf8" + _zarr_v3_name = "numpy.variable_length_utf8" @classmethod def _from_dtype_unsafe(cls, dtype: np.dtypes.StringDType) -> Self: return cls() - def default_value(self) -> str: - return "" + def to_dtype(self) -> np.dtypes.StringDType: + return self.dtype_cls() - def cast_value(self, value: object) -> str: - return str(value) + @classmethod + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: + """ + Check that the input is a valid JSON representation of a numpy string dtype. + """ + if zarr_format == 2: + # TODO: take the entire metadata document in here, and + # check the compressors / filters for vlen-utf8 + # Note that we are checking for the object dtype name. + return data == "|O" + elif zarr_format == 3: + return data == cls._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + def to_json(self, zarr_format: ZarrFormat) -> JSON: + if zarr_format == 2: + # Note: unlike many other numpy data types, we don't serialize the .str attribute + # of the data type to JSON. 
This is because Zarr was using `|O` for strings before the + # numpy variable length string data type existed, and we want to be consistent with + # that practice + return "|O" + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") - def to_dict(self) -> dict[str, JSON]: - return {"name": self.get_name(zarr_format=3)} + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + return cls() - def to_dtype(self) -> np.dtypes.StringDType: - return self.dtype_cls() + def default_value(self) -> str: + return "" def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: return str(data) @@ -654,37 +1199,55 @@ def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: if not check_json_str(data): raise TypeError(f"Invalid type: {data}. Expected a string.") - return self.cast_value(data) + return data else: @dataclass(frozen=True, kw_only=True) - class VariableLengthString(DTypeWrapper[np.dtypes.ObjectDType, str]): + class VariableLengthString(ZDType[np.dtypes.ObjectDType, str]): # type: ignore[no-redef] dtype_cls = np.dtypes.ObjectDType - _zarr_v3_name = "variable_length_utf8" + _zarr_v3_name = "numpy.variable_length_utf8" @classmethod def _from_dtype_unsafe(cls, dtype: np.dtypes.ObjectDType) -> Self: return cls() def to_dtype(self) -> np.dtypes.ObjectDType: - return cast(np.dtypes.ObjectDType, self.dtype_cls()) + return self.dtype_cls() - def cast_value(self, value: object) -> str: - return str(value) + @classmethod + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: + """ + Check that the input is a valid JSON representation of a numpy O dtype. 
+ """ + if zarr_format == 2: + # TODO: take the entire metadata document in here, and + # check the compressors / filters for vlen-utf8 + return data == "|O" + elif zarr_format == 3: + return data == cls._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + def to_json(self, zarr_format: ZarrFormat) -> JSON: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + return cls() def default_value(self) -> str: return "" - def to_dict(self) -> dict[str, JSON]: - return {"name": self.get_name(zarr_format=3)} - def to_json_value(self, data: str, *, zarr_format: ZarrFormat) -> str: return data def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: """ - String literals pass through + Strings pass through """ if not check_json_str(data): raise TypeError(f"Invalid type: {data}. 
Expected a string.") @@ -696,35 +1259,72 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: @dataclass(frozen=True, kw_only=True) -class DateTime64(DTypeWrapper[np.dtypes.DateTime64DType, np.datetime64]): - dtype_cls = np.dtypes.DateTime64DType - _zarr_v3_name = "datetime64" +class DateTime64(ZDType[np.dtypes.DateTime64DType, np.datetime64]): + dtype_cls = np.dtypes.DateTime64DType # type: ignore[assignment] + _zarr_v3_name = "numpy.datetime64" unit: DateUnit | TimeUnit = "s" - endianness: Endianness = "native" - - def default_value(self) -> np.datetime64: - return np.datetime64("NaT") - - def to_dict(self) -> dict[str, JSON]: - return {"name": self.get_name(zarr_format=3), "configuration": {"unit": self.unit}} + endianness: Endianness | None = "native" @classmethod def _from_dtype_unsafe(cls, dtype: np.dtypes.DateTime64DType) -> Self: - unit = dtype.name[dtype.name.rfind("[") + 1 : dtype.name.rfind("]")] + unit: DateUnit | TimeUnit = dtype.name[dtype.name.rfind("[") + 1 : dtype.name.rfind("]")] # type: ignore[assignment] if unit not in get_args(DateUnit) and unit not in get_args(TimeUnit): raise DataTypeValidationError('Invalid unit for "numpy.datetime64"') - return cls(unit=unit, endianness=endianness_from_numpy_str(dtype.byteorder)) - - def cast_value(self, value: object) -> np.datetime64: - return cast(np.datetime64, self.to_dtype().type(value, self.unit)) + byteorder = cast("EndiannessNumpy", dtype.byteorder) + return cls(unit=unit, endianness=endianness_from_numpy_str(byteorder)) def to_dtype(self) -> np.dtypes.DateTime64DType: # Numpy does not allow creating datetime64 via # np.dtypes.DateTime64Dtype() - return np.dtype(f"datetime64[{self.unit}]").newbyteorder( - endianness_to_numpy_str(self.endianness) + return cast( + "np.dtypes.DateTime64DType", + np.dtype(f"datetime64[{self.unit}]").newbyteorder( + endianness_to_numpy_str(self.endianness) + ), ) + @classmethod + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> 
TypeGuard[JSON]: + if zarr_format == 2: + # match M[M], etc + # consider making this a standalone function + return ( + isinstance(data, str) + and len(data) in (6, 7) + and data[0] in (">", "<") + and data[1:4] == "M8[" + and data[4:-1] in get_args(TimeUnit) + get_args(DateUnit) + and data[-1] == "]" + ) + elif zarr_format == 3: + return ( + isinstance(data, dict) + and "name" in data + and data["name"] == cls._zarr_v3_name + and "configuration" in data + and "unit" in data["configuration"] + and data["configuration"]["unit"] in get_args(DateUnit) + get_args(TimeUnit) + ) + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + def default_value(self) -> np.datetime64: + return np.datetime64("NaT") + + def to_json(self, zarr_format: ZarrFormat) -> JSON: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return {"name": self._zarr_v3_name, "configuration": {"unit": self.unit}} + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls(unit=data["configuration"]["unit"]) # type: ignore[arg-type, index, call-overload] + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.datetime64: if check_json_int(data): return datetime_from_json(data, self.unit) @@ -735,19 +1335,19 @@ def to_json_value(self, data: np.datetime64, *, zarr_format: ZarrFormat) -> int: @dataclass(frozen=True, kw_only=True) -class Structured(DTypeWrapper[np.dtypes.VoidDType, np.void]): - dtype_cls = np.dtypes.VoidDType +class Structured(ZDType[np.dtypes.VoidDType[int], np.void]): + dtype_cls = np.dtypes.VoidDType # type: ignore[assignment] _zarr_v3_name = "structured" - fields: tuple[tuple[str, DTypeWrapper[Any, Any]], ...] 
+ fields: tuple[tuple[str, ZDType[_BaseDType, _BaseScalar]], ...] def default_value(self) -> np.void: return self.cast_value(0) def cast_value(self, value: object) -> np.void: - return cast(np.void, np.array([value], dtype=self.to_dtype())[0]) + return cast("np.void", np.array([value], dtype=self.to_dtype())[0]) @classmethod - def check_dtype(cls, dtype: np.dtypes.DTypeLike) -> TypeGuard[np.dtypes.VoidDType]: + def check_dtype(cls, dtype: _BaseDType) -> TypeGuard[np.dtypes.VoidDType[int]]: """ Check that this dtype is a numpy structured dtype @@ -764,54 +1364,90 @@ def check_dtype(cls, dtype: np.dtypes.DTypeLike) -> TypeGuard[np.dtypes.VoidDTyp return super().check_dtype(dtype) and dtype.fields is not None @classmethod - def _from_dtype_unsafe(cls, dtype: np.dtypes.VoidDType) -> Self: - from zarr.core.dtype import get_data_type_from_numpy + def _from_dtype_unsafe(cls, dtype: np.dtypes.VoidDType[int]) -> Self: + from zarr.core.dtype import get_data_type_from_native_dtype - fields: list[tuple[str, DTypeWrapper[Any, Any]]] = [] + fields: list[tuple[str, ZDType[Any, Any]]] = [] if dtype.fields is None: raise ValueError("numpy dtype has no fields") - for key, (dtype_instance, _) in dtype.fields.items(): - dtype_wrapped = get_data_type_from_numpy(dtype_instance) + # fields of a structured numpy dtype are either 2-tuples or 3-tuples. we only + # care about the first element in either case. 
+ for key, (dtype_instance, *_) in dtype.fields.items(): + dtype_wrapped = get_data_type_from_native_dtype(dtype_instance) fields.append((key, dtype_wrapped)) return cls(fields=tuple(fields)) - def get_name(self, zarr_format: ZarrFormat) -> str | list[tuple[str, str]]: + def to_json(self, zarr_format: ZarrFormat) -> JSON: + fields = [ + (f_name, f_dtype.to_json(zarr_format=zarr_format)) for f_name, f_dtype in self.fields + ] if zarr_format == 2: - return [[k, d.get_name(zarr_format=2)] for k, d in self.fields] - return self._zarr_v3_name - - def to_dict(self) -> dict[str, JSON]: - base_dict = {"name": self.get_name(zarr_format=3)} - field_configs = [(f_name, f_dtype.to_dict()) for f_name, f_dtype in self.fields] - base_dict["configuration"] = {"fields": field_configs} - return base_dict + return fields + elif zarr_format == 3: + base_dict = {"name": self._zarr_v3_name} + base_dict["configuration"] = {"fields": fields} # type: ignore[assignment] + return cast("JSON", base_dict) + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") @classmethod - def check_dict(cls, data: JSON) -> TypeGuard[JSON]: - return ( - isinstance(data, dict) - and "name" in data - and "configuration" in data - and "fields" in data["configuration"] - ) + def check_json( + cls, data: JSON, zarr_format: ZarrFormat + ) -> TypeGuard[dict[str, JSON] | list[Any]]: + # the actual JSON form is recursive and hard to annotate, so we give up and do + # list[Any] for now + if zarr_format == 2: + return ( + not isinstance(data, str) + and isinstance(data, Sequence) + and all( + not isinstance(field, str) and isinstance(field, Sequence) and len(field) == 2 + for field in data + ) + ) + elif zarr_format == 3: + return ( + isinstance(data, dict) + and "name" in data + and "configuration" in data + and isinstance(data["configuration"], dict) + and "fields" in data["configuration"] + ) + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") @classmethod - def from_dict(cls, data: 
dict[str, JSON]) -> Self: - if cls.check_dict(data): - from zarr.core.dtype import get_data_type_from_dict - - fields = tuple( - (f_name, get_data_type_from_dict(f_dtype)) - for f_name, f_dtype in data["configuration"]["fields"] - ) - return cls(fields=fields) + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + from zarr.core.dtype import get_data_type_from_json + + if cls.check_json(data, zarr_format=zarr_format): + if zarr_format == 2: + # structured dtypes are constructed directly from a list of lists + return cls( + fields=tuple( # type: ignore[misc] + (f_name, get_data_type_from_json(f_dtype, zarr_format=zarr_format)) + for f_name, f_dtype in data + ) + ) + elif zarr_format == 3: # noqa: SIM102 + if isinstance(data, dict) and "configuration" in data: + config = data["configuration"] + if isinstance(config, dict) and "fields" in config: + meta_fields = config["fields"] + fields = tuple( + (f_name, get_data_type_from_json(f_dtype, zarr_format=zarr_format)) + for f_name, f_dtype in meta_fields + ) + return cls(fields=fields) + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") raise DataTypeValidationError(f"Invalid JSON representation of data type {cls}.") - def to_dtype(self) -> np.dtypes.VoidDType: - return cast(np.void, np.dtype([(key, dtype.to_dtype()) for (key, dtype) in self.fields])) + def to_dtype(self) -> np.dtypes.VoidDType[int]: + return cast( + "np.dtypes.VoidDType[int]", + np.dtype([(key, dtype.to_dtype()) for (key, dtype) in self.fields]), + ) def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: return bytes_to_json(data.tobytes(), zarr_format) @@ -822,3 +1458,69 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: as_bytes = bytes_from_json(data, zarr_format=zarr_format) dtype = self.to_dtype() return cast(np.void, np.array([as_bytes], dtype=dtype.str).view(dtype)[0]) + + +def endianness_to_numpy_str(endianness: Endianness | None) -> EndiannessNumpy: + 
""" + Convert an endianness literal to its numpy string representation. + + Parameters + ---------- + endianness : Endianness or None + The endianness to convert. + + Returns + ------- + Literal[">", "<", "=", "|"] + The numpy string representation of the endianness. + + Raises + ------ + ValueError + If the endianness is invalid. + """ + match endianness: + case "little": + return "<" + case "big": + return ">" + case "native": + return "=" + case None: + return "|" + raise ValueError( + f"Invalid endianness: {endianness}. Expected one of {get_args(Endianness)} or None" + ) + + +def endianness_from_numpy_str(endianness: EndiannessNumpy) -> Endianness | None: + """ + Convert a numpy endianness string literal to a human-readable literal value. + + Parameters + ---------- + endianness : Literal[">", "<", "=", "|"] + The numpy string representation of the endianness. + + Returns + ------- + Endianness or None + The human-readable representation of the endianness. + + Raises + ------ + ValueError + If the endianness is invalid. + """ + match endianness: + case "<": + return "little" + case ">": + return "big" + case "=": + return "native" + case "|": + return None + raise ValueError( + f"Invalid endianness: {endianness}. 
Expected one of {get_args(EndiannessNumpy)}" + ) diff --git a/src/zarr/core/dtype/common.py b/src/zarr/core/dtype/common.py index 1dbf22c3c2..2c4910338e 100644 --- a/src/zarr/core/dtype/common.py +++ b/src/zarr/core/dtype/common.py @@ -2,7 +2,7 @@ import base64 from collections.abc import Sequence -from typing import TYPE_CHECKING, Any, Literal, TypeGuard, cast, get_args +from typing import TYPE_CHECKING, Any, Literal, TypeGuard, cast import numpy as np @@ -11,81 +11,12 @@ from zarr.core.dtype._numpy import DateUnit, TimeUnit Endianness = Literal["little", "big", "native"] -EndiannessNumpy = Literal[">", "<", "=", "|"] JSONFloat = float | Literal["NaN", "Infinity", "-Infinity"] -_NUMPY_SUPPORTS_VLEN_STRING = hasattr(np.dtypes, "StringDType") - class DataTypeValidationError(ValueError): ... -def endianness_to_numpy_str(endianness: Endianness | None) -> EndiannessNumpy: - """ - Convert an endianness literal to its numpy string representation. - - Parameters - ---------- - endianness : Endianness or None - The endianness to convert. - - Returns - ------- - Literal[">", "<", "=", "|"] - The numpy string representation of the endianness. - - Raises - ------ - ValueError - If the endianness is invalid. - """ - match endianness: - case "little": - return "<" - case "big": - return ">" - case "native": - return "=" - case None: - return "|" - raise ValueError( - f"Invalid endianness: {endianness}. Expected one of {get_args(Endianness)} or None" - ) - - -def endianness_from_numpy_str(endianness: EndiannessNumpy) -> Endianness | None: - """ - Convert a numpy endianness string literal to a human-readable literal value. - - Parameters - ---------- - endianness : Literal[">", "<", "=", "|"] - The numpy string representation of the endianness. - - Returns - ------- - Endianness or None - The human-readable representation of the endianness. - - Raises - ------ - ValueError - If the endianness is invalid. 
- """ - match endianness: - case "<": - return "little" - case ">": - return "big" - case "=": - return "native" - case "|": - return None - raise ValueError( - f"Invalid endianness: {endianness}. Expected one of {get_args(EndiannessNumpy)}" - ) - - def check_json_bool(data: JSON) -> TypeGuard[bool]: """ Check if a JSON value is a boolean. diff --git a/src/zarr/core/dtype/registry.py b/src/zarr/core/dtype/registry.py index d4f1f03258..0d07ab2b9d 100644 --- a/src/zarr/core/dtype/registry.py +++ b/src/zarr/core/dtype/registry.py @@ -1,20 +1,22 @@ from __future__ import annotations from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Any, Self +from typing import TYPE_CHECKING, Self from zarr.core.dtype.common import DataTypeValidationError if TYPE_CHECKING: from importlib.metadata import EntryPoint - from zarr.core.common import JSON - from zarr.core.dtype.wrapper import DTypeWrapper, TDType + from zarr.core.common import JSON, ZarrFormat + from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar @dataclass(frozen=True, kw_only=True) class DataTypeRegistry: - contents: dict[str, type[DTypeWrapper[Any, Any]]] = field(default_factory=dict, init=False) + contents: dict[str, type[ZDType[_BaseDType, _BaseScalar]]] = field( + default_factory=dict, init=False + ) lazy_load_list: list[EntryPoint] = field(default_factory=list, init=False) def lazy_load(self) -> None: @@ -23,15 +25,15 @@ def lazy_load(self) -> None: self.lazy_load_list.clear() - def register(self: Self, key: str, cls: type[DTypeWrapper[Any, Any]]) -> None: + def register(self: Self, key: str, cls: type[ZDType[_BaseDType, _BaseScalar]]) -> None: # don't register the same dtype twice if key not in self.contents or self.contents[key] != cls: self.contents[key] = cls - def get(self, key: str) -> type[DTypeWrapper[Any, Any]]: + def get(self, key: str) -> type[ZDType[_BaseDType, _BaseScalar]]: return self.contents[key] - def match_dtype(self, dtype: TDType) -> DTypeWrapper[Any, 
Any]: + def match_dtype(self, dtype: _BaseDType) -> ZDType[_BaseDType, _BaseScalar]: self.lazy_load() for val in self.contents.values(): try: @@ -40,11 +42,11 @@ def match_dtype(self, dtype: TDType) -> DTypeWrapper[Any, Any]: pass raise ValueError(f"No data type wrapper found that matches dtype '{dtype}'") - def match_json(self, data: JSON) -> DTypeWrapper[Any, Any]: + def match_json(self, data: JSON, zarr_format: ZarrFormat) -> ZDType[_BaseDType, _BaseScalar]: self.lazy_load() for val in self.contents.values(): try: - return val.from_dict(data) + return val.from_json(data, zarr_format=zarr_format) except DataTypeValidationError: pass raise ValueError(f"No data type wrapper found that matches {data}") diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index dc3a0cc5d2..8707c3cda0 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -2,25 +2,30 @@ from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, ClassVar, Generic, Self, TypeGuard, TypeVar, cast +from typing import TYPE_CHECKING, ClassVar, Generic, Self, TypeGuard, TypeVar import numpy as np -from zarr.abc.metadata import Metadata from zarr.core.dtype.common import DataTypeValidationError if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat -TScalar = TypeVar("TScalar", bound=np.generic | str) +# This the upper bound for the scalar types we support. It's numpy scalars + str, +# because the new variable-length string dtype in numpy does not have a corresponding scalar type +_BaseScalar = np.generic | str +# This is the bound for the dtypes that we support. If we support non-numpy dtypes, +# then this bound will need to be widened. 
+_BaseDType = np.dtype[np.generic] +TScalar = TypeVar("TScalar", bound=_BaseScalar) # TODO: figure out an interface or protocol that non-numpy dtypes can use -TDType = TypeVar("TDType", bound=np.dtype[Any]) +TDType = TypeVar("TDType", bound=_BaseDType) @dataclass(frozen=True, kw_only=True) -class DTypeWrapper(Generic[TDType, TScalar], ABC, Metadata): +class ZDType(Generic[TDType, TScalar], ABC): """ - Abstract base class for wrapping numpy dtypes. + Abstract base class for wrapping native array data types, e.g. numpy dtypes Attributes ---------- @@ -32,13 +37,30 @@ class DTypeWrapper(Generic[TDType, TScalar], ABC, Metadata): have names that depend on their configuration. """ - # this class will create a numpy dtype + # this class will create a native data type # mypy currently disallows class variables to contain type parameters - # but it seems like it should be OK for us to use it here: + # but it seems OK for us to use it here: # https://github.com/python/typing/discussions/1424#discussioncomment-7989934 dtype_cls: ClassVar[type[TDType]] # type: ignore[misc] _zarr_v3_name: ClassVar[str] + @classmethod + def check_dtype(cls: type[Self], dtype: _BaseDType) -> TypeGuard[TDType]: + """ + Check that a data type matches the dtype_cls class attribute. Used as a type guard. + + Parameters + ---------- + dtype : TDType + The dtype to check. + + Returns + ------- + Bool + True if the dtype matches, False otherwise. + """ + return type(dtype) is cls.dtype_cls + @classmethod def from_dtype(cls: type[Self], dtype: TDType) -> Self: """ @@ -81,7 +103,7 @@ def _from_dtype_unsafe(cls: type[Self], dtype: TDType) -> Self: Self The wrapped dtype. """ - raise NotImplementedError + ... @abstractmethod def to_dtype(self: Self) -> TDType: @@ -93,26 +115,7 @@ def to_dtype(self: Self) -> TDType: TDType The unwrapped dtype. """ - raise NotImplementedError - - def cast_value(self: Self, value: object) -> TScalar: - """ - Cast a value to an instance of the scalar type. 
- This implementation assumes a numpy-style dtype class that has a - ``type`` method for casting scalars. Non-numpy dtypes will need to - override this method. - - Parameters - ---------- - value : object - The value to cast. - - Returns - ------- - TScalar - The cast value. - """ - return cast(TScalar, self.to_dtype().type(value)) + ... @abstractmethod def default_value(self) -> TScalar: @@ -129,24 +132,8 @@ def default_value(self) -> TScalar: ... @classmethod - def check_dtype(cls: type[Self], dtype: TDType) -> TypeGuard[TDType]: - """ - Check that a data type matches the dtype_cls class attribute. Used as a type guard. - - Parameters - ---------- - dtype : TDType - The dtype to check. - - Returns - ------- - Bool - True if the dtype matches, False otherwise. - """ - return type(dtype) is cls.dtype_cls - - @classmethod - def check_dict(cls: type[Self], data: dict[str, JSON]) -> TypeGuard[dict[str, JSON]]: + @abstractmethod + def check_json(cls: type[Self], data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: """ Check that a JSON representation of a data type matches the dtype_cls class attribute. Used as a type guard. This base implementation checks that the input is a dictionary, @@ -158,87 +145,75 @@ def check_dict(cls: type[Self], data: dict[str, JSON]) -> TypeGuard[dict[str, JS data : JSON The JSON representation of the data type. + zarr_format : ZarrFormat + The zarr format version. + Returns ------- Bool True if the JSON representation matches, False otherwise. """ - return "name" in data and data["name"] == cls._zarr_v3_name + ... @abstractmethod - def to_dict(self) -> dict[str, JSON]: + def to_json(self, zarr_format: ZarrFormat) -> JSON: """ - Convert the wrapped data type to a dictionary. + Convert the wrapped data type to a JSON-serializable form. + + Parameters + ---------- + zarr_format : ZarrFormat + The zarr format version. 
Returns ------- - dict[str, JSON] - The dictionary representation of the wrapped data type + JSON + The JSON-serializable representation of the wrapped data type """ - raise NotImplementedError + ... @classmethod - def from_dict(cls: type[Self], data: dict[str, JSON]) -> Self: + def from_json(cls: type[Self], data: JSON, zarr_format: ZarrFormat) -> Self: """ Wrap a JSON representation of a data type. Parameters ---------- - data : dict[str, JSON] + data : JSON The JSON representation of the data type. + zarr_format : ZarrFormat + The zarr format version. + Returns ------- Self The wrapped data type. """ - if cls.check_dict(data): - return cls._from_dict_unsafe(data) - raise DataTypeValidationError(f"Invalid JSON representation of data type {cls}.") + if cls.check_json(data, zarr_format=zarr_format): + return cls._from_json_unsafe(data, zarr_format=zarr_format) + raise DataTypeValidationError(f"Invalid JSON representation of data type {cls}: {data}") @classmethod - def _from_dict_unsafe(cls: type[Self], data: dict[str, JSON]) -> Self: + @abstractmethod + def _from_json_unsafe(cls: type[Self], data: JSON, zarr_format: ZarrFormat) -> Self: """ Wrap a JSON representation of a data type. Parameters ---------- - data : dict[str, JSON] + data : JSON The JSON representation of the data type. - Returns - ------- - Self - The wrapped data type. - """ - config = data.get("configuration", {}) - return cls(**config) - - def get_name(self, zarr_format: ZarrFormat) -> str: - """ - Return the name of the wrapped data type. - - Parameters - ---------- zarr_format : ZarrFormat The zarr format version. Returns ------- - str - The name of the wrapped data type. - - Notes - ----- - This is a method, rather than an attribute, because the name of the data type may depend on - parameters that are not known until a concrete data type is wrapped. - - As the names of data types vary between zarr versions, this method takes a ``zarr_format`` - parameter + Self + The wrapped data type. 
""" - if zarr_format == 2: - return self.to_dtype().str - return self._zarr_v3_name + ... @abstractmethod def to_json_value(self, data: TScalar, *, zarr_format: ZarrFormat) -> JSON: @@ -255,9 +230,9 @@ def to_json_value(self, data: TScalar, *, zarr_format: ZarrFormat) -> JSON: Returns ------- JSON - The JSON-serializable format. + The JSON-serializable form of the scalar. """ - raise NotImplementedError + ... @abstractmethod def from_json_value(self: Self, data: JSON, *, zarr_format: ZarrFormat) -> TScalar: @@ -274,6 +249,6 @@ def from_json_value(self: Self, data: JSON, *, zarr_format: ZarrFormat) -> TScal Returns ------- TScalar - The numpy scalar. + The native scalar value. """ - raise NotImplementedError + ... diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 94c69602af..d26ca52353 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -3,13 +3,13 @@ import base64 import warnings from collections.abc import Iterable -from typing import TYPE_CHECKING, TypedDict, cast +from typing import TYPE_CHECKING, TypedDict import numcodecs.abc from zarr.abc.metadata import Metadata -from zarr.core.dtype import get_data_type_from_numpy -from zarr.core.dtype.wrapper import DTypeWrapper +from zarr.core.dtype import get_data_type_from_native_dtype +from zarr.core.dtype.wrapper import TDType, TScalar, ZDType, _BaseDType, _BaseScalar if TYPE_CHECKING: from typing import Any, Literal, Self @@ -45,7 +45,7 @@ class ArrayV2MetadataDict(TypedDict): class ArrayV2Metadata(Metadata): shape: ChunkCoords chunks: ChunkCoords - dtype: DTypeWrapper[Any, Any] + dtype: ZDType[_BaseDType, _BaseScalar] fill_value: int | float | str | bytes | None = 0 order: MemoryOrder = "C" filters: tuple[numcodecs.abc.Codec, ...] 
| None = None @@ -58,7 +58,7 @@ def __init__( self, *, shape: ChunkCoords, - dtype: DTypeWrapper[Any, Any], + dtype: ZDType[TDType, TScalar], chunks: ChunkCoords, fill_value: Any, order: MemoryOrder, @@ -73,7 +73,7 @@ def __init__( shape_parsed = parse_shapelike(shape) chunks_parsed = parse_shapelike(chunks) # TODO: remove this - if not isinstance(dtype, DTypeWrapper): + if not isinstance(dtype, ZDType): raise TypeError compressor_parsed = parse_compressor(compressor) order_parsed = parse_indexing_order(order) @@ -122,7 +122,7 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: _data = data.copy() # check that the zarr_format attribute is correct _ = parse_zarr_format(_data.pop("zarr_format")) - dtype = get_data_type_from_numpy(_data["dtype"]) + dtype = get_data_type_from_native_dtype(_data["dtype"]) _data["dtype"] = dtype if dtype.to_dtype().kind in "SV": fill_value_encoded = _data.get("fill_value") @@ -163,6 +163,10 @@ def to_dict(self) -> dict[str, JSON]: if zarray_dict["filters"] is not None: raw_filters = zarray_dict["filters"] + # TODO: remove this when we can stratically type the output JSON data structure + # entirely + if not isinstance(raw_filters, list | tuple): + raise TypeError("Invalid type for filters. Expected a list or tuple.") new_filters = [] for f in raw_filters: if isinstance(f, numcodecs.abc.Codec): @@ -172,13 +176,10 @@ def to_dict(self) -> dict[str, JSON]: zarray_dict["filters"] = new_filters if self.fill_value is not None: - # There's a relationship between self.dtype and self.fill_value - # that mypy isn't aware of. The fact that we have S or V dtype here - # means we should have a bytes-type fill_value. 
- fill_value = self.dtype.to_json_value(self.fill_value, zarr_format=2) + fill_value = self.dtype.to_json_value(self.fill_value, zarr_format=2) # type: ignore[arg-type] zarray_dict["fill_value"] = fill_value - zarray_dict["dtype"] = self.dtype.get_name(zarr_format=2) + zarray_dict["dtype"] = self.dtype.to_json(zarr_format=2) return zarray_dict @@ -312,22 +313,3 @@ def parse_fill_value(fill_value: object, dtype: np.dtype[Any]) -> Any: raise ValueError(msg) from e return fill_value - - -def _default_compressor( - dtype: DTypeWrapper[Any, Any], -) -> dict[str, JSON] | None: - """Get the default filters and compressor for a dtype. - - https://numpy.org/doc/2.1/reference/generated/numpy.dtype.kind.html - """ - default_compressor = config.get("array.v2_default_compressor") - return cast(dict[str, JSON] | None, default_compressor.get(dtype.kind, None)) - - -def _default_filters( - dtype: DTypeWrapper, -) -> list[dict[str, JSON]] | None: - """Get the default filters and compressor for a dtype.""" - default_filters = config.get("array.v2_default_filters") - return cast(list[dict[str, JSON]] | None, default_filters.get(dtype.kind, None)) diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 2c6e65037e..117bb3c573 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -5,9 +5,9 @@ from zarr.abc.metadata import Metadata from zarr.core.buffer.core import default_buffer_prototype from zarr.core.dtype import ( - DTypeWrapper, VariableLengthString, - get_data_type_from_dict, + ZDType, + get_data_type_from_json, ) if TYPE_CHECKING: @@ -17,6 +17,7 @@ from zarr.core.buffer import Buffer, BufferPrototype from zarr.core.chunk_grids import ChunkGrid from zarr.core.common import JSON, ChunkCoords + from zarr.core.dtype.wrapper import _BaseDType, _BaseScalar import json @@ -95,7 +96,7 @@ def validate_array_bytes_codec(codecs: tuple[Codec, ...]) -> ArrayBytesCodec: return abcs[0] -def validate_codecs(codecs: tuple[Codec, ...], dtype: 
DTypeWrapper[Any, Any]) -> None: +def validate_codecs(codecs: tuple[Codec, ...], dtype: ZDType[_BaseDType, _BaseScalar]) -> None: """Check that the codecs are valid for the given dtype""" from zarr.codecs.sharding import ShardingCodec @@ -234,7 +235,7 @@ class ArrayV3MetadataDict(TypedDict): @dataclass(frozen=True, kw_only=True) class ArrayV3Metadata(Metadata): shape: ChunkCoords - data_type: DTypeWrapper[Any, Any] + data_type: ZDType[_BaseDType, _BaseScalar] chunk_grid: ChunkGrid chunk_key_encoding: ChunkKeyEncoding fill_value: Any @@ -249,7 +250,7 @@ def __init__( self, *, shape: Iterable[int], - data_type: DTypeWrapper[Any, Any], + data_type: ZDType[_BaseDType, _BaseScalar], chunk_grid: dict[str, JSON] | ChunkGrid, chunk_key_encoding: ChunkKeyEncodingLike, fill_value: object, @@ -263,7 +264,7 @@ def __init__( """ # TODO: remove this - if not isinstance(data_type, DTypeWrapper): + if not isinstance(data_type, ZDType): raise TypeError shape_parsed = parse_shapelike(shape) chunk_grid_parsed = ChunkGrid.from_dict(chunk_grid) @@ -276,7 +277,7 @@ def __init__( array_spec = ArraySpec( shape=shape_parsed, - dtype=data_type.to_dtype(), + dtype=data_type, fill_value=fill_value_parsed, config=ArrayConfig.from_dict({}), # TODO: config is not needed here. prototype=default_buffer_prototype(), # TODO: prototype is not needed here. 
@@ -310,9 +311,7 @@ def _validate_metadata(self) -> None: if self.fill_value is None: raise ValueError("`fill_value` is required.") for codec in self.codecs: - codec.validate( - shape=self.shape, dtype=self.data_type.to_dtype(), chunk_grid=self.chunk_grid - ) + codec.validate(shape=self.shape, dtype=self.data_type, chunk_grid=self.chunk_grid) @property def ndim(self) -> int: @@ -380,10 +379,7 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: _ = parse_node_type_array(_data.pop("node_type")) data_type_json = _data.pop("data_type") - if isinstance(data_type_json, str): - data_type = get_data_type_from_dict({"name": data_type_json}) - else: - data_type = get_data_type_from_dict(data_type_json) + data_type = get_data_type_from_json(data_type_json, zarr_format=3) # check that the fill value is consistent with the data type fill_value_parsed = data_type.from_json_value(_data.pop("fill_value"), zarr_format=3) @@ -408,9 +404,15 @@ def to_dict(self) -> dict[str, JSON]: # the metadata document if out_dict["dimension_names"] is None: out_dict.pop("dimension_names") - # if data_type has no configuration, we just serialize the name - if "configuration" not in out_dict["data_type"]: - out_dict["data_type"] = out_dict["data_type"]["name"] + + # TODO: replace the `to_dict` / `from_dict` on the `Metadata`` class with + # to_json, from_json, and have ZDType inherit from `Metadata` + # until then, we have this hack here + dtype_meta = out_dict["data_type"] + + if isinstance(dtype_meta, ZDType): + out_dict["data_type"] = dtype_meta.to_json(zarr_format=3) + return out_dict def update_shape(self, shape: ChunkCoords) -> Self: diff --git a/src/zarr/testing/strategies.py b/src/zarr/testing/strategies.py index 2eef703448..6c2e8f7762 100644 --- a/src/zarr/testing/strategies.py +++ b/src/zarr/testing/strategies.py @@ -15,7 +15,7 @@ from zarr.core.chunk_grids import RegularChunkGrid from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding from zarr.core.common import ZarrFormat 
-from zarr.core.dtype import parse_data_type +from zarr.core.dtype import get_data_type_from_native_dtype from zarr.core.metadata import ArrayV2Metadata, ArrayV3Metadata from zarr.core.sync import sync from zarr.storage import MemoryStore, StoreLike @@ -135,7 +135,7 @@ def array_metadata( ndim = len(shape) chunk_shape = draw(array_shapes(min_dims=ndim, max_dims=ndim)) np_dtype = draw(v3_dtypes()) - dtype = parse_data_type(np_dtype) + dtype = get_data_type_from_native_dtype(np_dtype) fill_value = draw(npst.from_dtype(np_dtype)) if zarr_format == 2: return ArrayV2Metadata( diff --git a/tests/conftest.py b/tests/conftest.py index 5e17c82a37..b2c106f2e2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -20,7 +20,7 @@ from zarr.core.chunk_grids import RegularChunkGrid, _auto_partition from zarr.core.common import JSON, parse_shapelike from zarr.core.config import config as zarr_config -from zarr.core.dtype import get_data_type_from_numpy +from zarr.core.dtype import get_data_type_from_native_dtype from zarr.core.metadata.v2 import ArrayV2Metadata from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.core.sync import sync @@ -253,7 +253,7 @@ def create_array_metadata( """ Create array metadata """ - dtype_parsed = get_data_type_from_numpy(dtype) + dtype_parsed = get_data_type_from_native_dtype(dtype) shape_parsed = parse_shapelike(shape) chunk_key_encoding_parsed = _parse_chunk_key_encoding( chunk_key_encoding, zarr_format=zarr_format diff --git a/tests/test_array.py b/tests/test_array.py index b2f21d6562..aa61860fa1 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -38,7 +38,7 @@ from zarr.core.buffer.cpu import NDBuffer from zarr.core.chunk_grids import _auto_partition from zarr.core.common import JSON, MemoryOrder, ZarrFormat -from zarr.core.dtype import get_data_type_from_numpy +from zarr.core.dtype import get_data_type_from_native_dtype from zarr.core.group import AsyncGroup from zarr.core.indexing import BasicIndexer, ceildiv from 
zarr.core.sync import sync @@ -1035,7 +1035,7 @@ async def test_v2_chunk_encoding( filters=filters, ) filters_expected, compressor_expected = _parse_chunk_encoding_v2( - filters=filters, compressor=compressors, dtype=get_data_type_from_numpy(dtype) + filters=filters, compressor=compressors, dtype=get_data_type_from_native_dtype(dtype) ) assert arr.metadata.zarr_format == 2 # guard for mypy assert arr.metadata.compressor == compressor_expected @@ -1056,7 +1056,7 @@ async def test_default_filters_compressors( """ Test that the default ``filters`` and ``compressors`` are used when ``create_array`` is invoked with ``filters`` and ``compressors`` unspecified. """ - zdtype = get_data_type_from_numpy(dtype_str) + zdtype = get_data_type_from_native_dtype(dtype_str) arr = await create_array( store=store, dtype=dtype_str, diff --git a/tests/test_codecs/test_vlen.py b/tests/test_codecs/test_vlen.py index ee3415a501..b1508953ea 100644 --- a/tests/test_codecs/test_vlen.py +++ b/tests/test_codecs/test_vlen.py @@ -8,7 +8,7 @@ from zarr.abc.codec import Codec from zarr.abc.store import Store from zarr.codecs import ZstdCodec -from zarr.core.dtype import get_data_type_from_numpy +from zarr.core.dtype import get_data_type_from_native_dtype from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING from zarr.storage import StorePath @@ -52,12 +52,12 @@ def test_vlen_string( else: a[:, :] = data assert np.array_equal(data, a[:, :]) - assert a.metadata.data_type == get_data_type_from_numpy(data.dtype) + assert a.metadata.data_type == get_data_type_from_native_dtype(data.dtype) assert a.dtype == data.dtype # test round trip b = Array.open(sp) assert isinstance(b.metadata, ArrayV3Metadata) # needed for mypy assert np.array_equal(data, b[:, :]) - assert b.metadata.data_type == get_data_type_from_numpy(data.dtype) + assert b.metadata.data_type == get_data_type_from_native_dtype(data.dtype) assert a.dtype == data.dtype diff --git 
a/tests/test_config.py b/tests/test_config.py index 34ecfdc119..8d6e0a53ed 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -24,7 +24,7 @@ from zarr.core.buffer import NDBuffer from zarr.core.codec_pipeline import BatchedCodecPipeline from zarr.core.config import BadConfigError, config -from zarr.core.dtype import get_data_type_from_numpy +from zarr.core.dtype import get_data_type_from_native_dtype from zarr.core.indexing import SelectorTuple from zarr.registry import ( fully_qualified_name, @@ -57,15 +57,15 @@ def test_config_defaults_set() -> None: "v2_default_compressor": {"default": {"id": "zstd", "level": 0, "checksum": False}}, "v2_default_filters": { "default": None, - "variable_length_utf8": [{"id": "vlen-utf8"}], - "fixed_length_ucs4": [{"id": "vlen-utf8"}], - "fixed_length_ascii": [{"id": "vlen-bytes"}], + "numpy__variable_length_utf8": [{"id": "vlen-utf8"}], + "numpy__fixed_length_ucs4": [{"id": "vlen-utf8"}], + "numpy__fixed_length_ascii": [{"id": "vlen-bytes"}], }, "v3_default_filters": {"default": []}, "v3_default_serializer": { "default": {"name": "bytes", "configuration": {"endian": "little"}}, - "variable_length_utf8": {"name": "vlen-utf8"}, - "fixed_length_ucs4": {"name": "vlen-utf8"}, + "numpy__variable_length_utf8": {"name": "vlen-utf8"}, + "numpy__fixed_length_ucs4": {"name": "vlen-utf8"}, "r*": {"name": "vlen-bytes"}, }, "v3_default_compressors": { @@ -302,10 +302,10 @@ async def test_default_codecs(dtype: str) -> None: """ Test that the default compressors are sensitive to the current setting of the config. 
""" - zdtype = get_data_type_from_numpy(dtype) + zdtype = get_data_type_from_native_dtype(dtype) expected_compressors = (GzipCodec(),) new_conf = { - f"array.v3_default_compressors.{zdtype._zarr_v3_name}": [ + f"array.v3_default_compressors.{zdtype._zarr_v3_name.replace('.', '__')}": [ c.to_dict() for c in expected_compressors ] } diff --git a/tests/test_metadata/test_consolidated.py b/tests/test_metadata/test_consolidated.py index 508519e696..c9ab06f1e2 100644 --- a/tests/test_metadata/test_consolidated.py +++ b/tests/test_metadata/test_consolidated.py @@ -504,7 +504,7 @@ async def test_consolidated_metadata_backwards_compatibility( async def test_consolidated_metadata_v2(self): store = zarr.storage.MemoryStore() g = await AsyncGroup.from_store(store, attributes={"key": "root"}, zarr_format=2) - dtype = parse_data_type("uint8") + dtype = parse_data_type("uint8", zarr_format=2) await g.create_array(name="a", shape=(1,), attributes={"key": "a"}, dtype=dtype) g1 = await g.create_group(name="g1", attributes={"key": "g1"}) await g1.create_group(name="g2", attributes={"key": "g2"}) diff --git a/tests/test_metadata/test_dtype.py b/tests/test_metadata/test_dtype.py index ee19cdf845..db575ee16a 100644 --- a/tests/test_metadata/test_dtype.py +++ b/tests/test_metadata/test_dtype.py @@ -1,14 +1,19 @@ from __future__ import annotations -from typing import Any, get_args +import re +from typing import TYPE_CHECKING, Any, get_args + +if TYPE_CHECKING: + from zarr.core.common import ZarrFormat + from zarr.core.dtype.wrapper import _BaseDType, _BaseScalar import numpy as np import pytest from zarr.core.dtype import ( DTYPE, - DTypeWrapper, VariableLengthString, + ZDType, data_type_registry, ) from zarr.core.dtype._numpy import ( @@ -16,9 +21,9 @@ Complex64, Complex128, DateTime64, - FixedLengthAsciiString, + FixedLengthAscii, FixedLengthBytes, - FixedLengthUnicodeString, + FixedLengthUnicode, Float16, Float32, Float64, @@ -37,7 +42,7 @@ @pytest.fixture -def dtype_registry() -> 
DataTypeRegistry: +def data_type_registry_fixture() -> DataTypeRegistry: return DataTypeRegistry() @@ -66,15 +71,15 @@ def dtype_registry() -> DataTypeRegistry: (Float64, "float64"), (Complex64, "complex64"), (Complex128, "complex128"), - (FixedLengthUnicodeString, "U"), - (FixedLengthAsciiString, "S"), + (FixedLengthUnicode, "U"), + (FixedLengthAscii, "S"), (FixedLengthBytes, "V"), (VariableLengthString, VLEN_STRING_CODE), (Structured, np.dtype([("a", np.float64), ("b", np.int8)])), (DateTime64, "datetime64[s]"), ], ) -def test_wrap(wrapper_cls: type[DTypeWrapper[Any, Any]], np_dtype: np.dtype | str) -> None: +def test_wrap(wrapper_cls: type[ZDType[_BaseDType, _BaseScalar]], np_dtype: np.dtype | str) -> None: """ Test that the wrapper class has the correct dtype class bound to the dtype_cls variable Test that the ``wrap`` method produces an instance of the wrapper class @@ -92,13 +97,13 @@ def test_wrap(wrapper_cls: type[DTypeWrapper[Any, Any]], np_dtype: np.dtype | st @pytest.mark.parametrize("wrapper_cls", get_args(DTYPE)) -def test_dict_serialization(wrapper_cls: DTYPE) -> None: +def test_dict_serialization(wrapper_cls: DTYPE, zarr_format: ZarrFormat) -> None: if issubclass(wrapper_cls, Structured): instance = wrapper_cls(fields=((("a", Bool()),))) else: instance = wrapper_cls() - as_dict = instance.to_dict() - assert wrapper_cls.from_dict(as_dict) == instance + as_dict = instance.to_json(zarr_format=zarr_format) + assert wrapper_cls.from_json(as_dict, zarr_format=zarr_format) == instance @pytest.mark.parametrize( @@ -118,9 +123,9 @@ def test_dict_serialization(wrapper_cls: DTYPE) -> None: (Float64(), np.float64(0)), (Complex64(), np.complex64(0)), (Complex128(), np.complex128(0)), - (FixedLengthAsciiString(length=3), np.bytes_(b"")), + (FixedLengthAscii(length=3), np.bytes_(b"")), (FixedLengthBytes(length=3), np.void(b"\x00\x00\x00")), - (FixedLengthUnicodeString(length=3), np.str_("")), + (FixedLengthUnicode(length=3), np.str_("")), ( Structured(fields=(("a", 
Float64()), ("b", Int8()))), np.array([0], dtype=[("a", np.float64), ("b", np.int8)])[0], @@ -129,7 +134,9 @@ def test_dict_serialization(wrapper_cls: DTYPE) -> None: (DateTime64(unit="s"), np.datetime64("NaT")), ], ) -def test_default_value(wrapper: type[DTypeWrapper[Any, Any]], expected_default: Any) -> None: +def test_default_value( + wrapper: type[ZDType[_BaseDType, _BaseScalar]], expected_default: Any +) -> None: """ Test that the default_value method is correctly set for each dtype wrapper. """ @@ -156,15 +163,15 @@ def test_default_value(wrapper: type[DTypeWrapper[Any, Any]], expected_default: (Float64(), np.float64(42.0), 42.0), (Complex64(), np.complex64(42.0 + 1.0j), (42.0, 1.0)), (Complex128(), np.complex128(42.0 + 1.0j), (42.0, 1.0)), - (FixedLengthAsciiString(length=4), np.bytes_(b"test"), "dGVzdA=="), + (FixedLengthAscii(length=4), np.bytes_(b"test"), "dGVzdA=="), (FixedLengthBytes(length=4), np.void(b"test"), "dGVzdA=="), - (FixedLengthUnicodeString(length=4), np.str_("test"), "test"), + (FixedLengthUnicode(length=4), np.str_("test"), "test"), (VariableLengthString(), "test", "test"), (DateTime64(unit="s"), np.datetime64("2021-01-01T00:00:00", "s"), 1609459200), ], ) def test_to_json_value_v2( - wrapper: type[DTypeWrapper[Any, Any]], input_value: Any, expected_json: Any + wrapper: type[ZDType[_BaseDType, _BaseScalar]], input_value: Any, expected_json: Any ) -> None: """ Test the to_json_value method for each dtype wrapper for zarr v2 @@ -189,15 +196,15 @@ def test_to_json_value_v2( (Float64(), 42.0, np.float64(42.0)), (Complex64(), (42.0, 1.0), np.complex64(42.0 + 1.0j)), (Complex128(), (42.0, 1.0), np.complex128(42.0 + 1.0j)), - (FixedLengthAsciiString(length=4), "dGVzdA==", np.bytes_(b"test")), + (FixedLengthAscii(length=4), "dGVzdA==", np.bytes_(b"test")), (FixedLengthBytes(length=4), "dGVzdA==", np.void(b"test")), - (FixedLengthUnicodeString(length=4), "test", np.str_("test")), + (FixedLengthUnicode(length=4), "test", np.str_("test")), 
(VariableLengthString(), "test", "test"), (DateTime64(unit="s"), 1609459200, np.datetime64("2021-01-01T00:00:00", "s")), ], ) def test_from_json_value( - wrapper: type[DTypeWrapper[Any, Any]], json_value: Any, expected_value: Any + wrapper: type[ZDType[_BaseDType, _BaseScalar]], json_value: Any, expected_value: Any ) -> None: """ Test the from_json_value method for each dtype wrapper. @@ -207,43 +214,45 @@ def test_from_json_value( class TestRegistry: @staticmethod - def test_register(dtype_registry: DataTypeRegistry) -> None: + def test_register(data_type_registry_fixture: DataTypeRegistry) -> None: """ Test that registering a dtype in a data type registry works. """ - dtype_registry.register(Bool._zarr_v3_name, Bool) - assert dtype_registry.get(Bool._zarr_v3_name) == Bool - assert isinstance(dtype_registry.match_dtype(np.dtype("bool")), Bool) + data_type_registry_fixture.register(Bool._zarr_v3_name, Bool) + assert data_type_registry_fixture.get(Bool._zarr_v3_name) == Bool + assert isinstance(data_type_registry_fixture.match_dtype(np.dtype("bool")), Bool) @staticmethod - def test_override(dtype_registry: DataTypeRegistry) -> None: + def test_override(data_type_registry_fixture: DataTypeRegistry) -> None: """ Test that registering a new dtype with the same name works (overriding the previous one). 
""" - dtype_registry.register(Bool._zarr_v3_name, Bool) + data_type_registry_fixture.register(Bool._zarr_v3_name, Bool) class NewBool(Bool): def default_value(self) -> np.bool_: return np.True_ - dtype_registry.register(NewBool._zarr_v3_name, NewBool) - assert isinstance(dtype_registry.match_dtype(np.dtype("bool")), NewBool) + data_type_registry_fixture.register(NewBool._zarr_v3_name, NewBool) + assert isinstance(data_type_registry_fixture.match_dtype(np.dtype("bool")), NewBool) @staticmethod @pytest.mark.parametrize( - ("wrapper_cls", "dtype_str"), [(Bool, "bool"), (FixedLengthUnicodeString, "|U4")] + ("wrapper_cls", "dtype_str"), [(Bool, "bool"), (FixedLengthUnicode, "|U4")] ) def test_match_dtype( - dtype_registry: DataTypeRegistry, wrapper_cls: type[DTypeWrapper[Any, Any]], dtype_str: str + data_type_registry_fixture: DataTypeRegistry, + wrapper_cls: type[ZDType[_BaseDType, _BaseScalar]], + dtype_str: str, ) -> None: """ Test that match_dtype resolves a numpy dtype into an instance of the correspond wrapper for that dtype. """ - dtype_registry.register(wrapper_cls._zarr_v3_name, wrapper_cls) - assert isinstance(dtype_registry.match_dtype(np.dtype(dtype_str)), wrapper_cls) + data_type_registry_fixture.register(wrapper_cls._zarr_v3_name, wrapper_cls) + assert isinstance(data_type_registry_fixture.match_dtype(np.dtype(dtype_str)), wrapper_cls) @staticmethod - def test_unregistered_dtype(dtype_registry: DataTypeRegistry) -> None: + def test_unregistered_dtype(data_type_registry_fixture: DataTypeRegistry) -> None: """ Test that match_dtype raises an error if the dtype is not registered. 
""" @@ -251,14 +260,16 @@ def test_unregistered_dtype(dtype_registry: DataTypeRegistry) -> None: with pytest.raises( ValueError, match=f"No data type wrapper found that matches dtype '{outside_dtype}'" ): - dtype_registry.match_dtype(np.dtype(outside_dtype)) + data_type_registry_fixture.match_dtype(np.dtype(outside_dtype)) with pytest.raises(KeyError): - dtype_registry.get(outside_dtype) + data_type_registry_fixture.get(outside_dtype) @staticmethod @pytest.mark.parametrize("wrapper_cls", get_args(DTYPE)) - def test_registered_dtypes(wrapper_cls: DTypeWrapper[Any, Any]) -> None: + def test_registered_dtypes( + wrapper_cls: ZDType[_BaseDType, _BaseScalar], zarr_format: ZarrFormat + ) -> None: """ Test that the registered dtypes can be retrieved from the registry. """ @@ -268,3 +279,40 @@ def test_registered_dtypes(wrapper_cls: DTypeWrapper[Any, Any]) -> None: instance = wrapper_cls() assert data_type_registry.match_dtype(instance.to_dtype()) == instance + assert ( + data_type_registry.match_json( + instance.to_json(zarr_format=zarr_format), zarr_format=zarr_format + ) + == instance + ) + + @staticmethod + @pytest.mark.parametrize("wrapper_cls", get_args(DTYPE)) + def test_match_dtype_unique( + wrapper_cls: ZDType[_BaseDType, _BaseScalar], + data_type_registry_fixture: DataTypeRegistry, + zarr_format: ZarrFormat, + ) -> None: + """ + Test that the match_dtype method uniquely specifies a registered data type. 
We create a local registry + that excludes the data type class being tested, and ensure that an instance of the wrapped data type + fails to match anything in the registry + """ + for _cls in get_args(DTYPE): + if _cls is not wrapper_cls: + data_type_registry_fixture.register(_cls._zarr_v3_name, _cls) + + if issubclass(wrapper_cls, Structured): + instance = wrapper_cls(fields=((("a", Bool()),))) + else: + instance = wrapper_cls() + dtype_instance = instance.to_dtype() + + msg = f"No data type wrapper found that matches dtype '{dtype_instance}'" + with pytest.raises(ValueError, match=re.escape(msg)): + data_type_registry_fixture.match_dtype(dtype_instance) + + instance_dict = instance.to_json(zarr_format=zarr_format) + msg = f"No data type wrapper found that matches {instance_dict}" + with pytest.raises(ValueError, match=re.escape(msg)): + data_type_registry_fixture.match_json(instance_dict, zarr_format=zarr_format) diff --git a/tests/test_metadata/test_v3.py b/tests/test_metadata/test_v3.py index 23f28ab097..bd5f9be8b6 100644 --- a/tests/test_metadata/test_v3.py +++ b/tests/test_metadata/test_v3.py @@ -11,7 +11,7 @@ from zarr.core.buffer import default_buffer_prototype from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding, V2ChunkKeyEncoding from zarr.core.config import config -from zarr.core.dtype import get_data_type_from_numpy +from zarr.core.dtype import get_data_type_from_native_dtype from zarr.core.dtype._numpy import DateTime64 from zarr.core.dtype.common import complex_from_json from zarr.core.group import GroupMetadata, parse_node_type @@ -128,7 +128,7 @@ def test_jsonify_fill_value_complex(fill_value: Any, dtype_str: str) -> None: as length-2 sequences """ zarr_format = 3 - dtype = get_data_type_from_numpy(dtype_str) + dtype = get_data_type_from_native_dtype(dtype_str) expected = dtype.to_dtype().type(complex(*fill_value)) observed = dtype.from_json_value(fill_value, zarr_format=zarr_format) assert observed == expected @@ -142,7 +142,7 @@ def 
test_complex_to_json_invalid(data: object, dtype_str: str) -> None: Test that parse_fill_value(fill_value, dtype) correctly rejects sequences with length not equal to 2 """ - dtype_instance = get_data_type_from_numpy(dtype_str) + dtype_instance = get_data_type_from_native_dtype(dtype_str) match = f"Invalid type: {data}. Expected a sequence of two numbers." with pytest.raises(TypeError, match=re.escape(match)): complex_from_json(data=data, dtype=dtype_instance, zarr_format=3) @@ -155,7 +155,7 @@ def test_parse_fill_value_invalid_type(fill_value: Any, dtype_str: str) -> None: Test that parse_fill_value(fill_value, dtype) raises TypeError for invalid non-sequential types. This test excludes bool because the bool constructor takes anything. """ - dtype_instance = get_data_type_from_numpy(dtype_str) + dtype_instance = get_data_type_from_native_dtype(dtype_str) with pytest.raises(TypeError, match=f"Invalid type: {fill_value}"): dtype_instance.from_json_value(fill_value, zarr_format=3) @@ -176,7 +176,7 @@ def test_parse_fill_value_invalid_type_sequence(fill_value: Any, dtype_str: str) This test excludes bool because the bool constructor takes anything, and complex because complex values can be created from length-2 sequences. 
""" - dtype_instance = get_data_type_from_numpy(dtype_str) + dtype_instance = get_data_type_from_native_dtype(dtype_str) with pytest.raises(TypeError, match=re.escape(f"Invalid type: {fill_value}")): dtype_instance.from_json_value(fill_value, zarr_format=3) @@ -277,10 +277,12 @@ async def test_datetime_metadata(fill_value: int, precision: str) -> None: "node_type": "array", "shape": (1,), "chunk_grid": {"name": "regular", "configuration": {"chunk_shape": (1,)}}, - "data_type": dtype.to_dict(), + "data_type": dtype.to_json(zarr_format=3), "chunk_key_encoding": {"name": "default", "separator": "."}, "codecs": (BytesCodec(),), - "fill_value": dtype.to_json_value(dtype.cast_value(fill_value), zarr_format=3), + "fill_value": dtype.to_json_value( + dtype.to_dtype().type(fill_value, dtype.unit), zarr_format=3 + ), } metadata = ArrayV3Metadata.from_dict(metadata_dict) # ensure there isn't a TypeError here. From cbb159d5ee1f0ff01adf9cb02f8cef2a7715244e Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 19 Mar 2025 22:33:01 +0100 Subject: [PATCH 040/130] update code examples in docs; remove native endianness --- docs/user-guide/arrays.rst | 12 +++---- docs/user-guide/consolidated_metadata.rst | 6 ++-- docs/user-guide/data_types.rst | 6 ++-- docs/user-guide/groups.rst | 4 +-- docs/user-guide/performance.rst | 10 +++--- src/zarr/core/_info.py | 5 ++- src/zarr/core/dtype/_numpy.py | 41 +++++++++++------------ src/zarr/core/dtype/common.py | 2 +- 8 files changed, 42 insertions(+), 44 deletions(-) diff --git a/docs/user-guide/arrays.rst b/docs/user-guide/arrays.rst index a62b2ea0fa..f55dd00c80 100644 --- a/docs/user-guide/arrays.rst +++ b/docs/user-guide/arrays.rst @@ -182,7 +182,7 @@ which can be used to print useful diagnostics, e.g.:: >>> z.info Type : Array Zarr format : 3 - Data type : DataType.int32 + Data type : int32 Shape : (10000, 10000) Chunk shape : (1000, 1000) Order : C @@ -199,7 +199,7 @@ prints additional diagnostics, e.g.:: >>> z.info_complete() Type : 
Array Zarr format : 3 - Data type : DataType.int32 + Data type : int32 Shape : (10000, 10000) Chunk shape : (1000, 1000) Order : C @@ -286,7 +286,7 @@ Here is an example using a delta filter with the Blosc compressor:: >>> z.info Type : Array Zarr format : 3 - Data type : DataType.int32 + Data type : int32 Shape : (10000, 10000) Chunk shape : (1000, 1000) Order : C @@ -600,7 +600,7 @@ Sharded arrays can be created by providing the ``shards`` parameter to :func:`za >>> a.info_complete() Type : Array Zarr format : 3 - Data type : DataType.uint8 + Data type : uint8 Shape : (10000, 10000) Shard shape : (1000, 1000) Chunk shape : (100, 100) @@ -608,10 +608,10 @@ Sharded arrays can be created by providing the ``shards`` parameter to :func:`za Read-only : False Store type : LocalStore Filters : () - Serializer : BytesCodec(endian=) + Serializer : BytesCodec(endian=None) Compressors : (ZstdCodec(level=0, checksum=False),) No. bytes : 100000000 (95.4M) - No. bytes stored : 3981552 + No. bytes stored : 3981473 Storage ratio : 25.1 Shards Initialized : 100 diff --git a/docs/user-guide/consolidated_metadata.rst b/docs/user-guide/consolidated_metadata.rst index 3c015dcfca..fc410de7d4 100644 --- a/docs/user-guide/consolidated_metadata.rst +++ b/docs/user-guide/consolidated_metadata.rst @@ -47,7 +47,7 @@ that can be used.: >>> from pprint import pprint >>> pprint(dict(sorted(consolidated_metadata.items()))) {'a': ArrayV3Metadata(shape=(1,), - data_type=, + data_type=Float64(endianness='little'), chunk_grid=RegularChunkGrid(chunk_shape=(1,)), chunk_key_encoding=DefaultChunkKeyEncoding(name='default', separator='/'), @@ -60,7 +60,7 @@ that can be used.: node_type='array', storage_transformers=()), 'b': ArrayV3Metadata(shape=(2, 2), - data_type=, + data_type=Float64(endianness='little'), chunk_grid=RegularChunkGrid(chunk_shape=(2, 2)), chunk_key_encoding=DefaultChunkKeyEncoding(name='default', separator='/'), @@ -73,7 +73,7 @@ that can be used.: node_type='array', 
storage_transformers=()), 'c': ArrayV3Metadata(shape=(3, 3, 3), - data_type=, + data_type=Float64(endianness='little'), chunk_grid=RegularChunkGrid(chunk_shape=(3, 3, 3)), chunk_key_encoding=DefaultChunkKeyEncoding(name='default', separator='/'), diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index fffd622209..b964439706 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -10,9 +10,9 @@ Zarr-Python supports creating arrays with Numpy data types:: >>> import zarr >>> import numpy as np - >>> zarr.create_array(store={}, shape=(10,), dtype=np.dtype('uint8')) + >>> z = zarr.create_array(store={}, shape=(10,), dtype=np.dtype('uint8')) >>> z - + Unlike Numpy arrays, Zarr arrays are designed to be persisted to storage and read by Zarr implementations in different programming languages. This means Zarr data types must be interpreted correctly when clients read an array. So each Zarr data type defines a procedure for @@ -34,7 +34,7 @@ Thus the JSON identifier for a Numpy-compatible data type is just the Numpy ``st >>> dtype_meta = json.loads(store['.zarray'].to_bytes())["dtype"] >>> assert dtype_meta == np_dtype.str # True >>> dtype_meta - `_, or "byte order", of the data type. Following Numpy's example, diff --git a/docs/user-guide/groups.rst b/docs/user-guide/groups.rst index 4268004f70..c2a955718b 100644 --- a/docs/user-guide/groups.rst +++ b/docs/user-guide/groups.rst @@ -128,7 +128,7 @@ property. E.g.:: >>> bar.info_complete() Type : Array Zarr format : 3 - Data type : DataType.int64 + Data type : int64 Shape : (1000000,) Chunk shape : (100000,) Order : C @@ -144,7 +144,7 @@ property. 
E.g.:: >>> baz.info Type : Array Zarr format : 3 - Data type : DataType.float32 + Data type : float32 Shape : (1000, 1000) Chunk shape : (100, 100) Order : C diff --git a/docs/user-guide/performance.rst b/docs/user-guide/performance.rst index 42d830780f..5c7844f92c 100644 --- a/docs/user-guide/performance.rst +++ b/docs/user-guide/performance.rst @@ -52,7 +52,7 @@ a chunk shape is based on simple heuristics and may be far from optimal. E.g.:: >>> z4 = zarr.create_array(store={}, shape=(10000, 10000), chunks='auto', dtype='int32') >>> z4.chunks - (625, 625) + (313, 625) If you know you are always going to be loading the entire array into memory, you can turn off chunks by providing ``chunks`` equal to ``shape``, in which case there @@ -91,7 +91,7 @@ To use sharding, you need to specify the ``shards`` parameter when creating the >>> z6.info Type : Array Zarr format : 3 - Data type : DataType.uint8 + Data type : uint8 Shape : (10000, 10000, 1000) Shard shape : (1000, 1000, 1000) Chunk shape : (100, 100, 100) @@ -99,7 +99,7 @@ To use sharding, you need to specify the ``shards`` parameter when creating the Read-only : False Store type : MemoryStore Filters : () - Serializer : BytesCodec(endian=) + Serializer : BytesCodec(endian=None) Compressors : (ZstdCodec(level=0, checksum=False),) No. bytes : 100000000000 (93.1G) @@ -121,7 +121,7 @@ ratios, depending on the correlation structure within the data. E.g.:: >>> c.info_complete() Type : Array Zarr format : 3 - Data type : DataType.int32 + Data type : int32 Shape : (10000, 10000) Chunk shape : (1000, 1000) Order : C @@ -140,7 +140,7 @@ ratios, depending on the correlation structure within the data. 
E.g.:: >>> f.info_complete() Type : Array Zarr format : 3 - Data type : DataType.int32 + Data type : int32 Shape : (10000, 10000) Chunk shape : (1000, 1000) Order : F diff --git a/src/zarr/core/_info.py b/src/zarr/core/_info.py index c9637b156a..525b80c65f 100644 --- a/src/zarr/core/_info.py +++ b/src/zarr/core/_info.py @@ -2,11 +2,10 @@ import dataclasses import textwrap -from typing import TYPE_CHECKING, Any, Literal +from typing import TYPE_CHECKING, Literal if TYPE_CHECKING: import numcodecs.abc - import numpy as np from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec from zarr.core.common import ZarrFormat @@ -81,7 +80,7 @@ class ArrayInfo: _type: Literal["Array"] = "Array" _zarr_format: ZarrFormat - _data_type: np.dtype[Any] | ZDType[_BaseDType, _BaseScalar] + _data_type: ZDType[_BaseDType, _BaseScalar] _shape: tuple[int, ...] _shard_shape: tuple[int, ...] | None = None _chunk_shape: tuple[int, ...] | None = None diff --git a/src/zarr/core/dtype/_numpy.py b/src/zarr/core/dtype/_numpy.py index a8bd2b5951..f8ebc807d3 100644 --- a/src/zarr/core/dtype/_numpy.py +++ b/src/zarr/core/dtype/_numpy.py @@ -2,6 +2,7 @@ import base64 import re +import sys from collections.abc import Sequence from dataclasses import dataclass from typing import ( @@ -40,7 +41,7 @@ if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat -EndiannessNumpy = Literal[">", "<", "=", "|"] +EndiannessNumpy = Literal[">", "<", "|", "="] @dataclass(frozen=True, kw_only=True) @@ -57,7 +58,7 @@ class Bool(ZDType[np.dtypes.BoolDType, np.bool_]): """ _zarr_v3_name = "bool" - _zarr_v2_names: ClassVar[tuple[str,...]] = ("|b1",) + _zarr_v2_names: ClassVar[tuple[str, ...]] = ("|b1",) dtype_cls = np.dtypes.BoolDType @classmethod @@ -314,7 +315,7 @@ class Int16(ZDType[np.dtypes.Int16DType, np.int16]): dtype_cls = np.dtypes.Int16DType _zarr_v3_name = "int16" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i2", " Self: @@ -370,7 +371,7 @@ class 
UInt16(ZDType[np.dtypes.UInt16DType, np.uint16]): dtype_cls = np.dtypes.UInt16DType _zarr_v3_name = "uint16" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u2", " Self: @@ -426,7 +427,7 @@ class Int32(ZDType[np.dtypes.Int32DType, np.int32]): dtype_cls = np.dtypes.Int32DType _zarr_v3_name = "int32" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i4", " Self: @@ -482,7 +483,7 @@ class UInt32(ZDType[np.dtypes.UInt32DType, np.uint32]): dtype_cls = np.dtypes.UInt32DType _zarr_v3_name = "uint32" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u4", " Self: @@ -538,7 +539,7 @@ class Int64(ZDType[np.dtypes.Int64DType, np.int64]): dtype_cls = np.dtypes.Int64DType _zarr_v3_name = "int64" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i8", " Self: @@ -594,7 +595,7 @@ class UInt64(ZDType[np.dtypes.UInt64DType, np.uint64]): dtype_cls = np.dtypes.UInt64DType _zarr_v3_name = "uint64" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u8", " Self: @@ -650,7 +651,7 @@ class Float16(ZDType[np.dtypes.Float16DType, np.float16]): dtype_cls = np.dtypes.Float16DType _zarr_v3_name = "float16" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">f2", " Self: @@ -706,7 +707,7 @@ class Float32(ZDType[np.dtypes.Float32DType, np.float32]): dtype_cls = np.dtypes.Float32DType _zarr_v3_name = "float32" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">f4", " Self: @@ -762,7 +763,7 @@ class Float64(ZDType[np.dtypes.Float64DType, np.float64]): dtype_cls = np.dtypes.Float64DType _zarr_v3_name = "float64" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">f8", " Self: @@ -818,7 +819,7 @@ class Complex64(ZDType[np.dtypes.Complex64DType, np.complex64]): dtype_cls = np.dtypes.Complex64DType _zarr_v3_name = "complex64" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">c8", " Self: @@ -876,7 +877,7 @@ class Complex128(ZDType[np.dtypes.Complex128DType, np.complex128]): dtype_cls = np.dtypes.Complex128DType _zarr_v3_name = "complex128" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">c16", " Self: @@ -1079,7 +1080,7 @@ class 
FixedLengthUnicode(ZDType[np.dtypes.StrDType[int], np.str_]): dtype_cls = np.dtypes.StrDType _zarr_v3_name = "numpy.fixed_length_ucs4" item_size_bits: ClassVar[int] = 32 # UCS4 is 32 bits per code point - endianness: Endianness | None = "native" + endianness: Endianness | None = "little" length: int = 1 @classmethod @@ -1263,7 +1264,7 @@ class DateTime64(ZDType[np.dtypes.DateTime64DType, np.datetime64]): dtype_cls = np.dtypes.DateTime64DType # type: ignore[assignment] _zarr_v3_name = "numpy.datetime64" unit: DateUnit | TimeUnit = "s" - endianness: Endianness | None = "native" + endianness: Endianness | None = "little" @classmethod def _from_dtype_unsafe(cls, dtype: np.dtypes.DateTime64DType) -> Self: @@ -1457,7 +1458,7 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: raise TypeError(f"Invalid type: {data}. Expected a string.") as_bytes = bytes_from_json(data, zarr_format=zarr_format) dtype = self.to_dtype() - return cast(np.void, np.array([as_bytes], dtype=dtype.str).view(dtype)[0]) + return cast("np.void", np.array([as_bytes], dtype=dtype.str).view(dtype)[0]) def endianness_to_numpy_str(endianness: Endianness | None) -> EndiannessNumpy: @@ -1471,7 +1472,7 @@ def endianness_to_numpy_str(endianness: Endianness | None) -> EndiannessNumpy: Returns ------- - Literal[">", "<", "=", "|"] + Literal[">", "<", "|"] The numpy string representation of the endianness. Raises @@ -1484,8 +1485,6 @@ def endianness_to_numpy_str(endianness: Endianness | None) -> EndiannessNumpy: return "<" case "big": return ">" - case "native": - return "=" case None: return "|" raise ValueError( @@ -1513,12 +1512,12 @@ def endianness_from_numpy_str(endianness: EndiannessNumpy) -> Endianness | None: If the endianness is invalid. 
""" match endianness: + case "=": + return sys.byteorder case "<": return "little" case ">": return "big" - case "=": - return "native" case "|": return None raise ValueError( diff --git a/src/zarr/core/dtype/common.py b/src/zarr/core/dtype/common.py index 2c4910338e..4e24d64ad9 100644 --- a/src/zarr/core/dtype/common.py +++ b/src/zarr/core/dtype/common.py @@ -10,7 +10,7 @@ from zarr.core.common import JSON, ZarrFormat from zarr.core.dtype._numpy import DateUnit, TimeUnit -Endianness = Literal["little", "big", "native"] +Endianness = Literal["little", "big"] JSONFloat = float | Literal["NaN", "Infinity", "-Infinity"] From bb1186724ca31020685a69cbc9a8369069943b02 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 20 Mar 2025 15:34:45 +0100 Subject: [PATCH 041/130] adjust type annotations --- src/zarr/api/asynchronous.py | 2 +- src/zarr/codecs/_v2.py | 2 +- src/zarr/core/_info.py | 6 +++--- src/zarr/core/array.py | 6 ++++-- src/zarr/core/array_spec.py | 6 +++--- src/zarr/core/dtype/__init__.py | 4 ++++ src/zarr/core/dtype/wrapper.py | 4 ++-- src/zarr/core/metadata/v3.py | 9 +++++---- tests/test_array.py | 13 +++++++------ tests/test_info.py | 6 +++--- 10 files changed, 33 insertions(+), 25 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 72a12f9acb..9a9e00c972 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -990,7 +990,7 @@ async def create( chunks = shape default_filters, default_compressor = _get_default_chunk_encoding_v2(dtype_wrapped) if filters is None: - filters = default_filters + filters = default_filters # type: ignore[assignment] if compressor is None: compressor = default_compressor elif zarr_format == 3 and chunk_shape is None: # type: ignore[redundant-expr] diff --git a/src/zarr/codecs/_v2.py b/src/zarr/codecs/_v2.py index a89d1f5fa4..c03e3c55fb 100644 --- a/src/zarr/codecs/_v2.py +++ b/src/zarr/codecs/_v2.py @@ -46,7 +46,7 @@ async def _decode_single( chunk = 
ensure_ndarray_like(chunk) # special case object dtype, because incorrect handling can lead to # segfaults and other bad things happening - if chunk_spec.dtype != object: + if chunk_spec.dtype.dtype_cls is not np.dtypes.ObjectDType: try: chunk = chunk.view(chunk_spec.dtype.to_dtype()) except TypeError: diff --git a/src/zarr/core/_info.py b/src/zarr/core/_info.py index 525b80c65f..3e605773bb 100644 --- a/src/zarr/core/_info.py +++ b/src/zarr/core/_info.py @@ -2,14 +2,14 @@ import dataclasses import textwrap -from typing import TYPE_CHECKING, Literal +from typing import TYPE_CHECKING, Any, Literal if TYPE_CHECKING: import numcodecs.abc from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec from zarr.core.common import ZarrFormat - from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar + from zarr.core.dtype.wrapper import ZDType @dataclasses.dataclass(kw_only=True) @@ -80,7 +80,7 @@ class ArrayInfo: _type: Literal["Array"] = "Array" _zarr_format: ZarrFormat - _data_type: ZDType[_BaseDType, _BaseScalar] + _data_type: ZDType[Any, Any] _shape: tuple[int, ...] _shard_shape: tuple[int, ...] | None = None _chunk_shape: tuple[int, ...] | None = None diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 7b6eb455fc..91d6954e6c 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -680,6 +680,8 @@ def _create_metadata_v3( """ Create an instance of ArrayV3Metadata. """ + filters: tuple[ArrayArrayCodec, ...] + compressors: tuple[BytesBytesCodec, ...] 
shape = parse_shapelike(shape) if codecs is None: @@ -707,7 +709,7 @@ def _create_metadata_v3( chunk_grid=chunk_grid_parsed, chunk_key_encoding=chunk_key_encoding_parsed, fill_value=fill_value_parsed, - codecs=codecs_parsed, + codecs=codecs_parsed, # type: ignore[arg-type] dimension_names=tuple(dimension_names) if dimension_names else None, attributes=attributes or {}, ) @@ -1712,7 +1714,7 @@ def _info( ) -> Any: return ArrayInfo( _zarr_format=self.metadata.zarr_format, - _data_type=self.dtype, + _data_type=self._zdtype, _shape=self.shape, _order=self.order, _shard_shape=self.shards, diff --git a/src/zarr/core/array_spec.py b/src/zarr/core/array_spec.py index e8e451944f..f1eac930c4 100644 --- a/src/zarr/core/array_spec.py +++ b/src/zarr/core/array_spec.py @@ -17,7 +17,7 @@ from zarr.core.buffer import BufferPrototype from zarr.core.common import ChunkCoords - from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar + from zarr.core.dtype.wrapper import ZDType class ArrayConfigParams(TypedDict): @@ -89,7 +89,7 @@ def parse_array_config(data: ArrayConfigLike | None) -> ArrayConfig: @dataclass(frozen=True) class ArraySpec: shape: ChunkCoords - dtype: ZDType[_BaseDType, _BaseScalar] + dtype: ZDType[Any, Any] fill_value: Any config: ArrayConfig prototype: BufferPrototype @@ -97,7 +97,7 @@ class ArraySpec: def __init__( self, shape: ChunkCoords, - dtype: ZDType[_BaseDType, _BaseScalar], + dtype: ZDType[Any, Any], fill_value: Any, config: ArrayConfig, prototype: BufferPrototype, diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py index 4e594f8796..fc494030f1 100644 --- a/src/zarr/core/dtype/__init__.py +++ b/src/zarr/core/dtype/__init__.py @@ -84,7 +84,11 @@ data_type_registry.register(dtype._zarr_v3_name, dtype) +# TODO: find a better name for this function def get_data_type_from_native_dtype(dtype: npt.DTypeLike) -> ZDType[_BaseDType, _BaseScalar]: + """ + Get a data type wrapper (an instance of ``ZDType``) from a native data type, 
e.g. a numpy dtype. + """ data_type_registry.lazy_load() if not isinstance(dtype, np.dtype): if dtype in (str, "str"): diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index 8707c3cda0..3409fa7ca4 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -22,7 +22,7 @@ TDType = TypeVar("TDType", bound=_BaseDType) -@dataclass(frozen=True, kw_only=True) +@dataclass(frozen=True, kw_only=True, slots=True) class ZDType(Generic[TDType, TScalar], ABC): """ Abstract base class for wrapping native array data types, e.g. numpy dtypes @@ -62,7 +62,7 @@ def check_dtype(cls: type[Self], dtype: _BaseDType) -> TypeGuard[TDType]: return type(dtype) is cls.dtype_cls @classmethod - def from_dtype(cls: type[Self], dtype: TDType) -> Self: + def from_dtype(cls: type[Self], dtype: _BaseDType) -> Self: """ Wrap a dtype object. diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 117bb3c573..889946c6ea 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -109,7 +109,8 @@ def validate_codecs(codecs: tuple[Codec, ...], dtype: ZDType[_BaseDType, _BaseSc # we need to have special codecs if we are decoding vlen strings or bytestrings # TODO: use codec ID instead of class name codec_class_name = abc.__class__.__name__ - if isinstance(dtype, VariableLengthString) and not codec_class_name == "VLenUTF8Codec": + # TODO: Fix typing here + if isinstance(dtype, VariableLengthString) and not codec_class_name == "VLenUTF8Codec": # type: ignore[unreachable] raise ValueError( f"For string dtype, ArrayBytesCodec must be `VLenUTF8Codec`, got `{codec_class_name}`." 
) @@ -407,11 +408,11 @@ def to_dict(self) -> dict[str, JSON]: # TODO: replace the `to_dict` / `from_dict` on the `Metadata`` class with # to_json, from_json, and have ZDType inherit from `Metadata` - # until then, we have this hack here + # until then, we have this hack here, which relies on the fact that to_dict will pass through + # any non-`Metadata` fields as-is. dtype_meta = out_dict["data_type"] - if isinstance(dtype_meta, ZDType): - out_dict["data_type"] = dtype_meta.to_json(zarr_format=3) + out_dict["data_type"] = dtype_meta.to_json(zarr_format=3) # type: ignore[unreachable] return out_dict diff --git a/tests/test_array.py b/tests/test_array.py index aa61860fa1..5ed5ba06b7 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -39,6 +39,7 @@ from zarr.core.chunk_grids import _auto_partition from zarr.core.common import JSON, MemoryOrder, ZarrFormat from zarr.core.dtype import get_data_type_from_native_dtype +from zarr.core.dtype._numpy import Float64 from zarr.core.group import AsyncGroup from zarr.core.indexing import BasicIndexer, ceildiv from zarr.core.sync import sync @@ -448,7 +449,7 @@ def test_info_v2(self, chunks: tuple[int, int], shards: tuple[int, int] | None) result = arr.info expected = ArrayInfo( _zarr_format=2, - _data_type=arr.dtype, + _data_type=arr._async_array._zdtype, _shape=(8, 8), _chunk_shape=chunks, _shard_shape=None, @@ -465,7 +466,7 @@ def test_info_v3(self, chunks: tuple[int, int], shards: tuple[int, int] | None) result = arr.info expected = ArrayInfo( _zarr_format=3, - _data_type=arr.dtype, + _data_type=arr._async_array._zdtype, _shape=(8, 8), _chunk_shape=chunks, _shard_shape=shards, @@ -490,7 +491,7 @@ def test_info_complete(self, chunks: tuple[int, int], shards: tuple[int, int] | result = arr.info_complete() expected = ArrayInfo( _zarr_format=3, - _data_type=arr.dtype, + _data_type=arr._async_array._zdtype, _shape=(8, 8), _chunk_shape=chunks, _shard_shape=shards, @@ -525,7 +526,7 @@ async def test_info_v2_async( result = 
arr.info expected = ArrayInfo( _zarr_format=2, - _data_type=np.dtype("float64"), + _data_type=Float64(), _shape=(8, 8), _chunk_shape=(2, 2), _shard_shape=None, @@ -550,7 +551,7 @@ async def test_info_v3_async( result = arr.info expected = ArrayInfo( _zarr_format=3, - _data_type=arr.dtype, + _data_type=arr._zdtype, _shape=(8, 8), _chunk_shape=chunks, _shard_shape=shards, @@ -577,7 +578,7 @@ async def test_info_complete_async( result = await arr.info_complete() expected = ArrayInfo( _zarr_format=3, - _data_type=arr.dtype, + _data_type=arr._zdtype, _shape=(8, 8), _chunk_shape=chunks, _shard_shape=shards, diff --git a/tests/test_info.py b/tests/test_info.py index db0fd0ef76..8662be0ab0 100644 --- a/tests/test_info.py +++ b/tests/test_info.py @@ -1,11 +1,11 @@ import textwrap -import numpy as np import pytest from zarr.codecs.bytes import BytesCodec from zarr.core._info import ArrayInfo, GroupInfo, human_readable_size from zarr.core.common import ZarrFormat +from zarr.core.dtype._numpy import Int32 ZARR_FORMATS = [2, 3] @@ -53,7 +53,7 @@ def test_group_info_complete(zarr_format: ZarrFormat) -> None: def test_array_info(zarr_format: ZarrFormat) -> None: info = ArrayInfo( _zarr_format=zarr_format, - _data_type=np.dtype("int32"), + _data_type=Int32(), _shape=(100, 100), _chunk_shape=(10, 100), _order="C", @@ -91,7 +91,7 @@ def test_array_info_complete( ) = bytes_things info = ArrayInfo( _zarr_format=zarr_format, - _data_type=np.dtype("int32"), + _data_type=Int32(), _shape=(100, 100), _chunk_shape=(10, 100), _order="C", From 7a619e0be900e8979e58c3bedfa980a104920dd0 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 20 Mar 2025 15:35:08 +0100 Subject: [PATCH 042/130] fix info tests to use zdtype --- tests/test_info.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_info.py b/tests/test_info.py index 8662be0ab0..2e465b6a21 100644 --- a/tests/test_info.py +++ b/tests/test_info.py @@ -65,7 +65,7 @@ def test_array_info(zarr_format: 
ZarrFormat) -> None: assert result == textwrap.dedent(f"""\ Type : Array Zarr format : {zarr_format} - Data type : int32 + Data type : Int32(endianness='little') Shape : (100, 100) Chunk shape : (10, 100) Order : C @@ -106,7 +106,7 @@ def test_array_info_complete( assert result == textwrap.dedent(f"""\ Type : Array Zarr format : {zarr_format} - Data type : int32 + Data type : Int32(endianness='little') Shape : (100, 100) Chunk shape : (10, 100) Order : C From ea2d0bf6ccedd57297e570d28a0fe6706de43604 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 20 Mar 2025 16:31:05 +0100 Subject: [PATCH 043/130] remove dead code and add code coverage exemption to zarr format checks --- src/zarr/core/dtype/_numpy.py | 116 +++++++++++++++++----------------- src/zarr/core/dtype/common.py | 2 +- src/zarr/core/metadata/v3.py | 59 ----------------- 3 files changed, 59 insertions(+), 118 deletions(-) diff --git a/src/zarr/core/dtype/_numpy.py b/src/zarr/core/dtype/_numpy.py index f8ebc807d3..55bd86a61d 100644 --- a/src/zarr/core/dtype/_numpy.py +++ b/src/zarr/core/dtype/_numpy.py @@ -77,14 +77,14 @@ def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[Literal["b return data in cls._zarr_v2_names elif zarr_format == 3: return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def to_json(self, zarr_format: ZarrFormat) -> str: if zarr_format == 2: return self.to_dtype().str elif zarr_format == 3: return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: @@ -162,14 +162,14 @@ def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[Literal["i return data in cls._zarr_v2_names elif zarr_format == 3: return data 
== cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def to_json(self, zarr_format: ZarrFormat) -> str: if zarr_format == 2: return self.to_dtype().str elif zarr_format == 3: return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: @@ -247,14 +247,14 @@ def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[Literal["u return data in cls._zarr_v2_names elif zarr_format == 3: return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def to_json(self, zarr_format: ZarrFormat) -> str: if zarr_format == 2: return self.to_dtype().str elif zarr_format == 3: return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: @@ -337,14 +337,14 @@ def check_json( return data in cls._zarr_v2_names elif zarr_format == 3: return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def to_json(self, zarr_format: ZarrFormat) -> str: if zarr_format == 2: return self.to_dtype().str elif zarr_format == 3: return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: @@ -352,7 
+352,7 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def default_value(self) -> np.int16: return self.to_dtype().type(0) @@ -393,14 +393,14 @@ def check_json( return data in cls._zarr_v2_names elif zarr_format == 3: return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def to_json(self, zarr_format: ZarrFormat) -> str: if zarr_format == 2: return self.to_dtype().str elif zarr_format == 3: return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: @@ -408,7 +408,7 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def default_value(self) -> np.uint16: return self.to_dtype().type(0) @@ -449,14 +449,14 @@ def check_json( return data in cls._zarr_v2_names elif zarr_format == 3: return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def to_json(self, zarr_format: ZarrFormat) -> str: if zarr_format == 2: return self.to_dtype().str elif zarr_format == 3: return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got 
{zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: @@ -464,7 +464,7 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def default_value(self) -> np.int32: return self.to_dtype().type(0) @@ -505,14 +505,14 @@ def check_json( return data in cls._zarr_v2_names elif zarr_format == 3: return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def to_json(self, zarr_format: ZarrFormat) -> str: if zarr_format == 2: return self.to_dtype().str elif zarr_format == 3: return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: @@ -520,7 +520,7 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def default_value(self) -> np.uint32: return self.to_dtype().type(0) @@ -561,14 +561,14 @@ def check_json( return data in cls._zarr_v2_names elif zarr_format == 3: return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def 
to_json(self, zarr_format: ZarrFormat) -> str: if zarr_format == 2: return self.to_dtype().str elif zarr_format == 3: return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: @@ -576,7 +576,7 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def default_value(self) -> np.int64: return self.to_dtype().type(0) @@ -617,14 +617,14 @@ def check_json( return data in cls._zarr_v2_names elif zarr_format == 3: return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def to_json(self, zarr_format: ZarrFormat) -> str: if zarr_format == 2: return self.to_dtype().str elif zarr_format == 3: return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: @@ -632,7 +632,7 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def default_value(self) -> np.uint64: return self.to_dtype().type(0) @@ -673,14 +673,14 @@ def check_json( return data in cls._zarr_v2_names elif zarr_format == 3: return 
data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def to_json(self, zarr_format: ZarrFormat) -> str: if zarr_format == 2: return self.to_dtype().str elif zarr_format == 3: return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: @@ -688,7 +688,7 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def default_value(self) -> np.float16: return self.to_dtype().type(0) @@ -729,14 +729,14 @@ def check_json( return data in cls._zarr_v2_names elif zarr_format == 3: return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def to_json(self, zarr_format: ZarrFormat) -> str: if zarr_format == 2: return self.to_dtype().str elif zarr_format == 3: return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: @@ -744,7 +744,7 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") 
# pragma: no cover def default_value(self) -> np.float32: return self.to_dtype().type(0) @@ -785,14 +785,14 @@ def check_json( return data in cls._zarr_v2_names elif zarr_format == 3: return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def to_json(self, zarr_format: ZarrFormat) -> str: if zarr_format == 2: return self.to_dtype().str elif zarr_format == 3: return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: @@ -800,7 +800,7 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def default_value(self) -> np.float64: return self.to_dtype().type(0) @@ -841,14 +841,14 @@ def check_json( return data in cls._zarr_v2_names elif zarr_format == 3: return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def to_json(self, zarr_format: ZarrFormat) -> str: if zarr_format == 2: return self.to_dtype().str elif zarr_format == 3: return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: @@ -856,7 +856,7 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls.from_dtype(np.dtype(data)) # type: 
ignore[arg-type] elif zarr_format == 3: return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def default_value(self) -> np.complex64: return self.to_dtype().type(0) @@ -899,14 +899,14 @@ def check_json( return data in cls._zarr_v2_names elif zarr_format == 3: return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def to_json(self, zarr_format: ZarrFormat) -> str: if zarr_format == 2: return self.to_dtype().str elif zarr_format == 3: return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: @@ -914,7 +914,7 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def default_value(self) -> np.complex128: return self.to_dtype().type(0) @@ -962,7 +962,7 @@ def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: and "length_bits" in data["configuration"] and isinstance(data["configuration"]["length_bits"], int) ) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def to_json(self, zarr_format: ZarrFormat) -> JSON: if zarr_format == 2: @@ -972,7 +972,7 @@ def to_json(self, zarr_format: ZarrFormat) -> JSON: "name": self._zarr_v3_name, "configuration": {"length_bits": self.length * self.item_size_bits}, } - raise 
ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: @@ -980,7 +980,7 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls(length=data["configuration"]["length_bits"] // cls.item_size_bits) # type: ignore[arg-type, index, call-overload, operator] - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def default_value(self) -> np.bytes_: return np.bytes_(b"") @@ -1025,14 +1025,14 @@ def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: and isinstance(data["name"], str) and (re.match(r"^r\d+$", data["name"]) is not None) ) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def to_json(self, zarr_format: ZarrFormat) -> JSON: if zarr_format == 2: return self.to_dtype().str elif zarr_format == 3: return {"name": f"r{self.length * self.item_size_bits}"} - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: @@ -1040,7 +1040,7 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls(length=int(data["name"][1:]) // cls.item_size_bits) # type: ignore[arg-type, index, call-overload] - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod def 
check_dtype(cls: type[Self], dtype: _BaseDType) -> TypeGuard[np.dtypes.VoidDType[Any]]: @@ -1113,7 +1113,7 @@ def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: and "length_bits" in data["configuration"] and isinstance(data["configuration"]["length_bits"], int) ) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def to_json(self, zarr_format: ZarrFormat) -> JSON: if zarr_format == 2: @@ -1123,7 +1123,7 @@ def to_json(self, zarr_format: ZarrFormat) -> JSON: "name": self._zarr_v3_name, "configuration": {"length_bits": self.length * self.item_size_bits}, } - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: @@ -1131,7 +1131,7 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls(length=data["configuration"]["length_bits"] // cls.item_size_bits) # type: ignore[arg-type, index, call-overload, operator] - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def default_value(self) -> np.str_: return np.str_("") @@ -1174,7 +1174,7 @@ def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: return data == "|O" elif zarr_format == 3: return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def to_json(self, zarr_format: ZarrFormat) -> JSON: if zarr_format == 2: @@ -1185,7 +1185,7 @@ def to_json(self, zarr_format: ZarrFormat) -> JSON: return "|O" elif zarr_format == 3: return 
self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: @@ -1227,14 +1227,14 @@ def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: return data == "|O" elif zarr_format == 3: return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def to_json(self, zarr_format: ZarrFormat) -> JSON: if zarr_format == 2: return self.to_dtype().str elif zarr_format == 3: return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: @@ -1306,7 +1306,7 @@ def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: and "unit" in data["configuration"] and data["configuration"]["unit"] in get_args(DateUnit) + get_args(TimeUnit) ) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def default_value(self) -> np.datetime64: return np.datetime64("NaT") @@ -1316,7 +1316,7 @@ def to_json(self, zarr_format: ZarrFormat) -> JSON: return self.to_dtype().str elif zarr_format == 3: return {"name": self._zarr_v3_name, "configuration": {"unit": self.unit}} - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: @@ -1324,7 +1324,7 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return 
cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls(unit=data["configuration"]["unit"]) # type: ignore[arg-type, index, call-overload] - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.datetime64: if check_json_int(data): @@ -1391,7 +1391,7 @@ def to_json(self, zarr_format: ZarrFormat) -> JSON: base_dict = {"name": self._zarr_v3_name} base_dict["configuration"] = {"fields": fields} # type: ignore[assignment] return cast("JSON", base_dict) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod def check_json( @@ -1416,7 +1416,7 @@ def check_json( and isinstance(data["configuration"], dict) and "fields" in data["configuration"] ) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: @@ -1441,7 +1441,7 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: for f_name, f_dtype in meta_fields ) return cls(fields=fields) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover raise DataTypeValidationError(f"Invalid JSON representation of data type {cls}.") def to_dtype(self) -> np.dtypes.VoidDType[int]: diff --git a/src/zarr/core/dtype/common.py b/src/zarr/core/dtype/common.py index 4e24d64ad9..78dc6bdacd 100644 --- a/src/zarr/core/dtype/common.py +++ b/src/zarr/core/dtype/common.py @@ -530,4 +530,4 @@ def datetime_from_json(data: int, unit: DateUnit | TimeUnit) -> np.datetime64: np.datetime64 The datetime64 value. 
""" - return cast(np.datetime64, np.int64(data).view(f"datetime64[{unit}]")) + return cast("np.datetime64", np.int64(data).view(f"datetime64[{unit}]")) diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 889946c6ea..ead05b5e44 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -23,10 +23,8 @@ import json from collections.abc import Iterable from dataclasses import dataclass, field, replace -from enum import Enum from typing import Any, Literal -import numcodecs.abc import numpy as np from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec @@ -169,60 +167,6 @@ def __init__( default=default, ) - def default(self, o: object) -> Any: - if isinstance(o, np.dtype): - return str(o) - if np.isscalar(o): - out: Any - if hasattr(o, "dtype") and o.dtype.kind == "M" and hasattr(o, "view"): - # https://github.com/zarr-developers/zarr-python/issues/2119 - # `.item()` on a datetime type might or might not return an - # integer, depending on the value. 
- # Explicitly cast to an int first, and then grab .item() - out = o.view("i8").item() - else: - # convert numpy scalar to python type, and pass - # python types through - out = getattr(o, "item", lambda: o)() - if isinstance(out, complex): - # python complex types are not JSON serializable, so we use the - # serialization defined in the zarr v3 spec - return _replace_special_floats([out.real, out.imag]) - elif np.isnan(out): - return "NaN" - elif np.isinf(out): - return "Infinity" if out > 0 else "-Infinity" - return out - elif isinstance(o, Enum): - return o.name - # this serializes numcodecs compressors - # todo: implement to_dict for codecs - elif isinstance(o, numcodecs.abc.Codec): - config: dict[str, Any] = o.get_config() - return config - else: - return super().default(o) - - -def _replace_special_floats(obj: object) -> Any: - """Helper function to replace NaN/Inf/-Inf values with special strings - - Note: this cannot be done in the V3JsonEncoder because Python's `json.dumps` optimistically - converts NaN/Inf values to special types outside of the encoding step. 
- """ - if isinstance(obj, float): - if np.isnan(obj): - return "NaN" - elif np.isinf(obj): - return "Infinity" if obj > 0 else "-Infinity" - elif isinstance(obj, dict): - # Recursively replace in dictionaries - return {k: _replace_special_floats(v) for k, v in obj.items()} - elif isinstance(obj, list): - # Recursively replace in lists - return [_replace_special_floats(item) for item in obj] - return obj - class ArrayV3MetadataDict(TypedDict): """ @@ -264,9 +208,6 @@ def __init__( Because the class is a frozen dataclass, we set attributes using object.__setattr__ """ - # TODO: remove this - if not isinstance(data_type, ZDType): - raise TypeError shape_parsed = parse_shapelike(shape) chunk_grid_parsed = ChunkGrid.from_dict(chunk_grid) chunk_key_encoding_parsed = ChunkKeyEncoding.from_dict(chunk_key_encoding) From 042c9e5e2b7d43172067376c4099d7a1ce4d7c08 Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Thu, 20 Mar 2025 17:50:53 +0100 Subject: [PATCH 044/130] fix: add special check for resolving int32 on windows --- src/zarr/core/dtype/_numpy.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/zarr/core/dtype/_numpy.py b/src/zarr/core/dtype/_numpy.py index 55bd86a61d..241626e6ac 100644 --- a/src/zarr/core/dtype/_numpy.py +++ b/src/zarr/core/dtype/_numpy.py @@ -429,6 +429,15 @@ class Int32(ZDType[np.dtypes.Int32DType, np.int32]): _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i4", " Self: + # We override the base implementation to address a windows-specific, pre-numpy 2 issue where + # ``np.dtype('i')`` is an instance of ``np.dtypes.IntDType`` that acts like `int32` instead of ``np.dtype('int32')`` + if dtype == np.dtypes.Int32DType(): + return cls._from_dtype_unsafe(np.dtypes.Int32DType().newbyteorder(dtype.byteorder)) + else: + return super().from_dtype(dtype) + @classmethod def _from_dtype_unsafe(cls, dtype: np.dtypes.Int32DType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) From def5eb24976a335f13d18c1ae36a094b7e588729 Mon Sep 17 
00:00:00 2001 From: Davis Vann Bennett Date: Thu, 20 Mar 2025 22:26:05 +0100 Subject: [PATCH 045/130] add dtype entry point test --- src/zarr/core/dtype/registry.py | 3 + .../entry_points.txt | 2 + tests/package_with_entrypoint/__init__.py | 19 ++++++ tests/{test_metadata => }/test_dtype.py | 66 ++++++++++++++----- 4 files changed, 74 insertions(+), 16 deletions(-) rename tests/{test_metadata => }/test_dtype.py (84%) diff --git a/src/zarr/core/dtype/registry.py b/src/zarr/core/dtype/registry.py index 0d07ab2b9d..4ad2158f96 100644 --- a/src/zarr/core/dtype/registry.py +++ b/src/zarr/core/dtype/registry.py @@ -12,6 +12,9 @@ from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar +# This class is different from the other registry classes, which inherit from +# dict. IMO it's simpler to just do a dataclass. But long-term we should +# have just 1 registry class in use. @dataclass(frozen=True, kw_only=True) class DataTypeRegistry: contents: dict[str, type[ZDType[_BaseDType, _BaseScalar]]] = field( diff --git a/tests/package_with_entrypoint-0.1.dist-info/entry_points.txt b/tests/package_with_entrypoint-0.1.dist-info/entry_points.txt index eee724c912..7eb0eb7c86 100644 --- a/tests/package_with_entrypoint-0.1.dist-info/entry_points.txt +++ b/tests/package_with_entrypoint-0.1.dist-info/entry_points.txt @@ -12,3 +12,5 @@ another_buffer = package_with_entrypoint:TestEntrypointGroup.Buffer another_ndbuffer = package_with_entrypoint:TestEntrypointGroup.NDBuffer [zarr.codec_pipeline] another_pipeline = package_with_entrypoint:TestEntrypointGroup.Pipeline +[zarr.data_type] +new_data_type = package_with_entrypoint:TestDataType \ No newline at end of file diff --git a/tests/package_with_entrypoint/__init__.py b/tests/package_with_entrypoint/__init__.py index b818adf8ea..eed2ac43e5 100644 --- a/tests/package_with_entrypoint/__init__.py +++ b/tests/package_with_entrypoint/__init__.py @@ -1,4 +1,5 @@ from collections.abc import Iterable +from typing import Any, Literal, 
Self from numpy import ndarray @@ -8,6 +9,7 @@ from zarr.core.array_spec import ArraySpec from zarr.core.buffer import Buffer, NDBuffer from zarr.core.common import BytesLike +from zarr.core.dtype import Bool class TestEntrypointCodec(ArrayBytesCodec): @@ -64,3 +66,20 @@ class NDBuffer(zarr.core.buffer.NDBuffer): class Pipeline(CodecPipeline): pass + + +class TestDataType(Bool): + """ + This is a "data type" that serializes to "test" + """ + + _zarr_v3_name = "test" + + @classmethod + def from_json(cls, data: Any, zarr_format: Literal[2, 3]) -> Self: + if data == cls._zarr_v3_name: + return cls() + raise ValueError + + def to_json(self, zarr_format): + return self._zarr_v3_name diff --git a/tests/test_metadata/test_dtype.py b/tests/test_dtype.py similarity index 84% rename from tests/test_metadata/test_dtype.py rename to tests/test_dtype.py index db575ee16a..f690e6ce26 100644 --- a/tests/test_metadata/test_dtype.py +++ b/tests/test_dtype.py @@ -1,9 +1,16 @@ from __future__ import annotations +import os import re +import sys from typing import TYPE_CHECKING, Any, get_args +import zarr +from zarr.core.config import config + if TYPE_CHECKING: + from collections.abc import Generator + from zarr.core.common import ZarrFormat from zarr.core.dtype.wrapper import _BaseDType, _BaseScalar @@ -15,6 +22,7 @@ VariableLengthString, ZDType, data_type_registry, + get_data_type_from_json, ) from zarr.core.dtype._numpy import ( Bool, @@ -47,6 +55,7 @@ def data_type_registry_fixture() -> DataTypeRegistry: _NUMPY_SUPPORTS_VLEN_STRING = hasattr(np.dtypes, "StringDType") +VLEN_STRING_DTYPE: np.dtypes.StringDType | np.dtypes.ObjectDType if _NUMPY_SUPPORTS_VLEN_STRING: VLEN_STRING_DTYPE = np.dtypes.StringDType() VLEN_STRING_CODE = "T" @@ -79,7 +88,9 @@ def data_type_registry_fixture() -> DataTypeRegistry: (DateTime64, "datetime64[s]"), ], ) -def test_wrap(wrapper_cls: type[ZDType[_BaseDType, _BaseScalar]], np_dtype: np.dtype | str) -> None: +def test_wrap( + wrapper_cls: 
type[ZDType[_BaseDType, _BaseScalar]], np_dtype: np.dtype[np.generic] | str +) -> None: """ Test that the wrapper class has the correct dtype class bound to the dtype_cls variable Test that the ``wrap`` method produces an instance of the wrapper class @@ -90,14 +101,14 @@ def test_wrap(wrapper_cls: type[ZDType[_BaseDType, _BaseScalar]], np_dtype: np.d wrapped = wrapper_cls.from_dtype(dt) with pytest.raises(DataTypeValidationError, match="Invalid dtype"): - wrapper_cls.from_dtype("not a dtype") + wrapper_cls.from_dtype("not a dtype") # type: ignore[arg-type] assert isinstance(wrapped, wrapper_cls) assert wrapped.to_dtype() == dt @pytest.mark.parametrize("wrapper_cls", get_args(DTYPE)) -def test_dict_serialization(wrapper_cls: DTYPE, zarr_format: ZarrFormat) -> None: +def test_dict_serialization(wrapper_cls: Any, zarr_format: ZarrFormat) -> None: if issubclass(wrapper_cls, Structured): instance = wrapper_cls(fields=((("a", Bool()),))) else: @@ -127,16 +138,14 @@ def test_dict_serialization(wrapper_cls: DTYPE, zarr_format: ZarrFormat) -> None (FixedLengthBytes(length=3), np.void(b"\x00\x00\x00")), (FixedLengthUnicode(length=3), np.str_("")), ( - Structured(fields=(("a", Float64()), ("b", Int8()))), + Structured(fields=(("a", Float64()), ("b", Int8()))), # type: ignore[arg-type] np.array([0], dtype=[("a", np.float64), ("b", np.int8)])[0], ), (VariableLengthString(), ""), (DateTime64(unit="s"), np.datetime64("NaT")), ], ) -def test_default_value( - wrapper: type[ZDType[_BaseDType, _BaseScalar]], expected_default: Any -) -> None: +def test_default_value(wrapper: ZDType[Any, Any], expected_default: Any) -> None: """ Test that the default_value method is correctly set for each dtype wrapper. 
""" @@ -171,7 +180,7 @@ def test_default_value( ], ) def test_to_json_value_v2( - wrapper: type[ZDType[_BaseDType, _BaseScalar]], input_value: Any, expected_json: Any + wrapper: ZDType[_BaseDType, _BaseScalar], input_value: Any, expected_json: Any ) -> None: """ Test the to_json_value method for each dtype wrapper for zarr v2 @@ -204,7 +213,7 @@ def test_to_json_value_v2( ], ) def test_from_json_value( - wrapper: type[ZDType[_BaseDType, _BaseScalar]], json_value: Any, expected_value: Any + wrapper: ZDType[_BaseDType, _BaseScalar], json_value: Any, expected_value: Any ) -> None: """ Test the from_json_value method for each dtype wrapper. @@ -218,7 +227,7 @@ def test_register(data_type_registry_fixture: DataTypeRegistry) -> None: """ Test that registering a dtype in a data type registry works. """ - data_type_registry_fixture.register(Bool._zarr_v3_name, Bool) + data_type_registry_fixture.register(Bool._zarr_v3_name, Bool) # type: ignore[arg-type] assert data_type_registry_fixture.get(Bool._zarr_v3_name) == Bool assert isinstance(data_type_registry_fixture.match_dtype(np.dtype("bool")), Bool) @@ -227,13 +236,13 @@ def test_override(data_type_registry_fixture: DataTypeRegistry) -> None: """ Test that registering a new dtype with the same name works (overriding the previous one). 
""" - data_type_registry_fixture.register(Bool._zarr_v3_name, Bool) + data_type_registry_fixture.register(Bool._zarr_v3_name, Bool) # type: ignore[arg-type] class NewBool(Bool): def default_value(self) -> np.bool_: return np.True_ - data_type_registry_fixture.register(NewBool._zarr_v3_name, NewBool) + data_type_registry_fixture.register(NewBool._zarr_v3_name, NewBool) # type: ignore[arg-type] assert isinstance(data_type_registry_fixture.match_dtype(np.dtype("bool")), NewBool) @staticmethod @@ -268,13 +277,13 @@ def test_unregistered_dtype(data_type_registry_fixture: DataTypeRegistry) -> Non @staticmethod @pytest.mark.parametrize("wrapper_cls", get_args(DTYPE)) def test_registered_dtypes( - wrapper_cls: ZDType[_BaseDType, _BaseScalar], zarr_format: ZarrFormat + wrapper_cls: type[ZDType[_BaseDType, _BaseScalar]], zarr_format: ZarrFormat ) -> None: """ Test that the registered dtypes can be retrieved from the registry. """ if issubclass(wrapper_cls, Structured): - instance = wrapper_cls(fields=((("a", Bool()),))) + instance = wrapper_cls(fields=((("a", Bool()),))) # type: ignore[misc] else: instance = wrapper_cls() @@ -289,7 +298,7 @@ def test_registered_dtypes( @staticmethod @pytest.mark.parametrize("wrapper_cls", get_args(DTYPE)) def test_match_dtype_unique( - wrapper_cls: ZDType[_BaseDType, _BaseScalar], + wrapper_cls: type[ZDType[_BaseDType, _BaseScalar]], data_type_registry_fixture: DataTypeRegistry, zarr_format: ZarrFormat, ) -> None: @@ -303,7 +312,7 @@ def test_match_dtype_unique( data_type_registry_fixture.register(_cls._zarr_v3_name, _cls) if issubclass(wrapper_cls, Structured): - instance = wrapper_cls(fields=((("a", Bool()),))) + instance = wrapper_cls(fields=((("a", Bool()),))) # type: ignore[misc] else: instance = wrapper_cls() dtype_instance = instance.to_dtype() @@ -316,3 +325,28 @@ def test_match_dtype_unique( msg = f"No data type wrapper found that matches {instance_dict}" with pytest.raises(ValueError, match=re.escape(msg)): 
data_type_registry_fixture.match_json(instance_dict, zarr_format=zarr_format) + + +# this is copied from the registry tests -- we should deduplicate +here = os.path.abspath(os.path.dirname(__file__)) + + +@pytest.fixture +def set_path() -> Generator[None, None, None]: + sys.path.append(here) + zarr.registry._collect_entrypoints() + yield + sys.path.remove(here) + registries = zarr.registry._collect_entrypoints() + for registry in registries: + registry.lazy_load_list.clear() + config.reset() + + +@pytest.mark.usefixtures("set_path") +def test_entrypoint_codec(zarr_format: ZarrFormat) -> None: + from package_with_entrypoint import TestDataType + + instance = TestDataType() + dtype_json = instance.to_json(zarr_format=zarr_format) + assert get_data_type_from_json(dtype_json, zarr_format=zarr_format) == instance From 1b7273b719b8801cbda5300d857a618496587dce Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 21 Mar 2025 11:50:17 +0100 Subject: [PATCH 046/130] remove default parameters for parametric dtypes; add mixin classes for numpy dtypes; define zdtypelike --- src/zarr/api/synchronous.py | 5 ++- src/zarr/core/array.py | 36 +++++++++-------- src/zarr/core/dtype/__init__.py | 26 ++++++------ src/zarr/core/dtype/_numpy.py | 71 ++++++++++++++++++--------------- src/zarr/core/dtype/common.py | 7 ++-- 5 files changed, 76 insertions(+), 69 deletions(-) diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 9424ae1fde..79a5c47d71 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -37,6 +37,7 @@ ShapeLike, ZarrFormat, ) + from zarr.core.dtype import ZDTypeLike from zarr.storage import StoreLike __all__ = [ @@ -748,7 +749,7 @@ def create_array( *, name: str | None = None, shape: ShapeLike | None = None, - dtype: npt.DTypeLike | None = None, + dtype: ZDTypeLike | None = None, data: np.ndarray[Any, np.dtype[Any]] | None = None, chunks: ChunkCoords | Literal["auto"] = "auto", shards: ShardsLike | None = None, @@ -778,7 
+779,7 @@ def create_array( at the root of the store. shape : ChunkCoords, optional Shape of the array. Can be ``None`` if ``data`` is provided. - dtype : npt.DTypeLike, optional + dtype : ZDTypeLike, optional Data type of the array. Can be ``None`` if ``data`` is provided. data : np.ndarray, optional Array-like data to use for initializing the array. If this parameter is provided, the diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 91d6954e6c..6c34c0d351 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -23,7 +23,6 @@ import numcodecs import numcodecs.abc import numpy as np -import numpy.typing as npt from typing_extensions import deprecated from zarr._compat import _deprecate_positional_args @@ -67,6 +66,7 @@ from zarr.core.config import config as zarr_config from zarr.core.dtype import ( ZDType, + ZDTypeLike, parse_data_type, ) from zarr.core.indexing import ( @@ -122,6 +122,8 @@ from collections.abc import Iterator, Sequence from typing import Self + import numpy.typing as npt + from zarr.abc.codec import CodecPipeline from zarr.codecs.sharding import ShardingCodecIndexLocation from zarr.core.dtype.wrapper import _BaseDType, _BaseScalar @@ -295,7 +297,7 @@ async def create( *, # v2 and v3 shape: ShapeLike, - dtype: npt.DTypeLike, + dtype: ZDTypeLike, zarr_format: Literal[2], fill_value: Any | None = None, attributes: dict[str, JSON] | None = None, @@ -319,7 +321,7 @@ async def create( *, # v2 and v3 shape: ShapeLike, - dtype: npt.DTypeLike, + dtype: ZDTypeLike, zarr_format: Literal[3], fill_value: Any | None = None, attributes: dict[str, JSON] | None = None, @@ -347,7 +349,7 @@ async def create( *, # v2 and v3 shape: ShapeLike, - dtype: npt.DTypeLike, + dtype: ZDTypeLike, zarr_format: Literal[3] = 3, fill_value: Any | None = None, attributes: dict[str, JSON] | None = None, @@ -375,7 +377,7 @@ async def create( *, # v2 and v3 shape: ShapeLike, - dtype: npt.DTypeLike, + dtype: ZDTypeLike, zarr_format: ZarrFormat, fill_value: 
Any | None = None, attributes: dict[str, JSON] | None = None, @@ -410,7 +412,7 @@ async def create( *, # v2 and v3 shape: ShapeLike, - dtype: npt.DTypeLike, + dtype: ZDTypeLike, zarr_format: ZarrFormat = 3, fill_value: Any | None = None, attributes: dict[str, JSON] | None = None, @@ -446,7 +448,7 @@ async def create( The store where the array will be created. shape : ShapeLike The shape of the array. - dtype : npt.DTypeLike + dtype : ZDTypeLike The data type of the array. zarr_format : ZarrFormat, optional The Zarr format version (default is 3). @@ -551,7 +553,7 @@ async def _create( *, # v2 and v3 shape: ShapeLike, - dtype: npt.DTypeLike | ZDType[_BaseDType, _BaseScalar], + dtype: ZDTypeLike | ZDType[_BaseDType, _BaseScalar], zarr_format: ZarrFormat = 3, fill_value: Any | None = None, attributes: dict[str, JSON] | None = None, @@ -1746,7 +1748,7 @@ def create( *, # v2 and v3 shape: ChunkCoords, - dtype: npt.DTypeLike, + dtype: ZDTypeLike, zarr_format: ZarrFormat = 3, fill_value: Any | None = None, attributes: dict[str, JSON] | None = None, @@ -1781,7 +1783,7 @@ def create( The array store that has already been initialized. shape : ChunkCoords The shape of the array. - dtype : npt.DTypeLike + dtype : ZDTypeLike The data type of the array. chunk_shape : ChunkCoords, optional The shape of the Array's chunks. @@ -1875,7 +1877,7 @@ def _create( *, # v2 and v3 shape: ChunkCoords, - dtype: npt.DTypeLike, + dtype: ZDTypeLike, zarr_format: ZarrFormat = 3, fill_value: Any | None = None, attributes: dict[str, JSON] | None = None, @@ -3817,7 +3819,7 @@ async def init_array( *, store_path: StorePath, shape: ShapeLike, - dtype: npt.DTypeLike, + dtype: ZDTypeLike, chunks: ChunkCoords | Literal["auto"] = "auto", shards: ShardsLike | None = None, filters: FiltersLike = "auto", @@ -3840,7 +3842,7 @@ async def init_array( StorePath instance. The path attribute is the name of the array to initialize. shape : ChunkCoords Shape of the array. 
- dtype : npt.DTypeLike + dtype : ZDTypeLike Data type of the array. chunks : ChunkCoords, optional Chunk shape of the array. @@ -4028,7 +4030,7 @@ async def create_array( *, name: str | None = None, shape: ShapeLike | None = None, - dtype: npt.DTypeLike | None = None, + dtype: ZDTypeLike | None = None, data: np.ndarray[Any, np.dtype[Any]] | None = None, chunks: ChunkCoords | Literal["auto"] = "auto", shards: ShardsLike | None = None, @@ -4057,7 +4059,7 @@ async def create_array( at the root of the store. shape : ChunkCoords, optional Shape of the array. Can be ``None`` if ``data`` is provided. - dtype : npt.DTypeLike | None + dtype : ZDTypeLike | None Data type of the array. Can be ``None`` if ``data`` is provided. data : Array-like data to use for initializing the array. If this parameter is provided, the ``shape`` and ``dtype`` parameters must be identical to ``data.shape`` and ``data.dtype``, @@ -4401,8 +4403,8 @@ def _parse_data_params( *, data: np.ndarray[Any, np.dtype[Any]] | None, shape: ShapeLike | None, - dtype: npt.DTypeLike | None, -) -> tuple[np.ndarray[Any, np.dtype[Any]] | None, ShapeLike, npt.DTypeLike]: + dtype: ZDTypeLike | None, +) -> tuple[np.ndarray[Any, np.dtype[Any]] | None, ShapeLike, ZDTypeLike]: """ Ensure an array-like ``data`` parameter is consistent with the ``dtype`` and ``shape`` parameters. 
diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py index fc494030f1..021b6b48e2 100644 --- a/src/zarr/core/dtype/__init__.py +++ b/src/zarr/core/dtype/__init__.py @@ -1,19 +1,16 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, get_args - -import numpy as np - -from zarr.core.dtype._numpy import _NUMPY_SUPPORTS_VLEN_STRING -from zarr.core.dtype.wrapper import _BaseDType, _BaseScalar +from typing import TYPE_CHECKING, Any, TypeAlias, get_args if TYPE_CHECKING: - import numpy.typing as npt - - from zarr.core.common import JSON, ZarrFormat + from zarr.core.common import ZarrFormat +import numpy as np +import numpy.typing as npt +from zarr.core.common import JSON from zarr.core.dtype._numpy import ( + _NUMPY_SUPPORTS_VLEN_STRING, Bool, Complex64, Complex128, @@ -36,7 +33,7 @@ VariableLengthString, ) from zarr.core.dtype.registry import DataTypeRegistry -from zarr.core.dtype.wrapper import ZDType +from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar __all__ = [ "Complex64", @@ -80,6 +77,8 @@ | DateTime64 ) +ZDTypeLike: TypeAlias = npt.DTypeLike | ZDType[Any, Any] | dict[str, JSON] + for dtype in get_args(DTYPE): data_type_registry.register(dtype._zarr_v3_name, dtype) @@ -112,9 +111,10 @@ def get_data_type_from_json( return data_type_registry.match_json(dtype, zarr_format=zarr_format) -def parse_data_type( - dtype: npt.DTypeLike | ZDType[Any, Any] | dict[str, JSON], zarr_format: ZarrFormat -) -> ZDType[Any, Any]: +def parse_data_type(dtype: ZDTypeLike, zarr_format: ZarrFormat) -> ZDType[Any, Any]: + """ + Interpret the input as a ZDType instance. 
+ """ if isinstance(dtype, ZDType): return dtype elif isinstance(dtype, dict): diff --git a/src/zarr/core/dtype/_numpy.py b/src/zarr/core/dtype/_numpy.py index 241626e6ac..4094403c3f 100644 --- a/src/zarr/core/dtype/_numpy.py +++ b/src/zarr/core/dtype/_numpy.py @@ -44,7 +44,25 @@ EndiannessNumpy = Literal[">", "<", "|", "="] -@dataclass(frozen=True, kw_only=True) +@dataclass(frozen=True) +class HasEndianness: + """ + This is a mix-in class for data types with an endianness attribute + """ + + endianness: Endianness | None = "little" + + +@dataclass(frozen=True) +class HasLength: + """ + This is a mix-in class for data types with a length attribute + """ + + length: int + + +@dataclass(frozen=True, kw_only=True, slots=True) class Bool(ZDType[np.dtypes.BoolDType, np.bool_]): """ Wrapper for numpy boolean dtype. @@ -311,11 +329,10 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.uint8: @dataclass(frozen=True, kw_only=True) -class Int16(ZDType[np.dtypes.Int16DType, np.int16]): +class Int16(ZDType[np.dtypes.Int16DType, np.int16], HasEndianness): dtype_cls = np.dtypes.Int16DType _zarr_v3_name = "int16" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i2", " Self: @@ -367,11 +384,10 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.int16: @dataclass(frozen=True, kw_only=True) -class UInt16(ZDType[np.dtypes.UInt16DType, np.uint16]): +class UInt16(ZDType[np.dtypes.UInt16DType, np.uint16], HasEndianness): dtype_cls = np.dtypes.UInt16DType _zarr_v3_name = "uint16" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u2", " Self: @@ -423,16 +439,18 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.uint16: @dataclass(frozen=True, kw_only=True) -class Int32(ZDType[np.dtypes.Int32DType, np.int32]): +class Int32(ZDType[np.dtypes.Int32DType, np.int32], HasEndianness): dtype_cls = np.dtypes.Int32DType _zarr_v3_name = "int32" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i4", " Self: # We override the base 
implementation to address a windows-specific, pre-numpy 2 issue where # ``np.dtype('i')`` is an instance of ``np.dtypes.IntDType`` that acts like `int32` instead of ``np.dtype('int32')`` + # In this case, ``type(np.dtype('i')) == np.dtypes.Int32DType`` will evaluate to ``True``, + # despite the two classes being different. Thus we will create an instance of `cls` with the + # latter dtype, after pulling in the byte order of the input if dtype == np.dtypes.Int32DType(): return cls._from_dtype_unsafe(np.dtypes.Int32DType().newbyteorder(dtype.byteorder)) else: @@ -488,11 +506,10 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.int32: @dataclass(frozen=True, kw_only=True) -class UInt32(ZDType[np.dtypes.UInt32DType, np.uint32]): +class UInt32(ZDType[np.dtypes.UInt32DType, np.uint32], HasEndianness): dtype_cls = np.dtypes.UInt32DType _zarr_v3_name = "uint32" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u4", " Self: @@ -544,11 +561,10 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.uint32: @dataclass(frozen=True, kw_only=True) -class Int64(ZDType[np.dtypes.Int64DType, np.int64]): +class Int64(ZDType[np.dtypes.Int64DType, np.int64], HasEndianness): dtype_cls = np.dtypes.Int64DType _zarr_v3_name = "int64" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i8", " Self: @@ -600,11 +616,10 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.int64: @dataclass(frozen=True, kw_only=True) -class UInt64(ZDType[np.dtypes.UInt64DType, np.uint64]): +class UInt64(ZDType[np.dtypes.UInt64DType, np.uint64], HasEndianness): dtype_cls = np.dtypes.UInt64DType _zarr_v3_name = "uint64" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u8", " Self: @@ -656,11 +671,10 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.uint64: @dataclass(frozen=True, kw_only=True) -class Float16(ZDType[np.dtypes.Float16DType, np.float16]): +class Float16(ZDType[np.dtypes.Float16DType, np.float16], HasEndianness): 
dtype_cls = np.dtypes.Float16DType _zarr_v3_name = "float16" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">f2", " Self: @@ -712,11 +726,10 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.float16: @dataclass(frozen=True, kw_only=True) -class Float32(ZDType[np.dtypes.Float32DType, np.float32]): +class Float32(ZDType[np.dtypes.Float32DType, np.float32], HasEndianness): dtype_cls = np.dtypes.Float32DType _zarr_v3_name = "float32" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">f4", " Self: @@ -768,11 +781,10 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.float32: @dataclass(frozen=True, kw_only=True) -class Float64(ZDType[np.dtypes.Float64DType, np.float64]): +class Float64(ZDType[np.dtypes.Float64DType, np.float64], HasEndianness): dtype_cls = np.dtypes.Float64DType _zarr_v3_name = "float64" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">f8", " Self: @@ -824,11 +836,10 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.float64: @dataclass(frozen=True, kw_only=True) -class Complex64(ZDType[np.dtypes.Complex64DType, np.complex64]): +class Complex64(ZDType[np.dtypes.Complex64DType, np.complex64], HasEndianness): dtype_cls = np.dtypes.Complex64DType _zarr_v3_name = "complex64" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">c8", " Self: @@ -882,11 +893,10 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.complex6 @dataclass(frozen=True, kw_only=True) -class Complex128(ZDType[np.dtypes.Complex128DType, np.complex128]): +class Complex128(ZDType[np.dtypes.Complex128DType, np.complex128], HasEndianness): dtype_cls = np.dtypes.Complex128DType _zarr_v3_name = "complex128" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">c16", " Self: @@ -940,11 +950,10 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.complex1 @dataclass(frozen=True, kw_only=True) -class FixedLengthAscii(ZDType[np.dtypes.BytesDType[int], np.bytes_]): +class 
FixedLengthAscii(ZDType[np.dtypes.BytesDType[int], np.bytes_], HasLength): dtype_cls = np.dtypes.BytesDType _zarr_v3_name = "numpy.fixed_length_ascii" item_size_bits: ClassVar[int] = 8 - length: int = 1 @classmethod def _from_dtype_unsafe(cls, dtype: np.dtypes.BytesDType[int]) -> Self: @@ -1004,14 +1013,13 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: @dataclass(frozen=True, kw_only=True) -class FixedLengthBytes(ZDType[np.dtypes.VoidDType[int], np.void]): +class FixedLengthBytes(ZDType[np.dtypes.VoidDType[int], np.void], HasLength): # np.dtypes.VoidDType is specified in an odd way in numpy # it cannot be used to create instances of the dtype # so we have to tell mypy to ignore this here dtype_cls = np.dtypes.VoidDType # type: ignore[assignment] _zarr_v3_name = "numpy.void" item_size_bits: ClassVar[int] = 8 - length: int = 1 @classmethod def _from_dtype_unsafe(cls, dtype: np.dtypes.VoidDType[int]) -> Self: @@ -1085,12 +1093,10 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: @dataclass(frozen=True, kw_only=True) -class FixedLengthUnicode(ZDType[np.dtypes.StrDType[int], np.str_]): +class FixedLengthUnicode(ZDType[np.dtypes.StrDType[int], np.str_], HasEndianness, HasLength): dtype_cls = np.dtypes.StrDType _zarr_v3_name = "numpy.fixed_length_ucs4" item_size_bits: ClassVar[int] = 32 # UCS4 is 32 bits per code point - endianness: Endianness | None = "little" - length: int = 1 @classmethod def _from_dtype_unsafe(cls, dtype: np.dtypes.StrDType[int]) -> Self: @@ -1269,11 +1275,10 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: @dataclass(frozen=True, kw_only=True) -class DateTime64(ZDType[np.dtypes.DateTime64DType, np.datetime64]): +class DateTime64(ZDType[np.dtypes.DateTime64DType, np.datetime64], HasEndianness): dtype_cls = np.dtypes.DateTime64DType # type: ignore[assignment] _zarr_v3_name = "numpy.datetime64" - unit: DateUnit | TimeUnit = "s" - endianness: Endianness | None 
= "little" + unit: DateUnit | TimeUnit @classmethod def _from_dtype_unsafe(cls, dtype: np.dtypes.DateTime64DType) -> Self: diff --git a/src/zarr/core/dtype/common.py b/src/zarr/core/dtype/common.py index 78dc6bdacd..106da80a61 100644 --- a/src/zarr/core/dtype/common.py +++ b/src/zarr/core/dtype/common.py @@ -325,7 +325,7 @@ def bytes_to_json(data: bytes, zarr_format: ZarrFormat) -> str: Parameters ---------- data : bytes - The structured scalar value to convert. + The bytes to store. zarr_format : ZarrFormat The zarr format version. @@ -334,9 +334,8 @@ def bytes_to_json(data: bytes, zarr_format: ZarrFormat) -> str: str The bytes encoded as ascii using the base64 alphabet. """ - if zarr_format == 2: - return base64.b64encode(data).decode("ascii") - raise NotImplementedError(f"Invalid zarr format: {zarr_format}. Expected 2.") + # TODO: decide if we are going to make this implementation zarr format-specific + return base64.b64encode(data).decode("ascii") def bytes_from_json(data: str, zarr_format: ZarrFormat) -> bytes: From 83f508ceb611598dfbaeadd4e4d66fd4b015601c Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Mon, 24 Mar 2025 11:39:12 +0100 Subject: [PATCH 047/130] Update docs/user-guide/data_types.rst Co-authored-by: Ilan Gold --- docs/user-guide/data_types.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index b964439706..36a9ea40f7 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -40,7 +40,7 @@ Thus the JSON identifier for a Numpy-compatible data type is just the Numpy ``st The ``<`` character in the data type metadata encodes the `endianness `_, or "byte order", of the data type. Following Numpy's example, in Zarr version 2 each data type has an endianness where applicable. However, Zarr version 3 data types do not store endianness information. 
-In addition to defining a representation of the data type itself (which in the example above was just a simple string ``" Date: Mon, 24 Mar 2025 14:45:27 +0100 Subject: [PATCH 048/130] refactor: use inheritance to remove boilerplate in dtype definitions --- src/zarr/core/dtype/_numpy.py | 900 ++++++++++++++------------------- src/zarr/core/dtype/common.py | 33 +- src/zarr/core/dtype/wrapper.py | 74 ++- src/zarr/core/metadata/v2.py | 6 +- tests/conftest.py | 18 +- tests/test_array.py | 37 +- tests/test_dtype.py | 105 ++-- tests/test_metadata/test_v3.py | 19 +- 8 files changed, 575 insertions(+), 617 deletions(-) diff --git a/src/zarr/core/dtype/_numpy.py b/src/zarr/core/dtype/_numpy.py index 4094403c3f..38597f8fee 100644 --- a/src/zarr/core/dtype/_numpy.py +++ b/src/zarr/core/dtype/_numpy.py @@ -11,7 +11,12 @@ ClassVar, Literal, Self, + SupportsComplex, + SupportsFloat, + SupportsIndex, + SupportsInt, TypeGuard, + TypeVar, cast, get_args, ) @@ -21,7 +26,6 @@ from zarr.core.dtype.common import ( DataTypeValidationError, Endianness, - JSONFloat, bytes_from_json, bytes_to_json, check_json_bool, @@ -29,8 +33,8 @@ check_json_float, check_json_int, check_json_str, - complex_from_json, - complex_to_json, + complex_float_from_json, + complex_float_to_json, datetime_from_json, datetime_to_json, float_from_json, @@ -42,6 +46,9 @@ from zarr.core.common import JSON, ZarrFormat EndiannessNumpy = Literal[">", "<", "|", "="] +IntLike = SupportsInt | SupportsIndex | bytes | str +FloatLike = SupportsIndex | SupportsFloat | bytes | str +ComplexLike = SupportsFloat | SupportsIndex | SupportsComplex | bytes | str | None @dataclass(frozen=True) @@ -80,7 +87,7 @@ class Bool(ZDType[np.dtypes.BoolDType, np.bool_]): dtype_cls = np.dtypes.BoolDType @classmethod - def _from_dtype_unsafe(cls, dtype: np.dtypes.BoolDType) -> Self: + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: return cls() def to_dtype(self: Self) -> np.dtypes.BoolDType: @@ -119,9 +126,9 @@ def default_value(self) -> 
np.bool_: """ return np.False_ - def to_json_value(self, data: np.bool_, zarr_format: ZarrFormat) -> bool: + def to_json_value(self, data: object, zarr_format: ZarrFormat) -> bool: """ - Convert a boolean value to JSON-serializable format. + Convert a scalar to a python bool. Parameters ---------- @@ -154,188 +161,175 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bool_: The numpy boolean scalar. """ if check_json_bool(data): - return np.bool_(data) + return self._cast_value_unsafe(data) raise TypeError(f"Invalid type: {data}. Expected a boolean.") + def check_value(self, data: object) -> bool: + # Anything can become a bool + return True -@dataclass(frozen=True, kw_only=True) -class Int8(ZDType[np.dtypes.Int8DType, np.int8]): - dtype_cls = np.dtypes.Int8DType - _zarr_v3_name = "int8" - _zarr_v2_names: ClassVar[tuple[str, ...]] = ("|i1",) + def cast_value(self, value: object) -> np.bool_: + return self._cast_value_unsafe(value) - @classmethod - def _from_dtype_unsafe(cls, dtype: np.dtypes.Int8DType) -> Self: - return cls() + def _cast_value_unsafe(self, value: object) -> np.bool_: + return np.bool_(value) - def to_dtype(self: Self) -> np.dtypes.Int8DType: - return self.dtype_cls() - @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[Literal["int8", "|i1"]]: +_NumpyIntDType = ( + np.dtypes.Int8DType + | np.dtypes.Int16DType + | np.dtypes.Int32DType + | np.dtypes.Int64DType + | np.dtypes.UInt8DType + | np.dtypes.UInt16DType + | np.dtypes.UInt32DType + | np.dtypes.UInt64DType +) +_NumpyIntScalar = ( + np.int8 | np.int16 | np.int32 | np.int64 | np.uint8 | np.uint16 | np.uint32 | np.uint64 +) +TIntDType_co = TypeVar("TIntDType_co", bound=_NumpyIntDType, covariant=True) +TIntScalar_co = TypeVar("TIntScalar_co", bound=_NumpyIntScalar, covariant=True) + + +@dataclass(frozen=True) +class BaseInt(ZDType[TIntDType_co, TIntScalar_co]): + # This attribute holds the possible zarr v2 JSON names for the data type + 
_zarr_v2_names: ClassVar[tuple[str, ...]] + + def to_json(self, zarr_format: ZarrFormat) -> str: """ - Check that the input is a valid JSON representation of a 8-bit integer. + Convert the wrapped data type to a JSON-serializable form. + + Parameters + ---------- + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + str + The JSON-serializable representation of the wrapped data type """ if zarr_format == 2: - return data in cls._zarr_v2_names + return self.to_dtype().str elif zarr_format == 3: - return data == cls._zarr_v3_name + return self._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def to_json(self, zarr_format: ZarrFormat) -> str: + @classmethod + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: + """ + Check that the input is a valid JSON representation of this data type. + """ if zarr_format == 2: - return self.to_dtype().str + return data in cls._zarr_v2_names elif zarr_format == 3: - return self._zarr_v3_name + return data == cls._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - return cls() + def check_value(self, value: object) -> TypeGuard[IntLike]: + return isinstance(value, IntLike) - def default_value(self) -> np.int8: + def _cast_value_unsafe(self, value: object) -> TIntScalar_co: + if self.check_value(value): + return self.to_dtype().type(value) # type: ignore[return-value] + raise TypeError(f"Invalid type: {value}. Expected a value castable to an integer.") + + def default_value(self) -> TIntScalar_co: """ - Get the default value. + Get the default value, which is 0 cast to this dtype Returns ------- - np.int8 + Int scalar The default value. 
""" - return np.int8(0) + return self._cast_value_unsafe(0) - def to_json_value(self, data: np.int8, zarr_format: ZarrFormat) -> int: + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TIntScalar_co: """ - Convert a numpy 8-bit int to JSON-serializable format. + Read a JSON-serializable value as a numpy int scalar. Parameters ---------- - data : np.int8 - The value to convert. + data : JSON + The JSON-serializable value. zarr_format : ZarrFormat The zarr format version. Returns ------- - int - The JSON-serializable form of the scalar. + TScalar_co + The numpy scalar. """ - return int(data) + if check_json_int(data): + return self._cast_value_unsafe(data) + raise TypeError(f"Invalid type: {data}. Expected an integer.") - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.int8: + def to_json_value(self, data: object, zarr_format: ZarrFormat) -> int: """ - Read a JSON-serializable value as a numpy int8 scalar. + Convert an object to JSON-serializable scalar. Parameters ---------- - data : JSON - The JSON-serializable value. + data : _BaseScalar + The value to convert. zarr_format : ZarrFormat The zarr format version. Returns ------- - np.bool_ - The numpy boolean scalar. + int + The JSON-serializable form of the scalar. """ - if check_json_int(data): - return np.int8(data) - raise TypeError(f"Invalid type: {data}. 
Expected an integer.") + return int(self.cast_value(data)) @dataclass(frozen=True, kw_only=True) -class UInt8(ZDType[np.dtypes.UInt8DType, np.uint8]): - dtype_cls = np.dtypes.UInt8DType - _zarr_v3_name = "uint8" - _zarr_v2_names: ClassVar[tuple[str, ...]] = ("|u1",) +class Int8(BaseInt[np.dtypes.Int8DType, np.int8]): + dtype_cls = np.dtypes.Int8DType + _zarr_v3_name = "int8" + _zarr_v2_names: ClassVar[tuple[str, ...]] = ("|i1",) @classmethod - def _from_dtype_unsafe(cls, dtype: np.dtypes.UInt8DType) -> Self: + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: return cls() - def to_dtype(self: Self) -> np.dtypes.UInt8DType: + def to_dtype(self: Self) -> np.dtypes.Int8DType: return self.dtype_cls() - @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[Literal["uint8", "|u1"]]: - """ - Check that the input is a valid JSON representation of an unsigned 8-bit integer. - """ - if zarr_format == 2: - return data in cls._zarr_v2_names - elif zarr_format == 3: - return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - def to_json(self, zarr_format: ZarrFormat) -> str: - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls() - def default_value(self) -> np.uint8: - """ - Get the default value for this data type. - - Returns - ------- - np.uint8 - The default value. - """ - return np.uint8(0) - - def to_json_value(self, data: np.uint8, zarr_format: ZarrFormat) -> int: - """ - Convert a numpy unsigned 8-bit integer to JSON-serializable format. - - Parameters - ---------- - data : np.uint8 - The value to convert. - zarr_format : ZarrFormat - The zarr format version. - Returns - ------- - int - The JSON-serializable form of the scalar. 
- """ - return int(data) +@dataclass(frozen=True, kw_only=True) +class UInt8(BaseInt[np.dtypes.UInt8DType, np.uint8]): + dtype_cls = np.dtypes.UInt8DType + _zarr_v3_name = "uint8" + _zarr_v2_names: ClassVar[tuple[str, ...]] = ("|u1",) - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.uint8: - """ - Read a JSON-serializable value as a numpy boolean scalar. + @classmethod + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + return cls() - Parameters - ---------- - data : JSON - The JSON-serializable value. - zarr_format : ZarrFormat - The zarr format version. + def to_dtype(self: Self) -> np.dtypes.UInt8DType: + return self.dtype_cls() - Returns - ------- - np.bool_ - The numpy boolean scalar. - """ - if check_json_int(data): - return np.uint8(data) - raise TypeError(f"Invalid type: {data}. Expected an integer.") + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + return cls() @dataclass(frozen=True, kw_only=True) -class Int16(ZDType[np.dtypes.Int16DType, np.int16], HasEndianness): +class Int16(BaseInt[np.dtypes.Int16DType, np.int16], HasEndianness): dtype_cls = np.dtypes.Int16DType _zarr_v3_name = "int16" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i2", " Self: + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) @@ -343,54 +337,24 @@ def to_dtype(self) -> np.dtypes.Int16DType: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) - @classmethod - def check_json( - cls, data: JSON, zarr_format: ZarrFormat - ) -> TypeGuard[Literal["int16", ">i2", " str: - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: 
if zarr_format == 2: + # This ensures that we get the endianness correct without annoying string parsing return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def default_value(self) -> np.int16: - return self.to_dtype().type(0) - - def to_json_value(self, data: np.int16, zarr_format: ZarrFormat) -> int: - return int(data) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.int16: - if check_json_int(data): - return self.to_dtype().type(data) - raise TypeError(f"Invalid type: {data}. Expected an integer.") - @dataclass(frozen=True, kw_only=True) -class UInt16(ZDType[np.dtypes.UInt16DType, np.uint16], HasEndianness): +class UInt16(BaseInt[np.dtypes.UInt16DType, np.uint16], HasEndianness): dtype_cls = np.dtypes.UInt16DType _zarr_v3_name = "uint16" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u2", " Self: + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) @@ -398,26 +362,6 @@ def to_dtype(self) -> np.dtypes.UInt16DType: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) - @classmethod - def check_json( - cls, data: JSON, zarr_format: ZarrFormat - ) -> TypeGuard[Literal["uint16", ">u2", " str: - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: if zarr_format == 2: @@ -426,20 +370,9 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def default_value(self) -> np.uint16: - return 
self.to_dtype().type(0) - - def to_json_value(self, data: np.uint16, zarr_format: ZarrFormat) -> int: - return int(data) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.uint16: - if check_json_int(data): - return self.to_dtype().type(data) - raise TypeError(f"Invalid type: {data}. Expected an integer.") - @dataclass(frozen=True, kw_only=True) -class Int32(ZDType[np.dtypes.Int32DType, np.int32], HasEndianness): +class Int32(BaseInt[np.dtypes.Int32DType, np.int32], HasEndianness): dtype_cls = np.dtypes.Int32DType _zarr_v3_name = "int32" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i4", " Self: return super().from_dtype(dtype) @classmethod - def _from_dtype_unsafe(cls, dtype: np.dtypes.Int32DType) -> Self: + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) @@ -465,26 +398,6 @@ def to_dtype(self) -> np.dtypes.Int32DType: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) - @classmethod - def check_json( - cls, data: JSON, zarr_format: ZarrFormat - ) -> TypeGuard[Literal["int32", ">i4", " str: - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: if zarr_format == 2: @@ -493,26 +406,15 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def default_value(self) -> np.int32: - return self.to_dtype().type(0) - - def to_json_value(self, data: np.int32, zarr_format: ZarrFormat) -> int: - return int(data) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.int32: - if check_json_int(data): - return 
self.to_dtype().type(data) - raise TypeError(f"Invalid type: {data}. Expected an integer.") - @dataclass(frozen=True, kw_only=True) -class UInt32(ZDType[np.dtypes.UInt32DType, np.uint32], HasEndianness): +class UInt32(BaseInt[np.dtypes.UInt32DType, np.uint32], HasEndianness): dtype_cls = np.dtypes.UInt32DType _zarr_v3_name = "uint32" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u4", " Self: + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) @@ -520,26 +422,6 @@ def to_dtype(self) -> np.dtypes.UInt32DType: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) - @classmethod - def check_json( - cls, data: JSON, zarr_format: ZarrFormat - ) -> TypeGuard[Literal["uint32", ">u4", " str: - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: if zarr_format == 2: @@ -548,26 +430,15 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def default_value(self) -> np.uint32: - return self.to_dtype().type(0) - - def to_json_value(self, data: np.uint32, zarr_format: ZarrFormat) -> int: - return int(data) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.uint32: - if check_json_int(data): - return self.to_dtype().type(data) - raise TypeError(f"Invalid type: {data}. 
Expected an integer.") - @dataclass(frozen=True, kw_only=True) -class Int64(ZDType[np.dtypes.Int64DType, np.int64], HasEndianness): +class Int64(BaseInt[np.dtypes.Int64DType, np.int64], HasEndianness): dtype_cls = np.dtypes.Int64DType _zarr_v3_name = "int64" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i8", " Self: + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) @@ -575,26 +446,6 @@ def to_dtype(self) -> np.dtypes.Int64DType: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) - @classmethod - def check_json( - cls, data: JSON, zarr_format: ZarrFormat - ) -> TypeGuard[Literal["int64", ">i8", " str: - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: if zarr_format == 2: @@ -603,26 +454,15 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def default_value(self) -> np.int64: - return self.to_dtype().type(0) - - def to_json_value(self, data: np.int64, zarr_format: ZarrFormat) -> int: - return int(data) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.int64: - if check_json_int(data): - return self.to_dtype().type(data) - raise TypeError(f"Invalid type: {data}. 
Expected an integer.") - @dataclass(frozen=True, kw_only=True) -class UInt64(ZDType[np.dtypes.UInt64DType, np.uint64], HasEndianness): +class UInt64(BaseInt[np.dtypes.UInt64DType, np.uint64], HasEndianness): dtype_cls = np.dtypes.UInt64DType _zarr_v3_name = "uint64" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u8", " Self: + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) @@ -630,26 +470,6 @@ def to_dtype(self) -> np.dtypes.UInt64DType: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) - @classmethod - def check_json( - cls, data: JSON, zarr_format: ZarrFormat - ) -> TypeGuard[Literal["uint64", ">u8", " str: - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: if zarr_format == 2: @@ -658,47 +478,45 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def default_value(self) -> np.uint64: - return self.to_dtype().type(0) - - def to_json_value(self, data: np.uint64, zarr_format: ZarrFormat) -> int: - return int(data) - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.uint64: - if check_json_int(data): - return self.to_dtype().type(data) - raise TypeError(f"Invalid type: {data}. 
Expected an integer.") +TFloatDType_co = TypeVar( + "TFloatDType_co", + bound=np.dtypes.Float16DType | np.dtypes.Float32DType | np.dtypes.Float64DType, + covariant=True, +) +TFloatScalar_co = TypeVar( + "TFloatScalar_co", bound=np.float16 | np.float32 | np.float64, covariant=True +) -@dataclass(frozen=True, kw_only=True) -class Float16(ZDType[np.dtypes.Float16DType, np.float16], HasEndianness): - dtype_cls = np.dtypes.Float16DType - _zarr_v3_name = "float16" - _zarr_v2_names: ClassVar[tuple[str, ...]] = (">f2", " Self: + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) - def to_dtype(self) -> np.dtypes.Float16DType: + def to_dtype(self) -> TFloatDType_co: byte_order = endianness_to_numpy_str(self.endianness) - return self.dtype_cls().newbyteorder(byte_order) + return self.dtype_cls().newbyteorder(byte_order) # type: ignore[return-value] - @classmethod - def check_json( - cls, data: JSON, zarr_format: ZarrFormat - ) -> TypeGuard[Literal["float", ">f2", " str: """ - if zarr_format == 2: - return data in cls._zarr_v2_names - elif zarr_format == 3: - return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + Convert the wrapped data type to a JSON-serializable form. - def to_json(self, zarr_format: ZarrFormat) -> str: + Parameters + ---------- + zarr_format : ZarrFormat + The zarr format version. 
+ + Returns + ------- + str + The JSON-serializable representation of the wrapped data type + """ if zarr_format == 2: return self.to_dtype().str elif zarr_format == 3: @@ -713,39 +531,10 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def default_value(self) -> np.float16: - return self.to_dtype().type(0) - - def to_json_value(self, data: np.float16, zarr_format: ZarrFormat) -> JSONFloat: - return float_to_json(data, zarr_format) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.float16: - if check_json_float(data, zarr_format=zarr_format): - return self.to_dtype().type(float_from_json(data, zarr_format)) - raise TypeError(f"Invalid type: {data}. Expected a float.") - - -@dataclass(frozen=True, kw_only=True) -class Float32(ZDType[np.dtypes.Float32DType, np.float32], HasEndianness): - dtype_cls = np.dtypes.Float32DType - _zarr_v3_name = "float32" - _zarr_v2_names: ClassVar[tuple[str, ...]] = (">f4", " Self: - byte_order = cast("EndiannessNumpy", dtype.byteorder) - return cls(endianness=endianness_from_numpy_str(byte_order)) - - def to_dtype(self) -> np.dtypes.Float32DType: - byte_order = endianness_to_numpy_str(self.endianness) - return self.dtype_cls().newbyteorder(byte_order) - @classmethod - def check_json( - cls, data: JSON, zarr_format: ZarrFormat - ) -> TypeGuard[Literal["float32", ">f4", " TypeGuard[JSON]: """ - Check that the input is a valid JSON representation of a signed 16-bit integer. + Check that the input is a valid JSON representation of this data type. 
""" if zarr_format == 2: return data in cls._zarr_v2_names @@ -753,62 +542,122 @@ def check_json( return data == cls._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def to_json(self, zarr_format: ZarrFormat) -> str: - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + def check_value(self, value: object) -> TypeGuard[FloatLike]: + return isinstance(value, FloatLike) - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + def _cast_value_unsafe(self, value: object) -> TFloatScalar_co: + if self.check_value(value): + return self.to_dtype().type(value) # type: ignore[return-value] + raise TypeError(f"Invalid type: {value}. Expected a value castable to a float.") + + def default_value(self) -> TFloatScalar_co: + """ + Get the default value, which is 0 cast to this dtype + + Returns + ------- + Int scalar + The default value. + """ + return self._cast_value_unsafe(0) - def default_value(self) -> np.float32: - return self.to_dtype().type(0) + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TFloatScalar_co: + """ + Read a JSON-serializable value as a numpy float. - def to_json_value(self, data: np.float32, zarr_format: ZarrFormat) -> JSONFloat: - return float_to_json(data, zarr_format) + Parameters + ---------- + data : JSON + The JSON-serializable value. + zarr_format : ZarrFormat + The zarr format version. - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.float32: + Returns + ------- + TScalar_co + The numpy float. 
+ """ if check_json_float(data, zarr_format=zarr_format): - return self.to_dtype().type(float_from_json(data, zarr_format)) - raise TypeError(f"Invalid type: {data}. Expected a float.") + return self._cast_value_unsafe(float_from_json(data, zarr_format=zarr_format)) + raise TypeError( + f"Invalid type: {data}. Expected a float or a special string encoding of a float." + ) + + def to_json_value(self, data: object, zarr_format: ZarrFormat) -> float | str: + """ + Convert an object to a JSON-serializable float. + + Parameters + ---------- + data : _BaseScalar + The value to convert. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + JSON + The JSON-serializable form of the float, which is potentially a number or a string. + See the zarr specifications for details on the JSON encoding for floats. + """ + return float_to_json(self._cast_value_unsafe(data), zarr_format=zarr_format) + + +@dataclass(frozen=True, kw_only=True) +class Float16(BaseFloat[np.dtypes.Float16DType, np.float16]): + dtype_cls = np.dtypes.Float16DType + _zarr_v3_name = "float16" + _zarr_v2_names: ClassVar[tuple[str, ...]] = (">f2", "f4", "f8", " Self: + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) - def to_dtype(self) -> np.dtypes.Float64DType: + def to_dtype(self) -> TComplexDType_co: byte_order = endianness_to_numpy_str(self.endianness) - return self.dtype_cls().newbyteorder(byte_order) + return self.dtype_cls().newbyteorder(byte_order) # type: ignore[return-value] - @classmethod - def check_json( - cls, data: JSON, zarr_format: ZarrFormat - ) -> TypeGuard[Literal["float64", ">f8", " str: """ - if zarr_format == 2: - return data in cls._zarr_v2_names - elif zarr_format == 3: - return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + Convert the wrapped data type to a 
JSON-serializable form. - def to_json(self, zarr_format: ZarrFormat) -> str: + Parameters + ---------- + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + str + The JSON-serializable representation of the wrapped data type + """ if zarr_format == 2: return self.to_dtype().str elif zarr_format == 3: @@ -823,39 +672,10 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def default_value(self) -> np.float64: - return self.to_dtype().type(0) - - def to_json_value(self, data: np.float64, zarr_format: ZarrFormat) -> JSONFloat: - return float_to_json(data, zarr_format) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.float64: - if check_json_float(data, zarr_format=zarr_format): - return self.to_dtype().type(float_from_json(data, zarr_format)) - raise TypeError(f"Invalid type: {data}. Expected a float.") - - -@dataclass(frozen=True, kw_only=True) -class Complex64(ZDType[np.dtypes.Complex64DType, np.complex64], HasEndianness): - dtype_cls = np.dtypes.Complex64DType - _zarr_v3_name = "complex64" - _zarr_v2_names: ClassVar[tuple[str, ...]] = (">c8", " Self: - byte_order = cast("EndiannessNumpy", dtype.byteorder) - return cls(endianness=endianness_from_numpy_str(byte_order)) - - def to_dtype(self) -> np.dtypes.Complex64DType: - byte_order = endianness_to_numpy_str(self.endianness) - return self.dtype_cls().newbyteorder(byte_order) - @classmethod - def check_json( - cls, data: JSON, zarr_format: ZarrFormat - ) -> TypeGuard[Literal["complex64", ">c8", " TypeGuard[JSON]: """ - Check that the input is a valid JSON representation of a signed 16-bit integer. + Check that the input is a valid JSON representation of this data type. 
""" if zarr_format == 2: return data in cls._zarr_v2_names @@ -863,90 +683,79 @@ def check_json( return data == cls._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def to_json(self, zarr_format: ZarrFormat) -> str: - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + def check_value(self, value: object) -> bool: + return isinstance(value, ComplexLike) - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + def _cast_value_unsafe(self, value: object) -> TComplexScalar_co: + if self.check_value(value): + return self.to_dtype().type(value) # type: ignore[arg-type, return-value] + raise TypeError(f"Invalid type: {value}. Expected a value castable to a complex scalar.") - def default_value(self) -> np.complex64: - return self.to_dtype().type(0) + def default_value(self) -> TComplexScalar_co: + """ + Get the default value, which is 0 cast to this dtype - def to_json_value( - self, data: np.complex64, zarr_format: ZarrFormat - ) -> tuple[JSONFloat, JSONFloat]: - return complex_to_json(data, zarr_format) + Returns + ------- + Int scalar + The default value. + """ + return self._cast_value_unsafe(0) - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.complex64: - if check_json_complex_float(data, zarr_format=zarr_format): - return complex_from_json(data, dtype=self.to_dtype(), zarr_format=zarr_format) - raise TypeError(f"Invalid type: {data}. 
Expected a complex float.") + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TComplexScalar_co: + """ + Read a JSON-serializable value as a numpy float. + Parameters + ---------- + data : JSON + The JSON-serializable value. + zarr_format : ZarrFormat + The zarr format version. -@dataclass(frozen=True, kw_only=True) -class Complex128(ZDType[np.dtypes.Complex128DType, np.complex128], HasEndianness): - dtype_cls = np.dtypes.Complex128DType - _zarr_v3_name = "complex128" - _zarr_v2_names: ClassVar[tuple[str, ...]] = (">c16", " Self: - byte_order = cast("EndiannessNumpy", dtype.byteorder) - return cls(endianness=endianness_from_numpy_str(byte_order)) + def to_json_value(self, data: object, zarr_format: ZarrFormat) -> JSON: + """ + Convert an object to a JSON-serializable float. - def to_dtype(self) -> np.dtypes.Complex128DType: - byte_order = endianness_to_numpy_str(self.endianness) - return self.dtype_cls().newbyteorder(byte_order) + Parameters + ---------- + data : _BaseScalar + The value to convert. + zarr_format : ZarrFormat + The zarr format version. 
- @classmethod - def check_json( - cls, data: JSON, zarr_format: ZarrFormat - ) -> TypeGuard[Literal["complex128", ">c16", " str: - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - def default_value(self) -> np.complex128: - return self.to_dtype().type(0) +@dataclass(frozen=True, kw_only=True) +class Complex64(BaseComplex[np.dtypes.Complex64DType, np.complex64]): + dtype_cls = np.dtypes.Complex64DType + _zarr_v3_name = "complex64" + _zarr_v2_names: ClassVar[tuple[str, ...]] = (">c8", " tuple[JSONFloat, JSONFloat]: - return complex_to_json(data, zarr_format) - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.complex128: - if check_json_complex_float(data, zarr_format=zarr_format): - return complex_from_json(data, dtype=self.to_dtype(), zarr_format=zarr_format) - raise TypeError(f"Invalid type: {data}. 
Expected a complex float.") +@dataclass(frozen=True, kw_only=True) +class Complex128(BaseComplex[np.dtypes.Complex128DType, np.complex128], HasEndianness): + dtype_cls = np.dtypes.Complex128DType + _zarr_v3_name = "complex128" + _zarr_v2_names: ClassVar[tuple[str, ...]] = (">c16", " Self: + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: return cls(length=dtype.itemsize // (cls.item_size_bits // 8)) def to_dtype(self) -> np.dtypes.BytesDType[int]: @@ -1003,14 +812,20 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: def default_value(self) -> np.bytes_: return np.bytes_(b"") - def to_json_value(self, data: np.bytes_, *, zarr_format: ZarrFormat) -> str: - return base64.standard_b64encode(data).decode("ascii") + def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: + return base64.standard_b64encode(data).decode("ascii") # type: ignore[arg-type] def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: if check_json_str(data): return self.to_dtype().type(base64.standard_b64decode(data.encode("ascii"))) raise TypeError(f"Invalid type: {data}. 
Expected a string.") + def check_value(self, data: object) -> bool: + return isinstance(data, np.bytes_ | str | bytes) + + def _cast_value_unsafe(self, value: object) -> np.bytes_: + return self.to_dtype().type(value) + @dataclass(frozen=True, kw_only=True) class FixedLengthBytes(ZDType[np.dtypes.VoidDType[int], np.void], HasLength): @@ -1022,7 +837,7 @@ class FixedLengthBytes(ZDType[np.dtypes.VoidDType[int], np.void], HasLength): item_size_bits: ClassVar[int] = 8 @classmethod - def _from_dtype_unsafe(cls, dtype: np.dtypes.VoidDType[int]) -> Self: + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: return cls(length=dtype.itemsize // (cls.item_size_bits // 8)) def to_dtype(self) -> np.dtypes.VoidDType[int]: @@ -1083,14 +898,20 @@ def check_dtype(cls: type[Self], dtype: _BaseDType) -> TypeGuard[np.dtypes.VoidD def default_value(self) -> np.void: return self.to_dtype().type(("\x00" * self.length).encode("ascii")) - def to_json_value(self, data: np.void, *, zarr_format: ZarrFormat) -> str: - return base64.standard_b64encode(data.tobytes()).decode("ascii") + def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: + return base64.standard_b64encode(self.cast_value(data).tobytes()).decode("ascii") def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: if check_json_str(data): return self.to_dtype().type(base64.standard_b64decode(data)) raise DataTypeValidationError(f"Invalid type: {data}. 
Expected a string.") + def check_value(self, data: object) -> bool: + return isinstance(data, np.bytes_ | str | bytes) + + def _cast_value_unsafe(self, value: object) -> np.void: + return self.to_dtype().type(value) # type: ignore[call-overload, no-any-return] + @dataclass(frozen=True, kw_only=True) class FixedLengthUnicode(ZDType[np.dtypes.StrDType[int], np.str_], HasEndianness, HasLength): @@ -1099,7 +920,7 @@ class FixedLengthUnicode(ZDType[np.dtypes.StrDType[int], np.str_], HasEndianness item_size_bits: ClassVar[int] = 32 # UCS4 is 32 bits per code point @classmethod - def _from_dtype_unsafe(cls, dtype: np.dtypes.StrDType[int]) -> Self: + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls( length=dtype.itemsize // (cls.item_size_bits // 8), @@ -1151,7 +972,7 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: def default_value(self) -> np.str_: return np.str_("") - def to_json_value(self, data: np.str_, *, zarr_format: ZarrFormat) -> str: + def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: return str(data) def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_: @@ -1159,6 +980,12 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_: raise TypeError(f"Invalid type: {data}. 
Expected a string.") return self.to_dtype().type(data) + def check_value(self, data: object) -> bool: + return isinstance(data, str | np.str_ | bytes) + + def _cast_value_unsafe(self, value: object) -> np.str_: + return self.to_dtype().type(value) + _NUMPY_SUPPORTS_VLEN_STRING = hasattr(np.dtypes, "StringDType") @@ -1171,7 +998,7 @@ class VariableLengthString(ZDType[np.dtypes.StringDType, str]): # type: ignore[ _zarr_v3_name = "numpy.variable_length_utf8" @classmethod - def _from_dtype_unsafe(cls, dtype: np.dtypes.StringDType) -> Self: + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: return cls() def to_dtype(self) -> np.dtypes.StringDType: @@ -1217,6 +1044,12 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: raise TypeError(f"Invalid type: {data}. Expected a string.") return data + def check_value(self, data: object) -> bool: + return isinstance(data, str) + + def _cast_value_unsafe(self, value: object) -> str: + return str(value) + else: @dataclass(frozen=True, kw_only=True) @@ -1225,7 +1058,7 @@ class VariableLengthString(ZDType[np.dtypes.ObjectDType, str]): # type: ignore[ _zarr_v3_name = "numpy.variable_length_utf8" @classmethod - def _from_dtype_unsafe(cls, dtype: np.dtypes.ObjectDType) -> Self: + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: return cls() def to_dtype(self) -> np.dtypes.ObjectDType: @@ -1258,8 +1091,8 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: def default_value(self) -> str: return "" - def to_json_value(self, data: str, *, zarr_format: ZarrFormat) -> str: - return data + def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: + return data # type: ignore[return-value] def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: """ @@ -1269,19 +1102,25 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: raise TypeError(f"Invalid type: {data}. 
Expected a string.") return data + def check_value(self, data: object) -> bool: + return isinstance(data, str) + + def _cast_value_unsafe(self, value: object) -> str: + return str(value) + DateUnit = Literal["Y", "M", "W", "D"] TimeUnit = Literal["h", "m", "s", "ms", "us", "μs", "ns", "ps", "fs", "as"] -@dataclass(frozen=True, kw_only=True) +@dataclass(frozen=True, kw_only=True, slots=True) class DateTime64(ZDType[np.dtypes.DateTime64DType, np.datetime64], HasEndianness): dtype_cls = np.dtypes.DateTime64DType # type: ignore[assignment] _zarr_v3_name = "numpy.datetime64" unit: DateUnit | TimeUnit @classmethod - def _from_dtype_unsafe(cls, dtype: np.dtypes.DateTime64DType) -> Self: + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: unit: DateUnit | TimeUnit = dtype.name[dtype.name.rfind("[") + 1 : dtype.name.rfind("]")] # type: ignore[assignment] if unit not in get_args(DateUnit) and unit not in get_args(TimeUnit): raise DataTypeValidationError('Invalid unit for "numpy.datetime64"') @@ -1345,8 +1184,19 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.datetime return datetime_from_json(data, self.unit) raise TypeError(f"Invalid type: {data}. Expected an integer.") - def to_json_value(self, data: np.datetime64, *, zarr_format: ZarrFormat) -> int: - return datetime_to_json(data) + def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> int: + return datetime_to_json(data) # type: ignore[arg-type] + + def check_value(self, data: object) -> bool: + # not sure which values we should accept for structured dtypes. 
+ try: + np.array([data], dtype=self.to_dtype()) + return True # noqa: TRY300 + except ValueError: + return False + + def _cast_value_unsafe(self, value: object) -> np.datetime64: + return self.to_dtype().type(value) # type: ignore[no-any-return, call-overload] @dataclass(frozen=True, kw_only=True) @@ -1356,9 +1206,9 @@ class Structured(ZDType[np.dtypes.VoidDType[int], np.void]): fields: tuple[tuple[str, ZDType[_BaseDType, _BaseScalar]], ...] def default_value(self) -> np.void: - return self.cast_value(0) + return self._cast_value_unsafe(0) - def cast_value(self, value: object) -> np.void: + def _cast_value_unsafe(self, value: object) -> np.void: return cast("np.void", np.array([value], dtype=self.to_dtype())[0]) @classmethod @@ -1379,7 +1229,7 @@ def check_dtype(cls, dtype: _BaseDType) -> TypeGuard[np.dtypes.VoidDType[int]]: return super().check_dtype(dtype) and dtype.fields is not None @classmethod - def _from_dtype_unsafe(cls, dtype: np.dtypes.VoidDType[int]) -> Self: + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: from zarr.core.dtype import get_data_type_from_native_dtype fields: list[tuple[str, ZDType[Any, Any]]] = [] @@ -1464,8 +1314,16 @@ def to_dtype(self) -> np.dtypes.VoidDType[int]: np.dtype([(key, dtype.to_dtype()) for (key, dtype) in self.fields]), ) - def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: - return bytes_to_json(data.tobytes(), zarr_format) + def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: + return bytes_to_json(self.cast_value(data).tobytes(), zarr_format) + + def check_value(self, data: object) -> bool: + # not sure which values we should accept for structured dtypes. 
+ try: + np.array([data], dtype=self.to_dtype()) + return True # noqa: TRY300 + except ValueError: + return False def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: if not check_json_str(data): diff --git a/src/zarr/core/dtype/common.py b/src/zarr/core/dtype/common.py index 106da80a61..a53d2e7866 100644 --- a/src/zarr/core/dtype/common.py +++ b/src/zarr/core/dtype/common.py @@ -31,7 +31,7 @@ def check_json_bool(data: JSON) -> TypeGuard[bool]: Bool True if the data is a boolean, False otherwise. """ - return bool(isinstance(data, bool)) + return isinstance(data, bool) def check_json_str(data: JSON) -> TypeGuard[str]: @@ -293,7 +293,7 @@ def complex_to_json_v3(data: complex | np.complexfloating[Any, Any]) -> tuple[JS return float_to_json_v3(data.real), float_to_json_v3(data.imag) -def complex_to_json( +def complex_float_to_json( data: complex | np.complexfloating[Any, Any], zarr_format: ZarrFormat ) -> tuple[JSONFloat, JSONFloat]: """ @@ -424,9 +424,7 @@ def float_from_json(data: JSONFloat, zarr_format: ZarrFormat) -> float: return float_from_json_v3(data) -def complex_from_json_v2( - data: tuple[JSONFloat, JSONFloat], dtype: np.dtypes.Complex64DType | np.dtypes.Complex128DType -) -> np.complexfloating[Any, Any]: +def complex_float_from_json_v2(data: tuple[JSONFloat, JSONFloat]) -> complex: """ Convert a JSON complex float to a complex number (v2). @@ -434,20 +432,16 @@ def complex_from_json_v2( ---------- data : tuple[JSONFloat, JSONFloat] The JSON complex float to convert. - dtype : Any - The numpy dtype. Returns ------- np.complexfloating The complex number. 
""" - return dtype.type(complex(float_from_json_v2(data[0]), float_from_json_v2(data[1]))) + return complex(float_from_json_v2(data[0]), float_from_json_v2(data[1])) -def complex_from_json_v3( - data: tuple[JSONFloat, JSONFloat], dtype: np.dtypes.Complex64DType | np.dtypes.Complex128DType -) -> np.complexfloating[Any, Any]: +def complex_float_from_json_v3(data: tuple[JSONFloat, JSONFloat]) -> complex: """ Convert a JSON complex float to a complex number (v3). @@ -455,20 +449,16 @@ def complex_from_json_v3( ---------- data : tuple[JSONFloat, JSONFloat] The JSON complex float to convert. - dtype : Any - The numpy dtype. Returns ------- np.complexfloating The complex number. """ - return dtype.type(complex(float_from_json_v3(data[0]), float_from_json_v3(data[1]))) + return complex(float_from_json_v3(data[0]), float_from_json_v3(data[1])) -def complex_from_json( - data: tuple[JSONFloat, JSONFloat], dtype: Any, zarr_format: ZarrFormat -) -> np.complexfloating[Any, Any]: +def complex_float_from_json(data: tuple[JSONFloat, JSONFloat], zarr_format: ZarrFormat) -> complex: """ Convert a JSON complex float to a complex number based on zarr format. @@ -476,8 +466,6 @@ def complex_from_json( ---------- data : tuple[JSONFloat, JSONFloat] The JSON complex float to convert. - dtype : Any - The numpy dtype. zarr_format : ZarrFormat The zarr format version. @@ -487,12 +475,9 @@ def complex_from_json( The complex number. """ if zarr_format == 2: - return complex_from_json_v2(data, dtype) + return complex_float_from_json_v2(data) else: - if check_json_complex_float_v3(data): - return complex_from_json_v3(data, dtype) - else: - raise TypeError(f"Invalid type: {data}. Expected a sequence of two numbers.") + return complex_float_from_json_v3(data) raise ValueError(f"Invalid zarr format: {zarr_format}. 
Expected 2 or 3.") diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index 3409fa7ca4..74e7bf79e1 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -17,13 +17,14 @@ # This is the bound for the dtypes that we support. If we support non-numpy dtypes, # then this bound will need to be widened. _BaseDType = np.dtype[np.generic] -TScalar = TypeVar("TScalar", bound=_BaseScalar) +TScalar_co = TypeVar("TScalar_co", bound=_BaseScalar, covariant=True) # TODO: figure out an interface or protocol that non-numpy dtypes can use -TDType = TypeVar("TDType", bound=_BaseDType) +# These two type parameters are covariant because we want isinstance(ZDType[Subclass](), ZDType[BaseDType]) to be True +TDType_co = TypeVar("TDType_co", bound=_BaseDType, covariant=True) @dataclass(frozen=True, kw_only=True, slots=True) -class ZDType(Generic[TDType, TScalar], ABC): +class ZDType(Generic[TDType_co, TScalar_co], ABC): """ Abstract base class for wrapping native array data types, e.g. numpy dtypes @@ -41,11 +42,11 @@ class ZDType(Generic[TDType, TScalar], ABC): # mypy currently disallows class variables to contain type parameters # but it seems OK for us to use it here: # https://github.com/python/typing/discussions/1424#discussioncomment-7989934 - dtype_cls: ClassVar[type[TDType]] # type: ignore[misc] + dtype_cls: ClassVar[type[TDType_co]] # type: ignore[misc] _zarr_v3_name: ClassVar[str] @classmethod - def check_dtype(cls: type[Self], dtype: _BaseDType) -> TypeGuard[TDType]: + def check_dtype(cls: type[Self], dtype: _BaseDType) -> TypeGuard[TDType_co]: """ Check that a data type matches the dtype_cls class attribute. Used as a type guard. @@ -89,7 +90,7 @@ def from_dtype(cls: type[Self], dtype: _BaseDType) -> Self: @classmethod @abstractmethod - def _from_dtype_unsafe(cls: type[Self], dtype: TDType) -> Self: + def _from_dtype_unsafe(cls: type[Self], dtype: _BaseDType) -> Self: """ Wrap a native dtype without checking. 
@@ -106,7 +107,7 @@ def _from_dtype_unsafe(cls: type[Self], dtype: TDType) -> Self: ... @abstractmethod - def to_dtype(self: Self) -> TDType: + def to_dtype(self: Self) -> TDType_co: """ Return an instance of the wrapped dtype. @@ -117,8 +118,61 @@ def to_dtype(self: Self) -> TDType: """ ... + def cast_value(self, data: object) -> TScalar_co: + """ + Cast a value to the wrapped scalar type. The type is first checked for compatibility. If it's + incompatible with the associated scalar type, a ``TypeError`` will be raised. + + Parameters + ---------- + data : TScalar + The scalar value to cast. + + Returns + ------- + TScalar + The cast value. + """ + if self.check_value(data): + return self._cast_value_unsafe(data) + raise TypeError(f"Invalid value: {data}") + + @abstractmethod + def check_value(self, data: object) -> bool: + """ + Check that a value is a valid value for the wrapped data type. + + Parameters + ---------- + data : object + A value to check. + + Returns + ------- + Bool + True if the value is valid, False otherwise. + """ + ... + + @abstractmethod + def _cast_value_unsafe(self, data: object) -> TScalar_co: + """ + Cast a value to the wrapped data type. This method should not perform any input validation. + + Parameters + ---------- + data : TScalar + The scalar value to cast. + + Returns + ------- + TScalar + The cast value. + """ + ... + @abstractmethod - def default_value(self) -> TScalar: + def default_value(self) -> TScalar_co: """ Get the default value for the wrapped data type. This is a method, rather than an attribute, because the default value for some data types may depend on parameters that are not known @@ -216,7 +270,7 @@ def _from_json_unsafe(cls: type[Self], data: JSON, zarr_format: ZarrFormat) -> S ... @abstractmethod - def to_json_value(self, data: TScalar, *, zarr_format: ZarrFormat) -> JSON: + def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> JSON: """ Convert a single value to JSON-serializable format. 
@@ -235,7 +289,7 @@ def to_json_value(self, data: TScalar, *, zarr_format: ZarrFormat) -> JSON: ... @abstractmethod - def from_json_value(self: Self, data: JSON, *, zarr_format: ZarrFormat) -> TScalar: + def from_json_value(self: Self, data: JSON, *, zarr_format: ZarrFormat) -> TScalar_co: """ Read a JSON-serializable value as a scalar. diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index d26ca52353..f3f738eea7 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -9,7 +9,7 @@ from zarr.abc.metadata import Metadata from zarr.core.dtype import get_data_type_from_native_dtype -from zarr.core.dtype.wrapper import TDType, TScalar, ZDType, _BaseDType, _BaseScalar +from zarr.core.dtype.wrapper import TDType_co, TScalar_co, ZDType, _BaseDType, _BaseScalar if TYPE_CHECKING: from typing import Any, Literal, Self @@ -58,7 +58,7 @@ def __init__( self, *, shape: ChunkCoords, - dtype: ZDType[TDType, TScalar], + dtype: ZDType[TDType_co, TScalar_co], chunks: ChunkCoords, fill_value: Any, order: MemoryOrder, @@ -176,7 +176,7 @@ def to_dict(self) -> dict[str, JSON]: zarray_dict["filters"] = new_filters if self.fill_value is not None: - fill_value = self.dtype.to_json_value(self.fill_value, zarr_format=2) # type: ignore[arg-type] + fill_value = self.dtype.to_json_value(self.fill_value, zarr_format=2) zarray_dict["fill_value"] = fill_value zarray_dict["dtype"] = self.dtype.to_json(zarr_format=2) diff --git a/tests/conftest.py b/tests/conftest.py index b2c106f2e2..ac419ae2f5 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -20,7 +20,8 @@ from zarr.core.chunk_grids import RegularChunkGrid, _auto_partition from zarr.core.common import JSON, parse_shapelike from zarr.core.config import config as zarr_config -from zarr.core.dtype import get_data_type_from_native_dtype +from zarr.core.dtype import data_type_registry, get_data_type_from_native_dtype +from zarr.core.dtype._numpy import DateTime64, HasLength, Structured from 
zarr.core.metadata.v2 import ArrayV2Metadata from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.core.sync import sync @@ -36,6 +37,7 @@ from zarr.core.array import CompressorsLike, FiltersLike, SerializerLike, ShardsLike from zarr.core.chunk_key_encodings import ChunkKeyEncoding, ChunkKeyEncodingLike from zarr.core.common import ChunkCoords, MemoryOrder, ShapeLike, ZarrFormat + from zarr.core.dtype.wrapper import ZDType async def parse_store( @@ -404,3 +406,17 @@ def meta_from_array( chunk_key_encoding=chunk_key_encoding, dimension_names=dimension_names, ) + + +# Generate a collection of zdtype instances for use in testing. +zdtype_examples: tuple[ZDType[Any, Any], ...] = () +for wrapper_cls in data_type_registry.contents.values(): + # The Structured dtype has to be constructed with some actual fields + if wrapper_cls is Structured: + zdtype_examples += (wrapper_cls.from_dtype(np.dtype([("a", np.float64), ("b", np.int8)])),) + elif issubclass(wrapper_cls, HasLength): + zdtype_examples += (wrapper_cls(length=1),) + elif issubclass(wrapper_cls, DateTime64): + zdtype_examples += (wrapper_cls(unit="s"),) + else: + zdtype_examples += (wrapper_cls(),) diff --git a/tests/test_array.py b/tests/test_array.py index 5ed5ba06b7..6a562f1d07 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -40,12 +40,15 @@ from zarr.core.common import JSON, MemoryOrder, ZarrFormat from zarr.core.dtype import get_data_type_from_native_dtype from zarr.core.dtype._numpy import Float64 +from zarr.core.dtype.wrapper import ZDType from zarr.core.group import AsyncGroup from zarr.core.indexing import BasicIndexer, ceildiv from zarr.core.sync import sync from zarr.errors import ContainsArrayError, ContainsGroupError from zarr.storage import LocalStore, MemoryStore, StorePath +from .conftest import zdtype_examples + if TYPE_CHECKING: from zarr.core.array_spec import ArrayConfigLike from zarr.core.metadata.v2 import ArrayV2Metadata @@ -177,32 +180,42 @@ def 
test_array_name_properties_with_group( @pytest.mark.parametrize("store", ["memory"], indirect=True) @pytest.mark.parametrize("specifiy_fill_value", [True, False]) -@pytest.mark.parametrize("dtype_str", ["bool", "uint8", "complex64"]) -def test_array_v3_fill_value_default( - store: MemoryStore, specifiy_fill_value: bool, dtype_str: str +@pytest.mark.parametrize( + "zdtype", zdtype_examples, ids=tuple(str(type(v)) for v in zdtype_examples) +) +def test_array_fill_value_default( + store: MemoryStore, specifiy_fill_value: bool, zdtype: ZDType[Any, Any] ) -> None: """ Test that creating an array with the fill_value parameter set to None, or unspecified, results in the expected fill_value attribute of the array, i.e. 0 cast to the array's dtype. """ shape = (10,) - default_fill_value = 0 if specifiy_fill_value: arr = zarr.create_array( store=store, shape=shape, - dtype=dtype_str, + dtype=zdtype, zarr_format=3, chunks=shape, fill_value=None, ) else: - arr = zarr.create_array( - store=store, shape=shape, dtype=dtype_str, zarr_format=3, chunks=shape - ) + arr = zarr.create_array(store=store, shape=shape, dtype=zdtype, zarr_format=3, chunks=shape) + expected_fill_value = zdtype.default_value() + if isinstance(expected_fill_value, np.datetime64 | np.timedelta64): + if np.isnat(expected_fill_value): + assert np.isnat(arr.fill_value) + elif isinstance(expected_fill_value, np.floating | np.complexfloating): + if np.isnan(expected_fill_value): + assert np.isnan(arr.fill_value) + else: + assert arr.fill_value == expected_fill_value + # A simpler check would be to ensure that arr.fill_value.dtype == arr.dtype + # But for some numpy data types (namely, U), scalars might not have length. An empty string + # scalar from a `>U4` array would have dtype `>U`, and arr.fill_value.dtype == arr.dtype will fail. 
- assert arr.fill_value == np.dtype(dtype_str).type(default_fill_value) - assert arr.fill_value.dtype == arr.dtype + assert type(arr.fill_value) is type(np.array([arr.fill_value], dtype=arr.dtype)[0]) @pytest.mark.parametrize("store", ["memory"], indirect=True) @@ -1004,7 +1017,7 @@ async def test_v3_chunk_encoding( filters=filters, compressors=compressors, serializer="auto", - dtype=arr.metadata.data_type, # type: ignore[union-attr] + dtype=arr._zdtype, ) assert arr.filters == filters_expected assert arr.compressors == compressors_expected @@ -1369,4 +1382,4 @@ async def test_sharding_coordinate_selection() -> None: shards=(2, 4, 4), ) arr[:] = np.arange(2 * 3 * 4).reshape((2, 3, 4)) - assert (arr[1, [0, 1]] == np.array([[12, 13, 14, 15], [16, 17, 18, 19]])).all() # type: ignore[index] + assert (arr[1, [0, 1]] == np.array([[12, 13, 14, 15], [16, 17, 18, 19]])).all() diff --git a/tests/test_dtype.py b/tests/test_dtype.py index f690e6ce26..122949664c 100644 --- a/tests/test_dtype.py +++ b/tests/test_dtype.py @@ -8,6 +8,8 @@ import zarr from zarr.core.config import config +from .conftest import zdtype_examples + if TYPE_CHECKING: from collections.abc import Generator @@ -64,6 +66,17 @@ def data_type_registry_fixture() -> DataTypeRegistry: VLEN_STRING_CODE = "O" +def test_zdtype_examples() -> None: + """ + Test that all the elements of the exported union type DTYPE have an example in the variable + zdtype_examples, which we use for testing. + + If this test fails, that means that either there is a data type that does not have an example, + or there is a data type that is missing from the DTYPE union type. 
+ """ + assert set(map(type, zdtype_examples)) == set(get_args(DTYPE)) + + @pytest.mark.parametrize( ("wrapper_cls", "np_dtype"), [ @@ -88,9 +101,7 @@ def data_type_registry_fixture() -> DataTypeRegistry: (DateTime64, "datetime64[s]"), ], ) -def test_wrap( - wrapper_cls: type[ZDType[_BaseDType, _BaseScalar]], np_dtype: np.dtype[np.generic] | str -) -> None: +def test_wrap(wrapper_cls: type[ZDType[Any, Any]], np_dtype: np.dtype[np.generic] | str) -> None: """ Test that the wrapper class has the correct dtype class bound to the dtype_cls variable Test that the ``wrap`` method produces an instance of the wrapper class @@ -102,19 +113,17 @@ def test_wrap( with pytest.raises(DataTypeValidationError, match="Invalid dtype"): wrapper_cls.from_dtype("not a dtype") # type: ignore[arg-type] - assert isinstance(wrapped, wrapper_cls) assert wrapped.to_dtype() == dt -@pytest.mark.parametrize("wrapper_cls", get_args(DTYPE)) -def test_dict_serialization(wrapper_cls: Any, zarr_format: ZarrFormat) -> None: - if issubclass(wrapper_cls, Structured): - instance = wrapper_cls(fields=((("a", Bool()),))) - else: - instance = wrapper_cls() - as_dict = instance.to_json(zarr_format=zarr_format) - assert wrapper_cls.from_json(as_dict, zarr_format=zarr_format) == instance +@pytest.mark.parametrize("zdtype", zdtype_examples) +def test_to_json_roundtrip(zdtype: ZDType[Any, Any], zarr_format: ZarrFormat) -> None: + """ + Test that a zdtype instance can round-trip through its JSON form + """ + as_dict = zdtype.to_json(zarr_format=zarr_format) + assert zdtype.from_json(as_dict, zarr_format=zarr_format) == zdtype @pytest.mark.parametrize( @@ -138,7 +147,7 @@ def test_dict_serialization(wrapper_cls: Any, zarr_format: ZarrFormat) -> None: (FixedLengthBytes(length=3), np.void(b"\x00\x00\x00")), (FixedLengthUnicode(length=3), np.str_("")), ( - Structured(fields=(("a", Float64()), ("b", Int8()))), # type: ignore[arg-type] + Structured(fields=(("a", Float64()), ("b", Int8()))), np.array([0], dtype=[("a", 
np.float64), ("b", np.int8)])[0], ), (VariableLengthString(), ""), @@ -188,6 +197,42 @@ def test_to_json_value_v2( assert wrapper.to_json_value(input_value, zarr_format=2) == expected_json +# NOTE! This test is currently a direct copy of the v2 version. When or if we change JSON serialization +# in a v3-specific manner, this test must be changed. +# TODO: Apply zarr-v3-specific changes to this test as needed +@pytest.mark.parametrize( + ("wrapper", "input_value", "expected_json"), + [ + (Bool(), np.bool_(True), True), + (Int8(), np.int8(42), 42), + (UInt8(), np.uint8(42), 42), + (Int16(), np.int16(42), 42), + (UInt16(), np.uint16(42), 42), + (Int32(), np.int32(42), 42), + (UInt32(), np.uint32(42), 42), + (Int64(), np.int64(42), 42), + (UInt64(), np.uint64(42), 42), + (Float16(), np.float16(42.0), 42.0), + (Float32(), np.float32(42.0), 42.0), + (Float64(), np.float64(42.0), 42.0), + (Complex64(), np.complex64(42.0 + 1.0j), (42.0, 1.0)), + (Complex128(), np.complex128(42.0 + 1.0j), (42.0, 1.0)), + (FixedLengthAscii(length=4), np.bytes_(b"test"), "dGVzdA=="), + (FixedLengthBytes(length=4), np.void(b"test"), "dGVzdA=="), + (FixedLengthUnicode(length=4), np.str_("test"), "test"), + (VariableLengthString(), "test", "test"), + (DateTime64(unit="s"), np.datetime64("2021-01-01T00:00:00", "s"), 1609459200), + ], +) +def test_to_json_value_v3( + wrapper: ZDType[_BaseDType, _BaseScalar], input_value: Any, expected_json: Any +) -> None: + """ + Test the to_json_value method for each dtype wrapper for zarr v3 + """ + assert wrapper.to_json_value(input_value, zarr_format=3) == expected_json + + @pytest.mark.parametrize( ("wrapper", "json_value", "expected_value"), [ @@ -227,7 +272,7 @@ def test_register(data_type_registry_fixture: DataTypeRegistry) -> None: """ Test that registering a dtype in a data type registry works. 
""" - data_type_registry_fixture.register(Bool._zarr_v3_name, Bool) # type: ignore[arg-type] + data_type_registry_fixture.register(Bool._zarr_v3_name, Bool) assert data_type_registry_fixture.get(Bool._zarr_v3_name) == Bool assert isinstance(data_type_registry_fixture.match_dtype(np.dtype("bool")), Bool) @@ -236,13 +281,13 @@ def test_override(data_type_registry_fixture: DataTypeRegistry) -> None: """ Test that registering a new dtype with the same name works (overriding the previous one). """ - data_type_registry_fixture.register(Bool._zarr_v3_name, Bool) # type: ignore[arg-type] + data_type_registry_fixture.register(Bool._zarr_v3_name, Bool) class NewBool(Bool): def default_value(self) -> np.bool_: return np.True_ - data_type_registry_fixture.register(NewBool._zarr_v3_name, NewBool) # type: ignore[arg-type] + data_type_registry_fixture.register(NewBool._zarr_v3_name, NewBool) assert isinstance(data_type_registry_fixture.match_dtype(np.dtype("bool")), NewBool) @staticmethod @@ -275,30 +320,26 @@ def test_unregistered_dtype(data_type_registry_fixture: DataTypeRegistry) -> Non data_type_registry_fixture.get(outside_dtype) @staticmethod - @pytest.mark.parametrize("wrapper_cls", get_args(DTYPE)) + @pytest.mark.parametrize("zdtype", zdtype_examples) def test_registered_dtypes( - wrapper_cls: type[ZDType[_BaseDType, _BaseScalar]], zarr_format: ZarrFormat + zdtype: ZDType[_BaseDType, _BaseScalar], zarr_format: ZarrFormat ) -> None: """ Test that the registered dtypes can be retrieved from the registry. 
""" - if issubclass(wrapper_cls, Structured): - instance = wrapper_cls(fields=((("a", Bool()),))) # type: ignore[misc] - else: - instance = wrapper_cls() - assert data_type_registry.match_dtype(instance.to_dtype()) == instance + assert data_type_registry.match_dtype(zdtype.to_dtype()) == zdtype assert ( data_type_registry.match_json( - instance.to_json(zarr_format=zarr_format), zarr_format=zarr_format + zdtype.to_json(zarr_format=zarr_format), zarr_format=zarr_format ) - == instance + == zdtype ) @staticmethod - @pytest.mark.parametrize("wrapper_cls", get_args(DTYPE)) + @pytest.mark.parametrize("zdtype", zdtype_examples) def test_match_dtype_unique( - wrapper_cls: type[ZDType[_BaseDType, _BaseScalar]], + zdtype: ZDType[Any, Any], data_type_registry_fixture: DataTypeRegistry, zarr_format: ZarrFormat, ) -> None: @@ -308,20 +349,16 @@ def test_match_dtype_unique( fails to match anything in the registry """ for _cls in get_args(DTYPE): - if _cls is not wrapper_cls: + if _cls is not type(zdtype): data_type_registry_fixture.register(_cls._zarr_v3_name, _cls) - if issubclass(wrapper_cls, Structured): - instance = wrapper_cls(fields=((("a", Bool()),))) # type: ignore[misc] - else: - instance = wrapper_cls() - dtype_instance = instance.to_dtype() + dtype_instance = zdtype.to_dtype() msg = f"No data type wrapper found that matches dtype '{dtype_instance}'" with pytest.raises(ValueError, match=re.escape(msg)): data_type_registry_fixture.match_dtype(dtype_instance) - instance_dict = instance.to_json(zarr_format=zarr_format) + instance_dict = zdtype.to_json(zarr_format=zarr_format) msg = f"No data type wrapper found that matches {instance_dict}" with pytest.raises(ValueError, match=re.escape(msg)): data_type_registry_fixture.match_json(instance_dict, zarr_format=zarr_format) diff --git a/tests/test_metadata/test_v3.py b/tests/test_metadata/test_v3.py index bd5f9be8b6..cd30f5cf3f 100644 --- a/tests/test_metadata/test_v3.py +++ b/tests/test_metadata/test_v3.py @@ -13,7 +13,7 @@ 
from zarr.core.config import config from zarr.core.dtype import get_data_type_from_native_dtype from zarr.core.dtype._numpy import DateTime64 -from zarr.core.dtype.common import complex_from_json +from zarr.core.dtype.common import check_json_complex_float from zarr.core.group import GroupMetadata, parse_node_type from zarr.core.metadata.v3 import ( ArrayV3Metadata, @@ -28,7 +28,7 @@ from typing import Any from zarr.abc.codec import Codec - from zarr.core.common import JSON + from zarr.core.common import JSON, ZarrFormat from zarr.core.metadata.v3 import ( @@ -135,17 +135,12 @@ def test_jsonify_fill_value_complex(fill_value: Any, dtype_str: str) -> None: assert dtype.to_json_value(observed, zarr_format=zarr_format) == tuple(fill_value) -@pytest.mark.parametrize("dtype_str", [*complex_dtypes]) @pytest.mark.parametrize("data", [[1.0, 0.0, 3.0], [0, 1, 3], [1]]) -def test_complex_to_json_invalid(data: object, dtype_str: str) -> None: - """ - Test that parse_fill_value(fill_value, dtype) correctly rejects sequences with length not - equal to 2 - """ - dtype_instance = get_data_type_from_native_dtype(dtype_str) - match = f"Invalid type: {data}. Expected a sequence of two numbers." - with pytest.raises(TypeError, match=re.escape(match)): - complex_from_json(data=data, dtype=dtype_instance, zarr_format=3) +def test_complex_to_json_invalid(data: object, zarr_format: ZarrFormat) -> None: + assert not check_json_complex_float(data, zarr_format=zarr_format) + # match = f"Invalid type: {data}. Expected a sequence of two numbers." 
+ # with pytest.raises(TypeError, match=re.escape(match)): + # complex_float_from_json(data=data, zarr_format=3) @pytest.mark.parametrize("fill_value", [{"foo": 10}]) From cb0a7d49614b99b36e2d29ba450739ec71c3da16 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 24 Mar 2025 17:20:37 +0100 Subject: [PATCH 049/130] update data types documentation, and expose core/dtype module to autodoc --- docs/conf.py | 5 +- docs/user-guide/data_types.rst | 203 ++++++++++----------------------- 2 files changed, 66 insertions(+), 142 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index d69309d432..8a9835e4cb 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -68,7 +68,10 @@ def skip_submodules( ) -> bool: # Skip documenting zarr.codecs submodules # codecs are documented in the main zarr.codecs namespace - if what == "module" and name.startswith("zarr.codecs.") or name.startswith("zarr.core"): + # TODO: just document everything instead using this weak case-by-case logic + if what == "module" and name.startswith("zarr.core.dtype."): + skip = False + elif what == "module" and name.startswith("zarr.codecs.") or name.startswith("zarr.core"): skip = True return skip diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index 36a9ea40f7..a281b349de 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -16,14 +16,14 @@ Zarr-Python supports creating arrays with Numpy data types:: Unlike Numpy arrays, Zarr arrays are designed to be persisted to storage and read by Zarr implementations in different programming languages. This means Zarr data types must be interpreted correctly when clients read an array. 
So each Zarr data type defines a procedure for -encoding / decoding that data type to / from Zarr array metadata, and also encoding / decoding **instances** of that data type to / from +encoding/decoding that data type to/from Zarr array metadata, and also encoding/decoding **instances** of that data type to/from array metadata. These serialization procedures depend on the Zarr format. Data types in Zarr version 2 ----------------------------- Version 2 of the Zarr format defined its data types relative to `Numpy's data types `_, and added a few non-Numpy data types as well. -Thus the JSON identifier for a Numpy-compatible data type is just the Numpy ``str`` attribute of that dtype: +Thus the JSON identifier for a Numpy-compatible data type is just the Numpy ``str`` attribute of that dtype:: >>> import zarr >>> import numpy as np @@ -32,158 +32,79 @@ Thus the JSON identifier for a Numpy-compatible data type is just the Numpy ``st >>> np_dtype = np.dtype('int64') >>> z = zarr.create_array(store=store, shape=(1,), dtype=np_dtype, zarr_format=2) >>> dtype_meta = json.loads(store['.zarray'].to_bytes())["dtype"] - >>> assert dtype_meta == np_dtype.str # True + >>> assert dtype_meta == np_dtype.str # True >>> dtype_meta '`_, or "byte order", of the data type. Following Numpy's example, - in Zarr version 2 each data type has an endianness where applicable. However, Zarr version 3 data types do not store endianness information. + The ``<`` character in the data type metadata encodes the `endianness `_, or "byte order", of the data type. Following Numpy's example, + in Zarr version 2 each data type has an endianness where applicable. However, Zarr version 3 data types do not store endianness information. In addition to defining a representation of the data type itself (which in the example above was just a simple string ``"i2``; Zarr V3 represents the same data type as ``int16``. 
-* No endianness -* A data type can be encoded in metadata as a string or a ``JSON`` object with the structure ``{"name": , "configuration": {...}}`` +----------------------------- + +Zarr V3 brings several key changes to how data types are represented: + +- Zarr V3 identifies the basic data types as strings like ``int8``, ``int16``, etc. In Zarr V2 ``int8`` would represented as ``|i1``, ``int16`` would be ``>i2`` **or** ``i2')`` should be saved as ``{..., "dtype" : ">i2"}`` in Zarr V2 metadata. - - In Zarr V3 metadata, the same Numpy data type would be saved as ``{..., "data_type": "int16", "codecs": [..., {"name": "bytes", "configuration": {"endian": "big"}, ...]}`` - -* Associate a default fill value with a native data type. This is not mandated by the Zarr specifications, but it's convenient for users - to have a useful default. For numeric types like integers and floats the default can be statically set to 0, but for - parametric data types like fixed-length strings the default can only be generated after the data type has been parametrized at runtime. - -* Round-trip native scalars to the ``fill_value`` field in Zarr V2 and V3 array metadata documents. The Zarr V2 and V3 specifications - define how scalars of each data type should be stored as JSON in array metadata documents, and in principle each data type - can define this encoding separately. - -* Do all of the above for *user-defined data types*. Zarr-Python should support data types added as extensions,so we cannot - hard-code the list of data types. We need to ensure that users can easily (or easily enough) define a python object - that models their custom data type and register this object with Zarr-Python, so that the above operations all succeed for their - custom data type. - -To achieve these goals, Zarr Python uses a class called :class:`zarr.core.dtype.DTypeWrapper` to wrap native data types. 
Each data type -supported by Zarr Python is modeled by a subclass of `DTypeWrapper`, which has the following structure: - -(attribute) ``dtype_cls`` -^^^^^^^^^^^^^^^^^^^^^^^^^ -The ``dtype_cls`` attribute is a **class variable** that is bound to a class that can produce -an instance of a native data type. For example, on the ``DTypeWrapper`` used to model the boolean -data type, the ``dtype_cls`` attribute is bound to the numpy bool data type class: ``np.dtypes.BoolDType``. -This attribute is used when we need to create an instance of the native data type, for example when -defining a Numpy array that will contain Zarr data. - -It might seem odd that ``DTypeWrapper.dtype_cls`` binds to a *class* that produces a native data type instead of an instance of that native data type -- -why not have a ``DTypeWrapper.dtype`` attribute that binds to ``np.dtypes.BoolDType()``? The reason why ``DTypeWrapper`` -doesn't wrap a concrete data type instance is because data type instances may have endianness information, but Zarr V3 -data types do not. To model Zarr V3 data types, we need endianness to be an **instance variable** which is -defined when creating an instance of the ```DTypeWrapper``. Subclasses of ``DTypeWrapper`` that model data types with -byte order semantics thus have ``endianness`` as an instance variable, and this value can be set when creating an instance of the wrapper. - - -(attribute) ``_zarr_v3_name`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The ``_zarr_v3_name`` attribute encodes the canonical name for a data type for Zarr V3. For many data types these names -are defined in the `Zarr V3 specification `_ For nearly all of the -data types defined in Zarr V3, this name can be used to uniquely specify a data type. The one exception is the ``r*`` data type, -which is parametrized by a number of bits, and so may take the form ``r8``, ``r16``, ... etc. 
- -(class method) ``from_dtype(cls, dtype) -> Self`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -This method defines a procedure for safely converting a native dtype instance into an instance of ``DTypeWrapper``. It should perform -validation of its input to ensure that the native dtype is an instance of the ``dtype_cls`` class attribute, for example. For some -data types, additional checks are needed -- in Numpy "structured" data types and "void" data types use the same class, with different properties. -A ``DTypeWrapper`` that wraps Numpy structured data types must do additional checks to ensure that the input ``dtype`` is actually a structured data type. -If input validation succeeds, this method will call ``_from_dtype_unsafe``. - -(method) ``to_dtype(self) -> dtype`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -This method produces a native data type consistent with the properties of the ``DTypeWrapper``. Together -with ``from_dtype``, this method allows round-trip conversion of a native data type in to a wrapper class and then out again. - -That is, for some ``DTypeWrapper`` class ``FooWrapper`` that wraps a native data type called ``foo``, ``FooWrapper.from_dtype(instance_of_foo).to_dtype() == instance_of_foo`` should be true. - -(method) ``to_dict(self) -> dict`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -This method generates a JSON-serialiazable representation of the wrapped data type which can be stored in -Zarr metadata. - -(method) ``cast_value(self, value: object) -> scalar`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -This method converts a python object to an instance of the wrapped data type. It is used for generating the default -value associated with this data type. - - -(method) ``default_value(self) -> scalar`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -This method returns the default value for the wrapped data type. Zarr-Python uses this method to generate a default fill value -for an array when a user has not requested one. 
- -Why is this a method and not a static attribute? Although some data types -can have a static default value, parametrized data types like fixed-length strings or structured data types cannot. For these data types, -a default value must be calculated based on the attributes of the wrapped data type. - -(class method) ``check_dtype(cls, dtype) -> bool`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -This class method checks if a native dtype is compatible with the ``DTypeWrapper`` class. It returns ``True`` -if ``dtype`` is compatible with the wrapper class, and ``False`` otherwise. For many data types, this check is as simple -as checking that ``cls.dtype_cls`` matches ``type(dtype)``, i.e. checking that the data type class wrapped -by the ``DTypeWrapper`` is the same as the class of ``dtype``. But there are some data types where this check alone is not sufficient, -in which case this method is overridden so that additional properties of ``dtype`` can be inspected and compared with -the expectations of ``cls``. - -(class method) ``from_dict(cls, dtype) -> Self`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -This class method creates a ``DTypeWrapper`` from an appropriately structured dictionary. The default -implementation first checks that the dictionary has the correct structure, and then uses its data -to instantiate the ``DTypeWrapper`` instance. - -(method) ``to_dict(self) -> dict[str, JSON]`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Returns a dictionary form of the wrapped data type. This is used prior to writing array metadata. - -(class method) ``get_name(self, zarr_format: Literal[2, 3]) -> str`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -This method generates a name for the wrapped data type, depending on the Zarr format. If ``zarr_format`` is -2 and the wrapped data type is a Numpy data type, then the Numpy string representation of that data type is returned. 
-If ``zarr_format`` is 3, then the Zarr V3 name for the wrapped data type is returned. For most data types -the Zarr V3 name will be stored as the ``_zarr_v3_name`` class attribute, but for parametric data types the -name must be computed at runtime based on the parameters of the data type. - - -(method) ``to_json_value(self, data: scalar, zarr_format: Literal[2, 3]) -> JSON`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -This method converts a scalar instance of the data type into a JSON-serialiazable value. -For some data types like bool and integers this conversion is simple -- just return a JSON boolean -or number -- but other data types define a JSON serialization for scalars that is a bit more involved. -And this JSON serialization depends on the Zarr format. - -(method) ``from_json_value(self, data: JSON, zarr_format: Literal[2, 3]) -> scalar`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Convert a JSON-serialiazed scalar to a native scalar. This inverts the operation of ``to_json_value``. - -Using a custom data type ------------------------- - -TODO \ No newline at end of file +To abstract over these syntactical and semantic differences, Zarr-Python uses a class called `ZDType <../api/zarr/dtype/index.html#zarr.dtype.ZDType>`_ to wrap native data types (e.g., Numpy data types) and provide Zarr V2 and Zarr V3 compatibility routines. +Each data type supported by Zarr-Python is modeled by a subclass of ``ZDType``, which provides an API for the following operations: + +- Wrapping / unwrapping a native data type +- Encoding / decoding a data type to / from Zarr V2 and Zarr V3 array metadata. +- Encoding / decoding a scalar value to / from Zarr V2 and Zarr V3 array metadata. + + +Example Usage +~~~~~~~~~~~~~ + +.. 
code-block:: python + + from zarr.core.dtype.wrapper import Int8 + + # Create a ZDType instance from a native dtype + int8 = Int8.from_dtype(np.dtype('int8')) + + # Convert back to native dtype + native_dtype = int8.to_dtype() + assert native_dtype == np.dtype('int8') + + # Get the default value + default_value = int8.default_value() + assert default_value == np.int8(0) + + # Serialize to JSON + json_representation = int8.to_json(zarr_format=3) + + # Serialize a scalar value + json_value = int8.to_json_value(42, zarr_format=3) + assert json_value == 42 + + # Deserialize a scalar value + scalar_value = int8.from_json_value(42, zarr_format=3) + assert scalar_value == np.int8(42) + +Custom Data Types +~~~~~~~~~~~~~~~~~ + +Users can define custom data types by subclassing `ZDType` and implementing the required methods. +Once defined, the custom data type can be registered with Zarr-Python to enable seamless integration with the library. + + \ No newline at end of file From 9989c64114364f8a5381a7423be4cb4bfedb9461 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 24 Mar 2025 17:43:12 +0100 Subject: [PATCH 050/130] add failing endianness round-trip test --- tests/test_array.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/tests/test_array.py b/tests/test_array.py index 6a562f1d07..ac35012fa1 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -7,7 +7,7 @@ import re import sys from itertools import accumulate -from typing import TYPE_CHECKING, Any, Literal +from typing import TYPE_CHECKING, Any, Literal, get_args from unittest import mock import numcodecs @@ -39,7 +39,8 @@ from zarr.core.chunk_grids import _auto_partition from zarr.core.common import JSON, MemoryOrder, ZarrFormat from zarr.core.dtype import get_data_type_from_native_dtype -from zarr.core.dtype._numpy import Float64 +from zarr.core.dtype._numpy import Float64, endianness_from_numpy_str +from zarr.core.dtype.common import Endianness from 
zarr.core.dtype.wrapper import ZDType from zarr.core.group import AsyncGroup from zarr.core.indexing import BasicIndexer, ceildiv @@ -1383,3 +1384,20 @@ async def test_sharding_coordinate_selection() -> None: ) arr[:] = np.arange(2 * 3 * 4).reshape((2, 3, 4)) assert (arr[1, [0, 1]] == np.array([[12, 13, 14, 15], [16, 17, 18, 19]])).all() + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +@pytest.mark.parametrize("endianness", get_args(Endianness)) +def test_endianness(store: Store, zarr_format: ZarrFormat, endianness: Endianness) -> None: + """ + Test that that endianness is correctly set when creating an array. + """ + if endianness == "little": + np_dtype = " Date: Mon, 24 Mar 2025 18:28:45 +0100 Subject: [PATCH 051/130] fix endianness --- src/zarr/core/array.py | 33 +++++++++++++++++++++++++++++++ src/zarr/dtype.py | 3 +++ tests/test_array.py | 45 ++++++++++++++++++++++++++++++++---------- 3 files changed, 71 insertions(+), 10 deletions(-) create mode 100644 src/zarr/dtype.py diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 6c34c0d351..978e7d0c62 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -69,6 +69,7 @@ ZDTypeLike, parse_data_type, ) +from zarr.core.dtype._numpy import HasEndianness from zarr.core.indexing import ( BasicIndexer, BasicSelection, @@ -4246,6 +4247,24 @@ def _get_default_chunk_encoding_v3( else: serializer = zarr_config.get("array.v3_default_serializer.default") + # Modify the default serializer so that it matches the endianness of the dtype, otherwise unset the + # endian key + + # This is effective problematic for many reasons: + # - we are assuming that endianness is set by the serializer, when it could also be changed + # by any one of the filters. + # - we are assuming that the serializer has a specific configuration. A different serializer that + # alters endianness might not use the same configuration structure. + # - we are mutating a configuration dictionary. 
It would be much better to work with the codec + # api for this. + # All of these things are acceptable right now because there is only 1 serializer that affects + # endianness, but this design will not last if this situation changes. + if "endian" in serializer["configuration"]: + if isinstance(dtype, HasEndianness): + serializer["configuration"]["endian"] = dtype.endianness + else: + serializer["configuration"].pop("endian") + return ( tuple(_parse_array_array_codec(f) for f in filters), _parse_array_bytes_codec(serializer), @@ -4352,6 +4371,20 @@ def _parse_chunk_encoding_v3( out_array_bytes = default_array_bytes else: out_array_bytes = _parse_array_bytes_codec(serializer) + # check that the endianness of the requested serializer matches the dtype of the data, if applicable + if ( + isinstance(out_array_bytes, BytesCodec) + and isinstance(dtype, HasEndianness) + and ( + out_array_bytes.endian is None + or str(out_array_bytes.endian.value) != dtype.endianness + ) + ): + msg = ( + f"The endianness of the requested serializer ({out_array_bytes}) does not match the endianness of the dtype ({dtype.endianness}). " + "The endianness of the serializer and the dtype must match." + ) + raise ValueError(msg) if compressors is None: out_bytes_bytes: tuple[BytesBytesCodec, ...] 
= () diff --git a/src/zarr/dtype.py b/src/zarr/dtype.py new file mode 100644 index 0000000000..6e3789543b --- /dev/null +++ b/src/zarr/dtype.py @@ -0,0 +1,3 @@ +from zarr.core.dtype import ZDType, data_type_registry + +__all__ = ["ZDType", "data_type_registry"] diff --git a/tests/test_array.py b/tests/test_array.py index ac35012fa1..20de0d6032 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -39,11 +39,12 @@ from zarr.core.chunk_grids import _auto_partition from zarr.core.common import JSON, MemoryOrder, ZarrFormat from zarr.core.dtype import get_data_type_from_native_dtype -from zarr.core.dtype._numpy import Float64, endianness_from_numpy_str +from zarr.core.dtype._numpy import Float64, Int16, endianness_from_numpy_str from zarr.core.dtype.common import Endianness from zarr.core.dtype.wrapper import ZDType from zarr.core.group import AsyncGroup from zarr.core.indexing import BasicIndexer, ceildiv +from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.core.sync import sync from zarr.errors import ContainsArrayError, ContainsGroupError from zarr.storage import LocalStore, MemoryStore, StorePath @@ -53,7 +54,6 @@ if TYPE_CHECKING: from zarr.core.array_spec import ArrayConfigLike from zarr.core.metadata.v2 import ArrayV2Metadata - from zarr.core.metadata.v3 import ArrayV3Metadata @pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=["store"]) @@ -1388,16 +1388,41 @@ async def test_sharding_coordinate_selection() -> None: @pytest.mark.parametrize("store", ["memory"], indirect=True) @pytest.mark.parametrize("endianness", get_args(Endianness)) -def test_endianness(store: Store, zarr_format: ZarrFormat, endianness: Endianness) -> None: +def test_default_endianness(store: Store, zarr_format: ZarrFormat, endianness: Endianness) -> None: """ - Test that that endianness is correctly set when creating an array. 
+ Test that that endianness is correctly set when creating an array when not specifying a serializer + """ + dtype = Int16(endianness=endianness) + arr = zarr.create_array(store=store, shape=(1,), dtype=dtype, zarr_format=zarr_format) + assert endianness_from_numpy_str(arr[:].dtype.byteorder) == endianness + if zarr_format == 3: + assert isinstance(arr.metadata, ArrayV3Metadata) # mypy + assert str(arr.metadata.codecs[0].endian.value) == endianness # type: ignore[union-attr] + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +@pytest.mark.parametrize("endianness", get_args(Endianness)) +def test_explicit_endianness(store: Store, endianness: Endianness) -> None: + """ + Test that that a mismatch between the bytescodec endianness and the dtype endianness is an error """ if endianness == "little": - np_dtype = " Date: Mon, 24 Mar 2025 18:38:08 +0100 Subject: [PATCH 052/130] additional check in test_explicit_endianness --- tests/test_array.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/test_array.py b/tests/test_array.py index 20de0d6032..f08018960f 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1426,3 +1426,19 @@ def test_explicit_endianness(store: Store, endianness: Endianness) -> None: zarr_format=3, serializer=serializer, ) + + # additional check for the case where the serializer has endian=None + none_serializer = dataclasses.replace(serializer, endian=None) + msg = ( + f"The endianness of the requested serializer ({none_serializer}) does not match the endianness of the dtype ({dtype.endianness}). " + "The endianness of the serializer and the dtype must match." 
+ ) + + with pytest.raises(ValueError, match=re.escape(msg)): + _ = zarr.create_array( + store=store, + shape=(1,), + dtype=dtype, + zarr_format=3, + serializer=none_serializer, + ) From 2bffe1a6ccd3aed4f7e1b708fd110fc9418cb9dd Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 24 Mar 2025 21:22:52 +0100 Subject: [PATCH 053/130] add failing test for round-tripping vlen strings --- src/zarr/core/array.py | 4 +- src/zarr/core/dtype/__init__.py | 7 +- src/zarr/core/dtype/_numpy.py | 2 +- src/zarr/core/dtype/common.py | 5 +- tests/test_array.py | 181 +++++++++++++++++++++----------- 5 files changed, 130 insertions(+), 69 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 978e7d0c62..cba4a49410 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -4259,11 +4259,11 @@ def _get_default_chunk_encoding_v3( # api for this. # All of these things are acceptable right now because there is only 1 serializer that affects # endianness, but this design will not last if this situation changes. - if "endian" in serializer["configuration"]: + if serializer.get("configuration") is not None: if isinstance(dtype, HasEndianness): serializer["configuration"]["endian"] = dtype.endianness else: - serializer["configuration"].pop("endian") + serializer["configuration"].pop("endian", None) return ( tuple(_parse_array_array_codec(f) for f in filters), diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py index 021b6b48e2..f9b1364011 100644 --- a/src/zarr/core/dtype/__init__.py +++ b/src/zarr/core/dtype/__init__.py @@ -99,7 +99,12 @@ def get_data_type_from_native_dtype(dtype: npt.DTypeLike) -> ZDType[_BaseDType, # this is a valid _VoidDTypeLike check na_dtype = np.dtype([tuple(d) for d in dtype]) else: - na_dtype = np.dtype(dtype) + if dtype == "|T16": + # `|T16` is the numpy dtype str form for variable length strings. 
unfortunately + # numpy cannot create these directly from np.dtype("|T16") + na_dtype = np.dtypes.StringDType() + else: + na_dtype = np.dtype(dtype) else: na_dtype = dtype return data_type_registry.match_dtype(na_dtype) diff --git a/src/zarr/core/dtype/_numpy.py b/src/zarr/core/dtype/_numpy.py index 38597f8fee..cab849cf74 100644 --- a/src/zarr/core/dtype/_numpy.py +++ b/src/zarr/core/dtype/_numpy.py @@ -907,7 +907,7 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: raise DataTypeValidationError(f"Invalid type: {data}. Expected a string.") def check_value(self, data: object) -> bool: - return isinstance(data, np.bytes_ | str | bytes) + return isinstance(data, np.bytes_ | str | bytes | np.void) def _cast_value_unsafe(self, value: object) -> np.void: return self.to_dtype().type(value) # type: ignore[call-overload, no-any-return] diff --git a/src/zarr/core/dtype/common.py b/src/zarr/core/dtype/common.py index a53d2e7866..900b3fddbd 100644 --- a/src/zarr/core/dtype/common.py +++ b/src/zarr/core/dtype/common.py @@ -356,7 +356,10 @@ def bytes_from_json(data: str, zarr_format: ZarrFormat) -> bytes: """ if zarr_format == 2: return base64.b64decode(data.encode("ascii")) - raise NotImplementedError(f"Invalid zarr format: {zarr_format}. Expected 2.") + # TODO: differentiate these as needed. This is a spec question. + if zarr_format == 3: + return base64.b64decode(data.encode("ascii")) + raise ValueError(f"Invalid zarr format: {zarr_format}. 
Expected 2 or 3.") def float_from_json_v2(data: JSONFloat) -> float: diff --git a/tests/test_array.py b/tests/test_array.py index f08018960f..7b8b72f119 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -39,7 +39,13 @@ from zarr.core.chunk_grids import _auto_partition from zarr.core.common import JSON, MemoryOrder, ZarrFormat from zarr.core.dtype import get_data_type_from_native_dtype -from zarr.core.dtype._numpy import Float64, Int16, endianness_from_numpy_str +from zarr.core.dtype._numpy import ( + DateTime64, + Float64, + Int16, + Structured, + endianness_from_numpy_str, +) from zarr.core.dtype.common import Endianness from zarr.core.dtype.wrapper import ZDType from zarr.core.group import AsyncGroup @@ -936,12 +942,59 @@ def test_chunks_and_shards(store: Store) -> None: assert arr_v2.shards is None @staticmethod - @pytest.mark.parametrize( - ("dtype", "fill_value_expected"), [(" None: + @pytest.mark.parametrize("dtype", zdtype_examples) + def test_default_fill_value(dtype: ZDType[Any, Any], store: Store) -> None: + """ + Test that the fill value of an array is set to the default value for the dtype object + """ a = zarr.create_array(store, shape=(5,), chunks=(5,), dtype=dtype) - assert a.fill_value == fill_value_expected + if isinstance(dtype, DateTime64) and np.isnat(a.fill_value): + assert np.isnat(dtype.default_value()) + else: + assert a.fill_value == dtype.default_value() + + @staticmethod + @pytest.mark.parametrize("dtype", zdtype_examples) + def test_dtype_forms(dtype: ZDType[Any, Any], store: Store, zarr_format: ZarrFormat) -> None: + """ + Test that the same array is produced from a ZDType instance, a numpy dtype, or a numpy string + """ + a = zarr.create_array( + store, name="a", shape=(5,), chunks=(5,), dtype=dtype, zarr_format=zarr_format + ) + b = zarr.create_array( + store, + name="b", + shape=(5,), + chunks=(5,), + dtype=dtype.to_dtype(), + zarr_format=zarr_format, + ) + assert a.dtype == b.dtype + + # Structured dtypes do not have a 
numpy string representation that uniquely identifies them + if not isinstance(dtype, Structured): + c = zarr.create_array( + store, + name="c", + shape=(5,), + chunks=(5,), + dtype=dtype.to_dtype().str, + zarr_format=zarr_format, + ) + assert a.dtype == c.dtype + + @staticmethod + @pytest.mark.parametrize("dtype", zdtype_examples) + def test_dtype_roundtrip( + dtype: ZDType[Any, Any], store: Store, zarr_format: ZarrFormat + ) -> None: + """ + Test that creating an array, then opening it, gets the same array. + """ + a = zarr.create_array(store, shape=(5,), chunks=(5,), dtype=dtype, zarr_format=zarr_format) + b = zarr.open_array(store) + assert a.dtype == b.dtype @staticmethod @pytest.mark.parametrize("dtype", ["uint8", "float32", "str", "U3", "S4", "V1"]) @@ -1266,6 +1319,64 @@ async def test_name(store: Store, zarr_format: ZarrFormat, path: str | None) -> store=store, path=parent_path, mode="r", zarr_format=zarr_format ) + @staticmethod + @pytest.mark.parametrize("endianness", get_args(Endianness)) + def test_default_endianness( + store: Store, zarr_format: ZarrFormat, endianness: Endianness + ) -> None: + """ + Test that that endianness is correctly set when creating an array when not specifying a serializer + """ + dtype = Int16(endianness=endianness) + arr = zarr.create_array(store=store, shape=(1,), dtype=dtype, zarr_format=zarr_format) + assert endianness_from_numpy_str(arr[:].dtype.byteorder) == endianness + if zarr_format == 3: + assert isinstance(arr.metadata, ArrayV3Metadata) # mypy + assert str(arr.metadata.codecs[0].endian.value) == endianness # type: ignore[union-attr] + + @staticmethod + @pytest.mark.parametrize("endianness", get_args(Endianness)) + def test_explicit_endianness(store: Store, endianness: Endianness) -> None: + """ + Test that that a mismatch between the bytescodec endianness and the dtype endianness is an error + """ + if endianness == "little": + dtype = Int16(endianness="big") + else: + dtype = Int16(endianness="little") + + 
serializer = BytesCodec(endian=endianness) + + msg = ( + f"The endianness of the requested serializer ({serializer}) does not match the endianness of the dtype ({dtype.endianness}). " + "The endianness of the serializer and the dtype must match." + ) + + with pytest.raises(ValueError, match=re.escape(msg)): + _ = zarr.create_array( + store=store, + shape=(1,), + dtype=dtype, + zarr_format=3, + serializer=serializer, + ) + + # additional check for the case where the serializer has endian=None + none_serializer = dataclasses.replace(serializer, endian=None) + msg = ( + f"The endianness of the requested serializer ({none_serializer}) does not match the endianness of the dtype ({dtype.endianness}). " + "The endianness of the serializer and the dtype must match." + ) + + with pytest.raises(ValueError, match=re.escape(msg)): + _ = zarr.create_array( + store=store, + shape=(1,), + dtype=dtype, + zarr_format=3, + serializer=none_serializer, + ) + async def test_scalar_array() -> None: arr = zarr.array(1.5) @@ -1384,61 +1495,3 @@ async def test_sharding_coordinate_selection() -> None: ) arr[:] = np.arange(2 * 3 * 4).reshape((2, 3, 4)) assert (arr[1, [0, 1]] == np.array([[12, 13, 14, 15], [16, 17, 18, 19]])).all() - - -@pytest.mark.parametrize("store", ["memory"], indirect=True) -@pytest.mark.parametrize("endianness", get_args(Endianness)) -def test_default_endianness(store: Store, zarr_format: ZarrFormat, endianness: Endianness) -> None: - """ - Test that that endianness is correctly set when creating an array when not specifying a serializer - """ - dtype = Int16(endianness=endianness) - arr = zarr.create_array(store=store, shape=(1,), dtype=dtype, zarr_format=zarr_format) - assert endianness_from_numpy_str(arr[:].dtype.byteorder) == endianness - if zarr_format == 3: - assert isinstance(arr.metadata, ArrayV3Metadata) # mypy - assert str(arr.metadata.codecs[0].endian.value) == endianness # type: ignore[union-attr] - - -@pytest.mark.parametrize("store", ["memory"], 
indirect=True) -@pytest.mark.parametrize("endianness", get_args(Endianness)) -def test_explicit_endianness(store: Store, endianness: Endianness) -> None: - """ - Test that that a mismatch between the bytescodec endianness and the dtype endianness is an error - """ - if endianness == "little": - dtype = Int16(endianness="big") - else: - dtype = Int16(endianness="little") - - serializer = BytesCodec(endian=endianness) - - msg = ( - f"The endianness of the requested serializer ({serializer}) does not match the endianness of the dtype ({dtype.endianness}). " - "The endianness of the serializer and the dtype must match." - ) - - with pytest.raises(ValueError, match=re.escape(msg)): - _ = zarr.create_array( - store=store, - shape=(1,), - dtype=dtype, - zarr_format=3, - serializer=serializer, - ) - - # additional check for the case where the serializer has endian=None - none_serializer = dataclasses.replace(serializer, endian=None) - msg = ( - f"The endianness of the requested serializer ({none_serializer}) does not match the endianness of the dtype ({dtype.endianness}). " - "The endianness of the serializer and the dtype must match." 
- ) - - with pytest.raises(ValueError, match=re.escape(msg)): - _ = zarr.create_array( - store=store, - shape=(1,), - dtype=dtype, - zarr_format=3, - serializer=none_serializer, - ) From aa322715adcba81f90da998ff7f56ba9c379b654 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 25 Mar 2025 10:11:54 +0100 Subject: [PATCH 054/130] route object dtype arrays to vlen string dtype when numpy > 2 --- src/zarr/core/dtype/__init__.py | 12 +++++------- src/zarr/core/dtype/_numpy.py | 2 +- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py index f9b1364011..5483b21998 100644 --- a/src/zarr/core/dtype/__init__.py +++ b/src/zarr/core/dtype/__init__.py @@ -90,7 +90,10 @@ def get_data_type_from_native_dtype(dtype: npt.DTypeLike) -> ZDType[_BaseDType, """ data_type_registry.lazy_load() if not isinstance(dtype, np.dtype): - if dtype in (str, "str"): + # TODO: This check has a lot of assumptions in it! Chiefly, we assume that the + # numpy object dtype contains variable length strings, which is not in general true + # When / if zarr python supports ragged arrays, for example, this check will fail! + if dtype in (str, "str", "|T16", "O", "|O", np.dtypes.ObjectDType()): if _NUMPY_SUPPORTS_VLEN_STRING: na_dtype = np.dtype("T") else: @@ -99,12 +102,7 @@ def get_data_type_from_native_dtype(dtype: npt.DTypeLike) -> ZDType[_BaseDType, # this is a valid _VoidDTypeLike check na_dtype = np.dtype([tuple(d) for d in dtype]) else: - if dtype == "|T16": - # `|T16` is the numpy dtype str form for variable length strings. 
unfortunately - # numpy cannot create these directly from np.dtype("|T16") - na_dtype = np.dtypes.StringDType() - else: - na_dtype = np.dtype(dtype) + na_dtype = np.dtype(dtype) else: na_dtype = dtype return data_type_registry.match_dtype(na_dtype) diff --git a/src/zarr/core/dtype/_numpy.py b/src/zarr/core/dtype/_numpy.py index cab849cf74..7c803ce1f0 100644 --- a/src/zarr/core/dtype/_numpy.py +++ b/src/zarr/core/dtype/_numpy.py @@ -1051,7 +1051,7 @@ def _cast_value_unsafe(self, value: object) -> str: return str(value) else: - + # Numpy pre-2 does not have a variable length string dtype, so we use the Object dtype instead. @dataclass(frozen=True, kw_only=True) class VariableLengthString(ZDType[np.dtypes.ObjectDType, str]): # type: ignore[no-redef] dtype_cls = np.dtypes.ObjectDType From 617d3f05dabeba2b8e5b654406a581965825e2b7 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 25 Mar 2025 10:57:22 +0100 Subject: [PATCH 055/130] relax endianness mismatch to a warning instead of an error --- src/zarr/core/array.py | 4 ++-- tests/test_array.py | 21 +++------------------ tests/test_codecs/test_endian.py | 2 ++ 3 files changed, 7 insertions(+), 20 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index cba4a49410..9a0d0fa83d 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -4382,9 +4382,9 @@ def _parse_chunk_encoding_v3( ): msg = ( f"The endianness of the requested serializer ({out_array_bytes}) does not match the endianness of the dtype ({dtype.endianness}). " - "The endianness of the serializer and the dtype must match." + "In this situation the serializer's endianness takes priority. To avoid this warning, ensure the endianness of the serializer matches the endianness of the dtype." ) - raise ValueError(msg) + warnings.warn(msg, UserWarning, stacklevel=2) if compressors is None: out_bytes_bytes: tuple[BytesBytesCodec, ...] 
= () diff --git a/tests/test_array.py b/tests/test_array.py index 7b8b72f119..2da7f0fa72 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1349,10 +1349,11 @@ def test_explicit_endianness(store: Store, endianness: Endianness) -> None: msg = ( f"The endianness of the requested serializer ({serializer}) does not match the endianness of the dtype ({dtype.endianness}). " - "The endianness of the serializer and the dtype must match." + "In this situation the serializer's endianness takes priority. " + "To avoid this warning, ensure the endianness of the serializer matches the endianness of the dtype." ) - with pytest.raises(ValueError, match=re.escape(msg)): + with pytest.warns(UserWarning, match=re.escape(msg)): _ = zarr.create_array( store=store, shape=(1,), @@ -1361,22 +1362,6 @@ def test_explicit_endianness(store: Store, endianness: Endianness) -> None: serializer=serializer, ) - # additional check for the case where the serializer has endian=None - none_serializer = dataclasses.replace(serializer, endian=None) - msg = ( - f"The endianness of the requested serializer ({none_serializer}) does not match the endianness of the dtype ({dtype.endianness}). " - "The endianness of the serializer and the dtype must match." 
- ) - - with pytest.raises(ValueError, match=re.escape(msg)): - _ = zarr.create_array( - store=store, - shape=(1,), - dtype=dtype, - zarr_format=3, - serializer=none_serializer, - ) - async def test_scalar_array() -> None: arr = zarr.array(1.5) diff --git a/tests/test_codecs/test_endian.py b/tests/test_codecs/test_endian.py index c0c4dd4e75..ab64afb1b8 100644 --- a/tests/test_codecs/test_endian.py +++ b/tests/test_codecs/test_endian.py @@ -11,6 +11,7 @@ from .test_codecs import _AsyncArrayProxy +@pytest.mark.filterwarnings("ignore:The endianness of the requested serializer") @pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) @pytest.mark.parametrize("endian", ["big", "little"]) async def test_endian(store: Store, endian: Literal["big", "little"]) -> None: @@ -32,6 +33,7 @@ async def test_endian(store: Store, endian: Literal["big", "little"]) -> None: assert np.array_equal(data, readback_data) +@pytest.mark.filterwarnings("ignore:The endianness of the requested serializer") @pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) @pytest.mark.parametrize("dtype_input_endian", [">u2", " Date: Tue, 25 Mar 2025 12:14:56 +0100 Subject: [PATCH 056/130] use public dtype module for docs instead of special-casing the core dype module --- docs/conf.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index d29088d070..08f8318fd7 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -51,7 +51,7 @@ issues_github_path = "zarr-developers/zarr-python" autoapi_dirs = ['../src/zarr'] -autoapi_add_toctree_entry = False +autoapi_add_toctree_entry = True autoapi_generate_api_docs = True autoapi_member_order = "groupwise" autoapi_root = "api" @@ -68,10 +68,7 @@ def skip_submodules( ) -> bool: # Skip documenting zarr.codecs submodules # codecs are documented in the main zarr.codecs namespace - # TODO: just document everything instead using this weak case-by-case logic - if what == "module" and 
name.startswith("zarr.core.dtype."): - skip = False - elif what == "module" and name.startswith("zarr.codecs.") or name.startswith("zarr.core"): + if what == "module" and name.startswith("zarr.codecs.") or name.startswith("zarr.core"): skip = True return skip From 1831f206c09ee9b7ca563a42ebdc4bb89772220a Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 25 Mar 2025 12:15:08 +0100 Subject: [PATCH 057/130] use public dtype module for docs instead of special-casing the core dype module --- docs/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/conf.py b/docs/conf.py index 08f8318fd7..9bb1c48901 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -51,7 +51,7 @@ issues_github_path = "zarr-developers/zarr-python" autoapi_dirs = ['../src/zarr'] -autoapi_add_toctree_entry = True +autoapi_add_toctree_entry = False autoapi_generate_api_docs = True autoapi_member_order = "groupwise" autoapi_root = "api" From a427a16192b0fd39bcaa7a16e503786775b2c3ce Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 25 Mar 2025 12:35:17 +0100 Subject: [PATCH 058/130] silence mypy error about array indexing --- tests/test_array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_array.py b/tests/test_array.py index 2da7f0fa72..4579c8bd58 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1479,4 +1479,4 @@ async def test_sharding_coordinate_selection() -> None: shards=(2, 4, 4), ) arr[:] = np.arange(2 * 3 * 4).reshape((2, 3, 4)) - assert (arr[1, [0, 1]] == np.array([[12, 13, 14, 15], [16, 17, 18, 19]])).all() + assert (arr[1, [0, 1]] == np.array([[12, 13, 14, 15], [16, 17, 18, 19]])).all() # type: ignore[index] From 41d7e585eed8875aad5366f269881c55da9c08d0 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 25 Mar 2025 12:35:38 +0100 Subject: [PATCH 059/130] add release note --- changes/2874.feature.rst | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 changes/2874.feature.rst diff --git 
a/changes/2874.feature.rst b/changes/2874.feature.rst new file mode 100644 index 0000000000..26eda3a257 --- /dev/null +++ b/changes/2874.feature.rst @@ -0,0 +1,2 @@ +Adds zarr-specific data type classes. This replaces the direct use of numpy data types for zarr +v2 and a fixed set of string enums for zarr v3. For more on this new feature, see the `documentation `_ \ No newline at end of file From c08ffd9970ab04c43f243902b9e5a5458c8d17ab Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 25 Mar 2025 12:50:26 +0100 Subject: [PATCH 060/130] fix doctests, excluding config tests --- docs/user-guide/groups.rst | 4 ++-- docs/user-guide/performance.rst | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/user-guide/groups.rst b/docs/user-guide/groups.rst index c2a955718b..50e8a68aad 100644 --- a/docs/user-guide/groups.rst +++ b/docs/user-guide/groups.rst @@ -128,7 +128,7 @@ property. E.g.:: >>> bar.info_complete() Type : Array Zarr format : 3 - Data type : int64 + Data type : Int64(endianness='little') Shape : (1000000,) Chunk shape : (100000,) Order : C @@ -144,7 +144,7 @@ property. E.g.:: >>> baz.info Type : Array Zarr format : 3 - Data type : float32 + Data type : Float32(endianness='little') Shape : (1000, 1000) Chunk shape : (100, 100) Order : C diff --git a/docs/user-guide/performance.rst b/docs/user-guide/performance.rst index 5c7844f92c..40882fbf1f 100644 --- a/docs/user-guide/performance.rst +++ b/docs/user-guide/performance.rst @@ -91,7 +91,7 @@ To use sharding, you need to specify the ``shards`` parameter when creating the >>> z6.info Type : Array Zarr format : 3 - Data type : uint8 + Data type : UInt8() Shape : (10000, 10000, 1000) Shard shape : (1000, 1000, 1000) Chunk shape : (100, 100, 100) @@ -121,7 +121,7 @@ ratios, depending on the correlation structure within the data. 
E.g.:: >>> c.info_complete() Type : Array Zarr format : 3 - Data type : int32 + Data type : Int32(endianness='little') Shape : (10000, 10000) Chunk shape : (1000, 1000) Order : C @@ -140,7 +140,7 @@ ratios, depending on the correlation structure within the data. E.g.:: >>> f.info_complete() Type : Array Zarr format : 3 - Data type : int32 + Data type : Int32(endianness='little') Shape : (10000, 10000) Chunk shape : (1000, 1000) Order : F From 778d740c3b47665c6197eaa2e6ffe1c5557f77d5 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 26 Mar 2025 10:17:28 +0100 Subject: [PATCH 061/130] revert addition of linkage between dtype endianness and bytes codec endianness --- src/zarr/core/array.py | 39 ++++++--------------------------------- tests/test_array.py | 33 +-------------------------------- 2 files changed, 7 insertions(+), 65 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 9a0d0fa83d..2a63e07f27 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -69,7 +69,6 @@ ZDTypeLike, parse_data_type, ) -from zarr.core.dtype._numpy import HasEndianness from zarr.core.indexing import ( BasicIndexer, BasicSelection, @@ -4247,24 +4246,6 @@ def _get_default_chunk_encoding_v3( else: serializer = zarr_config.get("array.v3_default_serializer.default") - # Modify the default serializer so that it matches the endianness of the dtype, otherwise unset the - # endian key - - # This is effective problematic for many reasons: - # - we are assuming that endianness is set by the serializer, when it could also be changed - # by any one of the filters. - # - we are assuming that the serializer has a specific configuration. A different serializer that - # alters endianness might not use the same configuration structure. - # - we are mutating a configuration dictionary. It would be much better to work with the codec - # api for this. 
- # All of these things are acceptable right now because there is only 1 serializer that affects - # endianness, but this design will not last if this situation changes. - if serializer.get("configuration") is not None: - if isinstance(dtype, HasEndianness): - serializer["configuration"]["endian"] = dtype.endianness - else: - serializer["configuration"].pop("endian", None) - return ( tuple(_parse_array_array_codec(f) for f in filters), _parse_array_bytes_codec(serializer), @@ -4370,21 +4351,10 @@ def _parse_chunk_encoding_v3( if serializer == "auto": out_array_bytes = default_array_bytes else: + # TODO: ensure that the serializer is compatible with the ndarray produced by the + # array-array codecs. For example, if a sequence of array-array codecs produces an + # array with a single-byte data type, then the serializer should not specify endiannesss. out_array_bytes = _parse_array_bytes_codec(serializer) - # check that the endianness of the requested serializer matches the dtype of the data, if applicable - if ( - isinstance(out_array_bytes, BytesCodec) - and isinstance(dtype, HasEndianness) - and ( - out_array_bytes.endian is None - or str(out_array_bytes.endian.value) != dtype.endianness - ) - ): - msg = ( - f"The endianness of the requested serializer ({out_array_bytes}) does not match the endianness of the dtype ({dtype.endianness}). " - "In this situation the serializer's endianness takes priority. To avoid this warning, ensure the endianness of the serializer matches the endianness of the dtype." - ) - warnings.warn(msg, UserWarning, stacklevel=2) if compressors is None: out_bytes_bytes: tuple[BytesBytesCodec, ...] = () @@ -4404,6 +4374,9 @@ def _parse_chunk_encoding_v3( # TODO: refactor so that the config only contains the name of the codec, and we use the dtype # to create the codec instance, instead of storing a dict representation of a full codec. + # TODO: ensure that the serializer is compatible with the ndarray produced by the + # array-array codecs. 
For example, if a sequence of array-array codecs produces an + # array with a single-byte data type, then the serializer should not specify endiannesss. if isinstance(out_array_bytes, BytesCodec) and dtype.to_dtype().itemsize == 1: # The default endianness in the bytescodec might not be None, so we need to replace it out_array_bytes = replace(out_array_bytes, endian=None) diff --git a/tests/test_array.py b/tests/test_array.py index 4579c8bd58..ae53a3d3d6 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -50,7 +50,6 @@ from zarr.core.dtype.wrapper import ZDType from zarr.core.group import AsyncGroup from zarr.core.indexing import BasicIndexer, ceildiv -from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.core.sync import sync from zarr.errors import ContainsArrayError, ContainsGroupError from zarr.storage import LocalStore, MemoryStore, StorePath @@ -60,6 +59,7 @@ if TYPE_CHECKING: from zarr.core.array_spec import ArrayConfigLike from zarr.core.metadata.v2 import ArrayV2Metadata + from zarr.core.metadata.v3 import ArrayV3Metadata @pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=["store"]) @@ -1330,37 +1330,6 @@ def test_default_endianness( dtype = Int16(endianness=endianness) arr = zarr.create_array(store=store, shape=(1,), dtype=dtype, zarr_format=zarr_format) assert endianness_from_numpy_str(arr[:].dtype.byteorder) == endianness - if zarr_format == 3: - assert isinstance(arr.metadata, ArrayV3Metadata) # mypy - assert str(arr.metadata.codecs[0].endian.value) == endianness # type: ignore[union-attr] - - @staticmethod - @pytest.mark.parametrize("endianness", get_args(Endianness)) - def test_explicit_endianness(store: Store, endianness: Endianness) -> None: - """ - Test that that a mismatch between the bytescodec endianness and the dtype endianness is an error - """ - if endianness == "little": - dtype = Int16(endianness="big") - else: - dtype = Int16(endianness="little") - - serializer = BytesCodec(endian=endianness) - - msg = 
( - f"The endianness of the requested serializer ({serializer}) does not match the endianness of the dtype ({dtype.endianness}). " - "In this situation the serializer's endianness takes priority. " - "To avoid this warning, ensure the endianness of the serializer matches the endianness of the dtype." - ) - - with pytest.warns(UserWarning, match=re.escape(msg)): - _ = zarr.create_array( - store=store, - shape=(1,), - dtype=dtype, - zarr_format=3, - serializer=serializer, - ) async def test_scalar_array() -> None: From 269215eb3005fd653a173c96aa508d9a484df2fb Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 26 Mar 2025 10:45:53 +0100 Subject: [PATCH 062/130] remove Any types --- src/zarr/core/_info.py | 6 +++--- src/zarr/core/array_spec.py | 6 +++--- src/zarr/core/dtype/__init__.py | 6 +++--- src/zarr/core/dtype/_numpy.py | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/zarr/core/_info.py b/src/zarr/core/_info.py index 3e605773bb..525b80c65f 100644 --- a/src/zarr/core/_info.py +++ b/src/zarr/core/_info.py @@ -2,14 +2,14 @@ import dataclasses import textwrap -from typing import TYPE_CHECKING, Any, Literal +from typing import TYPE_CHECKING, Literal if TYPE_CHECKING: import numcodecs.abc from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec from zarr.core.common import ZarrFormat - from zarr.core.dtype.wrapper import ZDType + from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar @dataclasses.dataclass(kw_only=True) @@ -80,7 +80,7 @@ class ArrayInfo: _type: Literal["Array"] = "Array" _zarr_format: ZarrFormat - _data_type: ZDType[Any, Any] + _data_type: ZDType[_BaseDType, _BaseScalar] _shape: tuple[int, ...] _shard_shape: tuple[int, ...] | None = None _chunk_shape: tuple[int, ...] 
| None = None diff --git a/src/zarr/core/array_spec.py b/src/zarr/core/array_spec.py index f1eac930c4..e8e451944f 100644 --- a/src/zarr/core/array_spec.py +++ b/src/zarr/core/array_spec.py @@ -17,7 +17,7 @@ from zarr.core.buffer import BufferPrototype from zarr.core.common import ChunkCoords - from zarr.core.dtype.wrapper import ZDType + from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar class ArrayConfigParams(TypedDict): @@ -89,7 +89,7 @@ def parse_array_config(data: ArrayConfigLike | None) -> ArrayConfig: @dataclass(frozen=True) class ArraySpec: shape: ChunkCoords - dtype: ZDType[Any, Any] + dtype: ZDType[_BaseDType, _BaseScalar] fill_value: Any config: ArrayConfig prototype: BufferPrototype @@ -97,7 +97,7 @@ class ArraySpec: def __init__( self, shape: ChunkCoords, - dtype: ZDType[Any, Any], + dtype: ZDType[_BaseDType, _BaseScalar], fill_value: Any, config: ArrayConfig, prototype: BufferPrototype, diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py index 5483b21998..0aaf9ccf06 100644 --- a/src/zarr/core/dtype/__init__.py +++ b/src/zarr/core/dtype/__init__.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, TypeAlias, get_args +from typing import TYPE_CHECKING, TypeAlias, get_args if TYPE_CHECKING: from zarr.core.common import ZarrFormat @@ -77,7 +77,7 @@ | DateTime64 ) -ZDTypeLike: TypeAlias = npt.DTypeLike | ZDType[Any, Any] | dict[str, JSON] +ZDTypeLike: TypeAlias = npt.DTypeLike | ZDType[_BaseDType, _BaseScalar] | dict[str, JSON] for dtype in get_args(DTYPE): data_type_registry.register(dtype._zarr_v3_name, dtype) @@ -114,7 +114,7 @@ def get_data_type_from_json( return data_type_registry.match_json(dtype, zarr_format=zarr_format) -def parse_data_type(dtype: ZDTypeLike, zarr_format: ZarrFormat) -> ZDType[Any, Any]: +def parse_data_type(dtype: ZDTypeLike, zarr_format: ZarrFormat) -> ZDType[_BaseDType, _BaseScalar]: """ Interpret the input as a ZDType instance. 
""" diff --git a/src/zarr/core/dtype/_numpy.py b/src/zarr/core/dtype/_numpy.py index 7c803ce1f0..51be83b173 100644 --- a/src/zarr/core/dtype/_numpy.py +++ b/src/zarr/core/dtype/_numpy.py @@ -1232,7 +1232,7 @@ def check_dtype(cls, dtype: _BaseDType) -> TypeGuard[np.dtypes.VoidDType[int]]: def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: from zarr.core.dtype import get_data_type_from_native_dtype - fields: list[tuple[str, ZDType[Any, Any]]] = [] + fields: list[tuple[str, ZDType[_BaseDType, _BaseScalar]]] = [] if dtype.fields is None: raise ValueError("numpy dtype has no fields") From 8af0ce420c4622fc76d2c0ab2a243994ff493dcb Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 26 Mar 2025 11:27:54 +0100 Subject: [PATCH 063/130] add docstring for wrapper module --- src/zarr/core/dtype/wrapper.py | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index 74e7bf79e1..ba1b78f096 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -1,3 +1,25 @@ +""" +Wrapper for native array data types. + +The `ZDType` class is an abstract base class for wrapping native array data types, e.g. numpy dtypes. +It provides a common interface for working with data types in a way that is independent of the +underlying data type system. + +The wrapper class encapsulates a native data type. Instances of the class can be created from a +native data type instance, and a native data type instance can be created from an instance of the +wrapper class. + +The wrapper class is responsible for: +- Reversibly serializing a native data type to Zarr V2 or Zarr V3 metadata. + This ensures that the data type can be properly stored and retrieved from array metadata. +- Reversibly serializing scalar values to Zarr V2 or Zarr V3 metadata. This is important for + storing a fill value for an array in a manner that is valid for the data type. 
+ +To add support for a new data type in Zarr, you should subclass the wrapper class and adapt its methods +to support your native data type. The wrapper class must be added to a data type registry +(defined elsewhere) before ``create_array`` can properly handle the new data type. +""" + from __future__ import annotations from abc import ABC, abstractmethod @@ -17,9 +39,10 @@ # This is the bound for the dtypes that we support. If we support non-numpy dtypes, # then this bound will need to be widened. _BaseDType = np.dtype[np.generic] +# These two type parameters are covariant because we want +# x : ZDType[BaseDType, BaseScalar] = ZDType[SubDType, SubScalar] +# to type check TScalar_co = TypeVar("TScalar_co", bound=_BaseScalar, covariant=True) -# TODO: figure out an interface or protocol that non-numpy dtypes can use -# These two type parameters are covariant because we want isinstance(ZDType[Subclass](), ZDType[BaseDType]) to be True TDType_co = TypeVar("TDType_co", bound=_BaseDType, covariant=True) From df60d057f3ebdadac7ce29457a0eabbebc15d2c2 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 26 Mar 2025 15:23:08 +0100 Subject: [PATCH 064/130] simplify config and docs --- docs/user-guide/arrays.rst | 10 +++---- docs/user-guide/config.rst | 59 ++++++++++++++++---------------------- src/zarr/core/array.py | 41 ++++++-------------------- src/zarr/core/config.py | 44 ++++++++++++++++++++++------ 4 files changed, 73 insertions(+), 81 deletions(-) diff --git a/docs/user-guide/arrays.rst b/docs/user-guide/arrays.rst index f55dd00c80..a354298a16 100644 --- a/docs/user-guide/arrays.rst +++ b/docs/user-guide/arrays.rst @@ -182,7 +182,7 @@ which can be used to print useful diagnostics, e.g.:: >>> z.info Type : Array Zarr format : 3 - Data type : int32 + Data type : Int32(endianness='little') Shape : (10000, 10000) Chunk shape : (1000, 1000) Order : C @@ -199,7 +199,7 @@ prints additional diagnostics, e.g.:: >>> z.info_complete() Type : Array Zarr format : 3 - Data 
type : int32 + Data type : Int32(endianness='little') Shape : (10000, 10000) Chunk shape : (1000, 1000) Order : C @@ -246,7 +246,7 @@ built-in delta filter:: The default compressor can be changed by setting the value of the using Zarr's :ref:`user-guide-config`, e.g.:: - >>> with zarr.config.set({'array.v2_default_compressor.numeric': {'id': 'blosc'}}): + >>> with zarr.config.set({'array.v2_default_compressor.default': {'id': 'blosc'}}): ... z = zarr.create_array(store={}, shape=(100000000,), chunks=(1000000,), dtype='int32', zarr_format=2) >>> z.filters () @@ -286,7 +286,7 @@ Here is an example using a delta filter with the Blosc compressor:: >>> z.info Type : Array Zarr format : 3 - Data type : int32 + Data type : Int32(endianness='little') Shape : (10000, 10000) Chunk shape : (1000, 1000) Order : C @@ -600,7 +600,7 @@ Sharded arrays can be created by providing the ``shards`` parameter to :func:`za >>> a.info_complete() Type : Array Zarr format : 3 - Data type : uint8 + Data type : UInt8() Shape : (10000, 10000) Shard shape : (1000, 1000) Chunk shape : (100, 100) diff --git a/docs/user-guide/config.rst b/docs/user-guide/config.rst index 91ffe50b91..4479e30619 100644 --- a/docs/user-guide/config.rst +++ b/docs/user-guide/config.rst @@ -43,39 +43,30 @@ This is the current default configuration:: >>> zarr.config.pprint() {'array': {'order': 'C', - 'v2_default_compressor': {'bytes': {'checksum': False, - 'id': 'zstd', - 'level': 0}, - 'numeric': {'checksum': False, - 'id': 'zstd', - 'level': 0}, - 'string': {'checksum': False, + 'v2_default_compressor': {'default': {'checksum': False, 'id': 'zstd', - 'level': 0}}, - 'v2_default_filters': {'bytes': [{'id': 'vlen-bytes'}], - 'numeric': None, - 'raw': None, - 'string': [{'id': 'vlen-utf8'}]}, - 'v3_default_compressors': {'bytes': [{'configuration': {'checksum': False, - 'level': 0}, - 'name': 'zstd'}], - 'numeric': [{'configuration': {'checksum': False, + 'level': 0}, + 'variable-length-string': {'checksum': False, + 
'id': 'zstd', + 'level': 0}}, + 'v2_default_filters': {'default': None, + 'variable-length-string': [{'id': 'vlen-utf8'}]}, + 'v3_default_compressors': {'default': [{'configuration': {'checksum': False, 'level': 0}, 'name': 'zstd'}], - 'string': [{'configuration': {'checksum': False, - 'level': 0}, - 'name': 'zstd'}]}, - 'v3_default_filters': {'bytes': [], 'numeric': [], 'string': []}, - 'v3_default_serializer': {'bytes': {'name': 'vlen-bytes'}, - 'numeric': {'configuration': {'endian': 'little'}, - 'name': 'bytes'}, - 'string': {'name': 'vlen-utf8'}}, - 'write_empty_chunks': False}, - 'async': {'concurrency': 10, 'timeout': None}, - 'buffer': 'zarr.core.buffer.cpu.Buffer', - 'codec_pipeline': {'batch_size': 1, - 'path': 'zarr.core.codec_pipeline.BatchedCodecPipeline'}, - 'codecs': {'blosc': 'zarr.codecs.blosc.BloscCodec', + 'variable-length-string': [{'configuration': {'checksum': False, + 'level': 0}, + 'name': 'zstd'}]}, + 'v3_default_filters': {'default': [], 'variable-length-string': []}, + 'v3_default_serializer': {'default': {'configuration': {'endian': 'little'}, + 'name': 'bytes'}, + 'variable-length-string': {'name': 'vlen-utf8'}}, + 'write_empty_chunks': False}, + 'async': {'concurrency': 10, 'timeout': None}, + 'buffer': 'zarr.core.buffer.cpu.Buffer', + 'codec_pipeline': {'batch_size': 1, + 'path': 'zarr.core.codec_pipeline.BatchedCodecPipeline'}, + 'codecs': {'blosc': 'zarr.codecs.blosc.BloscCodec', 'bytes': 'zarr.codecs.bytes.BytesCodec', 'crc32c': 'zarr.codecs.crc32c_.Crc32cCodec', 'endian': 'zarr.codecs.bytes.BytesCodec', @@ -85,7 +76,7 @@ This is the current default configuration:: 'vlen-bytes': 'zarr.codecs.vlen_utf8.VLenBytesCodec', 'vlen-utf8': 'zarr.codecs.vlen_utf8.VLenUTF8Codec', 'zstd': 'zarr.codecs.zstd.ZstdCodec'}, - 'default_zarr_format': 3, - 'json_indent': 2, - 'ndbuffer': 'zarr.core.buffer.cpu.NDBuffer', - 'threading': {'max_workers': None}} + 'default_zarr_format': 3, + 'json_indent': 2, + 'ndbuffer': 'zarr.core.buffer.cpu.NDBuffer', 
+ 'threading': {'max_workers': None}} diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 2a63e07f27..8b1fb2d236 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -63,6 +63,7 @@ parse_shapelike, product, ) +from zarr.core.config import categorize_data_type from zarr.core.config import config as zarr_config from zarr.core.dtype import ( ZDType, @@ -4224,27 +4225,12 @@ def _get_default_chunk_encoding_v3( """ Get the default ArrayArrayCodecs, ArrayBytesCodec, and BytesBytesCodec for a given dtype. """ - # the config will not allow keys to have "." characters in them - # so we will access the config by transforming "." to "__" - dtype_name_conf = dtype._zarr_v3_name.replace(".", "__") + dtype_category = categorize_data_type(dtype) - # TODO: find a registry-style solution for this that isn't bloated - # We need to associate specific dtypes with specific encoding schemes - - if dtype_name_conf in zarr_config.get("array.v3_default_filters"): - filters = zarr_config.get(f"array.v3_default_filters.{dtype_name_conf}") - else: - filters = zarr_config.get("array.v3_default_filters.default") - - if dtype_name_conf in zarr_config.get("array.v3_default_compressors"): - compressors = zarr_config.get(f"array.v3_default_compressors.{dtype_name_conf}") - else: - compressors = zarr_config.get("array.v3_default_compressors.default") - if dtype_name_conf in zarr_config.get("array.v3_default_serializer"): - serializer = zarr_config.get(f"array.v3_default_serializer.{dtype_name_conf}") - else: - serializer = zarr_config.get("array.v3_default_serializer.default") + filters = zarr_config.get("array.v3_default_filters").get(dtype_category) + compressors = zarr_config.get("array.v3_default_compressors").get(dtype_category) + serializer = zarr_config.get("array.v3_default_serializer").get(dtype_category) return ( tuple(_parse_array_array_codec(f) for f in filters), @@ -4259,20 +4245,9 @@ def _get_default_chunk_encoding_v2( """ Get the default chunk encoding for 
Zarr format 2 arrays, given a dtype """ - # the config will not allow keys to have "." characters in them - # so we will access the config by transforming "." to "__" - dtype_name_conf = dtype._zarr_v3_name.replace(".", "__") - - if dtype_name_conf in zarr_config.get("array.v2_default_filters"): - filters = zarr_config.get(f"array.v2_default_filters.{dtype_name_conf}") - else: - filters = zarr_config.get("array.v2_default_filters.default") - - if dtype_name_conf in zarr_config.get("array.v2_default_compressor"): - compressor = zarr_config.get(f"array.v2_default_compressor.{dtype_name_conf}") - else: - compressor = zarr_config.get("array.v2_default_compressor.default") - + dtype_category = categorize_data_type(dtype) + filters = zarr_config.get("array.v2_default_filters").get(dtype_category) + compressor = zarr_config.get("array.v2_default_compressor").get(dtype_category) if filters is not None: filters = tuple(numcodecs.get_codec(f) for f in filters) diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 8f87910daa..7c61c2e6ac 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -36,11 +36,21 @@ if TYPE_CHECKING: from donfig.config_obj import ConfigSet + from zarr.core.dtype.wrapper import ZDType + class BadConfigError(ValueError): _msg = "bad Config: %r" +# These values are used for rough categorization of data types +# we use this for choosing a default encoding scheme based on the data type. Specifically, +# these categories are keys in a configuration dictionary. +# it is not a part of the ZDType class because these categories are more of an implementation detail +# of our config system rather than a useful attribute of any particular data type. 
+DTypeCategory = Literal["variable-length-string", "default"] + + class Config(DConfig): # type: ignore[misc] """The Config will collect configuration from config files and environment variables @@ -77,24 +87,26 @@ def enable_gpu(self) -> ConfigSet: "array": { "order": "C", "write_empty_chunks": False, - "v2_default_compressor": {"default": {"id": "zstd", "level": 0, "checksum": False}}, + "v2_default_compressor": { + "default": {"id": "zstd", "level": 0, "checksum": False}, + "variable-length-string": {"id": "zstd", "level": 0, "checksum": False}, + }, "v2_default_filters": { "default": None, - "numpy__variable_length_utf8": [{"id": "vlen-utf8"}], - "numpy__fixed_length_ucs4": [{"id": "vlen-utf8"}], - "numpy__fixed_length_ascii": [{"id": "vlen-bytes"}], + "variable-length-string": [{"id": "vlen-utf8"}], }, - "v3_default_filters": {"default": []}, + "v3_default_filters": {"default": [], "variable-length-string": []}, "v3_default_serializer": { "default": {"name": "bytes", "configuration": {"endian": "little"}}, - "numpy__variable_length_utf8": {"name": "vlen-utf8"}, - "numpy__fixed_length_ucs4": {"name": "vlen-utf8"}, - "r*": {"name": "vlen-bytes"}, + "variable-length-string": {"name": "vlen-utf8"}, }, "v3_default_compressors": { "default": [ {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, - ] + ], + "variable-length-string": [ + {"name": "zstd", "configuration": {"level": 0, "checksum": False}} + ], }, }, "async": {"concurrency": 10, "timeout": None}, @@ -128,3 +140,17 @@ def parse_indexing_order(data: Any) -> Literal["C", "F"]: return cast(Literal["C", "F"], data) msg = f"Expected one of ('C', 'F'), got {data} instead." raise ValueError(msg) + + +def categorize_data_type(dtype: ZDType[Any, Any]) -> DTypeCategory: + """ + Classify a ZDType. The return value is a string which belongs to the type ``DTypeKind``. 
+ + This is used by the config system to determine how to encode arrays with the associated data type + when the user has not specified a particular serialization scheme. + """ + from zarr.core.dtype._numpy import VariableLengthString + + if isinstance(dtype, VariableLengthString): + return "variable-length-string" + return "default" From 7f54bbfe2308f8910acae88b8affeb1f0bf74557 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 26 Mar 2025 17:32:03 +0100 Subject: [PATCH 065/130] update config test --- tests/test_config.py | 112 +++++++++++++++++++++++-------------------- 1 file changed, 60 insertions(+), 52 deletions(-) diff --git a/tests/test_config.py b/tests/test_config.py index 8d6e0a53ed..a2a84e7e7e 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -24,7 +24,7 @@ from zarr.core.buffer import NDBuffer from zarr.core.codec_pipeline import BatchedCodecPipeline from zarr.core.config import BadConfigError, config -from zarr.core.dtype import get_data_type_from_native_dtype +from zarr.core.dtype._numpy import Int8, VariableLengthString from zarr.core.indexing import SelectorTuple from zarr.registry import ( fully_qualified_name, @@ -48,55 +48,60 @@ def test_config_defaults_set() -> None: # regression test for available defaults - assert config.defaults == [ - { - "default_zarr_format": 3, - "array": { - "order": "C", - "write_empty_chunks": False, - "v2_default_compressor": {"default": {"id": "zstd", "level": 0, "checksum": False}}, - "v2_default_filters": { - "default": None, - "numpy__variable_length_utf8": [{"id": "vlen-utf8"}], - "numpy__fixed_length_ucs4": [{"id": "vlen-utf8"}], - "numpy__fixed_length_ascii": [{"id": "vlen-bytes"}], + assert ( + config.defaults + == [ + { + "default_zarr_format": 3, + "array": { + "order": "C", + "write_empty_chunks": False, + "v2_default_compressor": { + "default": {"id": "zstd", "level": 0, "checksum": False}, + "variable-length-string": {"id": "zstd", "level": 0, "checksum": False}, + }, + 
"v2_default_filters": { + "default": None, + "variable-length-string": [{"id": "vlen-utf8"}], + }, + "v3_default_filters": {"default": [], "variable-length-string": []}, + "v3_default_serializer": { + "default": {"name": "bytes", "configuration": {"endian": "little"}}, + "variable-length-string": {"name": "vlen-utf8"}, + }, + "v3_default_compressors": { + "default": [ + {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, + ], + "variable-length-string": [ + {"name": "zstd", "configuration": {"level": 0, "checksum": False}} + ], + }, }, - "v3_default_filters": {"default": []}, - "v3_default_serializer": { - "default": {"name": "bytes", "configuration": {"endian": "little"}}, - "numpy__variable_length_utf8": {"name": "vlen-utf8"}, - "numpy__fixed_length_ucs4": {"name": "vlen-utf8"}, - "r*": {"name": "vlen-bytes"}, + "async": {"concurrency": 10, "timeout": None}, + "threading": {"max_workers": None}, + "json_indent": 2, + "codec_pipeline": { + "path": "zarr.core.codec_pipeline.BatchedCodecPipeline", + "batch_size": 1, }, - "v3_default_compressors": { - "default": [ - {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, - ] + "codecs": { + "blosc": "zarr.codecs.blosc.BloscCodec", + "gzip": "zarr.codecs.gzip.GzipCodec", + "zstd": "zarr.codecs.zstd.ZstdCodec", + "bytes": "zarr.codecs.bytes.BytesCodec", + "endian": "zarr.codecs.bytes.BytesCodec", # compatibility with earlier versions of ZEP1 + "crc32c": "zarr.codecs.crc32c_.Crc32cCodec", + "sharding_indexed": "zarr.codecs.sharding.ShardingCodec", + "transpose": "zarr.codecs.transpose.TransposeCodec", + "vlen-utf8": "zarr.codecs.vlen_utf8.VLenUTF8Codec", + "vlen-bytes": "zarr.codecs.vlen_utf8.VLenBytesCodec", }, - }, - "async": {"concurrency": 10, "timeout": None}, - "threading": {"max_workers": None}, - "json_indent": 2, - "codec_pipeline": { - "path": "zarr.core.codec_pipeline.BatchedCodecPipeline", - "batch_size": 1, - }, - "buffer": "zarr.core.buffer.cpu.Buffer", - "ndbuffer": 
"zarr.core.buffer.cpu.NDBuffer", - "codecs": { - "blosc": "zarr.codecs.blosc.BloscCodec", - "gzip": "zarr.codecs.gzip.GzipCodec", - "zstd": "zarr.codecs.zstd.ZstdCodec", - "bytes": "zarr.codecs.bytes.BytesCodec", - "endian": "zarr.codecs.bytes.BytesCodec", - "crc32c": "zarr.codecs.crc32c_.Crc32cCodec", - "sharding_indexed": "zarr.codecs.sharding.ShardingCodec", - "transpose": "zarr.codecs.transpose.TransposeCodec", - "vlen-utf8": "zarr.codecs.vlen_utf8.VLenUTF8Codec", - "vlen-bytes": "zarr.codecs.vlen_utf8.VLenBytesCodec", - }, - } - ] + "buffer": "zarr.core.buffer.cpu.Buffer", + "ndbuffer": "zarr.core.buffer.cpu.NDBuffer", + } + ] + ) assert config.get("array.order") == "C" assert config.get("async.concurrency") == 10 assert config.get("async.timeout") is None @@ -297,15 +302,18 @@ class NewCodec2(BytesCodec): get_codec_class("new_codec") -@pytest.mark.parametrize("dtype", ["int", "bytes", "str"]) -async def test_default_codecs(dtype: str) -> None: +@pytest.mark.parametrize("dtype_category", ["variable-length-string", "default"]) +async def test_default_codecs(dtype_category: str) -> None: """ Test that the default compressors are sensitive to the current setting of the config. 
""" - zdtype = get_data_type_from_native_dtype(dtype) + if dtype_category == "variable-length-string": + zdtype = VariableLengthString() + else: + zdtype = Int8() expected_compressors = (GzipCodec(),) new_conf = { - f"array.v3_default_compressors.{zdtype._zarr_v3_name.replace('.', '__')}": [ + f"array.v3_default_compressors.{dtype_category}": [ c.to_dict() for c in expected_compressors ] } @@ -313,7 +321,7 @@ async def test_default_codecs(dtype: str) -> None: arr = await create_array( shape=(100,), chunks=(100,), - dtype=dtype, + dtype=zdtype, zarr_format=3, store=MemoryStore(), ) From be83f03058da71718340d35aa337d928d130a724 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 26 Mar 2025 17:39:55 +0100 Subject: [PATCH 066/130] fix S dtype test for v2 --- tests/test_v2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_v2.py b/tests/test_v2.py index f3dec247b7..293359d910 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -87,7 +87,7 @@ async def test_v2_encode_decode(dtype, expected_dtype, fill_value, fill_value_js "compressor": None, "dtype": expected_dtype, "fill_value": fill_value_json, - "filters": [{"id": "vlen-bytes"}] if dtype == "|S" else None, + "filters": None, "order": "C", "shape": [3], "zarr_format": 2, From a210f9fda46e09f0aa0b08e67ca18a87f24d7dfe Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 28 Apr 2025 16:30:02 +0200 Subject: [PATCH 067/130] fully remove v3jsonencoder --- src/zarr/api/asynchronous.py | 1 - src/zarr/core/group.py | 10 +++-- src/zarr/core/metadata/v2.py | 8 ++-- src/zarr/core/metadata/v3.py | 51 +++++------------------- tests/test_metadata/test_consolidated.py | 10 +---- tests/test_properties.py | 33 ++++++++++++--- 6 files changed, 50 insertions(+), 63 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index bbc5f99c31..50cddaa1d6 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -223,7 +223,6 @@ async def 
consolidate_metadata( group, metadata=metadata, ) - await group._save_metadata() return group diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 41c3e33baf..6c8df6aacd 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -49,7 +49,6 @@ ) from zarr.core.config import config from zarr.core.metadata import ArrayV2Metadata, ArrayV3Metadata -from zarr.core.metadata.v3 import V3JsonEncoder from zarr.core.sync import SyncMixin, sync from zarr.errors import ContainsArrayError, ContainsGroupError, MetadataValidationError from zarr.storage import StoreLike, StorePath @@ -334,7 +333,7 @@ def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: if self.zarr_format == 3: return { ZARR_JSON: prototype.buffer.from_bytes( - json.dumps(self.to_dict(), cls=V3JsonEncoder).encode() + json.dumps(self.to_dict(), indent=json_indent, allow_nan=False).encode() ) } else: @@ -343,7 +342,7 @@ def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: json.dumps({"zarr_format": self.zarr_format}, indent=json_indent).encode() ), ZATTRS_JSON: prototype.buffer.from_bytes( - json.dumps(self.attributes, indent=json_indent).encode() + json.dumps(self.attributes, indent=json_indent, allow_nan=False).encode() ), } if self.consolidated_metadata: @@ -354,6 +353,8 @@ def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: consolidated_metadata = self.consolidated_metadata.to_dict()["metadata"] assert isinstance(consolidated_metadata, dict) for k, v in consolidated_metadata.items(): + attrs = v.pop("attributes", {}) + d[f"{k}/{ZATTRS_JSON}"] = attrs if "shape" in v: # it's an array d[f"{k}/{ZARRAY_JSON}"] = v @@ -369,7 +370,7 @@ def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: items[ZMETADATA_V2_JSON] = prototype.buffer.from_bytes( json.dumps( - {"metadata": d, "zarr_consolidated_format": 1}, cls=V3JsonEncoder + {"metadata": d, "zarr_consolidated_format": 1}, allow_nan=False ).encode() ) @@ -608,6 
+609,7 @@ def _from_bytes_v2( consolidated_metadata[path].update(v) else: raise ValueError(f"Invalid file type '{kind}' at path '{path}") + group_metadata["consolidated_metadata"] = { "metadata": dict(consolidated_metadata), "kind": "inline", diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 13e775d0b0..23824520f7 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -3,7 +3,7 @@ import base64 import warnings from collections.abc import Iterable, Sequence -from typing import TYPE_CHECKING, TypedDict +from typing import TYPE_CHECKING, Any, TypedDict import numcodecs.abc @@ -109,7 +109,7 @@ def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: json_indent = config.get("json_indent") return { ZARRAY_JSON: prototype.buffer.from_bytes( - json.dumps(zarray_dict, indent=json_indent).encode() + json.dumps(zarray_dict, indent=json_indent, allow_nan=False).encode() ), ZATTRS_JSON: prototype.buffer.from_bytes( json.dumps(zattrs_dict, indent=json_indent, allow_nan=False).encode() @@ -178,10 +178,12 @@ def to_dict(self) -> dict[str, JSON]: new_filters.append(f) zarray_dict["filters"] = new_filters + # serialize the fill value after dtype-specific JSON encoding if self.fill_value is not None: fill_value = self.dtype.to_json_value(self.fill_value, zarr_format=2) zarray_dict["fill_value"] = fill_value + # serialize the dtype after fill value-specific JSON encoding zarray_dict["dtype"] = self.dtype.to_json(zarr_format=2) return zarray_dict @@ -289,7 +291,7 @@ def _parse_structured_fill_value(fill_value: Any, dtype: np.dtype[Any]) -> Any: raise ValueError(f"Fill_value {fill_value} is not valid for dtype {dtype}.") from e -def parse_fill_value(fill_value: Any, dtype: np.dtype[Any]) -> Any: +def parse_fill_value(fill_value: object, dtype: np.dtype[Any]) -> Any: """ Parse a potential fill value into a value that is compatible with the provided dtype. 
diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index ead05b5e44..559298c13f 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -11,7 +11,6 @@ ) if TYPE_CHECKING: - from collections.abc import Callable from typing import Self from zarr.core.buffer import Buffer, BufferPrototype @@ -25,8 +24,6 @@ from dataclasses import dataclass, field, replace from typing import Any, Literal -import numpy as np - from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec from zarr.core.array_spec import ArrayConfig, ArraySpec from zarr.core.chunk_grids import ChunkGrid, RegularChunkGrid @@ -43,15 +40,6 @@ from zarr.errors import MetadataValidationError, NodeTypeValidationError from zarr.registry import get_codec_class -DEFAULT_DTYPE = "float64" - -# Keep in sync with _replace_special_floats -SPECIAL_FLOATS_ENCODED = { - "Infinity": np.inf, - "-Infinity": -np.inf, - "NaN": np.nan, -} - def parse_zarr_format(data: object) -> Literal[3]: if data == 3: @@ -141,33 +129,6 @@ def parse_storage_transformers(data: object) -> tuple[dict[str, JSON], ...]: ) -class V3JsonEncoder(json.JSONEncoder): - def __init__( - self, - *, - skipkeys: bool = False, - ensure_ascii: bool = True, - check_circular: bool = True, - allow_nan: bool = True, - sort_keys: bool = False, - indent: int | None = None, - separators: tuple[str, str] | None = None, - default: Callable[[object], object] | None = None, - ) -> None: - if indent is None: - indent = config.get("json_indent") - super().__init__( - skipkeys=skipkeys, - ensure_ascii=ensure_ascii, - check_circular=check_circular, - allow_nan=allow_nan, - sort_keys=sort_keys, - indent=indent, - separators=separators, - default=default, - ) - - class ArrayV3MetadataDict(TypedDict): """ A typed dictionary model for zarr v3 metadata. 
@@ -259,6 +220,10 @@ def _validate_metadata(self) -> None: def ndim(self) -> int: return len(self.shape) + @property + def dtype(self) -> ZDType[_BaseDType, _BaseScalar]: + return self.data_type + @property def chunks(self) -> ChunkCoords: if isinstance(self.chunk_grid, RegularChunkGrid): @@ -306,9 +271,13 @@ def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str: return self.chunk_key_encoding.encode_chunk_key(chunk_coords) def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: + json_indent = config.get("json_indent") d = self.to_dict() - # d = _replace_special_floats(self.to_dict()) - return {ZARR_JSON: prototype.buffer.from_bytes(json.dumps(d, cls=V3JsonEncoder).encode())} + return { + ZARR_JSON: prototype.buffer.from_bytes( + json.dumps(d, allow_nan=False, indent=json_indent).encode() + ) + } @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: diff --git a/tests/test_metadata/test_consolidated.py b/tests/test_metadata/test_consolidated.py index 71720af58b..b2244c5047 100644 --- a/tests/test_metadata/test_consolidated.py +++ b/tests/test_metadata/test_consolidated.py @@ -581,7 +581,6 @@ async def test_consolidated_metadata_encodes_special_chars( memory_store: Store, zarr_format: ZarrFormat, fill_value: float ): root = await group(store=memory_store, zarr_format=zarr_format) - _child = await root.create_group("child", attributes={"test": fill_value}) _time = await root.create_array("time", shape=(12,), dtype=np.float64, fill_value=fill_value) await zarr.api.asynchronous.consolidate_metadata(memory_store) @@ -595,16 +594,9 @@ async def test_consolidated_metadata_encodes_special_chars( "consolidated_metadata" ]["metadata"] - if np.isnan(fill_value): - expected_fill_value = "NaN" - elif np.isneginf(fill_value): - expected_fill_value = "-Infinity" - elif np.isinf(fill_value): - expected_fill_value = "Infinity" + expected_fill_value = _time._zdtype.to_json_value(fill_value, zarr_format=2) if zarr_format == 2: - assert 
root_metadata["child/.zattrs"]["test"] == expected_fill_value assert root_metadata["time/.zarray"]["fill_value"] == expected_fill_value elif zarr_format == 3: - assert root_metadata["child"]["attributes"]["test"] == expected_fill_value assert root_metadata["time"]["fill_value"] == expected_fill_value diff --git a/tests/test_properties.py b/tests/test_properties.py index d48dfe2fef..7c741ec873 100644 --- a/tests/test_properties.py +++ b/tests/test_properties.py @@ -239,6 +239,29 @@ def test_roundtrip_array_metadata_from_json(data: st.DataObject, zarr_format: in # assert_array_equal(nparray, zarray[:]) +def serialized_complex_float_is_valid( + serialized: tuple[numbers.Real | str, numbers.Real | str], +) -> bool: + """ + Validate that the serialized representation of a complex float conforms to the spec. + + The specification requires that a serialized complex float must be either: + - A JSON number, or + - One of the strings "NaN", "Infinity", or "-Infinity". + + Args: + serialized: The value produced by JSON serialization for a complex floating point number. + + Returns: + bool: True if the serialized value is valid according to the spec, False otherwise. + """ + return ( + isinstance(serialized, tuple) + and len(serialized) == 2 + and all(serialized_float_is_valid(x) for x in serialized) + ) + + def serialized_float_is_valid(serialized: numbers.Real | str) -> bool: """ Validate that the serialized representation of a float conforms to the spec. @@ -294,11 +317,11 @@ def test_array_metadata_meets_spec(meta: ArrayV2Metadata | ArrayV3Metadata) -> N assert asdict_dict["zarr_format"] == 3 # version-agnostic validations - if meta.dtype.kind == "f": + dtype_native = meta.dtype.to_dtype() + if dtype_native.kind == "f": assert serialized_float_is_valid(asdict_dict["fill_value"]) - elif meta.dtype.kind == "c": + elif dtype_native.kind == "c": # fill_value should be a two-element array [real, imag]. 
- assert serialized_float_is_valid(asdict_dict["fill_value"].real) - assert serialized_float_is_valid(asdict_dict["fill_value"].imag) - elif meta.dtype.kind == "M" and np.isnat(meta.fill_value): + assert serialized_complex_float_is_valid(asdict_dict["fill_value"]) + elif dtype_native.kind == "M" and np.isnat(meta.fill_value): assert asdict_dict["fill_value"] == "NaT" From 8fbd29a42f6529184508bec5f9ef2e08cf9cdd84 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 29 Apr 2025 15:06:11 +0200 Subject: [PATCH 068/130] refactor dtype module structure --- src/zarr/codecs/bytes.py | 2 +- src/zarr/codecs/sharding.py | 2 +- src/zarr/core/config.py | 4 +- src/zarr/core/dtype/__init__.py | 33 +- src/zarr/core/dtype/_numpy.py | 1397 --------------------- src/zarr/core/dtype/common.py | 513 +------- src/zarr/core/dtype/npy/__init__.py | 0 src/zarr/core/dtype/npy/bool.py | 114 ++ src/zarr/core/dtype/npy/common.py | 578 +++++++++ src/zarr/core/dtype/npy/complex.py | 155 +++ src/zarr/core/dtype/npy/float.py | 154 +++ src/zarr/core/dtype/npy/int.py | 318 +++++ src/zarr/core/dtype/npy/sized.py | 382 ++++++ src/zarr/core/dtype/npy/string.py | 134 ++ src/zarr/core/dtype/npy/time.py | 142 +++ src/zarr/core/metadata/v2.py | 2 +- src/zarr/core/metadata/v3.py | 1 + tests/conftest.py | 4 +- tests/package_with_entrypoint/__init__.py | 2 +- tests/test_array.py | 12 +- tests/test_config.py | 2 +- tests/test_dtype.py | 28 +- tests/test_info.py | 2 +- tests/test_metadata/test_v2.py | 3 +- tests/test_metadata/test_v3.py | 4 +- 25 files changed, 2031 insertions(+), 1957 deletions(-) delete mode 100644 src/zarr/core/dtype/_numpy.py create mode 100644 src/zarr/core/dtype/npy/__init__.py create mode 100644 src/zarr/core/dtype/npy/bool.py create mode 100644 src/zarr/core/dtype/npy/common.py create mode 100644 src/zarr/core/dtype/npy/complex.py create mode 100644 src/zarr/core/dtype/npy/float.py create mode 100644 src/zarr/core/dtype/npy/int.py create mode 100644 
src/zarr/core/dtype/npy/sized.py create mode 100644 src/zarr/core/dtype/npy/string.py create mode 100644 src/zarr/core/dtype/npy/time.py diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py index 80972096c2..6c28bfe543 100644 --- a/src/zarr/codecs/bytes.py +++ b/src/zarr/codecs/bytes.py @@ -10,7 +10,7 @@ from zarr.abc.codec import ArrayBytesCodec from zarr.core.buffer import Buffer, NDArrayLike, NDBuffer from zarr.core.common import JSON, parse_enum, parse_named_configuration -from zarr.core.dtype._numpy import endianness_to_numpy_str +from zarr.core.dtype.npy.common import endianness_to_numpy_str from zarr.registry import register_codec if TYPE_CHECKING: diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index e8a23e20c4..12d709b599 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -43,7 +43,7 @@ parse_shapelike, product, ) -from zarr.core.dtype._numpy import UInt64 +from zarr.core.dtype.npy.int import UInt64 from zarr.core.indexing import ( BasicIndexer, SelectorTuple, diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 7c61c2e6ac..08674d9a66 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -144,12 +144,12 @@ def parse_indexing_order(data: Any) -> Literal["C", "F"]: def categorize_data_type(dtype: ZDType[Any, Any]) -> DTypeCategory: """ - Classify a ZDType. The return value is a string which belongs to the type ``DTypeKind``. + Classify a ZDType. The return value is a string which belongs to the type ``DTypeCategory``. This is used by the config system to determine how to encode arrays with the associated data type when the user has not specified a particular serialization scheme. 
""" - from zarr.core.dtype._numpy import VariableLengthString + from zarr.core.dtype import VariableLengthString if isinstance(dtype, VariableLengthString): return "variable-length-string" diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py index 0aaf9ccf06..63b593fd28 100644 --- a/src/zarr/core/dtype/__init__.py +++ b/src/zarr/core/dtype/__init__.py @@ -2,6 +2,18 @@ from typing import TYPE_CHECKING, TypeAlias, get_args +from zarr.core.dtype.npy.bool import Bool +from zarr.core.dtype.npy.complex import Complex64, Complex128 +from zarr.core.dtype.npy.float import Float16, Float32, Float64 +from zarr.core.dtype.npy.int import Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64 +from zarr.core.dtype.npy.sized import ( + FixedLengthAscii, + FixedLengthBytes, + FixedLengthUnicode, + Structured, +) +from zarr.core.dtype.npy.time import DateTime64 + if TYPE_CHECKING: from zarr.core.common import ZarrFormat @@ -9,27 +21,8 @@ import numpy.typing as npt from zarr.core.common import JSON -from zarr.core.dtype._numpy import ( +from zarr.core.dtype.npy.string import ( _NUMPY_SUPPORTS_VLEN_STRING, - Bool, - Complex64, - Complex128, - DateTime64, - FixedLengthAscii, - FixedLengthBytes, - FixedLengthUnicode, - Float16, - Float32, - Float64, - Int8, - Int16, - Int32, - Int64, - Structured, - UInt8, - UInt16, - UInt32, - UInt64, VariableLengthString, ) from zarr.core.dtype.registry import DataTypeRegistry diff --git a/src/zarr/core/dtype/_numpy.py b/src/zarr/core/dtype/_numpy.py deleted file mode 100644 index 51be83b173..0000000000 --- a/src/zarr/core/dtype/_numpy.py +++ /dev/null @@ -1,1397 +0,0 @@ -from __future__ import annotations - -import base64 -import re -import sys -from collections.abc import Sequence -from dataclasses import dataclass -from typing import ( - TYPE_CHECKING, - Any, - ClassVar, - Literal, - Self, - SupportsComplex, - SupportsFloat, - SupportsIndex, - SupportsInt, - TypeGuard, - TypeVar, - cast, - get_args, -) - -import numpy 
as np - -from zarr.core.dtype.common import ( - DataTypeValidationError, - Endianness, - bytes_from_json, - bytes_to_json, - check_json_bool, - check_json_complex_float, - check_json_float, - check_json_int, - check_json_str, - complex_float_from_json, - complex_float_to_json, - datetime_from_json, - datetime_to_json, - float_from_json, - float_to_json, -) -from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar - -if TYPE_CHECKING: - from zarr.core.common import JSON, ZarrFormat - -EndiannessNumpy = Literal[">", "<", "|", "="] -IntLike = SupportsInt | SupportsIndex | bytes | str -FloatLike = SupportsIndex | SupportsFloat | bytes | str -ComplexLike = SupportsFloat | SupportsIndex | SupportsComplex | bytes | str | None - - -@dataclass(frozen=True) -class HasEndianness: - """ - This is a mix-in class for data types with an endianness attribute - """ - - endianness: Endianness | None = "little" - - -@dataclass(frozen=True) -class HasLength: - """ - This is a mix-in class for data types with a length attribute - """ - - length: int - - -@dataclass(frozen=True, kw_only=True, slots=True) -class Bool(ZDType[np.dtypes.BoolDType, np.bool_]): - """ - Wrapper for numpy boolean dtype. - - Attributes - ---------- - name : str - The name of the dtype. - dtype_cls : ClassVar[type[np.dtypes.BoolDType]] - The numpy dtype class. - """ - - _zarr_v3_name = "bool" - _zarr_v2_names: ClassVar[tuple[str, ...]] = ("|b1",) - dtype_cls = np.dtypes.BoolDType - - @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: - return cls() - - def to_dtype(self: Self) -> np.dtypes.BoolDType: - return self.dtype_cls() - - @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[Literal["bool", "|b1"]]: - """ - Check that the input is a valid JSON representation of a bool. 
- """ - if zarr_format == 2: - return data in cls._zarr_v2_names - elif zarr_format == 3: - return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - def to_json(self, zarr_format: ZarrFormat) -> str: - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - return cls() - - def default_value(self) -> np.bool_: - """ - Get the default value for the boolean dtype. - - Returns - ------- - np.bool_ - The default value. - """ - return np.False_ - - def to_json_value(self, data: object, zarr_format: ZarrFormat) -> bool: - """ - Convert a scalar to a python bool. - - Parameters - ---------- - data : object - The value to convert. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - bool - The JSON-serializable format. - """ - return bool(data) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bool_: - """ - Read a JSON-serializable value as a numpy boolean scalar. - - Parameters - ---------- - data : JSON - The JSON-serializable value. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - np.bool_ - The numpy boolean scalar. - """ - if check_json_bool(data): - return self._cast_value_unsafe(data) - raise TypeError(f"Invalid type: {data}. 
Expected a boolean.") - - def check_value(self, data: object) -> bool: - # Anything can become a bool - return True - - def cast_value(self, value: object) -> np.bool_: - return self._cast_value_unsafe(value) - - def _cast_value_unsafe(self, value: object) -> np.bool_: - return np.bool_(value) - - -_NumpyIntDType = ( - np.dtypes.Int8DType - | np.dtypes.Int16DType - | np.dtypes.Int32DType - | np.dtypes.Int64DType - | np.dtypes.UInt8DType - | np.dtypes.UInt16DType - | np.dtypes.UInt32DType - | np.dtypes.UInt64DType -) -_NumpyIntScalar = ( - np.int8 | np.int16 | np.int32 | np.int64 | np.uint8 | np.uint16 | np.uint32 | np.uint64 -) -TIntDType_co = TypeVar("TIntDType_co", bound=_NumpyIntDType, covariant=True) -TIntScalar_co = TypeVar("TIntScalar_co", bound=_NumpyIntScalar, covariant=True) - - -@dataclass(frozen=True) -class BaseInt(ZDType[TIntDType_co, TIntScalar_co]): - # This attribute holds the possible zarr v2 JSON names for the data type - _zarr_v2_names: ClassVar[tuple[str, ...]] - - def to_json(self, zarr_format: ZarrFormat) -> str: - """ - Convert the wrapped data type to a JSON-serializable form. - - Parameters - ---------- - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - str - The JSON-serializable representation of the wrapped data type - """ - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: - """ - Check that the input is a valid JSON representation of this data type. 
- """ - if zarr_format == 2: - return data in cls._zarr_v2_names - elif zarr_format == 3: - return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - def check_value(self, value: object) -> TypeGuard[IntLike]: - return isinstance(value, IntLike) - - def _cast_value_unsafe(self, value: object) -> TIntScalar_co: - if self.check_value(value): - return self.to_dtype().type(value) # type: ignore[return-value] - raise TypeError(f"Invalid type: {value}. Expected a value castable to an integer.") - - def default_value(self) -> TIntScalar_co: - """ - Get the default value, which is 0 cast to this dtype - - Returns - ------- - Int scalar - The default value. - """ - return self._cast_value_unsafe(0) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TIntScalar_co: - """ - Read a JSON-serializable value as a numpy int scalar. - - Parameters - ---------- - data : JSON - The JSON-serializable value. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - TScalar_co - The numpy scalar. - """ - if check_json_int(data): - return self._cast_value_unsafe(data) - raise TypeError(f"Invalid type: {data}. Expected an integer.") - - def to_json_value(self, data: object, zarr_format: ZarrFormat) -> int: - """ - Convert an object to JSON-serializable scalar. - - Parameters - ---------- - data : _BaseScalar - The value to convert. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - int - The JSON-serializable form of the scalar. 
- """ - return int(self.cast_value(data)) - - -@dataclass(frozen=True, kw_only=True) -class Int8(BaseInt[np.dtypes.Int8DType, np.int8]): - dtype_cls = np.dtypes.Int8DType - _zarr_v3_name = "int8" - _zarr_v2_names: ClassVar[tuple[str, ...]] = ("|i1",) - - @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: - return cls() - - def to_dtype(self: Self) -> np.dtypes.Int8DType: - return self.dtype_cls() - - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - return cls() - - -@dataclass(frozen=True, kw_only=True) -class UInt8(BaseInt[np.dtypes.UInt8DType, np.uint8]): - dtype_cls = np.dtypes.UInt8DType - _zarr_v3_name = "uint8" - _zarr_v2_names: ClassVar[tuple[str, ...]] = ("|u1",) - - @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: - return cls() - - def to_dtype(self: Self) -> np.dtypes.UInt8DType: - return self.dtype_cls() - - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - return cls() - - -@dataclass(frozen=True, kw_only=True) -class Int16(BaseInt[np.dtypes.Int16DType, np.int16], HasEndianness): - dtype_cls = np.dtypes.Int16DType - _zarr_v3_name = "int16" - _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i2", " Self: - byte_order = cast("EndiannessNumpy", dtype.byteorder) - return cls(endianness=endianness_from_numpy_str(byte_order)) - - def to_dtype(self) -> np.dtypes.Int16DType: - byte_order = endianness_to_numpy_str(self.endianness) - return self.dtype_cls().newbyteorder(byte_order) - - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - if zarr_format == 2: - # This ensures that we get the endianness correct without annoying string parsing - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - -@dataclass(frozen=True, kw_only=True) -class 
UInt16(BaseInt[np.dtypes.UInt16DType, np.uint16], HasEndianness): - dtype_cls = np.dtypes.UInt16DType - _zarr_v3_name = "uint16" - _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u2", " Self: - byte_order = cast("EndiannessNumpy", dtype.byteorder) - return cls(endianness=endianness_from_numpy_str(byte_order)) - - def to_dtype(self) -> np.dtypes.UInt16DType: - byte_order = endianness_to_numpy_str(self.endianness) - return self.dtype_cls().newbyteorder(byte_order) - - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - -@dataclass(frozen=True, kw_only=True) -class Int32(BaseInt[np.dtypes.Int32DType, np.int32], HasEndianness): - dtype_cls = np.dtypes.Int32DType - _zarr_v3_name = "int32" - _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i4", " Self: - # We override the base implementation to address a windows-specific, pre-numpy 2 issue where - # ``np.dtype('i')`` is an instance of ``np.dtypes.IntDType`` that acts like `int32` instead of ``np.dtype('int32')`` - # In this case, ``type(np.dtype('i')) == np.dtypes.Int32DType`` will evaluate to ``True``, - # despite the two classes being different. 
Thus we will create an instance of `cls` with the - # latter dtype, after pulling in the byte order of the input - if dtype == np.dtypes.Int32DType(): - return cls._from_dtype_unsafe(np.dtypes.Int32DType().newbyteorder(dtype.byteorder)) - else: - return super().from_dtype(dtype) - - @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: - byte_order = cast("EndiannessNumpy", dtype.byteorder) - return cls(endianness=endianness_from_numpy_str(byte_order)) - - def to_dtype(self) -> np.dtypes.Int32DType: - byte_order = endianness_to_numpy_str(self.endianness) - return self.dtype_cls().newbyteorder(byte_order) - - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - -@dataclass(frozen=True, kw_only=True) -class UInt32(BaseInt[np.dtypes.UInt32DType, np.uint32], HasEndianness): - dtype_cls = np.dtypes.UInt32DType - _zarr_v3_name = "uint32" - _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u4", " Self: - byte_order = cast("EndiannessNumpy", dtype.byteorder) - return cls(endianness=endianness_from_numpy_str(byte_order)) - - def to_dtype(self) -> np.dtypes.UInt32DType: - byte_order = endianness_to_numpy_str(self.endianness) - return self.dtype_cls().newbyteorder(byte_order) - - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - -@dataclass(frozen=True, kw_only=True) -class Int64(BaseInt[np.dtypes.Int64DType, np.int64], HasEndianness): - dtype_cls = np.dtypes.Int64DType - _zarr_v3_name = "int64" - _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i8", " Self: - 
byte_order = cast("EndiannessNumpy", dtype.byteorder) - return cls(endianness=endianness_from_numpy_str(byte_order)) - - def to_dtype(self) -> np.dtypes.Int64DType: - byte_order = endianness_to_numpy_str(self.endianness) - return self.dtype_cls().newbyteorder(byte_order) - - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - -@dataclass(frozen=True, kw_only=True) -class UInt64(BaseInt[np.dtypes.UInt64DType, np.uint64], HasEndianness): - dtype_cls = np.dtypes.UInt64DType - _zarr_v3_name = "uint64" - _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u8", " Self: - byte_order = cast("EndiannessNumpy", dtype.byteorder) - return cls(endianness=endianness_from_numpy_str(byte_order)) - - def to_dtype(self) -> np.dtypes.UInt64DType: - byte_order = endianness_to_numpy_str(self.endianness) - return self.dtype_cls().newbyteorder(byte_order) - - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - -TFloatDType_co = TypeVar( - "TFloatDType_co", - bound=np.dtypes.Float16DType | np.dtypes.Float32DType | np.dtypes.Float64DType, - covariant=True, -) -TFloatScalar_co = TypeVar( - "TFloatScalar_co", bound=np.float16 | np.float32 | np.float64, covariant=True -) - - -@dataclass(frozen=True) -class BaseFloat(ZDType[TFloatDType_co, TFloatScalar_co], HasEndianness): - # This attribute holds the possible zarr v2 JSON names for the data type - _zarr_v2_names: ClassVar[tuple[str, ...]] - - @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: - byte_order = 
cast("EndiannessNumpy", dtype.byteorder) - return cls(endianness=endianness_from_numpy_str(byte_order)) - - def to_dtype(self) -> TFloatDType_co: - byte_order = endianness_to_numpy_str(self.endianness) - return self.dtype_cls().newbyteorder(byte_order) # type: ignore[return-value] - - def to_json(self, zarr_format: ZarrFormat) -> str: - """ - Convert the wrapped data type to a JSON-serializable form. - - Parameters - ---------- - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - str - The JSON-serializable representation of the wrapped data type - """ - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: - """ - Check that the input is a valid JSON representation of this data type. - """ - if zarr_format == 2: - return data in cls._zarr_v2_names - elif zarr_format == 3: - return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - def check_value(self, value: object) -> TypeGuard[FloatLike]: - return isinstance(value, FloatLike) - - def _cast_value_unsafe(self, value: object) -> TFloatScalar_co: - if self.check_value(value): - return self.to_dtype().type(value) # type: ignore[return-value] - raise TypeError(f"Invalid type: {value}. Expected a value castable to a float.") - - def default_value(self) -> TFloatScalar_co: - """ - Get the default value, which is 0 cast to this dtype - - Returns - ------- - Int scalar - The default value. 
- """ - return self._cast_value_unsafe(0) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TFloatScalar_co: - """ - Read a JSON-serializable value as a numpy float. - - Parameters - ---------- - data : JSON - The JSON-serializable value. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - TScalar_co - The numpy float. - """ - if check_json_float(data, zarr_format=zarr_format): - return self._cast_value_unsafe(float_from_json(data, zarr_format=zarr_format)) - raise TypeError( - f"Invalid type: {data}. Expected a float or a special string encoding of a float." - ) - - def to_json_value(self, data: object, zarr_format: ZarrFormat) -> float | str: - """ - Convert an object to a JSON-serializable float. - - Parameters - ---------- - data : _BaseScalar - The value to convert. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - JSON - The JSON-serializable form of the float, which is potentially a number or a string. - See the zarr specifications for details on the JSON encoding for floats. - """ - return float_to_json(self._cast_value_unsafe(data), zarr_format=zarr_format) - - -@dataclass(frozen=True, kw_only=True) -class Float16(BaseFloat[np.dtypes.Float16DType, np.float16]): - dtype_cls = np.dtypes.Float16DType - _zarr_v3_name = "float16" - _zarr_v2_names: ClassVar[tuple[str, ...]] = (">f2", "f4", "f8", " Self: - byte_order = cast("EndiannessNumpy", dtype.byteorder) - return cls(endianness=endianness_from_numpy_str(byte_order)) - - def to_dtype(self) -> TComplexDType_co: - byte_order = endianness_to_numpy_str(self.endianness) - return self.dtype_cls().newbyteorder(byte_order) # type: ignore[return-value] - - def to_json(self, zarr_format: ZarrFormat) -> str: - """ - Convert the wrapped data type to a JSON-serializable form. - - Parameters - ---------- - zarr_format : ZarrFormat - The zarr format version. 
- - Returns - ------- - str - The JSON-serializable representation of the wrapped data type - """ - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: - """ - Check that the input is a valid JSON representation of this data type. - """ - if zarr_format == 2: - return data in cls._zarr_v2_names - elif zarr_format == 3: - return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - def check_value(self, value: object) -> bool: - return isinstance(value, ComplexLike) - - def _cast_value_unsafe(self, value: object) -> TComplexScalar_co: - if self.check_value(value): - return self.to_dtype().type(value) # type: ignore[arg-type, return-value] - raise TypeError(f"Invalid type: {value}. Expected a value castable to a complex scalar.") - - def default_value(self) -> TComplexScalar_co: - """ - Get the default value, which is 0 cast to this dtype - - Returns - ------- - Int scalar - The default value. - """ - return self._cast_value_unsafe(0) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TComplexScalar_co: - """ - Read a JSON-serializable value as a numpy float. - - Parameters - ---------- - data : JSON - The JSON-serializable value. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - TScalar_co - The numpy float. 
- """ - if check_json_complex_float(data, zarr_format=zarr_format): - return self._cast_value_unsafe(complex_float_from_json(data, zarr_format=zarr_format)) - raise TypeError( - f"Invalid type: {data}. Expected a float or a special string encoding of a float." - ) - - def to_json_value(self, data: object, zarr_format: ZarrFormat) -> JSON: - """ - Convert an object to a JSON-serializable float. - - Parameters - ---------- - data : _BaseScalar - The value to convert. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - JSON - The JSON-serializable form of the complex number, which is a list of two floats, - each of which is encoding according to a zarr-format-specific encoding. - """ - return complex_float_to_json(self.cast_value(data), zarr_format=zarr_format) - - -@dataclass(frozen=True, kw_only=True) -class Complex64(BaseComplex[np.dtypes.Complex64DType, np.complex64]): - dtype_cls = np.dtypes.Complex64DType - _zarr_v3_name = "complex64" - _zarr_v2_names: ClassVar[tuple[str, ...]] = (">c8", "c16", " Self: - return cls(length=dtype.itemsize // (cls.item_size_bits // 8)) - - def to_dtype(self) -> np.dtypes.BytesDType[int]: - return self.dtype_cls(self.length) - - @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: - """ - Check that the input is a valid JSON representation of a numpy S dtype. 
- """ - if zarr_format == 2: - # match |S1, |S2, etc - return isinstance(data, str) and re.match(r"^\|S\d+$", data) is not None - elif zarr_format == 3: - return ( - isinstance(data, dict) - and "name" in data - and data["name"] == cls._zarr_v3_name - and "configuration" in data - and isinstance(data["configuration"], dict) - and "length_bits" in data["configuration"] - and isinstance(data["configuration"]["length_bits"], int) - ) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - def to_json(self, zarr_format: ZarrFormat) -> JSON: - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return { - "name": self._zarr_v3_name, - "configuration": {"length_bits": self.length * self.item_size_bits}, - } - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls(length=data["configuration"]["length_bits"] // cls.item_size_bits) # type: ignore[arg-type, index, call-overload, operator] - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - def default_value(self) -> np.bytes_: - return np.bytes_(b"") - - def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: - return base64.standard_b64encode(data).decode("ascii") # type: ignore[arg-type] - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: - if check_json_str(data): - return self.to_dtype().type(base64.standard_b64decode(data.encode("ascii"))) - raise TypeError(f"Invalid type: {data}. 
Expected a string.") - - def check_value(self, data: object) -> bool: - return isinstance(data, np.bytes_ | str | bytes) - - def _cast_value_unsafe(self, value: object) -> np.bytes_: - return self.to_dtype().type(value) - - -@dataclass(frozen=True, kw_only=True) -class FixedLengthBytes(ZDType[np.dtypes.VoidDType[int], np.void], HasLength): - # np.dtypes.VoidDType is specified in an odd way in numpy - # it cannot be used to create instances of the dtype - # so we have to tell mypy to ignore this here - dtype_cls = np.dtypes.VoidDType # type: ignore[assignment] - _zarr_v3_name = "numpy.void" - item_size_bits: ClassVar[int] = 8 - - @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: - return cls(length=dtype.itemsize // (cls.item_size_bits // 8)) - - def to_dtype(self) -> np.dtypes.VoidDType[int]: - # Numpy does not allow creating a void type - # by invoking np.dtypes.VoidDType directly - return cast("np.dtypes.VoidDType[int]", np.dtype(f"V{self.length}")) - - @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: - if zarr_format == 2: - # Check that the dtype is |V1, |V2, ... 
- return isinstance(data, str) and re.match(r"^\|V\d+$", data) is not None - elif zarr_format == 3: - return ( - isinstance(data, dict) - and "name" in data - and isinstance(data["name"], str) - and (re.match(r"^r\d+$", data["name"]) is not None) - ) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - def to_json(self, zarr_format: ZarrFormat) -> JSON: - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return {"name": f"r{self.length * self.item_size_bits}"} - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls(length=int(data["name"][1:]) // cls.item_size_bits) # type: ignore[arg-type, index, call-overload] - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def check_dtype(cls: type[Self], dtype: _BaseDType) -> TypeGuard[np.dtypes.VoidDType[Any]]: - """ - Numpy void dtype comes in two forms: - * If the ``fields`` attribute is ``None``, then the dtype represents N raw bytes. - * If the ``fields`` attribute is not ``None``, then the dtype represents a structured dtype, - - In this check we ensure that ``fields`` is ``None``. - - Parameters - ---------- - dtype : TDType - The dtype to check. - - Returns - ------- - Bool - True if the dtype matches, False otherwise. 
- """ - return cls.dtype_cls is type(dtype) and dtype.fields is None # type: ignore[has-type] - - def default_value(self) -> np.void: - return self.to_dtype().type(("\x00" * self.length).encode("ascii")) - - def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: - return base64.standard_b64encode(self.cast_value(data).tobytes()).decode("ascii") - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: - if check_json_str(data): - return self.to_dtype().type(base64.standard_b64decode(data)) - raise DataTypeValidationError(f"Invalid type: {data}. Expected a string.") - - def check_value(self, data: object) -> bool: - return isinstance(data, np.bytes_ | str | bytes | np.void) - - def _cast_value_unsafe(self, value: object) -> np.void: - return self.to_dtype().type(value) # type: ignore[call-overload, no-any-return] - - -@dataclass(frozen=True, kw_only=True) -class FixedLengthUnicode(ZDType[np.dtypes.StrDType[int], np.str_], HasEndianness, HasLength): - dtype_cls = np.dtypes.StrDType - _zarr_v3_name = "numpy.fixed_length_ucs4" - item_size_bits: ClassVar[int] = 32 # UCS4 is 32 bits per code point - - @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: - byte_order = cast("EndiannessNumpy", dtype.byteorder) - return cls( - length=dtype.itemsize // (cls.item_size_bits // 8), - endianness=endianness_from_numpy_str(byte_order), - ) - - def to_dtype(self) -> np.dtypes.StrDType[int]: - byte_order = endianness_to_numpy_str(self.endianness) - return self.dtype_cls(self.length).newbyteorder(byte_order) - - @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: - """ - Check that the input is a valid JSON representation of a numpy S dtype. 
- """ - if zarr_format == 2: - # match >U1, <]U\d+$", data) is not None - elif zarr_format == 3: - return ( - isinstance(data, dict) - and "name" in data - and data["name"] == cls._zarr_v3_name - and "configuration" in data - and isinstance(data["configuration"], dict) - and "length_bits" in data["configuration"] - and isinstance(data["configuration"]["length_bits"], int) - ) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - def to_json(self, zarr_format: ZarrFormat) -> JSON: - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return { - "name": self._zarr_v3_name, - "configuration": {"length_bits": self.length * self.item_size_bits}, - } - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls(length=data["configuration"]["length_bits"] // cls.item_size_bits) # type: ignore[arg-type, index, call-overload, operator] - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - def default_value(self) -> np.str_: - return np.str_("") - - def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: - return str(data) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_: - if not check_json_str(data): - raise TypeError(f"Invalid type: {data}. 
Expected a string.") - return self.to_dtype().type(data) - - def check_value(self, data: object) -> bool: - return isinstance(data, str | np.str_ | bytes) - - def _cast_value_unsafe(self, value: object) -> np.str_: - return self.to_dtype().type(value) - - -_NUMPY_SUPPORTS_VLEN_STRING = hasattr(np.dtypes, "StringDType") - - -if _NUMPY_SUPPORTS_VLEN_STRING: - - @dataclass(frozen=True, kw_only=True) - class VariableLengthString(ZDType[np.dtypes.StringDType, str]): # type: ignore[type-var] - dtype_cls = np.dtypes.StringDType - _zarr_v3_name = "numpy.variable_length_utf8" - - @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: - return cls() - - def to_dtype(self) -> np.dtypes.StringDType: - return self.dtype_cls() - - @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: - """ - Check that the input is a valid JSON representation of a numpy string dtype. - """ - if zarr_format == 2: - # TODO: take the entire metadata document in here, and - # check the compressors / filters for vlen-utf8 - # Note that we are checking for the object dtype name. - return data == "|O" - elif zarr_format == 3: - return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - def to_json(self, zarr_format: ZarrFormat) -> JSON: - if zarr_format == 2: - # Note: unlike many other numpy data types, we don't serialize the .str attribute - # of the data type to JSON. 
This is because Zarr was using `|O` for strings before the - # numpy variable length string data type existed, and we want to be consistent with - # that practice - return "|O" - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - return cls() - - def default_value(self) -> str: - return "" - - def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: - return str(data) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: - if not check_json_str(data): - raise TypeError(f"Invalid type: {data}. Expected a string.") - return data - - def check_value(self, data: object) -> bool: - return isinstance(data, str) - - def _cast_value_unsafe(self, value: object) -> str: - return str(value) - -else: - # Numpy pre-2 does not have a variable length string dtype, so we use the Object dtype instead. - @dataclass(frozen=True, kw_only=True) - class VariableLengthString(ZDType[np.dtypes.ObjectDType, str]): # type: ignore[no-redef] - dtype_cls = np.dtypes.ObjectDType - _zarr_v3_name = "numpy.variable_length_utf8" - - @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: - return cls() - - def to_dtype(self) -> np.dtypes.ObjectDType: - return self.dtype_cls() - - @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: - """ - Check that the input is a valid JSON representation of a numpy O dtype. 
- """ - if zarr_format == 2: - # TODO: take the entire metadata document in here, and - # check the compressors / filters for vlen-utf8 - return data == "|O" - elif zarr_format == 3: - return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - def to_json(self, zarr_format: ZarrFormat) -> JSON: - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - return cls() - - def default_value(self) -> str: - return "" - - def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: - return data # type: ignore[return-value] - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: - """ - Strings pass through - """ - if not check_json_str(data): - raise TypeError(f"Invalid type: {data}. 
Expected a string.") - return data - - def check_value(self, data: object) -> bool: - return isinstance(data, str) - - def _cast_value_unsafe(self, value: object) -> str: - return str(value) - - -DateUnit = Literal["Y", "M", "W", "D"] -TimeUnit = Literal["h", "m", "s", "ms", "us", "μs", "ns", "ps", "fs", "as"] - - -@dataclass(frozen=True, kw_only=True, slots=True) -class DateTime64(ZDType[np.dtypes.DateTime64DType, np.datetime64], HasEndianness): - dtype_cls = np.dtypes.DateTime64DType # type: ignore[assignment] - _zarr_v3_name = "numpy.datetime64" - unit: DateUnit | TimeUnit - - @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: - unit: DateUnit | TimeUnit = dtype.name[dtype.name.rfind("[") + 1 : dtype.name.rfind("]")] # type: ignore[assignment] - if unit not in get_args(DateUnit) and unit not in get_args(TimeUnit): - raise DataTypeValidationError('Invalid unit for "numpy.datetime64"') - byteorder = cast("EndiannessNumpy", dtype.byteorder) - return cls(unit=unit, endianness=endianness_from_numpy_str(byteorder)) - - def to_dtype(self) -> np.dtypes.DateTime64DType: - # Numpy does not allow creating datetime64 via - # np.dtypes.DateTime64Dtype() - return cast( - "np.dtypes.DateTime64DType", - np.dtype(f"datetime64[{self.unit}]").newbyteorder( - endianness_to_numpy_str(self.endianness) - ), - ) - - @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: - if zarr_format == 2: - # match M[M], etc - # consider making this a standalone function - return ( - isinstance(data, str) - and len(data) in (6, 7) - and data[0] in (">", "<") - and data[1:4] == "M8[" - and data[4:-1] in get_args(TimeUnit) + get_args(DateUnit) - and data[-1] == "]" - ) - elif zarr_format == 3: - return ( - isinstance(data, dict) - and "name" in data - and data["name"] == cls._zarr_v3_name - and "configuration" in data - and "unit" in data["configuration"] - and data["configuration"]["unit"] in get_args(DateUnit) + get_args(TimeUnit) - ) - raise 
ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - def default_value(self) -> np.datetime64: - return np.datetime64("NaT") - - def to_json(self, zarr_format: ZarrFormat) -> JSON: - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return {"name": self._zarr_v3_name, "configuration": {"unit": self.unit}} - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls(unit=data["configuration"]["unit"]) # type: ignore[arg-type, index, call-overload] - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.datetime64: - if check_json_int(data): - return datetime_from_json(data, self.unit) - raise TypeError(f"Invalid type: {data}. Expected an integer.") - - def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> int: - return datetime_to_json(data) # type: ignore[arg-type] - - def check_value(self, data: object) -> bool: - # not sure which values we should accept for structured dtypes. - try: - np.array([data], dtype=self.to_dtype()) - return True # noqa: TRY300 - except ValueError: - return False - - def _cast_value_unsafe(self, value: object) -> np.datetime64: - return self.to_dtype().type(value) # type: ignore[no-any-return, call-overload] - - -@dataclass(frozen=True, kw_only=True) -class Structured(ZDType[np.dtypes.VoidDType[int], np.void]): - dtype_cls = np.dtypes.VoidDType # type: ignore[assignment] - _zarr_v3_name = "structured" - fields: tuple[tuple[str, ZDType[_BaseDType, _BaseScalar]], ...] 
- - def default_value(self) -> np.void: - return self._cast_value_unsafe(0) - - def _cast_value_unsafe(self, value: object) -> np.void: - return cast("np.void", np.array([value], dtype=self.to_dtype())[0]) - - @classmethod - def check_dtype(cls, dtype: _BaseDType) -> TypeGuard[np.dtypes.VoidDType[int]]: - """ - Check that this dtype is a numpy structured dtype - - Parameters - ---------- - dtype : np.dtypes.DTypeLike - The dtype to check. - - Returns - ------- - TypeGuard[np.dtypes.VoidDType] - True if the dtype matches, False otherwise. - """ - return super().check_dtype(dtype) and dtype.fields is not None - - @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: - from zarr.core.dtype import get_data_type_from_native_dtype - - fields: list[tuple[str, ZDType[_BaseDType, _BaseScalar]]] = [] - - if dtype.fields is None: - raise ValueError("numpy dtype has no fields") - - # fields of a structured numpy dtype are either 2-tuples or 3-tuples. we only - # care about the first element in either case. 
- for key, (dtype_instance, *_) in dtype.fields.items(): - dtype_wrapped = get_data_type_from_native_dtype(dtype_instance) - fields.append((key, dtype_wrapped)) - - return cls(fields=tuple(fields)) - - def to_json(self, zarr_format: ZarrFormat) -> JSON: - fields = [ - (f_name, f_dtype.to_json(zarr_format=zarr_format)) for f_name, f_dtype in self.fields - ] - if zarr_format == 2: - return fields - elif zarr_format == 3: - base_dict = {"name": self._zarr_v3_name} - base_dict["configuration"] = {"fields": fields} # type: ignore[assignment] - return cast("JSON", base_dict) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def check_json( - cls, data: JSON, zarr_format: ZarrFormat - ) -> TypeGuard[dict[str, JSON] | list[Any]]: - # the actual JSON form is recursive and hard to annotate, so we give up and do - # list[Any] for now - if zarr_format == 2: - return ( - not isinstance(data, str) - and isinstance(data, Sequence) - and all( - not isinstance(field, str) and isinstance(field, Sequence) and len(field) == 2 - for field in data - ) - ) - elif zarr_format == 3: - return ( - isinstance(data, dict) - and "name" in data - and "configuration" in data - and isinstance(data["configuration"], dict) - and "fields" in data["configuration"] - ) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - from zarr.core.dtype import get_data_type_from_json - - if cls.check_json(data, zarr_format=zarr_format): - if zarr_format == 2: - # structured dtypes are constructed directly from a list of lists - return cls( - fields=tuple( # type: ignore[misc] - (f_name, get_data_type_from_json(f_dtype, zarr_format=zarr_format)) - for f_name, f_dtype in data - ) - ) - elif zarr_format == 3: # noqa: SIM102 - if isinstance(data, dict) and "configuration" in data: - config = data["configuration"] - if 
isinstance(config, dict) and "fields" in config: - meta_fields = config["fields"] - fields = tuple( - (f_name, get_data_type_from_json(f_dtype, zarr_format=zarr_format)) - for f_name, f_dtype in meta_fields - ) - return cls(fields=fields) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - raise DataTypeValidationError(f"Invalid JSON representation of data type {cls}.") - - def to_dtype(self) -> np.dtypes.VoidDType[int]: - return cast( - "np.dtypes.VoidDType[int]", - np.dtype([(key, dtype.to_dtype()) for (key, dtype) in self.fields]), - ) - - def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: - return bytes_to_json(self.cast_value(data).tobytes(), zarr_format) - - def check_value(self, data: object) -> bool: - # not sure which values we should accept for structured dtypes. - try: - np.array([data], dtype=self.to_dtype()) - return True # noqa: TRY300 - except ValueError: - return False - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: - if not check_json_str(data): - raise TypeError(f"Invalid type: {data}. Expected a string.") - as_bytes = bytes_from_json(data, zarr_format=zarr_format) - dtype = self.to_dtype() - return cast("np.void", np.array([as_bytes], dtype=dtype.str).view(dtype)[0]) - - -def endianness_to_numpy_str(endianness: Endianness | None) -> EndiannessNumpy: - """ - Convert an endianness literal to its numpy string representation. - - Parameters - ---------- - endianness : Endianness or None - The endianness to convert. - - Returns - ------- - Literal[">", "<", "|"] - The numpy string representation of the endianness. - - Raises - ------ - ValueError - If the endianness is invalid. - """ - match endianness: - case "little": - return "<" - case "big": - return ">" - case None: - return "|" - raise ValueError( - f"Invalid endianness: {endianness}. 
Expected one of {get_args(Endianness)} or None" - ) - - -def endianness_from_numpy_str(endianness: EndiannessNumpy) -> Endianness | None: - """ - Convert a numpy endianness string literal to a human-readable literal value. - - Parameters - ---------- - endianness : Literal[">", "<", "=", "|"] - The numpy string representation of the endianness. - - Returns - ------- - Endianness or None - The human-readable representation of the endianness. - - Raises - ------ - ValueError - If the endianness is invalid. - """ - match endianness: - case "=": - return sys.byteorder - case "<": - return "little" - case ">": - return "big" - case "|": - return None - raise ValueError( - f"Invalid endianness: {endianness}. Expected one of {get_args(EndiannessNumpy)}" - ) diff --git a/src/zarr/core/dtype/common.py b/src/zarr/core/dtype/common.py index 900b3fddbd..657f56bfb7 100644 --- a/src/zarr/core/dtype/common.py +++ b/src/zarr/core/dtype/common.py @@ -1,14 +1,7 @@ from __future__ import annotations -import base64 -from collections.abc import Sequence -from typing import TYPE_CHECKING, Any, Literal, TypeGuard, cast - -import numpy as np - -if TYPE_CHECKING: - from zarr.core.common import JSON, ZarrFormat - from zarr.core.dtype._numpy import DateUnit, TimeUnit +from dataclasses import dataclass +from typing import Literal Endianness = Literal["little", "big"] JSONFloat = float | Literal["NaN", "Infinity", "-Infinity"] @@ -17,504 +10,20 @@ class DataTypeValidationError(ValueError): ... -def check_json_bool(data: JSON) -> TypeGuard[bool]: - """ - Check if a JSON value is a boolean. - - Parameters - ---------- - data : JSON - The JSON value to check. - - Returns - ------- - Bool - True if the data is a boolean, False otherwise. - """ - return isinstance(data, bool) - - -def check_json_str(data: JSON) -> TypeGuard[str]: - """ - Check if a JSON value is a string. - - Parameters - ---------- - data : JSON - The JSON value to check. 
- - Returns - ------- - Bool - True if the data is a string, False otherwise. - """ - return bool(isinstance(data, str)) - - -def check_json_int(data: JSON) -> TypeGuard[int]: - """ - Check if a JSON value is an integer. - - Parameters - ---------- - data : JSON - The JSON value to check. - - Returns - ------- - Bool - True if the data is an integer, False otherwise. - """ - return bool(isinstance(data, int)) - - -def check_json_float_v2(data: JSON) -> TypeGuard[JSONFloat]: - """ - Check if a JSON value represents a float (v2). - - Parameters - ---------- - data : JSON - The JSON value to check. - - Returns - ------- - Bool - True if the data is a float, False otherwise. - """ - if data == "NaN" or data == "Infinity" or data == "-Infinity": - return True - return isinstance(data, float | int) - - -def check_json_float_v3(data: JSON) -> TypeGuard[JSONFloat]: - """ - Check if a JSON value represents a float (v3). - - Parameters - ---------- - data : JSON - The JSON value to check. - - Returns - ------- - Bool - True if the data is a float, False otherwise. - """ - # TODO: handle the special JSON serialization of different NaN values - return check_json_float_v2(data) - - -def check_json_float(data: JSON, zarr_format: ZarrFormat) -> TypeGuard[float]: - """ - Check if a JSON value represents a float based on zarr format. - - Parameters - ---------- - data : JSON - The JSON value to check. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - Bool - True if the data is a float, False otherwise. - """ - if zarr_format == 2: - return check_json_float_v2(data) - else: - return check_json_float_v3(data) - - -def check_json_complex_float_v3(data: JSON) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: - """ - Check if a JSON value represents a complex float, as per the zarr v3 spec - - Parameters - ---------- - data : JSON - The JSON value to check. - - Returns - ------- - Bool - True if the data is a complex float, False otherwise. 
- """ - return ( - not isinstance(data, str) - and isinstance(data, Sequence) - and len(data) == 2 - and check_json_float_v3(data[0]) - and check_json_float_v3(data[1]) - ) - - -def check_json_complex_float_v2(data: JSON) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: - """ - Check if a JSON value represents a complex float, as per the behavior of zarr-python 2.x - - Parameters - ---------- - data : JSON - The JSON value to check. - - Returns - ------- - Bool - True if the data is a complex float, False otherwise. - """ - return ( - not isinstance(data, str) - and isinstance(data, Sequence) - and len(data) == 2 - and check_json_float_v2(data[0]) - and check_json_float_v2(data[1]) - ) - - -def check_json_complex_float( - data: JSON, zarr_format: ZarrFormat -) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: - """ - Check if a JSON value represents a complex float based on zarr format. - - Parameters - ---------- - data : JSON - The JSON value to check. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - Bool - True if the data represents a complex float, False otherwise. - """ - if zarr_format == 2: - return check_json_complex_float_v2(data) - return check_json_complex_float_v3(data) - - -def float_to_json_v2(data: float | np.floating[Any]) -> JSONFloat: - """ - Convert a float to JSON (v2). - - Parameters - ---------- - data : float or np.floating - The float value to convert. - - Returns - ------- - JSONFloat - The JSON representation of the float. - """ - if np.isnan(data): - return "NaN" - elif np.isinf(data): - return "Infinity" if data > 0 else "-Infinity" - return float(data) - - -def float_to_json_v3(data: float | np.floating[Any]) -> JSONFloat: - """ - Convert a float to JSON (v3). - - Parameters - ---------- - data : float or np.floating - The float value to convert. - - Returns - ------- - JSONFloat - The JSON representation of the float. 
- """ - # v3 can in principle handle distinct NaN values, but numpy does not represent these explicitly - # so we just reuse the v2 routine here - return float_to_json_v2(data) - - -def float_to_json(data: float | np.floating[Any], zarr_format: ZarrFormat) -> JSONFloat: - """ - Convert a float to JSON, parametrized by the zarr format version. - - Parameters - ---------- - data : float or np.floating - The float value to convert. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - JSONFloat - The JSON representation of the float. +@dataclass(frozen=True) +class HasLength: """ - if zarr_format == 2: - return float_to_json_v2(data) - else: - return float_to_json_v3(data) - raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") - - -def complex_to_json_v2(data: complex | np.complexfloating[Any, Any]) -> tuple[JSONFloat, JSONFloat]: - """ - Convert a complex number to JSON (v2). - - Parameters - ---------- - data : complex or np.complexfloating - The complex value to convert. - - Returns - ------- - tuple[JSONFloat, JSONFloat] - The JSON representation of the complex number. - """ - return float_to_json_v2(data.real), float_to_json_v2(data.imag) - - -def complex_to_json_v3(data: complex | np.complexfloating[Any, Any]) -> tuple[JSONFloat, JSONFloat]: - """ - Convert a complex number to JSON (v3). - - Parameters - ---------- - data : complex or np.complexfloating - The complex value to convert. - - Returns - ------- - tuple[JSONFloat, JSONFloat] - The JSON representation of the complex number. - """ - return float_to_json_v3(data.real), float_to_json_v3(data.imag) - - -def complex_float_to_json( - data: complex | np.complexfloating[Any, Any], zarr_format: ZarrFormat -) -> tuple[JSONFloat, JSONFloat]: - """ - Convert a complex number to JSON, parametrized by the zarr format version. - - Parameters - ---------- - data : complex or np.complexfloating - The complex value to convert. 
- zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - tuple[JSONFloat, JSONFloat] or JSONFloat - The JSON representation of the complex number. - """ - if zarr_format == 2: - return complex_to_json_v2(data) - else: - return complex_to_json_v3(data) - raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") - - -def bytes_to_json(data: bytes, zarr_format: ZarrFormat) -> str: - """ - Convert bytes to JSON. - - Parameters - ---------- - data : bytes - The bytes to store. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - str - The bytes encoded as ascii using the base64 alphabet. - """ - # TODO: decide if we are going to make this implementation zarr format-specific - return base64.b64encode(data).decode("ascii") - - -def bytes_from_json(data: str, zarr_format: ZarrFormat) -> bytes: - """ - Convert a JSON string to bytes - - Parameters - ---------- - data : str - The JSON string to convert. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - bytes - The bytes. + A mix-in class for data types with a length attribute, such as fixed-size collections + of unicode strings, or bytes. """ - if zarr_format == 2: - return base64.b64decode(data.encode("ascii")) - # TODO: differentiate these as needed. This is a spec question. - if zarr_format == 3: - return base64.b64decode(data.encode("ascii")) - raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") + length: int -def float_from_json_v2(data: JSONFloat) -> float: - """ - Convert a JSON float to a float (Zarr v2). - Parameters - ---------- - data : JSONFloat - The JSON float to convert. - - Returns - ------- - float - The float value. 
+@dataclass(frozen=True) +class HasEndianness: """ - match data: - case "NaN": - return float("nan") - case "Infinity": - return float("inf") - case "-Infinity": - return float("-inf") - case _: - return float(data) - - -def float_from_json_v3(data: JSONFloat) -> float: - """ - Convert a JSON float to a float (v3). - - Parameters - ---------- - data : JSONFloat - The JSON float to convert. - - Returns - ------- - float - The float value. + A mix-in class for data types with an endianness attribute """ - # todo: support the v3-specific NaN handling - return float_from_json_v2(data) - -def float_from_json(data: JSONFloat, zarr_format: ZarrFormat) -> float: - """ - Convert a JSON float to a float based on zarr format. - - Parameters - ---------- - data : JSONFloat - The JSON float to convert. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - float - The float value. - """ - if zarr_format == 2: - return float_from_json_v2(data) - else: - return float_from_json_v3(data) - - -def complex_float_from_json_v2(data: tuple[JSONFloat, JSONFloat]) -> complex: - """ - Convert a JSON complex float to a complex number (v2). - - Parameters - ---------- - data : tuple[JSONFloat, JSONFloat] - The JSON complex float to convert. - - Returns - ------- - np.complexfloating - The complex number. - """ - return complex(float_from_json_v2(data[0]), float_from_json_v2(data[1])) - - -def complex_float_from_json_v3(data: tuple[JSONFloat, JSONFloat]) -> complex: - """ - Convert a JSON complex float to a complex number (v3). - - Parameters - ---------- - data : tuple[JSONFloat, JSONFloat] - The JSON complex float to convert. - - Returns - ------- - np.complexfloating - The complex number. - """ - return complex(float_from_json_v3(data[0]), float_from_json_v3(data[1])) - - -def complex_float_from_json(data: tuple[JSONFloat, JSONFloat], zarr_format: ZarrFormat) -> complex: - """ - Convert a JSON complex float to a complex number based on zarr format. 
- - Parameters - ---------- - data : tuple[JSONFloat, JSONFloat] - The JSON complex float to convert. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - np.complexfloating - The complex number. - """ - if zarr_format == 2: - return complex_float_from_json_v2(data) - else: - return complex_float_from_json_v3(data) - raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") - - -def datetime_to_json(data: np.datetime64) -> int: - """ - Convert a datetime64 to a JSON integer. - - Parameters - ---------- - data : np.datetime64 - The datetime64 value to convert. - - Returns - ------- - int - The JSON representation of the datetime64. - """ - return data.view(np.int64).item() - - -def datetime_from_json(data: int, unit: DateUnit | TimeUnit) -> np.datetime64: - """ - Convert a JSON integer to a datetime64. - - Parameters - ---------- - data : int - The JSON integer to convert. - unit : DateUnit or TimeUnit - The unit of the datetime64. - - Returns - ------- - np.datetime64 - The datetime64 value. - """ - return cast("np.datetime64", np.int64(data).view(f"datetime64[{unit}]")) + endianness: Endianness | None = "little" diff --git a/src/zarr/core/dtype/npy/__init__.py b/src/zarr/core/dtype/npy/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/zarr/core/dtype/npy/bool.py b/src/zarr/core/dtype/npy/bool.py new file mode 100644 index 0000000000..293d8383c0 --- /dev/null +++ b/src/zarr/core/dtype/npy/bool.py @@ -0,0 +1,114 @@ +from dataclasses import dataclass +from typing import ClassVar, Literal, Self, TypeGuard + +import numpy as np + +from zarr.core.common import JSON, ZarrFormat +from zarr.core.dtype.npy.common import check_json_bool +from zarr.core.dtype.wrapper import ZDType, _BaseDType + + +@dataclass(frozen=True, kw_only=True, slots=True) +class Bool(ZDType[np.dtypes.BoolDType, np.bool_]): + """ + Wrapper for numpy boolean dtype. + + Attributes + ---------- + name : str + The name of the dtype. 
+ dtype_cls : ClassVar[type[np.dtypes.BoolDType]] + The numpy dtype class. + """ + + _zarr_v3_name = "bool" + _zarr_v2_names: ClassVar[tuple[str, ...]] = ("|b1",) + dtype_cls = np.dtypes.BoolDType + + @classmethod + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + return cls() + + def to_dtype(self: Self) -> np.dtypes.BoolDType: + return self.dtype_cls() + + @classmethod + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[Literal["bool", "|b1"]]: + """ + Check that the input is a valid JSON representation of a bool. + """ + if zarr_format == 2: + return data in cls._zarr_v2_names + elif zarr_format == 3: + return data == cls._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + def to_json(self, zarr_format: ZarrFormat) -> str: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + return cls() + + def default_value(self) -> np.bool_: + """ + Get the default value for the boolean dtype. + + Returns + ------- + np.bool_ + The default value. + """ + return np.False_ + + def to_json_value(self, data: object, zarr_format: ZarrFormat) -> bool: + """ + Convert a scalar to a python bool. + + Parameters + ---------- + data : object + The value to convert. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + bool + The JSON-serializable format. + """ + return bool(data) + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bool_: + """ + Read a JSON-serializable value as a numpy boolean scalar. + + Parameters + ---------- + data : JSON + The JSON-serializable value. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + np.bool_ + The numpy boolean scalar. 
+ """ + if check_json_bool(data): + return self._cast_value_unsafe(data) + raise TypeError(f"Invalid type: {data}. Expected a boolean.") + + def check_value(self, data: object) -> bool: + # Anything can become a bool + return True + + def cast_value(self, value: object) -> np.bool_: + return self._cast_value_unsafe(value) + + def _cast_value_unsafe(self, value: object) -> np.bool_: + return np.bool_(value) diff --git a/src/zarr/core/dtype/npy/common.py b/src/zarr/core/dtype/npy/common.py new file mode 100644 index 0000000000..6571002bbb --- /dev/null +++ b/src/zarr/core/dtype/npy/common.py @@ -0,0 +1,578 @@ +from __future__ import annotations + +import base64 +import sys +from collections.abc import Sequence +from typing import ( + TYPE_CHECKING, + Any, + Literal, + SupportsComplex, + SupportsFloat, + SupportsIndex, + SupportsInt, + TypeGuard, + TypeVar, + get_args, +) + +import numpy as np + +from zarr.core.dtype.common import Endianness, JSONFloat + +if TYPE_CHECKING: + from zarr.core.common import JSON, ZarrFormat + +IntLike = SupportsInt | SupportsIndex | bytes | str +FloatLike = SupportsIndex | SupportsFloat | bytes | str +ComplexLike = SupportsFloat | SupportsIndex | SupportsComplex | bytes | str | None +DateUnit = Literal["Y", "M", "W", "D"] +TimeUnit = Literal["h", "m", "s", "ms", "us", "μs", "ns", "ps", "fs", "as"] +EndiannessNumpy = Literal[">", "<", "|", "="] + +TFloatDType_co = TypeVar( + "TFloatDType_co", + bound=np.dtypes.Float16DType | np.dtypes.Float32DType | np.dtypes.Float64DType, + covariant=True, +) +TFloatScalar_co = TypeVar( + "TFloatScalar_co", bound=np.float16 | np.float32 | np.float64, covariant=True +) + +TComplexDType_co = TypeVar( + "TComplexDType_co", bound=np.dtypes.Complex64DType | np.dtypes.Complex128DType, covariant=True +) +TComplexScalar_co = TypeVar("TComplexScalar_co", bound=np.complex64 | np.complex128, covariant=True) + + +def endianness_from_numpy_str(endianness: EndiannessNumpy) -> Endianness | None: + """ + Convert a numpy 
endianness string literal to a human-readable literal value. + + Parameters + ---------- + endianness : Literal[">", "<", "=", "|"] + The numpy string representation of the endianness. + + Returns + ------- + Endianness or None + The human-readable representation of the endianness. + + Raises + ------ + ValueError + If the endianness is invalid. + """ + match endianness: + case "=": + # Use the local system endianness + return sys.byteorder + case "<": + return "little" + case ">": + return "big" + case "|": + # for dtypes without byte ordering semantics + return None + raise ValueError( + f"Invalid endianness: {endianness}. Expected one of {get_args(EndiannessNumpy)}" + ) + + +def endianness_to_numpy_str(endianness: Endianness | None) -> EndiannessNumpy: + """ + Convert an endianness literal to its numpy string representation. + + Parameters + ---------- + endianness : Endianness or None + The endianness to convert. + + Returns + ------- + Literal[">", "<", "|"] + The numpy string representation of the endianness. + + Raises + ------ + ValueError + If the endianness is invalid. + """ + match endianness: + case "little": + return "<" + case "big": + return ">" + case None: + return "|" + raise ValueError( + f"Invalid endianness: {endianness}. Expected one of {get_args(Endianness)} or None" + ) + + +def float_from_json_v2(data: JSONFloat) -> float: + """ + Convert a JSON float to a float (Zarr v2). + + Parameters + ---------- + data : JSONFloat + The JSON float to convert. + + Returns + ------- + float + The float value. + """ + match data: + case "NaN": + return float("nan") + case "Infinity": + return float("inf") + case "-Infinity": + return float("-inf") + case _: + return float(data) + + +def float_from_json_v3(data: JSONFloat) -> float: + """ + Convert a JSON float to a float (v3). + + Parameters + ---------- + data : JSONFloat + The JSON float to convert. + + Returns + ------- + float + The float value. 
+ """ + # todo: support the v3-specific NaN handling + return float_from_json_v2(data) + + +def float_from_json(data: JSONFloat, zarr_format: ZarrFormat) -> float: + """ + Convert a JSON float to a float based on zarr format. + + Parameters + ---------- + data : JSONFloat + The JSON float to convert. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + float + The float value. + """ + if zarr_format == 2: + return float_from_json_v2(data) + else: + return float_from_json_v3(data) + + +def bytes_from_json(data: str, zarr_format: ZarrFormat) -> bytes: + """ + Convert a JSON string to bytes + + Parameters + ---------- + data : str + The JSON string to convert. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + bytes + The bytes. + """ + if zarr_format == 2: + return base64.b64decode(data.encode("ascii")) + # TODO: differentiate these as needed. This is a spec question. + if zarr_format == 3: + return base64.b64decode(data.encode("ascii")) + raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") + + +def bytes_to_json(data: bytes, zarr_format: ZarrFormat) -> str: + """ + Convert bytes to JSON. + + Parameters + ---------- + data : bytes + The bytes to store. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + str + The bytes encoded as ascii using the base64 alphabet. + """ + # TODO: decide if we are going to make this implementation zarr format-specific + return base64.b64encode(data).decode("ascii") + + +def float_to_json_v2(data: float | np.floating[Any]) -> JSONFloat: + """ + Convert a float to JSON (v2). + + Parameters + ---------- + data : float or np.floating + The float value to convert. + + Returns + ------- + JSONFloat + The JSON representation of the float. 
+ """ + if np.isnan(data): + return "NaN" + elif np.isinf(data): + return "Infinity" if data > 0 else "-Infinity" + return float(data) + + +def float_to_json_v3(data: float | np.floating[Any]) -> JSONFloat: + """ + Convert a float to JSON (v3). + + Parameters + ---------- + data : float or np.floating + The float value to convert. + + Returns + ------- + JSONFloat + The JSON representation of the float. + """ + # v3 can in principle handle distinct NaN values, but numpy does not represent these explicitly + # so we just reuse the v2 routine here + return float_to_json_v2(data) + + +def complex_to_json_v3(data: complex | np.complexfloating[Any, Any]) -> tuple[JSONFloat, JSONFloat]: + """ + Convert a complex number to JSON (v3). + + Parameters + ---------- + data : complex or np.complexfloating + The complex value to convert. + + Returns + ------- + tuple[JSONFloat, JSONFloat] + The JSON representation of the complex number. + """ + return float_to_json_v3(data.real), float_to_json_v3(data.imag) + + +def complex_to_json_v2(data: complex | np.complexfloating[Any, Any]) -> tuple[JSONFloat, JSONFloat]: + """ + Convert a complex number to JSON (v2). + + Parameters + ---------- + data : complex or np.complexfloating + The complex value to convert. + + Returns + ------- + tuple[JSONFloat, JSONFloat] + The JSON representation of the complex number. + """ + return float_to_json_v2(data.real), float_to_json_v2(data.imag) + + +def complex_float_to_json( + data: complex | np.complexfloating[Any, Any], zarr_format: ZarrFormat +) -> tuple[JSONFloat, JSONFloat]: + """ + Convert a complex number to JSON, parametrized by the zarr format version. + + Parameters + ---------- + data : complex or np.complexfloating + The complex value to convert. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + tuple[JSONFloat, JSONFloat] or JSONFloat + The JSON representation of the complex number. 
+ """ + if zarr_format == 2: + return complex_to_json_v2(data) + else: + return complex_to_json_v3(data) + raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") + + +def float_to_json(data: float | np.floating[Any], zarr_format: ZarrFormat) -> JSONFloat: + """ + Convert a float to JSON, parametrized by the zarr format version. + + Parameters + ---------- + data : float or np.floating + The float value to convert. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + JSONFloat + The JSON representation of the float. + """ + if zarr_format == 2: + return float_to_json_v2(data) + else: + return float_to_json_v3(data) + raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") + + +def check_json_float_v2(data: JSON) -> TypeGuard[JSONFloat]: + """ + Check if a JSON value represents a float (v2). + + Parameters + ---------- + data : JSON + The JSON value to check. + + Returns + ------- + Bool + True if the data is a float, False otherwise. + """ + if data == "NaN" or data == "Infinity" or data == "-Infinity": + return True + return isinstance(data, float | int) + + +def check_json_complex_float_v2(data: JSON) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: + """ + Check if a JSON value represents a complex float, as per the behavior of zarr-python 2.x + + Parameters + ---------- + data : JSON + The JSON value to check. + + Returns + ------- + Bool + True if the data is a complex float, False otherwise. + """ + return ( + not isinstance(data, str) + and isinstance(data, Sequence) + and len(data) == 2 + and check_json_float_v2(data[0]) + and check_json_float_v2(data[1]) + ) + + +def check_json_float_v3(data: JSON) -> TypeGuard[JSONFloat]: + """ + Check if a JSON value represents a float (v3). + + Parameters + ---------- + data : JSON + The JSON value to check. + + Returns + ------- + Bool + True if the data is a float, False otherwise. 
+ """ + # TODO: handle the special JSON serialization of different NaN values + return check_json_float_v2(data) + + +def check_json_complex_float_v3(data: JSON) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: + """ + Check if a JSON value represents a complex float, as per the zarr v3 spec + + Parameters + ---------- + data : JSON + The JSON value to check. + + Returns + ------- + Bool + True if the data is a complex float, False otherwise. + """ + return ( + not isinstance(data, str) + and isinstance(data, Sequence) + and len(data) == 2 + and check_json_float_v3(data[0]) + and check_json_float_v3(data[1]) + ) + + +def check_json_complex_float( + data: JSON, zarr_format: ZarrFormat +) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: + """ + Check if a JSON value represents a complex float based on zarr format. + + Parameters + ---------- + data : JSON + The JSON value to check. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + Bool + True if the data represents a complex float, False otherwise. + """ + if zarr_format == 2: + return check_json_complex_float_v2(data) + return check_json_complex_float_v3(data) + + +def check_json_float(data: JSON, zarr_format: ZarrFormat) -> TypeGuard[float]: + """ + Check if a JSON value represents a float based on zarr format. + + Parameters + ---------- + data : JSON + The JSON value to check. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + Bool + True if the data is a float, False otherwise. + """ + if zarr_format == 2: + return check_json_float_v2(data) + else: + return check_json_float_v3(data) + + +def check_json_int(data: JSON) -> TypeGuard[int]: + """ + Check if a JSON value is an integer. + + Parameters + ---------- + data : JSON + The JSON value to check. + + Returns + ------- + Bool + True if the data is an integer, False otherwise. + """ + return bool(isinstance(data, int)) + + +def check_json_str(data: JSON) -> TypeGuard[str]: + """ + Check if a JSON value is a string. 
+ + Parameters + ---------- + data : JSON + The JSON value to check. + + Returns + ------- + Bool + True if the data is a string, False otherwise. + """ + return bool(isinstance(data, str)) + + +def check_json_bool(data: JSON) -> TypeGuard[bool]: + """ + Check if a JSON value is a boolean. + + Parameters + ---------- + data : JSON + The JSON value to check. + + Returns + ------- + Bool + True if the data is a boolean, False otherwise. + """ + return isinstance(data, bool) + + +def complex_float_from_json_v2(data: tuple[JSONFloat, JSONFloat]) -> complex: + """ + Convert a JSON complex float to a complex number (v2). + + Parameters + ---------- + data : tuple[JSONFloat, JSONFloat] + The JSON complex float to convert. + + Returns + ------- + np.complexfloating + The complex number. + """ + return complex(float_from_json_v2(data[0]), float_from_json_v2(data[1])) + + +def complex_float_from_json_v3(data: tuple[JSONFloat, JSONFloat]) -> complex: + """ + Convert a JSON complex float to a complex number (v3). + + Parameters + ---------- + data : tuple[JSONFloat, JSONFloat] + The JSON complex float to convert. + + Returns + ------- + np.complexfloating + The complex number. + """ + return complex(float_from_json_v3(data[0]), float_from_json_v3(data[1])) + + +def complex_float_from_json(data: tuple[JSONFloat, JSONFloat], zarr_format: ZarrFormat) -> complex: + """ + Convert a JSON complex float to a complex number based on zarr format. + + Parameters + ---------- + data : tuple[JSONFloat, JSONFloat] + The JSON complex float to convert. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + np.complexfloating + The complex number. + """ + if zarr_format == 2: + return complex_float_from_json_v2(data) + else: + return complex_float_from_json_v3(data) + raise ValueError(f"Invalid zarr format: {zarr_format}. 
Expected 2 or 3.") diff --git a/src/zarr/core/dtype/npy/complex.py b/src/zarr/core/dtype/npy/complex.py new file mode 100644 index 0000000000..22e1bd66a3 --- /dev/null +++ b/src/zarr/core/dtype/npy/complex.py @@ -0,0 +1,155 @@ +from dataclasses import dataclass +from typing import ( + TYPE_CHECKING, + ClassVar, + Self, + TypeGuard, + cast, +) + +import numpy as np + +from zarr.core.common import JSON, ZarrFormat +from zarr.core.dtype.common import HasEndianness +from zarr.core.dtype.npy.common import ( + ComplexLike, + TComplexDType_co, + TComplexScalar_co, + check_json_complex_float, + complex_float_from_json, + complex_float_to_json, + endianness_from_numpy_str, + endianness_to_numpy_str, +) +from zarr.core.dtype.wrapper import ZDType, _BaseDType + +if TYPE_CHECKING: + from zarr.core.dtype.npy.common import EndiannessNumpy + + +@dataclass(frozen=True) +class BaseComplex(ZDType[TComplexDType_co, TComplexScalar_co], HasEndianness): + # This attribute holds the possible zarr v2 JSON names for the data type + _zarr_v2_names: ClassVar[tuple[str, ...]] + + @classmethod + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + byte_order = cast("EndiannessNumpy", dtype.byteorder) + return cls(endianness=endianness_from_numpy_str(byte_order)) + + def to_dtype(self) -> TComplexDType_co: + byte_order = endianness_to_numpy_str(self.endianness) + return self.dtype_cls().newbyteorder(byte_order) # type: ignore[return-value] + + def to_json(self, zarr_format: ZarrFormat) -> str: + """ + Convert the wrapped data type to a JSON-serializable form. + + Parameters + ---------- + zarr_format : ZarrFormat + The zarr format version. 
+ + Returns + ------- + str + The JSON-serializable representation of the wrapped data type + """ + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls() + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + @classmethod + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: + """ + Check that the input is a valid JSON representation of this data type. + """ + if zarr_format == 2: + return data in cls._zarr_v2_names + elif zarr_format == 3: + return data == cls._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + def check_value(self, value: object) -> bool: + return isinstance(value, ComplexLike) + + def _cast_value_unsafe(self, value: object) -> TComplexScalar_co: + if self.check_value(value): + return self.to_dtype().type(value) # type: ignore[arg-type, return-value] + raise TypeError(f"Invalid type: {value}. Expected a value castable to a complex scalar.") + + def default_value(self) -> TComplexScalar_co: + """ + Get the default value, which is 0 cast to this dtype + + Returns + ------- + Int scalar + The default value. + """ + return self._cast_value_unsafe(0) + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TComplexScalar_co: + """ + Read a JSON-serializable value as a numpy float. + + Parameters + ---------- + data : JSON + The JSON-serializable value. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + TScalar_co + The numpy float. 
+ """ + if check_json_complex_float(data, zarr_format=zarr_format): + return self._cast_value_unsafe(complex_float_from_json(data, zarr_format=zarr_format)) + raise TypeError( + f"Invalid type: {data}. Expected a float or a special string encoding of a float." + ) + + def to_json_value(self, data: object, zarr_format: ZarrFormat) -> JSON: + """ + Convert an object to a JSON-serializable float. + + Parameters + ---------- + data : _BaseScalar + The value to convert. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + JSON + The JSON-serializable form of the complex number, which is a list of two floats, + each of which is encoding according to a zarr-format-specific encoding. + """ + return complex_float_to_json(self.cast_value(data), zarr_format=zarr_format) + + +@dataclass(frozen=True, kw_only=True) +class Complex64(BaseComplex[np.dtypes.Complex64DType, np.complex64]): + dtype_cls = np.dtypes.Complex64DType + _zarr_v3_name = "complex64" + _zarr_v2_names: ClassVar[tuple[str, ...]] = (">c8", "c16", " Self: + byte_order = cast("EndiannessNumpy", dtype.byteorder) + return cls(endianness=endianness_from_numpy_str(byte_order)) + + def to_dtype(self) -> TFloatDType_co: + byte_order = endianness_to_numpy_str(self.endianness) + return self.dtype_cls().newbyteorder(byte_order) # type: ignore[return-value] + + def to_json(self, zarr_format: ZarrFormat) -> str: + """ + Convert the wrapped data type to a JSON-serializable form. + + Parameters + ---------- + zarr_format : ZarrFormat + The zarr format version. 
+ + Returns + ------- + str + The JSON-serializable representation of the wrapped data type + """ + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls() + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + @classmethod + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: + """ + Check that the input is a valid JSON representation of this data type. + """ + if zarr_format == 2: + return data in cls._zarr_v2_names + elif zarr_format == 3: + return data == cls._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + def check_value(self, value: object) -> TypeGuard[FloatLike]: + return isinstance(value, FloatLike) + + def _cast_value_unsafe(self, value: object) -> TFloatScalar_co: + if self.check_value(value): + return self.to_dtype().type(value) # type: ignore[return-value] + raise TypeError(f"Invalid type: {value}. Expected a value castable to a float.") + + def default_value(self) -> TFloatScalar_co: + """ + Get the default value, which is 0 cast to this dtype + + Returns + ------- + Int scalar + The default value. + """ + return self._cast_value_unsafe(0) + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TFloatScalar_co: + """ + Read a JSON-serializable value as a numpy float. + + Parameters + ---------- + data : JSON + The JSON-serializable value. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + TScalar_co + The numpy float. 
+ """ + if check_json_float(data, zarr_format=zarr_format): + return self._cast_value_unsafe(float_from_json(data, zarr_format=zarr_format)) + raise TypeError( + f"Invalid type: {data}. Expected a float or a special string encoding of a float." + ) + + def to_json_value(self, data: object, zarr_format: ZarrFormat) -> float | str: + """ + Convert an object to a JSON-serializable float. + + Parameters + ---------- + data : _BaseScalar + The value to convert. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + JSON + The JSON-serializable form of the float, which is potentially a number or a string. + See the zarr specifications for details on the JSON encoding for floats. + """ + return float_to_json(self._cast_value_unsafe(data), zarr_format=zarr_format) + + +@dataclass(frozen=True, kw_only=True) +class Float16(BaseFloat[np.dtypes.Float16DType, np.float16]): + dtype_cls = np.dtypes.Float16DType + _zarr_v3_name = "float16" + _zarr_v2_names: ClassVar[tuple[str, ...]] = (">f2", "f4", "f8", " str: + """ + Convert the wrapped data type to a JSON-serializable form. + + Parameters + ---------- + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + str + The JSON-serializable representation of the wrapped data type + """ + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + @classmethod + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: + """ + Check that the input is a valid JSON representation of this data type. 
+ """ + if zarr_format == 2: + return data in cls._zarr_v2_names + elif zarr_format == 3: + return data == cls._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + def check_value(self, value: object) -> TypeGuard[IntLike]: + return isinstance(value, IntLike) + + def _cast_value_unsafe(self, value: object) -> TIntScalar_co: + if self.check_value(value): + return self.to_dtype().type(value) # type: ignore[return-value] + raise TypeError(f"Invalid type: {value}. Expected a value castable to an integer.") + + def default_value(self) -> TIntScalar_co: + """ + Get the default value, which is 0 cast to this dtype + + Returns + ------- + Int scalar + The default value. + """ + return self._cast_value_unsafe(0) + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TIntScalar_co: + """ + Read a JSON-serializable value as a numpy int scalar. + + Parameters + ---------- + data : JSON + The JSON-serializable value. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + TScalar_co + The numpy scalar. + """ + if check_json_int(data): + return self._cast_value_unsafe(data) + raise TypeError(f"Invalid type: {data}. Expected an integer.") + + def to_json_value(self, data: object, zarr_format: ZarrFormat) -> int: + """ + Convert an object to JSON-serializable scalar. + + Parameters + ---------- + data : _BaseScalar + The value to convert. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + int + The JSON-serializable form of the scalar. 
+ """ + return int(self.cast_value(data)) + + +@dataclass(frozen=True, kw_only=True) +class Int8(BaseInt[np.dtypes.Int8DType, np.int8]): + dtype_cls = np.dtypes.Int8DType + _zarr_v3_name = "int8" + _zarr_v2_names: ClassVar[tuple[str, ...]] = ("|i1",) + + @classmethod + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + return cls() + + def to_dtype(self: Self) -> np.dtypes.Int8DType: + return self.dtype_cls() + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + return cls() + + +@dataclass(frozen=True, kw_only=True) +class UInt8(BaseInt[np.dtypes.UInt8DType, np.uint8]): + dtype_cls = np.dtypes.UInt8DType + _zarr_v3_name = "uint8" + _zarr_v2_names: ClassVar[tuple[str, ...]] = ("|u1",) + + @classmethod + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + return cls() + + def to_dtype(self: Self) -> np.dtypes.UInt8DType: + return self.dtype_cls() + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + return cls() + + +@dataclass(frozen=True, kw_only=True) +class Int16(BaseInt[np.dtypes.Int16DType, np.int16], HasEndianness): + dtype_cls = np.dtypes.Int16DType + _zarr_v3_name = "int16" + _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i2", " Self: + byte_order = cast("EndiannessNumpy", dtype.byteorder) + return cls(endianness=endianness_from_numpy_str(byte_order)) + + def to_dtype(self) -> np.dtypes.Int16DType: + byte_order = endianness_to_numpy_str(self.endianness) + return self.dtype_cls().newbyteorder(byte_order) + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + # This ensures that we get the endianness correct without annoying string parsing + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls() + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + +@dataclass(frozen=True, kw_only=True) +class 
UInt16(BaseInt[np.dtypes.UInt16DType, np.uint16], HasEndianness): + dtype_cls = np.dtypes.UInt16DType + _zarr_v3_name = "uint16" + _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u2", "<u2") + + @classmethod + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + byte_order = cast("EndiannessNumpy", dtype.byteorder) + return cls(endianness=endianness_from_numpy_str(byte_order)) + + def to_dtype(self) -> np.dtypes.UInt16DType: + byte_order = endianness_to_numpy_str(self.endianness) + return self.dtype_cls().newbyteorder(byte_order) + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls() + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + +@dataclass(frozen=True, kw_only=True) +class Int32(BaseInt[np.dtypes.Int32DType, np.int32], HasEndianness): + dtype_cls = np.dtypes.Int32DType + _zarr_v3_name = "int32" + _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i4", "<i4") + + @classmethod + def from_dtype(cls, dtype: _BaseDType) -> Self: + # We override the base implementation to address a windows-specific, pre-numpy 2 issue where + # ``np.dtype('i')`` is an instance of ``np.dtypes.IntDType`` that acts like `int32` instead of ``np.dtype('int32')`` + # In this case, ``type(np.dtype('i')) == np.dtypes.Int32DType`` will evaluate to ``True``, + # despite the two classes being different.
Thus we will create an instance of `cls` with the + # latter dtype, after pulling in the byte order of the input + if dtype == np.dtypes.Int32DType(): + return cls._from_dtype_unsafe(np.dtypes.Int32DType().newbyteorder(dtype.byteorder)) + else: + return super().from_dtype(dtype) + + @classmethod + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + byte_order = cast("EndiannessNumpy", dtype.byteorder) + return cls(endianness=endianness_from_numpy_str(byte_order)) + + def to_dtype(self) -> np.dtypes.Int32DType: + byte_order = endianness_to_numpy_str(self.endianness) + return self.dtype_cls().newbyteorder(byte_order) + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls() + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + +@dataclass(frozen=True, kw_only=True) +class UInt32(BaseInt[np.dtypes.UInt32DType, np.uint32], HasEndianness): + dtype_cls = np.dtypes.UInt32DType + _zarr_v3_name = "uint32" + _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u4", "<u4") + + @classmethod + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + byte_order = cast("EndiannessNumpy", dtype.byteorder) + return cls(endianness=endianness_from_numpy_str(byte_order)) + + def to_dtype(self) -> np.dtypes.UInt32DType: + byte_order = endianness_to_numpy_str(self.endianness) + return self.dtype_cls().newbyteorder(byte_order) + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls() + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + +@dataclass(frozen=True, kw_only=True) +class Int64(BaseInt[np.dtypes.Int64DType, np.int64], HasEndianness): + dtype_cls = np.dtypes.Int64DType + _zarr_v3_name = "int64" + _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i8", "<i8") + + @classmethod + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: +
byte_order = cast("EndiannessNumpy", dtype.byteorder) + return cls(endianness=endianness_from_numpy_str(byte_order)) + + def to_dtype(self) -> np.dtypes.Int64DType: + byte_order = endianness_to_numpy_str(self.endianness) + return self.dtype_cls().newbyteorder(byte_order) + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls() + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + +@dataclass(frozen=True, kw_only=True) +class UInt64(BaseInt[np.dtypes.UInt64DType, np.uint64], HasEndianness): + dtype_cls = np.dtypes.UInt64DType + _zarr_v3_name = "uint64" + _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u8", " Self: + byte_order = cast("EndiannessNumpy", dtype.byteorder) + return cls(endianness=endianness_from_numpy_str(byte_order)) + + def to_dtype(self) -> np.dtypes.UInt64DType: + byte_order = endianness_to_numpy_str(self.endianness) + return self.dtype_cls().newbyteorder(byte_order) + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls() + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover diff --git a/src/zarr/core/dtype/npy/sized.py b/src/zarr/core/dtype/npy/sized.py new file mode 100644 index 0000000000..8d8ff57800 --- /dev/null +++ b/src/zarr/core/dtype/npy/sized.py @@ -0,0 +1,382 @@ +import base64 +import re +from collections.abc import Sequence +from dataclasses import dataclass +from typing import Any, ClassVar, Self, TypeGuard, cast + +import numpy as np + +from zarr.core.common import JSON, ZarrFormat +from zarr.core.dtype.common import DataTypeValidationError, HasEndianness, HasLength +from zarr.core.dtype.npy.common import ( + EndiannessNumpy, + bytes_from_json, 
+ bytes_to_json, + check_json_str, + endianness_from_numpy_str, + endianness_to_numpy_str, +) +from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar + + +@dataclass(frozen=True, kw_only=True) +class FixedLengthAscii(ZDType[np.dtypes.BytesDType[int], np.bytes_], HasLength): + dtype_cls = np.dtypes.BytesDType + _zarr_v3_name = "numpy.fixed_length_ascii" + item_size_bits: ClassVar[int] = 8 + + @classmethod + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + return cls(length=dtype.itemsize // (cls.item_size_bits // 8)) + + def to_dtype(self) -> np.dtypes.BytesDType[int]: + return self.dtype_cls(self.length) + + @classmethod + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: + """ + Check that the input is a valid JSON representation of a numpy S dtype. + """ + if zarr_format == 2: + # match |S1, |S2, etc + return isinstance(data, str) and re.match(r"^\|S\d+$", data) is not None + elif zarr_format == 3: + return ( + isinstance(data, dict) + and "name" in data + and data["name"] == cls._zarr_v3_name + and "configuration" in data + and isinstance(data["configuration"], dict) + and "length_bits" in data["configuration"] + and isinstance(data["configuration"]["length_bits"], int) + ) + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + def to_json(self, zarr_format: ZarrFormat) -> JSON: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return { + "name": self._zarr_v3_name, + "configuration": {"length_bits": self.length * self.item_size_bits}, + } + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls(length=data["configuration"]["length_bits"] // cls.item_size_bits) # type: ignore[arg-type, index, 
call-overload, operator] + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + def default_value(self) -> np.bytes_: + return np.bytes_(b"") + + def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: + return base64.standard_b64encode(data).decode("ascii") # type: ignore[arg-type] + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: + if check_json_str(data): + return self.to_dtype().type(base64.standard_b64decode(data.encode("ascii"))) + raise TypeError(f"Invalid type: {data}. Expected a string.") + + def check_value(self, data: object) -> bool: + return isinstance(data, np.bytes_ | str | bytes) + + def _cast_value_unsafe(self, value: object) -> np.bytes_: + return self.to_dtype().type(value) + + +@dataclass(frozen=True, kw_only=True) +class FixedLengthBytes(ZDType[np.dtypes.VoidDType[int], np.void], HasLength): + # np.dtypes.VoidDType is specified in an odd way in numpy + # it cannot be used to create instances of the dtype + # so we have to tell mypy to ignore this here + dtype_cls = np.dtypes.VoidDType # type: ignore[assignment] + _zarr_v3_name = "numpy.void" + item_size_bits: ClassVar[int] = 8 + + @classmethod + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + return cls(length=dtype.itemsize // (cls.item_size_bits // 8)) + + def to_dtype(self) -> np.dtypes.VoidDType[int]: + # Numpy does not allow creating a void type + # by invoking np.dtypes.VoidDType directly + return cast("np.dtypes.VoidDType[int]", np.dtype(f"V{self.length}")) + + @classmethod + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: + if zarr_format == 2: + # Check that the dtype is |V1, |V2, ... 
+ return isinstance(data, str) and re.match(r"^\|V\d+$", data) is not None + elif zarr_format == 3: + return ( + isinstance(data, dict) + and "name" in data + and isinstance(data["name"], str) + and (re.match(r"^r\d+$", data["name"]) is not None) + ) + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + def to_json(self, zarr_format: ZarrFormat) -> JSON: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return {"name": f"r{self.length * self.item_size_bits}"} + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls(length=int(data["name"][1:]) // cls.item_size_bits) # type: ignore[arg-type, index, call-overload] + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + @classmethod + def check_dtype(cls: type[Self], dtype: _BaseDType) -> TypeGuard[np.dtypes.VoidDType[Any]]: + """ + Numpy void dtype comes in two forms: + * If the ``fields`` attribute is ``None``, then the dtype represents N raw bytes. + * If the ``fields`` attribute is not ``None``, then the dtype represents a structured dtype, + + In this check we ensure that ``fields`` is ``None``. + + Parameters + ---------- + dtype : TDType + The dtype to check. + + Returns + ------- + Bool + True if the dtype matches, False otherwise. 
+ """ + return cls.dtype_cls is type(dtype) and dtype.fields is None # type: ignore[has-type] + + def default_value(self) -> np.void: + return self.to_dtype().type(("\x00" * self.length).encode("ascii")) + + def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: + return base64.standard_b64encode(self.cast_value(data).tobytes()).decode("ascii") + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: + if check_json_str(data): + return self.to_dtype().type(base64.standard_b64decode(data)) + raise DataTypeValidationError(f"Invalid type: {data}. Expected a string.") + + def check_value(self, data: object) -> bool: + return isinstance(data, np.bytes_ | str | bytes | np.void) + + def _cast_value_unsafe(self, value: object) -> np.void: + return self.to_dtype().type(value) # type: ignore[call-overload, no-any-return] + + +@dataclass(frozen=True, kw_only=True) +class FixedLengthUnicode(ZDType[np.dtypes.StrDType[int], np.str_], HasEndianness, HasLength): + dtype_cls = np.dtypes.StrDType + _zarr_v3_name = "numpy.fixed_length_ucs4" + item_size_bits: ClassVar[int] = 32 # UCS4 is 32 bits per code point + + @classmethod + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + byte_order = cast("EndiannessNumpy", dtype.byteorder) + return cls( + length=dtype.itemsize // (cls.item_size_bits // 8), + endianness=endianness_from_numpy_str(byte_order), + ) + + def to_dtype(self) -> np.dtypes.StrDType[int]: + byte_order = endianness_to_numpy_str(self.endianness) + return self.dtype_cls(self.length).newbyteorder(byte_order) + + @classmethod + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: + """ + Check that the input is a valid JSON representation of a numpy S dtype. 
+ """ + if zarr_format == 2: + # match >U1, <]U\d+$", data) is not None + elif zarr_format == 3: + return ( + isinstance(data, dict) + and "name" in data + and data["name"] == cls._zarr_v3_name + and "configuration" in data + and isinstance(data["configuration"], dict) + and "length_bits" in data["configuration"] + and isinstance(data["configuration"]["length_bits"], int) + ) + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + def to_json(self, zarr_format: ZarrFormat) -> JSON: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return { + "name": self._zarr_v3_name, + "configuration": {"length_bits": self.length * self.item_size_bits}, + } + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls(length=data["configuration"]["length_bits"] // cls.item_size_bits) # type: ignore[arg-type, index, call-overload, operator] + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + def default_value(self) -> np.str_: + return np.str_("") + + def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: + return str(data) + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_: + if not check_json_str(data): + raise TypeError(f"Invalid type: {data}. 
Expected a string.") + return self.to_dtype().type(data) + + def check_value(self, data: object) -> bool: + return isinstance(data, str | np.str_ | bytes) + + def _cast_value_unsafe(self, value: object) -> np.str_: + return self.to_dtype().type(value) + + +@dataclass(frozen=True, kw_only=True) +class Structured(ZDType[np.dtypes.VoidDType[int], np.void]): + dtype_cls = np.dtypes.VoidDType # type: ignore[assignment] + _zarr_v3_name = "structured" + fields: tuple[tuple[str, ZDType[_BaseDType, _BaseScalar]], ...] + + def default_value(self) -> np.void: + return self._cast_value_unsafe(0) + + def _cast_value_unsafe(self, value: object) -> np.void: + return cast("np.void", np.array([value], dtype=self.to_dtype())[0]) + + @classmethod + def check_dtype(cls, dtype: _BaseDType) -> TypeGuard[np.dtypes.VoidDType[int]]: + """ + Check that this dtype is a numpy structured dtype + + Parameters + ---------- + dtype : np.dtypes.DTypeLike + The dtype to check. + + Returns + ------- + TypeGuard[np.dtypes.VoidDType] + True if the dtype matches, False otherwise. + """ + return super().check_dtype(dtype) and dtype.fields is not None + + @classmethod + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + from zarr.core.dtype import get_data_type_from_native_dtype + + fields: list[tuple[str, ZDType[_BaseDType, _BaseScalar]]] = [] + + if dtype.fields is None: + raise ValueError("numpy dtype has no fields") + + # fields of a structured numpy dtype are either 2-tuples or 3-tuples. we only + # care about the first element in either case. 
+ for key, (dtype_instance, *_) in dtype.fields.items(): + dtype_wrapped = get_data_type_from_native_dtype(dtype_instance) + fields.append((key, dtype_wrapped)) + + return cls(fields=tuple(fields)) + + def to_json(self, zarr_format: ZarrFormat) -> JSON: + fields = [ + (f_name, f_dtype.to_json(zarr_format=zarr_format)) for f_name, f_dtype in self.fields + ] + if zarr_format == 2: + return fields + elif zarr_format == 3: + base_dict = {"name": self._zarr_v3_name} + base_dict["configuration"] = {"fields": fields} # type: ignore[assignment] + return cast("JSON", base_dict) + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + @classmethod + def check_json( + cls, data: JSON, zarr_format: ZarrFormat + ) -> TypeGuard[dict[str, JSON] | list[Any]]: + # the actual JSON form is recursive and hard to annotate, so we give up and do + # list[Any] for now + if zarr_format == 2: + return ( + not isinstance(data, str) + and isinstance(data, Sequence) + and all( + not isinstance(field, str) and isinstance(field, Sequence) and len(field) == 2 + for field in data + ) + ) + elif zarr_format == 3: + return ( + isinstance(data, dict) + and "name" in data + and "configuration" in data + and isinstance(data["configuration"], dict) + and "fields" in data["configuration"] + ) + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + from zarr.core.dtype import get_data_type_from_json + + if cls.check_json(data, zarr_format=zarr_format): + if zarr_format == 2: + # structured dtypes are constructed directly from a list of lists + return cls( + fields=tuple( # type: ignore[misc] + (f_name, get_data_type_from_json(f_dtype, zarr_format=zarr_format)) + for f_name, f_dtype in data + ) + ) + elif zarr_format == 3: # noqa: SIM102 + if isinstance(data, dict) and "configuration" in data: + config = data["configuration"] + if 
isinstance(config, dict) and "fields" in config: + meta_fields = config["fields"] + fields = tuple( + (f_name, get_data_type_from_json(f_dtype, zarr_format=zarr_format)) + for f_name, f_dtype in meta_fields + ) + return cls(fields=fields) + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + raise DataTypeValidationError(f"Invalid JSON representation of data type {cls}.") + + def to_dtype(self) -> np.dtypes.VoidDType[int]: + return cast( + "np.dtypes.VoidDType[int]", + np.dtype([(key, dtype.to_dtype()) for (key, dtype) in self.fields]), + ) + + def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: + return bytes_to_json(self.cast_value(data).tobytes(), zarr_format) + + def check_value(self, data: object) -> bool: + # not sure which values we should accept for structured dtypes. + try: + np.array([data], dtype=self.to_dtype()) + return True # noqa: TRY300 + except ValueError: + return False + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: + if not check_json_str(data): + raise TypeError(f"Invalid type: {data}. 
Expected a string.") + as_bytes = bytes_from_json(data, zarr_format=zarr_format) + dtype = self.to_dtype() + return cast("np.void", np.array([as_bytes], dtype=dtype.str).view(dtype)[0]) diff --git a/src/zarr/core/dtype/npy/string.py b/src/zarr/core/dtype/npy/string.py new file mode 100644 index 0000000000..15ccfb30f1 --- /dev/null +++ b/src/zarr/core/dtype/npy/string.py @@ -0,0 +1,134 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING, Self, TypeGuard + +import numpy as np + +from zarr.core.dtype.npy.common import check_json_str +from zarr.core.dtype.wrapper import ZDType + +if TYPE_CHECKING: + from zarr.core.common import JSON, ZarrFormat + from zarr.core.dtype.wrapper import _BaseDType + +_NUMPY_SUPPORTS_VLEN_STRING = hasattr(np.dtypes, "StringDType") + + +if _NUMPY_SUPPORTS_VLEN_STRING: + + @dataclass(frozen=True, kw_only=True) + class VariableLengthString(ZDType[np.dtypes.StringDType, str]): # type: ignore[type-var] + dtype_cls = np.dtypes.StringDType + _zarr_v3_name = "numpy.variable_length_utf8" + + @classmethod + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + return cls() + + def to_dtype(self) -> np.dtypes.StringDType: + return self.dtype_cls() + + @classmethod + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: + """ + Check that the input is a valid JSON representation of a numpy string dtype. + """ + if zarr_format == 2: + # TODO: take the entire metadata document in here, and + # check the compressors / filters for vlen-utf8 + # Note that we are checking for the object dtype name. + return data == "|O" + elif zarr_format == 3: + return data == cls._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + def to_json(self, zarr_format: ZarrFormat) -> JSON: + if zarr_format == 2: + # Note: unlike many other numpy data types, we don't serialize the .str attribute + # of the data type to JSON. 
This is because Zarr was using `|O` for strings before the + # numpy variable length string data type existed, and we want to be consistent with + # that practice + return "|O" + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + return cls() + + def default_value(self) -> str: + return "" + + def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: + return str(data) + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: + if not check_json_str(data): + raise TypeError(f"Invalid type: {data}. Expected a string.") + return data + + def check_value(self, data: object) -> bool: + return isinstance(data, str) + + def _cast_value_unsafe(self, value: object) -> str: + return str(value) + +else: + # Numpy pre-2 does not have a variable length string dtype, so we use the Object dtype instead. + @dataclass(frozen=True, kw_only=True) + class VariableLengthString(ZDType[np.dtypes.ObjectDType, str]): # type: ignore[no-redef] + dtype_cls = np.dtypes.ObjectDType + _zarr_v3_name = "numpy.variable_length_utf8" + + @classmethod + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + return cls() + + def to_dtype(self) -> np.dtypes.ObjectDType: + return self.dtype_cls() + + @classmethod + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: + """ + Check that the input is a valid JSON representation of a numpy O dtype. 
+ """ + if zarr_format == 2: + # TODO: take the entire metadata document in here, and + # check the compressors / filters for vlen-utf8 + return data == "|O" + elif zarr_format == 3: + return data == cls._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + def to_json(self, zarr_format: ZarrFormat) -> JSON: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + return cls() + + def default_value(self) -> str: + return "" + + def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: + return data # type: ignore[return-value] + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: + """ + Strings pass through + """ + if not check_json_str(data): + raise TypeError(f"Invalid type: {data}. 
Expected a string.") + return data + + def check_value(self, data: object) -> bool: + return isinstance(data, str) + + def _cast_value_unsafe(self, value: object) -> str: + return str(value) diff --git a/src/zarr/core/dtype/npy/time.py b/src/zarr/core/dtype/npy/time.py new file mode 100644 index 0000000000..a10b9ae8a3 --- /dev/null +++ b/src/zarr/core/dtype/npy/time.py @@ -0,0 +1,142 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING, Self, TypeGuard, cast, get_args + +import numpy as np + +from zarr.core.dtype.common import DataTypeValidationError, HasEndianness +from zarr.core.dtype.npy.common import ( + DateUnit, + EndiannessNumpy, + TimeUnit, + check_json_int, + endianness_from_numpy_str, + endianness_to_numpy_str, +) +from zarr.core.dtype.wrapper import ZDType, _BaseDType + +if TYPE_CHECKING: + from zarr.core.common import JSON, ZarrFormat + + +def datetime_from_json(data: int, unit: DateUnit | TimeUnit) -> np.datetime64: + """ + Convert a JSON integer to a datetime64. + + Parameters + ---------- + data : int + The JSON integer to convert. + unit : DateUnit or TimeUnit + The unit of the datetime64. + + Returns + ------- + np.datetime64 + The datetime64 value. + """ + return cast("np.datetime64", np.int64(data).view(f"datetime64[{unit}]")) + + +def datetime_to_json(data: np.datetime64) -> int: + """ + Convert a datetime64 to a JSON integer. + + Parameters + ---------- + data : np.datetime64 + The datetime64 value to convert. + + Returns + ------- + int + The JSON representation of the datetime64. 
+ """ + return data.view(np.int64).item() + + +@dataclass(frozen=True, kw_only=True, slots=True) +class DateTime64(ZDType[np.dtypes.DateTime64DType, np.datetime64], HasEndianness): + dtype_cls = np.dtypes.DateTime64DType # type: ignore[assignment] + _zarr_v3_name = "numpy.datetime64" + unit: DateUnit | TimeUnit + + @classmethod + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + unit: DateUnit | TimeUnit = dtype.name[dtype.name.rfind("[") + 1 : dtype.name.rfind("]")] # type: ignore[assignment] + if unit not in get_args(DateUnit) and unit not in get_args(TimeUnit): + raise DataTypeValidationError('Invalid unit for "numpy.datetime64"') + byteorder = cast("EndiannessNumpy", dtype.byteorder) + return cls(unit=unit, endianness=endianness_from_numpy_str(byteorder)) + + def to_dtype(self) -> np.dtypes.DateTime64DType: + # Numpy does not allow creating datetime64 via + # np.dtypes.DateTime64Dtype() + return cast( + "np.dtypes.DateTime64DType", + np.dtype(f"datetime64[{self.unit}]").newbyteorder( + endianness_to_numpy_str(self.endianness) + ), + ) + + @classmethod + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: + if zarr_format == 2: + # match M[M], etc + # consider making this a standalone function + return ( + isinstance(data, str) + and len(data) in (6, 7) + and data[0] in (">", "<") + and data[1:4] == "M8[" + and data[4:-1] in get_args(TimeUnit) + get_args(DateUnit) + and data[-1] == "]" + ) + elif zarr_format == 3: + return ( + isinstance(data, dict) + and "name" in data + and data["name"] == cls._zarr_v3_name + and "configuration" in data + and "unit" in data["configuration"] + and data["configuration"]["unit"] in get_args(DateUnit) + get_args(TimeUnit) + ) + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + def default_value(self) -> np.datetime64: + return np.datetime64("NaT") + + def to_json(self, zarr_format: ZarrFormat) -> JSON: + if zarr_format == 2: + return self.to_dtype().str + 
elif zarr_format == 3: + return {"name": self._zarr_v3_name, "configuration": {"unit": self.unit}} + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls(unit=data["configuration"]["unit"]) # type: ignore[arg-type, index, call-overload] + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.datetime64: + if check_json_int(data): + return datetime_from_json(data, self.unit) + raise TypeError(f"Invalid type: {data}. Expected an integer.") + + def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> int: + return datetime_to_json(data) # type: ignore[arg-type] + + def check_value(self, data: object) -> bool: + # TODO: decide which values we should accept for datetimes. + try: + np.array([data], dtype=self.to_dtype()) + return True # noqa: TRY300 + except ValueError: + return False + + def _cast_value_unsafe(self, value: object) -> np.datetime64: + return self.to_dtype().type(value) # type: ignore[no-any-return, call-overload] diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 23824520f7..aa2837f598 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -291,7 +291,7 @@ def _parse_structured_fill_value(fill_value: Any, dtype: np.dtype[Any]) -> Any: raise ValueError(f"Fill_value {fill_value} is not valid for dtype {dtype}.") from e -def parse_fill_value(fill_value: object, dtype: np.dtype[Any]) -> Any: +def parse_fill_value(fill_value: Any, dtype: np.dtype[Any]) -> Any: """ Parse a potential fill value into a value that is compatible with the provided dtype. 
diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 559298c13f..b82fb54270 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -173,6 +173,7 @@ def __init__( chunk_grid_parsed = ChunkGrid.from_dict(chunk_grid) chunk_key_encoding_parsed = ChunkKeyEncoding.from_dict(chunk_key_encoding) dimension_names_parsed = parse_dimension_names(dimension_names) + # Note: relying on a type method is numpy-specific fill_value_parsed = data_type.to_dtype().type(fill_value) attributes_parsed = parse_attributes(attributes) codecs_parsed_partial = parse_codecs(codecs) diff --git a/tests/conftest.py b/tests/conftest.py index b416e56682..b2f57310e3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -21,7 +21,9 @@ from zarr.core.common import JSON, parse_shapelike from zarr.core.config import config as zarr_config from zarr.core.dtype import data_type_registry, get_data_type_from_native_dtype -from zarr.core.dtype._numpy import DateTime64, HasLength, Structured +from zarr.core.dtype.common import HasLength +from zarr.core.dtype.npy.sized import Structured +from zarr.core.dtype.npy.time import DateTime64 from zarr.core.metadata.v2 import ArrayV2Metadata from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.core.sync import sync diff --git a/tests/package_with_entrypoint/__init__.py b/tests/package_with_entrypoint/__init__.py index eed2ac43e5..941f7e71c2 100644 --- a/tests/package_with_entrypoint/__init__.py +++ b/tests/package_with_entrypoint/__init__.py @@ -9,7 +9,7 @@ from zarr.core.array_spec import ArraySpec from zarr.core.buffer import Buffer, NDBuffer from zarr.core.common import BytesLike -from zarr.core.dtype import Bool +from zarr.core.dtype.npy.bool import Bool class TestEntrypointCodec(ArrayBytesCodec): diff --git a/tests/test_array.py b/tests/test_array.py index ff544ad447..ade63f6e43 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -40,14 +40,14 @@ from zarr.core.chunk_grids import _auto_partition 
from zarr.core.common import JSON, MemoryOrder, ZarrFormat from zarr.core.dtype import get_data_type_from_native_dtype -from zarr.core.dtype._numpy import ( - DateTime64, - Float64, - Int16, +from zarr.core.dtype.common import Endianness +from zarr.core.dtype.npy.common import endianness_from_numpy_str +from zarr.core.dtype.npy.float import Float64 +from zarr.core.dtype.npy.int import Int16 +from zarr.core.dtype.npy.sized import ( Structured, - endianness_from_numpy_str, ) -from zarr.core.dtype.common import Endianness +from zarr.core.dtype.npy.time import DateTime64 from zarr.core.dtype.wrapper import ZDType from zarr.core.group import AsyncGroup from zarr.core.indexing import BasicIndexer, ceildiv diff --git a/tests/test_config.py b/tests/test_config.py index a2a84e7e7e..d897354690 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -24,7 +24,7 @@ from zarr.core.buffer import NDBuffer from zarr.core.codec_pipeline import BatchedCodecPipeline from zarr.core.config import BadConfigError, config -from zarr.core.dtype._numpy import Int8, VariableLengthString +from zarr.core.dtype import Int8, VariableLengthString from zarr.core.indexing import SelectorTuple from zarr.registry import ( fully_qualified_name, diff --git a/tests/test_dtype.py b/tests/test_dtype.py index 122949664c..2b520383b1 100644 --- a/tests/test_dtype.py +++ b/tests/test_dtype.py @@ -7,6 +7,12 @@ import zarr from zarr.core.config import config +from zarr.core.dtype.npy.bool import Bool +from zarr.core.dtype.npy.complex import Complex64, Complex128 +from zarr.core.dtype.npy.float import Float16, Float32, Float64 +from zarr.core.dtype.npy.int import Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64 +from zarr.core.dtype.npy.sized import FixedLengthAscii, FixedLengthBytes, FixedLengthUnicode +from zarr.core.dtype.npy.time import DateTime64 from .conftest import zdtype_examples @@ -26,28 +32,10 @@ data_type_registry, get_data_type_from_json, ) -from zarr.core.dtype._numpy import ( - 
Bool, - Complex64, - Complex128, - DateTime64, - FixedLengthAscii, - FixedLengthBytes, - FixedLengthUnicode, - Float16, - Float32, - Float64, - Int8, - Int16, - Int32, - Int64, +from zarr.core.dtype.common import DataTypeValidationError +from zarr.core.dtype.npy.sized import ( Structured, - UInt8, - UInt16, - UInt32, - UInt64, ) -from zarr.core.dtype.common import DataTypeValidationError from zarr.core.dtype.registry import DataTypeRegistry diff --git a/tests/test_info.py b/tests/test_info.py index 2e465b6a21..339a0ad419 100644 --- a/tests/test_info.py +++ b/tests/test_info.py @@ -5,7 +5,7 @@ from zarr.codecs.bytes import BytesCodec from zarr.core._info import ArrayInfo, GroupInfo, human_readable_size from zarr.core.common import ZarrFormat -from zarr.core.dtype._numpy import Int32 +from zarr.core.dtype.npy.int import Int32 ZARR_FORMATS = [2, 3] diff --git a/tests/test_metadata/test_v2.py b/tests/test_metadata/test_v2.py index 2eec9a6c74..45913830c3 100644 --- a/tests/test_metadata/test_v2.py +++ b/tests/test_metadata/test_v2.py @@ -9,7 +9,8 @@ import zarr.storage from zarr.core.buffer import cpu from zarr.core.buffer.core import default_buffer_prototype -from zarr.core.dtype._numpy import Float32, Float64, Int16 +from zarr.core.dtype.npy.float import Float32, Float64 +from zarr.core.dtype.npy.int import Int16 from zarr.core.group import ConsolidatedMetadata, GroupMetadata from zarr.core.metadata import ArrayV2Metadata from zarr.core.metadata.v2 import parse_zarr_format diff --git a/tests/test_metadata/test_v3.py b/tests/test_metadata/test_v3.py index cd30f5cf3f..fa23dccf59 100644 --- a/tests/test_metadata/test_v3.py +++ b/tests/test_metadata/test_v3.py @@ -12,8 +12,8 @@ from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding, V2ChunkKeyEncoding from zarr.core.config import config from zarr.core.dtype import get_data_type_from_native_dtype -from zarr.core.dtype._numpy import DateTime64 -from zarr.core.dtype.common import check_json_complex_float +from 
zarr.core.dtype.npy.common import check_json_complex_float +from zarr.core.dtype.npy.time import DateTime64 from zarr.core.group import GroupMetadata, parse_node_type from zarr.core.metadata.v3 import ( ArrayV3Metadata, From afc98725e1d0117e4c45e50f24db2b1e49890adf Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 29 Apr 2025 21:50:15 +0200 Subject: [PATCH 069/130] add timedelta64 --- src/zarr/core/dtype/__init__.py | 4 +- src/zarr/core/dtype/npy/common.py | 3 +- src/zarr/core/dtype/npy/time.py | 239 +++++++++++++++++++++++++----- src/zarr/testing/strategies.py | 4 +- tests/test_array.py | 6 +- 5 files changed, 208 insertions(+), 48 deletions(-) diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py index 63b593fd28..4cd71bb8bc 100644 --- a/src/zarr/core/dtype/__init__.py +++ b/src/zarr/core/dtype/__init__.py @@ -12,7 +12,7 @@ FixedLengthUnicode, Structured, ) -from zarr.core.dtype.npy.time import DateTime64 +from zarr.core.dtype.npy.time import DateTime64, TimeDelta64 if TYPE_CHECKING: from zarr.core.common import ZarrFormat @@ -43,6 +43,7 @@ "Int32", "Int64", "Structured", + "TimeDelta64", "UInt8", "UInt16", "UInt32", @@ -68,6 +69,7 @@ | FixedLengthBytes | Structured | DateTime64 + | TimeDelta64 ) ZDTypeLike: TypeAlias = npt.DTypeLike | ZDType[_BaseDType, _BaseScalar] | dict[str, JSON] diff --git a/src/zarr/core/dtype/npy/common.py b/src/zarr/core/dtype/npy/common.py index 6571002bbb..c079664aa5 100644 --- a/src/zarr/core/dtype/npy/common.py +++ b/src/zarr/core/dtype/npy/common.py @@ -26,8 +26,7 @@ IntLike = SupportsInt | SupportsIndex | bytes | str FloatLike = SupportsIndex | SupportsFloat | bytes | str ComplexLike = SupportsFloat | SupportsIndex | SupportsComplex | bytes | str | None -DateUnit = Literal["Y", "M", "W", "D"] -TimeUnit = Literal["h", "m", "s", "ms", "us", "μs", "ns", "ps", "fs", "as"] +DateTimeUnit = Literal["Y", "M", "W", "D", "h", "m", "s", "ms", "us", "μs", "ns", "ps", "fs", "as"] EndiannessNumpy = 
Literal[">", "<", "|", "="] TFloatDType_co = TypeVar( diff --git a/src/zarr/core/dtype/npy/time.py b/src/zarr/core/dtype/npy/time.py index a10b9ae8a3..030b01c769 100644 --- a/src/zarr/core/dtype/npy/time.py +++ b/src/zarr/core/dtype/npy/time.py @@ -1,15 +1,15 @@ from __future__ import annotations +import re from dataclasses import dataclass -from typing import TYPE_CHECKING, Self, TypeGuard, cast, get_args +from typing import TYPE_CHECKING, Literal, Self, TypeGuard, cast, get_args import numpy as np from zarr.core.dtype.common import DataTypeValidationError, HasEndianness from zarr.core.dtype.npy.common import ( - DateUnit, + DateTimeUnit, EndiannessNumpy, - TimeUnit, check_json_int, endianness_from_numpy_str, endianness_to_numpy_str, @@ -19,15 +19,58 @@ if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat +_DTypeName = Literal["datetime64", "timedelta64"] -def datetime_from_json(data: int, unit: DateUnit | TimeUnit) -> np.datetime64: + +def parse_timedtype_name(name: str) -> tuple[_DTypeName, DateTimeUnit | None]: + """ + Parse a string like "datetime64[s]" into a tuple like ("datetime64", "s"). """ - Convert a JSON integer to a datetime64. + dtype_name: _DTypeName + unit: DateTimeUnit | None + + if name.startswith("datetime64"): + dtype_name = "datetime64" + elif name.startswith("timedelta64"): + dtype_name = "timedelta64" + else: + msg = ( + f"Invalid dtype name. Expected a string starting with on of {get_args(_DTypeName)}. " + f"Got {name!r} instead." + ) + raise ValueError(msg) + + regex = re.search(r"\[(.*?)\]", name) + + if regex is None: + if dtype_name == "timedelta64": + unit = None + else: + msg = ( + "The name of a datetime64 dtype must end with a specification of a unit. " + 'For example, "datetime64[s].' + f"Got {name!r}, which does not follow this pattern." + ) + raise ValueError(msg) + else: + maybe_unit = regex.group(1) + unit_expected = get_args(DateTimeUnit) + if maybe_unit not in unit_expected: + msg = f"Invalid unit. 
Expected one of {unit_expected}. Got {maybe_unit} instead." + raise ValueError(msg) + unit = maybe_unit # type: ignore[assignment] + + return dtype_name, unit + + +def datetime_from_int(data: int, unit: DateTimeUnit) -> np.datetime64: + """ + Convert an integer to a datetime64. Parameters ---------- data : int - The JSON integer to convert. + The integer to convert. unit : DateUnit or TimeUnit The unit of the datetime64. @@ -39,33 +82,150 @@ def datetime_from_json(data: int, unit: DateUnit | TimeUnit) -> np.datetime64: return cast("np.datetime64", np.int64(data).view(f"datetime64[{unit}]")) -def datetime_to_json(data: np.datetime64) -> int: +def datetimelike_to_int(data: np.datetime64 | np.timedelta64) -> int: """ - Convert a datetime64 to a JSON integer. + Convert a datetime64 or a timedelta64 to an integer. Parameters ---------- - data : np.datetime64 - The datetime64 value to convert. + data : np.datetime64 | np.timedelta64 + The value to convert. Returns ------- int - The JSON representation of the datetime64. + An integer representation of the scalar. """ return data.view(np.int64).item() +def timedelta_from_int(data: int, unit: DateTimeUnit | None) -> np.timedelta64: + """ + Convert an integer to a timedelta64. + + Parameters + ---------- + data : int + The integer to convert. + unit : DateUnit or TimeUnit + The unit of the timedelta64. + + Returns + ------- + np.timedelta64 + The timedelta64 value. + """ + if unit is not None: + dtype_name = f"timedelta64[{unit}]" + else: + dtype_name = "timedelta64" + return cast("np.timedelta64", np.int64(data).view(dtype_name)) + + +@dataclass(frozen=True, kw_only=True, slots=True) +class TimeDelta64(ZDType[np.dtypes.TimeDelta64DType, np.timedelta64], HasEndianness): + """ + A wrapper for the ``TimeDelta64`` data type defined in numpy. + Scalars of this type can be created by performing arithmetic with ``DateTime64`` scalars. 
+ Like ``DateTime64``, ``TimeDelta64`` is parametrized by a unit, but unlike ``DateTime64``, the + unit for ``TimeDelta64`` is optional. + """ + + dtype_cls = np.dtypes.TimeDelta64DType # type: ignore[assignment] + _zarr_v3_name = "numpy.timedelta64" + unit: DateTimeUnit | None = None + + @classmethod + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + _, unit = parse_timedtype_name(dtype.name) + byteorder = cast("EndiannessNumpy", dtype.byteorder) + return cls(unit=unit, endianness=endianness_from_numpy_str(byteorder)) + + def to_dtype(self) -> np.dtypes.TimeDelta64DType: + # Numpy does not allow creating timedelta64 via + # np.dtypes.TimeDelta64DType() + if self.unit is not None: + dtype_string = f"timedelta64[{self.unit}]" + else: + dtype_string = "timedelta64" + dt = np.dtype(dtype_string).newbyteorder(endianness_to_numpy_str(self.endianness)) + return cast("np.dtypes.TimeDelta64DType", dt) + + def default_value(self) -> np.timedelta64: + return np.timedelta64("NaT") + + def to_json(self, zarr_format: ZarrFormat) -> JSON: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return {"name": self._zarr_v3_name, "configuration": {"unit": self.unit}} + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls(unit=data["configuration"]["unit"]) # type: ignore[arg-type, index, call-overload] + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.timedelta64: + if check_json_int(data): + return timedelta_from_int(data, self.unit) + raise TypeError(f"Invalid type: {data}. 
Expected an integer.") + + def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> int: + return datetimelike_to_int(data) # type: ignore[arg-type] + + def check_value(self, data: object) -> bool: + # TODO: decide which values we should accept for datetimes. + try: + np.array([data], dtype=self.to_dtype()) + return True # noqa: TRY300 + except ValueError: + return False + + def _cast_value_unsafe(self, value: object) -> np.timedelta64: + return self.to_dtype().type(value) # type: ignore[arg-type] + + @classmethod + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: + if zarr_format == 2: + # match m[M], etc + # consider making this a standalone function + if not (isinstance(data, str) and data[0] in (">", "<") and data[1:3] == "m8"): + return False + if len(data) == 3: + # no unit, and + # we already checked that this string is either m8 + return True + if len(data) in (6, 7): + return data[4:-1] in get_args(DateTimeUnit) and data[-1] == "]" + else: + return False + elif zarr_format == 3: + return ( + isinstance(data, dict) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == cls._zarr_v3_name + and set(data.keys()) == {"name", "configuration"} + and isinstance(data["configuration"], dict) + and set(data["configuration"].keys()) in ({"unit"}, {}) + and data["configuration"].get("unit", None) in (*get_args(DateTimeUnit), None) + ) + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + @dataclass(frozen=True, kw_only=True, slots=True) class DateTime64(ZDType[np.dtypes.DateTime64DType, np.datetime64], HasEndianness): dtype_cls = np.dtypes.DateTime64DType # type: ignore[assignment] _zarr_v3_name = "numpy.datetime64" - unit: DateUnit | TimeUnit + unit: DateTimeUnit @classmethod def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: - unit: DateUnit | TimeUnit = dtype.name[dtype.name.rfind("[") + 1 : dtype.name.rfind("]")] # type: ignore[assignment] - if unit not in 
get_args(DateUnit) and unit not in get_args(TimeUnit): + unit: DateTimeUnit = dtype.name[dtype.name.rfind("[") + 1 : dtype.name.rfind("]")] # type: ignore[assignment] + if unit not in get_args(DateTimeUnit): raise DataTypeValidationError('Invalid unit for "numpy.datetime64"') byteorder = cast("EndiannessNumpy", dtype.byteorder) return cls(unit=unit, endianness=endianness_from_numpy_str(byteorder)) @@ -80,30 +240,6 @@ def to_dtype(self) -> np.dtypes.DateTime64DType: ), ) - @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: - if zarr_format == 2: - # match M[M], etc - # consider making this a standalone function - return ( - isinstance(data, str) - and len(data) in (6, 7) - and data[0] in (">", "<") - and data[1:4] == "M8[" - and data[4:-1] in get_args(TimeUnit) + get_args(DateUnit) - and data[-1] == "]" - ) - elif zarr_format == 3: - return ( - isinstance(data, dict) - and "name" in data - and data["name"] == cls._zarr_v3_name - and "configuration" in data - and "unit" in data["configuration"] - and data["configuration"]["unit"] in get_args(DateUnit) + get_args(TimeUnit) - ) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def default_value(self) -> np.datetime64: return np.datetime64("NaT") @@ -124,11 +260,11 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.datetime64: if check_json_int(data): - return datetime_from_json(data, self.unit) + return datetime_from_int(data, self.unit) raise TypeError(f"Invalid type: {data}. Expected an integer.") def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> int: - return datetime_to_json(data) # type: ignore[arg-type] + return datetimelike_to_int(data) # type: ignore[arg-type] def check_value(self, data: object) -> bool: # TODO: decide which values we should accept for datetimes. 
@@ -140,3 +276,26 @@ def check_value(self, data: object) -> bool: def _cast_value_unsafe(self, value: object) -> np.datetime64: return self.to_dtype().type(value) # type: ignore[no-any-return, call-overload] + + @classmethod + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: + if zarr_format == 2: + # match M[M], etc + # consider making this a standalone function + return ( + isinstance(data, str) + and len(data) in (6, 7) + and data[0] in (">", "<") + and data[1:4] == "M8[" + and data[4:-1] in get_args(DateTimeUnit) + and data[-1] == "]" + ) + elif zarr_format == 3: + return ( + isinstance(data, dict) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == cls._zarr_v3_name + and set(data["configuration"].keys()) == {"unit"} + and data["configuration"]["unit"] in get_args(DateTimeUnit) + ) + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover diff --git a/src/zarr/testing/strategies.py b/src/zarr/testing/strategies.py index f371a88e83..af4ab831ec 100644 --- a/src/zarr/testing/strategies.py +++ b/src/zarr/testing/strategies.py @@ -55,7 +55,7 @@ def v3_dtypes() -> st.SearchStrategy[np.dtype]: ) -def v2_dtypes() -> st.SearchStrategy[np.dtype]: +def v2_dtypes() -> st.SearchStrategy[np.dtype[Any]]: return ( npst.boolean_dtypes() | npst.integer_dtypes(endianness="=") @@ -65,7 +65,7 @@ def v2_dtypes() -> st.SearchStrategy[np.dtype]: | npst.byte_string_dtypes(endianness="=") | npst.unicode_string_dtypes(endianness="=") | npst.datetime64_dtypes(endianness="=") - # | npst.timedelta64_dtypes() + | npst.timedelta64_dtypes(endianness="?") ) diff --git a/tests/test_array.py b/tests/test_array.py index ade63f6e43..aa6dfd0f07 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -47,7 +47,7 @@ from zarr.core.dtype.npy.sized import ( Structured, ) -from zarr.core.dtype.npy.time import DateTime64 +from zarr.core.dtype.npy.time import DateTime64, TimeDelta64 from zarr.core.dtype.wrapper import 
ZDType from zarr.core.group import AsyncGroup from zarr.core.indexing import BasicIndexer, ceildiv @@ -969,7 +969,7 @@ def test_default_fill_value(dtype: ZDType[Any, Any], store: Store) -> None: Test that the fill value of an array is set to the default value for the dtype object """ a = zarr.create_array(store, shape=(5,), chunks=(5,), dtype=dtype) - if isinstance(dtype, DateTime64) and np.isnat(a.fill_value): + if isinstance(dtype, DateTime64 | TimeDelta64) and np.isnat(a.fill_value): assert np.isnat(dtype.default_value()) else: assert a.fill_value == dtype.default_value() @@ -1350,7 +1350,7 @@ def test_default_endianness( """ dtype = Int16(endianness=endianness) arr = zarr.create_array(store=store, shape=(1,), dtype=dtype, zarr_format=zarr_format) - assert endianness_from_numpy_str(arr[:].dtype.byteorder) == endianness + assert endianness_from_numpy_str(arr[:].dtype.byteorder) == endianness # type: ignore[union-attr] @pytest.mark.parametrize("value", [1, 1.4, "a", b"a", np.array(1)]) From e1bf90135d0fad1feba88b9b360bae102b681976 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 30 Apr 2025 22:20:40 +0200 Subject: [PATCH 070/130] refactor time dtypes --- src/zarr/core/buffer/core.py | 11 +- src/zarr/core/dtype/__init__.py | 1 + src/zarr/core/dtype/npy/common.py | 4 +- src/zarr/core/dtype/npy/time.py | 285 +++++++++++++----------------- src/zarr/testing/strategies.py | 4 +- tests/conftest.py | 14 +- tests/test_properties.py | 9 +- 7 files changed, 139 insertions(+), 189 deletions(-) diff --git a/src/zarr/core/buffer/core.py b/src/zarr/core/buffer/core.py index 7be9dc8bf4..d50c50cc79 100644 --- a/src/zarr/core/buffer/core.py +++ b/src/zarr/core/buffer/core.py @@ -427,16 +427,7 @@ def as_scalar(self) -> ScalarType: """Returns the buffer as a scalar value""" if self._data.size != 1: raise ValueError("Buffer does not contain a single scalar value") - item = self.as_numpy_array().item() - scalar: ScalarType - - if np.issubdtype(self.dtype, np.datetime64): - 
unit: str = np.datetime_data(self.dtype)[0] # Extract the unit (e.g., 'Y', 'D', etc.) - scalar = np.datetime64(item, unit) - else: - scalar = self.dtype.type(item) # Regular conversion for non-datetime types - - return scalar + return cast(ScalarType, self.as_numpy_array()[()]) @property def dtype(self) -> np.dtype[Any]: diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py index 4cd71bb8bc..f535f62f35 100644 --- a/src/zarr/core/dtype/__init__.py +++ b/src/zarr/core/dtype/__init__.py @@ -44,6 +44,7 @@ "Int64", "Structured", "TimeDelta64", + "TimeDelta64", "UInt8", "UInt16", "UInt32", diff --git a/src/zarr/core/dtype/npy/common.py b/src/zarr/core/dtype/npy/common.py index c079664aa5..857c515c19 100644 --- a/src/zarr/core/dtype/npy/common.py +++ b/src/zarr/core/dtype/npy/common.py @@ -26,7 +26,9 @@ IntLike = SupportsInt | SupportsIndex | bytes | str FloatLike = SupportsIndex | SupportsFloat | bytes | str ComplexLike = SupportsFloat | SupportsIndex | SupportsComplex | bytes | str | None -DateTimeUnit = Literal["Y", "M", "W", "D", "h", "m", "s", "ms", "us", "μs", "ns", "ps", "fs", "as"] +DateTimeUnit = Literal[ + "Y", "M", "W", "D", "h", "m", "s", "ms", "us", "μs", "ns", "ps", "fs", "as", "generic" +] EndiannessNumpy = Literal[">", "<", "|", "="] TFloatDType_co = TypeVar( diff --git a/src/zarr/core/dtype/npy/time.py b/src/zarr/core/dtype/npy/time.py index 030b01c769..056836a105 100644 --- a/src/zarr/core/dtype/npy/time.py +++ b/src/zarr/core/dtype/npy/time.py @@ -1,12 +1,23 @@ from __future__ import annotations -import re +from collections.abc import Mapping from dataclasses import dataclass -from typing import TYPE_CHECKING, Literal, Self, TypeGuard, cast, get_args +from typing import ( + TYPE_CHECKING, + ClassVar, + Generic, + Literal, + Self, + TypedDict, + TypeGuard, + TypeVar, + cast, + get_args, +) import numpy as np -from zarr.core.dtype.common import DataTypeValidationError, HasEndianness +from zarr.core.dtype.common import 
HasEndianness from zarr.core.dtype.npy.common import ( DateTimeUnit, EndiannessNumpy, @@ -22,48 +33,7 @@ _DTypeName = Literal["datetime64", "timedelta64"] -def parse_timedtype_name(name: str) -> tuple[_DTypeName, DateTimeUnit | None]: - """ - Parse a string like "datetime64[s]" into a tuple like ("datetime64", "s"). - """ - dtype_name: _DTypeName - unit: DateTimeUnit | None - - if name.startswith("datetime64"): - dtype_name = "datetime64" - elif name.startswith("timedelta64"): - dtype_name = "timedelta64" - else: - msg = ( - f"Invalid dtype name. Expected a string starting with on of {get_args(_DTypeName)}. " - f"Got {name!r} instead." - ) - raise ValueError(msg) - - regex = re.search(r"\[(.*?)\]", name) - - if regex is None: - if dtype_name == "timedelta64": - unit = None - else: - msg = ( - "The name of a datetime64 dtype must end with a specification of a unit. " - 'For example, "datetime64[s].' - f"Got {name!r}, which does not follow this pattern." - ) - raise ValueError(msg) - else: - maybe_unit = regex.group(1) - unit_expected = get_args(DateTimeUnit) - if maybe_unit not in unit_expected: - msg = f"Invalid unit. Expected one of {unit_expected}. Got {maybe_unit} instead." - raise ValueError(msg) - unit = maybe_unit # type: ignore[assignment] - - return dtype_name, unit - - -def datetime_from_int(data: int, unit: DateTimeUnit) -> np.datetime64: +def datetime_from_int(data: int, *, unit: DateTimeUnit, interval: int) -> np.datetime64: """ Convert an integer to a datetime64. @@ -71,15 +41,18 @@ def datetime_from_int(data: int, unit: DateTimeUnit) -> np.datetime64: ---------- data : int The integer to convert. - unit : DateUnit or TimeUnit + unit : DateTimeUnit The unit of the datetime64. + interval : int + The interval of the datetime64. Returns ------- np.datetime64 The datetime64 value. 
""" - return cast("np.datetime64", np.int64(data).view(f"datetime64[{unit}]")) + dtype_name = f"datetime64[{interval}{unit}]" + return cast("np.datetime64", np.int64(data).view(dtype_name)) def datetimelike_to_int(data: np.datetime64 | np.timedelta64) -> int: @@ -99,80 +72,74 @@ def datetimelike_to_int(data: np.datetime64 | np.timedelta64) -> int: return data.view(np.int64).item() -def timedelta_from_int(data: int, unit: DateTimeUnit | None) -> np.timedelta64: - """ - Convert an integer to a timedelta64. +_BaseTimeDType_co = TypeVar( + "_BaseTimeDType_co", + bound=np.dtypes.TimeDelta64DType | np.dtypes.DateTime64DType, + covariant=True, +) +_BaseTimeScalar = TypeVar("_BaseTimeScalar", bound=np.timedelta64 | np.datetime64) - Parameters - ---------- - data : int - The integer to convert. - unit : DateUnit or TimeUnit - The unit of the timedelta64. +TName = TypeVar("TName", bound=str) +TConfig = TypeVar("TConfig", bound=Mapping[str, object]) - Returns - ------- - np.timedelta64 - The timedelta64 value. - """ - if unit is not None: - dtype_name = f"timedelta64[{unit}]" - else: - dtype_name = "timedelta64" - return cast("np.timedelta64", np.int64(data).view(dtype_name)) +class NamedConfig(TypedDict, Generic[TName, TConfig]): + name: TName + configuration: TConfig -@dataclass(frozen=True, kw_only=True, slots=True) -class TimeDelta64(ZDType[np.dtypes.TimeDelta64DType, np.timedelta64], HasEndianness): - """ - A wrapper for the ``TimeDelta64`` data type defined in numpy. - Scalars of this type can be created by performing arithmetic with ``DateTime64`` scalars. - Like ``DateTime64``, ``TimeDelta64`` is parametrized by a unit, but unlike ``DateTime64``, the - unit for ``TimeDelta64`` is optional. 
- """ - dtype_cls = np.dtypes.TimeDelta64DType # type: ignore[assignment] - _zarr_v3_name = "numpy.timedelta64" - unit: DateTimeUnit | None = None +class TimeConfig(TypedDict): + unit: DateTimeUnit + interval: int + + +# aspirational +DateTime64MetaParams = NamedConfig[Literal["numpy.datetime64"], TimeConfig] +TimeDelta64MetaParams = NamedConfig[Literal["numpy.timedelta64"], TimeConfig] + + +@dataclass(frozen=True, kw_only=True, slots=True) +class TimeDTypeBase(ZDType[_BaseTimeDType_co, _BaseTimeScalar], HasEndianness): + _zarr_v2_names: ClassVar[tuple[str, ...]] + # this attribute exists so that we can programmatically create a numpy dtype instance + # because the particular numpy dtype we are wrapping does not allow direct construction via + # cls.dtype_cls() + _numpy_name: ClassVar[_DTypeName] + interval: int + unit: DateTimeUnit @classmethod def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: - _, unit = parse_timedtype_name(dtype.name) + unit, interval = np.datetime_data(dtype.name) byteorder = cast("EndiannessNumpy", dtype.byteorder) - return cls(unit=unit, endianness=endianness_from_numpy_str(byteorder)) - - def to_dtype(self) -> np.dtypes.TimeDelta64DType: - # Numpy does not allow creating timedelta64 via - # np.dtypes.TimeDelta64DType() - if self.unit is not None: - dtype_string = f"timedelta64[{self.unit}]" - else: - dtype_string = "timedelta64" - dt = np.dtype(dtype_string).newbyteorder(endianness_to_numpy_str(self.endianness)) - return cast("np.dtypes.TimeDelta64DType", dt) + return cls(unit=unit, interval=interval, endianness=endianness_from_numpy_str(byteorder)) # type: ignore[arg-type] - def default_value(self) -> np.timedelta64: - return np.timedelta64("NaT") - - def to_json(self, zarr_format: ZarrFormat) -> JSON: - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return {"name": self._zarr_v3_name, "configuration": {"unit": self.unit}} - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no 
cover + def to_dtype(self) -> _BaseTimeDType_co: + # Numpy does not allow creating datetime64 or timedelta64 via + # np.dtypes.{dtype_name}() + # so we use np.dtype with a formatted string. + dtype_string = f"{self._numpy_name}[{self.interval}{self.unit}]" + return np.dtype(dtype_string).newbyteorder(endianness_to_numpy_str(self.endianness)) # type: ignore[return-value] @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: if zarr_format == 2: return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: - return cls(unit=data["configuration"]["unit"]) # type: ignore[arg-type, index, call-overload] + unit = data["configuration"]["unit"] # type: ignore[index, call-overload] + interval = data["configuration"]["interval"] # type: ignore[index, call-overload] + return cls(unit=unit, interval=interval) # type: ignore[arg-type] raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.timedelta64: - if check_json_int(data): - return timedelta_from_int(data, self.unit) - raise TypeError(f"Invalid type: {data}. Expected an integer.") + def to_json(self, zarr_format: ZarrFormat) -> JSON: + if zarr_format == 2: + return cast("str", self.to_dtype().str) + elif zarr_format == 3: + return { + "name": self._zarr_v3_name, + "configuration": {"unit": self.unit, "interval": self.interval}, + } + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> int: return datetimelike_to_int(data) # type: ignore[arg-type] @@ -185,6 +152,31 @@ def check_value(self, data: object) -> bool: except ValueError: return False + +@dataclass(frozen=True, kw_only=True, slots=True) +class TimeDelta64(TimeDTypeBase[np.dtypes.TimeDelta64DType, np.timedelta64], HasEndianness): + """ + A wrapper for the ``TimeDelta64`` data type defined in numpy. 
+ Scalars of this type can be created by performing arithmetic with ``DateTime64`` scalars. + Like ``DateTime64``, ``TimeDelta64`` is parametrized by a unit, but unlike ``DateTime64``, the + unit for ``TimeDelta64`` is optional. + """ + + dtype_cls = np.dtypes.TimeDelta64DType + _zarr_v3_name = "numpy.timedelta64" + _zarr_v2_names = (">m8", " np.timedelta64: + return np.timedelta64("NaT") + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.timedelta64: + if check_json_int(data): + return self.to_dtype().type(data, f"{self.interval}{self.unit}") + raise TypeError(f"Invalid type: {data}. Expected an integer.") + def _cast_value_unsafe(self, value: object) -> np.timedelta64: return self.to_dtype().type(value) # type: ignore[arg-type] @@ -193,16 +185,16 @@ def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: if zarr_format == 2: # match m[M], etc # consider making this a standalone function - if not (isinstance(data, str) and data[0] in (">", "<") and data[1:3] == "m8"): + if not isinstance(data, str): + return False + if not data.startswith(cls._zarr_v2_names): return False if len(data) == 3: # no unit, and # we already checked that this string is either m8 return True - if len(data) in (6, 7): - return data[4:-1] in get_args(DateTimeUnit) and data[-1] == "]" else: - return False + return data[4:-1].endswith(get_args(DateTimeUnit)) and data[-1] == "]" elif zarr_format == 3: return ( isinstance(data, dict) @@ -210,70 +202,29 @@ def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: and data["name"] == cls._zarr_v3_name and set(data.keys()) == {"name", "configuration"} and isinstance(data["configuration"], dict) - and set(data["configuration"].keys()) in ({"unit"}, {}) - and data["configuration"].get("unit", None) in (*get_args(DateTimeUnit), None) + and set(data["configuration"].keys()) == {"unit", "interval"} + and data["configuration"]["unit"] in get_args(DateTimeUnit) ) raise 
ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @dataclass(frozen=True, kw_only=True, slots=True) -class DateTime64(ZDType[np.dtypes.DateTime64DType, np.datetime64], HasEndianness): - dtype_cls = np.dtypes.DateTime64DType # type: ignore[assignment] +class DateTime64(TimeDTypeBase[np.dtypes.DateTime64DType, np.datetime64], HasEndianness): + dtype_cls = np.dtypes.DateTime64DType _zarr_v3_name = "numpy.datetime64" - unit: DateTimeUnit - - @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: - unit: DateTimeUnit = dtype.name[dtype.name.rfind("[") + 1 : dtype.name.rfind("]")] # type: ignore[assignment] - if unit not in get_args(DateTimeUnit): - raise DataTypeValidationError('Invalid unit for "numpy.datetime64"') - byteorder = cast("EndiannessNumpy", dtype.byteorder) - return cls(unit=unit, endianness=endianness_from_numpy_str(byteorder)) - - def to_dtype(self) -> np.dtypes.DateTime64DType: - # Numpy does not allow creating datetime64 via - # np.dtypes.DateTime64Dtype() - return cast( - "np.dtypes.DateTime64DType", - np.dtype(f"datetime64[{self.unit}]").newbyteorder( - endianness_to_numpy_str(self.endianness) - ), - ) + _zarr_v2_names = (">M8", " np.datetime64: return np.datetime64("NaT") - def to_json(self, zarr_format: ZarrFormat) -> JSON: - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return {"name": self._zarr_v3_name, "configuration": {"unit": self.unit}} - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls(unit=data["configuration"]["unit"]) # type: ignore[arg-type, index, call-overload] - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def from_json_value(self, data: JSON, *, zarr_format: 
ZarrFormat) -> np.datetime64: if check_json_int(data): - return datetime_from_int(data, self.unit) + return self.to_dtype().type(data, f"{self.interval}{self.unit}") raise TypeError(f"Invalid type: {data}. Expected an integer.") - def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> int: - return datetimelike_to_int(data) # type: ignore[arg-type] - - def check_value(self, data: object) -> bool: - # TODO: decide which values we should accept for datetimes. - try: - np.array([data], dtype=self.to_dtype()) - return True # noqa: TRY300 - except ValueError: - return False - def _cast_value_unsafe(self, value: object) -> np.datetime64: return self.to_dtype().type(value) # type: ignore[no-any-return, call-overload] @@ -282,20 +233,22 @@ def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: if zarr_format == 2: # match M[M], etc # consider making this a standalone function - return ( - isinstance(data, str) - and len(data) in (6, 7) - and data[0] in (">", "<") - and data[1:4] == "M8[" - and data[4:-1] in get_args(DateTimeUnit) - and data[-1] == "]" - ) + if not isinstance(data, str): + return False + if not data.startswith(cls._zarr_v2_names): + return False + if len(data) == 3: + # no unit, and + # we already checked that this string is either M8 + return True + else: + return data[4:-1].endswith(get_args(DateTimeUnit)) and data[-1] == "]" elif zarr_format == 3: return ( isinstance(data, dict) and set(data.keys()) == {"name", "configuration"} and data["name"] == cls._zarr_v3_name - and set(data["configuration"].keys()) == {"unit"} + and set(data["configuration"].keys()) == {"unit", "interval"} and data["configuration"]["unit"] in get_args(DateTimeUnit) ) raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover diff --git a/src/zarr/testing/strategies.py b/src/zarr/testing/strategies.py index af4ab831ec..4184112f5e 100644 --- a/src/zarr/testing/strategies.py +++ b/src/zarr/testing/strategies.py @@ -50,8 
+50,8 @@ def v3_dtypes() -> st.SearchStrategy[np.dtype]: | npst.complex_number_dtypes(endianness="=") # | npst.byte_string_dtypes(endianness="=") # | npst.unicode_string_dtypes() - # | npst.datetime64_dtypes() - # | npst.timedelta64_dtypes() + | npst.datetime64_dtypes() + | npst.timedelta64_dtypes() ) diff --git a/tests/conftest.py b/tests/conftest.py index b2f57310e3..434763a4f3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -20,10 +20,14 @@ from zarr.core.chunk_grids import RegularChunkGrid, _auto_partition from zarr.core.common import JSON, parse_shapelike from zarr.core.config import config as zarr_config -from zarr.core.dtype import data_type_registry, get_data_type_from_native_dtype +from zarr.core.dtype import ( + DateTime64, + Structured, + TimeDelta64, + data_type_registry, + get_data_type_from_native_dtype, +) from zarr.core.dtype.common import HasLength -from zarr.core.dtype.npy.sized import Structured -from zarr.core.dtype.npy.time import DateTime64 from zarr.core.metadata.v2 import ArrayV2Metadata from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.core.sync import sync @@ -426,7 +430,7 @@ def meta_from_array( zdtype_examples += (wrapper_cls.from_dtype(np.dtype([("a", np.float64), ("b", np.int8)])),) elif issubclass(wrapper_cls, HasLength): zdtype_examples += (wrapper_cls(length=1),) - elif issubclass(wrapper_cls, DateTime64): - zdtype_examples += (wrapper_cls(unit="s"),) + elif issubclass(wrapper_cls, DateTime64 | TimeDelta64): + zdtype_examples += (wrapper_cls(unit="s", interval=10),) else: zdtype_examples += (wrapper_cls(),) diff --git a/tests/test_properties.py b/tests/test_properties.py index 7c741ec873..15dd701582 100644 --- a/tests/test_properties.py +++ b/tests/test_properties.py @@ -1,4 +1,3 @@ -import dataclasses import json import numbers from typing import Any @@ -209,8 +208,8 @@ def test_roundtrip_array_metadata_from_json(data: st.DataObject, zarr_format: in zarray_dict = 
json.loads(buffer_dict[ZARR_JSON].to_bytes().decode()) metadata_roundtripped = ArrayV3Metadata.from_dict(zarray_dict) - orig = dataclasses.asdict(metadata) - rt = dataclasses.asdict(metadata_roundtripped) + orig = metadata.to_dict() + rt = metadata_roundtripped.to_dict() assert deep_equal(orig, rt), f"Roundtrip mismatch:\nOriginal: {orig}\nRoundtripped: {rt}" @@ -323,5 +322,5 @@ def test_array_metadata_meets_spec(meta: ArrayV2Metadata | ArrayV3Metadata) -> N elif dtype_native.kind == "c": # fill_value should be a two-element array [real, imag]. assert serialized_complex_float_is_valid(asdict_dict["fill_value"]) - elif dtype_native.kind == "M" and np.isnat(meta.fill_value): - assert asdict_dict["fill_value"] == "NaT" + elif dtype_native.kind in ("M", "m") and np.isnat(meta.fill_value): + assert asdict_dict["fill_value"] == -9223372036854775808 From 890077ef1d3fb61d08ff9dc8bc31c8e0a66ccbd4 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 1 May 2025 11:12:35 +0200 Subject: [PATCH 071/130] widen dtype test strategies --- src/zarr/testing/strategies.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/zarr/testing/strategies.py b/src/zarr/testing/strategies.py index 4184112f5e..066239ff33 100644 --- a/src/zarr/testing/strategies.py +++ b/src/zarr/testing/strategies.py @@ -41,17 +41,17 @@ def paths(draw: st.DrawFn, *, max_num_nodes: int | None = None) -> Any: return draw(st.just("/") | keys(max_num_nodes=max_num_nodes)) -def v3_dtypes() -> st.SearchStrategy[np.dtype]: +def v3_dtypes() -> st.SearchStrategy[np.dtype[Any]]: return ( npst.boolean_dtypes() | npst.integer_dtypes(endianness="=") | npst.unsigned_integer_dtypes(endianness="=") | npst.floating_dtypes(endianness="=") | npst.complex_number_dtypes(endianness="=") - # | npst.byte_string_dtypes(endianness="=") - # | npst.unicode_string_dtypes() - | npst.datetime64_dtypes() - | npst.timedelta64_dtypes() + | npst.byte_string_dtypes(endianness="=") + | 
npst.unicode_string_dtypes(endianness="=") + | npst.datetime64_dtypes(endianness="=") + | npst.timedelta64_dtypes(endianness="=") ) @@ -65,7 +65,7 @@ def v2_dtypes() -> st.SearchStrategy[np.dtype[Any]]: | npst.byte_string_dtypes(endianness="=") | npst.unicode_string_dtypes(endianness="=") | npst.datetime64_dtypes(endianness="=") - | npst.timedelta64_dtypes(endianness="?") + | npst.timedelta64_dtypes(endianness="=") ) From a3f05f09559b2d411e826534ad7427c1b73bf92c Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 2 May 2025 11:13:04 +0200 Subject: [PATCH 072/130] modify structured dtype fill value rt to avoid to_dict --- tests/test_metadata/test_v2.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/tests/test_metadata/test_v2.py b/tests/test_metadata/test_v2.py index dc8cd49feb..aa8cfc4a31 100644 --- a/tests/test_metadata/test_v2.py +++ b/tests/test_metadata/test_v2.py @@ -10,7 +10,6 @@ import zarr.storage from zarr.core.buffer import cpu from zarr.core.buffer.core import default_buffer_prototype -from zarr.core.dtype.npy.common import bytes_to_json from zarr.core.dtype.npy.float import Float32, Float64 from zarr.core.dtype.npy.int import Int16 from zarr.core.group import ConsolidatedMetadata, GroupMetadata @@ -337,10 +336,5 @@ def test_structured_dtype_fill_value_serialization(tmp_path, fill_value): zarr.consolidate_metadata(root_group.store, zarr_format=zarr_format) root_group = zarr.open_group(group_path, mode="r") - observed = root_group.metadata.consolidated_metadata.to_dict()["metadata"]["structured_dtype"][ - "fill_value" - ] - if fill_value is None: - assert observed is None - else: - assert observed == bytes_to_json(fill_value, zarr_format=zarr_format) + observed = root_group.metadata.consolidated_metadata.metadata["structured_dtype"].fill_value + assert observed == fill_value From 4788f05242c342d07dea4b2761c912881ff4cc06 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 2 May 2025 16:48:49 +0200 Subject: 
[PATCH 073/130] wip: begin creating isomorphic test suite for dtypes --- src/zarr/abc/codec.py | 6 +- src/zarr/codecs/sharding.py | 4 +- src/zarr/codecs/transpose.py | 4 +- src/zarr/core/_info.py | 4 +- src/zarr/core/array.py | 24 +- src/zarr/core/array_spec.py | 6 +- src/zarr/core/codec_pipeline.py | 4 +- src/zarr/core/dtype/__init__.py | 12 +- src/zarr/core/dtype/common.py | 3 +- src/zarr/core/dtype/npy/bool.py | 4 +- src/zarr/core/dtype/npy/common.py | 36 +-- src/zarr/core/dtype/npy/complex.py | 4 +- src/zarr/core/dtype/npy/float.py | 4 +- src/zarr/core/dtype/npy/int.py | 20 +- src/zarr/core/dtype/npy/sized.py | 18 +- src/zarr/core/dtype/npy/string.py | 6 +- src/zarr/core/dtype/npy/time.py | 4 +- src/zarr/core/dtype/registry.py | 12 +- src/zarr/core/dtype/wrapper.py | 14 +- src/zarr/core/metadata/v2.py | 4 +- src/zarr/core/metadata/v3.py | 10 +- tests/package_with_entrypoint/__init__.py | 4 +- tests/test_dtype/__init__.py | 0 tests/{ => test_dtype}/test_dtype.py | 185 +++------------ tests/test_dtype/test_npy/test_common.py | 277 ++++++++++++++++++++++ tests/test_dtype/test_npy/test_int.py | 0 tests/test_dtype_registry.py | 158 ++++++++++++ 27 files changed, 570 insertions(+), 257 deletions(-) create mode 100644 tests/test_dtype/__init__.py rename tests/{ => test_dtype}/test_dtype.py (58%) create mode 100644 tests/test_dtype/test_npy/test_common.py create mode 100644 tests/test_dtype/test_npy/test_int.py create mode 100644 tests/test_dtype_registry.py diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index 31cb44d84e..d9e3520d42 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -15,7 +15,7 @@ from zarr.abc.store import ByteGetter, ByteSetter from zarr.core.array_spec import ArraySpec from zarr.core.chunk_grids import ChunkGrid - from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar + from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType from zarr.core.indexing import SelectorTuple __all__ = [ @@ -96,7 +96,7 @@ 
def validate( self, *, shape: ChunkCoords, - dtype: ZDType[_BaseDType, _BaseScalar], + dtype: ZDType[TBaseDType, TBaseScalar], chunk_grid: ChunkGrid, ) -> None: """Validates that the codec configuration is compatible with the array metadata. @@ -291,7 +291,7 @@ def supports_partial_encode(self) -> bool: ... @abstractmethod def validate( - self, *, shape: ChunkCoords, dtype: ZDType[_BaseDType, _BaseScalar], chunk_grid: ChunkGrid + self, *, shape: ChunkCoords, dtype: ZDType[TBaseDType, TBaseScalar], chunk_grid: ChunkGrid ) -> None: """Validates that all codec configurations are compatible with the array metadata. Raises errors when a codec configuration is not compatible. diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 12d709b599..882a956451 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -59,7 +59,7 @@ from typing import Self from zarr.core.common import JSON - from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar + from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType MAX_UINT_64 = 2**64 - 1 ShardMapping = Mapping[ChunkCoords, Buffer] @@ -409,7 +409,7 @@ def validate( self, *, shape: ChunkCoords, - dtype: ZDType[_BaseDType, _BaseScalar], + dtype: ZDType[TBaseDType, TBaseScalar], chunk_grid: ChunkGrid, ) -> None: if len(self.chunk_shape) != len(shape): diff --git a/src/zarr/codecs/transpose.py b/src/zarr/codecs/transpose.py index 0e49e3db10..b0ba7888c1 100644 --- a/src/zarr/codecs/transpose.py +++ b/src/zarr/codecs/transpose.py @@ -16,7 +16,7 @@ from zarr.core.buffer import NDBuffer from zarr.core.chunk_grids import ChunkGrid - from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar + from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType def parse_transpose_order(data: JSON | Iterable[int]) -> tuple[int, ...]: @@ -49,7 +49,7 @@ def to_dict(self) -> dict[str, JSON]: def validate( self, shape: tuple[int, ...], - dtype: ZDType[_BaseDType, _BaseScalar], + 
dtype: ZDType[TBaseDType, TBaseScalar], chunk_grid: ChunkGrid, ) -> None: if len(self.order) != len(shape): diff --git a/src/zarr/core/_info.py b/src/zarr/core/_info.py index 525b80c65f..e6d30413b4 100644 --- a/src/zarr/core/_info.py +++ b/src/zarr/core/_info.py @@ -9,7 +9,7 @@ from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec from zarr.core.common import ZarrFormat - from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar + from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType @dataclasses.dataclass(kw_only=True) @@ -80,7 +80,7 @@ class ArrayInfo: _type: Literal["Array"] = "Array" _zarr_format: ZarrFormat - _data_type: ZDType[_BaseDType, _BaseScalar] + _data_type: ZDType[TBaseDType, TBaseScalar] _shape: tuple[int, ...] _shard_shape: tuple[int, ...] | None = None _chunk_shape: tuple[int, ...] | None = None diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 4f97b049ff..20af68cab7 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -129,7 +129,7 @@ from zarr.abc.codec import CodecPipeline from zarr.codecs.sharding import ShardingCodecIndexLocation - from zarr.core.dtype.wrapper import _BaseDType, _BaseScalar + from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar from zarr.core.group import AsyncGroup from zarr.storage import StoreLike @@ -556,7 +556,7 @@ async def _create( *, # v2 and v3 shape: ShapeLike, - dtype: ZDTypeLike | ZDType[_BaseDType, _BaseScalar], + dtype: ZDTypeLike | ZDType[TBaseDType, TBaseScalar], zarr_format: ZarrFormat = 3, fill_value: Any | None = None, attributes: dict[str, JSON] | None = None, @@ -675,7 +675,7 @@ async def _create( @staticmethod def _create_metadata_v3( shape: ShapeLike, - dtype: ZDType[_BaseDType, _BaseScalar], + dtype: ZDType[TBaseDType, TBaseScalar], chunk_shape: ChunkCoords, fill_value: Any | None = None, chunk_key_encoding: ChunkKeyEncodingLike | None = None, @@ -726,7 +726,7 @@ async def _create_v3( store_path: StorePath, *, shape: 
ShapeLike, - dtype: ZDType[_BaseDType, _BaseScalar], + dtype: ZDType[TBaseDType, TBaseScalar], chunk_shape: ChunkCoords, config: ArrayConfig, fill_value: Any | None = None, @@ -774,7 +774,7 @@ async def _create_v3( @staticmethod def _create_metadata_v2( shape: ChunkCoords, - dtype: ZDType[_BaseDType, _BaseScalar], + dtype: ZDType[TBaseDType, TBaseScalar], chunks: ChunkCoords, order: MemoryOrder, dimension_separator: Literal[".", "/"] | None = None, @@ -804,7 +804,7 @@ async def _create_v2( store_path: StorePath, *, shape: ChunkCoords, - dtype: ZDType[_BaseDType, _BaseScalar], + dtype: ZDType[TBaseDType, TBaseScalar], chunks: ChunkCoords, order: MemoryOrder, config: ArrayConfig, @@ -1037,7 +1037,7 @@ def compressors(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[BytesBytesCodec ) @property - def _zdtype(self) -> ZDType[_BaseDType, _BaseScalar]: + def _zdtype(self) -> ZDType[TBaseDType, TBaseScalar]: """ The zarr-specific representation of the array data type """ @@ -1047,7 +1047,7 @@ def _zdtype(self) -> ZDType[_BaseDType, _BaseScalar]: return self.metadata.data_type @property - def dtype(self) -> _BaseDType: + def dtype(self) -> TBaseDType: """Returns the data type of the array. Returns @@ -4599,7 +4599,7 @@ def _parse_chunk_key_encoding( def _get_default_chunk_encoding_v3( - dtype: ZDType[_BaseDType, _BaseScalar], + dtype: ZDType[TBaseDType, TBaseScalar], ) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]: """ Get the default ArrayArrayCodecs, ArrayBytesCodec, and BytesBytesCodec for a given dtype. @@ -4619,7 +4619,7 @@ def _get_default_chunk_encoding_v3( def _get_default_chunk_encoding_v2( - dtype: ZDType[_BaseDType, _BaseScalar], + dtype: ZDType[TBaseDType, TBaseScalar], ) -> tuple[tuple[numcodecs.abc.Codec, ...] 
| None, numcodecs.abc.Codec | None]: """ Get the default chunk encoding for Zarr format 2 arrays, given a dtype @@ -4637,7 +4637,7 @@ def _parse_chunk_encoding_v2( *, compressor: CompressorsLike, filters: FiltersLike, - dtype: ZDType[_BaseDType, _BaseScalar], + dtype: ZDType[TBaseDType, TBaseScalar], ) -> tuple[tuple[numcodecs.abc.Codec, ...] | None, numcodecs.abc.Codec | None]: """ Generate chunk encoding classes for Zarr format 2 arrays with optional defaults. @@ -4681,7 +4681,7 @@ def _parse_chunk_encoding_v3( compressors: CompressorsLike, filters: FiltersLike, serializer: SerializerLike, - dtype: ZDType[_BaseDType, _BaseScalar], + dtype: ZDType[TBaseDType, TBaseScalar], ) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]: """ Generate chunk encoding classes for v3 arrays with optional defaults. diff --git a/src/zarr/core/array_spec.py b/src/zarr/core/array_spec.py index e8e451944f..279bf6edf0 100644 --- a/src/zarr/core/array_spec.py +++ b/src/zarr/core/array_spec.py @@ -17,7 +17,7 @@ from zarr.core.buffer import BufferPrototype from zarr.core.common import ChunkCoords - from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar + from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType class ArrayConfigParams(TypedDict): @@ -89,7 +89,7 @@ def parse_array_config(data: ArrayConfigLike | None) -> ArrayConfig: @dataclass(frozen=True) class ArraySpec: shape: ChunkCoords - dtype: ZDType[_BaseDType, _BaseScalar] + dtype: ZDType[TBaseDType, TBaseScalar] fill_value: Any config: ArrayConfig prototype: BufferPrototype @@ -97,7 +97,7 @@ class ArraySpec: def __init__( self, shape: ChunkCoords, - dtype: ZDType[_BaseDType, _BaseScalar], + dtype: ZDType[TBaseDType, TBaseScalar], fill_value: Any, config: ArrayConfig, prototype: BufferPrototype, diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index 71600fee90..3d00fe5467 100644 --- a/src/zarr/core/codec_pipeline.py +++ 
b/src/zarr/core/codec_pipeline.py @@ -27,7 +27,7 @@ from zarr.core.array_spec import ArraySpec from zarr.core.buffer import Buffer, BufferPrototype, NDBuffer from zarr.core.chunk_grids import ChunkGrid - from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar + from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType T = TypeVar("T") U = TypeVar("U") @@ -133,7 +133,7 @@ def __iter__(self) -> Iterator[Codec]: yield from self.bytes_bytes_codecs def validate( - self, *, shape: ChunkCoords, dtype: ZDType[_BaseDType, _BaseScalar], chunk_grid: ChunkGrid + self, *, shape: ChunkCoords, dtype: ZDType[TBaseDType, TBaseScalar], chunk_grid: ChunkGrid ) -> None: for codec in self: codec.validate(shape=shape, dtype=dtype, chunk_grid=chunk_grid) diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py index f535f62f35..1a18849a13 100644 --- a/src/zarr/core/dtype/__init__.py +++ b/src/zarr/core/dtype/__init__.py @@ -2,6 +2,7 @@ from typing import TYPE_CHECKING, TypeAlias, get_args +from zarr.core.dtype.common import DataTypeValidationError from zarr.core.dtype.npy.bool import Bool from zarr.core.dtype.npy.complex import Complex64, Complex128 from zarr.core.dtype.npy.float import Float16, Float32, Float64 @@ -26,11 +27,12 @@ VariableLengthString, ) from zarr.core.dtype.registry import DataTypeRegistry -from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar +from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType __all__ = [ "Complex64", "Complex128", + "DataTypeValidationError", "DateTime64", "FixedLengthAscii", "FixedLengthBytes", @@ -73,14 +75,14 @@ | TimeDelta64 ) -ZDTypeLike: TypeAlias = npt.DTypeLike | ZDType[_BaseDType, _BaseScalar] | dict[str, JSON] +ZDTypeLike: TypeAlias = npt.DTypeLike | ZDType[TBaseDType, TBaseScalar] | dict[str, JSON] for dtype in get_args(DTYPE): data_type_registry.register(dtype._zarr_v3_name, dtype) # TODO: find a better name for this function -def 
get_data_type_from_native_dtype(dtype: npt.DTypeLike) -> ZDType[_BaseDType, _BaseScalar]: +def get_data_type_from_native_dtype(dtype: npt.DTypeLike) -> ZDType[TBaseDType, TBaseScalar]: """ Get a data type wrapper (an instance of ``ZDType``) from a native data type, e.g. a numpy dtype. """ @@ -106,11 +108,11 @@ def get_data_type_from_native_dtype(dtype: npt.DTypeLike) -> ZDType[_BaseDType, def get_data_type_from_json( dtype: JSON, zarr_format: ZarrFormat -) -> ZDType[_BaseDType, _BaseScalar]: +) -> ZDType[TBaseDType, TBaseScalar]: return data_type_registry.match_json(dtype, zarr_format=zarr_format) -def parse_data_type(dtype: ZDTypeLike, zarr_format: ZarrFormat) -> ZDType[_BaseDType, _BaseScalar]: +def parse_data_type(dtype: ZDTypeLike, zarr_format: ZarrFormat) -> ZDType[TBaseDType, TBaseScalar]: """ Interpret the input as a ZDType instance. """ diff --git a/src/zarr/core/dtype/common.py b/src/zarr/core/dtype/common.py index 657f56bfb7..4249c57b1f 100644 --- a/src/zarr/core/dtype/common.py +++ b/src/zarr/core/dtype/common.py @@ -4,7 +4,8 @@ from typing import Literal Endianness = Literal["little", "big"] -JSONFloat = float | Literal["NaN", "Infinity", "-Infinity"] +SpecialFloats = Literal["NaN", "Infinity", "-Infinity"] +JSONFloat = float | SpecialFloats class DataTypeValidationError(ValueError): ... 
diff --git a/src/zarr/core/dtype/npy/bool.py b/src/zarr/core/dtype/npy/bool.py index 293d8383c0..776acf4f8c 100644 --- a/src/zarr/core/dtype/npy/bool.py +++ b/src/zarr/core/dtype/npy/bool.py @@ -5,7 +5,7 @@ from zarr.core.common import JSON, ZarrFormat from zarr.core.dtype.npy.common import check_json_bool -from zarr.core.dtype.wrapper import ZDType, _BaseDType +from zarr.core.dtype.wrapper import TBaseDType, ZDType @dataclass(frozen=True, kw_only=True, slots=True) @@ -26,7 +26,7 @@ class Bool(ZDType[np.dtypes.BoolDType, np.bool_]): dtype_cls = np.dtypes.BoolDType @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: return cls() def to_dtype(self: Self) -> np.dtypes.BoolDType: diff --git a/src/zarr/core/dtype/npy/common.py b/src/zarr/core/dtype/npy/common.py index 857c515c19..8ef1286e6f 100644 --- a/src/zarr/core/dtype/npy/common.py +++ b/src/zarr/core/dtype/npy/common.py @@ -77,7 +77,7 @@ def endianness_from_numpy_str(endianness: EndiannessNumpy) -> Endianness | None: # for dtypes without byte ordering semantics return None raise ValueError( - f"Invalid endianness: {endianness}. Expected one of {get_args(EndiannessNumpy)}" + f"Invalid endianness: {endianness!r}. Expected one of {get_args(EndiannessNumpy)}" ) @@ -108,7 +108,7 @@ def endianness_to_numpy_str(endianness: Endianness | None) -> EndiannessNumpy: case None: return "|" raise ValueError( - f"Invalid endianness: {endianness}. Expected one of {get_args(Endianness)} or None" + f"Invalid endianness: {endianness!r}. Expected one of {get_args(Endianness)} or None" ) @@ -155,7 +155,7 @@ def float_from_json_v3(data: JSONFloat) -> float: return float_from_json_v2(data) -def float_from_json(data: JSONFloat, zarr_format: ZarrFormat) -> float: +def float_from_json(data: JSONFloat, *, zarr_format: ZarrFormat) -> float: """ Convert a JSON float to a float based on zarr format. 
@@ -177,7 +177,7 @@ def float_from_json(data: JSONFloat, zarr_format: ZarrFormat) -> float: return float_from_json_v3(data) -def bytes_from_json(data: str, zarr_format: ZarrFormat) -> bytes: +def bytes_from_json(data: str, *, zarr_format: ZarrFormat) -> bytes: """ Convert a JSON string to bytes @@ -198,7 +198,7 @@ def bytes_from_json(data: str, zarr_format: ZarrFormat) -> bytes: # TODO: differentiate these as needed. This is a spec question. if zarr_format == 3: return base64.b64decode(data.encode("ascii")) - raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") + raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") # pragma: no cover def bytes_to_json(data: bytes, zarr_format: ZarrFormat) -> str: @@ -261,9 +261,11 @@ def float_to_json_v3(data: float | np.floating[Any]) -> JSONFloat: return float_to_json_v2(data) -def complex_to_json_v3(data: complex | np.complexfloating[Any, Any]) -> tuple[JSONFloat, JSONFloat]: +def complex_float_to_json_v3( + data: complex | np.complexfloating[Any, Any], +) -> tuple[JSONFloat, JSONFloat]: """ - Convert a complex number to JSON (v3). + Convert a complex number to JSON as defined by the Zarr V3 spec. Parameters ---------- @@ -278,13 +280,15 @@ def complex_to_json_v3(data: complex | np.complexfloating[Any, Any]) -> tuple[JS return float_to_json_v3(data.real), float_to_json_v3(data.imag) -def complex_to_json_v2(data: complex | np.complexfloating[Any, Any]) -> tuple[JSONFloat, JSONFloat]: +def complex_float_to_json_v2( + data: complex | np.complexfloating[Any, Any], +) -> tuple[JSONFloat, JSONFloat]: """ - Convert a complex number to JSON (v2). + Convert a complex number to JSON as defined by the Zarr V2 spec. Parameters ---------- - data : complex or np.complexfloating + data : complex | np.complexfloating The complex value to convert. 
Returns @@ -296,14 +300,14 @@ def complex_to_json_v2(data: complex | np.complexfloating[Any, Any]) -> tuple[JS def complex_float_to_json( - data: complex | np.complexfloating[Any, Any], zarr_format: ZarrFormat + data: complex | np.complexfloating[Any, Any], *, zarr_format: ZarrFormat ) -> tuple[JSONFloat, JSONFloat]: """ Convert a complex number to JSON, parametrized by the zarr format version. Parameters ---------- - data : complex or np.complexfloating + data : complex | np.complexfloating The complex value to convert. zarr_format : ZarrFormat The zarr format version. @@ -314,19 +318,19 @@ def complex_float_to_json( The JSON representation of the complex number. """ if zarr_format == 2: - return complex_to_json_v2(data) + return complex_float_to_json_v2(data) else: - return complex_to_json_v3(data) + return complex_float_to_json_v3(data) raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") -def float_to_json(data: float | np.floating[Any], zarr_format: ZarrFormat) -> JSONFloat: +def float_to_json(data: float | np.floating[Any], *, zarr_format: ZarrFormat) -> JSONFloat: """ Convert a float to JSON, parametrized by the zarr format version. Parameters ---------- - data : float or np.floating + data : float | np.floating The float value to convert. zarr_format : ZarrFormat The zarr format version. 
diff --git a/src/zarr/core/dtype/npy/complex.py b/src/zarr/core/dtype/npy/complex.py index 22e1bd66a3..6e19266660 100644 --- a/src/zarr/core/dtype/npy/complex.py +++ b/src/zarr/core/dtype/npy/complex.py @@ -21,7 +21,7 @@ endianness_from_numpy_str, endianness_to_numpy_str, ) -from zarr.core.dtype.wrapper import ZDType, _BaseDType +from zarr.core.dtype.wrapper import TBaseDType, ZDType if TYPE_CHECKING: from zarr.core.dtype.npy.common import EndiannessNumpy @@ -33,7 +33,7 @@ class BaseComplex(ZDType[TComplexDType_co, TComplexScalar_co], HasEndianness): _zarr_v2_names: ClassVar[tuple[str, ...]] @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) diff --git a/src/zarr/core/dtype/npy/float.py b/src/zarr/core/dtype/npy/float.py index 3f56919cf4..15baaaadaa 100644 --- a/src/zarr/core/dtype/npy/float.py +++ b/src/zarr/core/dtype/npy/float.py @@ -16,7 +16,7 @@ float_from_json, float_to_json, ) -from zarr.core.dtype.wrapper import ZDType, _BaseDType +from zarr.core.dtype.wrapper import TBaseDType, ZDType @dataclass(frozen=True) @@ -25,7 +25,7 @@ class BaseFloat(ZDType[TFloatDType_co, TFloatScalar_co], HasEndianness): _zarr_v2_names: ClassVar[tuple[str, ...]] @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) diff --git a/src/zarr/core/dtype/npy/int.py b/src/zarr/core/dtype/npy/int.py index 500f98bb73..7da7245162 100644 --- a/src/zarr/core/dtype/npy/int.py +++ b/src/zarr/core/dtype/npy/int.py @@ -11,7 +11,7 @@ endianness_from_numpy_str, endianness_to_numpy_str, ) -from zarr.core.dtype.wrapper import ZDType, _BaseDType +from zarr.core.dtype.wrapper import TBaseDType, ZDType _NumpyIntDType = ( 
np.dtypes.Int8DType @@ -132,7 +132,7 @@ class Int8(BaseInt[np.dtypes.Int8DType, np.int8]): _zarr_v2_names: ClassVar[tuple[str, ...]] = ("|i1",) @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: return cls() def to_dtype(self: Self) -> np.dtypes.Int8DType: @@ -150,7 +150,7 @@ class UInt8(BaseInt[np.dtypes.UInt8DType, np.uint8]): _zarr_v2_names: ClassVar[tuple[str, ...]] = ("|u1",) @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: return cls() def to_dtype(self: Self) -> np.dtypes.UInt8DType: @@ -168,7 +168,7 @@ class Int16(BaseInt[np.dtypes.Int16DType, np.int16], HasEndianness): _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i2", " Self: + def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) @@ -193,7 +193,7 @@ class UInt16(BaseInt[np.dtypes.UInt16DType, np.uint16], HasEndianness): _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u2", " Self: + def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) @@ -217,7 +217,7 @@ class Int32(BaseInt[np.dtypes.Int32DType, np.int32], HasEndianness): _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i4", " Self: + def from_dtype(cls: type[Self], dtype: TBaseDType) -> Self: # We override the base implementation to address a windows-specific, pre-numpy 2 issue where # ``np.dtype('i')`` is an instance of ``np.dtypes.IntDType`` that acts like `int32` instead of ``np.dtype('int32')`` # In this case, ``type(np.dtype('i')) == np.dtypes.Int32DType`` will evaluate to ``True``, @@ -229,7 +229,7 @@ def from_dtype(cls: type[Self], dtype: _BaseDType) -> Self: return super().from_dtype(dtype) @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + 
def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) @@ -253,7 +253,7 @@ class UInt32(BaseInt[np.dtypes.UInt32DType, np.uint32], HasEndianness): _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u4", " Self: + def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) @@ -277,7 +277,7 @@ class Int64(BaseInt[np.dtypes.Int64DType, np.int64], HasEndianness): _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i8", " Self: + def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) @@ -301,7 +301,7 @@ class UInt64(BaseInt[np.dtypes.UInt64DType, np.uint64], HasEndianness): _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u8", " Self: + def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) diff --git a/src/zarr/core/dtype/npy/sized.py b/src/zarr/core/dtype/npy/sized.py index 8d8ff57800..d9524a4891 100644 --- a/src/zarr/core/dtype/npy/sized.py +++ b/src/zarr/core/dtype/npy/sized.py @@ -16,7 +16,7 @@ endianness_from_numpy_str, endianness_to_numpy_str, ) -from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar +from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType @dataclass(frozen=True, kw_only=True) @@ -26,7 +26,7 @@ class FixedLengthAscii(ZDType[np.dtypes.BytesDType[int], np.bytes_], HasLength): item_size_bits: ClassVar[int] = 8 @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: return cls(length=dtype.itemsize // (cls.item_size_bits // 8)) def to_dtype(self) -> np.dtypes.BytesDType[int]: @@ -98,7 +98,7 @@ class 
FixedLengthBytes(ZDType[np.dtypes.VoidDType[int], np.void], HasLength): item_size_bits: ClassVar[int] = 8 @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: return cls(length=dtype.itemsize // (cls.item_size_bits // 8)) def to_dtype(self) -> np.dtypes.VoidDType[int]: @@ -136,7 +136,7 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod - def check_dtype(cls: type[Self], dtype: _BaseDType) -> TypeGuard[np.dtypes.VoidDType[Any]]: + def check_dtype(cls: type[Self], dtype: TBaseDType) -> TypeGuard[np.dtypes.VoidDType[Any]]: """ Numpy void dtype comes in two forms: * If the ``fields`` attribute is ``None``, then the dtype represents N raw bytes. @@ -181,7 +181,7 @@ class FixedLengthUnicode(ZDType[np.dtypes.StrDType[int], np.str_], HasEndianness item_size_bits: ClassVar[int] = 32 # UCS4 is 32 bits per code point @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls( length=dtype.itemsize // (cls.item_size_bits // 8), @@ -252,7 +252,7 @@ def _cast_value_unsafe(self, value: object) -> np.str_: class Structured(ZDType[np.dtypes.VoidDType[int], np.void]): dtype_cls = np.dtypes.VoidDType # type: ignore[assignment] _zarr_v3_name = "structured" - fields: tuple[tuple[str, ZDType[_BaseDType, _BaseScalar]], ...] + fields: tuple[tuple[str, ZDType[TBaseDType, TBaseScalar]], ...] 
def default_value(self) -> np.void: return self._cast_value_unsafe(0) @@ -261,7 +261,7 @@ def _cast_value_unsafe(self, value: object) -> np.void: return cast("np.void", np.array([value], dtype=self.to_dtype())[0]) @classmethod - def check_dtype(cls, dtype: _BaseDType) -> TypeGuard[np.dtypes.VoidDType[int]]: + def check_dtype(cls, dtype: TBaseDType) -> TypeGuard[np.dtypes.VoidDType[int]]: """ Check that this dtype is a numpy structured dtype @@ -278,10 +278,10 @@ def check_dtype(cls, dtype: _BaseDType) -> TypeGuard[np.dtypes.VoidDType[int]]: return super().check_dtype(dtype) and dtype.fields is not None @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: from zarr.core.dtype import get_data_type_from_native_dtype - fields: list[tuple[str, ZDType[_BaseDType, _BaseScalar]]] = [] + fields: list[tuple[str, ZDType[TBaseDType, TBaseScalar]]] = [] if dtype.fields is None: raise ValueError("numpy dtype has no fields") diff --git a/src/zarr/core/dtype/npy/string.py b/src/zarr/core/dtype/npy/string.py index 15ccfb30f1..3849fd05ce 100644 --- a/src/zarr/core/dtype/npy/string.py +++ b/src/zarr/core/dtype/npy/string.py @@ -10,7 +10,7 @@ if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat - from zarr.core.dtype.wrapper import _BaseDType + from zarr.core.dtype.wrapper import TBaseDType _NUMPY_SUPPORTS_VLEN_STRING = hasattr(np.dtypes, "StringDType") @@ -23,7 +23,7 @@ class VariableLengthString(ZDType[np.dtypes.StringDType, str]): # type: ignore[ _zarr_v3_name = "numpy.variable_length_utf8" @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: return cls() def to_dtype(self) -> np.dtypes.StringDType: @@ -83,7 +83,7 @@ class VariableLengthString(ZDType[np.dtypes.ObjectDType, str]): # type: ignore[ _zarr_v3_name = "numpy.variable_length_utf8" @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + def 
_from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: return cls() def to_dtype(self) -> np.dtypes.ObjectDType: diff --git a/src/zarr/core/dtype/npy/time.py b/src/zarr/core/dtype/npy/time.py index 056836a105..f691bd88c8 100644 --- a/src/zarr/core/dtype/npy/time.py +++ b/src/zarr/core/dtype/npy/time.py @@ -25,7 +25,7 @@ endianness_from_numpy_str, endianness_to_numpy_str, ) -from zarr.core.dtype.wrapper import ZDType, _BaseDType +from zarr.core.dtype.wrapper import TBaseDType, ZDType if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat @@ -109,7 +109,7 @@ class TimeDTypeBase(ZDType[_BaseTimeDType_co, _BaseTimeScalar], HasEndianness): unit: DateTimeUnit @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: unit, interval = np.datetime_data(dtype.name) byteorder = cast("EndiannessNumpy", dtype.byteorder) return cls(unit=unit, interval=interval, endianness=endianness_from_numpy_str(byteorder)) # type: ignore[arg-type] diff --git a/src/zarr/core/dtype/registry.py b/src/zarr/core/dtype/registry.py index 4ad2158f96..ae5c3d426e 100644 --- a/src/zarr/core/dtype/registry.py +++ b/src/zarr/core/dtype/registry.py @@ -9,7 +9,7 @@ from importlib.metadata import EntryPoint from zarr.core.common import JSON, ZarrFormat - from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar + from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType # This class is different from the other registry classes, which inherit from @@ -17,7 +17,7 @@ # have just 1 registry class in use. 
@dataclass(frozen=True, kw_only=True) class DataTypeRegistry: - contents: dict[str, type[ZDType[_BaseDType, _BaseScalar]]] = field( + contents: dict[str, type[ZDType[TBaseDType, TBaseScalar]]] = field( default_factory=dict, init=False ) lazy_load_list: list[EntryPoint] = field(default_factory=list, init=False) @@ -28,15 +28,15 @@ def lazy_load(self) -> None: self.lazy_load_list.clear() - def register(self: Self, key: str, cls: type[ZDType[_BaseDType, _BaseScalar]]) -> None: + def register(self: Self, key: str, cls: type[ZDType[TBaseDType, TBaseScalar]]) -> None: # don't register the same dtype twice if key not in self.contents or self.contents[key] != cls: self.contents[key] = cls - def get(self, key: str) -> type[ZDType[_BaseDType, _BaseScalar]]: + def get(self, key: str) -> type[ZDType[TBaseDType, TBaseScalar]]: return self.contents[key] - def match_dtype(self, dtype: _BaseDType) -> ZDType[_BaseDType, _BaseScalar]: + def match_dtype(self, dtype: TBaseDType) -> ZDType[TBaseDType, TBaseScalar]: self.lazy_load() for val in self.contents.values(): try: @@ -45,7 +45,7 @@ def match_dtype(self, dtype: _BaseDType) -> ZDType[_BaseDType, _BaseScalar]: pass raise ValueError(f"No data type wrapper found that matches dtype '{dtype}'") - def match_json(self, data: JSON, zarr_format: ZarrFormat) -> ZDType[_BaseDType, _BaseScalar]: + def match_json(self, data: JSON, zarr_format: ZarrFormat) -> ZDType[TBaseDType, TBaseScalar]: self.lazy_load() for val in self.contents.values(): try: diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index ba1b78f096..be51db3ae5 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -35,15 +35,15 @@ # This the upper bound for the scalar types we support. It's numpy scalars + str, # because the new variable-length string dtype in numpy does not have a corresponding scalar type -_BaseScalar = np.generic | str +TBaseScalar = np.generic | str # This is the bound for the dtypes that we support. 
If we support non-numpy dtypes, # then this bound will need to be widened. -_BaseDType = np.dtype[np.generic] +TBaseDType = np.dtype[np.generic] # These two type parameters are covariant because we want # x : ZDType[BaseDType, BaseScalar] = ZDType[SubDType, SubScalar] # to type check -TScalar_co = TypeVar("TScalar_co", bound=_BaseScalar, covariant=True) -TDType_co = TypeVar("TDType_co", bound=_BaseDType, covariant=True) +TScalar_co = TypeVar("TScalar_co", bound=TBaseScalar, covariant=True) +TDType_co = TypeVar("TDType_co", bound=TBaseDType, covariant=True) @dataclass(frozen=True, kw_only=True, slots=True) @@ -69,7 +69,7 @@ class ZDType(Generic[TDType_co, TScalar_co], ABC): _zarr_v3_name: ClassVar[str] @classmethod - def check_dtype(cls: type[Self], dtype: _BaseDType) -> TypeGuard[TDType_co]: + def check_dtype(cls: type[Self], dtype: TBaseDType) -> TypeGuard[TDType_co]: """ Check that a data type matches the dtype_cls class attribute. Used as a type guard. @@ -86,7 +86,7 @@ def check_dtype(cls: type[Self], dtype: _BaseDType) -> TypeGuard[TDType_co]: return type(dtype) is cls.dtype_cls @classmethod - def from_dtype(cls: type[Self], dtype: _BaseDType) -> Self: + def from_dtype(cls: type[Self], dtype: TBaseDType) -> Self: """ Wrap a dtype object. @@ -113,7 +113,7 @@ def from_dtype(cls: type[Self], dtype: _BaseDType) -> Self: @classmethod @abstractmethod - def _from_dtype_unsafe(cls: type[Self], dtype: _BaseDType) -> Self: + def _from_dtype_unsafe(cls: type[Self], dtype: TBaseDType) -> Self: """ Wrap a native dtype without checking. 
diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index aa2837f598..ab3da36cfe 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -9,7 +9,7 @@ from zarr.abc.metadata import Metadata from zarr.core.dtype import get_data_type_from_native_dtype -from zarr.core.dtype.wrapper import TDType_co, TScalar_co, ZDType, _BaseDType, _BaseScalar +from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, TDType_co, TScalar_co, ZDType if TYPE_CHECKING: from typing import Literal, Self @@ -45,7 +45,7 @@ class ArrayV2MetadataDict(TypedDict): class ArrayV2Metadata(Metadata): shape: ChunkCoords chunks: ChunkCoords - dtype: ZDType[_BaseDType, _BaseScalar] + dtype: ZDType[TBaseDType, TBaseScalar] fill_value: int | float | str | bytes | None = 0 order: MemoryOrder = "C" filters: tuple[numcodecs.abc.Codec, ...] | None = None diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index b82fb54270..fe8ced1d3f 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -16,7 +16,7 @@ from zarr.core.buffer import Buffer, BufferPrototype from zarr.core.chunk_grids import ChunkGrid from zarr.core.common import JSON, ChunkCoords - from zarr.core.dtype.wrapper import _BaseDType, _BaseScalar + from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar import json @@ -82,7 +82,7 @@ def validate_array_bytes_codec(codecs: tuple[Codec, ...]) -> ArrayBytesCodec: return abcs[0] -def validate_codecs(codecs: tuple[Codec, ...], dtype: ZDType[_BaseDType, _BaseScalar]) -> None: +def validate_codecs(codecs: tuple[Codec, ...], dtype: ZDType[TBaseDType, TBaseScalar]) -> None: """Check that the codecs are valid for the given dtype""" from zarr.codecs.sharding import ShardingCodec @@ -141,7 +141,7 @@ class ArrayV3MetadataDict(TypedDict): @dataclass(frozen=True, kw_only=True) class ArrayV3Metadata(Metadata): shape: ChunkCoords - data_type: ZDType[_BaseDType, _BaseScalar] + data_type: ZDType[TBaseDType, TBaseScalar] 
chunk_grid: ChunkGrid chunk_key_encoding: ChunkKeyEncoding fill_value: Any @@ -156,7 +156,7 @@ def __init__( self, *, shape: Iterable[int], - data_type: ZDType[_BaseDType, _BaseScalar], + data_type: ZDType[TBaseDType, TBaseScalar], chunk_grid: dict[str, JSON] | ChunkGrid, chunk_key_encoding: ChunkKeyEncodingLike, fill_value: object, @@ -222,7 +222,7 @@ def ndim(self) -> int: return len(self.shape) @property - def dtype(self) -> ZDType[_BaseDType, _BaseScalar]: + def dtype(self) -> ZDType[TBaseDType, TBaseScalar]: return self.data_type @property diff --git a/tests/package_with_entrypoint/__init__.py b/tests/package_with_entrypoint/__init__.py index 941f7e71c2..3b46740c35 100644 --- a/tests/package_with_entrypoint/__init__.py +++ b/tests/package_with_entrypoint/__init__.py @@ -8,7 +8,7 @@ from zarr.codecs import BytesCodec from zarr.core.array_spec import ArraySpec from zarr.core.buffer import Buffer, NDBuffer -from zarr.core.common import BytesLike +from zarr.core.common import BytesLike, ZarrFormat from zarr.core.dtype.npy.bool import Bool @@ -81,5 +81,5 @@ def from_json(cls, data: Any, zarr_format: Literal[2, 3]) -> Self: return cls() raise ValueError - def to_json(self, zarr_format): + def to_json(self, zarr_format: ZarrFormat) -> str: return self._zarr_v3_name diff --git a/tests/test_dtype/__init__.py b/tests/test_dtype/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/test_dtype.py b/tests/test_dtype/test_dtype.py similarity index 58% rename from tests/test_dtype.py rename to tests/test_dtype/test_dtype.py index 2b520383b1..566a04b5fb 100644 --- a/tests/test_dtype.py +++ b/tests/test_dtype/test_dtype.py @@ -1,48 +1,42 @@ from __future__ import annotations -import os -import re -import sys from typing import TYPE_CHECKING, Any, get_args -import zarr -from zarr.core.config import config -from zarr.core.dtype.npy.bool import Bool -from zarr.core.dtype.npy.complex import Complex64, Complex128 -from zarr.core.dtype.npy.float import 
Float16, Float32, Float64 -from zarr.core.dtype.npy.int import Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64 -from zarr.core.dtype.npy.sized import FixedLengthAscii, FixedLengthBytes, FixedLengthUnicode -from zarr.core.dtype.npy.time import DateTime64 +from zarr.core.dtype import ( + DTYPE, + Bool, + Complex64, + Complex128, + DateTime64, + FixedLengthAscii, + FixedLengthBytes, + FixedLengthUnicode, + Float16, + Float32, + Float64, + Int8, + Int16, + Int32, + Int64, + Structured, + UInt8, + UInt16, + UInt32, + UInt64, + VariableLengthString, + ZDType, +) from .conftest import zdtype_examples if TYPE_CHECKING: - from collections.abc import Generator - from zarr.core.common import ZarrFormat - from zarr.core.dtype.wrapper import _BaseDType, _BaseScalar + from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar import numpy as np import pytest -from zarr.core.dtype import ( - DTYPE, - VariableLengthString, - ZDType, - data_type_registry, - get_data_type_from_json, -) from zarr.core.dtype.common import DataTypeValidationError -from zarr.core.dtype.npy.sized import ( - Structured, -) -from zarr.core.dtype.registry import DataTypeRegistry - - -@pytest.fixture -def data_type_registry_fixture() -> DataTypeRegistry: - return DataTypeRegistry() - _NUMPY_SUPPORTS_VLEN_STRING = hasattr(np.dtypes, "StringDType") VLEN_STRING_DTYPE: np.dtypes.StringDType | np.dtypes.ObjectDType @@ -177,7 +171,7 @@ def test_default_value(wrapper: ZDType[Any, Any], expected_default: Any) -> None ], ) def test_to_json_value_v2( - wrapper: ZDType[_BaseDType, _BaseScalar], input_value: Any, expected_json: Any + wrapper: ZDType[TBaseDType, TBaseScalar], input_value: Any, expected_json: Any ) -> None: """ Test the to_json_value method for each dtype wrapper for zarr v2 @@ -213,7 +207,7 @@ def test_to_json_value_v2( ], ) def test_to_json_value_v3( - wrapper: ZDType[_BaseDType, _BaseScalar], input_value: Any, expected_json: Any + wrapper: ZDType[TBaseDType, TBaseScalar], input_value: Any, 
expected_json: Any ) -> None: """ Test the to_json_value method for each dtype wrapper for zarr v3 @@ -246,132 +240,9 @@ def test_to_json_value_v3( ], ) def test_from_json_value( - wrapper: ZDType[_BaseDType, _BaseScalar], json_value: Any, expected_value: Any + wrapper: ZDType[TBaseDType, TBaseScalar], json_value: Any, expected_value: Any ) -> None: """ Test the from_json_value method for each dtype wrapper. """ assert wrapper.from_json_value(json_value, zarr_format=2) == expected_value - - -class TestRegistry: - @staticmethod - def test_register(data_type_registry_fixture: DataTypeRegistry) -> None: - """ - Test that registering a dtype in a data type registry works. - """ - data_type_registry_fixture.register(Bool._zarr_v3_name, Bool) - assert data_type_registry_fixture.get(Bool._zarr_v3_name) == Bool - assert isinstance(data_type_registry_fixture.match_dtype(np.dtype("bool")), Bool) - - @staticmethod - def test_override(data_type_registry_fixture: DataTypeRegistry) -> None: - """ - Test that registering a new dtype with the same name works (overriding the previous one). - """ - data_type_registry_fixture.register(Bool._zarr_v3_name, Bool) - - class NewBool(Bool): - def default_value(self) -> np.bool_: - return np.True_ - - data_type_registry_fixture.register(NewBool._zarr_v3_name, NewBool) - assert isinstance(data_type_registry_fixture.match_dtype(np.dtype("bool")), NewBool) - - @staticmethod - @pytest.mark.parametrize( - ("wrapper_cls", "dtype_str"), [(Bool, "bool"), (FixedLengthUnicode, "|U4")] - ) - def test_match_dtype( - data_type_registry_fixture: DataTypeRegistry, - wrapper_cls: type[ZDType[_BaseDType, _BaseScalar]], - dtype_str: str, - ) -> None: - """ - Test that match_dtype resolves a numpy dtype into an instance of the correspond wrapper for that dtype. 
- """ - data_type_registry_fixture.register(wrapper_cls._zarr_v3_name, wrapper_cls) - assert isinstance(data_type_registry_fixture.match_dtype(np.dtype(dtype_str)), wrapper_cls) - - @staticmethod - def test_unregistered_dtype(data_type_registry_fixture: DataTypeRegistry) -> None: - """ - Test that match_dtype raises an error if the dtype is not registered. - """ - outside_dtype = "int8" - with pytest.raises( - ValueError, match=f"No data type wrapper found that matches dtype '{outside_dtype}'" - ): - data_type_registry_fixture.match_dtype(np.dtype(outside_dtype)) - - with pytest.raises(KeyError): - data_type_registry_fixture.get(outside_dtype) - - @staticmethod - @pytest.mark.parametrize("zdtype", zdtype_examples) - def test_registered_dtypes( - zdtype: ZDType[_BaseDType, _BaseScalar], zarr_format: ZarrFormat - ) -> None: - """ - Test that the registered dtypes can be retrieved from the registry. - """ - - assert data_type_registry.match_dtype(zdtype.to_dtype()) == zdtype - assert ( - data_type_registry.match_json( - zdtype.to_json(zarr_format=zarr_format), zarr_format=zarr_format - ) - == zdtype - ) - - @staticmethod - @pytest.mark.parametrize("zdtype", zdtype_examples) - def test_match_dtype_unique( - zdtype: ZDType[Any, Any], - data_type_registry_fixture: DataTypeRegistry, - zarr_format: ZarrFormat, - ) -> None: - """ - Test that the match_dtype method uniquely specifies a registered data type. 
We create a local registry - that excludes the data type class being tested, and ensure that an instance of the wrapped data type - fails to match anything in the registry - """ - for _cls in get_args(DTYPE): - if _cls is not type(zdtype): - data_type_registry_fixture.register(_cls._zarr_v3_name, _cls) - - dtype_instance = zdtype.to_dtype() - - msg = f"No data type wrapper found that matches dtype '{dtype_instance}'" - with pytest.raises(ValueError, match=re.escape(msg)): - data_type_registry_fixture.match_dtype(dtype_instance) - - instance_dict = zdtype.to_json(zarr_format=zarr_format) - msg = f"No data type wrapper found that matches {instance_dict}" - with pytest.raises(ValueError, match=re.escape(msg)): - data_type_registry_fixture.match_json(instance_dict, zarr_format=zarr_format) - - -# this is copied from the registry tests -- we should deduplicate -here = os.path.abspath(os.path.dirname(__file__)) - - -@pytest.fixture -def set_path() -> Generator[None, None, None]: - sys.path.append(here) - zarr.registry._collect_entrypoints() - yield - sys.path.remove(here) - registries = zarr.registry._collect_entrypoints() - for registry in registries: - registry.lazy_load_list.clear() - config.reset() - - -@pytest.mark.usefixtures("set_path") -def test_entrypoint_codec(zarr_format: ZarrFormat) -> None: - from package_with_entrypoint import TestDataType - - instance = TestDataType() - dtype_json = instance.to_json(zarr_format=zarr_format) - assert get_data_type_from_json(dtype_json, zarr_format=zarr_format) == instance diff --git a/tests/test_dtype/test_npy/test_common.py b/tests/test_dtype/test_npy/test_common.py new file mode 100644 index 0000000000..f3082d0c3b --- /dev/null +++ b/tests/test_dtype/test_npy/test_common.py @@ -0,0 +1,277 @@ +from __future__ import annotations + +import base64 +import math +import re +import sys +from typing import TYPE_CHECKING, Any, get_args + +import numpy as np +import pytest + +from zarr.core.dtype.common import Endianness, 
JSONFloat, SpecialFloats +from zarr.core.dtype.npy.common import ( + EndiannessNumpy, + bytes_from_json, + bytes_to_json, + check_json_float, + check_json_float_v2, + check_json_float_v3, + check_json_int, + complex_float_to_json, + complex_float_to_json_v2, + complex_float_to_json_v3, + endianness_from_numpy_str, + endianness_to_numpy_str, + float_from_json, + float_from_json_v2, + float_from_json_v3, + float_to_json_v2, + float_to_json_v3, +) + +if TYPE_CHECKING: + from zarr.core.common import ZarrFormat + + +def nan_equal(a: object, b: object) -> bool: + """ + Convenience function for equality comparison between two values ``a`` and ``b``, that might both + be NaN. Returns True if both ``a`` and ``b`` are NaN, otherwise returns a == b + """ + if math.isnan(a) and math.isnan(b): # type: ignore[arg-type] + return True + return a == b + + +json_float_v2: list[tuple[JSONFloat, float | np.floating[Any]]] = [ + ("Infinity", float("inf")), + ("Infinity", np.inf), + ("-Infinity", float("-inf")), + ("-Infinity", -np.inf), + ("NaN", float("nan")), + ("NaN", np.nan), + (1.0, 1.0), +] + +# exactly the same as v2, for now, until we get support for the special NaN encoding defined in the +# v3 spec +json_float_v3: list[tuple[JSONFloat, float | np.floating[Any]]] = [ + ("Infinity", float("inf")), + ("Infinity", np.inf), + ("-Infinity", float("-inf")), + ("-Infinity", -np.inf), + ("NaN", float("nan")), + ("NaN", np.nan), + (1.0, 1.0), +] + + +@pytest.mark.parametrize( + ("data", "expected"), + [(">", "big"), ("<", "little"), ("=", sys.byteorder), ("|", None), ("err", "")], +) +def test_endianness_from_numpy_str(data: str, expected: str | None) -> None: + """ + Test that endianness_from_numpy_str correctly converts a numpy str literal to a human-readable literal value. 
+ This test also checks that an invalid string input raises a ``ValueError`` + """ + if data in get_args(EndiannessNumpy): + assert endianness_from_numpy_str(data) == expected # type: ignore[arg-type] + else: + msg = f"Invalid endianness: {data!r}. Expected one of {get_args(EndiannessNumpy)}" + with pytest.raises(ValueError, match=re.escape(msg)): + endianness_from_numpy_str(data) # type: ignore[arg-type] + + +@pytest.mark.parametrize( + ("data", "expected"), + [("big", ">"), ("little", "<"), (None, "|"), ("err", "")], +) +def test_endianness_to_numpy_str(data: str | None, expected: str) -> None: + """ + Test that endianness_to_numpy_str correctly converts a human-readable literal value to a numpy str literal. + This test also checks that an invalid string input raises a ``ValueError`` + """ + if data in get_args(Endianness) + (None,): + assert endianness_to_numpy_str(data) == expected # type: ignore[arg-type] + else: + msg = f"Invalid endianness: {data!r}. Expected one of {get_args(Endianness)}" + with pytest.raises(ValueError, match=re.escape(msg)): + endianness_to_numpy_str(data) # type: ignore[arg-type] + + +@pytest.mark.parametrize(("data", "expected"), json_float_v2 + [("SHOULD_ERR", "")]) +def test_float_from_json_v2(data: JSONFloat | str, expected: float | str) -> None: + """ + Test that float_from_json_v2 correctly converts a JSON string representation of a float to a float. 
+ This test also checks that an invalid string input raises a ``ValueError`` + """ + if data in get_args(SpecialFloats) or isinstance(data, float): + assert nan_equal(float_from_json_v2(data), expected) # type: ignore[arg-type] + else: + msg = f"could not convert string to float: {data!r}" + with pytest.raises(ValueError, match=msg): + float_from_json_v2(data) # type: ignore[arg-type] + + +@pytest.mark.parametrize(("data", "expected"), json_float_v3 + [("SHOULD_ERR", "")]) +def test_float_from_json_v3(data: JSONFloat | str, expected: float | str) -> None: + """ + Test that float_from_json_v3 correctly converts a JSON string representation of a float to a float. + This test also checks that an invalid string input raises a ``ValueError`` + """ + if data in get_args(SpecialFloats) or isinstance(data, float): + assert nan_equal(float_from_json_v3(data), expected) # type: ignore[arg-type] + else: + msg = f"could not convert string to float: {data!r}" + with pytest.raises(ValueError, match=msg): + float_from_json_v3(data) # type: ignore[arg-type] + + +@pytest.mark.parametrize(("data", "expected"), json_float_v2) +def test_float_from_json(data: JSONFloat, expected: float | str, zarr_format: ZarrFormat) -> None: + """ + Test that float_from_json_v3 correctly converts a JSON string representation of a float to a float. + This test also checks that an invalid string input raises a ``ValueError`` + """ + observed = float_from_json(data, zarr_format=zarr_format) + if zarr_format == 2: + expected = float_from_json_v2(data) + else: + expected = float_from_json_v3(data) + assert nan_equal(observed, expected) + + +# note the order of parameters relative to the order of the parametrized variable. 
+@pytest.mark.parametrize(("expected", "data"), json_float_v2) +def test_float_to_json_v2(data: float | np.floating[Any], expected: JSONFloat) -> None: + """ + Test that floats are JSON-encoded properly for zarr v2 + """ + observed = float_to_json_v2(data) + assert observed == expected + + +# note the order of parameters relative to the order of the parametrized variable. +@pytest.mark.parametrize(("expected", "data"), json_float_v3) +def test_float_to_json_v3(data: float | np.floating[Any], expected: JSONFloat) -> None: + """ + Test that floats are JSON-encoded properly for zarr v3 + """ + observed = float_to_json_v3(data) + assert observed == expected + + +def test_bytes_from_json(zarr_format: ZarrFormat) -> None: + """ + Test that a string is interpreted as base64-encoded bytes using the ascii alphabet. + This test takes zarr_format as a parameter but doesn't actually do anything with it, because at + present there is no zarr-format-specific logic in the code being tested, but such logic may + exist in the future. + """ + data = "\00" + assert bytes_from_json(data, zarr_format=zarr_format) == base64.b64decode(data.encode("ascii")) + + +def test_bytes_to_json(zarr_format: ZarrFormat) -> None: + """ + Test that bytes are encoded with base64 using the ascii alphabet. + + This test takes zarr_format as a parameter but doesn't actually do anything with it, because at + present there is no zarr-format-specific logic in the code being tested, but such logic may + exist in the future. + """ + + data = b"asdas" + assert bytes_to_json(data, zarr_format=zarr_format) == base64.b64encode(data).decode("ascii") + + +# note the order of parameters relative to the order of the parametrized variable. +@pytest.mark.parametrize(("json_expected", "float_data"), json_float_v2) +def test_complex_to_json_v2(float_data: float | np.floating[Any], json_expected: JSONFloat) -> None: + """ + Test that complex numbers are correctly converted to JSON in v2 format. 
+ + This use the same test input as the float tests, but the conversion is tested + for complex numbers with real and imaginary parts equal to the float + values provided in the test cases. + """ + cplx = complex(float_data, float_data) + cplx_npy = np.complex128(cplx) + assert complex_float_to_json_v2(cplx) == (json_expected, json_expected) + assert complex_float_to_json_v2(cplx_npy) == (json_expected, json_expected) + + +# note the order of parameters relative to the order of the parametrized variable. +@pytest.mark.parametrize(("json_expected", "float_data"), json_float_v3) +def test_complex_to_json_v3(float_data: float | np.floating[Any], json_expected: JSONFloat) -> None: + """ + Test that complex numbers are correctly converted to JSON in v3 format. + + This use the same test input as the float tests, but the conversion is tested + for complex numbers with real and imaginary parts equal to the float + values provided in the test cases. + """ + cplx = complex(float_data, float_data) + cplx_npy = np.complex128(cplx) + assert complex_float_to_json_v3(cplx) == (json_expected, json_expected) + assert complex_float_to_json_v3(cplx_npy) == (json_expected, json_expected) + + +@pytest.mark.parametrize(("json_expected", "float_data"), json_float_v3) +def test_complex_float_to_json( + float_data: float | np.floating[Any], json_expected: JSONFloat, zarr_format: ZarrFormat +) -> None: + """ + Test that complex numbers are correctly converted to JSON in v2 or v3 formats, depending + on the ``zarr_format`` keyword argument. + + This use the same test input as the float tests, but the conversion is tested + for complex numbers with real and imaginary parts equal to the float + values provided in the test cases. 
+ """ + + cplx = complex(float_data, float_data) + cplx_npy = np.complex128(cplx) + assert complex_float_to_json(cplx, zarr_format=zarr_format) == (json_expected, json_expected) + assert complex_float_to_json(cplx_npy, zarr_format=zarr_format) == ( + json_expected, + json_expected, + ) + + +check_json_float_cases = get_args(SpecialFloats) + (1.0, 2) + + +@pytest.mark.parametrize("data", check_json_float_cases) +def test_check_json_float_v2_valid(data: JSONFloat | int) -> None: + assert check_json_float_v2(data) + + +def test_check_json_float_v2_invalid() -> None: + assert not check_json_float_v2("invalid") + + +@pytest.mark.parametrize("data", check_json_float_cases) +def test_check_json_float_v3_valid(data: JSONFloat | int) -> None: + assert check_json_float_v3(data) + + +def test_check_json_float_v3_invalid() -> None: + assert not check_json_float_v3("invalid") + + +@pytest.mark.parametrize("data", check_json_float_cases) +def test_check_json_float(data: JSONFloat | int, zarr_format: ZarrFormat) -> None: + observed = check_json_float(data, zarr_format=zarr_format) + if zarr_format == 2: + expected = check_json_float_v2(data) + else: + expected = check_json_float_v3(data) + assert observed == expected + + +def test_check_json_int() -> None: + assert check_json_int(0) + assert not check_json_int(1.0) diff --git a/tests/test_dtype/test_npy/test_int.py b/tests/test_dtype/test_npy/test_int.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/test_dtype_registry.py b/tests/test_dtype_registry.py new file mode 100644 index 0000000000..5e87945b3a --- /dev/null +++ b/tests/test_dtype_registry.py @@ -0,0 +1,158 @@ +from __future__ import annotations + +import re +import sys +from pathlib import Path +from typing import TYPE_CHECKING, Any, get_args + +import numpy as np +import pytest + +import zarr +from zarr.core.config import config +from zarr.core.dtype import ( + DTYPE, + Bool, + FixedLengthUnicode, + TBaseDType, + TBaseScalar, + ZDType, + 
data_type_registry, + get_data_type_from_json, +) +from zarr.core.dtype.registry import DataTypeRegistry + +from .conftest import zdtype_examples + +if TYPE_CHECKING: + from collections.abc import Generator + + from zarr.core.common import ZarrFormat + + +@pytest.fixture +def data_type_registry_fixture() -> DataTypeRegistry: + return DataTypeRegistry() + + +class TestRegistry: + @staticmethod + def test_register(data_type_registry_fixture: DataTypeRegistry) -> None: + """ + Test that registering a dtype in a data type registry works. + """ + data_type_registry_fixture.register(Bool._zarr_v3_name, Bool) + assert data_type_registry_fixture.get(Bool._zarr_v3_name) == Bool + assert isinstance(data_type_registry_fixture.match_dtype(np.dtype("bool")), Bool) + + @staticmethod + def test_override(data_type_registry_fixture: DataTypeRegistry) -> None: + """ + Test that registering a new dtype with the same name works (overriding the previous one). + """ + data_type_registry_fixture.register(Bool._zarr_v3_name, Bool) + + class NewBool(Bool): + def default_value(self) -> np.bool_: + return np.True_ + + data_type_registry_fixture.register(NewBool._zarr_v3_name, NewBool) + assert isinstance(data_type_registry_fixture.match_dtype(np.dtype("bool")), NewBool) + + @staticmethod + @pytest.mark.parametrize( + ("wrapper_cls", "dtype_str"), [(Bool, "bool"), (FixedLengthUnicode, "|U4")] + ) + def test_match_dtype( + data_type_registry_fixture: DataTypeRegistry, + wrapper_cls: type[ZDType[TBaseDType, TBaseScalar]], + dtype_str: str, + ) -> None: + """ + Test that match_dtype resolves a numpy dtype into an instance of the correspond wrapper for that dtype. 
+ """ + data_type_registry_fixture.register(wrapper_cls._zarr_v3_name, wrapper_cls) + assert isinstance(data_type_registry_fixture.match_dtype(np.dtype(dtype_str)), wrapper_cls) + + @staticmethod + def test_unregistered_dtype(data_type_registry_fixture: DataTypeRegistry) -> None: + """ + Test that match_dtype raises an error if the dtype is not registered. + """ + outside_dtype = "int8" + with pytest.raises( + ValueError, match=f"No data type wrapper found that matches dtype '{outside_dtype}'" + ): + data_type_registry_fixture.match_dtype(np.dtype(outside_dtype)) + + with pytest.raises(KeyError): + data_type_registry_fixture.get(outside_dtype) + + @staticmethod + @pytest.mark.parametrize("zdtype", zdtype_examples) + def test_registered_dtypes( + zdtype: ZDType[TBaseDType, TBaseScalar], zarr_format: ZarrFormat + ) -> None: + """ + Test that the registered dtypes can be retrieved from the registry. + """ + + assert data_type_registry.match_dtype(zdtype.to_dtype()) == zdtype + assert ( + data_type_registry.match_json( + zdtype.to_json(zarr_format=zarr_format), zarr_format=zarr_format + ) + == zdtype + ) + + @staticmethod + @pytest.mark.parametrize("zdtype", zdtype_examples) + def test_match_dtype_unique( + zdtype: ZDType[Any, Any], + data_type_registry_fixture: DataTypeRegistry, + zarr_format: ZarrFormat, + ) -> None: + """ + Test that the match_dtype method uniquely specifies a registered data type. 
We create a local registry + that excludes the data type class being tested, and ensure that an instance of the wrapped data type + fails to match anything in the registry + """ + for _cls in get_args(DTYPE): + if _cls is not type(zdtype): + data_type_registry_fixture.register(_cls._zarr_v3_name, _cls) + + dtype_instance = zdtype.to_dtype() + + msg = f"No data type wrapper found that matches dtype '{dtype_instance}'" + with pytest.raises(ValueError, match=re.escape(msg)): + data_type_registry_fixture.match_dtype(dtype_instance) + + instance_dict = zdtype.to_json(zarr_format=zarr_format) + msg = f"No data type wrapper found that matches {instance_dict}" + with pytest.raises(ValueError, match=re.escape(msg)): + data_type_registry_fixture.match_json(instance_dict, zarr_format=zarr_format) + + +# this is copied from the registry tests -- we should deduplicate +here = str(Path(__file__).parent.absolute()) + + +@pytest.fixture +def set_path() -> Generator[None, None, None]: + sys.path.append(here) + zarr.registry._collect_entrypoints() + yield + sys.path.remove(here) + registries = zarr.registry._collect_entrypoints() + for registry in registries: + registry.lazy_load_list.clear() + config.reset() + + +@pytest.mark.usefixtures("set_path") +def test_entrypoint_dtype(zarr_format: ZarrFormat) -> None: + from package_with_entrypoint import TestDataType + + instance = TestDataType() + dtype_json = instance.to_json(zarr_format=zarr_format) + assert get_data_type_from_json(dtype_json, zarr_format=zarr_format) == instance From d3f92043449b3d155318ac6494baa21a6a055064 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 2 May 2025 18:47:18 +0200 Subject: [PATCH 074/130] finish common tests --- src/zarr/core/dtype/npy/common.py | 162 +++++++++++------------ tests/conftest.py | 20 --- tests/test_dtype/confttest.py | 22 +++ tests/test_dtype/test_npy/test_common.py | 87 ++++++++++-- 4 files changed, 179 insertions(+), 112 deletions(-) create mode 100644 
tests/test_dtype/confttest.py diff --git a/src/zarr/core/dtype/npy/common.py b/src/zarr/core/dtype/npy/common.py index 8ef1286e6f..8033e48291 100644 --- a/src/zarr/core/dtype/npy/common.py +++ b/src/zarr/core/dtype/npy/common.py @@ -261,6 +261,29 @@ def float_to_json_v3(data: float | np.floating[Any]) -> JSONFloat: return float_to_json_v2(data) +def float_to_json(data: float | np.floating[Any], *, zarr_format: ZarrFormat) -> JSONFloat: + """ + Convert a float to JSON, parametrized by the zarr format version. + + Parameters + ---------- + data : float | np.floating + The float value to convert. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + JSONFloat + The JSON representation of the float. + """ + if zarr_format == 2: + return float_to_json_v2(data) + else: + return float_to_json_v3(data) + raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") + + def complex_float_to_json_v3( data: complex | np.complexfloating[Any, Any], ) -> tuple[JSONFloat, JSONFloat]: @@ -324,26 +347,60 @@ def complex_float_to_json( raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") -def float_to_json(data: float | np.floating[Any], *, zarr_format: ZarrFormat) -> JSONFloat: +def complex_float_from_json_v2(data: tuple[JSONFloat, JSONFloat]) -> complex: """ - Convert a float to JSON, parametrized by the zarr format version. + Convert a JSON complex float to a complex number (v2). Parameters ---------- - data : float | np.floating - The float value to convert. + data : tuple[JSONFloat, JSONFloat] + The JSON complex float to convert. + + Returns + ------- + np.complexfloating + The complex number. + """ + return complex(float_from_json_v2(data[0]), float_from_json_v2(data[1])) + + +def complex_float_from_json_v3(data: tuple[JSONFloat, JSONFloat]) -> complex: + """ + Convert a JSON complex float to a complex number (v3). + + Parameters + ---------- + data : tuple[JSONFloat, JSONFloat] + The JSON complex float to convert. 
+ + Returns + ------- + np.complexfloating + The complex number. + """ + return complex(float_from_json_v3(data[0]), float_from_json_v3(data[1])) + + +def complex_float_from_json(data: tuple[JSONFloat, JSONFloat], zarr_format: ZarrFormat) -> complex: + """ + Convert a JSON complex float to a complex number based on zarr format. + + Parameters + ---------- + data : tuple[JSONFloat, JSONFloat] + The JSON complex float to convert. zarr_format : ZarrFormat The zarr format version. Returns ------- - JSONFloat - The JSON representation of the float. + np.complexfloating + The complex number. """ if zarr_format == 2: - return float_to_json_v2(data) + return complex_float_from_json_v2(data) else: - return float_to_json_v3(data) + return complex_float_from_json_v3(data) raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") @@ -366,9 +423,9 @@ def check_json_float_v2(data: JSON) -> TypeGuard[JSONFloat]: return isinstance(data, float | int) -def check_json_complex_float_v2(data: JSON) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: +def check_json_float_v3(data: JSON) -> TypeGuard[JSONFloat]: """ - Check if a JSON value represents a complex float, as per the behavior of zarr-python 2.x + Check if a JSON value represents a float (v3). Parameters ---------- @@ -378,20 +435,15 @@ def check_json_complex_float_v2(data: JSON) -> TypeGuard[tuple[JSONFloat, JSONFl Returns ------- Bool - True if the data is a complex float, False otherwise. + True if the data is a float, False otherwise. """ - return ( - not isinstance(data, str) - and isinstance(data, Sequence) - and len(data) == 2 - and check_json_float_v2(data[0]) - and check_json_float_v2(data[1]) - ) + # TODO: handle the special JSON serialization of different NaN values + return check_json_float_v2(data) -def check_json_float_v3(data: JSON) -> TypeGuard[JSONFloat]: +def check_json_complex_float_v2(data: JSON) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: """ - Check if a JSON value represents a float (v3). 
+ Check if a JSON value represents a complex float, as per the behavior of zarr-python 2.x Parameters ---------- @@ -401,10 +453,15 @@ def check_json_float_v3(data: JSON) -> TypeGuard[JSONFloat]: Returns ------- Bool - True if the data is a float, False otherwise. + True if the data is a complex float, False otherwise. """ - # TODO: handle the special JSON serialization of different NaN values - return check_json_float_v2(data) + return ( + not isinstance(data, str) + and isinstance(data, Sequence) + and len(data) == 2 + and check_json_float_v2(data[0]) + and check_json_float_v2(data[1]) + ) def check_json_complex_float_v3(data: JSON) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: @@ -434,7 +491,7 @@ def check_json_complex_float( data: JSON, zarr_format: ZarrFormat ) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: """ - Check if a JSON value represents a complex float based on zarr format. + Check if a JSON value represents a complex float, given a zarr format. Parameters ---------- @@ -524,60 +581,3 @@ def check_json_bool(data: JSON) -> TypeGuard[bool]: True if the data is a boolean, False otherwise. """ return isinstance(data, bool) - - -def complex_float_from_json_v2(data: tuple[JSONFloat, JSONFloat]) -> complex: - """ - Convert a JSON complex float to a complex number (v2). - - Parameters - ---------- - data : tuple[JSONFloat, JSONFloat] - The JSON complex float to convert. - - Returns - ------- - np.complexfloating - The complex number. - """ - return complex(float_from_json_v2(data[0]), float_from_json_v2(data[1])) - - -def complex_float_from_json_v3(data: tuple[JSONFloat, JSONFloat]) -> complex: - """ - Convert a JSON complex float to a complex number (v3). - - Parameters - ---------- - data : tuple[JSONFloat, JSONFloat] - The JSON complex float to convert. - - Returns - ------- - np.complexfloating - The complex number. 
- """ - return complex(float_from_json_v3(data[0]), float_from_json_v3(data[1])) - - -def complex_float_from_json(data: tuple[JSONFloat, JSONFloat], zarr_format: ZarrFormat) -> complex: - """ - Convert a JSON complex float to a complex number based on zarr format. - - Parameters - ---------- - data : tuple[JSONFloat, JSONFloat] - The JSON complex float to convert. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - np.complexfloating - The complex number. - """ - if zarr_format == 2: - return complex_float_from_json_v2(data) - else: - return complex_float_from_json_v3(data) - raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") diff --git a/tests/conftest.py b/tests/conftest.py index 434763a4f3..7a075cb9ac 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -21,13 +21,8 @@ from zarr.core.common import JSON, parse_shapelike from zarr.core.config import config as zarr_config from zarr.core.dtype import ( - DateTime64, - Structured, - TimeDelta64, - data_type_registry, get_data_type_from_native_dtype, ) -from zarr.core.dtype.common import HasLength from zarr.core.metadata.v2 import ArrayV2Metadata from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.core.sync import sync @@ -43,7 +38,6 @@ from zarr.core.array import CompressorsLike, FiltersLike, SerializerLike, ShardsLike from zarr.core.chunk_key_encodings import ChunkKeyEncoding, ChunkKeyEncodingLike from zarr.core.common import ChunkCoords, MemoryOrder, ShapeLike, ZarrFormat - from zarr.core.dtype.wrapper import ZDType async def parse_store( @@ -420,17 +414,3 @@ def meta_from_array( chunk_key_encoding=chunk_key_encoding, dimension_names=dimension_names, ) - - -# Generate a collection of zdtype instances for use in testing. -zdtype_examples: tuple[ZDType[Any, Any], ...] 
= () -for wrapper_cls in data_type_registry.contents.values(): - # The Structured dtype has to be constructed with some actual fields - if wrapper_cls is Structured: - zdtype_examples += (wrapper_cls.from_dtype(np.dtype([("a", np.float64), ("b", np.int8)])),) - elif issubclass(wrapper_cls, HasLength): - zdtype_examples += (wrapper_cls(length=1),) - elif issubclass(wrapper_cls, DateTime64 | TimeDelta64): - zdtype_examples += (wrapper_cls(unit="s", interval=10),) - else: - zdtype_examples += (wrapper_cls(),) diff --git a/tests/test_dtype/confttest.py b/tests/test_dtype/confttest.py new file mode 100644 index 0000000000..aba08a08c5 --- /dev/null +++ b/tests/test_dtype/confttest.py @@ -0,0 +1,22 @@ +# Generate a collection of zdtype instances for use in testing. +from typing import Any + +import numpy as np + +from zarr.core.dtype import data_type_registry +from zarr.core.dtype.common import HasLength +from zarr.core.dtype.npy.sized import Structured +from zarr.core.dtype.npy.time import DateTime64, TimeDelta64 +from zarr.core.dtype.wrapper import ZDType + +zdtype_examples: tuple[ZDType[Any, Any], ...] 
= () +for wrapper_cls in data_type_registry.contents.values(): + # The Structured dtype has to be constructed with some actual fields + if wrapper_cls is Structured: + zdtype_examples += (wrapper_cls.from_dtype(np.dtype([("a", np.float64), ("b", np.int8)])),) + elif issubclass(wrapper_cls, HasLength): + zdtype_examples += (wrapper_cls(length=1),) + elif issubclass(wrapper_cls, DateTime64 | TimeDelta64): + zdtype_examples += (wrapper_cls(unit="s", interval=10),) + else: + zdtype_examples += (wrapper_cls(),) diff --git a/tests/test_dtype/test_npy/test_common.py b/tests/test_dtype/test_npy/test_common.py index f3082d0c3b..69beae38e3 100644 --- a/tests/test_dtype/test_npy/test_common.py +++ b/tests/test_dtype/test_npy/test_common.py @@ -14,10 +14,15 @@ EndiannessNumpy, bytes_from_json, bytes_to_json, + check_json_bool, + check_json_complex_float, + check_json_complex_float_v2, + check_json_complex_float_v3, check_json_float, check_json_float_v2, check_json_float_v3, check_json_int, + check_json_str, complex_float_to_json, complex_float_to_json_v2, complex_float_to_json_v3, @@ -31,7 +36,7 @@ ) if TYPE_CHECKING: - from zarr.core.common import ZarrFormat + from zarr.core.common import JSON, ZarrFormat def nan_equal(a: object, b: object) -> bool: @@ -44,7 +49,7 @@ def nan_equal(a: object, b: object) -> bool: return a == b -json_float_v2: list[tuple[JSONFloat, float | np.floating[Any]]] = [ +json_float_v2_cases: list[tuple[JSONFloat, float | np.floating[Any]]] = [ ("Infinity", float("inf")), ("Infinity", np.inf), ("-Infinity", float("-inf")), @@ -56,7 +61,7 @@ def nan_equal(a: object, b: object) -> bool: # exactly the same as v2, for now, until we get support for the special NaN encoding defined in the # v3 spec -json_float_v3: list[tuple[JSONFloat, float | np.floating[Any]]] = [ +json_float_v3_cases: list[tuple[JSONFloat, float | np.floating[Any]]] = [ ("Infinity", float("inf")), ("Infinity", np.inf), ("-Infinity", float("-inf")), @@ -101,7 +106,7 @@ def 
test_endianness_to_numpy_str(data: str | None, expected: str) -> None: endianness_to_numpy_str(data) # type: ignore[arg-type] -@pytest.mark.parametrize(("data", "expected"), json_float_v2 + [("SHOULD_ERR", "")]) +@pytest.mark.parametrize(("data", "expected"), json_float_v2_cases + [("SHOULD_ERR", "")]) def test_float_from_json_v2(data: JSONFloat | str, expected: float | str) -> None: """ Test that float_from_json_v2 correctly converts a JSON string representation of a float to a float. @@ -115,7 +120,7 @@ def test_float_from_json_v2(data: JSONFloat | str, expected: float | str) -> Non float_from_json_v2(data) # type: ignore[arg-type] -@pytest.mark.parametrize(("data", "expected"), json_float_v3 + [("SHOULD_ERR", "")]) +@pytest.mark.parametrize(("data", "expected"), json_float_v3_cases + [("SHOULD_ERR", "")]) def test_float_from_json_v3(data: JSONFloat | str, expected: float | str) -> None: """ Test that float_from_json_v3 correctly converts a JSON string representation of a float to a float. @@ -129,7 +134,7 @@ def test_float_from_json_v3(data: JSONFloat | str, expected: float | str) -> Non float_from_json_v3(data) # type: ignore[arg-type] -@pytest.mark.parametrize(("data", "expected"), json_float_v2) +@pytest.mark.parametrize(("data", "expected"), json_float_v2_cases) def test_float_from_json(data: JSONFloat, expected: float | str, zarr_format: ZarrFormat) -> None: """ Test that float_from_json_v3 correctly converts a JSON string representation of a float to a float. @@ -144,7 +149,7 @@ def test_float_from_json(data: JSONFloat, expected: float | str, zarr_format: Za # note the order of parameters relative to the order of the parametrized variable. 
-@pytest.mark.parametrize(("expected", "data"), json_float_v2) +@pytest.mark.parametrize(("expected", "data"), json_float_v2_cases) def test_float_to_json_v2(data: float | np.floating[Any], expected: JSONFloat) -> None: """ Test that floats are JSON-encoded properly for zarr v2 @@ -154,7 +159,7 @@ def test_float_to_json_v2(data: float | np.floating[Any], expected: JSONFloat) - # note the order of parameters relative to the order of the parametrized variable. -@pytest.mark.parametrize(("expected", "data"), json_float_v3) +@pytest.mark.parametrize(("expected", "data"), json_float_v3_cases) def test_float_to_json_v3(data: float | np.floating[Any], expected: JSONFloat) -> None: """ Test that floats are JSON-encoded properly for zarr v3 @@ -188,7 +193,7 @@ def test_bytes_to_json(zarr_format: ZarrFormat) -> None: # note the order of parameters relative to the order of the parametrized variable. -@pytest.mark.parametrize(("json_expected", "float_data"), json_float_v2) +@pytest.mark.parametrize(("json_expected", "float_data"), json_float_v2_cases) def test_complex_to_json_v2(float_data: float | np.floating[Any], json_expected: JSONFloat) -> None: """ Test that complex numbers are correctly converted to JSON in v2 format. @@ -204,7 +209,7 @@ def test_complex_to_json_v2(float_data: float | np.floating[Any], json_expected: # note the order of parameters relative to the order of the parametrized variable. -@pytest.mark.parametrize(("json_expected", "float_data"), json_float_v3) +@pytest.mark.parametrize(("json_expected", "float_data"), json_float_v3_cases) def test_complex_to_json_v3(float_data: float | np.floating[Any], json_expected: JSONFloat) -> None: """ Test that complex numbers are correctly converted to JSON in v3 format. 
@@ -219,7 +224,7 @@ def test_complex_to_json_v3(float_data: float | np.floating[Any], json_expected: assert complex_float_to_json_v3(cplx_npy) == (json_expected, json_expected) -@pytest.mark.parametrize(("json_expected", "float_data"), json_float_v3) +@pytest.mark.parametrize(("json_expected", "float_data"), json_float_v3_cases) def test_complex_float_to_json( float_data: float | np.floating[Any], json_expected: JSONFloat, zarr_format: ZarrFormat ) -> None: @@ -272,6 +277,66 @@ def test_check_json_float(data: JSONFloat | int, zarr_format: ZarrFormat) -> Non assert observed == expected +check_json_complex_float_true_cases = ( + [0.0, 1.0], + (0.0, 1.0), + [-1.0, "NaN"], + ["Infinity", 1.0], + ["Infinity", "NaN"], +) + +check_json_complex_float_false_cases = ( + 0.0, + "foo", + [0.0], + [1.0, 2.0, 3.0], + [1.0, "_infinity_"], + {"hello": 1.0}, +) + + +@pytest.mark.parametrize("data", check_json_complex_float_true_cases) +def test_check_json_complex_float_v2_true(data: JSON) -> None: + assert check_json_complex_float_v2(data) + + +@pytest.mark.parametrize("data", check_json_complex_float_false_cases) +def test_check_json_complex_float_v2_false(data: JSON) -> None: + assert not check_json_complex_float_v2(data) + + +@pytest.mark.parametrize("data", check_json_complex_float_true_cases) +def test_check_json_complex_float_v3_true(data: JSON) -> None: + assert check_json_complex_float_v3(data) + + +@pytest.mark.parametrize("data", check_json_complex_float_false_cases) +def test_check_json_complex_float_v3_false(data: JSON) -> None: + assert not check_json_complex_float_v3(data) + + +@pytest.mark.parametrize("data", check_json_complex_float_true_cases) +def test_check_json_complex_float_true(data: JSON, zarr_format: ZarrFormat) -> None: + assert check_json_complex_float(data, zarr_format=zarr_format) + + +@pytest.mark.parametrize("data", check_json_complex_float_false_cases) +def test_check_json_complex_float_false(data: JSON, zarr_format: ZarrFormat) -> None: + assert not 
check_json_complex_float(data, zarr_format=zarr_format) + + def test_check_json_int() -> None: assert check_json_int(0) assert not check_json_int(1.0) + + +def test_check_json_str() -> None: + assert check_json_str("0") + assert not check_json_str(1.0) + + +def test_check_json_bool() -> None: + assert check_json_bool(True) + assert check_json_bool(False) + assert not check_json_bool(1.0) + assert not check_json_bool("True") From fdf17e391e6e4285d3f2b6c9ec08bbbaf4ba6260 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 7 May 2025 13:15:34 +0200 Subject: [PATCH 075/130] wip: test infrastructure for dtypes --- tests/test_array.py | 2 +- .../test_dtype/{confttest.py => conftest.py} | 6 ++ tests/test_dtype/test_npy/test_common.py | 10 +-- tests/test_dtype/test_npy/test_int.py | 32 +++++++ tests/test_dtype/test_wrapper.py | 86 +++++++++++++++++++ tests/test_dtype_registry.py | 4 +- 6 files changed, 128 insertions(+), 12 deletions(-) rename tests/test_dtype/{confttest.py => conftest.py} (80%) create mode 100644 tests/test_dtype/test_wrapper.py diff --git a/tests/test_array.py b/tests/test_array.py index 125672658a..450d1375a8 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -55,7 +55,7 @@ from zarr.errors import ContainsArrayError, ContainsGroupError from zarr.storage import LocalStore, MemoryStore, StorePath -from .conftest import zdtype_examples +from .test_dtype.conftest import zdtype_examples if TYPE_CHECKING: from zarr.core.array_spec import ArrayConfigLike diff --git a/tests/test_dtype/confttest.py b/tests/test_dtype/conftest.py similarity index 80% rename from tests/test_dtype/confttest.py rename to tests/test_dtype/conftest.py index aba08a08c5..6e171cb435 100644 --- a/tests/test_dtype/confttest.py +++ b/tests/test_dtype/conftest.py @@ -20,3 +20,9 @@ zdtype_examples += (wrapper_cls(unit="s", interval=10),) else: zdtype_examples += (wrapper_cls(),) + + +def pytest_generate_tests(metafunc): + for fixture_name in metafunc.fixturenames: + if 
hasattr(metafunc.cls, fixture_name): + metafunc.parametrize(fixture_name, getattr(metafunc.cls, fixture_name), scope="class") diff --git a/tests/test_dtype/test_npy/test_common.py b/tests/test_dtype/test_npy/test_common.py index 69beae38e3..69a14a92b0 100644 --- a/tests/test_dtype/test_npy/test_common.py +++ b/tests/test_dtype/test_npy/test_common.py @@ -61,15 +61,7 @@ def nan_equal(a: object, b: object) -> bool: # exactly the same as v2, for now, until we get support for the special NaN encoding defined in the # v3 spec -json_float_v3_cases: list[tuple[JSONFloat, float | np.floating[Any]]] = [ - ("Infinity", float("inf")), - ("Infinity", np.inf), - ("-Infinity", float("-inf")), - ("-Infinity", -np.inf), - ("NaN", float("nan")), - ("NaN", np.nan), - (1.0, 1.0), -] +json_float_v3_cases = json_float_v2_cases @pytest.mark.parametrize( diff --git a/tests/test_dtype/test_npy/test_int.py b/tests/test_dtype/test_npy/test_int.py index e69de29bb2..a90af53c58 100644 --- a/tests/test_dtype/test_npy/test_int.py +++ b/tests/test_dtype/test_npy/test_int.py @@ -0,0 +1,32 @@ +from __future__ import annotations + +import numpy as np + +from tests.test_dtype.test_wrapper import _TestZDType +from zarr.core.dtype.npy.int import Int8 + + +class TestInt8(_TestZDType): + test_cls = Int8 + valid_dtype = (np.dtype(np.int8),) + invalid_dtype = ( + np.dtype(np.int16), + np.dtype(np.uint16), + np.dtype(np.float64), + ) + valid_json_v2 = ("|i1",) + valid_json_v3_cases = ("int8",) + invalid_json_v2 = ( + ">i1", + "int8", + "|f8", + ) + invalid_json_v3 = ( + "|i1", + "|f8", + {"name": "int8", "configuration": {"endianness": "little"}}, + ) + + def test_check_value(self) -> None: + assert self.test_cls().check_value(1) + assert not self.test_cls().check_value(["foo"]) diff --git a/tests/test_dtype/test_wrapper.py b/tests/test_dtype/test_wrapper.py new file mode 100644 index 0000000000..c6093ebb01 --- /dev/null +++ b/tests/test_dtype/test_wrapper.py @@ -0,0 +1,86 @@ +from __future__ import 
annotations + +from typing import Any, ClassVar + +import hypothesis.strategies as st +import numpy as np +from hypothesis.extra import numpy as npst + +from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType + + +def all_dtypes() -> st.SearchStrategy[np.dtype[np.generic]]: + return ( + npst.boolean_dtypes() + | npst.integer_dtypes(endianness="=") + | npst.unsigned_integer_dtypes(endianness="=") + | npst.floating_dtypes(endianness="=") + | npst.complex_number_dtypes(endianness="=") + | npst.byte_string_dtypes(endianness="=") + | npst.unicode_string_dtypes(endianness="=") + | npst.datetime64_dtypes(endianness="=") + | npst.timedelta64_dtypes(endianness="=") + ) + + +def get_classvar_attributes(cls: type) -> dict[str, Any]: + classvar_attributes = {} + for name, annotation in cls.__annotations__.items(): + if getattr(annotation, "__origin__", None) is ClassVar: + classvar_attributes[name] = getattr(cls, name) + return classvar_attributes + + +class _TestZDType: + test_cls: type[ZDType[TBaseDType, TBaseScalar]] + + valid_dtype: ClassVar[tuple[TBaseDType, ...]] = () + invalid_dtype: ClassVar[tuple[TBaseDType, ...]] = () + + valid_json_v2: ClassVar[tuple[str | dict[str, Any], ...]] = () + invalid_json_v2: ClassVar[tuple[str | dict[str, Any], ...]] = () + + valid_json_v3: ClassVar[tuple[str | dict[str, Any], ...]] = () + invalid_json_v3: ClassVar[tuple[str | dict[str, Any], ...]] = () + + def test_check_dtype_valid(self, valid_dtype: Any) -> None: + assert self.test_cls.check_dtype(valid_dtype) + + def test_check_dtype_invalid(self, invalid_dtype: Any) -> None: + assert not self.test_cls.check_dtype(invalid_dtype) + + def test_from_dtype_roundtrip(self, valid_dtype: Any) -> None: + zdtype = self.test_cls.from_dtype(valid_dtype) + assert zdtype.to_dtype() == valid_dtype + + """ @abc.abstractmethod + def test_cast_value(self, value: Any) -> None: + raise NotImplementedError + + @abc.abstractmethod + def test_check_value(self) -> None: + raise 
NotImplementedError + + @abc.abstractmethod + def test_default_value(self) -> None: + raise NotImplementedError + + @abc.abstractmethod + def test_check_json(self, value: Any) -> None: + raise NotImplementedError + + @abc.abstractmethod + def test_from_json_roundtrip_v2(self, value: Any) -> None: + raise NotImplementedError + + @abc.abstractmethod + def test_from_json_roundtrip_v3(self, value: Any) -> None: + raise NotImplementedError + + @abc.abstractmethod + def test_from_json_value_roundtrip_v2(self, value: Any) -> None: + raise NotImplementedError + + @abc.abstractmethod + def test_from_json_value_roundtrip_v3(self, value: Any) -> None: + raise NotImplementedError """ diff --git a/tests/test_dtype_registry.py b/tests/test_dtype_registry.py index 5e87945b3a..98380b86f7 100644 --- a/tests/test_dtype_registry.py +++ b/tests/test_dtype_registry.py @@ -22,13 +22,13 @@ ) from zarr.core.dtype.registry import DataTypeRegistry -from .conftest import zdtype_examples - if TYPE_CHECKING: from collections.abc import Generator from zarr.core.common import ZarrFormat +from .test_dtype.conftest import zdtype_examples + @pytest.fixture def data_type_registry_fixture() -> DataTypeRegistry: From 4afa42af137a5a5736e0dfaf9fbc5e4747abc750 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 7 May 2025 18:03:00 +0200 Subject: [PATCH 076/130] wip: use class-based tests for all dtypes --- tests/test_dtype/conftest.py | 32 ++++- tests/test_dtype/test_npy/test_bool.py | 28 ++++ tests/test_dtype/test_npy/test_complex.py | 50 +++++++ tests/test_dtype/test_npy/test_float.py | 72 ++++++++++ tests/test_dtype/test_npy/test_int.py | 162 +++++++++++++++++++++- tests/test_dtype/test_npy/test_sized.py | 131 +++++++++++++++++ tests/test_dtype/test_npy/test_string.py | 50 +++++++ tests/test_dtype/test_npy/test_time.py | 54 ++++++++ tests/test_dtype/test_wrapper.py | 24 ++-- 9 files changed, 588 insertions(+), 15 deletions(-) create mode 100644 tests/test_dtype/test_npy/test_bool.py create 
mode 100644 tests/test_dtype/test_npy/test_complex.py create mode 100644 tests/test_dtype/test_npy/test_float.py create mode 100644 tests/test_dtype/test_npy/test_sized.py create mode 100644 tests/test_dtype/test_npy/test_string.py create mode 100644 tests/test_dtype/test_npy/test_time.py diff --git a/tests/test_dtype/conftest.py b/tests/test_dtype/conftest.py index 6e171cb435..2b4bb0b685 100644 --- a/tests/test_dtype/conftest.py +++ b/tests/test_dtype/conftest.py @@ -22,7 +22,37 @@ zdtype_examples += (wrapper_cls(),) -def pytest_generate_tests(metafunc): +def pytest_generate_tests(metafunc: Any) -> None: + """ + pytest hook to parametrize class-scoped fixtures. + + This hook allows us to define class-scoped fixtures as class attributes and then + generate the parametrize calls for pytest. This allows the fixtures to be + reused across multiple tests within the same class. + + For example, if you had a regular pytest class like this: + + class TestClass: + @pytest.mark.parametrize("param_a", [1, 2, 3]) + def test_method(self, param_a): + ... + + Child classes inheriting from ``TestClass`` would not be able to override the ``param_a`` fixture + + this implementation of ``pytest_generate_tests`` allows you to define class-scoped fixtures as + class attributes, which allows the following to work: + + class TestExample: + param_a = [1, 2, 3] + + def test_example(self, param_a): + ... 
+ + # this class will have its test_example method parametrized with the values of TestB.param_a + class TestB(TestExample): + param_a = [1, 2, 100, 10] + + """ for fixture_name in metafunc.fixturenames: if hasattr(metafunc.cls, fixture_name): metafunc.parametrize(fixture_name, getattr(metafunc.cls, fixture_name), scope="class") diff --git a/tests/test_dtype/test_npy/test_bool.py b/tests/test_dtype/test_npy/test_bool.py new file mode 100644 index 0000000000..e4e5dd541e --- /dev/null +++ b/tests/test_dtype/test_npy/test_bool.py @@ -0,0 +1,28 @@ +from __future__ import annotations + +import numpy as np + +from tests.test_dtype.test_wrapper import _TestZDType +from zarr.core.dtype.npy.bool import Bool + + +class TestBool(_TestZDType): + test_cls = Bool + valid_dtype = (np.dtype(np.bool_),) + invalid_dtype = ( + np.dtype(np.int8), + np.dtype(np.float64), + np.dtype(np.uint16), + ) + valid_json_v2 = Bool._zarr_v2_names + valid_json_v3_cases = (Bool._zarr_v3_name,) + invalid_json_v2 = ( + "|b1", + "bool", + "|f8", + ) + invalid_json_v3 = ( + "|b1", + "|f8", + {"name": "bool", "configuration": {"endianness": "little"}}, + ) diff --git a/tests/test_dtype/test_npy/test_complex.py b/tests/test_dtype/test_npy/test_complex.py new file mode 100644 index 0000000000..6621d625d9 --- /dev/null +++ b/tests/test_dtype/test_npy/test_complex.py @@ -0,0 +1,50 @@ +from __future__ import annotations + +import numpy as np + +from tests.test_dtype.test_wrapper import _TestZDType +from zarr.core.dtype.npy.complex import Complex64, Complex128 + + +class TestComplex64(_TestZDType): + test_cls = Complex64 + valid_dtype = (np.dtype(">c8"), np.dtype("c16"), np.dtype("f2"), np.dtype("f4"), np.dtype("f8"), np.dtype("i1", "int8", @@ -27,6 +27,156 @@ class TestInt8(_TestZDType): {"name": "int8", "configuration": {"endianness": "little"}}, ) - def test_check_value(self) -> None: - assert self.test_cls().check_value(1) - assert not self.test_cls().check_value(["foo"]) + +class TestInt16(_TestZDType): + 
test_cls = Int16 + valid_dtype = (np.dtype(">i2"), np.dtype("i4"), np.dtype("i8"), np.dtype("u2"), np.dtype("u4"), np.dtype("u8"), np.dtype("U10"), np.dtype("U10", "i4"), ("field2", ">f8")], + [("field1", ">i8"), ("field2", ">i4")], + ) + valid_json_v3_cases = ( + { + "name": "structured", + "configuration": { + "fields": [ + ("field1", {"name": "int32", "configuration": {"endianness": "big"}}), + ("field2", {"name": "float64", "configuration": {"endianness": "big"}}), + ] + }, + }, + { + "name": "structured", + "configuration": { + "fields": [ + ("field1", {"name": "int64", "configuration": {"endianness": "big"}}), + ("field2", {"name": "int32", "configuration": {"endianness": "big"}}), + ] + }, + }, + ) + invalid_json_v2 = ( + [("field1", "|i1"), ("field2", "|f8")], + [("field1", "|S10"), ("field2", "|f8")], + ) + invalid_json_v3 = ( + { + "name": "structured", + "configuration": { + "fields": [ + ("field1", {"name": "int32", "configuration": {"endianness": "invalid"}}), + ("field2", {"name": "float64", "configuration": {"endianness": "big"}}), + ] + }, + }, + {"name": "invalid_name"}, + ) diff --git a/tests/test_dtype/test_npy/test_string.py b/tests/test_dtype/test_npy/test_string.py new file mode 100644 index 0000000000..2f77379f01 --- /dev/null +++ b/tests/test_dtype/test_npy/test_string.py @@ -0,0 +1,50 @@ +from __future__ import annotations + +import numpy as np + +from tests.test_dtype.test_wrapper import _TestZDType +from zarr.core.dtype.npy.string import _NUMPY_SUPPORTS_VLEN_STRING, VariableLengthString + +if _NUMPY_SUPPORTS_VLEN_STRING: + + class TestVariableLengthString(_TestZDType): + test_cls = VariableLengthString + valid_dtype = (np.dtypes.StringDType(),) + invalid_dtype = ( + np.dtype(np.int8), + np.dtype(np.float64), + np.dtype("|S10"), + ) + valid_json_v2 = ("|O",) + valid_json_v3_cases = ({"name": "numpy.variable_length_utf8"},) + invalid_json_v2 = ( + "|S10", + "|f8", + "invalid", + ) + invalid_json_v3 = ( + {"name": 
"numpy.variable_length_utf8", "configuration": {"invalid_key": "value"}}, + {"name": "invalid_name"}, + ) + +else: + + class TestVariableLengthString(_TestZDType): + test_cls = VariableLengthString + valid_dtype = (np.dtype("O"),) + invalid_dtype = ( + np.dtype(np.int8), + np.dtype(np.float64), + np.dtype("|S10"), + ) + valid_json_v2 = ("|O",) + valid_json_v3_cases = ({"name": "numpy.variable_length_utf8"},) + invalid_json_v2 = ( + "|S10", + "|f8", + "invalid", + ) + invalid_json_v3 = ( + {"name": "numpy.variable_length_utf8", "configuration": {"invalid_key": "value"}}, + {"name": "invalid_name"}, + ) diff --git a/tests/test_dtype/test_npy/test_time.py b/tests/test_dtype/test_npy/test_time.py new file mode 100644 index 0000000000..a5d2cce545 --- /dev/null +++ b/tests/test_dtype/test_npy/test_time.py @@ -0,0 +1,54 @@ +from __future__ import annotations + +import numpy as np + +from tests.test_dtype.test_wrapper import _TestZDType +from zarr.core.dtype.npy.time import DateTime64, TimeDelta64 + + +class TestDateTime64(_TestZDType): + test_cls = DateTime64 + valid_dtype = (np.dtype("datetime64[10ns]"), np.dtype("datetime64[us]"), np.dtype("datetime64")) + invalid_dtype = ( + np.dtype(np.int8), + np.dtype(np.float64), + np.dtype("timedelta64[ns]"), + ) + valid_json_v2 = (">M8", ">M8[s]", " None: - assert self.test_cls.check_dtype(valid_dtype) + def test_check_dtype_valid(self, valid_dtype: object) -> None: + assert self.test_cls.check_dtype(valid_dtype) # type: ignore[arg-type] - def test_check_dtype_invalid(self, invalid_dtype: Any) -> None: - assert not self.test_cls.check_dtype(invalid_dtype) + def test_check_dtype_invalid(self, invalid_dtype: object) -> None: + assert not self.test_cls.check_dtype(invalid_dtype) # type: ignore[arg-type] def test_from_dtype_roundtrip(self, valid_dtype: Any) -> None: zdtype = self.test_cls.from_dtype(valid_dtype) assert zdtype.to_dtype() == valid_dtype + def test_from_json_roundtrip_v2(self, valid_json_v2: Any) -> None: + zdtype = 
self.test_cls.from_json(valid_json_v2, zarr_format=2) + assert zdtype.to_json(zarr_format=2) == valid_json_v2 + + def test_from_json_roundtrip_v3(self, valid_json_v3: Any) -> None: + zdtype = self.test_cls.from_json(valid_json_v3, zarr_format=3) + assert zdtype.to_json(zarr_format=3) == valid_json_v3 + """ @abc.abstractmethod def test_cast_value(self, value: Any) -> None: raise NotImplementedError From 1458aadadb8162e7326809fa8ff186024dcec91a Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 8 May 2025 17:14:43 +0200 Subject: [PATCH 077/130] fill out more tests, and adjust sized dtypes --- src/zarr/core/dtype/npy/sized.py | 49 +++++++++---------- src/zarr/core/dtype/npy/time.py | 38 +++++++++------ src/zarr/core/dtype/wrapper.py | 13 +++++ tests/test_dtype/conftest.py | 2 +- tests/test_dtype/test_npy/test_bool.py | 7 ++- tests/test_dtype/test_npy/test_complex.py | 22 +++++++-- tests/test_dtype/test_npy/test_float.py | 30 ++++++++++-- tests/test_dtype/test_npy/test_int.py | 56 +++++++++++++++------- tests/test_dtype/test_npy/test_sized.py | 56 +++++++++++++++++----- tests/test_dtype/test_npy/test_string.py | 4 +- tests/test_dtype/test_npy/test_time.py | 54 ++++++++++++++++++--- tests/test_dtype/test_wrapper.py | 58 ++++++++++++----------- 12 files changed, 275 insertions(+), 114 deletions(-) diff --git a/src/zarr/core/dtype/npy/sized.py b/src/zarr/core/dtype/npy/sized.py index d9524a4891..032a1ec5c0 100644 --- a/src/zarr/core/dtype/npy/sized.py +++ b/src/zarr/core/dtype/npy/sized.py @@ -23,11 +23,10 @@ class FixedLengthAscii(ZDType[np.dtypes.BytesDType[int], np.bytes_], HasLength): dtype_cls = np.dtypes.BytesDType _zarr_v3_name = "numpy.fixed_length_ascii" - item_size_bits: ClassVar[int] = 8 @classmethod def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: - return cls(length=dtype.itemsize // (cls.item_size_bits // 8)) + return cls(length=dtype.itemsize) def to_dtype(self) -> np.dtypes.BytesDType[int]: return self.dtype_cls(self.length) @@ -43,12 
+42,10 @@ def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: elif zarr_format == 3: return ( isinstance(data, dict) - and "name" in data + and set(data.keys()) == {"name", "configuration"} and data["name"] == cls._zarr_v3_name - and "configuration" in data and isinstance(data["configuration"], dict) - and "length_bits" in data["configuration"] - and isinstance(data["configuration"]["length_bits"], int) + and "length_bytes" in data["configuration"] ) raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @@ -58,7 +55,7 @@ def to_json(self, zarr_format: ZarrFormat) -> JSON: elif zarr_format == 3: return { "name": self._zarr_v3_name, - "configuration": {"length_bits": self.length * self.item_size_bits}, + "configuration": {"length_bytes": self.length}, } raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @@ -67,7 +64,7 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: if zarr_format == 2: return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: - return cls(length=data["configuration"]["length_bits"] // cls.item_size_bits) # type: ignore[arg-type, index, call-overload, operator] + return cls(length=data["configuration"]["length_bytes"]) # type: ignore[arg-type, index, call-overload] raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def default_value(self) -> np.bytes_: @@ -94,12 +91,11 @@ class FixedLengthBytes(ZDType[np.dtypes.VoidDType[int], np.void], HasLength): # it cannot be used to create instances of the dtype # so we have to tell mypy to ignore this here dtype_cls = np.dtypes.VoidDType # type: ignore[assignment] - _zarr_v3_name = "numpy.void" - item_size_bits: ClassVar[int] = 8 + _zarr_v3_name = "numpy.fixed_length_bytes" @classmethod def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: - return cls(length=dtype.itemsize // (cls.item_size_bits // 8)) + return 
cls(length=dtype.itemsize) def to_dtype(self) -> np.dtypes.VoidDType[int]: # Numpy does not allow creating a void type @@ -114,9 +110,10 @@ def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: elif zarr_format == 3: return ( isinstance(data, dict) - and "name" in data - and isinstance(data["name"], str) - and (re.match(r"^r\d+$", data["name"]) is not None) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == cls._zarr_v3_name + and isinstance(data["configuration"], dict) + and set(data["configuration"].keys()) == {"length_bytes"} ) raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @@ -124,7 +121,7 @@ def to_json(self, zarr_format: ZarrFormat) -> JSON: if zarr_format == 2: return self.to_dtype().str elif zarr_format == 3: - return {"name": f"r{self.length * self.item_size_bits}"} + return {"name": self._zarr_v3_name, "configuration": {"length_bytes": self.length}} raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod @@ -132,7 +129,7 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: if zarr_format == 2: return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: - return cls(length=int(data["name"][1:]) // cls.item_size_bits) # type: ignore[arg-type, index, call-overload] + return cls(length=data["configuration"]["length_bytes"]) # type: ignore[arg-type, index, call-overload] raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod @@ -178,13 +175,13 @@ def _cast_value_unsafe(self, value: object) -> np.void: class FixedLengthUnicode(ZDType[np.dtypes.StrDType[int], np.str_], HasEndianness, HasLength): dtype_cls = np.dtypes.StrDType _zarr_v3_name = "numpy.fixed_length_ucs4" - item_size_bits: ClassVar[int] = 32 # UCS4 is 32 bits per code point + item_size_bytes: ClassVar[int] = 4 # UCS4 is 4 bytes per code point @classmethod def _from_dtype_unsafe(cls, 
dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls( - length=dtype.itemsize // (cls.item_size_bits // 8), + length=dtype.itemsize // (cls.item_size_bytes), endianness=endianness_from_numpy_str(byte_order), ) @@ -203,12 +200,12 @@ def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: elif zarr_format == 3: return ( isinstance(data, dict) - and "name" in data + and set(data.keys()) == {"name", "configuration"} and data["name"] == cls._zarr_v3_name and "configuration" in data and isinstance(data["configuration"], dict) - and "length_bits" in data["configuration"] - and isinstance(data["configuration"]["length_bits"], int) + and set(data["configuration"].keys()) == {"length_bytes"} + and isinstance(data["configuration"]["length_bytes"], int) ) raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @@ -218,7 +215,7 @@ def to_json(self, zarr_format: ZarrFormat) -> JSON: elif zarr_format == 3: return { "name": self._zarr_v3_name, - "configuration": {"length_bits": self.length * self.item_size_bits}, + "configuration": {"length_bytes": self.length * self.item_size_bytes}, } raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @@ -227,7 +224,7 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: if zarr_format == 2: return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: - return cls(length=data["configuration"]["length_bits"] // cls.item_size_bits) # type: ignore[arg-type, index, call-overload, operator] + return cls(length=data["configuration"]["length_bytes"] // cls.item_size_bytes) # type: ignore[arg-type, index, call-overload, operator] raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def default_value(self) -> np.str_: @@ -344,7 +341,7 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: for f_name, f_dtype in data ) ) - elif 
zarr_format == 3: # noqa: SIM102 + elif zarr_format == 3: if isinstance(data, dict) and "configuration" in data: config = data["configuration"] if isinstance(config, dict) and "fields" in config: @@ -354,6 +351,10 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: for f_name, f_dtype in meta_fields ) return cls(fields=fields) + else: + raise TypeError(f"Invalid type: {data}. Expected a dictionary.") + else: + raise TypeError(f"Invalid type: {data}. Expected a dictionary.") raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover raise DataTypeValidationError(f"Invalid JSON representation of data type {cls}.") diff --git a/src/zarr/core/dtype/npy/time.py b/src/zarr/core/dtype/npy/time.py index f691bd88c8..b8fc85b297 100644 --- a/src/zarr/core/dtype/npy/time.py +++ b/src/zarr/core/dtype/npy/time.py @@ -105,20 +105,31 @@ class TimeDTypeBase(ZDType[_BaseTimeDType_co, _BaseTimeScalar], HasEndianness): # because the particular numpy dtype we are wrapping does not allow direct construction via # cls.dtype_cls() _numpy_name: ClassVar[_DTypeName] - interval: int + scale_factor: int unit: DateTimeUnit + def __post_init__(self) -> None: + if self.scale_factor < 1: + raise ValueError(f"scale_factor must be > 0, got {self.scale_factor}.") + if self.scale_factor >= 2**31: + raise ValueError(f"scale_factor must be < 2147483648, got {self.scale_factor}.") + if self.unit not in get_args(DateTimeUnit): + raise ValueError(f"unit must be one of {get_args(DateTimeUnit)}, got {self.unit!r}.") + @classmethod def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: - unit, interval = np.datetime_data(dtype.name) + unit, scale_factor = np.datetime_data(dtype.name) + unit = cast("DateTimeUnit", unit) byteorder = cast("EndiannessNumpy", dtype.byteorder) - return cls(unit=unit, interval=interval, endianness=endianness_from_numpy_str(byteorder)) # type: ignore[arg-type] + return cls( + unit=unit, scale_factor=scale_factor, 
endianness=endianness_from_numpy_str(byteorder) + ) def to_dtype(self) -> _BaseTimeDType_co: # Numpy does not allow creating datetime64 or timedelta64 via # np.dtypes.{dtype_name}() # so we use np.dtype with a formatted string. - dtype_string = f"{self._numpy_name}[{self.interval}{self.unit}]" + dtype_string = f"{self._numpy_name}[{self.scale_factor}{self.unit}]" return np.dtype(dtype_string).newbyteorder(endianness_to_numpy_str(self.endianness)) # type: ignore[return-value] @classmethod @@ -127,8 +138,8 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: unit = data["configuration"]["unit"] # type: ignore[index, call-overload] - interval = data["configuration"]["interval"] # type: ignore[index, call-overload] - return cls(unit=unit, interval=interval) # type: ignore[arg-type] + scale_factor = data["configuration"]["scale_factor"] # type: ignore[index, call-overload] + return cls(unit=unit, scale_factor=scale_factor) # type: ignore[arg-type] raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def to_json(self, zarr_format: ZarrFormat) -> JSON: @@ -137,7 +148,7 @@ def to_json(self, zarr_format: ZarrFormat) -> JSON: elif zarr_format == 3: return { "name": self._zarr_v3_name, - "configuration": {"unit": self.unit, "interval": self.interval}, + "configuration": {"unit": self.unit, "scale_factor": self.scale_factor}, } raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @@ -166,7 +177,7 @@ class TimeDelta64(TimeDTypeBase[np.dtypes.TimeDelta64DType, np.timedelta64], Has _zarr_v3_name = "numpy.timedelta64" _zarr_v2_names = (">m8", " np.timedelta64: @@ -174,7 +185,7 @@ def default_value(self) -> np.timedelta64: def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.timedelta64: if check_json_int(data): - return self.to_dtype().type(data, f"{self.interval}{self.unit}") + return 
self.to_dtype().type(data, f"{self.scale_factor}{self.unit}") raise TypeError(f"Invalid type: {data}. Expected an integer.") def _cast_value_unsafe(self, value: object) -> np.timedelta64: @@ -202,8 +213,7 @@ def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: and data["name"] == cls._zarr_v3_name and set(data.keys()) == {"name", "configuration"} and isinstance(data["configuration"], dict) - and set(data["configuration"].keys()) == {"unit", "interval"} - and data["configuration"]["unit"] in get_args(DateTimeUnit) + and set(data["configuration"].keys()) == {"unit", "scale_factor"} ) raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @@ -215,14 +225,14 @@ class DateTime64(TimeDTypeBase[np.dtypes.DateTime64DType, np.datetime64], HasEnd _zarr_v2_names = (">M8", " np.datetime64: return np.datetime64("NaT") def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.datetime64: if check_json_int(data): - return self.to_dtype().type(data, f"{self.interval}{self.unit}") + return self.to_dtype().type(data, f"{self.scale_factor}{self.unit}") raise TypeError(f"Invalid type: {data}. 
Expected an integer.") def _cast_value_unsafe(self, value: object) -> np.datetime64: @@ -248,7 +258,7 @@ def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: isinstance(data, dict) and set(data.keys()) == {"name", "configuration"} and data["name"] == cls._zarr_v3_name - and set(data["configuration"].keys()) == {"unit", "interval"} + and set(data["configuration"].keys()) == {"unit", "scale_factor"} and data["configuration"]["unit"] in get_args(DateTimeUnit) ) raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index be51db3ae5..0600fab80b 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -22,6 +22,7 @@ from __future__ import annotations +import warnings from abc import ABC, abstractmethod from dataclasses import dataclass from typing import TYPE_CHECKING, ClassVar, Generic, Self, TypeGuard, TypeVar @@ -329,3 +330,15 @@ def from_json_value(self: Self, data: JSON, *, zarr_format: ZarrFormat) -> TScal The native scalar value. """ ... + + +def v3_unstable_dtype_warning(dtype: ZDType[TBaseDType, TBaseScalar]) -> None: + msg = ( + f"You are using a data type ({dtype}) that does not have a stable Zarr V3 specification." + "Be advised that arrays stored with this data type may be unreadable by other Zarr " + "libraries, and possibly future versions of Zarr-Python as well. " + "Use this data type at your own risk." + "See https://github.com/zarr-developers/zarr-extensions/tree/main/data-types for a list" + "of data types with a stable Zarr V3 specification." 
+ ) + warnings.warn(msg, category=FutureWarning, stacklevel=2) diff --git a/tests/test_dtype/conftest.py b/tests/test_dtype/conftest.py index 2b4bb0b685..d8ef17a039 100644 --- a/tests/test_dtype/conftest.py +++ b/tests/test_dtype/conftest.py @@ -17,7 +17,7 @@ elif issubclass(wrapper_cls, HasLength): zdtype_examples += (wrapper_cls(length=1),) elif issubclass(wrapper_cls, DateTime64 | TimeDelta64): - zdtype_examples += (wrapper_cls(unit="s", interval=10),) + zdtype_examples += (wrapper_cls(unit="s", scale_factor=10),) else: zdtype_examples += (wrapper_cls(),) diff --git a/tests/test_dtype/test_npy/test_bool.py b/tests/test_dtype/test_npy/test_bool.py index e4e5dd541e..1040683846 100644 --- a/tests/test_dtype/test_npy/test_bool.py +++ b/tests/test_dtype/test_npy/test_bool.py @@ -14,8 +14,8 @@ class TestBool(_TestZDType): np.dtype(np.float64), np.dtype(np.uint16), ) - valid_json_v2 = Bool._zarr_v2_names - valid_json_v3_cases = (Bool._zarr_v3_name,) + valid_json_v2 = ("|b1",) + valid_json_v3 = ("bool",) invalid_json_v2 = ( "|b1", "bool", @@ -26,3 +26,6 @@ class TestBool(_TestZDType): "|f8", {"name": "bool", "configuration": {"endianness": "little"}}, ) + + scalar_v2_params = (("|b1", True), ("|b1", False)) + scalar_v3_params = (("bool", True), ("bool", False)) diff --git a/tests/test_dtype/test_npy/test_complex.py b/tests/test_dtype/test_npy/test_complex.py index 6621d625d9..aac514028d 100644 --- a/tests/test_dtype/test_npy/test_complex.py +++ b/tests/test_dtype/test_npy/test_complex.py @@ -14,8 +14,8 @@ class TestComplex64(_TestZDType): np.dtype(np.float64), np.dtype(np.complex128), ) - valid_json_v2 = Complex64._zarr_v2_names - valid_json_v3_cases = (Complex64._zarr_v3_name,) + valid_json_v2 = (">c8", ">c8") + valid_json_v3 = ("complex64",) invalid_json_v2 = ( "|c8", "complex64", @@ -27,6 +27,13 @@ class TestComplex64(_TestZDType): {"name": "complex64", "configuration": {"endianness": "little"}}, ) + scalar_v2_params = ((">c8", (1.0, 1.0)), ("c8", (0, "NaN"))) + 
scalar_v3_params = ( + ("complex64", (1.0, 1.0)), + ("complex64", (-1.0, "Infinity")), + ("complex64", (0, "NaN")), + ) + class TestComplex128(_TestZDType): test_cls = Complex128 @@ -36,8 +43,8 @@ class TestComplex128(_TestZDType): np.dtype(np.float64), np.dtype(np.complex64), ) - valid_json_v2 = Complex128._zarr_v2_names - valid_json_v3_cases = (Complex128._zarr_v3_name,) + valid_json_v2 = (">c16", "c16", (1.0, 1.0)), ("c16", (0, "NaN"))) + scalar_v3_params = ( + ("complex128", (1.0, 1.0)), + ("complex128", (-1.0, "Infinity")), + ("complex128", (0, "NaN")), + ) diff --git a/tests/test_dtype/test_npy/test_float.py b/tests/test_dtype/test_npy/test_float.py index a9de0145c6..232ed1e32c 100644 --- a/tests/test_dtype/test_npy/test_float.py +++ b/tests/test_dtype/test_npy/test_float.py @@ -15,7 +15,7 @@ class TestFloat16(_TestZDType): np.dtype(np.float32), ) valid_json_v2 = Float16._zarr_v2_names - valid_json_v3_cases = (Float16._zarr_v3_name,) + valid_json_v3 = (Float16._zarr_v3_name,) invalid_json_v2 = ( "|f2", "float16", @@ -27,6 +27,14 @@ class TestFloat16(_TestZDType): {"name": "float16", "configuration": {"endianness": "little"}}, ) + scalar_v2_params = ((">f2", 1.0), ("f2", "Infinity")) + scalar_v3_params = ( + ("float16", 1.0), + ("float16", -1.0), + ("float16", "NaN"), + ("float16", "Infinity"), + ) + class TestFloat32(_TestZDType): test_cls = Float32 @@ -37,7 +45,7 @@ class TestFloat32(_TestZDType): np.dtype(np.float64), ) valid_json_v2 = Float32._zarr_v2_names - valid_json_v3_cases = (Float32._zarr_v3_name,) + valid_json_v3 = (Float32._zarr_v3_name,) invalid_json_v2 = ( "|f4", "float32", @@ -49,6 +57,14 @@ class TestFloat32(_TestZDType): {"name": "float32", "configuration": {"endianness": "little"}}, ) + scalar_v2_params = ((">f4", 1.0), ("f4", "Infinity")) + scalar_v3_params = ( + ("float32", 1.0), + ("float32", -1.0), + ("float32", "NaN"), + ("float32", "Infinity"), + ) + class TestFloat64(_TestZDType): test_cls = Float64 @@ -59,7 +75,7 @@ class 
TestFloat64(_TestZDType): np.dtype(np.float32), ) valid_json_v2 = Float64._zarr_v2_names - valid_json_v3_cases = (Float64._zarr_v3_name,) + valid_json_v3 = (Float64._zarr_v3_name,) invalid_json_v2 = ( "|f8", "float64", @@ -70,3 +86,11 @@ class TestFloat64(_TestZDType): "|i1", {"name": "float64", "configuration": {"endianness": "little"}}, ) + + scalar_v2_params = ((">f8", 1.0), ("f8", "Infinity")) + scalar_v3_params = ( + ("float64", 1.0), + ("float64", -1.0), + ("float64", "NaN"), + ("float64", "Infinity"), + ) diff --git a/tests/test_dtype/test_npy/test_int.py b/tests/test_dtype/test_npy/test_int.py index 2f149ff58f..99f698fc8e 100644 --- a/tests/test_dtype/test_npy/test_int.py +++ b/tests/test_dtype/test_npy/test_int.py @@ -14,8 +14,8 @@ class TestInt8(_TestZDType): np.dtype(np.uint16), np.dtype(np.float64), ) - valid_json_v2 = Int8._zarr_v2_names - valid_json_v3_cases = (Int8._zarr_v3_name,) + valid_json_v2 = ("|i1",) + valid_json_v3 = ("int8",) invalid_json_v2 = ( ">i1", "int8", @@ -27,6 +27,9 @@ class TestInt8(_TestZDType): {"name": "int8", "configuration": {"endianness": "little"}}, ) + scalar_v2_params = (("|i1", 1), ("|i1", -1)) + scalar_v3_params = (("int8", 1), ("int8", -1)) + class TestInt16(_TestZDType): test_cls = Int16 @@ -36,8 +39,8 @@ class TestInt16(_TestZDType): np.dtype(np.uint16), np.dtype(np.float64), ) - valid_json_v2 = Int16._zarr_v2_names - valid_json_v3_cases = (Int16._zarr_v3_name,) + valid_json_v2 = (">i2", "i2", -1)) + scalar_v3_params = (("int16", 1), ("int16", -1)) + class TestInt32(_TestZDType): test_cls = Int32 @@ -58,8 +64,8 @@ class TestInt32(_TestZDType): np.dtype(np.uint16), np.dtype(np.float64), ) - valid_json_v2 = Int32._zarr_v2_names - valid_json_v3_cases = (Int32._zarr_v3_name,) + valid_json_v2 = (">i4", "i4", -1)) + scalar_v3_params = (("int32", 1), ("int32", -1)) + class TestInt64(_TestZDType): test_cls = Int64 @@ -80,8 +89,8 @@ class TestInt64(_TestZDType): np.dtype(np.uint16), np.dtype(np.float64), ) - valid_json_v2 = 
Int64._zarr_v2_names - valid_json_v3_cases = (Int64._zarr_v3_name,) + valid_json_v2 = (">i8", "i8", -1)) + scalar_v3_params = (("int64", 1), ("int64", -1)) + class TestUInt8(_TestZDType): test_cls = UInt8 @@ -102,8 +114,8 @@ class TestUInt8(_TestZDType): np.dtype(np.int16), np.dtype(np.float64), ) - valid_json_v2 = UInt8._zarr_v2_names - valid_json_v3_cases = (UInt8._zarr_v3_name,) + valid_json_v2 = ("|u1",) + valid_json_v3 = ("uint8",) invalid_json_v2 = ( "|u1", "uint8", @@ -115,6 +127,9 @@ class TestUInt8(_TestZDType): {"name": "uint8", "configuration": {"endianness": "little"}}, ) + scalar_v2_params = (("|u1", 1), ("|u1", 0)) + scalar_v3_params = (("uint8", 1), ("uint8", 0)) + class TestUInt16(_TestZDType): test_cls = UInt16 @@ -124,8 +139,8 @@ class TestUInt16(_TestZDType): np.dtype(np.int16), np.dtype(np.float64), ) - valid_json_v2 = UInt16._zarr_v2_names - valid_json_v3_cases = (UInt16._zarr_v3_name,) + valid_json_v2 = (">u2", "u2", 0)) + scalar_v3_params = (("uint16", 1), ("uint16", 0)) + class TestUInt32(_TestZDType): test_cls = UInt32 @@ -146,8 +164,8 @@ class TestUInt32(_TestZDType): np.dtype(np.int16), np.dtype(np.float64), ) - valid_json_v2 = UInt32._zarr_v2_names - valid_json_v3_cases = (UInt32._zarr_v3_name,) + valid_json_v2 = (">u4", "u4", 0)) + scalar_v3_params = (("uint32", 1), ("uint32", 0)) + class TestUInt64(_TestZDType): test_cls = UInt64 @@ -168,8 +189,8 @@ class TestUInt64(_TestZDType): np.dtype(np.int16), np.dtype(np.float64), ) - valid_json_v2 = UInt64._zarr_v2_names - valid_json_v3_cases = (UInt64._zarr_v3_name,) + valid_json_v2 = (">u8", "u8", 0)) + scalar_v3_params = (("uint64", 1), ("uint64", 0)) diff --git a/tests/test_dtype/test_npy/test_sized.py b/tests/test_dtype/test_npy/test_sized.py index 887d734fd3..17f4b2af2d 100644 --- a/tests/test_dtype/test_npy/test_sized.py +++ b/tests/test_dtype/test_npy/test_sized.py @@ -20,9 +20,7 @@ class TestFixedLengthAscii(_TestZDType): np.dtype("|U10"), ) valid_json_v2 = ("|S0", "|S2", "|S4") - 
valid_json_v3_cases = ( - {"name": "numpy.fixed_length_ascii", "configuration": {"length_bits": 80}}, - ) + valid_json_v3 = ({"name": "numpy.fixed_length_ascii", "configuration": {"length_bytes": 10}},) invalid_json_v2 = ( "|S", "|U10", @@ -33,6 +31,13 @@ class TestFixedLengthAscii(_TestZDType): {"name": "numpy.fixed_length_ascii", "configuration": {"length_bits": "invalid"}}, ) + scalar_v2_params = (("|S0", ""), ("|S2", "YWI="), ("|S4", "YWJjZA==")) + scalar_v3_params = ( + ({"name": "numpy.fixed_length_ascii", "configuration": {"length_bytes": 0}}, ""), + ({"name": "numpy.fixed_length_ascii", "configuration": {"length_bytes": 16}}, "YWI="), + ({"name": "numpy.fixed_length_ascii", "configuration": {"length_bytes": 32}}, "YWJjZA=="), + ) + class TestFixedLengthBytes(_TestZDType): test_cls = FixedLengthBytes @@ -43,17 +48,28 @@ class TestFixedLengthBytes(_TestZDType): np.dtype("|S10"), ) valid_json_v2 = ("|V10",) - valid_json_v3_cases = ({"name": "r80"},) + valid_json_v3 = ( + {"name": "numpy.fixed_length_bytes", "configuration": {"length_bytes": 0}}, + {"name": "numpy.fixed_length_bytes", "configuration": {"length_bytes": 8}}, + ) + invalid_json_v2 = ( "|V", "|S10", "|f8", ) invalid_json_v3 = ( - {"name": "r0"}, + {"name": "r10"}, {"name": "r-80"}, ) + scalar_v2_params = (("|V0", ""), ("|V2", "YWI="), ("|V4", "YWJjZA==")) + scalar_v3_params = ( + ({"name": "numpy.fixed_length_bytes", "configuration": {"length_bytes": 2}}, ""), + ({"name": "numpy.fixed_length_bytes", "configuration": {"length_bytes": 2}}, "YWI="), + ({"name": "numpy.fixed_length_bytes", "configuration": {"length_bytes": 4}}, "YWJjZA=="), + ) + class TestFixedLengthUnicode(_TestZDType): test_cls = FixedLengthUnicode @@ -64,9 +80,7 @@ class TestFixedLengthUnicode(_TestZDType): np.dtype("|S10"), ) valid_json_v2 = (">U10", "U0", ""), ("i4"), ("field2", ">f8")], [("field1", ">i8"), ("field2", ">i4")], ) - valid_json_v3_cases = ( + valid_json_v3 = ( { "name": "structured", "configuration": { "fields": [ - 
("field1", {"name": "int32", "configuration": {"endianness": "big"}}), - ("field2", {"name": "float64", "configuration": {"endianness": "big"}}), + ("field1", "int32"), + ("field2", "float64"), ] }, }, @@ -107,8 +128,17 @@ class TestStructured(_TestZDType): "name": "structured", "configuration": { "fields": [ - ("field1", {"name": "int64", "configuration": {"endianness": "big"}}), - ("field2", {"name": "int32", "configuration": {"endianness": "big"}}), + ( + "field1", + { + "name": "numpy.datetime64", + "configuration": {"unit": "s", "scale_factor": 1}, + }, + ), + ( + "field2", + {"name": "numpy.fixed_length_ucs4", "configuration": {"length_bytes": 32}}, + ), ] }, }, diff --git a/tests/test_dtype/test_npy/test_string.py b/tests/test_dtype/test_npy/test_string.py index 2f77379f01..fbb0aaa86d 100644 --- a/tests/test_dtype/test_npy/test_string.py +++ b/tests/test_dtype/test_npy/test_string.py @@ -16,7 +16,7 @@ class TestVariableLengthString(_TestZDType): np.dtype("|S10"), ) valid_json_v2 = ("|O",) - valid_json_v3_cases = ({"name": "numpy.variable_length_utf8"},) + valid_json_v3 = ("numpy.variable_length_utf8",) invalid_json_v2 = ( "|S10", "|f8", @@ -38,7 +38,7 @@ class TestVariableLengthString(_TestZDType): np.dtype("|S10"), ) valid_json_v2 = ("|O",) - valid_json_v3_cases = ({"name": "numpy.variable_length_utf8"},) + valid_json_v3 = ("numpy.variable_length_utf8",) invalid_json_v2 = ( "|S10", "|f8", diff --git a/tests/test_dtype/test_npy/test_time.py b/tests/test_dtype/test_npy/test_time.py index a5d2cce545..2a8ff6ac98 100644 --- a/tests/test_dtype/test_npy/test_time.py +++ b/tests/test_dtype/test_npy/test_time.py @@ -1,6 +1,9 @@ from __future__ import annotations +import re + import numpy as np +import pytest from tests.test_dtype.test_wrapper import _TestZDType from zarr.core.dtype.npy.time import DateTime64, TimeDelta64 @@ -15,9 +18,9 @@ class TestDateTime64(_TestZDType): np.dtype("timedelta64[ns]"), ) valid_json_v2 = (">M8", ">M8[s]", " None: + """ + Test that an 
invalid unit raises a ValueError. + """ + unit = "invalid" + msg = f"unit must be one of ('Y', 'M', 'W', 'D', 'h', 'm', 's', 'ms', 'us', 'μs', 'ns', 'ps', 'fs', 'as', 'generic'), got {unit!r}." + with pytest.raises(ValueError, match=re.escape(msg)): + DateTime64(unit=unit) # type: ignore[arg-type] + with pytest.raises(ValueError, match=re.escape(msg)): + TimeDelta64(unit=unit) # type: ignore[arg-type] + + +def test_time_scale_factor_too_low() -> None: + """ + Test that an invalid unit raises a ValueError. + """ + scale_factor = 0 + msg = f"scale_factor must be > 0, got {scale_factor}." + with pytest.raises(ValueError, match=msg): + DateTime64(scale_factor=scale_factor) + with pytest.raises(ValueError, match=msg): + TimeDelta64(scale_factor=scale_factor) + + +def test_time_scale_factor_too_high() -> None: + """ + Test that an invalid unit raises a ValueError. + """ + scale_factor = 2**31 + msg = f"scale_factor must be < 2147483648, got {scale_factor}." + with pytest.raises(ValueError, match=msg): + DateTime64(scale_factor=scale_factor) + with pytest.raises(ValueError, match=msg): + TimeDelta64(scale_factor=scale_factor) diff --git a/tests/test_dtype/test_wrapper.py b/tests/test_dtype/test_wrapper.py index bbe74d9a0f..49e05340e0 100644 --- a/tests/test_dtype/test_wrapper.py +++ b/tests/test_dtype/test_wrapper.py @@ -1,34 +1,9 @@ from __future__ import annotations -from typing import Any, ClassVar +from typing import TYPE_CHECKING, Any, ClassVar -import hypothesis.strategies as st -import numpy as np -from hypothesis.extra import numpy as npst - -from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType - - -def all_dtypes() -> st.SearchStrategy[np.dtype[np.generic]]: - return ( - npst.boolean_dtypes() - | npst.integer_dtypes(endianness="=") - | npst.unsigned_integer_dtypes(endianness="=") - | npst.floating_dtypes(endianness="=") - | npst.complex_number_dtypes(endianness="=") - | npst.byte_string_dtypes(endianness="=") - | 
npst.unicode_string_dtypes(endianness="=") - | npst.datetime64_dtypes(endianness="=") - | npst.timedelta64_dtypes(endianness="=") - ) - - -def get_classvar_attributes(cls: type) -> dict[str, Any]: - classvar_attributes = {} - for name, annotation in cls.__annotations__.items(): - if getattr(annotation, "__origin__", None) is ClassVar: - classvar_attributes[name] = getattr(cls, name) - return classvar_attributes +if TYPE_CHECKING: + from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType class _TestZDType: @@ -43,6 +18,13 @@ class _TestZDType: valid_json_v3: ClassVar[tuple[str | dict[str, object], ...]] = () invalid_json_v3: ClassVar[tuple[str | dict[str, object], ...]] = () + # for testing scalar round-trip serialization, we need a tuple of (data type json, scalar json) + # pairs. the first element of the pair is used to create a dtype instance, and the second + # element is the json serialization of the scalar that we want to round-trip. + + scalar_v2_params: ClassVar[tuple[tuple[Any, Any], ...]] = () + scalar_v3_params: ClassVar[tuple[tuple[Any, Any], ...]] = () + def test_check_dtype_valid(self, valid_dtype: object) -> None: assert self.test_cls.check_dtype(valid_dtype) # type: ignore[arg-type] @@ -61,6 +43,26 @@ def test_from_json_roundtrip_v3(self, valid_json_v3: Any) -> None: zdtype = self.test_cls.from_json(valid_json_v3, zarr_format=3) assert zdtype.to_json(zarr_format=3) == valid_json_v3 + def test_scalar_roundtrip_v2(self, scalar_v2_params: Any) -> None: + dtype_json, scalar_json = scalar_v2_params + zdtype = self.test_cls.from_json(dtype_json, zarr_format=2) + scalar = zdtype.from_json_value(scalar_json, zarr_format=2) + assert self._scalar_equals(scalar_json, zdtype.to_json_value(scalar, zarr_format=2)) + + def test_scalar_roundtrip_v3(self, scalar_v3_params: Any) -> None: + dtype_json, scalar_json = scalar_v3_params + zdtype = self.test_cls.from_json(dtype_json, zarr_format=3) + scalar = zdtype.from_json_value(scalar_json, zarr_format=3) + 
assert self._scalar_equals(scalar_json, zdtype.to_json_value(scalar, zarr_format=3)) + + @staticmethod + def _scalar_equals(a: object, b: object) -> bool: + """ + Compare two scalars for equality. Subclasses that test dtypes with scalars that don't allow + simple equality like nans should override this method. + """ + return a == b + """ @abc.abstractmethod def test_cast_value(self, value: Any) -> None: raise NotImplementedError From aa11df4425d7b3ee6f5af397f774db564b78b5a8 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 12 May 2025 13:37:52 +0200 Subject: [PATCH 078/130] wip: json schema test --- tests/test_dtype/conftest.py | 10 +++++- tests/test_dtype/test_wrapper.py | 61 ++++++++++---------------------- 2 files changed, 28 insertions(+), 43 deletions(-) diff --git a/tests/test_dtype/conftest.py b/tests/test_dtype/conftest.py index d8ef17a039..9c7825c0d1 100644 --- a/tests/test_dtype/conftest.py +++ b/tests/test_dtype/conftest.py @@ -53,6 +53,14 @@ class TestB(TestExample): param_a = [1, 2, 100, 10] """ + # Iterate over all the fixtures defined in the class + # and parametrize them with the values defined in the class + # This allows us to define class-scoped fixtures as class attributes + # and then generate the parametrize calls for pytest for fixture_name in metafunc.fixturenames: if hasattr(metafunc.cls, fixture_name): - metafunc.parametrize(fixture_name, getattr(metafunc.cls, fixture_name), scope="class") + params = getattr(metafunc.cls, fixture_name) + if len(params) == 0: + msg = f"{metafunc.cls}.{fixture_name} is empty. Please provide a non-empty sequence of values." 
+ raise ValueError(msg) + metafunc.parametrize(fixture_name, params, scope="class") diff --git a/tests/test_dtype/test_wrapper.py b/tests/test_dtype/test_wrapper.py index 49e05340e0..defd3fffc5 100644 --- a/tests/test_dtype/test_wrapper.py +++ b/tests/test_dtype/test_wrapper.py @@ -5,6 +5,23 @@ if TYPE_CHECKING: from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType +import pytest +import requests + + +class _TestZDTypeSchema: + # subclasses define the URL for the schema, if available + schema_url: ClassVar[str] = "" + + @pytest.fixture(scope="class") + def get_schema(self) -> object: + response = requests.get(self.schema_url) + response.raise_for_status() + return json_schema.loads(response.text) + + def test_schema(self, schema: json_schema.Schema) -> None: + assert schema.is_valid(self.test_cls.to_json(zarr_format=2)) + class _TestZDType: test_cls: type[ZDType[TBaseDType, TBaseScalar]] @@ -47,50 +64,10 @@ def test_scalar_roundtrip_v2(self, scalar_v2_params: Any) -> None: dtype_json, scalar_json = scalar_v2_params zdtype = self.test_cls.from_json(dtype_json, zarr_format=2) scalar = zdtype.from_json_value(scalar_json, zarr_format=2) - assert self._scalar_equals(scalar_json, zdtype.to_json_value(scalar, zarr_format=2)) + assert scalar_json == zdtype.to_json_value(scalar, zarr_format=2) def test_scalar_roundtrip_v3(self, scalar_v3_params: Any) -> None: dtype_json, scalar_json = scalar_v3_params zdtype = self.test_cls.from_json(dtype_json, zarr_format=3) scalar = zdtype.from_json_value(scalar_json, zarr_format=3) - assert self._scalar_equals(scalar_json, zdtype.to_json_value(scalar, zarr_format=3)) - - @staticmethod - def _scalar_equals(a: object, b: object) -> bool: - """ - Compare two scalars for equality. Subclasses that test dtypes with scalars that don't allow - simple equality like nans should override this method. 
- """ - return a == b - - """ @abc.abstractmethod - def test_cast_value(self, value: Any) -> None: - raise NotImplementedError - - @abc.abstractmethod - def test_check_value(self) -> None: - raise NotImplementedError - - @abc.abstractmethod - def test_default_value(self) -> None: - raise NotImplementedError - - @abc.abstractmethod - def test_check_json(self, value: Any) -> None: - raise NotImplementedError - - @abc.abstractmethod - def test_from_json_roundtrip_v2(self, value: Any) -> None: - raise NotImplementedError - - @abc.abstractmethod - def test_from_json_roundtrip_v3(self, value: Any) -> None: - raise NotImplementedError - - @abc.abstractmethod - def test_from_json_value_roundtrip_v2(self, value: Any) -> None: - raise NotImplementedError - - @abc.abstractmethod - def test_from_json_value_roundtrip_v3(self, value: Any) -> None: - raise NotImplementedError """ + assert scalar_json == zdtype.to_json_value(scalar, zarr_format=3) From 52518c24fa96a4532e62b4d996d24540e6bc4e63 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 13 May 2025 13:04:20 +0200 Subject: [PATCH 079/130] add casting tests --- src/zarr/core/dtype/npy/bool.py | 5 +- src/zarr/core/dtype/npy/complex.py | 4 +- src/zarr/core/dtype/npy/float.py | 4 +- src/zarr/core/dtype/npy/int.py | 4 +- src/zarr/core/dtype/npy/sized.py | 37 ++-- src/zarr/core/dtype/npy/time.py | 16 +- src/zarr/core/dtype/wrapper.py | 8 +- tests/test_dtype/test_dtype.py | 248 ---------------------- tests/test_dtype/test_npy/test_bool.py | 13 +- tests/test_dtype/test_npy/test_complex.py | 49 ++++- tests/test_dtype/test_npy/test_float.py | 76 +++++-- tests/test_dtype/test_npy/test_int.py | 72 +++++-- tests/test_dtype/test_npy/test_sized.py | 79 +++++-- tests/test_dtype/test_npy/test_string.py | 30 ++- tests/test_dtype/test_npy/test_time.py | 63 +++++- tests/test_dtype/test_wrapper.py | 38 ++-- tests/test_properties.py | 1 + 17 files changed, 381 insertions(+), 366 deletions(-) diff --git a/src/zarr/core/dtype/npy/bool.py 
b/src/zarr/core/dtype/npy/bool.py index 776acf4f8c..c80033c54e 100644 --- a/src/zarr/core/dtype/npy/bool.py +++ b/src/zarr/core/dtype/npy/bool.py @@ -101,14 +101,11 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bool_: """ if check_json_bool(data): return self._cast_value_unsafe(data) - raise TypeError(f"Invalid type: {data}. Expected a boolean.") + raise TypeError(f"Invalid type: {data}. Expected a boolean.") # pragma: no cover def check_value(self, data: object) -> bool: # Anything can become a bool return True - def cast_value(self, value: object) -> np.bool_: - return self._cast_value_unsafe(value) - def _cast_value_unsafe(self, value: object) -> np.bool_: return np.bool_(value) diff --git a/src/zarr/core/dtype/npy/complex.py b/src/zarr/core/dtype/npy/complex.py index 6e19266660..fab4ca9893 100644 --- a/src/zarr/core/dtype/npy/complex.py +++ b/src/zarr/core/dtype/npy/complex.py @@ -84,9 +84,7 @@ def check_value(self, value: object) -> bool: return isinstance(value, ComplexLike) def _cast_value_unsafe(self, value: object) -> TComplexScalar_co: - if self.check_value(value): - return self.to_dtype().type(value) # type: ignore[arg-type, return-value] - raise TypeError(f"Invalid type: {value}. Expected a value castable to a complex scalar.") + return self.to_dtype().type(value) # type: ignore[arg-type, return-value] def default_value(self) -> TComplexScalar_co: """ diff --git a/src/zarr/core/dtype/npy/float.py b/src/zarr/core/dtype/npy/float.py index 15baaaadaa..bedd6a4751 100644 --- a/src/zarr/core/dtype/npy/float.py +++ b/src/zarr/core/dtype/npy/float.py @@ -76,9 +76,7 @@ def check_value(self, value: object) -> TypeGuard[FloatLike]: return isinstance(value, FloatLike) def _cast_value_unsafe(self, value: object) -> TFloatScalar_co: - if self.check_value(value): - return self.to_dtype().type(value) # type: ignore[return-value] - raise TypeError(f"Invalid type: {value}. 
Expected a value castable to a float.") + return self.to_dtype().type(value) # type: ignore[return-value, arg-type] def default_value(self) -> TFloatScalar_co: """ diff --git a/src/zarr/core/dtype/npy/int.py b/src/zarr/core/dtype/npy/int.py index 7da7245162..78d9499243 100644 --- a/src/zarr/core/dtype/npy/int.py +++ b/src/zarr/core/dtype/npy/int.py @@ -71,9 +71,7 @@ def check_value(self, value: object) -> TypeGuard[IntLike]: return isinstance(value, IntLike) def _cast_value_unsafe(self, value: object) -> TIntScalar_co: - if self.check_value(value): - return self.to_dtype().type(value) # type: ignore[return-value] - raise TypeError(f"Invalid type: {value}. Expected a value castable to an integer.") + return self.to_dtype().type(value) # type: ignore[return-value, arg-type] def default_value(self) -> TIntScalar_co: """ diff --git a/src/zarr/core/dtype/npy/sized.py b/src/zarr/core/dtype/npy/sized.py index 032a1ec5c0..281c634856 100644 --- a/src/zarr/core/dtype/npy/sized.py +++ b/src/zarr/core/dtype/npy/sized.py @@ -76,7 +76,7 @@ def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: if check_json_str(data): return self.to_dtype().type(base64.standard_b64decode(data.encode("ascii"))) - raise TypeError(f"Invalid type: {data}. Expected a string.") + raise TypeError(f"Invalid type: {data}. Expected a string.") # pragma: no cover def check_value(self, data: object) -> bool: return isinstance(data, np.bytes_ | str | bytes) @@ -162,7 +162,7 @@ def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: if check_json_str(data): return self.to_dtype().type(base64.standard_b64decode(data)) - raise DataTypeValidationError(f"Invalid type: {data}. Expected a string.") + raise TypeError(f"Invalid type: {data}. 
Expected a string.") # pragma: no cover def check_value(self, data: object) -> bool: return isinstance(data, np.bytes_ | str | bytes | np.void) @@ -234,9 +234,9 @@ def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: return str(data) def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_: - if not check_json_str(data): - raise TypeError(f"Invalid type: {data}. Expected a string.") - return self.to_dtype().type(data) + if check_json_str(data): + return self.to_dtype().type(data) + raise TypeError(f"Invalid type: {data}. Expected a string.") # pragma: no cover def check_value(self, data: object) -> bool: return isinstance(data, str | np.str_ | bytes) @@ -332,6 +332,7 @@ def check_json( def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: from zarr.core.dtype import get_data_type_from_json + # This is a horrible mess, because this data type is recursive if cls.check_json(data, zarr_format=zarr_format): if zarr_format == 2: # structured dtypes are constructed directly from a list of lists @@ -352,9 +353,13 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: ) return cls(fields=fields) else: - raise TypeError(f"Invalid type: {data}. Expected a dictionary.") + raise TypeError( + f"Invalid type: {data}. Expected a dictionary." + ) # pragma: no cover else: - raise TypeError(f"Invalid type: {data}. Expected a dictionary.") + raise TypeError( + f"Invalid type: {data}. Expected a dictionary." + ) # pragma: no cover raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover raise DataTypeValidationError(f"Invalid JSON representation of data type {cls}.") @@ -368,16 +373,12 @@ def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: return bytes_to_json(self.cast_value(data).tobytes(), zarr_format) def check_value(self, data: object) -> bool: - # not sure which values we should accept for structured dtypes. 
- try: - np.array([data], dtype=self.to_dtype()) - return True # noqa: TRY300 - except ValueError: - return False + # TODO: implement something here! + return True def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: - if not check_json_str(data): - raise TypeError(f"Invalid type: {data}. Expected a string.") - as_bytes = bytes_from_json(data, zarr_format=zarr_format) - dtype = self.to_dtype() - return cast("np.void", np.array([as_bytes], dtype=dtype.str).view(dtype)[0]) + if check_json_str(data): + as_bytes = bytes_from_json(data, zarr_format=zarr_format) + dtype = self.to_dtype() + return cast("np.void", np.array([as_bytes]).view(dtype)[0]) + raise TypeError(f"Invalid type: {data}. Expected a string.") # pragma: no cover diff --git a/src/zarr/core/dtype/npy/time.py b/src/zarr/core/dtype/npy/time.py index b8fc85b297..bbdd41d13f 100644 --- a/src/zarr/core/dtype/npy/time.py +++ b/src/zarr/core/dtype/npy/time.py @@ -33,7 +33,7 @@ _DTypeName = Literal["datetime64", "timedelta64"] -def datetime_from_int(data: int, *, unit: DateTimeUnit, interval: int) -> np.datetime64: +def datetime_from_int(data: int, *, unit: DateTimeUnit, scale_factor: int) -> np.datetime64: """ Convert an integer to a datetime64. @@ -43,15 +43,15 @@ def datetime_from_int(data: int, *, unit: DateTimeUnit, interval: int) -> np.dat The integer to convert. unit : DateTimeUnit The unit of the datetime64. - interval : int - The interval of the datetime64. + scale_factor : int + The scale factor of the datetime64. Returns ------- np.datetime64 The datetime64 value. 
""" - dtype_name = f"datetime64[{interval}{unit}]" + dtype_name = f"datetime64[{scale_factor}{unit}]" return cast("np.datetime64", np.int64(data).view(dtype_name)) @@ -184,9 +184,9 @@ def default_value(self) -> np.timedelta64: return np.timedelta64("NaT") def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.timedelta64: - if check_json_int(data): + if check_json_int(data) or data == "NaT": return self.to_dtype().type(data, f"{self.scale_factor}{self.unit}") - raise TypeError(f"Invalid type: {data}. Expected an integer.") + raise TypeError(f"Invalid type: {data}. Expected an integer.") # pragma: no cover def _cast_value_unsafe(self, value: object) -> np.timedelta64: return self.to_dtype().type(value) # type: ignore[arg-type] @@ -231,9 +231,9 @@ def default_value(self) -> np.datetime64: return np.datetime64("NaT") def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.datetime64: - if check_json_int(data): + if check_json_int(data) or data == "NaT": return self.to_dtype().type(data, f"{self.scale_factor}{self.unit}") - raise TypeError(f"Invalid type: {data}. Expected an integer.") + raise TypeError(f"Invalid type: {data}. Expected an integer.") # pragma: no cover def _cast_value_unsafe(self, value: object) -> np.datetime64: return self.to_dtype().type(value) # type: ignore[no-any-return, call-overload] diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index 0600fab80b..199cbda5d8 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -159,7 +159,13 @@ def cast_value(self, data: object) -> TScalar_co: """ if self.check_value(data): return self._cast_value_unsafe(data) - raise TypeError(f"Invalid value: {data}") + msg = ( + f"The value {data} failed a type check." + f"It cannot be safely cast to a scalar compatible with {self.dtype_cls}." + f"Consult the documentation for {self} to determine the possible values that can" + "be cast to scalars of the wrapped data type." 
+ ) + raise TypeError(msg) @abstractmethod def check_value(self, data: object) -> bool: diff --git a/tests/test_dtype/test_dtype.py b/tests/test_dtype/test_dtype.py index 566a04b5fb..e69de29bb2 100644 --- a/tests/test_dtype/test_dtype.py +++ b/tests/test_dtype/test_dtype.py @@ -1,248 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING, Any, get_args - -from zarr.core.dtype import ( - DTYPE, - Bool, - Complex64, - Complex128, - DateTime64, - FixedLengthAscii, - FixedLengthBytes, - FixedLengthUnicode, - Float16, - Float32, - Float64, - Int8, - Int16, - Int32, - Int64, - Structured, - UInt8, - UInt16, - UInt32, - UInt64, - VariableLengthString, - ZDType, -) - -from .conftest import zdtype_examples - -if TYPE_CHECKING: - from zarr.core.common import ZarrFormat - from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar - -import numpy as np -import pytest - -from zarr.core.dtype.common import DataTypeValidationError - -_NUMPY_SUPPORTS_VLEN_STRING = hasattr(np.dtypes, "StringDType") -VLEN_STRING_DTYPE: np.dtypes.StringDType | np.dtypes.ObjectDType -if _NUMPY_SUPPORTS_VLEN_STRING: - VLEN_STRING_DTYPE = np.dtypes.StringDType() - VLEN_STRING_CODE = "T" -else: - VLEN_STRING_DTYPE = np.dtypes.ObjectDType() - VLEN_STRING_CODE = "O" - - -def test_zdtype_examples() -> None: - """ - Test that all the elements of the exported union type DTYPE have an example in the variable - zdtype_examples, which we use for testing. - - If this test fails, that means that either there is a data type that does not have an example, - or there is a data type that is missing from the DTYPE union type. 
- """ - assert set(map(type, zdtype_examples)) == set(get_args(DTYPE)) - - -@pytest.mark.parametrize( - ("wrapper_cls", "np_dtype"), - [ - (Bool, "bool"), - (Int8, "int8"), - (Int16, "int16"), - (Int32, "int32"), - (Int64, "int64"), - (UInt8, "uint8"), - (UInt16, "uint16"), - (UInt32, "uint32"), - (UInt64, "uint64"), - (Float32, "float32"), - (Float64, "float64"), - (Complex64, "complex64"), - (Complex128, "complex128"), - (FixedLengthUnicode, "U"), - (FixedLengthAscii, "S"), - (FixedLengthBytes, "V"), - (VariableLengthString, VLEN_STRING_CODE), - (Structured, np.dtype([("a", np.float64), ("b", np.int8)])), - (DateTime64, "datetime64[s]"), - ], -) -def test_wrap(wrapper_cls: type[ZDType[Any, Any]], np_dtype: np.dtype[np.generic] | str) -> None: - """ - Test that the wrapper class has the correct dtype class bound to the dtype_cls variable - Test that the ``wrap`` method produces an instance of the wrapper class - Test that the ``unwrap`` method returns the original dtype - """ - dt = np.dtype(np_dtype) - assert wrapper_cls.dtype_cls is type(dt) - wrapped = wrapper_cls.from_dtype(dt) - - with pytest.raises(DataTypeValidationError, match="Invalid dtype"): - wrapper_cls.from_dtype("not a dtype") # type: ignore[arg-type] - assert isinstance(wrapped, wrapper_cls) - assert wrapped.to_dtype() == dt - - -@pytest.mark.parametrize("zdtype", zdtype_examples) -def test_to_json_roundtrip(zdtype: ZDType[Any, Any], zarr_format: ZarrFormat) -> None: - """ - Test that a zdtype instance can round-trip through its JSON form - """ - as_dict = zdtype.to_json(zarr_format=zarr_format) - assert zdtype.from_json(as_dict, zarr_format=zarr_format) == zdtype - - -@pytest.mark.parametrize( - ("wrapper", "expected_default"), - [ - (Bool(), np.False_), - (Int8(), np.int8(0)), - (UInt8(), np.uint8(0)), - (Int16(), np.int16(0)), - (UInt16(), np.uint16(0)), - (Int32(), np.int32(0)), - (UInt32(), np.uint32(0)), - (Int64(), np.int64(0)), - (UInt64(), np.uint64(0)), - (Float16(), np.float16(0)), - 
(Float32(), np.float32(0)), - (Float64(), np.float64(0)), - (Complex64(), np.complex64(0)), - (Complex128(), np.complex128(0)), - (FixedLengthAscii(length=3), np.bytes_(b"")), - (FixedLengthBytes(length=3), np.void(b"\x00\x00\x00")), - (FixedLengthUnicode(length=3), np.str_("")), - ( - Structured(fields=(("a", Float64()), ("b", Int8()))), - np.array([0], dtype=[("a", np.float64), ("b", np.int8)])[0], - ), - (VariableLengthString(), ""), - (DateTime64(unit="s"), np.datetime64("NaT")), - ], -) -def test_default_value(wrapper: ZDType[Any, Any], expected_default: Any) -> None: - """ - Test that the default_value method is correctly set for each dtype wrapper. - """ - if isinstance(wrapper, DateTime64): - assert np.isnan(wrapper.default_value()) - else: - assert wrapper.default_value() == expected_default - - -@pytest.mark.parametrize( - ("wrapper", "input_value", "expected_json"), - [ - (Bool(), np.bool_(True), True), - (Int8(), np.int8(42), 42), - (UInt8(), np.uint8(42), 42), - (Int16(), np.int16(42), 42), - (UInt16(), np.uint16(42), 42), - (Int32(), np.int32(42), 42), - (UInt32(), np.uint32(42), 42), - (Int64(), np.int64(42), 42), - (UInt64(), np.uint64(42), 42), - (Float16(), np.float16(42.0), 42.0), - (Float32(), np.float32(42.0), 42.0), - (Float64(), np.float64(42.0), 42.0), - (Complex64(), np.complex64(42.0 + 1.0j), (42.0, 1.0)), - (Complex128(), np.complex128(42.0 + 1.0j), (42.0, 1.0)), - (FixedLengthAscii(length=4), np.bytes_(b"test"), "dGVzdA=="), - (FixedLengthBytes(length=4), np.void(b"test"), "dGVzdA=="), - (FixedLengthUnicode(length=4), np.str_("test"), "test"), - (VariableLengthString(), "test", "test"), - (DateTime64(unit="s"), np.datetime64("2021-01-01T00:00:00", "s"), 1609459200), - ], -) -def test_to_json_value_v2( - wrapper: ZDType[TBaseDType, TBaseScalar], input_value: Any, expected_json: Any -) -> None: - """ - Test the to_json_value method for each dtype wrapper for zarr v2 - """ - assert wrapper.to_json_value(input_value, zarr_format=2) == 
expected_json - - -# NOTE! This test is currently a direct copy of the v2 version. When or if we change JSON serialization -# in a v3-specific manner, this test must be changed. -# TODO: Apply zarr-v3-specific changes to this test as needed -@pytest.mark.parametrize( - ("wrapper", "input_value", "expected_json"), - [ - (Bool(), np.bool_(True), True), - (Int8(), np.int8(42), 42), - (UInt8(), np.uint8(42), 42), - (Int16(), np.int16(42), 42), - (UInt16(), np.uint16(42), 42), - (Int32(), np.int32(42), 42), - (UInt32(), np.uint32(42), 42), - (Int64(), np.int64(42), 42), - (UInt64(), np.uint64(42), 42), - (Float16(), np.float16(42.0), 42.0), - (Float32(), np.float32(42.0), 42.0), - (Float64(), np.float64(42.0), 42.0), - (Complex64(), np.complex64(42.0 + 1.0j), (42.0, 1.0)), - (Complex128(), np.complex128(42.0 + 1.0j), (42.0, 1.0)), - (FixedLengthAscii(length=4), np.bytes_(b"test"), "dGVzdA=="), - (FixedLengthBytes(length=4), np.void(b"test"), "dGVzdA=="), - (FixedLengthUnicode(length=4), np.str_("test"), "test"), - (VariableLengthString(), "test", "test"), - (DateTime64(unit="s"), np.datetime64("2021-01-01T00:00:00", "s"), 1609459200), - ], -) -def test_to_json_value_v3( - wrapper: ZDType[TBaseDType, TBaseScalar], input_value: Any, expected_json: Any -) -> None: - """ - Test the to_json_value method for each dtype wrapper for zarr v3 - """ - assert wrapper.to_json_value(input_value, zarr_format=3) == expected_json - - -@pytest.mark.parametrize( - ("wrapper", "json_value", "expected_value"), - [ - (Bool(), True, np.bool_(True)), - (Int8(), 42, np.int8(42)), - (UInt8(), 42, np.uint8(42)), - (Int16(), 42, np.int16(42)), - (UInt16(), 42, np.uint16(42)), - (Int32(), 42, np.int32(42)), - (UInt32(), 42, np.uint32(42)), - (Int64(), 42, np.int64(42)), - (UInt64(), 42, np.uint64(42)), - (Float16(), 42.0, np.float16(42.0)), - (Float32(), 42.0, np.float32(42.0)), - (Float64(), 42.0, np.float64(42.0)), - (Complex64(), (42.0, 1.0), np.complex64(42.0 + 1.0j)), - (Complex128(), (42.0, 
1.0), np.complex128(42.0 + 1.0j)), - (FixedLengthAscii(length=4), "dGVzdA==", np.bytes_(b"test")), - (FixedLengthBytes(length=4), "dGVzdA==", np.void(b"test")), - (FixedLengthUnicode(length=4), "test", np.str_("test")), - (VariableLengthString(), "test", "test"), - (DateTime64(unit="s"), 1609459200, np.datetime64("2021-01-01T00:00:00", "s")), - ], -) -def test_from_json_value( - wrapper: ZDType[TBaseDType, TBaseScalar], json_value: Any, expected_value: Any -) -> None: - """ - Test the from_json_value method for each dtype wrapper. - """ - assert wrapper.from_json_value(json_value, zarr_format=2) == expected_value diff --git a/tests/test_dtype/test_npy/test_bool.py b/tests/test_dtype/test_npy/test_bool.py index 1040683846..086a2cfee8 100644 --- a/tests/test_dtype/test_npy/test_bool.py +++ b/tests/test_dtype/test_npy/test_bool.py @@ -8,6 +8,7 @@ class TestBool(_TestZDType): test_cls = Bool + valid_dtype = (np.dtype(np.bool_),) invalid_dtype = ( np.dtype(np.int8), @@ -27,5 +28,13 @@ class TestBool(_TestZDType): {"name": "bool", "configuration": {"endianness": "little"}}, ) - scalar_v2_params = (("|b1", True), ("|b1", False)) - scalar_v3_params = (("bool", True), ("bool", False)) + scalar_v2_params = ((Bool(), True), (Bool(), False)) + scalar_v3_params = ((Bool(), True), (Bool(), False)) + + cast_value_params = ( + (Bool(), "true", np.True_), + (Bool(), True, np.True_), + (Bool(), False, np.False_), + (Bool(), np.True_, np.True_), + (Bool(), np.False_, np.False_), + ) diff --git a/tests/test_dtype/test_npy/test_complex.py b/tests/test_dtype/test_npy/test_complex.py index aac514028d..b24bc4d7c8 100644 --- a/tests/test_dtype/test_npy/test_complex.py +++ b/tests/test_dtype/test_npy/test_complex.py @@ -1,12 +1,21 @@ from __future__ import annotations +import math + import numpy as np from tests.test_dtype.test_wrapper import _TestZDType from zarr.core.dtype.npy.complex import Complex64, Complex128 -class TestComplex64(_TestZDType): +class _BaseTestFloat(_TestZDType): + def 
scalar_equals(self, scalar1: object, scalar2: object) -> bool: + if np.isnan(scalar1) and np.isnan(scalar2): # type: ignore[call-overload] + return True + return super().scalar_equals(scalar1, scalar2) + + +class TestComplex64(_BaseTestFloat): test_cls = Complex64 valid_dtype = (np.dtype(">c8"), np.dtype("c8", ">c8") + valid_json_v2 = (">c8", "c8", (1.0, 1.0)), ("c8", (0, "NaN"))) + scalar_v2_params = ( + (Complex64(), (1.0, 1.0)), + (Complex64(), (-1.0, "Infinity")), + (Complex64(), (0, "NaN")), + ) scalar_v3_params = ( - ("complex64", (1.0, 1.0)), - ("complex64", (-1.0, "Infinity")), - ("complex64", (0, "NaN")), + (Complex64(), (1.0, 1.0)), + (Complex64(), (-1.0, "Infinity")), + (Complex64(), (0, "NaN")), + ) + cast_value_params = ( + (Complex64(), complex(1.0, 1.0), np.complex64(complex(1.0, 1.0))), + (Complex64(), complex(-1.0, math.inf), np.complex64(complex(-1.0, math.inf))), + (Complex64(), complex(0, math.nan), np.complex64(complex(0, math.nan))), ) -class TestComplex128(_TestZDType): +class TestComplex128(_BaseTestFloat): test_cls = Complex128 valid_dtype = (np.dtype(">c16"), np.dtype("c16", (1.0, 1.0)), ("c16", (0, "NaN"))) + scalar_v2_params = ( + (Complex128(), (1.0, 1.0)), + (Complex128(), (-1.0, "Infinity")), + (Complex128(), (0, "NaN")), + ) scalar_v3_params = ( - ("complex128", (1.0, 1.0)), - ("complex128", (-1.0, "Infinity")), - ("complex128", (0, "NaN")), + (Complex128(), (1.0, 1.0)), + (Complex128(), (-1.0, "Infinity")), + (Complex128(), (0, "NaN")), + ) + cast_value_params = ( + (Complex128(), complex(1.0, 1.0), np.complex128(complex(1.0, 1.0))), + (Complex128(), complex(-1.0, math.inf), np.complex128(complex(-1.0, math.inf))), + (Complex128(), complex(0, math.nan), np.complex128(complex(0, math.nan))), ) diff --git a/tests/test_dtype/test_npy/test_float.py b/tests/test_dtype/test_npy/test_float.py index 232ed1e32c..5981d09514 100644 --- a/tests/test_dtype/test_npy/test_float.py +++ b/tests/test_dtype/test_npy/test_float.py @@ -6,7 +6,14 @@ from 
zarr.core.dtype.npy.float import Float16, Float32, Float64 -class TestFloat16(_TestZDType): +class _BaseTestFloat(_TestZDType): + def scalar_equals(self, scalar1: object, scalar2: object) -> bool: + if np.isnan(scalar1) and np.isnan(scalar2): # type: ignore[call-overload] + return True + return super().scalar_equals(scalar1, scalar2) + + +class TestFloat16(_BaseTestFloat): test_cls = Float16 valid_dtype = (np.dtype(">f2"), np.dtype("f2", 1.0), ("f2", "Infinity")) + scalar_v2_params = ( + (Float16(), 1.0), + (Float16(), -1.0), + (Float16(), "NaN"), + (Float16(), "Infinity"), + ) scalar_v3_params = ( - ("float16", 1.0), - ("float16", -1.0), - ("float16", "NaN"), - ("float16", "Infinity"), + (Float16(), 1.0), + (Float16(), -1.0), + (Float16(), "NaN"), + (Float16(), "Infinity"), + ) + cast_value_params = ( + (Float16(), 1.0, np.float16(1.0)), + (Float16(), -1.0, np.float16(-1.0)), + (Float16(), "NaN", np.float16("NaN")), ) -class TestFloat32(_TestZDType): +class TestFloat32(_BaseTestFloat): test_cls = Float32 + scalar_type = np.float32 valid_dtype = (np.dtype(">f4"), np.dtype("f4", 1.0), ("f4", "Infinity")) + scalar_v2_params = ( + (Float32(), 1.0), + (Float32(), -1.0), + (Float32(), "NaN"), + (Float32(), "Infinity"), + ) scalar_v3_params = ( - ("float32", 1.0), - ("float32", -1.0), - ("float32", "NaN"), - ("float32", "Infinity"), + (Float32(), 1.0), + (Float32(), -1.0), + (Float32(), "NaN"), + (Float32(), "Infinity"), + ) + + cast_value_params = ( + (Float32(), 1.0, np.float32(1.0)), + (Float32(), -1.0, np.float32(-1.0)), + (Float32(), "NaN", np.float32("NaN")), ) -class TestFloat64(_TestZDType): +class TestFloat64(_BaseTestFloat): test_cls = Float64 valid_dtype = (np.dtype(">f8"), np.dtype("f8", 1.0), ("f8", "Infinity")) + scalar_v2_params = ( + (Float64(), 1.0), + (Float64(), -1.0), + (Float64(), "NaN"), + (Float64(), "Infinity"), + ) scalar_v3_params = ( - ("float64", 1.0), - ("float64", -1.0), - ("float64", "NaN"), - ("float64", "Infinity"), + (Float64(), 1.0), + 
(Float64(), -1.0), + (Float64(), "NaN"), + (Float64(), "Infinity"), + ) + + cast_value_params = ( + (Float64(), 1.0, np.float64(1.0)), + (Float64(), -1.0, np.float64(-1.0)), + (Float64(), "NaN", np.float64("NaN")), ) diff --git a/tests/test_dtype/test_npy/test_int.py b/tests/test_dtype/test_npy/test_int.py index 99f698fc8e..637b594e1b 100644 --- a/tests/test_dtype/test_npy/test_int.py +++ b/tests/test_dtype/test_npy/test_int.py @@ -8,6 +8,7 @@ class TestInt8(_TestZDType): test_cls = Int8 + scalar_type = np.int8 valid_dtype = (np.dtype(np.int8),) invalid_dtype = ( np.dtype(np.int16), @@ -27,12 +28,17 @@ class TestInt8(_TestZDType): {"name": "int8", "configuration": {"endianness": "little"}}, ) - scalar_v2_params = (("|i1", 1), ("|i1", -1)) - scalar_v3_params = (("int8", 1), ("int8", -1)) + scalar_v2_params = ((Int8(), 1), (Int8(), -1)) + scalar_v3_params = ((Int8(), 1), (Int8(), -1)) + cast_value_params = ( + (Int8(), 1, np.int8(1)), + (Int8(), -1, np.int8(-1)), + ) class TestInt16(_TestZDType): test_cls = Int16 + scalar_type = np.int16 valid_dtype = (np.dtype(">i2"), np.dtype("i2", -1)) - scalar_v3_params = (("int16", 1), ("int16", -1)) + scalar_v2_params = ((Int16(), 1), (Int16(), -1)) + scalar_v3_params = ((Int16(), 1), (Int16(), -1)) + cast_value_params = ( + (Int16(), 1, np.int16(1)), + (Int16(), -1, np.int16(-1)), + ) class TestInt32(_TestZDType): test_cls = Int32 + scalar_type = np.int32 valid_dtype = (np.dtype(">i4"), np.dtype("i4", -1)) - scalar_v3_params = (("int32", 1), ("int32", -1)) + scalar_v2_params = ((Int32(), 1), (Int32(), -1)) + scalar_v3_params = ((Int32(), 1), (Int32(), -1)) + cast_value_params = ( + (Int32(), 1, np.int32(1)), + (Int32(), -1, np.int32(-1)), + ) class TestInt64(_TestZDType): test_cls = Int64 + scalar_type = np.int64 valid_dtype = (np.dtype(">i8"), np.dtype("i8", -1)) - scalar_v3_params = (("int64", 1), ("int64", -1)) + scalar_v2_params = ((Int64(), 1), (Int64(), -1)) + scalar_v3_params = ((Int64(), 1), (Int64(), -1)) + 
cast_value_params = ( + (Int64(), 1, np.int64(1)), + (Int64(), -1, np.int64(-1)), + ) class TestUInt8(_TestZDType): test_cls = UInt8 + scalar_type = np.uint8 valid_dtype = (np.dtype(np.uint8),) invalid_dtype = ( np.dtype(np.int8), @@ -127,12 +148,17 @@ class TestUInt8(_TestZDType): {"name": "uint8", "configuration": {"endianness": "little"}}, ) - scalar_v2_params = (("|u1", 1), ("|u1", 0)) - scalar_v3_params = (("uint8", 1), ("uint8", 0)) + scalar_v2_params = ((UInt8(), 1), (UInt8(), 0)) + scalar_v3_params = ((UInt8(), 1), (UInt8(), 0)) + cast_value_params = ( + (UInt8(), 1, np.uint8(1)), + (UInt8(), 0, np.uint8(0)), + ) class TestUInt16(_TestZDType): test_cls = UInt16 + scalar_type = np.uint16 valid_dtype = (np.dtype(">u2"), np.dtype("u2", 0)) - scalar_v3_params = (("uint16", 1), ("uint16", 0)) + scalar_v2_params = ((UInt16(), 1), (UInt16(), 0)) + scalar_v3_params = ((UInt16(), 1), (UInt16(), 0)) + cast_value_params = ( + (UInt16(), 1, np.uint16(1)), + (UInt16(), 0, np.uint16(0)), + ) class TestUInt32(_TestZDType): test_cls = UInt32 + scalar_type = np.uint32 valid_dtype = (np.dtype(">u4"), np.dtype("u4", 0)) - scalar_v3_params = (("uint32", 1), ("uint32", 0)) + scalar_v2_params = ((UInt32(), 1), (UInt32(), 0)) + scalar_v3_params = ((UInt32(), 1), (UInt32(), 0)) + cast_value_params = ( + (UInt32(), 1, np.uint32(1)), + (UInt32(), 0, np.uint32(0)), + ) class TestUInt64(_TestZDType): test_cls = UInt64 + scalar_type = np.uint64 valid_dtype = (np.dtype(">u8"), np.dtype("u8", 0)) - scalar_v3_params = (("uint64", 1), ("uint64", 0)) + scalar_v2_params = ((UInt64(), 1), (UInt64(), 0)) + scalar_v3_params = ((UInt64(), 1), (UInt64(), 0)) + cast_value_params = ( + (UInt64(), 1, np.uint64(1)), + (UInt64(), 0, np.uint64(0)), + ) diff --git a/tests/test_dtype/test_npy/test_sized.py b/tests/test_dtype/test_npy/test_sized.py index 17f4b2af2d..2ded5bbb7c 100644 --- a/tests/test_dtype/test_npy/test_sized.py +++ b/tests/test_dtype/test_npy/test_sized.py @@ -1,8 +1,12 @@ from 
__future__ import annotations +from typing import Any + import numpy as np from tests.test_dtype.test_wrapper import _TestZDType +from zarr.core.dtype.npy.float import Float16, Float64 +from zarr.core.dtype.npy.int import Int32, Int64 from zarr.core.dtype.npy.sized import ( FixedLengthAscii, FixedLengthBytes, @@ -31,11 +35,20 @@ class TestFixedLengthAscii(_TestZDType): {"name": "numpy.fixed_length_ascii", "configuration": {"length_bits": "invalid"}}, ) - scalar_v2_params = (("|S0", ""), ("|S2", "YWI="), ("|S4", "YWJjZA==")) + scalar_v2_params = ( + (FixedLengthAscii(length=0), ""), + (FixedLengthAscii(length=2), "YWI="), + (FixedLengthAscii(length=4), "YWJjZA=="), + ) scalar_v3_params = ( - ({"name": "numpy.fixed_length_ascii", "configuration": {"length_bytes": 0}}, ""), - ({"name": "numpy.fixed_length_ascii", "configuration": {"length_bytes": 16}}, "YWI="), - ({"name": "numpy.fixed_length_ascii", "configuration": {"length_bytes": 32}}, "YWJjZA=="), + (FixedLengthAscii(length=0), ""), + (FixedLengthAscii(length=2), "YWI="), + (FixedLengthAscii(length=4), "YWJjZA=="), + ) + cast_value_params = ( + (FixedLengthAscii(length=0), "", np.bytes_("")), + (FixedLengthAscii(length=2), "ab", np.bytes_("ab")), + (FixedLengthAscii(length=4), "abcd", np.bytes_("abcd")), ) @@ -63,11 +76,20 @@ class TestFixedLengthBytes(_TestZDType): {"name": "r-80"}, ) - scalar_v2_params = (("|V0", ""), ("|V2", "YWI="), ("|V4", "YWJjZA==")) + scalar_v2_params = ( + (FixedLengthBytes(length=0), ""), + (FixedLengthBytes(length=2), "YWI="), + (FixedLengthBytes(length=4), "YWJjZA=="), + ) scalar_v3_params = ( - ({"name": "numpy.fixed_length_bytes", "configuration": {"length_bytes": 2}}, ""), - ({"name": "numpy.fixed_length_bytes", "configuration": {"length_bytes": 2}}, "YWI="), - ({"name": "numpy.fixed_length_bytes", "configuration": {"length_bytes": 4}}, "YWJjZA=="), + (FixedLengthBytes(length=0), ""), + (FixedLengthBytes(length=2), "YWI="), + (FixedLengthBytes(length=4), "YWJjZA=="), + ) + 
cast_value_params = ( + (FixedLengthBytes(length=0), b"", np.void(b"")), + (FixedLengthBytes(length=2), b"ab", np.void(b"ab")), + (FixedLengthBytes(length=4), b"abcd", np.void(b"abcd")), ) @@ -91,11 +113,17 @@ class TestFixedLengthUnicode(_TestZDType): {"name": "numpy.fixed_length_ucs4", "configuration": {"length_bits": "invalid"}}, ) - scalar_v2_params = ((">U0", ""), (" bool: + if hasattr(scalar1, "shape") and hasattr(scalar2, "shape"): + return np.array_equal(scalar1, scalar2) + return super().scalar_equals(scalar1, scalar2) diff --git a/tests/test_dtype/test_npy/test_string.py b/tests/test_dtype/test_npy/test_string.py index fbb0aaa86d..c87f538be5 100644 --- a/tests/test_dtype/test_npy/test_string.py +++ b/tests/test_dtype/test_npy/test_string.py @@ -8,8 +8,8 @@ if _NUMPY_SUPPORTS_VLEN_STRING: class TestVariableLengthString(_TestZDType): - test_cls = VariableLengthString - valid_dtype = (np.dtypes.StringDType(),) + test_cls = VariableLengthString # type: ignore[assignment] + valid_dtype = (np.dtypes.StringDType(),) # type: ignore[assignment] invalid_dtype = ( np.dtype(np.int8), np.dtype(np.float64), @@ -27,10 +27,21 @@ class TestVariableLengthString(_TestZDType): {"name": "invalid_name"}, ) + scalar_v2_params = ((VariableLengthString(), ""), (VariableLengthString(), "hi")) + scalar_v3_params = ( + (VariableLengthString(), ""), + (VariableLengthString(), "hi"), + ) + + cast_value_params = ( + (VariableLengthString(), "", np.str_("")), + (VariableLengthString(), "hi", np.str_("hi")), + ) + else: - class TestVariableLengthString(_TestZDType): - test_cls = VariableLengthString + class TestVariableLengthString(_TestZDType): # type: ignore[no-redef] + test_cls = VariableLengthString # type: ignore[assignment] valid_dtype = (np.dtype("O"),) invalid_dtype = ( np.dtype(np.int8), @@ -48,3 +59,14 @@ class TestVariableLengthString(_TestZDType): {"name": "numpy.variable_length_utf8", "configuration": {"invalid_key": "value"}}, {"name": "invalid_name"}, ) + + 
scalar_v2_params = ((VariableLengthString(), ""), (VariableLengthString(), "hi")) + scalar_v3_params = ( + (VariableLengthString(), ""), + (VariableLengthString(), "hi"), + ) + + cast_value_params = ( + (VariableLengthString(), "", np.str_("")), + (VariableLengthString(), "hi", np.str_("hi")), + ) diff --git a/tests/test_dtype/test_npy/test_time.py b/tests/test_dtype/test_npy/test_time.py index 2a8ff6ac98..f8f8b5ae47 100644 --- a/tests/test_dtype/test_npy/test_time.py +++ b/tests/test_dtype/test_npy/test_time.py @@ -1,15 +1,32 @@ from __future__ import annotations import re +from typing import get_args import numpy as np import pytest from tests.test_dtype.test_wrapper import _TestZDType -from zarr.core.dtype.npy.time import DateTime64, TimeDelta64 +from zarr.core.dtype.npy.common import DateTimeUnit +from zarr.core.dtype.npy.time import DateTime64, TimeDelta64, datetime_from_int -class TestDateTime64(_TestZDType): +class _TestTimeBase(_TestZDType): + def json_scalar_equals(self, scalar1: object, scalar2: object) -> bool: + # This method gets overridden here to support the equivalency between NaT and + # -9223372036854775808 fill values + nat_scalars = (-9223372036854775808, "NaT") + if scalar1 in nat_scalars and scalar2 in nat_scalars: + return True + return scalar1 == scalar2 + + def scalar_equals(self, scalar1: object, scalar2: object) -> bool: + if np.isnan(scalar1) and np.isnan(scalar2): # type: ignore[call-overload] + return True + return super().scalar_equals(scalar1, scalar2) + + +class TestDateTime64(_TestTimeBase): test_cls = DateTime64 valid_dtype = (np.dtype("datetime64[10ns]"), np.dtype("datetime64[us]"), np.dtype("datetime64")) invalid_dtype = ( @@ -32,8 +49,23 @@ class TestDateTime64(_TestZDType): {"name": "datetime64", "configuration": {"unit": 123}}, ) + scalar_v2_params = ( + (DateTime64(unit="ns", scale_factor=1), 1), + (DateTime64(unit="ns", scale_factor=1), "NaT"), + ) + scalar_v3_params = ( + (DateTime64(unit="ns", scale_factor=1), 1), + 
(DateTime64(unit="ns", scale_factor=1), "NaT"), + ) -class TestTimeDelta64(_TestZDType): + cast_value_params = ( + (DateTime64(unit="Y", scale_factor=1), "1", np.datetime64("1", "Y")), + (DateTime64(unit="s", scale_factor=1), "2005-02-25", np.datetime64("2005-02-25", "s")), + (DateTime64(unit="ns", scale_factor=1), "NaT", np.datetime64("NaT")), + ) + + +class TestTimeDelta64(_TestTimeBase): test_cls = TimeDelta64 valid_dtype = (np.dtype("timedelta64[ns]"), np.dtype("timedelta64[us]")) invalid_dtype = ( @@ -57,6 +89,20 @@ class TestTimeDelta64(_TestZDType): {"name": "timedelta64", "configuration": {"unit": 123}}, ) + scalar_v2_params = ( + (TimeDelta64(unit="ns", scale_factor=1), 1), + (TimeDelta64(unit="ns", scale_factor=1), "NaT"), + ) + scalar_v3_params = ( + (TimeDelta64(unit="ns", scale_factor=1), 1), + (TimeDelta64(unit="ns", scale_factor=1), "NaT"), + ) + + cast_value_params = ( + (TimeDelta64(unit="ns", scale_factor=1), "1", np.timedelta64(1, "ns")), + (TimeDelta64(unit="ns", scale_factor=1), "NaT", np.timedelta64("NaT")), + ) + def test_time_invalid_unit() -> None: """ @@ -92,3 +138,14 @@ def test_time_scale_factor_too_high() -> None: DateTime64(scale_factor=scale_factor) with pytest.raises(ValueError, match=msg): TimeDelta64(scale_factor=scale_factor) + + +@pytest.mark.parametrize("unit", get_args(DateTimeUnit)) +@pytest.mark.parametrize("scale_factor", [1, 10]) +@pytest.mark.parametrize("value", [0, 1, 10]) +def test_datetime_from_int(unit: DateTimeUnit, scale_factor: int, value: int) -> None: + """ + Test datetime_from_int. 
+ """ + expected = np.int64(value).view(f"datetime64[{scale_factor}{unit}]") + assert datetime_from_int(value, unit=unit, scale_factor=scale_factor) == expected diff --git a/tests/test_dtype/test_wrapper.py b/tests/test_dtype/test_wrapper.py index defd3fffc5..ddf43524e0 100644 --- a/tests/test_dtype/test_wrapper.py +++ b/tests/test_dtype/test_wrapper.py @@ -5,10 +5,8 @@ if TYPE_CHECKING: from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType -import pytest -import requests - +""" class _TestZDTypeSchema: # subclasses define the URL for the schema, if available schema_url: ClassVar[str] = "" @@ -21,11 +19,12 @@ def get_schema(self) -> object: def test_schema(self, schema: json_schema.Schema) -> None: assert schema.is_valid(self.test_cls.to_json(zarr_format=2)) +""" class _TestZDType: test_cls: type[ZDType[TBaseDType, TBaseScalar]] - + scalar_type: ClassVar[type[TBaseScalar]] valid_dtype: ClassVar[tuple[TBaseDType, ...]] = () invalid_dtype: ClassVar[tuple[TBaseDType, ...]] = () @@ -42,6 +41,18 @@ class _TestZDType: scalar_v2_params: ClassVar[tuple[tuple[Any, Any], ...]] = () scalar_v3_params: ClassVar[tuple[tuple[Any, Any], ...]] = () + cast_value_params: ClassVar[tuple[tuple[Any, Any, Any], ...]] + + def json_scalar_equals(self, scalar1: object, scalar2: object) -> bool: + # An equality check for json-encoded scalars. This defaults to regular equality, + # but some classes may need to override this for special cases + return scalar1 == scalar2 + + def scalar_equals(self, scalar1: object, scalar2: object) -> bool: + # An equality check for scalars. 
This defaults to regular equality, + # but some classes may need to override this for special cases + return scalar1 == scalar2 + def test_check_dtype_valid(self, valid_dtype: object) -> None: assert self.test_cls.check_dtype(valid_dtype) # type: ignore[arg-type] @@ -60,14 +71,17 @@ def test_from_json_roundtrip_v3(self, valid_json_v3: Any) -> None: zdtype = self.test_cls.from_json(valid_json_v3, zarr_format=3) assert zdtype.to_json(zarr_format=3) == valid_json_v3 - def test_scalar_roundtrip_v2(self, scalar_v2_params: Any) -> None: - dtype_json, scalar_json = scalar_v2_params - zdtype = self.test_cls.from_json(dtype_json, zarr_format=2) + def test_scalar_roundtrip_v2(self, scalar_v2_params: tuple[Any, Any]) -> None: + zdtype, scalar_json = scalar_v2_params scalar = zdtype.from_json_value(scalar_json, zarr_format=2) - assert scalar_json == zdtype.to_json_value(scalar, zarr_format=2) + assert self.json_scalar_equals(scalar_json, zdtype.to_json_value(scalar, zarr_format=2)) - def test_scalar_roundtrip_v3(self, scalar_v3_params: Any) -> None: - dtype_json, scalar_json = scalar_v3_params - zdtype = self.test_cls.from_json(dtype_json, zarr_format=3) + def test_scalar_roundtrip_v3(self, scalar_v3_params: tuple[Any, Any]) -> None: + zdtype, scalar_json = scalar_v3_params scalar = zdtype.from_json_value(scalar_json, zarr_format=3) - assert scalar_json == zdtype.to_json_value(scalar, zarr_format=3) + assert self.json_scalar_equals(scalar_json, zdtype.to_json_value(scalar, zarr_format=3)) + + def test_cast_value(self, cast_value_params: tuple[Any, Any, Any]) -> None: + zdtype, value, expected = cast_value_params + observed = zdtype.cast_value(value) + assert self.scalar_equals(expected, observed) diff --git a/tests/test_properties.py b/tests/test_properties.py index 15dd701582..68427dd8fe 100644 --- a/tests/test_properties.py +++ b/tests/test_properties.py @@ -75,6 +75,7 @@ def deep_equal(a: Any, b: Any) -> bool: return a == b +@settings(deadline=300) @given(data=st.data(), 
zarr_format=zarr_formats) def test_array_roundtrip(data: st.DataObject, zarr_format: int) -> None: nparray = data.draw(numpy_arrays(zarr_formats=st.just(zarr_format))) From 4ab1c58722297a526ab79c5936c5726b97d351fa Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 13 May 2025 13:11:53 +0200 Subject: [PATCH 080/130] use relative link for changes --- changes/2874.feature.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changes/2874.feature.rst b/changes/2874.feature.rst index 26eda3a257..d0adcd6533 100644 --- a/changes/2874.feature.rst +++ b/changes/2874.feature.rst @@ -1,2 +1,2 @@ Adds zarr-specific data type classes. This replaces the direct use of numpy data types for zarr -v2 and a fixed set of string enums for zarr v3. For more on this new feature, see the `documentation `_ \ No newline at end of file +v2 and a fixed set of string enums for zarr v3. For more on this new feature, see the `documentation documentation `_ \ No newline at end of file From e4c89f30b874e53b47f7d36454a8cd428c35f64f Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 13 May 2025 13:12:23 +0200 Subject: [PATCH 081/130] typo --- changes/2874.feature.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changes/2874.feature.rst b/changes/2874.feature.rst index d0adcd6533..50634e5395 100644 --- a/changes/2874.feature.rst +++ b/changes/2874.feature.rst @@ -1,2 +1,2 @@ Adds zarr-specific data type classes. This replaces the direct use of numpy data types for zarr -v2 and a fixed set of string enums for zarr v3. For more on this new feature, see the `documentation documentation `_ \ No newline at end of file +v2 and a fixed set of string enums for zarr v3. 
For more on this new feature, see the `documentation `_ \ No newline at end of file From e386c2bfa93b0583d2ac6a4247c43950489ccd64 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 13 May 2025 13:24:08 +0200 Subject: [PATCH 082/130] make bytes codec dtype logic a bit more literate --- src/zarr/codecs/bytes.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py index 6c28bfe543..a87df060e7 100644 --- a/src/zarr/codecs/bytes.py +++ b/src/zarr/codecs/bytes.py @@ -58,7 +58,10 @@ def to_dict(self) -> dict[str, JSON]: return {"name": "bytes", "configuration": {"endian": self.endian.value}} def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: - if array_spec.dtype.to_dtype().itemsize == 1: + # Note: this check is numpy-dtype-specific + # For single-byte (e.g., uint8) or 0-byte (e.g., S0) dtypes, + # endianness does not apply. + if array_spec.dtype.to_dtype().itemsize < 2: if self.endian is not None: return replace(self, endian=None) elif self.endian is None: @@ -77,7 +80,8 @@ async def _decode_single( endian_str = cast( "Endianness | None", self.endian.value if self.endian is not None else None ) - dtype = chunk_spec.dtype.to_dtype().newbyteorder(endianness_to_numpy_str(endian_str)) + new_byte_order = endianness_to_numpy_str(endian_str) + dtype = chunk_spec.dtype.to_dtype().newbyteorder(new_byte_order) as_array_like = chunk_bytes.as_array_like() if isinstance(as_array_like, NDArrayLike): From 703192cae67c9f064604403a20056dcfb30a8d1a Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 13 May 2025 13:25:05 +0200 Subject: [PATCH 083/130] increase deadline to 500ms --- tests/test_properties.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_properties.py b/tests/test_properties.py index 68427dd8fe..2809e9564b 100644 --- a/tests/test_properties.py +++ b/tests/test_properties.py @@ -75,7 +75,7 @@ def deep_equal(a: Any, b: Any) -> bool: return a 
== b -@settings(deadline=300) +@settings(deadline=500) @given(data=st.data(), zarr_format=zarr_formats) def test_array_roundtrip(data: st.DataObject, zarr_format: int) -> None: nparray = data.draw(numpy_arrays(zarr_formats=st.just(zarr_format))) From 0fab5e514f1840fb29819e0fe20a89de2cde7af1 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 13 May 2025 13:40:20 +0200 Subject: [PATCH 084/130] fewer commented sections of problematic lru_store_cache section of the sharding codecs --- src/zarr/codecs/sharding.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 779cbc6f88..5c08815979 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -357,10 +357,13 @@ def __init__( object.__setattr__(self, "index_location", index_location_parsed) # Use instance-local lru_cache to avoid memory leaks - # TODO: fix these when we don't get hashability errors for certain numpy dtypes + + # numpy void scalars are not hashable, which means an array spec with a fill value that is + # a numpy void scalar will break the lru_cache. 
This is commented for now but should be + # fixed # object.__setattr__(self, "_get_chunk_spec", lru_cache()(self._get_chunk_spec)) - # object.__setattr__(self, "_get_index_chunk_spec", lru_cache()(self._get_index_chunk_spec)) - # object.__setattr__(self, "_get_chunks_per_shard", lru_cache()(self._get_chunks_per_shard)) + object.__setattr__(self, "_get_index_chunk_spec", lru_cache()(self._get_index_chunk_spec)) + object.__setattr__(self, "_get_chunks_per_shard", lru_cache()(self._get_chunks_per_shard)) # todo: typedict return type def __getstate__(self) -> dict[str, Any]: @@ -374,7 +377,7 @@ def __setstate__(self, state: dict[str, Any]) -> None: object.__setattr__(self, "index_location", parse_index_location(config["index_location"])) # Use instance-local lru_cache to avoid memory leaks - object.__setattr__(self, "_get_chunk_spec", lru_cache()(self._get_chunk_spec)) + # object.__setattr__(self, "_get_chunk_spec", lru_cache()(self._get_chunk_spec)) object.__setattr__(self, "_get_index_chunk_spec", lru_cache()(self._get_index_chunk_spec)) object.__setattr__(self, "_get_chunks_per_shard", lru_cache()(self._get_chunks_per_shard)) From 2f945bf25186a6fa8401348565025d3e374023a2 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 13 May 2025 13:56:49 +0200 Subject: [PATCH 085/130] add link to gh issue about lru_cache for sharding codec --- src/zarr/codecs/sharding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 5c08815979..15036e88d2 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -360,7 +360,7 @@ def __init__( # numpy void scalars are not hashable, which means an array spec with a fill value that is # a numpy void scalar will break the lru_cache. This is commented for now but should be - # fixed + # fixed. 
See https://github.com/zarr-developers/zarr-python/issues/3054 # object.__setattr__(self, "_get_chunk_spec", lru_cache()(self._get_chunk_spec)) object.__setattr__(self, "_get_index_chunk_spec", lru_cache()(self._get_index_chunk_spec)) object.__setattr__(self, "_get_chunks_per_shard", lru_cache()(self._get_chunks_per_shard)) From 63a6af4392c47802ae8ec5f026cae5a1503c3ddd Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 13 May 2025 14:28:22 +0200 Subject: [PATCH 086/130] attempt to speed up hypothesis tests by reducing max array size --- src/zarr/testing/strategies.py | 4 +++- tests/test_properties.py | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/zarr/testing/strategies.py b/src/zarr/testing/strategies.py index 6c3abfca85..4e5c9536fc 100644 --- a/src/zarr/testing/strategies.py +++ b/src/zarr/testing/strategies.py @@ -120,7 +120,9 @@ def clear_store(x: Store) -> Store: compressors = st.sampled_from([None, "default"]) zarr_formats: st.SearchStrategy[ZarrFormat] = st.sampled_from([3, 2]) # We de-prioritize arrays having dim sizes 0, 1, 2 -array_shapes = npst.array_shapes(max_dims=4, min_side=3) | npst.array_shapes(max_dims=4, min_side=0) +array_shapes = npst.array_shapes(max_dims=4, min_side=3, max_side=5) | npst.array_shapes( + max_dims=4, min_side=0 +) @st.composite diff --git a/tests/test_properties.py b/tests/test_properties.py index 2809e9564b..15dd701582 100644 --- a/tests/test_properties.py +++ b/tests/test_properties.py @@ -75,7 +75,6 @@ def deep_equal(a: Any, b: Any) -> bool: return a == b -@settings(deadline=500) @given(data=st.data(), zarr_format=zarr_formats) def test_array_roundtrip(data: st.DataObject, zarr_format: int) -> None: nparray = data.draw(numpy_arrays(zarr_formats=st.just(zarr_format))) From 56e7c84abd57203146c9d6f8dc9b62e1bf80dac1 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 13 May 2025 15:57:30 +0200 Subject: [PATCH 087/130] clean up docs --- docs/user-guide/data_types.rst | 116 
+++++++++++++++++++++++---------- 1 file changed, 80 insertions(+), 36 deletions(-) diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index a281b349de..81a09a6485 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -5,8 +5,10 @@ Zarr's data type model ---------------------- Every Zarr array has a "data type", which defines the meaning and physical layout of the -array's elements. Zarr is heavily influenced by `NumPy `_, and -Zarr-Python supports creating arrays with Numpy data types:: +array's elements. As Zarr Python is tightly integrated with `NumPy `_, +it's easy to create arrays with NumPy data types: + +.. code-block:: python >>> import zarr >>> import numpy as np @@ -14,58 +16,103 @@ Zarr-Python supports creating arrays with Numpy data types:: >>> z -Unlike Numpy arrays, Zarr arrays are designed to be persisted to storage and read by Zarr implementations in different programming languages. -This means Zarr data types must be interpreted correctly when clients read an array. So each Zarr data type defines a procedure for -encoding/decoding that data type to/from Zarr array metadata, and also encoding/decoding **instances** of that data type to/from -array metadata. These serialization procedures depend on the Zarr format. +Unlike NumPy arrays, Zarr arrays are designed to accessed by Zarr +implementations in different programming languages. This means Zarr data types must be interpreted +correctly when clients read an array. Each Zarr data type defines procedures for +encoding and decoding both the data type itself, and scalars from that data type to and from Zarr array metadata. And these serialization procedures +depend on the Zarr format. Data types in Zarr version 2 ----------------------------- -Version 2 of the Zarr format defined its data types relative to `Numpy's data types `_, and added a few non-Numpy data types as well. 
-Thus the JSON identifier for a Numpy-compatible data type is just the Numpy ``str`` attribute of that dtype:: +Version 2 of the Zarr format defined its data types relative to +`NumPy's data types `_, +and added a few non-NumPy data types as well. Thus the JSON identifier for a NumPy-compatible data +type is just the NumPy ``str`` attribute of that data type: + +.. code-block:: python - >>> import zarr - >>> import numpy as np - >>> import json - >>> store = {} - >>> np_dtype = np.dtype('int64') - >>> z = zarr.create_array(store=store, shape=(1,), dtype=np_dtype, zarr_format=2) - >>> dtype_meta = json.loads(store['.zarray'].to_bytes())["dtype"] - >>> assert dtype_meta == np_dtype.str # True - >>> dtype_meta - '>> import zarr + >>> import numpy as np + >>> import json + >>> + >>> store = {} + >>> np_dtype = np.dtype('int64') + >>> z = zarr.create_array(store=store, shape=(1,), dtype=np_dtype, zarr_format=2) + >>> dtype_meta = json.loads(store['.zarray'].to_bytes())["dtype"] + >>> dtype_meta + '>> assert dtype_meta == np_dtype.str .. note:: - The ``<`` character in the data type metadata encodes the `endianness `_, or "byte order", of the data type. Following Numpy's example, - in Zarr version 2 each data type has an endianness where applicable. However, Zarr version 3 data types do not store endianness information. + The ``<`` character in the data type metadata encodes the + `endianness `_, + or "byte order", of the data type. Following NumPy's example, + in Zarr version 2 each data type has an endianness where applicable. + However, Zarr version 3 data types do not store endianness information. + +In addition to defining a representation of the data type itself (which in the example above was +just a simple string ``"i2`` **or** ``M[10s]"`` in + Zarr V2. This is more compact, but can be harder to parse. + +For more about data types in Zarr V3, see the +`V3 specification `_. 
+ +Data types in Zarr Python ------------------------- -The two Zarr formats that Zarr-Python supports specify data types in two different ways: -data types in Zarr version 2 are encoded as Numpy-compatible strings, while data types in Zarr version +The two Zarr formats that Zarr Python supports specify data types in two different ways: +data types in Zarr version 2 are encoded as NumPy-compatible strings, while data types in Zarr version 3 are encoded as either strings or ``JSON`` objects, and the Zarr V3 data types don't have any associated endianness information, unlike Zarr V2 data types. -To abstract over these syntactical and semantic differences, Zarr-Python uses a class called `ZDType <../api/zarr/dtype/index.html#zarr.dtype.ZDType>`_ to wrap native data types (e.g., Numpy data types) and provide Zarr V2 and Zarr V3 compatibility routines. -Each data type supported by Zarr-Python is modeled by a subclass of ``ZDType``, which provides an API for the following operations: +To abstract over these syntactical and semantic differences, Zarr Python uses a class called +`ZDType <../api/zarr/dtype/index.html#zarr.dtype.ZDType>`_ provide Zarr V2 and Zarr V3 compatibility +routines for ""native" data types. In this context, a "native" data type is a Python class, +typically defined in another library, that models an array's data type. For example, ``np.uint8`` is a native +data type defined in NumPy, which Zarr Python wraps with a ``ZDType`` instance called +`UInt8 <../api/zarr/dtype/index.html#zarr.dtype.ZDType>`_. + +Each data type supported by Zarr Python is modeled by ``ZDType`` subclass, which provides an +API for the following operations: - Wrapping / unwrapping a native data type - Encoding / decoding a data type to / from Zarr V2 and Zarr V3 array metadata. @@ -104,7 +151,4 @@ Example Usage Custom Data Types ~~~~~~~~~~~~~~~~~ -Users can define custom data types by subclassing `ZDType` and implementing the required methods. 
-Once defined, the custom data type can be registered with Zarr-Python to enable seamless integration with the library. - \ No newline at end of file From eee0d7bc8c7b8444a3bcd9e9a545c4f89db5dcb2 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 13 May 2025 16:01:52 +0200 Subject: [PATCH 088/130] remove placeholder --- docs/user-guide/data_types.rst | 5 ----- 1 file changed, 5 deletions(-) diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index 81a09a6485..ff43dd8d19 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -147,8 +147,3 @@ Example Usage # Deserialize a scalar value scalar_value = int8.from_json_value(42, zarr_format=3) assert scalar_value == np.int8(42) - -Custom Data Types -~~~~~~~~~~~~~~~~~ - - \ No newline at end of file From 1dc8e722b80e4f6668ab9121b0370dde84fc5ba4 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 13 May 2025 16:36:07 +0200 Subject: [PATCH 089/130] make final example section doctested and more readable --- docs/user-guide/data_types.rst | 58 +++++++++++++++++++++++----------- 1 file changed, 40 insertions(+), 18 deletions(-) diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index ff43dd8d19..777a69816e 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -9,7 +9,6 @@ array's elements. As Zarr Python is tightly integrated with `NumPy >> import zarr >>> import numpy as np >>> z = zarr.create_array(store={}, shape=(10,), dtype=np.dtype('uint8')) @@ -122,28 +121,51 @@ API for the following operations: Example Usage ~~~~~~~~~~~~~ +Create a ``ZDType`` from a native data type: + +.. code-block:: python + + >>> from zarr.core.dtype import Int8 + >>> import numpy as np + >>> int8 = Int8.from_dtype(np.dtype('int8')) + +Convert back to native data type: + +.. 
code-block:: python + + >>> native_dtype = int8.to_dtype() + >>> assert native_dtype == np.dtype('int8') + +Get the default scalar value for the data type: + .. code-block:: python - from zarr.core.dtype.wrapper import Int8 + >>> default_value = int8.default_value() + >>> assert default_value == np.int8(0) - # Create a ZDType instance from a native dtype - int8 = Int8.from_dtype(np.dtype('int8')) - # Convert back to native dtype - native_dtype = int8.to_dtype() - assert native_dtype == np.dtype('int8') +Serialize to JSON for Zarr V2 and V3 - # Get the default value - default_value = int8.default_value() - assert default_value == np.int8(0) +.. code-block:: python - # Serialize to JSON - json_representation = int8.to_json(zarr_format=3) + >>> json_v2 = int8.to_json(zarr_format=2) + >>> json_v2 + '|i1' + >>> json_v3 = int8.to_json(zarr_format=3) + >>> json_v3 + 'int8' - # Serialize a scalar value - json_value = int8.to_json_value(42, zarr_format=3) - assert json_value == 42 +Serialize a scalar value to JSON: + +.. code-block:: python + + >>> json_value = int8.to_json_value(42, zarr_format=3) + >>> json_value + 42 + +Deserialize a scalar value from JSON: + +.. 
code-block:: python - # Deserialize a scalar value - scalar_value = int8.from_json_value(42, zarr_format=3) - assert scalar_value == np.int8(42) + >>> scalar_value = int8.from_json_value(42, zarr_format=3) + >>> assert scalar_value == np.int8(42) From 13ca2304b1d9e3cf576bd53c367f03408d9f5653 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 13 May 2025 16:40:57 +0200 Subject: [PATCH 090/130] revert change to auto chunking --- docs/user-guide/performance.rst | 2 +- src/zarr/core/chunk_grids.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/user-guide/performance.rst b/docs/user-guide/performance.rst index 40882fbf1f..aa380735d5 100644 --- a/docs/user-guide/performance.rst +++ b/docs/user-guide/performance.rst @@ -52,7 +52,7 @@ a chunk shape is based on simple heuristics and may be far from optimal. E.g.:: >>> z4 = zarr.create_array(store={}, shape=(10000, 10000), chunks='auto', dtype='int32') >>> z4.chunks - (313, 625) + (625, 625) If you know you are always going to be loading the entire array into memory, you can turn off chunks by providing ``chunks`` equal to ``shape``, in which case there diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index 74bf9b6ba8..6701aca182 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -63,7 +63,7 @@ def _guess_chunks( """ if isinstance(shape, int): shape = (shape,) - typesize = max(typesize, 8) + typesize = max(typesize, 1) ndims = len(shape) # require chunks to have non-zero length for all dimensions chunks = np.maximum(np.array(shape, dtype="=f8"), 1) From 2a42205ad3ae8eae3ecf4b7a76189c50335ae9a7 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 13 May 2025 16:46:21 +0200 Subject: [PATCH 091/130] revert quotation of literal type --- src/zarr/core/array_spec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zarr/core/array_spec.py b/src/zarr/core/array_spec.py index 279bf6edf0..5d4321da82 100644 --- 
a/src/zarr/core/array_spec.py +++ b/src/zarr/core/array_spec.py @@ -63,7 +63,7 @@ def from_dict(cls, data: ArrayConfigParams) -> Self: """ kwargs_out: ArrayConfigParams = {} for f in fields(ArrayConfig): - field_name = cast("Literal['order', 'write_empty_chunks']", f.name) + field_name = cast(Literal["order", "write_empty_chunks"], f.name) if field_name not in data: kwargs_out[field_name] = zarr_config.get(f"array.{field_name}") else: From 3f775c83665b24bbb7683393f381523c61a4e8cb Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 13 May 2025 18:17:55 +0200 Subject: [PATCH 092/130] lint --- src/zarr/core/dtype/__init__.py | 52 ++++++++++++++++++++++++--------- src/zarr/core/dtype/npy/time.py | 8 ++--- tests/test_config.py | 6 +++- tests/test_dtype/conftest.py | 2 +- tests/test_dtype/test_dtype.py | 0 tests/test_dtype_registry.py | 35 ++++++++++++++++++++-- 6 files changed, 81 insertions(+), 22 deletions(-) delete mode 100644 tests/test_dtype/test_dtype.py diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py index 1a18849a13..a8cdfc0cbc 100644 --- a/src/zarr/core/dtype/__init__.py +++ b/src/zarr/core/dtype/__init__.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import TYPE_CHECKING, TypeAlias, get_args +from typing import TYPE_CHECKING, TypeAlias from zarr.core.dtype.common import DataTypeValidationError from zarr.core.dtype.npy.bool import Bool @@ -30,8 +30,10 @@ from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType __all__ = [ + "Bool", "Complex64", "Complex128", + "DataTypeRegistry", "DataTypeValidationError", "DateTime64", "FixedLengthAscii", @@ -45,6 +47,8 @@ "Int32", "Int64", "Structured", + "TBaseDType", + "TBaseScalar", "TimeDelta64", "TimeDelta64", "UInt8", @@ -59,25 +63,47 @@ data_type_registry = DataTypeRegistry() -INTEGER_DTYPE = Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64 -FLOAT_DTYPE = Float16 | Float32 | Float64 -COMPLEX_DTYPE = Complex64 | Complex128 
-STRING_DTYPE = FixedLengthUnicode | VariableLengthString | FixedLengthAscii -DTYPE = ( +IntegerDType = Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64 +INTEGER_DTYPE = Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64 + +FloatDType = Float16 | Float32 | Float64 +FLOAT_DTYPE = Float16, Float32, Float64 + +ComplexFloatDType = Complex64 | Complex128 +COMPLEX_FLOAT_DTYPE = Complex64, Complex128 + +StringDType = FixedLengthUnicode | VariableLengthString | FixedLengthAscii +STRING_DTYPE = FixedLengthUnicode, VariableLengthString, FixedLengthAscii + +TimeDType = DateTime64 | TimeDelta64 +TIME_DTYPE = DateTime64, TimeDelta64 + +AnyDType = ( Bool - | INTEGER_DTYPE - | FLOAT_DTYPE - | COMPLEX_DTYPE - | STRING_DTYPE + | IntegerDType + | FloatDType + | ComplexFloatDType + | StringDType | FixedLengthBytes | Structured - | DateTime64 - | TimeDelta64 + | TimeDType +) +# mypy has trouble inferring the type of variablelengthstring dtype, because its class definition +# depends on the installed numpy version. That's why the type: ignore statement is needed here. +ANY_DTYPE: tuple[type[ZDType[TBaseDType, TBaseScalar]], ...] = ( # type: ignore[assignment] + Bool, + *INTEGER_DTYPE, + *FLOAT_DTYPE, + *COMPLEX_FLOAT_DTYPE, + *STRING_DTYPE, + FixedLengthBytes, + Structured, + *TIME_DTYPE, ) ZDTypeLike: TypeAlias = npt.DTypeLike | ZDType[TBaseDType, TBaseScalar] | dict[str, JSON] -for dtype in get_args(DTYPE): +for dtype in ANY_DTYPE: data_type_registry.register(dtype._zarr_v3_name, dtype) diff --git a/src/zarr/core/dtype/npy/time.py b/src/zarr/core/dtype/npy/time.py index bbdd41d13f..ea44d76b56 100644 --- a/src/zarr/core/dtype/npy/time.py +++ b/src/zarr/core/dtype/npy/time.py @@ -173,7 +173,7 @@ class TimeDelta64(TimeDTypeBase[np.dtypes.TimeDelta64DType, np.timedelta64], Has unit for ``TimeDelta64`` is optional. 
""" - dtype_cls = np.dtypes.TimeDelta64DType + dtype_cls = np.dtypes.TimeDelta64DType # type: ignore[assignment] _zarr_v3_name = "numpy.timedelta64" _zarr_v2_names = (">m8", " np.timedelta64: def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.timedelta64: if check_json_int(data) or data == "NaT": - return self.to_dtype().type(data, f"{self.scale_factor}{self.unit}") + return self.to_dtype().type(data, f"{self.scale_factor}{self.unit}") # type: ignore[arg-type] raise TypeError(f"Invalid type: {data}. Expected an integer.") # pragma: no cover def _cast_value_unsafe(self, value: object) -> np.timedelta64: @@ -220,7 +220,7 @@ def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: @dataclass(frozen=True, kw_only=True, slots=True) class DateTime64(TimeDTypeBase[np.dtypes.DateTime64DType, np.datetime64], HasEndianness): - dtype_cls = np.dtypes.DateTime64DType + dtype_cls = np.dtypes.DateTime64DType # type: ignore[assignment] _zarr_v3_name = "numpy.datetime64" _zarr_v2_names = (">M8", " np.datetime64: def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.datetime64: if check_json_int(data) or data == "NaT": - return self.to_dtype().type(data, f"{self.scale_factor}{self.unit}") + return self.to_dtype().type(data, f"{self.scale_factor}{self.unit}") # type: ignore[arg-type] raise TypeError(f"Invalid type: {data}. 
Expected an integer.") # pragma: no cover def _cast_value_unsafe(self, value: object) -> np.datetime64: diff --git a/tests/test_config.py b/tests/test_config.py index f32b3e6840..58f88ec806 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,6 +1,6 @@ import os from collections.abc import Iterable -from typing import Any +from typing import TYPE_CHECKING, Any from unittest import mock from unittest.mock import Mock @@ -46,6 +46,9 @@ TestNDArrayLike, ) +if TYPE_CHECKING: + from zarr.core.dtype.wrapper import ZDType + def test_config_defaults_set() -> None: # regression test for available defaults @@ -307,6 +310,7 @@ async def test_default_codecs(dtype_category: str) -> None: """ Test that the default compressors are sensitive to the current setting of the config. """ + zdtype: ZDType[Any, Any] if dtype_category == "variable-length-string": zdtype = VariableLengthString() else: diff --git a/tests/test_dtype/conftest.py b/tests/test_dtype/conftest.py index 9c7825c0d1..bf58a17556 100644 --- a/tests/test_dtype/conftest.py +++ b/tests/test_dtype/conftest.py @@ -24,7 +24,7 @@ def pytest_generate_tests(metafunc: Any) -> None: """ - pytest hook to parametrize class-scoped fixtures. + This is a pytest hook to parametrize class-scoped fixtures. This hook allows us to define class-scoped fixtures as class attributes and then generate the parametrize calls for pytest. 
This allows the fixtures to be diff --git a/tests/test_dtype/test_dtype.py b/tests/test_dtype/test_dtype.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tests/test_dtype_registry.py b/tests/test_dtype_registry.py index 98380b86f7..aaca2f0862 100644 --- a/tests/test_dtype_registry.py +++ b/tests/test_dtype_registry.py @@ -11,16 +11,20 @@ import zarr from zarr.core.config import config from zarr.core.dtype import ( - DTYPE, + AnyDType, Bool, + DataTypeRegistry, + DateTime64, FixedLengthUnicode, + Int8, + Int16, TBaseDType, TBaseScalar, ZDType, data_type_registry, get_data_type_from_json, + parse_data_type, ) -from zarr.core.dtype.registry import DataTypeRegistry if TYPE_CHECKING: from collections.abc import Generator @@ -117,7 +121,7 @@ def test_match_dtype_unique( that excludes the data type class being tested, and ensure that an instance of the wrapped data type fails to match anything in the registry """ - for _cls in get_args(DTYPE): + for _cls in get_args(AnyDType): if _cls is not type(zdtype): data_type_registry_fixture.register(_cls._zarr_v3_name, _cls) @@ -156,3 +160,28 @@ def test_entrypoint_dtype(zarr_format: ZarrFormat) -> None: instance = TestDataType() dtype_json = instance.to_json(zarr_format=zarr_format) assert get_data_type_from_json(dtype_json, zarr_format=zarr_format) == instance + + +@pytest.mark.parametrize( + ("dtype_params", "expected", "zarr_format"), + [ + ("int8", Int8(), 3), + (Int8(), Int8(), 3), + (">i2", Int16(endianness="big"), 2), + ("datetime64[10s]", DateTime64(unit="s", scale_factor=10), 2), + ( + {"name": "numpy.datetime64", "configuration": {"unit": "s", "scale_factor": 10}}, + DateTime64(unit="s", scale_factor=10), + 3, + ), + ], +) +def test_parse_data_type( + dtype_params: Any, expected: ZDType[Any, Any], zarr_format: ZarrFormat +) -> None: + """ + Test that parse_data_type accepts alternative representations of ZDType instances, and resolves + those inputs to the expected ZDType instance. 
+ """ + observed = parse_data_type(dtype_params, zarr_format=zarr_format) + assert observed == expected From b525b8e53257982ae238172b2b3d82a755969fd1 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 13 May 2025 18:34:14 +0200 Subject: [PATCH 093/130] fix broken code block --- docs/user-guide/data_types.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index 777a69816e..a4d8314a5e 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -9,6 +9,7 @@ array's elements. As Zarr Python is tightly integrated with `NumPy >> import zarr >>> import numpy as np >>> z = zarr.create_array(store={}, shape=(10,), dtype=np.dtype('uint8')) From ec94878746679070f59094ca5ac293831ef5fd5c Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 13 May 2025 18:57:02 +0200 Subject: [PATCH 094/130] specialize test to handle stringdtype changes coming in numpy 2.3 --- tests/test_array.py | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/tests/test_array.py b/tests/test_array.py index dc81c7ea36..3108332201 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -47,6 +47,7 @@ from zarr.core.dtype.npy.sized import ( Structured, ) +from zarr.core.dtype.npy.string import VariableLengthString from zarr.core.dtype.npy.time import DateTime64, TimeDelta64 from zarr.core.dtype.wrapper import ZDType from zarr.core.group import AsyncGroup @@ -996,14 +997,26 @@ def test_dtype_forms(dtype: ZDType[Any, Any], store: Store, zarr_format: ZarrFor # Structured dtypes do not have a numpy string representation that uniquely identifies them if not isinstance(dtype, Structured): - c = zarr.create_array( - store, - name="c", - shape=(5,), - chunks=(5,), - dtype=dtype.to_dtype().str, - zarr_format=zarr_format, - ) + if isinstance(dtype, VariableLengthString): + # in numpy 2.3, StringDType().str becomes the string 'StringDType()' which numpy + # does 
not accept as a string representation of the dtype. + c = zarr.create_array( + store, + name="c", + shape=(5,), + chunks=(5,), + dtype=dtype.to_dtype().char, + zarr_format=zarr_format, + ) + else: + c = zarr.create_array( + store, + name="c", + shape=(5,), + chunks=(5,), + dtype=dtype.to_dtype().str, + zarr_format=zarr_format, + ) assert a.dtype == c.dtype @staticmethod From 3af98aa1c872a99e92d44de84a9fed7e504ef0cd Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 13 May 2025 21:56:57 +0200 Subject: [PATCH 095/130] add docstring to _TestZDType class --- tests/test_dtype/test_wrapper.py | 34 ++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/tests/test_dtype/test_wrapper.py b/tests/test_dtype/test_wrapper.py index ddf43524e0..608e272690 100644 --- a/tests/test_dtype/test_wrapper.py +++ b/tests/test_dtype/test_wrapper.py @@ -23,6 +23,40 @@ def test_schema(self, schema: json_schema.Schema) -> None: class _TestZDType: + """ + A base class for testing ZDType subclasses. This class works in conjunction with the custom + pytest collection function ``pytest_generate_tests`` defined in conftest.py, which applies the + following procedure when generating tests: + + At test generation time, for each test fixture referenced by a method on this class + pytest will look for an attribute with the same name as that fixture. Pytest will assume that + this class attribute is a tuple of values to be used for generating a parametrized test fixture. + + This means that child classes can, by using different values for these class attributes, have + customized test parametrization. + + Attributes + ---------- + test_cls : type[ZDType[TBaseDType, TBaseScalar]] + The ZDType subclass being tested. + scalar_type : ClassVar[type[TBaseScalar]] + The expected scalar type for the ZDType. + valid_dtype : ClassVar[tuple[TBaseDType, ...]] + A tuple of valid numpy dtypes for the ZDType. 
+ invalid_dtype : ClassVar[tuple[TBaseDType, ...]] + A tuple of invalid numpy dtypes for the ZDType. + valid_json_v2 : ClassVar[tuple[str | dict[str, object] | list[object], ...]] + A tuple of valid JSON representations for Zarr format version 2. + invalid_json_v2 : ClassVar[tuple[str | dict[str, object] | list[object], ...]] + A tuple of invalid JSON representations for Zarr format version 2. + valid_json_v3 : ClassVar[tuple[str | dict[str, object], ...]] + A tuple of valid JSON representations for Zarr format version 3. + invalid_json_v3 : ClassVar[tuple[str | dict[str, object], ...]] + A tuple of invalid JSON representations for Zarr format version 3. + cast_value_params : ClassVar[tuple[tuple[Any, Any, Any], ...]] + A tuple of (dtype, value, expected) tuples for testing ZDType.cast_value. + """ + test_cls: type[ZDType[TBaseDType, TBaseScalar]] scalar_type: ClassVar[type[TBaseScalar]] valid_dtype: ClassVar[tuple[TBaseDType, ...]] = () From d8c3672c4499148bb25c61a06ba20c0d7492a1d3 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 15 May 2025 12:52:58 +0200 Subject: [PATCH 096/130] type hints --- src/zarr/core/dtype/__init__.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py index a8cdfc0cbc..b973691f0f 100644 --- a/src/zarr/core/dtype/__init__.py +++ b/src/zarr/core/dtype/__init__.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import TYPE_CHECKING, TypeAlias +from typing import TYPE_CHECKING, Final, TypeAlias from zarr.core.dtype.common import DataTypeValidationError from zarr.core.dtype.npy.bool import Bool @@ -64,19 +64,19 @@ data_type_registry = DataTypeRegistry() IntegerDType = Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64 -INTEGER_DTYPE = Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64 +INTEGER_DTYPE: Final = Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64 FloatDType = Float16 | Float32 | 
Float64 -FLOAT_DTYPE = Float16, Float32, Float64 +FLOAT_DTYPE: Final = Float16, Float32, Float64 ComplexFloatDType = Complex64 | Complex128 -COMPLEX_FLOAT_DTYPE = Complex64, Complex128 +COMPLEX_FLOAT_DTYPE: Final = Complex64, Complex128 StringDType = FixedLengthUnicode | VariableLengthString | FixedLengthAscii -STRING_DTYPE = FixedLengthUnicode, VariableLengthString, FixedLengthAscii +STRING_DTYPE: Final = FixedLengthUnicode, VariableLengthString, FixedLengthAscii TimeDType = DateTime64 | TimeDelta64 -TIME_DTYPE = DateTime64, TimeDelta64 +TIME_DTYPE: Final = DateTime64, TimeDelta64 AnyDType = ( Bool @@ -90,7 +90,7 @@ ) # mypy has trouble inferring the type of variablelengthstring dtype, because its class definition # depends on the installed numpy version. That's why the type: ignore statement is needed here. -ANY_DTYPE: tuple[type[ZDType[TBaseDType, TBaseScalar]], ...] = ( # type: ignore[assignment] +ANY_DTYPE: Final = ( Bool, *INTEGER_DTYPE, *FLOAT_DTYPE, @@ -101,10 +101,12 @@ *TIME_DTYPE, ) -ZDTypeLike: TypeAlias = npt.DTypeLike | ZDType[TBaseDType, TBaseScalar] | dict[str, JSON] +# This type models inputs that can be coerced to a ZDType +ZDTypeLike: TypeAlias = npt.DTypeLike | ZDType[TBaseDType, TBaseScalar] | dict[str, JSON] | str for dtype in ANY_DTYPE: - data_type_registry.register(dtype._zarr_v3_name, dtype) + # mypy does not know that all the elements of ANY_DTYPE are subclasses of ZDType + data_type_registry.register(dtype._zarr_v3_name, dtype) # type: ignore[arg-type] # TODO: find a better name for this function From d8a382a167c0eaf043a158737b6c4aa5c6a252a7 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 16 May 2025 12:21:33 +0200 Subject: [PATCH 097/130] expand changelog --- changes/2874.feature.rst | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/changes/2874.feature.rst b/changes/2874.feature.rst index 50634e5395..4c50532ae0 100644 --- a/changes/2874.feature.rst +++ b/changes/2874.feature.rst @@ -1,2 +1,9 @@ 
-Adds zarr-specific data type classes. This replaces the direct use of numpy data types for zarr -v2 and a fixed set of string enums for zarr v3. For more on this new feature, see the `documentation `_ \ No newline at end of file +Adds zarr-specific data type classes. This replaces the internal use of numpy data types for zarr +v2 and a fixed set of string enums for zarr v3. This change is largely internal, but it does +change the type of the ``dtype`` and ``data_type`` fields on the ``ArrayV2Metadata`` and +``ArrayV3Metadata`` classes. It also changes the JSON metadata representation of the +variable-length string data type, but the old metadata representation can still be +used when reading arrays. The logic for automatically choosing the chunk encoding for a given data +type has also changed, and this necessitated changes to the ``config`` API. + +For more on this new feature, see the `documentation `_ \ No newline at end of file From 9aa751b170b83f92cc19e87bc44cddfa0bd9a7de Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 16 May 2025 18:50:33 +0200 Subject: [PATCH 098/130] tweak docstring --- src/zarr/core/dtype/wrapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index 199cbda5d8..3a56a85788 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -55,7 +55,7 @@ class ZDType(Generic[TDType_co, TScalar_co], ABC): Attributes ---------- dtype_cls : ClassVar[type[TDType]] - The numpy dtype class. This is a class variable. Instances of this class cannot set it. + The wrapped dtype class. This is a class variable. Instances of this class cannot set it. _zarr_v3_name : ClassVar[str] The name given to the wrapped data type by a zarr v3 data type specification. 
Note that this is not necessarily the same name that will appear in metadata documents, as some data types From e4a0372849b04d908a4a4e1487345615f96bed7d Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 19 May 2025 12:36:55 +0200 Subject: [PATCH 099/130] support v3 nan strings in JSON for float dtypes --- src/zarr/core/dtype/common.py | 8 +- src/zarr/core/dtype/npy/common.py | 198 +++++------------------ src/zarr/core/dtype/npy/complex.py | 33 ++-- src/zarr/core/dtype/npy/float.py | 49 ++++-- src/zarr/core/metadata/v3.py | 6 +- tests/test_dtype/test_npy/test_common.py | 108 ++++++------- tests/test_dtype/test_npy/test_float.py | 15 ++ tests/test_metadata/test_v3.py | 11 +- 8 files changed, 180 insertions(+), 248 deletions(-) diff --git a/src/zarr/core/dtype/common.py b/src/zarr/core/dtype/common.py index 4249c57b1f..ecc475192c 100644 --- a/src/zarr/core/dtype/common.py +++ b/src/zarr/core/dtype/common.py @@ -1,11 +1,13 @@ from __future__ import annotations from dataclasses import dataclass -from typing import Literal +from typing import Final, Literal Endianness = Literal["little", "big"] -SpecialFloats = Literal["NaN", "Infinity", "-Infinity"] -JSONFloat = float | SpecialFloats +SpecialFloatStrings = Literal["NaN", "Infinity", "-Infinity"] +SPECIAL_FLOAT_STRINGS: Final = ("NaN", "Infinity", "-Infinity") +JSONFloatV2 = float | SpecialFloatStrings +JSONFloatV3 = float | SpecialFloatStrings | str class DataTypeValidationError(ValueError): ... 
diff --git a/src/zarr/core/dtype/npy/common.py b/src/zarr/core/dtype/npy/common.py index 8033e48291..2481dcb150 100644 --- a/src/zarr/core/dtype/npy/common.py +++ b/src/zarr/core/dtype/npy/common.py @@ -1,6 +1,7 @@ from __future__ import annotations import base64 +import struct import sys from collections.abc import Sequence from typing import ( @@ -18,7 +19,7 @@ import numpy as np -from zarr.core.dtype.common import Endianness, JSONFloat +from zarr.core.dtype.common import SPECIAL_FLOAT_STRINGS, Endianness, JSONFloatV2, JSONFloatV3 if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat @@ -112,7 +113,7 @@ def endianness_to_numpy_str(endianness: Endianness | None) -> EndiannessNumpy: ) -def float_from_json_v2(data: JSONFloat) -> float: +def float_from_json_v2(data: JSONFloatV2) -> float: """ Convert a JSON float to a float (Zarr v2). @@ -137,7 +138,7 @@ def float_from_json_v2(data: JSONFloat) -> float: return float(data) -def float_from_json_v3(data: JSONFloat) -> float: +def float_from_json_v3(data: JSONFloatV3) -> float: """ Convert a JSON float to a float (v3). @@ -150,31 +151,35 @@ def float_from_json_v3(data: JSONFloat) -> float: ------- float The float value. - """ - # todo: support the v3-specific NaN handling - return float_from_json_v2(data) - -def float_from_json(data: JSONFloat, *, zarr_format: ZarrFormat) -> float: - """ - Convert a JSON float to a float based on zarr format. - - Parameters - ---------- - data : JSONFloat - The JSON float to convert. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - float - The float value. - """ - if zarr_format == 2: - return float_from_json_v2(data) - else: - return float_from_json_v3(data) + Notes + ----- + Zarr V3 allows floats to be stored as hex strings. To quote the spec: + "...for float32, "NaN" is equivalent to "0x7fc00000". + This representation is the only way to specify a NaN value other than the specific NaN value + denoted by "NaN"." 
+ """ + + if isinstance(data, str): + if data in SPECIAL_FLOAT_STRINGS: + return float_from_json_v2(data) # type: ignore[arg-type] + if not data.startswith("0x"): + msg = ( + f"Invalid float value: {data!r}. Expected a string starting with the hex prefix" + " '0x', or one of 'NaN', 'Infinity', or '-Infinity'." + ) + raise ValueError(msg) + if len(data[2:]) == 4: + dtype_code = ">e" + elif len(data[2:]) == 8: + dtype_code = ">f" + elif len(data[2:]) == 16: + dtype_code = ">d" + else: + msg = f"Invalid float value: {data!r}. Expected a string of length 4, 8, or 16." + raise ValueError(msg) + return float(struct.unpack(dtype_code, bytes.fromhex(data[2:]))[0]) + return float_from_json_v2(data) def bytes_from_json(data: str, *, zarr_format: ZarrFormat) -> bytes: @@ -221,7 +226,7 @@ def bytes_to_json(data: bytes, zarr_format: ZarrFormat) -> str: return base64.b64encode(data).decode("ascii") -def float_to_json_v2(data: float | np.floating[Any]) -> JSONFloat: +def float_to_json_v2(data: float | np.floating[Any]) -> JSONFloatV2: """ Convert a float to JSON (v2). @@ -242,7 +247,7 @@ def float_to_json_v2(data: float | np.floating[Any]) -> JSONFloat: return float(data) -def float_to_json_v3(data: float | np.floating[Any]) -> JSONFloat: +def float_to_json_v3(data: float | np.floating[Any]) -> JSONFloatV3: """ Convert a float to JSON (v3). @@ -261,32 +266,9 @@ def float_to_json_v3(data: float | np.floating[Any]) -> JSONFloat: return float_to_json_v2(data) -def float_to_json(data: float | np.floating[Any], *, zarr_format: ZarrFormat) -> JSONFloat: - """ - Convert a float to JSON, parametrized by the zarr format version. - - Parameters - ---------- - data : float | np.floating - The float value to convert. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - JSONFloat - The JSON representation of the float. 
- """ - if zarr_format == 2: - return float_to_json_v2(data) - else: - return float_to_json_v3(data) - raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") - - def complex_float_to_json_v3( data: complex | np.complexfloating[Any, Any], -) -> tuple[JSONFloat, JSONFloat]: +) -> tuple[JSONFloatV3, JSONFloatV3]: """ Convert a complex number to JSON as defined by the Zarr V3 spec. @@ -305,7 +287,7 @@ def complex_float_to_json_v3( def complex_float_to_json_v2( data: complex | np.complexfloating[Any, Any], -) -> tuple[JSONFloat, JSONFloat]: +) -> tuple[JSONFloatV2, JSONFloatV2]: """ Convert a complex number to JSON as defined by the Zarr V2 spec. @@ -322,32 +304,7 @@ def complex_float_to_json_v2( return float_to_json_v2(data.real), float_to_json_v2(data.imag) -def complex_float_to_json( - data: complex | np.complexfloating[Any, Any], *, zarr_format: ZarrFormat -) -> tuple[JSONFloat, JSONFloat]: - """ - Convert a complex number to JSON, parametrized by the zarr format version. - - Parameters - ---------- - data : complex | np.complexfloating - The complex value to convert. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - tuple[JSONFloat, JSONFloat] or JSONFloat - The JSON representation of the complex number. - """ - if zarr_format == 2: - return complex_float_to_json_v2(data) - else: - return complex_float_to_json_v3(data) - raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") - - -def complex_float_from_json_v2(data: tuple[JSONFloat, JSONFloat]) -> complex: +def complex_float_from_json_v2(data: tuple[JSONFloatV2, JSONFloatV2]) -> complex: """ Convert a JSON complex float to a complex number (v2). 
@@ -364,7 +321,7 @@ def complex_float_from_json_v2(data: tuple[JSONFloat, JSONFloat]) -> complex: return complex(float_from_json_v2(data[0]), float_from_json_v2(data[1])) -def complex_float_from_json_v3(data: tuple[JSONFloat, JSONFloat]) -> complex: +def complex_float_from_json_v3(data: tuple[JSONFloatV3, JSONFloatV3]) -> complex: """ Convert a JSON complex float to a complex number (v3). @@ -381,30 +338,7 @@ def complex_float_from_json_v3(data: tuple[JSONFloat, JSONFloat]) -> complex: return complex(float_from_json_v3(data[0]), float_from_json_v3(data[1])) -def complex_float_from_json(data: tuple[JSONFloat, JSONFloat], zarr_format: ZarrFormat) -> complex: - """ - Convert a JSON complex float to a complex number based on zarr format. - - Parameters - ---------- - data : tuple[JSONFloat, JSONFloat] - The JSON complex float to convert. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - np.complexfloating - The complex number. - """ - if zarr_format == 2: - return complex_float_from_json_v2(data) - else: - return complex_float_from_json_v3(data) - raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") - - -def check_json_float_v2(data: JSON) -> TypeGuard[JSONFloat]: +def check_json_float_v2(data: JSON) -> TypeGuard[JSONFloatV2]: """ Check if a JSON value represents a float (v2). @@ -423,7 +357,7 @@ def check_json_float_v2(data: JSON) -> TypeGuard[JSONFloat]: return isinstance(data, float | int) -def check_json_float_v3(data: JSON) -> TypeGuard[JSONFloat]: +def check_json_float_v3(data: JSON) -> TypeGuard[JSONFloatV3]: """ Check if a JSON value represents a float (v3). @@ -437,11 +371,10 @@ def check_json_float_v3(data: JSON) -> TypeGuard[JSONFloat]: Bool True if the data is a float, False otherwise. 
""" - # TODO: handle the special JSON serialization of different NaN values - return check_json_float_v2(data) + return check_json_float_v2(data) or (isinstance(data, str) and data.startswith("0x")) -def check_json_complex_float_v2(data: JSON) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: +def check_json_complex_float_v2(data: JSON) -> TypeGuard[tuple[JSONFloatV2, JSONFloatV2]]: """ Check if a JSON value represents a complex float, as per the behavior of zarr-python 2.x @@ -464,7 +397,7 @@ def check_json_complex_float_v2(data: JSON) -> TypeGuard[tuple[JSONFloat, JSONFl ) -def check_json_complex_float_v3(data: JSON) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: +def check_json_complex_float_v3(data: JSON) -> TypeGuard[tuple[JSONFloatV3, JSONFloatV3]]: """ Check if a JSON value represents a complex float, as per the zarr v3 spec @@ -487,51 +420,6 @@ def check_json_complex_float_v3(data: JSON) -> TypeGuard[tuple[JSONFloat, JSONFl ) -def check_json_complex_float( - data: JSON, zarr_format: ZarrFormat -) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: - """ - Check if a JSON value represents a complex float, given a zarr format. - - Parameters - ---------- - data : JSON - The JSON value to check. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - Bool - True if the data represents a complex float, False otherwise. - """ - if zarr_format == 2: - return check_json_complex_float_v2(data) - return check_json_complex_float_v3(data) - - -def check_json_float(data: JSON, zarr_format: ZarrFormat) -> TypeGuard[float]: - """ - Check if a JSON value represents a float based on zarr format. - - Parameters - ---------- - data : JSON - The JSON value to check. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - Bool - True if the data is a float, False otherwise. 
- """ - if zarr_format == 2: - return check_json_float_v2(data) - else: - return check_json_float_v3(data) - - def check_json_int(data: JSON) -> TypeGuard[int]: """ Check if a JSON value is an integer. diff --git a/src/zarr/core/dtype/npy/complex.py b/src/zarr/core/dtype/npy/complex.py index fab4ca9893..3e5f640946 100644 --- a/src/zarr/core/dtype/npy/complex.py +++ b/src/zarr/core/dtype/npy/complex.py @@ -15,9 +15,12 @@ ComplexLike, TComplexDType_co, TComplexScalar_co, - check_json_complex_float, - complex_float_from_json, - complex_float_to_json, + check_json_complex_float_v2, + check_json_complex_float_v3, + complex_float_from_json_v2, + complex_float_from_json_v3, + complex_float_to_json_v2, + complex_float_to_json_v3, endianness_from_numpy_str, endianness_to_numpy_str, ) @@ -113,11 +116,19 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TComplexSca TScalar_co The numpy float. """ - if check_json_complex_float(data, zarr_format=zarr_format): - return self._cast_value_unsafe(complex_float_from_json(data, zarr_format=zarr_format)) - raise TypeError( - f"Invalid type: {data}. Expected a float or a special string encoding of a float." - ) + if zarr_format == 2: + if check_json_complex_float_v2(data): + return self._cast_value_unsafe(complex_float_from_json_v2(data)) + raise TypeError( + f"Invalid type: {data}. Expected a float or a special string encoding of a float." + ) + elif zarr_format == 3: + if check_json_complex_float_v3(data): + return self._cast_value_unsafe(complex_float_from_json_v3(data)) + raise TypeError( + f"Invalid type: {data}. Expected a float or a special string encoding of a float." 
+ ) + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def to_json_value(self, data: object, zarr_format: ZarrFormat) -> JSON: """ @@ -136,7 +147,11 @@ def to_json_value(self, data: object, zarr_format: ZarrFormat) -> JSON: The JSON-serializable form of the complex number, which is a list of two floats, each of which is encoding according to a zarr-format-specific encoding. """ - return complex_float_to_json(self.cast_value(data), zarr_format=zarr_format) + if zarr_format == 2: + return complex_float_to_json_v2(self.cast_value(data)) + elif zarr_format == 3: + return complex_float_to_json_v3(self.cast_value(data)) + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @dataclass(frozen=True, kw_only=True) diff --git a/src/zarr/core/dtype/npy/float.py b/src/zarr/core/dtype/npy/float.py index bedd6a4751..e4d6e42ef3 100644 --- a/src/zarr/core/dtype/npy/float.py +++ b/src/zarr/core/dtype/npy/float.py @@ -10,11 +10,14 @@ FloatLike, TFloatDType_co, TFloatScalar_co, - check_json_float, + check_json_float_v2, + check_json_float_v3, endianness_from_numpy_str, endianness_to_numpy_str, - float_from_json, - float_to_json, + float_from_json_v2, + float_from_json_v3, + float_to_json_v2, + float_to_json_v3, ) from zarr.core.dtype.wrapper import TBaseDType, ZDType @@ -72,11 +75,11 @@ def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: return data == cls._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def check_value(self, value: object) -> TypeGuard[FloatLike]: - return isinstance(value, FloatLike) + def check_value(self, data: object) -> TypeGuard[FloatLike]: + return isinstance(data, FloatLike) - def _cast_value_unsafe(self, value: object) -> TFloatScalar_co: - return self.to_dtype().type(value) # type: ignore[return-value, arg-type] + def _cast_value_unsafe(self, data: object) -> TFloatScalar_co: + return self.to_dtype().type(data) 
# type: ignore[return-value, arg-type] def default_value(self) -> TFloatScalar_co: """ @@ -105,13 +108,24 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TFloatScala TScalar_co The numpy float. """ - if check_json_float(data, zarr_format=zarr_format): - return self._cast_value_unsafe(float_from_json(data, zarr_format=zarr_format)) - raise TypeError( - f"Invalid type: {data}. Expected a float or a special string encoding of a float." - ) - - def to_json_value(self, data: object, zarr_format: ZarrFormat) -> float | str: + if zarr_format == 2: + if check_json_float_v2(data): + return self._cast_value_unsafe(float_from_json_v2(data)) + else: + raise TypeError( + f"Invalid type: {data}. Expected a float or a special string encoding of a float." + ) + elif zarr_format == 3: + if check_json_float_v3(data): + return self._cast_value_unsafe(float_from_json_v3(data)) + else: + raise TypeError( + f"Invalid type: {data}. Expected a float or a special string encoding of a float." + ) + else: + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> float | str: """ Convert an object to a JSON-serializable float. @@ -128,7 +142,12 @@ def to_json_value(self, data: object, zarr_format: ZarrFormat) -> float | str: The JSON-serializable form of the float, which is potentially a number or a string. See the zarr specifications for details on the JSON encoding for floats. 
""" - return float_to_json(self._cast_value_unsafe(data), zarr_format=zarr_format) + if zarr_format == 2: + return float_to_json_v2(self._cast_value_unsafe(data)) + elif zarr_format == 3: + return float_to_json_v3(self._cast_value_unsafe(data)) + else: + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @dataclass(frozen=True, kw_only=True) diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 07856a3c7c..1c62e4b41c 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -295,7 +295,11 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: data_type = get_data_type_from_json(data_type_json, zarr_format=3) # check that the fill value is consistent with the data type - fill_value_parsed = data_type.from_json_value(_data.pop("fill_value"), zarr_format=3) + try: + fill = _data.pop("fill_value") + fill_value_parsed = data_type.from_json_value(fill, zarr_format=3) + except ValueError as e: + raise TypeError(f"Invalid fill_value: {fill!r}") from e # dimension_names key is optional, normalize missing to `None` _data["dimension_names"] = _data.pop("dimension_names", None) diff --git a/tests/test_dtype/test_npy/test_common.py b/tests/test_dtype/test_npy/test_common.py index 69a14a92b0..258ab48fe1 100644 --- a/tests/test_dtype/test_npy/test_common.py +++ b/tests/test_dtype/test_npy/test_common.py @@ -9,26 +9,22 @@ import numpy as np import pytest -from zarr.core.dtype.common import Endianness, JSONFloat, SpecialFloats +from zarr.core.dtype.common import Endianness, JSONFloatV2, SpecialFloatStrings from zarr.core.dtype.npy.common import ( EndiannessNumpy, bytes_from_json, bytes_to_json, check_json_bool, - check_json_complex_float, check_json_complex_float_v2, check_json_complex_float_v3, - check_json_float, check_json_float_v2, check_json_float_v3, check_json_int, check_json_str, - complex_float_to_json, complex_float_to_json_v2, complex_float_to_json_v3, endianness_from_numpy_str, 
endianness_to_numpy_str, - float_from_json, float_from_json_v2, float_from_json_v3, float_to_json_v2, @@ -49,7 +45,7 @@ def nan_equal(a: object, b: object) -> bool: return a == b -json_float_v2_cases: list[tuple[JSONFloat, float | np.floating[Any]]] = [ +json_float_v2_cases: list[tuple[JSONFloatV2, float | np.floating[Any]]] = [ ("Infinity", float("inf")), ("Infinity", np.inf), ("-Infinity", float("-inf")), @@ -99,12 +95,12 @@ def test_endianness_to_numpy_str(data: str | None, expected: str) -> None: @pytest.mark.parametrize(("data", "expected"), json_float_v2_cases + [("SHOULD_ERR", "")]) -def test_float_from_json_v2(data: JSONFloat | str, expected: float | str) -> None: +def test_float_from_json_v2(data: JSONFloatV2 | str, expected: float | str) -> None: """ Test that float_from_json_v2 correctly converts a JSON string representation of a float to a float. This test also checks that an invalid string input raises a ``ValueError`` """ - if data in get_args(SpecialFloats) or isinstance(data, float): + if data in get_args(SpecialFloatStrings) or isinstance(data, float): assert nan_equal(float_from_json_v2(data), expected) # type: ignore[arg-type] else: msg = f"could not convert string to float: {data!r}" @@ -113,36 +109,25 @@ def test_float_from_json_v2(data: JSONFloat | str, expected: float | str) -> Non @pytest.mark.parametrize(("data", "expected"), json_float_v3_cases + [("SHOULD_ERR", "")]) -def test_float_from_json_v3(data: JSONFloat | str, expected: float | str) -> None: +def test_float_from_json_v3(data: JSONFloatV2 | str, expected: float | str) -> None: """ Test that float_from_json_v3 correctly converts a JSON string representation of a float to a float. 
This test also checks that an invalid string input raises a ``ValueError`` """ - if data in get_args(SpecialFloats) or isinstance(data, float): - assert nan_equal(float_from_json_v3(data), expected) # type: ignore[arg-type] + if data in get_args(SpecialFloatStrings) or isinstance(data, float): + assert nan_equal(float_from_json_v3(data), expected) else: - msg = f"could not convert string to float: {data!r}" + msg = ( + f"Invalid float value: {data!r}. Expected a string starting with the hex prefix" + " '0x', or one of 'NaN', 'Infinity', or '-Infinity'." + ) with pytest.raises(ValueError, match=msg): - float_from_json_v3(data) # type: ignore[arg-type] - - -@pytest.mark.parametrize(("data", "expected"), json_float_v2_cases) -def test_float_from_json(data: JSONFloat, expected: float | str, zarr_format: ZarrFormat) -> None: - """ - Test that float_from_json_v3 correctly converts a JSON string representation of a float to a float. - This test also checks that an invalid string input raises a ``ValueError`` - """ - observed = float_from_json(data, zarr_format=zarr_format) - if zarr_format == 2: - expected = float_from_json_v2(data) - else: - expected = float_from_json_v3(data) - assert nan_equal(observed, expected) + float_from_json_v3(data) # note the order of parameters relative to the order of the parametrized variable. @pytest.mark.parametrize(("expected", "data"), json_float_v2_cases) -def test_float_to_json_v2(data: float | np.floating[Any], expected: JSONFloat) -> None: +def test_float_to_json_v2(data: float | np.floating[Any], expected: JSONFloatV2) -> None: """ Test that floats are JSON-encoded properly for zarr v2 """ @@ -152,7 +137,7 @@ def test_float_to_json_v2(data: float | np.floating[Any], expected: JSONFloat) - # note the order of parameters relative to the order of the parametrized variable. 
@pytest.mark.parametrize(("expected", "data"), json_float_v3_cases) -def test_float_to_json_v3(data: float | np.floating[Any], expected: JSONFloat) -> None: +def test_float_to_json_v3(data: float | np.floating[Any], expected: JSONFloatV2) -> None: """ Test that floats are JSON-encoded properly for zarr v3 """ @@ -186,7 +171,9 @@ def test_bytes_to_json(zarr_format: ZarrFormat) -> None: # note the order of parameters relative to the order of the parametrized variable. @pytest.mark.parametrize(("json_expected", "float_data"), json_float_v2_cases) -def test_complex_to_json_v2(float_data: float | np.floating[Any], json_expected: JSONFloat) -> None: +def test_complex_to_json_v2( + float_data: float | np.floating[Any], json_expected: JSONFloatV2 +) -> None: """ Test that complex numbers are correctly converted to JSON in v2 format. @@ -202,7 +189,9 @@ def test_complex_to_json_v2(float_data: float | np.floating[Any], json_expected: # note the order of parameters relative to the order of the parametrized variable. @pytest.mark.parametrize(("json_expected", "float_data"), json_float_v3_cases) -def test_complex_to_json_v3(float_data: float | np.floating[Any], json_expected: JSONFloat) -> None: +def test_complex_to_json_v3( + float_data: float | np.floating[Any], json_expected: JSONFloatV2 +) -> None: """ Test that complex numbers are correctly converted to JSON in v3 format. 
@@ -218,7 +207,7 @@ def test_complex_to_json_v3(float_data: float | np.floating[Any], json_expected: @pytest.mark.parametrize(("json_expected", "float_data"), json_float_v3_cases) def test_complex_float_to_json( - float_data: float | np.floating[Any], json_expected: JSONFloat, zarr_format: ZarrFormat + float_data: float | np.floating[Any], json_expected: JSONFloatV2, zarr_format: ZarrFormat ) -> None: """ Test that complex numbers are correctly converted to JSON in v2 or v3 formats, depending @@ -231,18 +220,27 @@ def test_complex_float_to_json( cplx = complex(float_data, float_data) cplx_npy = np.complex128(cplx) - assert complex_float_to_json(cplx, zarr_format=zarr_format) == (json_expected, json_expected) - assert complex_float_to_json(cplx_npy, zarr_format=zarr_format) == ( - json_expected, - json_expected, - ) + if zarr_format == 2: + assert complex_float_to_json_v2(cplx) == (json_expected, json_expected) + assert complex_float_to_json_v2(cplx_npy) == ( + json_expected, + json_expected, + ) + elif zarr_format == 3: + assert complex_float_to_json_v3(cplx) == (json_expected, json_expected) + assert complex_float_to_json_v3(cplx_npy) == ( + json_expected, + json_expected, + ) + else: + raise ValueError("zarr_format must be 2 or 3") # pragma: no cover -check_json_float_cases = get_args(SpecialFloats) + (1.0, 2) +check_json_float_cases = get_args(SpecialFloatStrings) + (1.0, 2) @pytest.mark.parametrize("data", check_json_float_cases) -def test_check_json_float_v2_valid(data: JSONFloat | int) -> None: +def test_check_json_float_v2_valid(data: JSONFloatV2 | int) -> None: assert check_json_float_v2(data) @@ -251,7 +249,7 @@ def test_check_json_float_v2_invalid() -> None: @pytest.mark.parametrize("data", check_json_float_cases) -def test_check_json_float_v3_valid(data: JSONFloat | int) -> None: +def test_check_json_float_v3_valid(data: JSONFloatV2 | int) -> None: assert check_json_float_v3(data) @@ -259,25 +257,15 @@ def test_check_json_float_v3_invalid() -> None: 
assert not check_json_float_v3("invalid") -@pytest.mark.parametrize("data", check_json_float_cases) -def test_check_json_float(data: JSONFloat | int, zarr_format: ZarrFormat) -> None: - observed = check_json_float(data, zarr_format=zarr_format) - if zarr_format == 2: - expected = check_json_float_v2(data) - else: - expected = check_json_float_v3(data) - assert observed == expected - - -check_json_complex_float_true_cases = ( +check_json_complex_float_true_cases: tuple[list[JSONFloatV2], ...] = ( + [0.0, 1.0], [0.0, 1.0], - (0.0, 1.0), [-1.0, "NaN"], ["Infinity", 1.0], ["Infinity", "NaN"], ) -check_json_complex_float_false_cases = ( +check_json_complex_float_false_cases: tuple[object, ...] = ( 0.0, "foo", [0.0], @@ -309,12 +297,22 @@ def test_check_json_complex_float_v3_false(data: JSON) -> None: @pytest.mark.parametrize("data", check_json_complex_float_true_cases) def test_check_json_complex_float_true(data: JSON, zarr_format: ZarrFormat) -> None: - assert check_json_complex_float(data, zarr_format=zarr_format) + if zarr_format == 2: + assert check_json_complex_float_v2(data) + elif zarr_format == 3: + assert check_json_complex_float_v3(data) + else: + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @pytest.mark.parametrize("data", check_json_complex_float_false_cases) def test_check_json_complex_float_false(data: JSON, zarr_format: ZarrFormat) -> None: - assert not check_json_complex_float(data, zarr_format=zarr_format) + if zarr_format == 2: + assert not check_json_complex_float_v2(data) + elif zarr_format == 3: + assert not check_json_complex_float_v3(data) + else: + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def test_check_json_int() -> None: diff --git a/tests/test_dtype/test_npy/test_float.py b/tests/test_dtype/test_npy/test_float.py index 5981d09514..ba43b6bcf6 100644 --- a/tests/test_dtype/test_npy/test_float.py +++ b/tests/test_dtype/test_npy/test_float.py @@ -12,6 +12,15 @@ 
def scalar_equals(self, scalar1: object, scalar2: object) -> bool: return True return super().scalar_equals(scalar1, scalar2) + hex_nan_params: tuple[str, ...] = () + + def test_hex_nan(self, hex_nan_params: str) -> None: + """ + Test that hexadecimal strings can be read as NaN values + """ + zdtype = self.test_cls() + assert np.isnan(zdtype.from_json_value(hex_nan_params, zarr_format=3)) + class TestFloat16(_BaseTestFloat): test_cls = Float16 @@ -52,6 +61,8 @@ class TestFloat16(_BaseTestFloat): (Float16(), "NaN", np.float16("NaN")), ) + hex_nan_params = ("0x7fc0", "0x7fc1") + class TestFloat32(_BaseTestFloat): test_cls = Float32 @@ -94,6 +105,8 @@ class TestFloat32(_BaseTestFloat): (Float32(), "NaN", np.float32("NaN")), ) + hex_nan_params = ("0x7fc00000", "0x7fc00001") + class TestFloat64(_BaseTestFloat): test_cls = Float64 @@ -134,3 +147,5 @@ class TestFloat64(_BaseTestFloat): (Float64(), -1.0, np.float64(-1.0)), (Float64(), "NaN", np.float64("NaN")), ) + + hex_nan_params = ("0x7ff8000000000000", "0x7ff8000000000001") diff --git a/tests/test_metadata/test_v3.py b/tests/test_metadata/test_v3.py index fa23dccf59..0d7da0153f 100644 --- a/tests/test_metadata/test_v3.py +++ b/tests/test_metadata/test_v3.py @@ -12,7 +12,6 @@ from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding, V2ChunkKeyEncoding from zarr.core.config import config from zarr.core.dtype import get_data_type_from_native_dtype -from zarr.core.dtype.npy.common import check_json_complex_float from zarr.core.dtype.npy.time import DateTime64 from zarr.core.group import GroupMetadata, parse_node_type from zarr.core.metadata.v3 import ( @@ -28,7 +27,7 @@ from typing import Any from zarr.abc.codec import Codec - from zarr.core.common import JSON, ZarrFormat + from zarr.core.common import JSON from zarr.core.metadata.v3 import ( @@ -135,14 +134,6 @@ def test_jsonify_fill_value_complex(fill_value: Any, dtype_str: str) -> None: assert dtype.to_json_value(observed, zarr_format=zarr_format) == 
tuple(fill_value) -@pytest.mark.parametrize("data", [[1.0, 0.0, 3.0], [0, 1, 3], [1]]) -def test_complex_to_json_invalid(data: object, zarr_format: ZarrFormat) -> None: - assert not check_json_complex_float(data, zarr_format=zarr_format) - # match = f"Invalid type: {data}. Expected a sequence of two numbers." - # with pytest.raises(TypeError, match=re.escape(match)): - # complex_float_from_json(data=data, zarr_format=3) - - @pytest.mark.parametrize("fill_value", [{"foo": 10}]) @pytest.mark.parametrize("dtype_str", [*int_dtypes, *float_dtypes, *complex_dtypes]) def test_parse_fill_value_invalid_type(fill_value: Any, dtype_str: str) -> None: From 8a976d6797508bd4d5167e51e495bf6d9cdd4f74 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 21 May 2025 17:21:28 +0200 Subject: [PATCH 100/130] revert removal of metadata chunk grid attribute --- src/zarr/core/array.py | 58 +++++++++++------------------------- src/zarr/core/metadata/v2.py | 6 ++++ src/zarr/core/metadata/v3.py | 14 +++++++++ tests/test_array.py | 2 +- tests/test_group.py | 2 +- 5 files changed, 40 insertions(+), 42 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 2e3911361a..0e450d028a 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -5,7 +5,6 @@ from asyncio import gather from collections.abc import Iterable from dataclasses import dataclass, field, replace -from functools import cached_property from itertools import starmap from logging import getLogger from typing import ( @@ -32,7 +31,7 @@ from zarr.codecs._v2 import V2Codec from zarr.codecs.bytes import BytesCodec from zarr.core._info import ArrayInfo -from zarr.core.array_spec import ArrayConfig, ArrayConfigLike, ArraySpec, parse_array_config +from zarr.core.array_spec import ArrayConfig, ArrayConfigLike, parse_array_config from zarr.core.attributes import Attributes from zarr.core.buffer import ( BufferPrototype, @@ -42,7 +41,7 @@ default_buffer_prototype, ) from zarr.core.buffer.cpu import 
buffer_prototype as cpu_buffer_prototype -from zarr.core.chunk_grids import ChunkGrid, RegularChunkGrid, _auto_partition, normalize_chunks +from zarr.core.chunk_grids import RegularChunkGrid, _auto_partition, normalize_chunks from zarr.core.chunk_key_encodings import ( ChunkKeyEncoding, ChunkKeyEncodingLike, @@ -951,13 +950,6 @@ def chunks(self) -> ChunkCoords: """ return self.metadata.chunks - @cached_property - def chunk_grid(self) -> ChunkGrid: - if self.metadata.zarr_format == 2: - return RegularChunkGrid(chunk_shape=self.chunks) - else: - return self.metadata.chunk_grid - @property def shards(self) -> ChunkCoords | None: """Returns the shard shape of the Array. @@ -1281,20 +1273,6 @@ def nbytes(self) -> int: """ return self.size * self.dtype.itemsize - def get_chunk_spec( - self, _chunk_coords: ChunkCoords, array_config: ArrayConfig, prototype: BufferPrototype - ) -> ArraySpec: - assert isinstance(self.chunk_grid, RegularChunkGrid), ( - "Currently, only regular chunk grid is supported" - ) - return ArraySpec( - shape=self.chunk_grid.chunk_shape, - dtype=self._zdtype, - fill_value=self.metadata.fill_value, - config=array_config, - prototype=prototype, - ) - async def _get_selection( self, indexer: Indexer, @@ -1334,7 +1312,7 @@ async def _get_selection( [ ( self.store_path / self.metadata.encode_chunk_key(chunk_coords), - self.get_chunk_spec(chunk_coords, _config, prototype=prototype), + self.metadata.get_chunk_spec(chunk_coords, _config, prototype=prototype), chunk_selection, out_selection, is_complete_chunk, @@ -1389,7 +1367,7 @@ async def getitem( indexer = BasicIndexer( selection, shape=self.metadata.shape, - chunk_grid=self.chunk_grid, + chunk_grid=self.metadata.chunk_grid, ) return await self._get_selection(indexer, prototype=prototype) @@ -1464,7 +1442,7 @@ async def _set_selection( [ ( self.store_path / self.metadata.encode_chunk_key(chunk_coords), - self.get_chunk_spec(chunk_coords, _config, prototype), + self.metadata.get_chunk_spec(chunk_coords, 
_config, prototype), chunk_selection, out_selection, is_complete_chunk, @@ -1519,7 +1497,7 @@ async def setitem( indexer = BasicIndexer( selection, shape=self.metadata.shape, - chunk_grid=self.chunk_grid, + chunk_grid=self.metadata.chunk_grid, ) return await self._set_selection(indexer, value, prototype=prototype) @@ -1556,8 +1534,8 @@ async def resize(self, new_shape: ShapeLike, delete_outside_chunks: bool = True) if delete_outside_chunks: # Remove all chunks outside of the new shape - old_chunk_coords = set(self.chunk_grid.all_chunk_coords(self.metadata.shape)) - new_chunk_coords = set(self.chunk_grid.all_chunk_coords(new_shape)) + old_chunk_coords = set(self.metadata.chunk_grid.all_chunk_coords(self.metadata.shape)) + new_chunk_coords = set(self.metadata.chunk_grid.all_chunk_coords(new_shape)) async def _delete_key(key: str) -> None: await (self.store_path / key).delete() @@ -2687,7 +2665,7 @@ def get_basic_selection( prototype = default_buffer_prototype() return sync( self._async_array._get_selection( - BasicIndexer(selection, self.shape, self._async_array.chunk_grid), + BasicIndexer(selection, self.shape, self.metadata.chunk_grid), out=out, fields=fields, prototype=prototype, @@ -2787,7 +2765,7 @@ def set_basic_selection( """ if prototype is None: prototype = default_buffer_prototype() - indexer = BasicIndexer(selection, self.shape, self._async_array.chunk_grid) + indexer = BasicIndexer(selection, self.shape, self.metadata.chunk_grid) sync(self._async_array._set_selection(indexer, value, fields=fields, prototype=prototype)) @_deprecate_positional_args @@ -2908,7 +2886,7 @@ def get_orthogonal_selection( """ if prototype is None: prototype = default_buffer_prototype() - indexer = OrthogonalIndexer(selection, self.shape, self._async_array.chunk_grid) + indexer = OrthogonalIndexer(selection, self.shape, self.metadata.chunk_grid) return sync( self._async_array._get_selection( indexer=indexer, out=out, fields=fields, prototype=prototype @@ -3021,7 +2999,7 @@ def 
set_orthogonal_selection( """ if prototype is None: prototype = default_buffer_prototype() - indexer = OrthogonalIndexer(selection, self.shape, self._async_array.chunk_grid) + indexer = OrthogonalIndexer(selection, self.shape, self.metadata.chunk_grid) return sync( self._async_array._set_selection(indexer, value, fields=fields, prototype=prototype) ) @@ -3102,7 +3080,7 @@ def get_mask_selection( if prototype is None: prototype = default_buffer_prototype() - indexer = MaskIndexer(mask, self.shape, self._async_array.chunk_grid) + indexer = MaskIndexer(mask, self.shape, self.metadata.chunk_grid) return sync( self._async_array._get_selection( indexer=indexer, out=out, fields=fields, prototype=prototype @@ -3185,7 +3163,7 @@ def set_mask_selection( """ if prototype is None: prototype = default_buffer_prototype() - indexer = MaskIndexer(mask, self.shape, self._async_array.chunk_grid) + indexer = MaskIndexer(mask, self.shape, self.metadata.chunk_grid) sync(self._async_array._set_selection(indexer, value, fields=fields, prototype=prototype)) @_deprecate_positional_args @@ -3266,7 +3244,7 @@ def get_coordinate_selection( """ if prototype is None: prototype = default_buffer_prototype() - indexer = CoordinateIndexer(selection, self.shape, self._async_array.chunk_grid) + indexer = CoordinateIndexer(selection, self.shape, self.metadata.chunk_grid) out_array = sync( self._async_array._get_selection( indexer=indexer, out=out, fields=fields, prototype=prototype @@ -3352,7 +3330,7 @@ def set_coordinate_selection( if prototype is None: prototype = default_buffer_prototype() # setup indexer - indexer = CoordinateIndexer(selection, self.shape, self._async_array.chunk_grid) + indexer = CoordinateIndexer(selection, self.shape, self.metadata.chunk_grid) # handle value - need ndarray-like flatten value if not is_scalar(value, self.dtype): @@ -3468,7 +3446,7 @@ def get_block_selection( """ if prototype is None: prototype = default_buffer_prototype() - indexer = BlockIndexer(selection, 
self.shape, self._async_array.chunk_grid) + indexer = BlockIndexer(selection, self.shape, self.metadata.chunk_grid) return sync( self._async_array._get_selection( indexer=indexer, out=out, fields=fields, prototype=prototype @@ -3562,7 +3540,7 @@ def set_block_selection( """ if prototype is None: prototype = default_buffer_prototype() - indexer = BlockIndexer(selection, self.shape, self._async_array.chunk_grid) + indexer = BlockIndexer(selection, self.shape, self.metadata.chunk_grid) sync(self._async_array._set_selection(indexer, value, fields=fields, prototype=prototype)) @property diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 585771b0b3..6f5d52a972 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -3,11 +3,13 @@ import base64 import warnings from collections.abc import Iterable, Sequence +from functools import cached_property from typing import TYPE_CHECKING, Any, TypeAlias, TypedDict import numcodecs.abc from zarr.abc.metadata import Metadata +from zarr.core.chunk_grids import RegularChunkGrid from zarr.core.dtype import get_data_type_from_native_dtype from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, TDType_co, TScalar_co, ZDType @@ -103,6 +105,10 @@ def __init__( def ndim(self) -> int: return len(self.shape) + @cached_property + def chunk_grid(self) -> RegularChunkGrid: + return RegularChunkGrid(chunk_shape=self.chunks) + @property def shards(self) -> ChunkCoords | None: return None diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 1c62e4b41c..606d373cba 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -269,6 +269,20 @@ def inner_codecs(self) -> tuple[Codec, ...]: return self.codecs[0].codecs return self.codecs + def get_chunk_spec( + self, _chunk_coords: ChunkCoords, array_config: ArrayConfig, prototype: BufferPrototype + ) -> ArraySpec: + assert isinstance(self.chunk_grid, RegularChunkGrid), ( + "Currently, only regular chunk 
grid is supported" + ) + return ArraySpec( + shape=self.chunk_grid.chunk_shape, + dtype=self.dtype, + fill_value=self.fill_value, + config=array_config, + prototype=prototype, + ) + def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str: return self.chunk_key_encoding.encode_chunk_key(chunk_coords) diff --git a/tests/test_array.py b/tests/test_array.py index 99a5b8a0d7..7d6b877547 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1363,7 +1363,7 @@ async def test_with_data(impl: Literal["sync", "async"], store: Store) -> None: elif impl == "async": arr = await create_array(store, name=name, data=data, zarr_format=3) stored = await arr._get_selection( - BasicIndexer(..., shape=arr.shape, chunk_grid=arr.chunk_grid), + BasicIndexer(..., shape=arr.shape, chunk_grid=arr.metadata.chunk_grid), prototype=default_buffer_prototype(), ) else: diff --git a/tests/test_group.py b/tests/test_group.py index 72f7575b8d..b4dace2568 100644 --- a/tests/test_group.py +++ b/tests/test_group.py @@ -1007,7 +1007,7 @@ async def test_asyncgroup_create_array( assert subnode.dtype == dtype # todo: fix the type annotation of array.metadata.chunk_grid so that we get some autocomplete # here. 
- assert subnode.chunk_grid.chunk_shape == chunk_shape + assert subnode.metadata.chunk_grid.chunk_shape == chunk_shape assert subnode.metadata.zarr_format == zarr_format From be0d2dfb48c2696eabc7e77d5b755ba2d342b9a4 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 22 May 2025 13:23:15 +0200 Subject: [PATCH 101/130] use none to denote default fill value; remove old structured tests; use cast_value where appropriate --- src/zarr/api/synchronous.py | 2 +- src/zarr/core/array.py | 3 +- src/zarr/core/dtype/npy/sized.py | 23 ++++++-- src/zarr/core/dtype/wrapper.py | 6 +-- src/zarr/core/metadata/v2.py | 91 ++++---------------------------- src/zarr/core/metadata/v3.py | 2 +- tests/test_metadata/test_v2.py | 8 ++- tests/test_v2.py | 48 ++--------------- 8 files changed, 41 insertions(+), 142 deletions(-) diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 9a9f800881..6cabfed446 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -601,7 +601,7 @@ def create( chunks: ChunkCoords | int | bool | None = None, dtype: npt.DTypeLike | None = None, compressor: CompressorLike = "auto", - fill_value: Any | None = 0, # TODO: need type + fill_value: Any | None = None, # TODO: need type order: MemoryOrder | None = None, store: str | StoreLike | None = None, synchronizer: Any | None = None, diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 0e450d028a..e379ee660a 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -778,7 +778,8 @@ def _create_metadata_v2( ) -> ArrayV2Metadata: if dimension_separator is None: dimension_separator = "." 
- + if fill_value is None: + fill_value = dtype.default_value() # type: ignore[assignment] return ArrayV2Metadata( shape=shape, dtype=dtype, diff --git a/src/zarr/core/dtype/npy/sized.py b/src/zarr/core/dtype/npy/sized.py index 281c634856..7ca507b84e 100644 --- a/src/zarr/core/dtype/npy/sized.py +++ b/src/zarr/core/dtype/npy/sized.py @@ -79,7 +79,8 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: raise TypeError(f"Invalid type: {data}. Expected a string.") # pragma: no cover def check_value(self, data: object) -> bool: - return isinstance(data, np.bytes_ | str | bytes) + # this is generous for backwards compatibility + return isinstance(data, np.bytes_ | str | bytes | int) def _cast_value_unsafe(self, value: object) -> np.bytes_: return self.to_dtype().type(value) @@ -168,7 +169,11 @@ def check_value(self, data: object) -> bool: return isinstance(data, np.bytes_ | str | bytes | np.void) def _cast_value_unsafe(self, value: object) -> np.void: - return self.to_dtype().type(value) # type: ignore[call-overload, no-any-return] + native_dtype = self.to_dtype() + # Without the second argument, numpy will return a void scalar for dtype V1. + # The second argument ensures that, if native_dtype is something like V10, + # the result will actually be a V10 scalar. + return native_dtype.type(value, native_dtype) @dataclass(frozen=True, kw_only=True) @@ -239,7 +244,8 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_: raise TypeError(f"Invalid type: {data}. 
Expected a string.") # pragma: no cover def check_value(self, data: object) -> bool: - return isinstance(data, str | np.str_ | bytes) + # this is generous for backwards compatibility + return isinstance(data, str | np.str_ | bytes | int) def _cast_value_unsafe(self, value: object) -> np.str_: return self.to_dtype().type(value) @@ -254,8 +260,15 @@ class Structured(ZDType[np.dtypes.VoidDType[int], np.void]): def default_value(self) -> np.void: return self._cast_value_unsafe(0) - def _cast_value_unsafe(self, value: object) -> np.void: - return cast("np.void", np.array([value], dtype=self.to_dtype())[0]) + def _cast_value_unsafe(self, data: object) -> np.void: + na_dtype = self.to_dtype() + if isinstance(data, bytes): + res = np.frombuffer(data, dtype=na_dtype)[0] + elif isinstance(data, list | tuple): + res = np.array([tuple(data)], dtype=na_dtype)[0] + else: + res = np.array([data], dtype=na_dtype)[0] + return cast("np.void", res) @classmethod def check_dtype(cls, dtype: TBaseDType) -> TypeGuard[np.dtypes.VoidDType[int]]: diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index 3a56a85788..c8e060e764 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -160,9 +160,9 @@ def cast_value(self, data: object) -> TScalar_co: if self.check_value(data): return self._cast_value_unsafe(data) msg = ( - f"The value {data} failed a type check." - f"It cannot be safely cast to a scalar compatible with {self.dtype_cls}." - f"Consult the documentation for {self} to determine the possible values that can" + f"The value {data} failed a type check. " + f"It cannot be safely cast to a scalar compatible with {self.dtype_cls}. " + f"Consult the documentation for {self} to determine the possible values that can " "be cast to scalars of the wrapped data type." 
) raise TypeError(msg) diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 6f5d52a972..23a0275691 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -1,6 +1,5 @@ from __future__ import annotations -import base64 import warnings from collections.abc import Iterable, Sequence from functools import cached_property @@ -52,7 +51,7 @@ class ArrayV2Metadata(Metadata): shape: ChunkCoords chunks: ChunkCoords dtype: ZDType[TBaseDType, TBaseScalar] - fill_value: int | float | str | bytes | None = 0 + fill_value: int | float | str | bytes | None = None order: MemoryOrder = "C" filters: tuple[numcodecs.abc.Codec, ...] | None = None dimension_separator: Literal[".", "/"] = "." @@ -85,7 +84,11 @@ def __init__( order_parsed = parse_indexing_order(order) dimension_separator_parsed = parse_separator(dimension_separator) filters_parsed = parse_filters(filters) - fill_value_parsed = parse_fill_value(fill_value, dtype=dtype.to_dtype()) + fill_value_parsed: TBaseScalar | None + if fill_value is not None: + fill_value_parsed = dtype.cast_value(fill_value) + else: + fill_value_parsed = fill_value attributes_parsed = parse_attributes(attributes) object.__setattr__(self, "shape", shape_parsed) @@ -134,11 +137,10 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: _ = parse_zarr_format(_data.pop("zarr_format")) dtype = get_data_type_from_native_dtype(_data["dtype"]) _data["dtype"] = dtype - if dtype.to_dtype().kind in "SV": - fill_value_encoded = _data.get("fill_value") - if fill_value_encoded is not None: - fill_value = base64.standard_b64decode(fill_value_encoded) - _data["fill_value"] = fill_value + fill_value_encoded = _data.get("fill_value") + if fill_value_encoded is not None: + fill_value = dtype.from_json_value(fill_value_encoded, zarr_format=2) + _data["fill_value"] = fill_value # zarr v2 allowed arbitrary keys here. 
# We don't want the ArrayV2Metadata constructor to fail just because someone put an @@ -281,76 +283,3 @@ def parse_metadata(data: ArrayV2Metadata) -> ArrayV2Metadata: ) raise ValueError(msg) return data - - -def _parse_structured_fill_value(fill_value: Any, dtype: np.dtype[Any]) -> Any: - """Handle structured dtype/fill value pairs""" - try: - if isinstance(fill_value, list): - return np.array([tuple(fill_value)], dtype=dtype)[0] - elif isinstance(fill_value, tuple): - return np.array([fill_value], dtype=dtype)[0] - elif isinstance(fill_value, bytes): - return np.frombuffer(fill_value, dtype=dtype)[0] - elif isinstance(fill_value, str): - decoded = base64.standard_b64decode(fill_value) - return np.frombuffer(decoded, dtype=dtype)[0] - else: - return np.array(fill_value, dtype=dtype)[()] - except Exception as e: - raise ValueError(f"Fill_value {fill_value} is not valid for dtype {dtype}.") from e - - -def parse_fill_value(fill_value: Any, dtype: np.dtype[Any]) -> Any: - """ - Parse a potential fill value into a value that is compatible with the provided dtype. - - Parameters - ---------- - fill_value : Any - A potential fill value. - dtype : np.dtype[Any] - A numpy dtype. - - Returns - ------- - An instance of `dtype`, or `None`, or any python object (in the case of an object dtype) - """ - - if fill_value is None or dtype.hasobject: - pass - elif dtype.fields is not None: - # the dtype is structured (has multiple fields), so the fill_value might be a - # compound value (e.g., a tuple or dict) that needs field-wise processing. - # We use parse_structured_fill_value to correctly convert each component. 
- fill_value = _parse_structured_fill_value(fill_value, dtype) - elif not isinstance(fill_value, np.void) and fill_value == 0: - # this should be compatible across numpy versions for any array type, including - # structured arrays - fill_value = np.zeros((), dtype=dtype)[()] - elif dtype.kind == "U": - # special case unicode because of encoding issues on Windows if passed through numpy - # https://github.com/alimanfoo/zarr/pull/172#issuecomment-343782713 - - if not isinstance(fill_value, str): - raise ValueError( - f"fill_value {fill_value!r} is not valid for dtype {dtype}; must be a unicode string" - ) - elif dtype.kind in "SV" and isinstance(fill_value, str): - fill_value = base64.standard_b64decode(fill_value) - elif dtype.kind == "c" and isinstance(fill_value, list) and len(fill_value) == 2: - complex_val = complex(float(fill_value[0]), float(fill_value[1])) - fill_value = np.array(complex_val, dtype=dtype)[()] - else: - try: - if isinstance(fill_value, bytes) and dtype.kind == "V": - # special case for numpy 1.14 compatibility - fill_value = np.array(fill_value, dtype=dtype.str).view(dtype)[()] - else: - fill_value = np.array(fill_value, dtype=dtype)[()] - - except Exception as e: - msg = f"Fill_value {fill_value} is not valid for dtype {dtype}." 
- raise ValueError(msg) from e - - return fill_value diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 606d373cba..80ed722836 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -175,7 +175,7 @@ def __init__( chunk_key_encoding_parsed = ChunkKeyEncoding.from_dict(chunk_key_encoding) dimension_names_parsed = parse_dimension_names(dimension_names) # Note: relying on a type method is numpy-specific - fill_value_parsed = data_type.to_dtype().type(fill_value) + fill_value_parsed = data_type.cast_value(fill_value) attributes_parsed = parse_attributes(attributes) codecs_parsed_partial = parse_codecs(codecs) storage_transformers_parsed = parse_storage_transformers(storage_transformers) diff --git a/tests/test_metadata/test_v2.py b/tests/test_metadata/test_v2.py index aa8cfc4a31..5fd3ae8cc6 100644 --- a/tests/test_metadata/test_v2.py +++ b/tests/test_metadata/test_v2.py @@ -128,7 +128,7 @@ async def v2_consolidated_metadata( "chunks": [730], "compressor": None, "dtype": " None: assert "checksum" not in metadata["compressor"] -@pytest.mark.parametrize( - "fill_value", [None, np.void((0, 0), np.dtype([("foo", "i4"), ("bar", "i4")]))] -) +@pytest.mark.parametrize("fill_value", [np.void((0, 0), np.dtype([("foo", "i4"), ("bar", "i4")]))]) def test_structured_dtype_fill_value_serialization(tmp_path, fill_value): zarr_format = 2 group_path = tmp_path / "test.zarr" diff --git a/tests/test_v2.py b/tests/test_v2.py index 145c3d58fb..51139bbeb4 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -15,7 +15,7 @@ from zarr import config from zarr.abc.store import Store from zarr.core.buffer.core import default_buffer_prototype -from zarr.core.metadata.v2 import _parse_structured_fill_value +from zarr.core.dtype.npy.sized import Structured from zarr.core.sync import sync from zarr.storage import MemoryStore, StorePath @@ -261,35 +261,18 @@ def test_structured_dtype_roundtrip(fill_value, tmp_path) -> None: np.dtype([("x", "i4"), 
("y", "i4")]), np.array([(1, 2)], dtype=[("x", "i4"), ("y", "i4")])[0], ), - ( - "BQAAAA==", - np.dtype([("val", "i4")]), - np.array([(5,)], dtype=[("val", "i4")])[0], - ), - ( - {"x": 1, "y": 2}, - np.dtype([("location", "O")]), - np.array([({"x": 1, "y": 2},)], dtype=[("location", "O")])[0], - ), - ( - {"x": 1, "y": 2, "z": 3}, - np.dtype([("location", "O")]), - np.array([({"x": 1, "y": 2, "z": 3},)], dtype=[("location", "O")])[0], - ), ], ids=[ "tuple_input", "list_input", "bytes_input", - "string_input", - "dictionary_input", - "dictionary_input_extra_fields", ], ) def test_parse_structured_fill_value_valid( fill_value: Any, dtype: np.dtype[Any], expected_result: Any ) -> None: - result = _parse_structured_fill_value(fill_value, dtype) + zdtype = Structured.from_dtype(dtype) + result = zdtype.cast_value(fill_value) assert result.dtype == expected_result.dtype assert result == expected_result if isinstance(expected_result, np.void): @@ -297,31 +280,6 @@ def test_parse_structured_fill_value_valid( assert result[name] == expected_result[name] -@pytest.mark.parametrize( - ( - "fill_value", - "dtype", - ), - [ - (("Alice", 30), np.dtype([("name", "U10"), ("age", "i4"), ("city", "U20")])), - (b"\x01\x00\x00\x00", np.dtype([("x", "i4"), ("y", "i4")])), - ("this_is_not_base64", np.dtype([("val", "i4")])), - ("hello", np.dtype([("age", "i4")])), - ({"x": 1, "y": 2}, np.dtype([("location", "i4")])), - ], - ids=[ - "tuple_list_wrong_length", - "bytes_wrong_length", - "invalid_base64", - "wrong_data_type", - "wrong_dictionary", - ], -) -def test_parse_structured_fill_value_invalid(fill_value: Any, dtype: np.dtype[Any]) -> None: - with pytest.raises(ValueError): - _parse_structured_fill_value(fill_value, dtype) - - @pytest.mark.parametrize("fill_value", [None, b"x"], ids=["no_fill", "fill"]) def test_other_dtype_roundtrip(fill_value, tmp_path) -> None: a = np.array([b"a\0\0", b"bb", b"ccc"], dtype="V7") From 8c90d2ca827de0846f9ce65e045b24e5b5682527 Mon Sep 17 00:00:00 2001 
From: Davis Vann Bennett Date: Thu, 22 May 2025 14:51:01 +0200 Subject: [PATCH 102/130] add item size abstraction --- src/zarr/codecs/blosc.py | 9 +++-- src/zarr/codecs/bytes.py | 6 +-- src/zarr/core/array.py | 9 +++-- src/zarr/core/dtype/common.py | 13 +++++++ src/zarr/core/dtype/npy/bool.py | 13 +++++-- src/zarr/core/dtype/npy/complex.py | 22 +++++++---- src/zarr/core/dtype/npy/float.py | 16 +++++++- src/zarr/core/dtype/npy/int.py | 46 +++++++++++++++++++---- src/zarr/core/dtype/npy/sized.py | 45 +++++++++++++++------- src/zarr/core/dtype/npy/string.py | 8 ++-- src/zarr/core/dtype/npy/time.py | 16 +++++--- tests/conftest.py | 7 +++- tests/test_dtype/test_npy/test_bool.py | 1 + tests/test_dtype/test_npy/test_complex.py | 3 ++ tests/test_dtype/test_npy/test_float.py | 21 ++++++++--- tests/test_dtype/test_npy/test_int.py | 9 +++++ tests/test_dtype/test_npy/test_sized.py | 20 ++++++++++ tests/test_dtype/test_npy/test_string.py | 3 ++ tests/test_dtype/test_npy/test_time.py | 2 + tests/test_dtype/test_wrapper.py | 16 +++++++- 20 files changed, 223 insertions(+), 62 deletions(-) diff --git a/src/zarr/codecs/blosc.py b/src/zarr/codecs/blosc.py index fc9b656847..1c5e52e9a4 100644 --- a/src/zarr/codecs/blosc.py +++ b/src/zarr/codecs/blosc.py @@ -13,6 +13,7 @@ from zarr.abc.codec import BytesBytesCodec from zarr.core.buffer.cpu import as_numpy_array_wrapper from zarr.core.common import JSON, parse_enum, parse_named_configuration +from zarr.core.dtype.common import HasItemSize from zarr.registry import register_codec if TYPE_CHECKING: @@ -137,14 +138,16 @@ def to_dict(self) -> dict[str, JSON]: } def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: - dtype = array_spec.dtype.to_dtype() + item_size = 1 + if isinstance(array_spec.dtype, HasItemSize): + item_size = array_spec.dtype.item_size new_codec = self if new_codec.typesize is None: - new_codec = replace(new_codec, typesize=dtype.itemsize) + new_codec = replace(new_codec, typesize=item_size) if 
new_codec.shuffle is None: new_codec = replace( new_codec, - shuffle=(BloscShuffle.bitshuffle if dtype.itemsize == 1 else BloscShuffle.shuffle), + shuffle=(BloscShuffle.bitshuffle if item_size == 1 else BloscShuffle.shuffle), ) return new_codec diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py index a87df060e7..5db39796e4 100644 --- a/src/zarr/codecs/bytes.py +++ b/src/zarr/codecs/bytes.py @@ -10,6 +10,7 @@ from zarr.abc.codec import ArrayBytesCodec from zarr.core.buffer import Buffer, NDArrayLike, NDBuffer from zarr.core.common import JSON, parse_enum, parse_named_configuration +from zarr.core.dtype.common import HasEndianness from zarr.core.dtype.npy.common import endianness_to_numpy_str from zarr.registry import register_codec @@ -58,10 +59,7 @@ def to_dict(self) -> dict[str, JSON]: return {"name": "bytes", "configuration": {"endian": self.endian.value}} def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: - # Note: this check is numpy-dtype-specific - # For single-byte (e.g., uint8) or 0-byte (e.g., S0) dtypes, - # endianness does not apply. 
- if array_spec.dtype.to_dtype().itemsize < 2: + if not isinstance(array_spec.dtype, HasEndianness): if self.endian is not None: return replace(self, endian=None) elif self.endian is None: diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index e379ee660a..a4e8c7c3d1 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -72,6 +72,7 @@ ZDTypeLike, parse_data_type, ) +from zarr.core.dtype.common import HasItemSize from zarr.core.indexing import ( BasicIndexer, BasicSelection, @@ -586,11 +587,13 @@ async def _create( if chunks is not None and chunk_shape is not None: raise ValueError("Only one of chunk_shape or chunks can be provided.") - + item_size = 1 + if isinstance(dtype_parsed, HasItemSize): + item_size = dtype_parsed.item_size if chunks: - _chunks = normalize_chunks(chunks, shape, dtype_parsed.to_dtype().itemsize) + _chunks = normalize_chunks(chunks, shape, item_size) else: - _chunks = normalize_chunks(chunk_shape, shape, dtype_parsed.to_dtype().itemsize) + _chunks = normalize_chunks(chunk_shape, shape, item_size) config_parsed = parse_array_config(config) result: AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata] diff --git a/src/zarr/core/dtype/common.py b/src/zarr/core/dtype/common.py index ecc475192c..d4aded658d 100644 --- a/src/zarr/core/dtype/common.py +++ b/src/zarr/core/dtype/common.py @@ -30,3 +30,16 @@ class HasEndianness: """ endianness: Endianness | None = "little" + + +@dataclass(frozen=True) +class HasItemSize: + """ + A mix-in class for data types with an item size attribute. + This mix-in bears a property ``item_size``, which denotes the size of each element of the data + type, in bytes. 
+ """ + + @property + def item_size(self) -> int: + raise NotImplementedError diff --git a/src/zarr/core/dtype/npy/bool.py b/src/zarr/core/dtype/npy/bool.py index c80033c54e..d46758f789 100644 --- a/src/zarr/core/dtype/npy/bool.py +++ b/src/zarr/core/dtype/npy/bool.py @@ -4,12 +4,13 @@ import numpy as np from zarr.core.common import JSON, ZarrFormat +from zarr.core.dtype.common import HasItemSize from zarr.core.dtype.npy.common import check_json_bool from zarr.core.dtype.wrapper import TBaseDType, ZDType @dataclass(frozen=True, kw_only=True, slots=True) -class Bool(ZDType[np.dtypes.BoolDType, np.bool_]): +class Bool(ZDType[np.dtypes.BoolDType, np.bool_], HasItemSize): """ Wrapper for numpy boolean dtype. @@ -65,7 +66,7 @@ def default_value(self) -> np.bool_: """ return np.False_ - def to_json_value(self, data: object, zarr_format: ZarrFormat) -> bool: + def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> bool: """ Convert a scalar to a python bool. @@ -107,5 +108,9 @@ def check_value(self, data: object) -> bool: # Anything can become a bool return True - def _cast_value_unsafe(self, value: object) -> np.bool_: - return np.bool_(value) + def _cast_value_unsafe(self, data: object) -> np.bool_: + return np.bool_(data) + + @property + def item_size(self) -> int: + return 1 diff --git a/src/zarr/core/dtype/npy/complex.py b/src/zarr/core/dtype/npy/complex.py index 3e5f640946..ee52dd0577 100644 --- a/src/zarr/core/dtype/npy/complex.py +++ b/src/zarr/core/dtype/npy/complex.py @@ -10,7 +10,7 @@ import numpy as np from zarr.core.common import JSON, ZarrFormat -from zarr.core.dtype.common import HasEndianness +from zarr.core.dtype.common import HasEndianness, HasItemSize from zarr.core.dtype.npy.common import ( ComplexLike, TComplexDType_co, @@ -31,7 +31,7 @@ @dataclass(frozen=True) -class BaseComplex(ZDType[TComplexDType_co, TComplexScalar_co], HasEndianness): +class BaseComplex(ZDType[TComplexDType_co, TComplexScalar_co], HasEndianness, HasItemSize): # This 
attribute holds the possible zarr v2 JSON names for the data type _zarr_v2_names: ClassVar[tuple[str, ...]] @@ -83,11 +83,11 @@ def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: return data == cls._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def check_value(self, value: object) -> bool: - return isinstance(value, ComplexLike) + def check_value(self, data: object) -> bool: + return isinstance(data, ComplexLike) - def _cast_value_unsafe(self, value: object) -> TComplexScalar_co: - return self.to_dtype().type(value) # type: ignore[arg-type, return-value] + def _cast_value_unsafe(self, data: object) -> TComplexScalar_co: + return self.to_dtype().type(data) # type: ignore[arg-type, return-value] def default_value(self) -> TComplexScalar_co: """ @@ -130,7 +130,7 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TComplexSca ) raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def to_json_value(self, data: object, zarr_format: ZarrFormat) -> JSON: + def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> JSON: """ Convert an object to a JSON-serializable float. 
@@ -160,9 +160,17 @@ class Complex64(BaseComplex[np.dtypes.Complex64DType, np.complex64]): _zarr_v3_name = "complex64" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">c8", " int: + return 8 + @dataclass(frozen=True, kw_only=True) class Complex128(BaseComplex[np.dtypes.Complex128DType, np.complex128], HasEndianness): dtype_cls = np.dtypes.Complex128DType _zarr_v3_name = "complex128" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">c16", " int: + return 16 diff --git a/src/zarr/core/dtype/npy/float.py b/src/zarr/core/dtype/npy/float.py index e4d6e42ef3..28f3ced63e 100644 --- a/src/zarr/core/dtype/npy/float.py +++ b/src/zarr/core/dtype/npy/float.py @@ -4,7 +4,7 @@ import numpy as np from zarr.core.common import JSON, ZarrFormat -from zarr.core.dtype.common import HasEndianness +from zarr.core.dtype.common import HasEndianness, HasItemSize from zarr.core.dtype.npy.common import ( EndiannessNumpy, FloatLike, @@ -23,7 +23,7 @@ @dataclass(frozen=True) -class BaseFloat(ZDType[TFloatDType_co, TFloatScalar_co], HasEndianness): +class BaseFloat(ZDType[TFloatDType_co, TFloatScalar_co], HasEndianness, HasItemSize): # This attribute holds the possible zarr v2 JSON names for the data type _zarr_v2_names: ClassVar[tuple[str, ...]] @@ -156,6 +156,10 @@ class Float16(BaseFloat[np.dtypes.Float16DType, np.float16]): _zarr_v3_name = "float16" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">f2", " int: + return 2 + @dataclass(frozen=True, kw_only=True) class Float32(BaseFloat[np.dtypes.Float32DType, np.float32]): @@ -163,9 +167,17 @@ class Float32(BaseFloat[np.dtypes.Float32DType, np.float32]): _zarr_v3_name = "float32" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">f4", " int: + return 4 + @dataclass(frozen=True, kw_only=True) class Float64(BaseFloat[np.dtypes.Float64DType, np.float64]): dtype_cls = np.dtypes.Float64DType _zarr_v3_name = "float64" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">f8", " int: + return 8 diff --git a/src/zarr/core/dtype/npy/int.py 
b/src/zarr/core/dtype/npy/int.py index 78d9499243..db5869b202 100644 --- a/src/zarr/core/dtype/npy/int.py +++ b/src/zarr/core/dtype/npy/int.py @@ -4,7 +4,7 @@ import numpy as np from zarr.core.common import JSON, ZarrFormat -from zarr.core.dtype.common import HasEndianness +from zarr.core.dtype.common import HasEndianness, HasItemSize from zarr.core.dtype.npy.common import ( EndiannessNumpy, check_json_int, @@ -32,7 +32,7 @@ @dataclass(frozen=True) -class BaseInt(ZDType[TIntDType_co, TIntScalar_co]): +class BaseInt(ZDType[TIntDType_co, TIntScalar_co], HasItemSize): # This attribute holds the possible zarr v2 JSON names for the data type _zarr_v2_names: ClassVar[tuple[str, ...]] @@ -67,11 +67,11 @@ def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: return data == cls._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def check_value(self, value: object) -> TypeGuard[IntLike]: - return isinstance(value, IntLike) + def check_value(self, data: object) -> TypeGuard[IntLike]: + return isinstance(data, IntLike) - def _cast_value_unsafe(self, value: object) -> TIntScalar_co: - return self.to_dtype().type(value) # type: ignore[return-value, arg-type] + def _cast_value_unsafe(self, data: object) -> TIntScalar_co: + return self.to_dtype().type(data) # type: ignore[return-value, arg-type] def default_value(self) -> TIntScalar_co: """ @@ -104,7 +104,7 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TIntScalar_ return self._cast_value_unsafe(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") - def to_json_value(self, data: object, zarr_format: ZarrFormat) -> int: + def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> int: """ Convert an object to JSON-serializable scalar. 
@@ -140,6 +140,10 @@ def to_dtype(self: Self) -> np.dtypes.Int8DType: def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls() + @property + def item_size(self) -> int: + return 1 + @dataclass(frozen=True, kw_only=True) class UInt8(BaseInt[np.dtypes.UInt8DType, np.uint8]): @@ -158,6 +162,10 @@ def to_dtype(self: Self) -> np.dtypes.UInt8DType: def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls() + @property + def item_size(self) -> int: + return 1 + @dataclass(frozen=True, kw_only=True) class Int16(BaseInt[np.dtypes.Int16DType, np.int16], HasEndianness): @@ -183,6 +191,10 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + @property + def item_size(self) -> int: + return 2 + @dataclass(frozen=True, kw_only=True) class UInt16(BaseInt[np.dtypes.UInt16DType, np.uint16], HasEndianness): @@ -207,6 +219,10 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + @property + def item_size(self) -> int: + return 2 + @dataclass(frozen=True, kw_only=True) class Int32(BaseInt[np.dtypes.Int32DType, np.int32], HasEndianness): @@ -243,6 +259,10 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + @property + def item_size(self) -> int: + return 4 + @dataclass(frozen=True, kw_only=True) class UInt32(BaseInt[np.dtypes.UInt32DType, np.uint32], HasEndianness): @@ -267,6 +287,10 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + @property + def item_size(self) -> int: + return 4 + @dataclass(frozen=True, kw_only=True) class 
Int64(BaseInt[np.dtypes.Int64DType, np.int64], HasEndianness): @@ -291,6 +315,10 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + @property + def item_size(self) -> int: + return 8 + @dataclass(frozen=True, kw_only=True) class UInt64(BaseInt[np.dtypes.UInt64DType, np.uint64], HasEndianness): @@ -314,3 +342,7 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: elif zarr_format == 3: return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + @property + def item_size(self) -> int: + return 8 diff --git a/src/zarr/core/dtype/npy/sized.py b/src/zarr/core/dtype/npy/sized.py index 7ca507b84e..2b2ed2ac70 100644 --- a/src/zarr/core/dtype/npy/sized.py +++ b/src/zarr/core/dtype/npy/sized.py @@ -7,7 +7,7 @@ import numpy as np from zarr.core.common import JSON, ZarrFormat -from zarr.core.dtype.common import DataTypeValidationError, HasEndianness, HasLength +from zarr.core.dtype.common import DataTypeValidationError, HasEndianness, HasItemSize, HasLength from zarr.core.dtype.npy.common import ( EndiannessNumpy, bytes_from_json, @@ -20,7 +20,7 @@ @dataclass(frozen=True, kw_only=True) -class FixedLengthAscii(ZDType[np.dtypes.BytesDType[int], np.bytes_], HasLength): +class FixedLengthAscii(ZDType[np.dtypes.BytesDType[int], np.bytes_], HasLength, HasItemSize): dtype_cls = np.dtypes.BytesDType _zarr_v3_name = "numpy.fixed_length_ascii" @@ -85,9 +85,13 @@ def check_value(self, data: object) -> bool: def _cast_value_unsafe(self, value: object) -> np.bytes_: return self.to_dtype().type(value) + @property + def item_size(self) -> int: + return self.length + @dataclass(frozen=True, kw_only=True) -class FixedLengthBytes(ZDType[np.dtypes.VoidDType[int], np.void], HasLength): +class FixedLengthBytes(ZDType[np.dtypes.VoidDType[int], np.void], HasLength, HasItemSize): # np.dtypes.VoidDType is 
specified in an odd way in numpy # it cannot be used to create instances of the dtype # so we have to tell mypy to ignore this here @@ -168,25 +172,31 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: def check_value(self, data: object) -> bool: return isinstance(data, np.bytes_ | str | bytes | np.void) - def _cast_value_unsafe(self, value: object) -> np.void: + def _cast_value_unsafe(self, data: object) -> np.void: native_dtype = self.to_dtype() # Without the second argument, numpy will return a void scalar for dtype V1. # The second argument ensures that, if native_dtype is something like V10, # the result will actually be a V10 scalar. - return native_dtype.type(value, native_dtype) + return native_dtype.type(data, native_dtype) + + @property + def item_size(self) -> int: + return self.length @dataclass(frozen=True, kw_only=True) -class FixedLengthUnicode(ZDType[np.dtypes.StrDType[int], np.str_], HasEndianness, HasLength): +class FixedLengthUnicode( + ZDType[np.dtypes.StrDType[int], np.str_], HasEndianness, HasLength, HasItemSize +): dtype_cls = np.dtypes.StrDType _zarr_v3_name = "numpy.fixed_length_ucs4" - item_size_bytes: ClassVar[int] = 4 # UCS4 is 4 bytes per code point + code_point_bytes: ClassVar[int] = 4 # UCS4 is 4 bytes per code point @classmethod def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls( - length=dtype.itemsize // (cls.item_size_bytes), + length=dtype.itemsize // (cls.code_point_bytes), endianness=endianness_from_numpy_str(byte_order), ) @@ -220,7 +230,7 @@ def to_json(self, zarr_format: ZarrFormat) -> JSON: elif zarr_format == 3: return { "name": self._zarr_v3_name, - "configuration": {"length_bytes": self.length * self.item_size_bytes}, + "configuration": {"length_bytes": self.length * self.code_point_bytes}, } raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @@ -229,7 +239,7 @@ def _from_json_unsafe(cls, 
data: JSON, zarr_format: ZarrFormat) -> Self: if zarr_format == 2: return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: - return cls(length=data["configuration"]["length_bytes"] // cls.item_size_bytes) # type: ignore[arg-type, index, call-overload, operator] + return cls(length=data["configuration"]["length_bytes"] // cls.code_point_bytes) # type: ignore[arg-type, index, call-overload, operator] raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def default_value(self) -> np.str_: @@ -247,12 +257,16 @@ def check_value(self, data: object) -> bool: # this is generous for backwards compatibility return isinstance(data, str | np.str_ | bytes | int) - def _cast_value_unsafe(self, value: object) -> np.str_: - return self.to_dtype().type(value) + def _cast_value_unsafe(self, data: object) -> np.str_: + return self.to_dtype().type(data) + + @property + def item_size(self) -> int: + return self.length * self.code_point_bytes @dataclass(frozen=True, kw_only=True) -class Structured(ZDType[np.dtypes.VoidDType[int], np.void]): +class Structured(ZDType[np.dtypes.VoidDType[int], np.void], HasItemSize): dtype_cls = np.dtypes.VoidDType # type: ignore[assignment] _zarr_v3_name = "structured" fields: tuple[tuple[str, ZDType[TBaseDType, TBaseScalar]], ...] @@ -395,3 +409,8 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: dtype = self.to_dtype() return cast("np.void", np.array([as_bytes]).view(dtype)[0]) raise TypeError(f"Invalid type: {data}. 
Expected a string.") # pragma: no cover + + @property + def item_size(self) -> int: + # Lets have numpy do the arithmetic here + return self.to_dtype().itemsize diff --git a/src/zarr/core/dtype/npy/string.py b/src/zarr/core/dtype/npy/string.py index 3849fd05ce..d5a4f9be08 100644 --- a/src/zarr/core/dtype/npy/string.py +++ b/src/zarr/core/dtype/npy/string.py @@ -72,8 +72,8 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: def check_value(self, data: object) -> bool: return isinstance(data, str) - def _cast_value_unsafe(self, value: object) -> str: - return str(value) + def _cast_value_unsafe(self, data: object) -> str: + return str(data) else: # Numpy pre-2 does not have a variable length string dtype, so we use the Object dtype instead. @@ -130,5 +130,5 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: def check_value(self, data: object) -> bool: return isinstance(data, str) - def _cast_value_unsafe(self, value: object) -> str: - return str(value) + def _cast_value_unsafe(self, data: object) -> str: + return str(data) diff --git a/src/zarr/core/dtype/npy/time.py b/src/zarr/core/dtype/npy/time.py index ea44d76b56..61786351f8 100644 --- a/src/zarr/core/dtype/npy/time.py +++ b/src/zarr/core/dtype/npy/time.py @@ -17,7 +17,7 @@ import numpy as np -from zarr.core.dtype.common import HasEndianness +from zarr.core.dtype.common import HasEndianness, HasItemSize from zarr.core.dtype.npy.common import ( DateTimeUnit, EndiannessNumpy, @@ -99,7 +99,7 @@ class TimeConfig(TypedDict): @dataclass(frozen=True, kw_only=True, slots=True) -class TimeDTypeBase(ZDType[_BaseTimeDType_co, _BaseTimeScalar], HasEndianness): +class TimeDTypeBase(ZDType[_BaseTimeDType_co, _BaseTimeScalar], HasEndianness, HasItemSize): _zarr_v2_names: ClassVar[tuple[str, ...]] # this attribute exists so that we can programmatically create a numpy dtype instance # because the particular numpy dtype we are wrapping does not allow direct construction via @@ 
-163,6 +163,10 @@ def check_value(self, data: object) -> bool: except ValueError: return False + @property + def item_size(self) -> int: + return 8 + @dataclass(frozen=True, kw_only=True, slots=True) class TimeDelta64(TimeDTypeBase[np.dtypes.TimeDelta64DType, np.timedelta64], HasEndianness): @@ -188,8 +192,8 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.timedelt return self.to_dtype().type(data, f"{self.scale_factor}{self.unit}") # type: ignore[arg-type] raise TypeError(f"Invalid type: {data}. Expected an integer.") # pragma: no cover - def _cast_value_unsafe(self, value: object) -> np.timedelta64: - return self.to_dtype().type(value) # type: ignore[arg-type] + def _cast_value_unsafe(self, data: object) -> np.timedelta64: + return self.to_dtype().type(data) # type: ignore[arg-type] @classmethod def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: @@ -235,8 +239,8 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.datetime return self.to_dtype().type(data, f"{self.scale_factor}{self.unit}") # type: ignore[arg-type] raise TypeError(f"Invalid type: {data}. 
Expected an integer.") # pragma: no cover - def _cast_value_unsafe(self, value: object) -> np.datetime64: - return self.to_dtype().type(value) # type: ignore[no-any-return, call-overload] + def _cast_value_unsafe(self, data: object) -> np.datetime64: + return self.to_dtype().type(data) # type: ignore[no-any-return, call-overload] @classmethod def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: diff --git a/tests/conftest.py b/tests/conftest.py index 663e2663b8..725de1b529 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -23,6 +23,7 @@ from zarr.core.dtype import ( get_data_type_from_native_dtype, ) +from zarr.core.dtype.common import HasItemSize from zarr.core.metadata.v2 import ArrayV2Metadata from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.core.sync import sync @@ -268,12 +269,14 @@ def create_array_metadata( chunk_key_encoding_parsed = _parse_chunk_key_encoding( chunk_key_encoding, zarr_format=zarr_format ) - + item_size = 1 + if isinstance(dtype_parsed, HasItemSize): + item_size = dtype_parsed.item_size shard_shape_parsed, chunk_shape_parsed = _auto_partition( array_shape=shape_parsed, shard_shape=shards, chunk_shape=chunks, - item_size=dtype_parsed.to_dtype().itemsize, + item_size=item_size, ) if order is None: diff --git a/tests/test_dtype/test_npy/test_bool.py b/tests/test_dtype/test_npy/test_bool.py index 086a2cfee8..1adae57f02 100644 --- a/tests/test_dtype/test_npy/test_bool.py +++ b/tests/test_dtype/test_npy/test_bool.py @@ -38,3 +38,4 @@ class TestBool(_TestZDType): (Bool(), np.True_, np.True_), (Bool(), np.False_, np.False_), ) + item_size_params = (Bool(),) diff --git a/tests/test_dtype/test_npy/test_complex.py b/tests/test_dtype/test_npy/test_complex.py index b24bc4d7c8..45a3a1480e 100644 --- a/tests/test_dtype/test_npy/test_complex.py +++ b/tests/test_dtype/test_npy/test_complex.py @@ -52,6 +52,8 @@ class TestComplex64(_BaseTestFloat): (Complex64(), complex(0, math.nan), np.complex64(complex(0, 
math.nan))), ) + item_size_params = (Complex64(),) + class TestComplex128(_BaseTestFloat): test_cls = Complex128 @@ -89,3 +91,4 @@ class TestComplex128(_BaseTestFloat): (Complex128(), complex(-1.0, math.inf), np.complex128(complex(-1.0, math.inf))), (Complex128(), complex(0, math.nan), np.complex128(complex(0, math.nan))), ) + item_size_params = (Complex128(),) diff --git a/tests/test_dtype/test_npy/test_float.py b/tests/test_dtype/test_npy/test_float.py index ba43b6bcf6..daa9bafac0 100644 --- a/tests/test_dtype/test_npy/test_float.py +++ b/tests/test_dtype/test_npy/test_float.py @@ -12,14 +12,16 @@ def scalar_equals(self, scalar1: object, scalar2: object) -> bool: return True return super().scalar_equals(scalar1, scalar2) - hex_nan_params: tuple[str, ...] = () + hex_string_params: tuple[tuple[str, float], ...] = () - def test_hex_nan(self, hex_nan_params: str) -> None: + def test_hex_encoding(self, hex_string_params: tuple[str, float]) -> None: """ Test that hexadecimal strings can be read as NaN values """ + hex_string, expected = hex_string_params zdtype = self.test_cls() - assert np.isnan(zdtype.from_json_value(hex_nan_params, zarr_format=3)) + observed = zdtype.from_json_value(hex_string, zarr_format=3) + assert self.scalar_equals(observed, expected) class TestFloat16(_BaseTestFloat): @@ -61,7 +63,8 @@ class TestFloat16(_BaseTestFloat): (Float16(), "NaN", np.float16("NaN")), ) - hex_nan_params = ("0x7fc0", "0x7fc1") + hex_string_params = (("0x7fc0", np.nan), ("0x7fc1", np.nan), ("0x3c00", 1.0)) + item_size_params = (Float16(),) class TestFloat32(_BaseTestFloat): @@ -105,7 +108,8 @@ class TestFloat32(_BaseTestFloat): (Float32(), "NaN", np.float32("NaN")), ) - hex_nan_params = ("0x7fc00000", "0x7fc00001") + hex_string_params = (("0x7fc00000", np.nan), ("0x7fc00001", np.nan), ("0x3f800000", 1.0)) + item_size_params = (Float32(),) class TestFloat64(_BaseTestFloat): @@ -148,4 +152,9 @@ class TestFloat64(_BaseTestFloat): (Float64(), "NaN", np.float64("NaN")), ) - 
hex_nan_params = ("0x7ff8000000000000", "0x7ff8000000000001") + hex_string_params = ( + ("0x7ff8000000000000", np.nan), + ("0x7ff8000000000001", np.nan), + ("0x3ff0000000000000", 1.0), + ) + item_size_params = (Float64(),) diff --git a/tests/test_dtype/test_npy/test_int.py b/tests/test_dtype/test_npy/test_int.py index 637b594e1b..5b0180af3b 100644 --- a/tests/test_dtype/test_npy/test_int.py +++ b/tests/test_dtype/test_npy/test_int.py @@ -34,6 +34,7 @@ class TestInt8(_TestZDType): (Int8(), 1, np.int8(1)), (Int8(), -1, np.int8(-1)), ) + item_size_params = (Int8(),) class TestInt16(_TestZDType): @@ -65,6 +66,8 @@ class TestInt16(_TestZDType): (Int16(), -1, np.int16(-1)), ) + item_size_params = (Int16(),) + class TestInt32(_TestZDType): test_cls = Int32 @@ -94,6 +97,7 @@ class TestInt32(_TestZDType): (Int32(), 1, np.int32(1)), (Int32(), -1, np.int32(-1)), ) + item_size_params = (Int32(),) class TestInt64(_TestZDType): @@ -124,6 +128,7 @@ class TestInt64(_TestZDType): (Int64(), 1, np.int64(1)), (Int64(), -1, np.int64(-1)), ) + item_size_params = (Int64(),) class TestUInt8(_TestZDType): @@ -154,6 +159,7 @@ class TestUInt8(_TestZDType): (UInt8(), 1, np.uint8(1)), (UInt8(), 0, np.uint8(0)), ) + item_size_params = (UInt8(),) class TestUInt16(_TestZDType): @@ -184,6 +190,7 @@ class TestUInt16(_TestZDType): (UInt16(), 1, np.uint16(1)), (UInt16(), 0, np.uint16(0)), ) + item_size_params = (UInt16(),) class TestUInt32(_TestZDType): @@ -214,6 +221,7 @@ class TestUInt32(_TestZDType): (UInt32(), 1, np.uint32(1)), (UInt32(), 0, np.uint32(0)), ) + item_size_params = (UInt32(),) class TestUInt64(_TestZDType): @@ -244,3 +252,4 @@ class TestUInt64(_TestZDType): (UInt64(), 1, np.uint64(1)), (UInt64(), 0, np.uint64(0)), ) + item_size_params = (UInt64(),) diff --git a/tests/test_dtype/test_npy/test_sized.py b/tests/test_dtype/test_npy/test_sized.py index 2ded5bbb7c..202bb0d04e 100644 --- a/tests/test_dtype/test_npy/test_sized.py +++ b/tests/test_dtype/test_npy/test_sized.py @@ -50,6 +50,11 
@@ class TestFixedLengthAscii(_TestZDType): (FixedLengthAscii(length=2), "ab", np.bytes_("ab")), (FixedLengthAscii(length=4), "abcd", np.bytes_("abcd")), ) + item_size_params = ( + FixedLengthAscii(length=0), + FixedLengthAscii(length=4), + FixedLengthAscii(length=10), + ) class TestFixedLengthBytes(_TestZDType): @@ -91,6 +96,11 @@ class TestFixedLengthBytes(_TestZDType): (FixedLengthBytes(length=2), b"ab", np.void(b"ab")), (FixedLengthBytes(length=4), b"abcd", np.void(b"abcd")), ) + item_size_params = ( + FixedLengthBytes(length=0), + FixedLengthBytes(length=4), + FixedLengthBytes(length=10), + ) class TestFixedLengthUnicode(_TestZDType): @@ -125,6 +135,11 @@ class TestFixedLengthUnicode(_TestZDType): (FixedLengthUnicode(length=2), "hi", np.str_("hi")), (FixedLengthUnicode(length=4), "hihi", np.str_("hihi")), ) + item_size_params = ( + FixedLengthUnicode(length=0), + FixedLengthUnicode(length=4), + FixedLengthUnicode(length=10), + ) class TestStructured(_TestZDType): @@ -214,3 +229,8 @@ def scalar_equals(self, scalar1: Any, scalar2: Any) -> bool: if hasattr(scalar1, "shape") and hasattr(scalar2, "shape"): return np.array_equal(scalar1, scalar2) return super().scalar_equals(scalar1, scalar2) + + item_size_params = ( + Structured(fields=(("field1", Int32()), ("field2", Float64()))), + Structured(fields=(("field1", Int64()), ("field2", Int32()))), + ) diff --git a/tests/test_dtype/test_npy/test_string.py b/tests/test_dtype/test_npy/test_string.py index c87f538be5..1046afcac0 100644 --- a/tests/test_dtype/test_npy/test_string.py +++ b/tests/test_dtype/test_npy/test_string.py @@ -37,6 +37,7 @@ class TestVariableLengthString(_TestZDType): (VariableLengthString(), "", np.str_("")), (VariableLengthString(), "hi", np.str_("hi")), ) + item_size_params = (VariableLengthString(),) else: @@ -70,3 +71,5 @@ class TestVariableLengthString(_TestZDType): # type: ignore[no-redef] (VariableLengthString(), "", np.str_("")), (VariableLengthString(), "hi", np.str_("hi")), ) + + 
item_size_params = (VariableLengthString(),) diff --git a/tests/test_dtype/test_npy/test_time.py b/tests/test_dtype/test_npy/test_time.py index f8f8b5ae47..90c573007f 100644 --- a/tests/test_dtype/test_npy/test_time.py +++ b/tests/test_dtype/test_npy/test_time.py @@ -63,6 +63,7 @@ class TestDateTime64(_TestTimeBase): (DateTime64(unit="s", scale_factor=1), "2005-02-25", np.datetime64("2005-02-25", "s")), (DateTime64(unit="ns", scale_factor=1), "NaT", np.datetime64("NaT")), ) + item_size_params = (DateTime64(unit="ns", scale_factor=1),) class TestTimeDelta64(_TestTimeBase): @@ -102,6 +103,7 @@ class TestTimeDelta64(_TestTimeBase): (TimeDelta64(unit="ns", scale_factor=1), "1", np.timedelta64(1, "ns")), (TimeDelta64(unit="ns", scale_factor=1), "NaT", np.timedelta64("NaT")), ) + item_size_params = (TimeDelta64(unit="ns", scale_factor=1),) def test_time_invalid_unit() -> None: diff --git a/tests/test_dtype/test_wrapper.py b/tests/test_dtype/test_wrapper.py index 608e272690..302a419c0f 100644 --- a/tests/test_dtype/test_wrapper.py +++ b/tests/test_dtype/test_wrapper.py @@ -2,6 +2,10 @@ from typing import TYPE_CHECKING, Any, ClassVar +import pytest + +from zarr.core.dtype.common import HasItemSize + if TYPE_CHECKING: from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType @@ -74,8 +78,8 @@ class _TestZDType: scalar_v2_params: ClassVar[tuple[tuple[Any, Any], ...]] = () scalar_v3_params: ClassVar[tuple[tuple[Any, Any], ...]] = () - cast_value_params: ClassVar[tuple[tuple[Any, Any, Any], ...]] + item_size_params: ClassVar[tuple[ZDType[Any, Any], ...]] def json_scalar_equals(self, scalar1: object, scalar2: object) -> bool: # An equality check for json-encoded scalars. 
This defaults to regular equality, @@ -119,3 +123,13 @@ def test_cast_value(self, cast_value_params: tuple[Any, Any, Any]) -> None: zdtype, value, expected = cast_value_params observed = zdtype.cast_value(value) assert self.scalar_equals(expected, observed) + + def test_item_size(self, item_size_params: ZDType[Any, Any]) -> None: + """ + Test that the item_size attribute matches the numpy dtype itemsize attribute, for dtypes + with a fixed scalar size. + """ + if isinstance(item_size_params, HasItemSize): + assert item_size_params.item_size == item_size_params.to_dtype().itemsize + else: + pytest.skip(f"Dtype {item_size_params} does not implement HasItemSize") From 7c58f7ab40990c77b4eea82cac558d1d1ded9621 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 22 May 2025 21:31:09 +0200 Subject: [PATCH 103/130] rename fixed-length string dtypes, and be strict about the numpy object dtype (i.e., refuse to match it) --- src/zarr/api/asynchronous.py | 10 ++-- src/zarr/api/synchronous.py | 2 +- src/zarr/core/dtype/__init__.py | 12 ++--- src/zarr/core/dtype/npy/sized.py | 8 ++-- src/zarr/core/dtype/registry.py | 13 ++++++ src/zarr/core/metadata/dtype.py | 0 tests/conftest.py | 10 ++++ tests/test_array.py | 24 ++++++---- tests/test_dtype/test_npy/test_sized.py | 62 ++++++++++++------------- tests/test_dtype_registry.py | 8 ++-- tests/test_v2.py | 18 +++++-- 11 files changed, 102 insertions(+), 65 deletions(-) delete mode 100644 src/zarr/core/metadata/dtype.py diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 7ecbacd3f6..ad3a81a64d 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -31,7 +31,7 @@ _warn_order_kwarg, _warn_write_empty_chunks_kwarg, ) -from zarr.core.dtype import get_data_type_from_native_dtype +from zarr.core.dtype import ZDTypeLike, get_data_type_from_native_dtype, parse_data_type from zarr.core.group import ( AsyncGroup, ConsolidatedMetadata, @@ -843,7 +843,7 @@ async def create( shape: 
ChunkCoords | int, *, # Note: this is a change from v2 chunks: ChunkCoords | int | None = None, # TODO: v2 allowed chunks=True - dtype: npt.DTypeLike | None = None, + dtype: ZDTypeLike | None = None, compressor: CompressorLike = "auto", fill_value: Any | None = 0, # TODO: need type order: MemoryOrder | None = None, @@ -990,11 +990,11 @@ async def create( _handle_zarr_version_or_format(zarr_version=zarr_version, zarr_format=zarr_format) or _default_zarr_format() ) - dtype_wrapped = get_data_type_from_native_dtype(dtype) + zdtype = parse_data_type(dtype, zarr_format=zarr_format) if zarr_format == 2: if chunks is None: chunks = shape - default_filters, default_compressor = _get_default_chunk_encoding_v2(dtype_wrapped) + default_filters, default_compressor = _get_default_chunk_encoding_v2(zdtype) if not filters: filters = default_filters # type: ignore[assignment] if compressor == "auto": @@ -1056,7 +1056,7 @@ async def create( store_path, shape=shape, chunks=chunks, - dtype=dtype_wrapped, + dtype=zdtype, compressor=compressor, fill_value=fill_value, overwrite=overwrite, diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 694e8a3d7a..db5862a0ee 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -599,7 +599,7 @@ def create( shape: ChunkCoords | int, *, # Note: this is a change from v2 chunks: ChunkCoords | int | bool | None = None, - dtype: npt.DTypeLike | None = None, + dtype: ZDTypeLike | None = None, compressor: CompressorLike = "auto", fill_value: Any | None = None, # TODO: need type order: MemoryOrder | None = None, diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py index b973691f0f..5d51db92db 100644 --- a/src/zarr/core/dtype/__init__.py +++ b/src/zarr/core/dtype/__init__.py @@ -8,9 +8,9 @@ from zarr.core.dtype.npy.float import Float16, Float32, Float64 from zarr.core.dtype.npy.int import Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64 from zarr.core.dtype.npy.sized import ( - 
FixedLengthAscii, + FixedLengthASCII, FixedLengthBytes, - FixedLengthUnicode, + FixedLengthUTF32, Structured, ) from zarr.core.dtype.npy.time import DateTime64, TimeDelta64 @@ -36,9 +36,9 @@ "DataTypeRegistry", "DataTypeValidationError", "DateTime64", - "FixedLengthAscii", + "FixedLengthASCII", "FixedLengthBytes", - "FixedLengthUnicode", + "FixedLengthUTF32", "Float16", "Float32", "Float64", @@ -72,8 +72,8 @@ ComplexFloatDType = Complex64 | Complex128 COMPLEX_FLOAT_DTYPE: Final = Complex64, Complex128 -StringDType = FixedLengthUnicode | VariableLengthString | FixedLengthAscii -STRING_DTYPE: Final = FixedLengthUnicode, VariableLengthString, FixedLengthAscii +StringDType = FixedLengthUTF32 | VariableLengthString | FixedLengthASCII +STRING_DTYPE: Final = FixedLengthUTF32, VariableLengthString, FixedLengthASCII TimeDType = DateTime64 | TimeDelta64 TIME_DTYPE: Final = DateTime64, TimeDelta64 diff --git a/src/zarr/core/dtype/npy/sized.py b/src/zarr/core/dtype/npy/sized.py index 2b2ed2ac70..bf54638890 100644 --- a/src/zarr/core/dtype/npy/sized.py +++ b/src/zarr/core/dtype/npy/sized.py @@ -20,7 +20,7 @@ @dataclass(frozen=True, kw_only=True) -class FixedLengthAscii(ZDType[np.dtypes.BytesDType[int], np.bytes_], HasLength, HasItemSize): +class FixedLengthASCII(ZDType[np.dtypes.BytesDType[int], np.bytes_], HasLength, HasItemSize): dtype_cls = np.dtypes.BytesDType _zarr_v3_name = "numpy.fixed_length_ascii" @@ -185,12 +185,12 @@ def item_size(self) -> int: @dataclass(frozen=True, kw_only=True) -class FixedLengthUnicode( +class FixedLengthUTF32( ZDType[np.dtypes.StrDType[int], np.str_], HasEndianness, HasLength, HasItemSize ): dtype_cls = np.dtypes.StrDType - _zarr_v3_name = "numpy.fixed_length_ucs4" - code_point_bytes: ClassVar[int] = 4 # UCS4 is 4 bytes per code point + _zarr_v3_name = "numpy.fixed_length_utf32" + code_point_bytes: ClassVar[int] = 4 # utf32 is 4 bytes per code point @classmethod def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: diff --git 
a/src/zarr/core/dtype/registry.py b/src/zarr/core/dtype/registry.py index ae5c3d426e..047f908ac6 100644 --- a/src/zarr/core/dtype/registry.py +++ b/src/zarr/core/dtype/registry.py @@ -3,6 +3,8 @@ from dataclasses import dataclass, field from typing import TYPE_CHECKING, Self +import numpy as np + from zarr.core.dtype.common import DataTypeValidationError if TYPE_CHECKING: @@ -38,6 +40,17 @@ def get(self, key: str) -> type[ZDType[TBaseDType, TBaseScalar]]: def match_dtype(self, dtype: TBaseDType) -> ZDType[TBaseDType, TBaseScalar]: self.lazy_load() + if dtype == np.dtype("O"): + msg = ( + "Data type resolution failed. " + 'Attempted to resolve a zarr data type from a numpy "Object" data type, which is ' + 'ambiguous, as multiple zarr data types can be represented by the numpy "Object" ' + "data type. " + "In this case you should construct your array by providing a specific Zarr data " + 'type. For a list of Zarr data types that are compatible with the numpy "Object"' + "data type, see xxxxxxxxxxx" + ) + raise ValueError(msg) for val in self.contents.values(): try: return val.from_dtype(dtype) diff --git a/src/zarr/core/metadata/dtype.py b/src/zarr/core/metadata/dtype.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tests/conftest.py b/tests/conftest.py index 725de1b529..a968016e6f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -39,6 +39,7 @@ from zarr.core.array import CompressorsLike, FiltersLike, SerializerLike, ShardsLike from zarr.core.chunk_key_encodings import ChunkKeyEncoding, ChunkKeyEncodingLike from zarr.core.common import ChunkCoords, MemoryOrder, ShapeLike, ZarrFormat + from zarr.core.dtype.wrapper import ZDType async def parse_store( @@ -417,3 +418,12 @@ def meta_from_array( chunk_key_encoding=chunk_key_encoding, dimension_names=dimension_names, ) + + +def skip_object_dtype(dtype: ZDType[Any, Any]) -> None: + if dtype.dtype_cls is type(np.dtype("O")): + msg = ( + f"{dtype} uses the numpy object data type, which is not a 
valid target for data " + "type resolution" + ) + pytest.skip(msg) diff --git a/tests/test_array.py b/tests/test_array.py index bea4f30cc6..0cc32c7806 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -18,6 +18,7 @@ import zarr.api.asynchronous import zarr.api.synchronous as sync_api +from tests.conftest import skip_object_dtype from zarr import Array, AsyncArray, Group from zarr.abc.store import Store from zarr.codecs import ( @@ -43,8 +44,8 @@ from zarr.core.dtype import get_data_type_from_native_dtype from zarr.core.dtype.common import Endianness from zarr.core.dtype.npy.common import endianness_from_numpy_str -from zarr.core.dtype.npy.float import Float64 -from zarr.core.dtype.npy.int import Int16 +from zarr.core.dtype.npy.float import Float32, Float64 +from zarr.core.dtype.npy.int import Int16, UInt8 from zarr.core.dtype.npy.sized import ( Structured, ) @@ -1009,9 +1010,11 @@ def test_dtype_forms(dtype: ZDType[Any, Any], store: Store, zarr_format: ZarrFor """ Test that the same array is produced from a ZDType instance, a numpy dtype, or a numpy string """ + skip_object_dtype(dtype) a = zarr.create_array( store, name="a", shape=(5,), chunks=(5,), dtype=dtype, zarr_format=zarr_format ) + b = zarr.create_array( store, name="b", @@ -1054,12 +1057,13 @@ def test_dtype_roundtrip( """ Test that creating an array, then opening it, gets the same array. 
""" + skip_object_dtype(dtype) a = zarr.create_array(store, shape=(5,), chunks=(5,), dtype=dtype, zarr_format=zarr_format) b = zarr.open_array(store) assert a.dtype == b.dtype @staticmethod - @pytest.mark.parametrize("dtype", ["uint8", "float32", "str", "U3", "S4", "V1"]) + @pytest.mark.parametrize("dtype", ["uint8", "float32", "U3", "S4", "V1"]) @pytest.mark.parametrize( "compressors", [ @@ -1244,7 +1248,7 @@ async def test_invalid_v3_arguments( zarr.create(store=store, dtype="uint8", shape=(10,), zarr_format=3, **kwargs) @staticmethod - @pytest.mark.parametrize("dtype", ["uint8", "float32", "str"]) + @pytest.mark.parametrize("dtype", ["uint8", "float32"]) @pytest.mark.parametrize( "compressors", [ @@ -1284,17 +1288,17 @@ async def test_v2_chunk_encoding( assert arr.filters == filters_expected @staticmethod - @pytest.mark.parametrize("dtype_str", ["uint8", "float32", "str"]) + @pytest.mark.parametrize("dtype", [UInt8(), Float32(), VariableLengthString()]) async def test_default_filters_compressors( - store: MemoryStore, dtype_str: str, zarr_format: ZarrFormat + store: MemoryStore, dtype: UInt8 | Float32 | VariableLengthString, zarr_format: ZarrFormat ) -> None: """ Test that the default ``filters`` and ``compressors`` are used when ``create_array`` is invoked with ``filters`` and ``compressors`` unspecified. 
""" - zdtype = get_data_type_from_native_dtype(dtype_str) + arr = await create_array( store=store, - dtype=dtype_str, + dtype=dtype, shape=(10,), zarr_format=zarr_format, ) @@ -1306,14 +1310,14 @@ async def test_default_filters_compressors( compressors=sig.parameters["compressors"].default, filters=sig.parameters["filters"].default, serializer=sig.parameters["serializer"].default, - dtype=zdtype, + dtype=dtype, ) elif zarr_format == 2: default_filters, default_compressors = _parse_chunk_encoding_v2( compressor=sig.parameters["compressors"].default, filters=sig.parameters["filters"].default, - dtype=zdtype, + dtype=dtype, ) if default_filters is None: expected_filters = () diff --git a/tests/test_dtype/test_npy/test_sized.py b/tests/test_dtype/test_npy/test_sized.py index 202bb0d04e..8bc83f2f73 100644 --- a/tests/test_dtype/test_npy/test_sized.py +++ b/tests/test_dtype/test_npy/test_sized.py @@ -8,15 +8,15 @@ from zarr.core.dtype.npy.float import Float16, Float64 from zarr.core.dtype.npy.int import Int32, Int64 from zarr.core.dtype.npy.sized import ( - FixedLengthAscii, + FixedLengthASCII, FixedLengthBytes, - FixedLengthUnicode, + FixedLengthUTF32, Structured, ) class TestFixedLengthAscii(_TestZDType): - test_cls = FixedLengthAscii + test_cls = FixedLengthASCII valid_dtype = (np.dtype("|S10"), np.dtype("|S4")) invalid_dtype = ( np.dtype(np.int8), @@ -36,24 +36,24 @@ class TestFixedLengthAscii(_TestZDType): ) scalar_v2_params = ( - (FixedLengthAscii(length=0), ""), - (FixedLengthAscii(length=2), "YWI="), - (FixedLengthAscii(length=4), "YWJjZA=="), + (FixedLengthASCII(length=0), ""), + (FixedLengthASCII(length=2), "YWI="), + (FixedLengthASCII(length=4), "YWJjZA=="), ) scalar_v3_params = ( - (FixedLengthAscii(length=0), ""), - (FixedLengthAscii(length=2), "YWI="), - (FixedLengthAscii(length=4), "YWJjZA=="), + (FixedLengthASCII(length=0), ""), + (FixedLengthASCII(length=2), "YWI="), + (FixedLengthASCII(length=4), "YWJjZA=="), ) cast_value_params = ( - 
(FixedLengthAscii(length=0), "", np.bytes_("")), - (FixedLengthAscii(length=2), "ab", np.bytes_("ab")), - (FixedLengthAscii(length=4), "abcd", np.bytes_("abcd")), + (FixedLengthASCII(length=0), "", np.bytes_("")), + (FixedLengthASCII(length=2), "ab", np.bytes_("ab")), + (FixedLengthASCII(length=4), "abcd", np.bytes_("abcd")), ) item_size_params = ( - FixedLengthAscii(length=0), - FixedLengthAscii(length=4), - FixedLengthAscii(length=10), + FixedLengthASCII(length=0), + FixedLengthASCII(length=4), + FixedLengthASCII(length=10), ) @@ -103,8 +103,8 @@ class TestFixedLengthBytes(_TestZDType): ) -class TestFixedLengthUnicode(_TestZDType): - test_cls = FixedLengthUnicode +class TestFixedLengthUTF32(_TestZDType): + test_cls = FixedLengthUTF32 valid_dtype = (np.dtype(">U10"), np.dtype("U10", " np.bool_: @staticmethod @pytest.mark.parametrize( - ("wrapper_cls", "dtype_str"), [(Bool, "bool"), (FixedLengthUnicode, "|U4")] + ("wrapper_cls", "dtype_str"), [(Bool, "bool"), (FixedLengthUTF32, "|U4")] ) def test_match_dtype( data_type_registry_fixture: DataTypeRegistry, @@ -100,7 +101,7 @@ def test_registered_dtypes( """ Test that the registered dtypes can be retrieved from the registry. 
""" - + skip_object_dtype(zdtype) assert data_type_registry.match_dtype(zdtype.to_dtype()) == zdtype assert ( data_type_registry.match_json( @@ -121,6 +122,7 @@ def test_match_dtype_unique( that excludes the data type class being tested, and ensure that an instance of the wrapped data type fails to match anything in the registry """ + skip_object_dtype(zdtype) for _cls in get_args(AnyDType): if _cls is not type(zdtype): data_type_registry_fixture.register(_cls._zarr_v3_name, _cls) diff --git a/tests/test_v2.py b/tests/test_v2.py index 51139bbeb4..1b21e09952 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -15,7 +15,9 @@ from zarr import config from zarr.abc.store import Store from zarr.core.buffer.core import default_buffer_prototype -from zarr.core.dtype.npy.sized import Structured +from zarr.core.dtype.npy.sized import FixedLengthASCII, FixedLengthUTF32, Structured +from zarr.core.dtype.npy.string import VariableLengthString +from zarr.core.dtype.wrapper import ZDType from zarr.core.sync import sync from zarr.storage import MemoryStore, StorePath @@ -101,10 +103,16 @@ async def test_v2_encode_decode(dtype, expected_dtype, fill_value, fill_value_js np.testing.assert_equal(data, expected) -@pytest.mark.parametrize(("dtype", "value"), [("|S1", b"Y"), ("|U1", "Y"), (str, "Y")]) -def test_v2_encode_decode_with_data(dtype, value): - dtype, value = dtype, value - expected = np.full((3,), value, dtype=dtype) +@pytest.mark.parametrize( + ("dtype", "value"), + [ + (FixedLengthASCII(length=1), b"Y"), + (FixedLengthUTF32(length=1), "Y"), + (VariableLengthString(), "Y"), + ], +) +def test_v2_encode_decode_with_data(dtype: ZDType[Any, Any], value: str): + expected = np.full((3,), value, dtype=dtype.to_dtype()) a = zarr.create( shape=(3,), zarr_format=2, From 3a21845ca4e962232479944f2de6a4f210b497c6 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 22 May 2025 21:46:41 +0200 Subject: [PATCH 104/130] remove vestigial use of to_dtype().itemsize() --- 
src/zarr/core/array.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index a4e8c7c3d1..cc67c9040f 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -4197,7 +4197,7 @@ async def init_array( from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation - dtype_wrapped = parse_data_type(dtype, zarr_format=zarr_format) + zdtype = parse_data_type(dtype, zarr_format=zarr_format) shape_parsed = parse_shapelike(shape) chunk_key_encoding_parsed = _parse_chunk_key_encoding( chunk_key_encoding, zarr_format=zarr_format @@ -4211,11 +4211,15 @@ async def init_array( else: await ensure_no_existing_node(store_path, zarr_format=zarr_format) + item_size = 1 + if isinstance(zdtype, HasItemSize): + item_size = zdtype.item_size + shard_shape_parsed, chunk_shape_parsed = _auto_partition( array_shape=shape_parsed, shard_shape=shards, chunk_shape=chunks, - item_size=dtype_wrapped.to_dtype().itemsize, + item_size=item_size, ) chunks_out: tuple[int, ...] 
meta: ArrayV2Metadata | ArrayV3Metadata @@ -4231,7 +4235,7 @@ async def init_array( raise ValueError("Zarr format 2 arrays do not support `serializer`.") filters_parsed, compressor_parsed = _parse_chunk_encoding_v2( - compressor=compressors, filters=filters, dtype=dtype_wrapped + compressor=compressors, filters=filters, dtype=zdtype ) if dimension_names is not None: raise ValueError("Zarr format 2 arrays do not support dimension names.") @@ -4242,7 +4246,7 @@ async def init_array( meta = AsyncArray._create_metadata_v2( shape=shape_parsed, - dtype=dtype_wrapped, + dtype=zdtype, chunks=chunk_shape_parsed, dimension_separator=chunk_key_encoding_parsed.separator, fill_value=fill_value, @@ -4256,7 +4260,7 @@ async def init_array( compressors=compressors, filters=filters, serializer=serializer, - dtype=dtype_wrapped, + dtype=zdtype, ) sub_codecs = cast(tuple[Codec, ...], (*array_array, array_bytes, *bytes_bytes)) codecs_out: tuple[Codec, ...] @@ -4271,7 +4275,7 @@ async def init_array( ) sharding_codec.validate( shape=chunk_shape_parsed, - dtype=dtype_wrapped, + dtype=zdtype, chunk_grid=RegularChunkGrid(chunk_shape=shard_shape_parsed), ) codecs_out = (sharding_codec,) @@ -4287,7 +4291,7 @@ async def init_array( meta = AsyncArray._create_metadata_v3( shape=shape_parsed, - dtype=dtype_wrapped, + dtype=zdtype, fill_value=fill_value, chunk_shape=chunks_out, chunk_key_encoding=chunk_key_encoding_parsed, From ce0afe3379836b24db66520289173821b19c72cd Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 22 May 2025 21:49:12 +0200 Subject: [PATCH 105/130] remove another vestigial use of to_dtype().itemsize() --- src/zarr/core/array.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index cc67c9040f..d87db52bb4 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -72,7 +72,7 @@ ZDTypeLike, parse_data_type, ) -from zarr.core.dtype.common import HasItemSize +from zarr.core.dtype.common 
import HasEndianness, HasItemSize from zarr.core.indexing import ( BasicIndexer, BasicSelection, @@ -4731,7 +4731,7 @@ def _parse_chunk_encoding_v3( # TODO: ensure that the serializer is compatible with the ndarray produced by the # array-array codecs. For example, if a sequence of array-array codecs produces an # array with a single-byte data type, then the serializer should not specify endiannesss. - if isinstance(out_array_bytes, BytesCodec) and dtype.to_dtype().itemsize == 1: + if isinstance(out_array_bytes, BytesCodec) and not isinstance(dtype, HasEndianness): # The default endianness in the bytescodec might not be None, so we need to replace it out_array_bytes = replace(out_array_bytes, endian=None) return out_array_array, out_array_bytes, out_bytes_bytes From e67d4dcbb6d55931bf4238712c9607a56baff14f Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 23 May 2025 10:49:19 +0200 Subject: [PATCH 106/130] emit warning about unstable dtype when serializing Structured dtype to JSON --- src/zarr/core/dtype/common.py | 20 ++++++++++++++++++++ src/zarr/core/dtype/npy/sized.py | 9 ++++++++- src/zarr/core/dtype/wrapper.py | 13 ------------- tests/test_array.py | 6 +++++- tests/test_dtype/conftest.py | 7 ++++++- tests/test_dtype/test_wrapper.py | 1 + tests/test_dtype_registry.py | 2 ++ 7 files changed, 42 insertions(+), 16 deletions(-) diff --git a/src/zarr/core/dtype/common.py b/src/zarr/core/dtype/common.py index d4aded658d..5eeff2af5b 100644 --- a/src/zarr/core/dtype/common.py +++ b/src/zarr/core/dtype/common.py @@ -1,5 +1,6 @@ from __future__ import annotations +import warnings from dataclasses import dataclass from typing import Final, Literal @@ -43,3 +44,22 @@ class HasItemSize: @property def item_size(self) -> int: raise NotImplementedError + + +class UnstableSpecificationWarning(FutureWarning): ... 
+ + +def v3_unstable_dtype_warning(dtype: object) -> None: + """ + Emit this warning when a data type does not have a stable zarr v3 spec + """ + msg = ( + f"The data type ({dtype}) does not have a Zarr V3 specification. " + "That means that the representation of data saved with this data type may change without " + "warning in a future version of Zarr Python. " + "Arrays stored with this data type may be unreadable by other Zarr libraries " + "Use this data type at your own risk! " + "Check https://github.com/zarr-developers/zarr-extensions/tree/main/data-types for the " + "status of data type specifications for Zarr V3." + ) + warnings.warn(msg, category=UnstableSpecificationWarning, stacklevel=2) diff --git a/src/zarr/core/dtype/npy/sized.py b/src/zarr/core/dtype/npy/sized.py index bf54638890..1014ba6f79 100644 --- a/src/zarr/core/dtype/npy/sized.py +++ b/src/zarr/core/dtype/npy/sized.py @@ -7,7 +7,13 @@ import numpy as np from zarr.core.common import JSON, ZarrFormat -from zarr.core.dtype.common import DataTypeValidationError, HasEndianness, HasItemSize, HasLength +from zarr.core.dtype.common import ( + DataTypeValidationError, + HasEndianness, + HasItemSize, + HasLength, + v3_unstable_dtype_warning, +) from zarr.core.dtype.npy.common import ( EndiannessNumpy, bytes_from_json, @@ -325,6 +331,7 @@ def to_json(self, zarr_format: ZarrFormat) -> JSON: if zarr_format == 2: return fields elif zarr_format == 3: + v3_unstable_dtype_warning(self) base_dict = {"name": self._zarr_v3_name} base_dict["configuration"] = {"fields": fields} # type: ignore[assignment] return cast("JSON", base_dict) diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index c8e060e764..1a9d9b1e21 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -22,7 +22,6 @@ from __future__ import annotations -import warnings from abc import ABC, abstractmethod from dataclasses import dataclass from typing import TYPE_CHECKING, ClassVar, Generic, Self, 
TypeGuard, TypeVar @@ -336,15 +335,3 @@ def from_json_value(self: Self, data: JSON, *, zarr_format: ZarrFormat) -> TScal The native scalar value. """ ... - - -def v3_unstable_dtype_warning(dtype: ZDType[TBaseDType, TBaseScalar]) -> None: - msg = ( - f"You are using a data type ({dtype}) that does not have a stable Zarr V3 specification." - "Be advised that arrays stored with this data type may be unreadable by other Zarr " - "libraries, and possibly future versions of Zarr-Python as well. " - "Use this data type at your own risk." - "See https://github.com/zarr-developers/zarr-extensions/tree/main/data-types for a list" - "of data types with a stable Zarr V3 specification." - ) - warnings.warn(msg, category=FutureWarning, stacklevel=2) diff --git a/tests/test_array.py b/tests/test_array.py index 0cc32c7806..db7214f3fc 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -189,6 +189,7 @@ def test_array_name_properties_with_group( assert spam.basename == "spam" +@pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") @pytest.mark.parametrize("store", ["memory"], indirect=True) @pytest.mark.parametrize("specifiy_fill_value", [True, False]) @pytest.mark.parametrize( @@ -199,7 +200,7 @@ def test_array_fill_value_default( ) -> None: """ Test that creating an array with the fill_value parameter set to None, or unspecified, - results in the expected fill_value attribute of the array, i.e. 0 cast to the array's dtype. + results in the expected fill_value attribute of the array, i.e. 
the default value of the dtype """ shape = (10,) if specifiy_fill_value: @@ -994,6 +995,7 @@ def test_chunks_and_shards(store: Store) -> None: @staticmethod @pytest.mark.parametrize("dtype", zdtype_examples) + @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") def test_default_fill_value(dtype: ZDType[Any, Any], store: Store) -> None: """ Test that the fill value of an array is set to the default value for the dtype object @@ -1005,6 +1007,7 @@ def test_default_fill_value(dtype: ZDType[Any, Any], store: Store) -> None: assert a.fill_value == dtype.default_value() @staticmethod + @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") @pytest.mark.parametrize("dtype", zdtype_examples) def test_dtype_forms(dtype: ZDType[Any, Any], store: Store, zarr_format: ZarrFormat) -> None: """ @@ -1050,6 +1053,7 @@ def test_dtype_forms(dtype: ZDType[Any, Any], store: Store, zarr_format: ZarrFor assert a.dtype == c.dtype @staticmethod + @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") @pytest.mark.parametrize("dtype", zdtype_examples) def test_dtype_roundtrip( dtype: ZDType[Any, Any], store: Store, zarr_format: ZarrFormat diff --git a/tests/test_dtype/conftest.py b/tests/test_dtype/conftest.py index bf58a17556..2b21a57365 100644 --- a/tests/test_dtype/conftest.py +++ b/tests/test_dtype/conftest.py @@ -1,4 +1,5 @@ # Generate a collection of zdtype instances for use in testing. 
+import warnings from typing import Any import numpy as np @@ -13,7 +14,11 @@ for wrapper_cls in data_type_registry.contents.values(): # The Structured dtype has to be constructed with some actual fields if wrapper_cls is Structured: - zdtype_examples += (wrapper_cls.from_dtype(np.dtype([("a", np.float64), ("b", np.int8)])),) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + zdtype_examples += ( + wrapper_cls.from_dtype(np.dtype([("a", np.float64), ("b", np.int8)])), + ) elif issubclass(wrapper_cls, HasLength): zdtype_examples += (wrapper_cls(length=1),) elif issubclass(wrapper_cls, DateTime64 | TimeDelta64): diff --git a/tests/test_dtype/test_wrapper.py b/tests/test_dtype/test_wrapper.py index 302a419c0f..a33e443c76 100644 --- a/tests/test_dtype/test_wrapper.py +++ b/tests/test_dtype/test_wrapper.py @@ -105,6 +105,7 @@ def test_from_json_roundtrip_v2(self, valid_json_v2: Any) -> None: zdtype = self.test_cls.from_json(valid_json_v2, zarr_format=2) assert zdtype.to_json(zarr_format=2) == valid_json_v2 + @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") def test_from_json_roundtrip_v3(self, valid_json_v3: Any) -> None: zdtype = self.test_cls.from_json(valid_json_v3, zarr_format=3) assert zdtype.to_json(zarr_format=3) == valid_json_v3 diff --git a/tests/test_dtype_registry.py b/tests/test_dtype_registry.py index 35c704673d..0c650e5c29 100644 --- a/tests/test_dtype_registry.py +++ b/tests/test_dtype_registry.py @@ -94,6 +94,7 @@ def test_unregistered_dtype(data_type_registry_fixture: DataTypeRegistry) -> Non data_type_registry_fixture.get(outside_dtype) @staticmethod + @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") @pytest.mark.parametrize("zdtype", zdtype_examples) def test_registered_dtypes( zdtype: ZDType[TBaseDType, TBaseScalar], zarr_format: ZarrFormat @@ -111,6 +112,7 @@ def test_registered_dtypes( ) @staticmethod + 
@pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") @pytest.mark.parametrize("zdtype", zdtype_examples) def test_match_dtype_unique( zdtype: ZDType[Any, Any], From 4e2a15783635f0e3b95febd88d5fa75177f8b7c1 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sat, 24 May 2025 14:10:09 +0200 Subject: [PATCH 107/130] put string dtypes in the strings module --- src/zarr/core/dtype/__init__.py | 4 +- src/zarr/core/dtype/npy/sized.py | 158 +---------------------- src/zarr/core/dtype/npy/string.py | 164 +++++++++++++++++++++++- tests/test_dtype/test_npy/test_sized.py | 8 +- tests/test_v2.py | 3 +- 5 files changed, 171 insertions(+), 166 deletions(-) diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py index 5d51db92db..9c672fd986 100644 --- a/src/zarr/core/dtype/__init__.py +++ b/src/zarr/core/dtype/__init__.py @@ -8,9 +8,7 @@ from zarr.core.dtype.npy.float import Float16, Float32, Float64 from zarr.core.dtype.npy.int import Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64 from zarr.core.dtype.npy.sized import ( - FixedLengthASCII, FixedLengthBytes, - FixedLengthUTF32, Structured, ) from zarr.core.dtype.npy.time import DateTime64, TimeDelta64 @@ -24,6 +22,8 @@ from zarr.core.common import JSON from zarr.core.dtype.npy.string import ( _NUMPY_SUPPORTS_VLEN_STRING, + FixedLengthASCII, + FixedLengthUTF32, VariableLengthString, ) from zarr.core.dtype.registry import DataTypeRegistry diff --git a/src/zarr/core/dtype/npy/sized.py b/src/zarr/core/dtype/npy/sized.py index 1014ba6f79..eb2b39ad9a 100644 --- a/src/zarr/core/dtype/npy/sized.py +++ b/src/zarr/core/dtype/npy/sized.py @@ -2,100 +2,25 @@ import re from collections.abc import Sequence from dataclasses import dataclass -from typing import Any, ClassVar, Self, TypeGuard, cast +from typing import Any, Self, TypeGuard, cast import numpy as np from zarr.core.common import JSON, ZarrFormat from zarr.core.dtype.common import ( DataTypeValidationError, - 
HasEndianness, HasItemSize, HasLength, v3_unstable_dtype_warning, ) from zarr.core.dtype.npy.common import ( - EndiannessNumpy, bytes_from_json, bytes_to_json, check_json_str, - endianness_from_numpy_str, - endianness_to_numpy_str, ) from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType -@dataclass(frozen=True, kw_only=True) -class FixedLengthASCII(ZDType[np.dtypes.BytesDType[int], np.bytes_], HasLength, HasItemSize): - dtype_cls = np.dtypes.BytesDType - _zarr_v3_name = "numpy.fixed_length_ascii" - - @classmethod - def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: - return cls(length=dtype.itemsize) - - def to_dtype(self) -> np.dtypes.BytesDType[int]: - return self.dtype_cls(self.length) - - @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: - """ - Check that the input is a valid JSON representation of a numpy S dtype. - """ - if zarr_format == 2: - # match |S1, |S2, etc - return isinstance(data, str) and re.match(r"^\|S\d+$", data) is not None - elif zarr_format == 3: - return ( - isinstance(data, dict) - and set(data.keys()) == {"name", "configuration"} - and data["name"] == cls._zarr_v3_name - and isinstance(data["configuration"], dict) - and "length_bytes" in data["configuration"] - ) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - def to_json(self, zarr_format: ZarrFormat) -> JSON: - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return { - "name": self._zarr_v3_name, - "configuration": {"length_bytes": self.length}, - } - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls(length=data["configuration"]["length_bytes"]) # type: ignore[arg-type, index, call-overload] - raise 
ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - def default_value(self) -> np.bytes_: - return np.bytes_(b"") - - def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: - return base64.standard_b64encode(data).decode("ascii") # type: ignore[arg-type] - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: - if check_json_str(data): - return self.to_dtype().type(base64.standard_b64decode(data.encode("ascii"))) - raise TypeError(f"Invalid type: {data}. Expected a string.") # pragma: no cover - - def check_value(self, data: object) -> bool: - # this is generous for backwards compatibility - return isinstance(data, np.bytes_ | str | bytes | int) - - def _cast_value_unsafe(self, value: object) -> np.bytes_: - return self.to_dtype().type(value) - - @property - def item_size(self) -> int: - return self.length - - @dataclass(frozen=True, kw_only=True) class FixedLengthBytes(ZDType[np.dtypes.VoidDType[int], np.void], HasLength, HasItemSize): # np.dtypes.VoidDType is specified in an odd way in numpy @@ -190,87 +115,6 @@ def item_size(self) -> int: return self.length -@dataclass(frozen=True, kw_only=True) -class FixedLengthUTF32( - ZDType[np.dtypes.StrDType[int], np.str_], HasEndianness, HasLength, HasItemSize -): - dtype_cls = np.dtypes.StrDType - _zarr_v3_name = "numpy.fixed_length_utf32" - code_point_bytes: ClassVar[int] = 4 # utf32 is 4 bytes per code point - - @classmethod - def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: - byte_order = cast("EndiannessNumpy", dtype.byteorder) - return cls( - length=dtype.itemsize // (cls.code_point_bytes), - endianness=endianness_from_numpy_str(byte_order), - ) - - def to_dtype(self) -> np.dtypes.StrDType[int]: - byte_order = endianness_to_numpy_str(self.endianness) - return self.dtype_cls(self.length).newbyteorder(byte_order) - - @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: - """ - Check that the 
input is a valid JSON representation of a numpy S dtype. - """ - if zarr_format == 2: - # match >U1, <]U\d+$", data) is not None - elif zarr_format == 3: - return ( - isinstance(data, dict) - and set(data.keys()) == {"name", "configuration"} - and data["name"] == cls._zarr_v3_name - and "configuration" in data - and isinstance(data["configuration"], dict) - and set(data["configuration"].keys()) == {"length_bytes"} - and isinstance(data["configuration"]["length_bytes"], int) - ) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - def to_json(self, zarr_format: ZarrFormat) -> JSON: - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return { - "name": self._zarr_v3_name, - "configuration": {"length_bytes": self.length * self.code_point_bytes}, - } - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls(length=data["configuration"]["length_bytes"] // cls.code_point_bytes) # type: ignore[arg-type, index, call-overload, operator] - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - def default_value(self) -> np.str_: - return np.str_("") - - def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: - return str(data) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_: - if check_json_str(data): - return self.to_dtype().type(data) - raise TypeError(f"Invalid type: {data}. 
Expected a string.") # pragma: no cover - - def check_value(self, data: object) -> bool: - # this is generous for backwards compatibility - return isinstance(data, str | np.str_ | bytes | int) - - def _cast_value_unsafe(self, data: object) -> np.str_: - return self.to_dtype().type(data) - - @property - def item_size(self) -> int: - return self.length * self.code_point_bytes - - @dataclass(frozen=True, kw_only=True) class Structured(ZDType[np.dtypes.VoidDType[int], np.void], HasItemSize): dtype_cls = np.dtypes.VoidDType # type: ignore[assignment] diff --git a/src/zarr/core/dtype/npy/string.py b/src/zarr/core/dtype/npy/string.py index d5a4f9be08..f65db5a984 100644 --- a/src/zarr/core/dtype/npy/string.py +++ b/src/zarr/core/dtype/npy/string.py @@ -1,11 +1,19 @@ from __future__ import annotations +import base64 +import re from dataclasses import dataclass -from typing import TYPE_CHECKING, Self, TypeGuard +from typing import TYPE_CHECKING, ClassVar, Self, TypeGuard, cast import numpy as np -from zarr.core.dtype.npy.common import check_json_str +from zarr.core.dtype.common import HasEndianness, HasItemSize, HasLength +from zarr.core.dtype.npy.common import ( + EndiannessNumpy, + check_json_str, + endianness_from_numpy_str, + endianness_to_numpy_str, +) from zarr.core.dtype.wrapper import ZDType if TYPE_CHECKING: @@ -15,6 +23,158 @@ _NUMPY_SUPPORTS_VLEN_STRING = hasattr(np.dtypes, "StringDType") +@dataclass(frozen=True, kw_only=True) +class FixedLengthASCII(ZDType[np.dtypes.BytesDType[int], np.bytes_], HasLength, HasItemSize): + dtype_cls = np.dtypes.BytesDType + _zarr_v3_name = "numpy.fixed_length_ascii" + + @classmethod + def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: + return cls(length=dtype.itemsize) + + def to_dtype(self) -> np.dtypes.BytesDType[int]: + return self.dtype_cls(self.length) + + @classmethod + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: + """ + Check that the input is a valid JSON representation of a numpy S 
dtype. + """ + if zarr_format == 2: + # match |S1, |S2, etc + return isinstance(data, str) and re.match(r"^\|S\d+$", data) is not None + elif zarr_format == 3: + return ( + isinstance(data, dict) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == cls._zarr_v3_name + and isinstance(data["configuration"], dict) + and "length_bytes" in data["configuration"] + ) + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + def to_json(self, zarr_format: ZarrFormat) -> JSON: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return { + "name": self._zarr_v3_name, + "configuration": {"length_bytes": self.length}, + } + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls(length=data["configuration"]["length_bytes"]) # type: ignore[arg-type, index, call-overload] + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + def default_value(self) -> np.bytes_: + return np.bytes_(b"") + + def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: + return base64.standard_b64encode(data).decode("ascii") # type: ignore[arg-type] + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: + if check_json_str(data): + return self.to_dtype().type(base64.standard_b64decode(data.encode("ascii"))) + raise TypeError(f"Invalid type: {data}. 
Expected a string.") # pragma: no cover + + def check_value(self, data: object) -> bool: + # this is generous for backwards compatibility + return isinstance(data, np.bytes_ | str | bytes | int) + + def _cast_value_unsafe(self, value: object) -> np.bytes_: + return self.to_dtype().type(value) + + @property + def item_size(self) -> int: + return self.length + + +@dataclass(frozen=True, kw_only=True) +class FixedLengthUTF32( + ZDType[np.dtypes.StrDType[int], np.str_], HasEndianness, HasLength, HasItemSize +): + dtype_cls = np.dtypes.StrDType + _zarr_v3_name = "numpy.fixed_length_utf32" + code_point_bytes: ClassVar[int] = 4 # utf32 is 4 bytes per code point + + @classmethod + def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: + byte_order = cast("EndiannessNumpy", dtype.byteorder) + return cls( + length=dtype.itemsize // (cls.code_point_bytes), + endianness=endianness_from_numpy_str(byte_order), + ) + + def to_dtype(self) -> np.dtypes.StrDType[int]: + byte_order = endianness_to_numpy_str(self.endianness) + return self.dtype_cls(self.length).newbyteorder(byte_order) + + @classmethod + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: + """ + Check that the input is a valid JSON representation of a numpy S dtype. 
+ """ + if zarr_format == 2: + # match >U1, <]U\d+$", data) is not None + elif zarr_format == 3: + return ( + isinstance(data, dict) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == cls._zarr_v3_name + and "configuration" in data + and isinstance(data["configuration"], dict) + and set(data["configuration"].keys()) == {"length_bytes"} + and isinstance(data["configuration"]["length_bytes"], int) + ) + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + def to_json(self, zarr_format: ZarrFormat) -> JSON: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return { + "name": self._zarr_v3_name, + "configuration": {"length_bytes": self.length * self.code_point_bytes}, + } + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls(length=data["configuration"]["length_bytes"] // cls.code_point_bytes) # type: ignore[arg-type, index, call-overload, operator] + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + def default_value(self) -> np.str_: + return np.str_("") + + def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: + return str(data) + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_: + if check_json_str(data): + return self.to_dtype().type(data) + raise TypeError(f"Invalid type: {data}. 
Expected a string.") # pragma: no cover + + def check_value(self, data: object) -> bool: + # this is generous for backwards compatibility + return isinstance(data, str | np.str_ | bytes | int) + + def _cast_value_unsafe(self, data: object) -> np.str_: + return self.to_dtype().type(data) + + @property + def item_size(self) -> int: + return self.length * self.code_point_bytes + + if _NUMPY_SUPPORTS_VLEN_STRING: @dataclass(frozen=True, kw_only=True) diff --git a/tests/test_dtype/test_npy/test_sized.py b/tests/test_dtype/test_npy/test_sized.py index 8bc83f2f73..c0e8f137d4 100644 --- a/tests/test_dtype/test_npy/test_sized.py +++ b/tests/test_dtype/test_npy/test_sized.py @@ -5,12 +5,14 @@ import numpy as np from tests.test_dtype.test_wrapper import _TestZDType -from zarr.core.dtype.npy.float import Float16, Float64 -from zarr.core.dtype.npy.int import Int32, Int64 -from zarr.core.dtype.npy.sized import ( +from zarr.core.dtype import ( FixedLengthASCII, FixedLengthBytes, FixedLengthUTF32, + Float16, + Float64, + Int32, + Int64, Structured, ) diff --git a/tests/test_v2.py b/tests/test_v2.py index 1b21e09952..4b041a9b82 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -15,8 +15,7 @@ from zarr import config from zarr.abc.store import Store from zarr.core.buffer.core import default_buffer_prototype -from zarr.core.dtype.npy.sized import FixedLengthASCII, FixedLengthUTF32, Structured -from zarr.core.dtype.npy.string import VariableLengthString +from zarr.core.dtype import FixedLengthASCII, FixedLengthUTF32, Structured, VariableLengthString from zarr.core.dtype.wrapper import ZDType from zarr.core.sync import sync from zarr.storage import MemoryStore, StorePath From 528a942c91de3febd3897e7d7ee21152fcbfed62 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sat, 24 May 2025 22:56:35 +0200 Subject: [PATCH 108/130] make tests isomorphic to source code --- tests/test_dtype/test_npy/test_sized.py | 83 ------------------------ tests/test_dtype/test_npy/test_string.py | 82 
+++++++++++++++++++++++ tests/test_dtype/test_wrapper.py | 2 +- 3 files changed, 83 insertions(+), 84 deletions(-) diff --git a/tests/test_dtype/test_npy/test_sized.py b/tests/test_dtype/test_npy/test_sized.py index c0e8f137d4..eaaa915f59 100644 --- a/tests/test_dtype/test_npy/test_sized.py +++ b/tests/test_dtype/test_npy/test_sized.py @@ -6,9 +6,7 @@ from tests.test_dtype.test_wrapper import _TestZDType from zarr.core.dtype import ( - FixedLengthASCII, FixedLengthBytes, - FixedLengthUTF32, Float16, Float64, Int32, @@ -17,48 +15,6 @@ ) -class TestFixedLengthAscii(_TestZDType): - test_cls = FixedLengthASCII - valid_dtype = (np.dtype("|S10"), np.dtype("|S4")) - invalid_dtype = ( - np.dtype(np.int8), - np.dtype(np.float64), - np.dtype("|U10"), - ) - valid_json_v2 = ("|S0", "|S2", "|S4") - valid_json_v3 = ({"name": "numpy.fixed_length_ascii", "configuration": {"length_bytes": 10}},) - invalid_json_v2 = ( - "|S", - "|U10", - "|f8", - ) - invalid_json_v3 = ( - {"name": "numpy.fixed_length_ascii", "configuration": {"length_bits": 0}}, - {"name": "numpy.fixed_length_ascii", "configuration": {"length_bits": "invalid"}}, - ) - - scalar_v2_params = ( - (FixedLengthASCII(length=0), ""), - (FixedLengthASCII(length=2), "YWI="), - (FixedLengthASCII(length=4), "YWJjZA=="), - ) - scalar_v3_params = ( - (FixedLengthASCII(length=0), ""), - (FixedLengthASCII(length=2), "YWI="), - (FixedLengthASCII(length=4), "YWJjZA=="), - ) - cast_value_params = ( - (FixedLengthASCII(length=0), "", np.bytes_("")), - (FixedLengthASCII(length=2), "ab", np.bytes_("ab")), - (FixedLengthASCII(length=4), "abcd", np.bytes_("abcd")), - ) - item_size_params = ( - FixedLengthASCII(length=0), - FixedLengthASCII(length=4), - FixedLengthASCII(length=10), - ) - - class TestFixedLengthBytes(_TestZDType): test_cls = FixedLengthBytes valid_dtype = (np.dtype("|V10"),) @@ -105,45 +61,6 @@ class TestFixedLengthBytes(_TestZDType): ) -class TestFixedLengthUTF32(_TestZDType): - test_cls = FixedLengthUTF32 - valid_dtype = 
(np.dtype(">U10"), np.dtype("U10", "U10"), np.dtype("U10", " bool: return scalar1 == scalar2 def test_check_dtype_valid(self, valid_dtype: object) -> None: - assert self.test_cls.check_dtype(valid_dtype) # type: ignore[arg-type] + assert self.test_cls.check_dtype(valid_dtype) def test_check_dtype_invalid(self, invalid_dtype: object) -> None: assert not self.test_cls.check_dtype(invalid_dtype) # type: ignore[arg-type] From c9c8181534837fb4d43d19c499d1750ddd6eae17 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sun, 25 May 2025 12:19:20 +0200 Subject: [PATCH 109/130] remove old string logic --- src/zarr/codecs/vlen_utf8.py | 5 +- src/zarr/core/strings.py | 89 ---------------------------------- tests/test_codecs/test_vlen.py | 2 +- tests/test_metadata/test_v3.py | 2 +- tests/test_strings.py | 37 -------------- 5 files changed, 4 insertions(+), 131 deletions(-) delete mode 100644 src/zarr/core/strings.py delete mode 100644 tests/test_strings.py diff --git a/src/zarr/codecs/vlen_utf8.py b/src/zarr/codecs/vlen_utf8.py index 0ef423793d..15bae8da81 100644 --- a/src/zarr/codecs/vlen_utf8.py +++ b/src/zarr/codecs/vlen_utf8.py @@ -10,7 +10,6 @@ from zarr.abc.codec import ArrayBytesCodec from zarr.core.buffer import Buffer, NDBuffer from zarr.core.common import JSON, parse_named_configuration -from zarr.core.strings import cast_to_string_dtype from zarr.registry import register_codec if TYPE_CHECKING: @@ -49,6 +48,7 @@ def to_dict(self) -> dict[str, JSON]: def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: return self + # TODO: expand the tests for this function async def _decode_single( self, chunk_bytes: Buffer, @@ -60,8 +60,7 @@ async def _decode_single( decoded = _vlen_utf8_codec.decode(raw_bytes) assert decoded.dtype == np.object_ decoded.shape = chunk_spec.shape - # coming out of the code, we know this is safe, so don't issue a warning - as_string_dtype = cast_to_string_dtype(decoded, safe=True) + as_string_dtype = 
decoded.astype(chunk_spec.dtype.to_dtype(), copy=False) return chunk_spec.prototype.nd_buffer.from_numpy_array(as_string_dtype) async def _encode_single( diff --git a/src/zarr/core/strings.py b/src/zarr/core/strings.py deleted file mode 100644 index 15c30b6f9b..0000000000 --- a/src/zarr/core/strings.py +++ /dev/null @@ -1,89 +0,0 @@ -"""This module contains utilities for working with string arrays across -different versions of Numpy. -""" - -from __future__ import annotations - -from typing import Any, Union, cast -from warnings import warn - -import numpy as np - -# _STRING_DTYPE is the in-memory datatype that will be used for V3 string arrays -# when reading data back from Zarr. -# Any valid string-like datatype should be fine for *setting* data. - -VLenStringType = Union["np.dtypes.StringDType", "np.dtypes.ObjectDType"] -_VLEN_STRING_DTYPE: VLenStringType -_NUMPY_SUPPORTS_VLEN_STRING: bool - - -def cast_array( - data: np.ndarray[Any, np.dtype[Any]], -) -> np.ndarray[Any, VLenStringType]: - raise NotImplementedError - - -try: - # this new vlen string dtype was added in NumPy 2.0 - _VLEN_STRING_DTYPE = np.dtypes.StringDType() - _NUMPY_SUPPORTS_VLEN_STRING = True - - def cast_array( - data: np.ndarray[Any, np.dtype[Any]], - ) -> np.ndarray[Any, VLenStringType]: - out = data.astype(_VLEN_STRING_DTYPE, copy=False) - return cast(np.ndarray[Any, np.dtypes.StringDType], out) - -except AttributeError: - # if not available, we fall back on an object array of strings, as in Zarr < 3 - _VLEN_STRING_DTYPE = np.dtypes.ObjectDType() - _NUMPY_SUPPORTS_VLEN_STRING = False - - def cast_array( - data: np.ndarray[Any, np.dtype[Any]], - ) -> np.ndarray[Any, VLenStringType]: - out = data.astype(_VLEN_STRING_DTYPE, copy=False) - return cast(np.ndarray[Any, np.dtypes.ObjectDType], out) - - -def cast_to_string_dtype( - data: np.ndarray[Any, np.dtype[Any]], safe: bool = False -) -> np.ndarray[Any, VLenStringType]: - """Take any data and attempt to cast to to our preferred string dtype. 
- - data : np.ndarray - The data to cast - - safe : bool - If True, do not issue a warning if the data is cast from object to string dtype. - - """ - if np.issubdtype(data.dtype, np.str_): - # legacy fixed-width string type (e.g. "= 2.", - stacklevel=2, - ) - return cast_array(data) - raise ValueError(f"Cannot cast dtype {data.dtype} to string dtype") diff --git a/tests/test_codecs/test_vlen.py b/tests/test_codecs/test_vlen.py index b1508953ea..9024efa7ed 100644 --- a/tests/test_codecs/test_vlen.py +++ b/tests/test_codecs/test_vlen.py @@ -9,8 +9,8 @@ from zarr.abc.store import Store from zarr.codecs import ZstdCodec from zarr.core.dtype import get_data_type_from_native_dtype +from zarr.core.dtype.npy.string import _NUMPY_SUPPORTS_VLEN_STRING from zarr.core.metadata.v3 import ArrayV3Metadata -from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING from zarr.storage import StorePath numpy_str_dtypes: list[type | str | None] = [None, str, "str", np.dtypes.StrDType, "S", "U"] diff --git a/tests/test_metadata/test_v3.py b/tests/test_metadata/test_v3.py index 0d7da0153f..f3bd4510e5 100644 --- a/tests/test_metadata/test_v3.py +++ b/tests/test_metadata/test_v3.py @@ -12,6 +12,7 @@ from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding, V2ChunkKeyEncoding from zarr.core.config import config from zarr.core.dtype import get_data_type_from_native_dtype +from zarr.core.dtype.npy.string import _NUMPY_SUPPORTS_VLEN_STRING from zarr.core.dtype.npy.time import DateTime64 from zarr.core.group import GroupMetadata, parse_node_type from zarr.core.metadata.v3 import ( @@ -19,7 +20,6 @@ parse_dimension_names, parse_zarr_format, ) -from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING from zarr.errors import MetadataValidationError if TYPE_CHECKING: diff --git a/tests/test_strings.py b/tests/test_strings.py deleted file mode 100644 index 963f2e305e..0000000000 --- a/tests/test_strings.py +++ /dev/null @@ -1,37 +0,0 @@ -"""Tests for the strings module.""" - -import numpy 
as np -import pytest - -from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING, _VLEN_STRING_DTYPE, cast_to_string_dtype - - -def test_string_defaults() -> None: - if _NUMPY_SUPPORTS_VLEN_STRING: - assert _VLEN_STRING_DTYPE == np.dtypes.StringDType() - else: - assert _VLEN_STRING_DTYPE == np.dtypes.ObjectDType() - - -def test_cast_to_string_dtype() -> None: - d1 = np.array(["a", "b", "c"]) - assert d1.dtype == np.dtype(" Date: Mon, 26 May 2025 17:27:41 +0200 Subject: [PATCH 110/130] use scale_factor and unit in cast_value for datetime --- src/zarr/core/dtype/npy/time.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zarr/core/dtype/npy/time.py b/src/zarr/core/dtype/npy/time.py index 61786351f8..1c0e0d715c 100644 --- a/src/zarr/core/dtype/npy/time.py +++ b/src/zarr/core/dtype/npy/time.py @@ -240,7 +240,7 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.datetime raise TypeError(f"Invalid type: {data}. Expected an integer.") # pragma: no cover def _cast_value_unsafe(self, data: object) -> np.datetime64: - return self.to_dtype().type(data) # type: ignore[no-any-return, call-overload] + return self.to_dtype().type(data, f"{self.scale_factor}{self.unit}") # type: ignore[no-any-return, call-overload] @classmethod def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: From 7806563c681ec53b6c446a99a8680b1c70f5fc98 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 27 May 2025 10:46:10 +0200 Subject: [PATCH 111/130] add regression testing against v2.18 --- tests/test_regression/__init__.py | 0 tests/test_regression/test_regression.py | 125 +++++++++++++++++++++++ tests/test_regression/v2.18.py | 81 +++++++++++++++ 3 files changed, 206 insertions(+) create mode 100644 tests/test_regression/__init__.py create mode 100644 tests/test_regression/test_regression.py create mode 100644 tests/test_regression/v2.18.py diff --git a/tests/test_regression/__init__.py b/tests/test_regression/__init__.py 
new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/test_regression/test_regression.py b/tests/test_regression/test_regression.py new file mode 100644 index 0000000000..362e8d75b4 --- /dev/null +++ b/tests/test_regression/test_regression.py @@ -0,0 +1,125 @@ +import subprocess +from dataclasses import asdict, dataclass +from itertools import product +from pathlib import Path + +import numcodecs +import numpy as np +import pytest +from numcodecs import LZ4, LZMA, Blosc, GZip, VLenUTF8, Zstd + +import zarr +from zarr.core.array import Array +from zarr.core.dtype.npy.string import VariableLengthString +from zarr.core.metadata.v2 import ArrayV2Metadata +from zarr.storage import LocalStore + + +def runner_installed() -> bool: + try: + subprocess.check_output(["uv", "--version"]) + return True + except FileNotFoundError: + return False + + +def array_metadata_equals(a: ArrayV2Metadata, b: ArrayV2Metadata) -> bool: + dict_a, dict_b = asdict(a), asdict(b) + fill_value_a, fill_value_b = dict_a.pop("fill_value"), dict_b.pop("fill_value") + if ( + isinstance(fill_value_a, float) + and isinstance(fill_value_b, float) + and np.isnan(fill_value_a) + and np.isnan(fill_value_b) + ): + return dict_a == dict_b + else: + return fill_value_a == fill_value_b and dict_a == dict_b + + +@dataclass(kw_only=True) +class ArrayParams: + values: np.ndarray[tuple[int], np.dtype[np.generic]] + fill_value: np.generic | str + compressor: numcodecs.abc.Codec + + +basic_codecs = GZip(), Blosc(), LZ4(), LZMA(), Zstd() +basic_dtypes = "|b", ">i2", ">i4", ">f4", ">f8", "c8", "c16", "M8[10us]", "m8[4ps]" +string_dtypes = ">S1", "U4" + +basic_array_cases = [ + ArrayParams(values=np.arange(4, dtype=dtype), fill_value=1, compressor=codec) + for codec, dtype in product(basic_codecs, basic_dtypes) +] +datetime_array_cases = [ + ArrayParams(values=np.ones((4,), dtype=dtype), fill_value=1, compressor=codec) + for codec, dtype in product(basic_codecs, datetime_dtypes) +] +string_array_cases = [ 
+ ArrayParams( + values=np.array(["aaaa", "bbbb", "ccccc", "dddd"], dtype=dtype), + fill_value="foo", + compressor=codec, + ) + for codec, dtype in product(basic_codecs, string_dtypes) +] +vlen_string_cases = [ + ArrayParams( + values=np.array(["a", "bb", "ccc", "dddd"], dtype="O"), + fill_value="1", + compressor=VLenUTF8(), + ) +] +array_cases = basic_array_cases + datetime_array_cases + string_array_cases + vlen_string_cases + + +@pytest.fixture +def source_array(tmp_path: Path, request: pytest.FixtureRequest) -> Array: + dest = tmp_path / "in" + store = LocalStore(dest) + array_params: ArrayParams = request.param + compressor = array_params.compressor + if array_params.values.dtype == np.dtype("|O"): + dtype = VariableLengthString() + else: + dtype = array_params.values.dtype + z = zarr.create_array( + store, + shape=array_params.values.shape, + dtype=dtype, + chunks=array_params.values.shape, + compressors=compressor, + fill_value=array_params.fill_value, + order="C", + filters=None, + chunk_key_encoding={"name": "v2", "configuration": {"separator": "/"}}, + write_data=True, + zarr_format=2, + ) + z[:] = array_params.values + return z + + +@pytest.mark.skipif(not runner_installed(), reason="no python script runner installed") +@pytest.mark.parametrize( + "source_array", array_cases, indirect=True, ids=tuple(map(str, array_cases)) +) +def test_roundtrip(source_array: Array, tmp_path: Path) -> None: + out_path = tmp_path / "out" + copy_op = subprocess.run( + [ + "uv", + "run", + Path(__file__).resolve().parent / "v2.18.py", + str(source_array.store).removeprefix("file://"), + str(out_path), + ], + capture_output=True, + text=True, + ) + assert copy_op.returncode == 0 + out_array = zarr.open_array(store=out_path, mode="r", zarr_format=2) + assert array_metadata_equals(source_array.metadata, out_array.metadata) + assert np.array_equal(source_array[:], out_array[:]) diff --git a/tests/test_regression/v2.18.py b/tests/test_regression/v2.18.py new file mode 100644 
index 0000000000..39e1c5210c --- /dev/null +++ b/tests/test_regression/v2.18.py @@ -0,0 +1,81 @@ +# /// script +# requires-python = ">=3.11" +# dependencies = [ +# "zarr==2.18", +# "numcodecs==0.15" +# ] +# /// + +import argparse + +import zarr +from zarr._storage.store import BaseStore + + +def copy_group( + *, node: zarr.hierarchy.Group, store: zarr.storage.BaseStore, path: str, overwrite: bool +) -> zarr.hierarchy.Group: + result = zarr.group(store=store, path=path, overwrite=overwrite) + result.attrs.put(node.attrs.asdict()) + for key, child in node.items(): + child_path = f"{path}/{key}" + if isinstance(child, zarr.hierarchy.Group): + copy_group(node=child, store=store, path=child_path, overwrite=overwrite) + elif isinstance(child, zarr.core.Array): + copy_array(node=child, store=store, overwrite=overwrite, path=child_path) + return result + + +def copy_array( + *, node: zarr.core.Array, store: BaseStore, path: str, overwrite: bool +) -> zarr.core.Array: + result = zarr.create( + shape=node.shape, + dtype=node.dtype, + fill_value=node.fill_value, + chunks=node.chunks, + compressor=node.compressor, + filters=node.filters, + order=node.order, + dimension_separator=node._dimension_separator, + store=store, + path=path, + overwrite=overwrite, + ) + result.attrs.put(node.attrs.asdict()) + result[:] = node[:] + return result + + +def copy_node( + node: zarr.hierarchy.Group | zarr.core.Array, store: BaseStore, path: str, overwrite: bool +) -> zarr.hierarchy.Group | zarr.core.Array: + if isinstance(node, zarr.hierarchy.Group): + return copy_group(node=node, store=store, path=path, overwrite=overwrite) + elif isinstance(node, zarr.core.Array): + return copy_array(node=node, store=store, path=path, overwrite=overwrite) + else: + raise TypeError(f"Unexpected node type: {type(node)}") # pragma: no cover + + +def cli() -> None: + parser = argparse.ArgumentParser( + description="Copy a zarr hierarchy from one location to another" + ) + parser.add_argument("source", 
type=str, help="Path to the source zarr hierarchy") + parser.add_argument("destination", type=str, help="Path to the destination zarr hierarchy") + args = parser.parse_args() + + src, dst = args.source, args.destination + root_src = zarr.open(src, mode="r") + result = copy_node(node=root_src, store=zarr.NestedDirectoryStore(dst), path="", overwrite=True) + + print(f"successfully created {result} at {dst}") + + +def main() -> None: + cli() + + +if __name__ == "__main__": + main() From 39219fa45b8be28db30dc29947c5d0c33f094df5 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 27 May 2025 11:33:05 +0200 Subject: [PATCH 112/130] truncate U and S scalars in _cast_value_unsafe --- src/zarr/core/dtype/npy/string.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/src/zarr/core/dtype/npy/string.py b/src/zarr/core/dtype/npy/string.py index f65db5a984..b5b86ca387 100644 --- a/src/zarr/core/dtype/npy/string.py +++ b/src/zarr/core/dtype/npy/string.py @@ -86,8 +86,15 @@ def check_value(self, data: object) -> bool: # this is generous for backwards compatibility return isinstance(data, np.bytes_ | str | bytes | int) - def _cast_value_unsafe(self, value: object) -> np.bytes_: - return self.to_dtype().type(value) + def _cast_value_unsafe(self, data: object) -> np.bytes_: + # We explicitly truncate the result because of the following numpy behavior: + # >>> x = np.dtype('S3').type('hello world') + # >>> x + # np.bytes_(b'hello world') + # >>> x.dtype + # dtype('S11') + + return self.to_dtype().type(data[: self.length]) # type: ignore[index] @property def item_size(self) -> int: @@ -168,7 +175,14 @@ def check_value(self, data: object) -> bool: return isinstance(data, str | np.str_ | bytes | int) def _cast_value_unsafe(self, data: object) -> np.str_: - return self.to_dtype().type(data) + # We explicitly truncate the result because of the following numpy behavior: + # >>> x = np.dtype('U3').type('hello world') + # >>> x + # 
np.str_('hello world') + # >>> x.dtype + # dtype('U11') + + return self.to_dtype().type(data[: self.length]) # type: ignore[index] @property def item_size(self) -> int: From 4a7a5502349ba28e0fc1a484a1ab499ca32583f7 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 27 May 2025 11:58:34 +0200 Subject: [PATCH 113/130] docstrings and simplification for regression tests --- tests/test_regression/test_regression.py | 26 +++++++----------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/tests/test_regression/test_regression.py b/tests/test_regression/test_regression.py index 362e8d75b4..688c5ff89d 100644 --- a/tests/test_regression/test_regression.py +++ b/tests/test_regression/test_regression.py @@ -1,5 +1,5 @@ import subprocess -from dataclasses import asdict, dataclass +from dataclasses import dataclass from itertools import product from pathlib import Path @@ -11,36 +11,24 @@ import zarr from zarr.core.array import Array from zarr.core.dtype.npy.string import VariableLengthString -from zarr.core.metadata.v2 import ArrayV2Metadata from zarr.storage import LocalStore def runner_installed() -> bool: + """ + Check if a PEP-723 compliant python script runner is installed. 
+ """ try: subprocess.check_output(["uv", "--version"]) - return True + return True # noqa: TRY300 except FileNotFoundError: return False -def array_metadata_equals(a: ArrayV2Metadata, b: ArrayV2Metadata) -> bool: - dict_a, dict_b = asdict(a), asdict(b) - fill_value_a, fill_value_b = dict_a.pop("fill_value"), dict_b.pop("fill_value") - if ( - isinstance(fill_value_a, float) - and isinstance(fill_value_b, float) - and np.isnan(fill_value_a) - and np.isnan(fill_value_b) - ): - return dict_a == dict_b - else: - return fill_value_a == fill_value_b and dict_a == dict_b - - @dataclass(kw_only=True) class ArrayParams: values: np.ndarray[tuple[int], np.dtype[np.generic]] - fill_value: np.generic | str + fill_value: np.generic | str | int compressor: numcodecs.abc.Codec @@ -121,5 +109,5 @@ def test_roundtrip(source_array: Array, tmp_path: Path) -> None: ) assert copy_op.returncode == 0 out_array = zarr.open_array(store=out_path, mode="r", zarr_format=2) - assert array_metadata_equals(source_array.metadata, out_array.metadata) + assert source_array.metadata.to_dict() == out_array.metadata.to_dict() assert np.array_equal(source_array[:], out_array[:]) From 807c585e9c15615cadd8d781c05422e84abdcff6 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 27 May 2025 12:29:34 +0200 Subject: [PATCH 114/130] changes necessary for linting with regression tests --- pyproject.toml | 3 ++- src/zarr/core/dtype/wrapper.py | 1 + tests/test_dtype/test_wrapper.py | 2 +- tests/test_regression/scripts/__init__.py | 0 tests/test_regression/{ => scripts}/v2.18.py | 0 tests/test_regression/test_regression.py | 20 ++++++++++++++++---- 6 files changed, 20 insertions(+), 6 deletions(-) create mode 100644 tests/test_regression/scripts/__init__.py rename tests/test_regression/{ => scripts}/v2.18.py (100%) diff --git a/pyproject.toml b/pyproject.toml index a43e51abd2..33904334e4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -283,6 +283,7 @@ extend-exclude = [ "notebooks", # temporary, until 
we achieve compatibility with ruff ≥ 0.6 "venv", "docs", + "tests/test_regression/scripts/", # these are scripts that use a different version of python "src/zarr/v2/", "tests/v2/", ] @@ -353,7 +354,6 @@ strict = true warn_unreachable = true enable_error_code = ["ignore-without-code", "redundant-expr", "truthy-bool"] - [[tool.mypy.overrides]] module = [ "tests.package_with_entrypoint.*", @@ -383,6 +383,7 @@ module = [ "tests.test_properties", "tests.test_sync", "tests.test_v2", + "tests.test_regression.scripts.*" ] ignore_errors = true diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index 1a9d9b1e21..bd9686afc1 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -39,6 +39,7 @@ # This is the bound for the dtypes that we support. If we support non-numpy dtypes, # then this bound will need to be widened. TBaseDType = np.dtype[np.generic] + # These two type parameters are covariant because we want # x : ZDType[BaseDType, BaseScalar] = ZDType[SubDType, SubScalar] # to type check diff --git a/tests/test_dtype/test_wrapper.py b/tests/test_dtype/test_wrapper.py index a61fc1a9cd..9a5e3ee56d 100644 --- a/tests/test_dtype/test_wrapper.py +++ b/tests/test_dtype/test_wrapper.py @@ -91,7 +91,7 @@ def scalar_equals(self, scalar1: object, scalar2: object) -> bool: # but some classes may need to override this for special cases return scalar1 == scalar2 - def test_check_dtype_valid(self, valid_dtype: object) -> None: + def test_check_dtype_valid(self, valid_dtype: TBaseDType) -> None: assert self.test_cls.check_dtype(valid_dtype) def test_check_dtype_invalid(self, invalid_dtype: object) -> None: diff --git a/tests/test_regression/scripts/__init__.py b/tests/test_regression/scripts/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/test_regression/v2.18.py b/tests/test_regression/scripts/v2.18.py similarity index 100% rename from tests/test_regression/v2.18.py rename to 
tests/test_regression/scripts/v2.18.py diff --git a/tests/test_regression/test_regression.py b/tests/test_regression/test_regression.py index 688c5ff89d..61ff8ebfa9 100644 --- a/tests/test_regression/test_regression.py +++ b/tests/test_regression/test_regression.py @@ -2,6 +2,7 @@ from dataclasses import dataclass from itertools import product from pathlib import Path +from typing import TYPE_CHECKING import numcodecs import numpy as np @@ -10,9 +11,13 @@ import zarr from zarr.core.array import Array +from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding from zarr.core.dtype.npy.string import VariableLengthString from zarr.storage import LocalStore +if TYPE_CHECKING: + from zarr.core.dtype import ZDTypeLike + def runner_installed() -> bool: """ @@ -69,8 +74,10 @@ def source_array(tmp_path: Path, request: pytest.FixtureRequest) -> Array: store = LocalStore(dest) array_params: ArrayParams = request.param compressor = array_params.compressor + chunk_key_encoding = V2ChunkKeyEncoding(separator="/") + dtype: ZDTypeLike if array_params.values.dtype == np.dtype("|O"): - dtype = VariableLengthString() + dtype = VariableLengthString() # type: ignore[assignment] else: dtype = array_params.values.dtype z = zarr.create_array( @@ -82,7 +89,7 @@ def source_array(tmp_path: Path, request: pytest.FixtureRequest) -> Array: fill_value=array_params.fill_value, order="C", filters=None, - chunk_key_encoding={"name": "v2", "configuration": {"separator": "/"}}, + chunk_key_encoding=chunk_key_encoding, write_data=True, zarr_format=2, ) @@ -90,17 +97,22 @@ def source_array(tmp_path: Path, request: pytest.FixtureRequest) -> Array: return z +# TODO: make this dynamic based on the installed scripts +script_paths = [Path(__file__).resolve().parent / "scripts" / "v2.18.py"] + + @pytest.mark.skipif(not runner_installed(), reason="no python script runner installed") @pytest.mark.parametrize( "source_array", array_cases, indirect=True, ids=tuple(map(str, array_cases)) ) -def 
test_roundtrip(source_array: Array, tmp_path: Path) -> None: +@pytest.mark.parametrize("script_path", script_paths) +def test_roundtrip(source_array: Array, tmp_path: Path, script_path: Path) -> None: out_path = tmp_path / "out" copy_op = subprocess.run( [ "uv", "run", - Path(__file__).resolve().parent / "v2.18.py", + script_path, str(source_array.store).removeprefix("file://"), str(out_path), ], From 5150d607c7ec17d428b24aaa0596927b500704af Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 29 May 2025 12:40:03 +0200 Subject: [PATCH 115/130] improve method names, refactor type hints with typeddictionaries, fix registry load frequency, add object_codec_id for v2 json deserialization --- docs/user-guide/arrays.rst | 4 +- docs/user-guide/data_types.rst | 10 +- src/zarr/codecs/_v2.py | 6 +- src/zarr/codecs/bytes.py | 2 +- src/zarr/codecs/sharding.py | 4 +- src/zarr/codecs/vlen_utf8.py | 2 +- src/zarr/core/array.py | 6 +- src/zarr/core/codec_pipeline.py | 8 +- src/zarr/core/common.py | 10 + src/zarr/core/dtype/__init__.py | 52 +-- src/zarr/core/dtype/common.py | 24 +- src/zarr/core/dtype/npy/bool.py | 50 +-- src/zarr/core/dtype/npy/common.py | 5 +- src/zarr/core/dtype/npy/complex.py | 53 ++-- src/zarr/core/dtype/npy/float.py | 48 +-- src/zarr/core/dtype/npy/int.py | 368 +++++++++++++++++----- src/zarr/core/dtype/npy/sized.py | 220 +++++++------ src/zarr/core/dtype/npy/string.py | 258 +++++++++------ src/zarr/core/dtype/npy/time.py | 174 +++++----- src/zarr/core/dtype/registry.py | 49 ++- src/zarr/core/dtype/wrapper.py | 151 ++++++--- src/zarr/core/metadata/v2.py | 33 +- src/zarr/core/metadata/v3.py | 10 +- tests/package_with_entrypoint/__init__.py | 12 +- tests/test_array.py | 22 +- tests/test_dtype/conftest.py | 2 +- tests/test_dtype/test_npy/test_bool.py | 6 +- tests/test_dtype/test_npy/test_common.py | 36 ++- tests/test_dtype/test_npy/test_complex.py | 8 +- tests/test_dtype/test_npy/test_float.py | 18 +- tests/test_dtype/test_npy/test_int.py | 34 +- 
tests/test_dtype/test_npy/test_sized.py | 18 +- tests/test_dtype/test_npy/test_string.py | 36 ++- tests/test_dtype/test_npy/test_time.py | 18 +- tests/test_dtype/test_wrapper.py | 48 +-- tests/test_dtype_registry.py | 47 ++- tests/test_group.py | 7 +- tests/test_metadata/test_consolidated.py | 2 +- tests/test_metadata/test_v2.py | 2 +- tests/test_metadata/test_v3.py | 14 +- tests/test_properties.py | 2 +- tests/test_regression/test_regression.py | 6 +- tests/test_v2.py | 10 +- 43 files changed, 1210 insertions(+), 685 deletions(-) diff --git a/docs/user-guide/arrays.rst b/docs/user-guide/arrays.rst index c27f1296b9..13190a4689 100644 --- a/docs/user-guide/arrays.rst +++ b/docs/user-guide/arrays.rst @@ -211,8 +211,8 @@ prints additional diagnostics, e.g.:: Serializer : BytesCodec(endian=) Compressors : (BloscCodec(typesize=4, cname=, clevel=3, shuffle=, blocksize=0),) No. bytes : 400000000 (381.5M) - No. bytes stored : 3558573 - Storage ratio : 112.4 + No. bytes stored : 9696520 + Storage ratio : 41.3 Chunks Initialized : 100 .. note:: diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index a4d8314a5e..c101ae50fc 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -128,20 +128,20 @@ Create a ``ZDType`` from a native data type: >>> from zarr.core.dtype import Int8 >>> import numpy as np - >>> int8 = Int8.from_dtype(np.dtype('int8')) + >>> int8 = Int8.from_native_dtype(np.dtype('int8')) Convert back to native data type: .. code-block:: python - >>> native_dtype = int8.to_dtype() + >>> native_dtype = int8.to_native_dtype() >>> assert native_dtype == np.dtype('int8') Get the default scalar value for the data type: .. code-block:: python - >>> default_value = int8.default_value() + >>> default_value = int8.default_scalar() >>> assert default_value == np.int8(0) @@ -160,7 +160,7 @@ Serialize a scalar value to JSON: .. 
code-block:: python - >>> json_value = int8.to_json_value(42, zarr_format=3) + >>> json_value = int8.to_json_scalar(42, zarr_format=3) >>> json_value 42 @@ -168,5 +168,5 @@ Deserialize a scalar value from JSON: .. code-block:: python - >>> scalar_value = int8.from_json_value(42, zarr_format=3) + >>> scalar_value = int8.from_json_scalar(42, zarr_format=3) >>> assert scalar_value == np.int8(42) diff --git a/src/zarr/codecs/_v2.py b/src/zarr/codecs/_v2.py index c03e3c55fb..08853f27f1 100644 --- a/src/zarr/codecs/_v2.py +++ b/src/zarr/codecs/_v2.py @@ -48,7 +48,7 @@ async def _decode_single( # segfaults and other bad things happening if chunk_spec.dtype.dtype_cls is not np.dtypes.ObjectDType: try: - chunk = chunk.view(chunk_spec.dtype.to_dtype()) + chunk = chunk.view(chunk_spec.dtype.to_native_dtype()) except TypeError: # this will happen if the dtype of the chunk # does not match the dtype of the array spec i.g. if @@ -56,7 +56,7 @@ async def _decode_single( # is an object array. In this case, we need to convert the object # array to the correct dtype. - chunk = np.array(chunk).astype(chunk_spec.dtype.to_dtype()) + chunk = np.array(chunk).astype(chunk_spec.dtype.to_native_dtype()) elif chunk.dtype != object: # If we end up here, someone must have hacked around with the filters. 
@@ -80,7 +80,7 @@ async def _encode_single( chunk = chunk_array.as_ndarray_like() # ensure contiguous and correct order - chunk = chunk.astype(chunk_spec.dtype.to_dtype(), order=chunk_spec.order, copy=False) + chunk = chunk.astype(chunk_spec.dtype.to_native_dtype(), order=chunk_spec.order, copy=False) # apply filters if self.filters: diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py index 5db39796e4..6ef0fef60b 100644 --- a/src/zarr/codecs/bytes.py +++ b/src/zarr/codecs/bytes.py @@ -79,7 +79,7 @@ async def _decode_single( "Endianness | None", self.endian.value if self.endian is not None else None ) new_byte_order = endianness_to_numpy_str(endian_str) - dtype = chunk_spec.dtype.to_dtype().newbyteorder(new_byte_order) + dtype = chunk_spec.dtype.to_native_dtype().newbyteorder(new_byte_order) as_array_like = chunk_bytes.as_array_like() if isinstance(as_array_like, NDArrayLike): diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 914236d700..cd8676b4d1 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -452,7 +452,7 @@ async def _decode_single( # setup output array out = chunk_spec.prototype.nd_buffer.create( shape=shard_shape, - dtype=shard_spec.dtype.to_dtype(), + dtype=shard_spec.dtype.to_native_dtype(), order=shard_spec.order, fill_value=0, ) @@ -499,7 +499,7 @@ async def _decode_partial_single( # setup output array out = shard_spec.prototype.nd_buffer.create( shape=indexer.shape, - dtype=shard_spec.dtype.to_dtype(), + dtype=shard_spec.dtype.to_native_dtype(), order=shard_spec.order, fill_value=0, ) diff --git a/src/zarr/codecs/vlen_utf8.py b/src/zarr/codecs/vlen_utf8.py index 15bae8da81..b7c0418b2e 100644 --- a/src/zarr/codecs/vlen_utf8.py +++ b/src/zarr/codecs/vlen_utf8.py @@ -60,7 +60,7 @@ async def _decode_single( decoded = _vlen_utf8_codec.decode(raw_bytes) assert decoded.dtype == np.object_ decoded.shape = chunk_spec.shape - as_string_dtype = decoded.astype(chunk_spec.dtype.to_dtype(), 
copy=False) + as_string_dtype = decoded.astype(chunk_spec.dtype.to_native_dtype(), copy=False) return chunk_spec.prototype.nd_buffer.from_numpy_array(as_string_dtype) async def _encode_single( diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index e3d9e3fdaf..cd6b33a28c 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -700,7 +700,7 @@ def _create_metadata_v3( if fill_value is None: # v3 spec will not allow a null fill value - fill_value_parsed = dtype.default_value() + fill_value_parsed = dtype.default_scalar() else: fill_value_parsed = fill_value @@ -782,7 +782,7 @@ def _create_metadata_v2( if dimension_separator is None: dimension_separator = "." if fill_value is None: - fill_value = dtype.default_value() # type: ignore[assignment] + fill_value = dtype.default_scalar() # type: ignore[assignment] return ArrayV2Metadata( shape=shape, dtype=dtype, @@ -1056,7 +1056,7 @@ def dtype(self) -> TBaseDType: np.dtype Data type of the array """ - return self._zdtype.to_dtype() + return self._zdtype.to_native_dtype() @property def order(self) -> MemoryOrder: diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index 3d00fe5467..23c27e40c6 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -62,7 +62,7 @@ def fill_value_or_default(chunk_spec: ArraySpec) -> Any: # validated when decoding the metadata, but we support reading # Zarr V2 data and need to support the case where fill_value # is None. 
- return chunk_spec.dtype.default_value() + return chunk_spec.dtype.default_scalar() else: return fill_value @@ -296,7 +296,9 @@ def _merge_chunk_array( is_complete_chunk: bool, drop_axes: tuple[int, ...], ) -> NDBuffer: - if chunk_selection == () or is_scalar(value.as_ndarray_like(), chunk_spec.dtype.to_dtype()): + if chunk_selection == () or is_scalar( + value.as_ndarray_like(), chunk_spec.dtype.to_native_dtype() + ): chunk_value = value else: chunk_value = value[out_selection] @@ -317,7 +319,7 @@ def _merge_chunk_array( if existing_chunk_array is None: chunk_array = chunk_spec.prototype.nd_buffer.create( shape=chunk_spec.shape, - dtype=chunk_spec.dtype.to_dtype(), + dtype=chunk_spec.dtype.to_native_dtype(), order=chunk_spec.order, fill_value=fill_value_or_default(chunk_spec), ) diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index 864959a948..2ba5914ea5 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -10,7 +10,9 @@ from typing import ( TYPE_CHECKING, Any, + Generic, Literal, + TypedDict, TypeVar, cast, overload, @@ -39,6 +41,14 @@ AccessModeLiteral = Literal["r", "r+", "a", "w", "w-"] DimensionNames = Iterable[str | None] | None +TName = TypeVar("TName", bound=str) +TConfig = TypeVar("TConfig", bound=Mapping[str, object]) + + +class NamedConfig(TypedDict, Generic[TName, TConfig]): + name: TName + configuration: TConfig + def product(tup: ChunkCoords) -> int: return functools.reduce(operator.mul, tup, 1) diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py index 9c672fd986..a8bfe2b5c4 100644 --- a/src/zarr/core/dtype/__init__.py +++ b/src/zarr/core/dtype/__init__.py @@ -16,12 +16,13 @@ if TYPE_CHECKING: from zarr.core.common import ZarrFormat +from collections.abc import Mapping + import numpy as np import numpy.typing as npt from zarr.core.common import JSON from zarr.core.dtype.npy.string import ( - _NUMPY_SUPPORTS_VLEN_STRING, FixedLengthASCII, FixedLengthUTF32, VariableLengthString, @@ -102,7 
+103,7 @@ ) # This type models inputs that can be coerced to a ZDType -ZDTypeLike: TypeAlias = npt.DTypeLike | ZDType[TBaseDType, TBaseScalar] | dict[str, JSON] | str +ZDTypeLike: TypeAlias = npt.DTypeLike | ZDType[TBaseDType, TBaseScalar] | Mapping[str, JSON] | str for dtype in ANY_DTYPE: # mypy does not know that all the elements of ANY_DTYPE are subclasses of ZDType @@ -114,42 +115,41 @@ def get_data_type_from_native_dtype(dtype: npt.DTypeLike) -> ZDType[TBaseDType, """ Get a data type wrapper (an instance of ``ZDType``) from a native data type, e.g. a numpy dtype. """ - data_type_registry.lazy_load() if not isinstance(dtype, np.dtype): - # TODO: This check has a lot of assumptions in it! Chiefly, we assume that the - # numpy object dtype contains variable length strings, which is not in general true - # When / if zarr python supports ragged arrays, for example, this check will fail! - if dtype in (str, "str", "|T16", "O", "|O", np.dtypes.ObjectDType()): - if _NUMPY_SUPPORTS_VLEN_STRING: - na_dtype = np.dtype("T") - else: - na_dtype = np.dtype("O") - elif isinstance(dtype, list): + na_dtype: np.dtype[np.generic] + if isinstance(dtype, list): # this is a valid _VoidDTypeLike check na_dtype = np.dtype([tuple(d) for d in dtype]) else: na_dtype = np.dtype(dtype) else: na_dtype = dtype - return data_type_registry.match_dtype(na_dtype) + return data_type_registry.match_dtype(dtype=na_dtype) + + +def get_data_type_from_json_v3( + dtype_spec: JSON, +) -> ZDType[TBaseDType, TBaseScalar]: + return data_type_registry.match_json_v3(dtype_spec) -def get_data_type_from_json( - dtype: JSON, zarr_format: ZarrFormat +def get_data_type_from_json_v2( + dtype_spec: JSON, *, object_codec_id: str | None = None ) -> ZDType[TBaseDType, TBaseScalar]: - return data_type_registry.match_json(dtype, zarr_format=zarr_format) + return data_type_registry.match_json_v2(dtype_spec, object_codec_id=object_codec_id) -def parse_data_type(dtype: ZDTypeLike, zarr_format: ZarrFormat) -> 
ZDType[TBaseDType, TBaseScalar]: +def parse_data_type( + dtype_spec: ZDTypeLike, *, zarr_format: ZarrFormat, object_codec_id: str | None = None +) -> ZDType[TBaseDType, TBaseScalar]: """ Interpret the input as a ZDType instance. """ - if isinstance(dtype, ZDType): - return dtype - elif isinstance(dtype, dict): - # This branch assumes that the data type has been specified in the JSON form - # but it's also possible for numpy data types to be specified as dictionaries, which will - # cause an error in the `get_data_type_from_json`, but that's ok for now - return get_data_type_from_json(dtype, zarr_format=zarr_format) # type: ignore[arg-type] - else: - return get_data_type_from_native_dtype(dtype) + if isinstance(dtype_spec, ZDType): + return dtype_spec + # dict and zarr_format 3 means that we have a JSON object representation of the dtype + if zarr_format == 3 and isinstance(dtype_spec, Mapping): + return get_data_type_from_json_v3(dtype_spec) # type: ignore[arg-type] + # otherwise, we have either a numpy dtype string, or a zarr v3 dtype string, and in either case + # we can create a numpy dtype from it, and do the dtype inference from that + return get_data_type_from_native_dtype(dtype_spec) # type: ignore[arg-type] diff --git a/src/zarr/core/dtype/common.py b/src/zarr/core/dtype/common.py index 5eeff2af5b..bbdc06c50d 100644 --- a/src/zarr/core/dtype/common.py +++ b/src/zarr/core/dtype/common.py @@ -2,7 +2,7 @@ import warnings from dataclasses import dataclass -from typing import Final, Literal +from typing import ClassVar, Final, Literal Endianness = Literal["little", "big"] SpecialFloatStrings = Literal["NaN", "Infinity", "-Infinity"] @@ -46,6 +46,28 @@ def item_size(self) -> int: raise NotImplementedError +@dataclass(frozen=True) +class HasObjectCodec: + """ + A mix-in class for data types that require an object codec id. 
+ This class bears the property ``object_codec_id``, which is the string name of an object + codec that is required to encode and decode the data type. + + In zarr-python 2.x certain data types like variable-length strings or variable-length arrays + used the catch-all numpy "object" data type for their in-memory representation. But these data + types cannot be stored as numpy object data types, because the object data type does not define + a fixed memory layout. So these data types required a special codec, called an "object codec", + that effectively defined a compact representation for the data type, which was used to encode + and decode the data type. + + Zarr-python 2.x would not allow the creation of arrays with the "object" data type if an object + codec was not specified, and thus the name of the object codec is effectively part of the data + type model. + """ + + object_codec_id: ClassVar[str] + + class UnstableSpecificationWarning(FutureWarning): ... diff --git a/src/zarr/core/dtype/npy/bool.py b/src/zarr/core/dtype/npy/bool.py index d46758f789..b1800127e8 100644 --- a/src/zarr/core/dtype/npy/bool.py +++ b/src/zarr/core/dtype/npy/bool.py @@ -1,12 +1,12 @@ from dataclasses import dataclass -from typing import ClassVar, Literal, Self, TypeGuard +from typing import ClassVar, Literal, Self, TypeGuard, overload import numpy as np from zarr.core.common import JSON, ZarrFormat from zarr.core.dtype.common import HasItemSize from zarr.core.dtype.npy.common import check_json_bool -from zarr.core.dtype.wrapper import TBaseDType, ZDType +from zarr.core.dtype.wrapper import DTypeJSON_V2, DTypeJSON_V3, TBaseDType, ZDType @dataclass(frozen=True, kw_only=True, slots=True) @@ -22,40 +22,50 @@ class Bool(ZDType[np.dtypes.BoolDType, np.bool_], HasItemSize): The numpy dtype class. 
""" - _zarr_v3_name = "bool" + _zarr_v3_name: ClassVar[Literal["bool"]] = "bool" _zarr_v2_names: ClassVar[tuple[str, ...]] = ("|b1",) dtype_cls = np.dtypes.BoolDType @classmethod - def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: + def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: return cls() - def to_dtype(self: Self) -> np.dtypes.BoolDType: + def to_native_dtype(self: Self) -> np.dtypes.BoolDType: return self.dtype_cls() @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[Literal["bool", "|b1"]]: + def check_json_v2( + cls, data: JSON, *, object_codec_id: str | None = None + ) -> TypeGuard[Literal["|b1"]]: """ Check that the input is a valid JSON representation of a bool. """ - if zarr_format == 2: - return data in cls._zarr_v2_names - elif zarr_format == 3: - return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + return data in cls._zarr_v2_names + + @classmethod + def check_json_v3(cls, data: JSON) -> TypeGuard[Literal["bool"]]: + return data == cls._zarr_v3_name + + @overload + def to_json(self, zarr_format: Literal[2]) -> Literal["|b1"]: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> Literal["bool"]: ... - def to_json(self, zarr_format: ZarrFormat) -> str: + def to_json(self, zarr_format: ZarrFormat) -> Literal["|b1", "bool"]: if zarr_format == 2: - return self.to_dtype().str + return self.to_native_dtype().str elif zarr_format == 3: return self._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + def _from_json_unchecked( + cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat + ) -> Self: return cls() - def default_value(self) -> np.bool_: + def default_scalar(self) -> np.bool_: """ Get the default value for the boolean dtype. 
@@ -66,7 +76,7 @@ def default_value(self) -> np.bool_: """ return np.False_ - def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> bool: + def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> bool: """ Convert a scalar to a python bool. @@ -84,7 +94,7 @@ def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> bool: """ return bool(data) - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bool_: + def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bool_: """ Read a JSON-serializable value as a numpy boolean scalar. @@ -101,14 +111,14 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bool_: The numpy boolean scalar. """ if check_json_bool(data): - return self._cast_value_unsafe(data) + return self._cast_scalar_unchecked(data) raise TypeError(f"Invalid type: {data}. Expected a boolean.") # pragma: no cover - def check_value(self, data: object) -> bool: + def check_scalar(self, data: object) -> bool: # Anything can become a bool return True - def _cast_value_unsafe(self, data: object) -> np.bool_: + def _cast_scalar_unchecked(self, data: object) -> np.bool_: return np.bool_(data) @property diff --git a/src/zarr/core/dtype/npy/common.py b/src/zarr/core/dtype/npy/common.py index 2481dcb150..03dc194a7a 100644 --- a/src/zarr/core/dtype/npy/common.py +++ b/src/zarr/core/dtype/npy/common.py @@ -176,7 +176,10 @@ def float_from_json_v3(data: JSONFloatV3) -> float: elif len(data[2:]) == 16: dtype_code = ">d" else: - msg = f"Invalid float value: {data!r}. Expected a string of length 4, 8, or 16." + msg = ( + f"Invalid hexadecimal float value: {data!r}. 
" + "Expected the '0x' prefix to be followed by 4, 8, or 16 numeral characters" + ) raise ValueError(msg) return float(struct.unpack(dtype_code, bytes.fromhex(data[2:]))[0]) return float_from_json_v2(data) diff --git a/src/zarr/core/dtype/npy/complex.py b/src/zarr/core/dtype/npy/complex.py index ee52dd0577..f7db6fe94d 100644 --- a/src/zarr/core/dtype/npy/complex.py +++ b/src/zarr/core/dtype/npy/complex.py @@ -2,6 +2,7 @@ from typing import ( TYPE_CHECKING, ClassVar, + Literal, Self, TypeGuard, cast, @@ -24,7 +25,7 @@ endianness_from_numpy_str, endianness_to_numpy_str, ) -from zarr.core.dtype.wrapper import TBaseDType, ZDType +from zarr.core.dtype.wrapper import DTypeJSON_V2, DTypeJSON_V3, TBaseDType, ZDType if TYPE_CHECKING: from zarr.core.dtype.npy.common import EndiannessNumpy @@ -36,11 +37,11 @@ class BaseComplex(ZDType[TComplexDType_co, TComplexScalar_co], HasEndianness, Ha _zarr_v2_names: ClassVar[tuple[str, ...]] @classmethod - def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: + def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) - def to_dtype(self) -> TComplexDType_co: + def to_native_dtype(self) -> TComplexDType_co: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) # type: ignore[return-value] @@ -59,37 +60,39 @@ def to_json(self, zarr_format: ZarrFormat) -> str: The JSON-serializable representation of the wrapped data type """ if zarr_format == 2: - return self.to_dtype().str + return self.to_native_dtype().str elif zarr_format == 3: return self._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + def _from_json_unchecked( + cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat + ) -> Self: if zarr_format == 2: - return 
cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: + def check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: """ Check that the input is a valid JSON representation of this data type. """ - if zarr_format == 2: - return data in cls._zarr_v2_names - elif zarr_format == 3: - return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + return data in cls._zarr_v2_names + + @classmethod + def check_json_v3(cls, data: JSON) -> TypeGuard[str]: + return data == cls._zarr_v3_name - def check_value(self, data: object) -> bool: + def check_scalar(self, data: object) -> bool: return isinstance(data, ComplexLike) - def _cast_value_unsafe(self, data: object) -> TComplexScalar_co: - return self.to_dtype().type(data) # type: ignore[arg-type, return-value] + def _cast_scalar_unchecked(self, data: object) -> TComplexScalar_co: + return self.to_native_dtype().type(data) # type: ignore[arg-type, return-value] - def default_value(self) -> TComplexScalar_co: + def default_scalar(self) -> TComplexScalar_co: """ Get the default value, which is 0 cast to this dtype @@ -98,9 +101,9 @@ def default_value(self) -> TComplexScalar_co: Int scalar The default value. """ - return self._cast_value_unsafe(0) + return self._cast_scalar_unchecked(0) - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TComplexScalar_co: + def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> TComplexScalar_co: """ Read a JSON-serializable value as a numpy float. 
@@ -118,19 +121,19 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TComplexSca """ if zarr_format == 2: if check_json_complex_float_v2(data): - return self._cast_value_unsafe(complex_float_from_json_v2(data)) + return self._cast_scalar_unchecked(complex_float_from_json_v2(data)) raise TypeError( f"Invalid type: {data}. Expected a float or a special string encoding of a float." ) elif zarr_format == 3: if check_json_complex_float_v3(data): - return self._cast_value_unsafe(complex_float_from_json_v3(data)) + return self._cast_scalar_unchecked(complex_float_from_json_v3(data)) raise TypeError( f"Invalid type: {data}. Expected a float or a special string encoding of a float." ) raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> JSON: + def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> JSON: """ Convert an object to a JSON-serializable float. @@ -148,16 +151,16 @@ def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> JSON: each of which is encoding according to a zarr-format-specific encoding. 
""" if zarr_format == 2: - return complex_float_to_json_v2(self.cast_value(data)) + return complex_float_to_json_v2(self.cast_scalar(data)) elif zarr_format == 3: - return complex_float_to_json_v3(self.cast_value(data)) + return complex_float_to_json_v3(self.cast_scalar(data)) raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @dataclass(frozen=True, kw_only=True) class Complex64(BaseComplex[np.dtypes.Complex64DType, np.complex64]): dtype_cls = np.dtypes.Complex64DType - _zarr_v3_name = "complex64" + _zarr_v3_name: ClassVar[Literal["complex64"]] = "complex64" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">c8", " int: @dataclass(frozen=True, kw_only=True) class Complex128(BaseComplex[np.dtypes.Complex128DType, np.complex128], HasEndianness): dtype_cls = np.dtypes.Complex128DType - _zarr_v3_name = "complex128" + _zarr_v3_name: ClassVar[Literal["complex128"]] = "complex128" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">c16", " Self: + def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) - def to_dtype(self) -> TFloatDType_co: + def to_native_dtype(self) -> TFloatDType_co: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) # type: ignore[return-value] @@ -51,37 +51,39 @@ def to_json(self, zarr_format: ZarrFormat) -> str: The JSON-serializable representation of the wrapped data type """ if zarr_format == 2: - return self.to_dtype().str + return self.to_native_dtype().str elif zarr_format == 3: return self._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + def _from_json_unchecked( + cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat + ) -> Self: if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # 
type: ignore[arg-type] + return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: + def check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: """ Check that the input is a valid JSON representation of this data type. """ - if zarr_format == 2: - return data in cls._zarr_v2_names - elif zarr_format == 3: - return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + return data in cls._zarr_v2_names + + @classmethod + def check_json_v3(cls, data: JSON) -> TypeGuard[str]: + return data == cls._zarr_v3_name - def check_value(self, data: object) -> TypeGuard[FloatLike]: + def check_scalar(self, data: object) -> TypeGuard[FloatLike]: return isinstance(data, FloatLike) - def _cast_value_unsafe(self, data: object) -> TFloatScalar_co: - return self.to_dtype().type(data) # type: ignore[return-value, arg-type] + def _cast_scalar_unchecked(self, data: object) -> TFloatScalar_co: + return self.to_native_dtype().type(data) # type: ignore[return-value, arg-type] - def default_value(self) -> TFloatScalar_co: + def default_scalar(self) -> TFloatScalar_co: """ Get the default value, which is 0 cast to this dtype @@ -90,9 +92,9 @@ def default_value(self) -> TFloatScalar_co: Int scalar The default value. """ - return self._cast_value_unsafe(0) + return self._cast_scalar_unchecked(0) - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TFloatScalar_co: + def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> TFloatScalar_co: """ Read a JSON-serializable value as a numpy float. 
@@ -110,14 +112,14 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TFloatScala """ if zarr_format == 2: if check_json_float_v2(data): - return self._cast_value_unsafe(float_from_json_v2(data)) + return self._cast_scalar_unchecked(float_from_json_v2(data)) else: raise TypeError( f"Invalid type: {data}. Expected a float or a special string encoding of a float." ) elif zarr_format == 3: if check_json_float_v3(data): - return self._cast_value_unsafe(float_from_json_v3(data)) + return self._cast_scalar_unchecked(float_from_json_v3(data)) else: raise TypeError( f"Invalid type: {data}. Expected a float or a special string encoding of a float." @@ -125,7 +127,7 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TFloatScala else: raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> float | str: + def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> float | str: """ Convert an object to a JSON-serializable float. @@ -143,9 +145,9 @@ def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> float | str See the zarr specifications for details on the JSON encoding for floats. 
""" if zarr_format == 2: - return float_to_json_v2(self._cast_value_unsafe(data)) + return float_to_json_v2(self._cast_scalar_unchecked(data)) elif zarr_format == 3: - return float_to_json_v3(self._cast_value_unsafe(data)) + return float_to_json_v3(self._cast_scalar_unchecked(data)) else: raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover diff --git a/src/zarr/core/dtype/npy/int.py b/src/zarr/core/dtype/npy/int.py index db5869b202..92705917f9 100644 --- a/src/zarr/core/dtype/npy/int.py +++ b/src/zarr/core/dtype/npy/int.py @@ -1,5 +1,15 @@ from dataclasses import dataclass -from typing import ClassVar, Self, SupportsIndex, SupportsInt, TypeGuard, TypeVar, cast +from typing import ( + ClassVar, + Literal, + Self, + SupportsIndex, + SupportsInt, + TypeGuard, + TypeVar, + cast, + overload, +) import numpy as np @@ -11,7 +21,7 @@ endianness_from_numpy_str, endianness_to_numpy_str, ) -from zarr.core.dtype.wrapper import TBaseDType, ZDType +from zarr.core.dtype.wrapper import DTypeJSON_V2, DTypeJSON_V3, TBaseDType, ZDType _NumpyIntDType = ( np.dtypes.Int8DType @@ -36,44 +46,24 @@ class BaseInt(ZDType[TIntDType_co, TIntScalar_co], HasItemSize): # This attribute holds the possible zarr v2 JSON names for the data type _zarr_v2_names: ClassVar[tuple[str, ...]] - def to_json(self, zarr_format: ZarrFormat) -> str: - """ - Convert the wrapped data type to a JSON-serializable form. - - Parameters - ---------- - zarr_format : ZarrFormat - The zarr format version. 
- - Returns - ------- - str - The JSON-serializable representation of the wrapped data type - """ - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: + def check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: """ Check that the input is a valid JSON representation of this data type. """ - if zarr_format == 2: - return data in cls._zarr_v2_names - elif zarr_format == 3: - return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + return data in cls._zarr_v2_names - def check_value(self, data: object) -> TypeGuard[IntLike]: + @classmethod + def check_json_v3(cls, data: JSON) -> TypeGuard[str]: + return data == cls._zarr_v3_name + + def check_scalar(self, data: object) -> TypeGuard[IntLike]: return isinstance(data, IntLike) - def _cast_value_unsafe(self, data: object) -> TIntScalar_co: - return self.to_dtype().type(data) # type: ignore[return-value, arg-type] + def _cast_scalar_unchecked(self, data: object) -> TIntScalar_co: + return self.to_native_dtype().type(data) # type: ignore[return-value, arg-type] - def default_value(self) -> TIntScalar_co: + def default_scalar(self) -> TIntScalar_co: """ Get the default value, which is 0 cast to this dtype @@ -82,9 +72,9 @@ def default_value(self) -> TIntScalar_co: Int scalar The default value. """ - return self._cast_value_unsafe(0) + return self._cast_scalar_unchecked(0) - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TIntScalar_co: + def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> TIntScalar_co: """ Read a JSON-serializable value as a numpy int scalar. 
@@ -101,10 +91,10 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TIntScalar_ The numpy scalar. """ if check_json_int(data): - return self._cast_value_unsafe(data) + return self._cast_scalar_unchecked(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") - def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> int: + def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> int: """ Convert an object to JSON-serializable scalar. @@ -120,24 +110,52 @@ def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> int: int The JSON-serializable form of the scalar. """ - return int(self.cast_value(data)) + return int(self.cast_scalar(data)) @dataclass(frozen=True, kw_only=True) class Int8(BaseInt[np.dtypes.Int8DType, np.int8]): dtype_cls = np.dtypes.Int8DType - _zarr_v3_name = "int8" + _zarr_v3_name: ClassVar[Literal["int8"]] = "int8" _zarr_v2_names: ClassVar[tuple[str, ...]] = ("|i1",) + @overload + def to_json(self, zarr_format: Literal[2]) -> Literal["|i1"]: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> Literal["int8"]: ... + + def to_json(self, zarr_format: ZarrFormat) -> Literal["int8", "|i1"]: + """ + Convert the wrapped data type to a JSON-serializable form. + + Parameters + ---------- + zarr_format : ZarrFormat + The zarr format version. 
+ + Returns + ------- + str + The JSON-serializable representation of the wrapped data type + """ + if zarr_format == 2: + return self.to_native_dtype().str + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + @classmethod - def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: + def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: return cls() - def to_dtype(self: Self) -> np.dtypes.Int8DType: + def to_native_dtype(self: Self) -> np.dtypes.Int8DType: return self.dtype_cls() @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + def _from_json_unchecked( + cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat + ) -> Self: return cls() @property @@ -148,18 +166,46 @@ def item_size(self) -> int: @dataclass(frozen=True, kw_only=True) class UInt8(BaseInt[np.dtypes.UInt8DType, np.uint8]): dtype_cls = np.dtypes.UInt8DType - _zarr_v3_name = "uint8" + _zarr_v3_name: ClassVar[Literal["uint8"]] = "uint8" _zarr_v2_names: ClassVar[tuple[str, ...]] = ("|u1",) + @overload + def to_json(self, zarr_format: Literal[2]) -> Literal["|u1"]: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> Literal["uint8"]: ... + + def to_json(self, zarr_format: ZarrFormat) -> Literal["uint8", "|u1"]: + """ + Convert the wrapped data type to a JSON-serializable form. + + Parameters + ---------- + zarr_format : ZarrFormat + The zarr format version. 
+ + Returns + ------- + str + The JSON-serializable representation of the wrapped data type + """ + if zarr_format == 2: + return self.to_native_dtype().str + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + @classmethod - def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: + def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: return cls() - def to_dtype(self: Self) -> np.dtypes.UInt8DType: + def to_native_dtype(self: Self) -> np.dtypes.UInt8DType: return self.dtype_cls() @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + def _from_json_unchecked( + cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat + ) -> Self: return cls() @property @@ -170,23 +216,51 @@ def item_size(self) -> int: @dataclass(frozen=True, kw_only=True) class Int16(BaseInt[np.dtypes.Int16DType, np.int16], HasEndianness): dtype_cls = np.dtypes.Int16DType - _zarr_v3_name = "int16" + _zarr_v3_name: ClassVar[Literal["int16"]] = "int16" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i2", " Literal[">i2", " Literal["int16"]: ... 
+ + def to_json(self, zarr_format: ZarrFormat) -> Literal["int16", ">i2", " Self: + def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) - def to_dtype(self) -> np.dtypes.Int16DType: + def to_native_dtype(self) -> np.dtypes.Int16DType: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + def _from_json_unchecked( + cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat + ) -> Self: if zarr_format == 2: # This ensures that we get the endianness correct without annoying string parsing - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @@ -199,22 +273,50 @@ def item_size(self) -> int: @dataclass(frozen=True, kw_only=True) class UInt16(BaseInt[np.dtypes.UInt16DType, np.uint16], HasEndianness): dtype_cls = np.dtypes.UInt16DType - _zarr_v3_name = "uint16" + _zarr_v3_name: ClassVar[Literal["uint16"]] = "uint16" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u2", " Literal[">u2", " Literal["uint16"]: ... 
+ + def to_json(self, zarr_format: ZarrFormat) -> Literal["uint16", ">u2", " Self: + def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) - def to_dtype(self) -> np.dtypes.UInt16DType: + def to_native_dtype(self) -> np.dtypes.UInt16DType: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + def _from_json_unchecked( + cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat + ) -> Self: if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @@ -227,34 +329,64 @@ def item_size(self) -> int: @dataclass(frozen=True, kw_only=True) class Int32(BaseInt[np.dtypes.Int32DType, np.int32], HasEndianness): dtype_cls = np.dtypes.Int32DType - _zarr_v3_name = "int32" + _zarr_v3_name: ClassVar[Literal["int32"]] = "int32" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i4", " Literal[">i4", " Literal["int32"]: ... + + def to_json(self, zarr_format: ZarrFormat) -> Literal["int32", ">i4", " Self: + def from_native_dtype(cls: type[Self], dtype: TBaseDType) -> Self: # We override the base implementation to address a windows-specific, pre-numpy 2 issue where # ``np.dtype('i')`` is an instance of ``np.dtypes.IntDType`` that acts like `int32` instead of ``np.dtype('int32')`` # In this case, ``type(np.dtype('i')) == np.dtypes.Int32DType`` will evaluate to ``True``, # despite the two classes being different. 
Thus we will create an instance of `cls` with the # latter dtype, after pulling in the byte order of the input if dtype == np.dtypes.Int32DType(): - return cls._from_dtype_unsafe(np.dtypes.Int32DType().newbyteorder(dtype.byteorder)) + return cls._from_native_dtype_unsafe( + np.dtypes.Int32DType().newbyteorder(dtype.byteorder) + ) else: - return super().from_dtype(dtype) + return super().from_native_dtype(dtype) @classmethod - def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: + def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) - def to_dtype(self) -> np.dtypes.Int32DType: + def to_native_dtype(self) -> np.dtypes.Int32DType: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + def _from_json_unchecked( + cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat + ) -> Self: if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @@ -267,22 +399,48 @@ def item_size(self) -> int: @dataclass(frozen=True, kw_only=True) class UInt32(BaseInt[np.dtypes.UInt32DType, np.uint32], HasEndianness): dtype_cls = np.dtypes.UInt32DType - _zarr_v3_name = "uint32" + _zarr_v3_name: ClassVar[Literal["uint32"]] = "uint32" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u4", " Literal[">u4", " Literal["uint32"]: ... 
+ def to_json(self, zarr_format: ZarrFormat) -> Literal["uint32", ">u4", " Self: + def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) - def to_dtype(self) -> np.dtypes.UInt32DType: + def to_native_dtype(self) -> np.dtypes.UInt32DType: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + def _from_json_unchecked( + cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat + ) -> Self: if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @@ -295,22 +453,48 @@ def item_size(self) -> int: @dataclass(frozen=True, kw_only=True) class Int64(BaseInt[np.dtypes.Int64DType, np.int64], HasEndianness): dtype_cls = np.dtypes.Int64DType - _zarr_v3_name = "int64" + _zarr_v3_name: ClassVar[Literal["int64"]] = "int64" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i8", " Literal[">i8", " Literal["int64"]: ... 
+ def to_json(self, zarr_format: ZarrFormat) -> Literal["int64", ">i8", " Self: + def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) - def to_dtype(self) -> np.dtypes.Int64DType: + def to_native_dtype(self) -> np.dtypes.Int64DType: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + def _from_json_unchecked( + cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat + ) -> Self: if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @@ -323,22 +507,50 @@ def item_size(self) -> int: @dataclass(frozen=True, kw_only=True) class UInt64(BaseInt[np.dtypes.UInt64DType, np.uint64], HasEndianness): dtype_cls = np.dtypes.UInt64DType - _zarr_v3_name = "uint64" + _zarr_v3_name: ClassVar[Literal["uint64"]] = "uint64" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u8", " Literal[">u8", " Literal["uint64"]: ... 
+ + def to_json(self, zarr_format: ZarrFormat) -> Literal["uint64", ">u8", " Self: + def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) - def to_dtype(self) -> np.dtypes.UInt64DType: + def to_native_dtype(self) -> np.dtypes.UInt64DType: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + def _from_json_unchecked( + cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat + ) -> Self: if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover diff --git a/src/zarr/core/dtype/npy/sized.py b/src/zarr/core/dtype/npy/sized.py index eb2b39ad9a..69d6145ad4 100644 --- a/src/zarr/core/dtype/npy/sized.py +++ b/src/zarr/core/dtype/npy/sized.py @@ -2,11 +2,11 @@ import re from collections.abc import Sequence from dataclasses import dataclass -from typing import Any, Self, TypeGuard, cast +from typing import Any, ClassVar, Literal, Self, TypedDict, TypeGuard, cast, overload import numpy as np -from zarr.core.common import JSON, ZarrFormat +from zarr.core.common import JSON, NamedConfig, ZarrFormat from zarr.core.dtype.common import ( DataTypeValidationError, HasItemSize, @@ -18,7 +18,14 @@ bytes_to_json, check_json_str, ) -from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType +from zarr.core.dtype.wrapper import DTypeJSON_V2, DTypeJSON_V3, TBaseDType, TBaseScalar, ZDType + + +class FixedLengthBytesConfig(TypedDict): + length_bytes: int + + +FixedLengthBytesJSONV3 = NamedConfig[Literal["fixed_length_bytes"], FixedLengthBytesConfig] @dataclass(frozen=True, kw_only=True) @@ 
-27,49 +34,59 @@ class FixedLengthBytes(ZDType[np.dtypes.VoidDType[int], np.void], HasLength, Has # it cannot be used to create instances of the dtype # so we have to tell mypy to ignore this here dtype_cls = np.dtypes.VoidDType # type: ignore[assignment] - _zarr_v3_name = "numpy.fixed_length_bytes" + _zarr_v3_name: ClassVar[Literal["fixed_length_bytes"]] = "fixed_length_bytes" @classmethod - def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: + def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: return cls(length=dtype.itemsize) - def to_dtype(self) -> np.dtypes.VoidDType[int]: + def to_native_dtype(self) -> np.dtypes.VoidDType[int]: # Numpy does not allow creating a void type # by invoking np.dtypes.VoidDType directly return cast("np.dtypes.VoidDType[int]", np.dtype(f"V{self.length}")) @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: - if zarr_format == 2: - # Check that the dtype is |V1, |V2, ... - return isinstance(data, str) and re.match(r"^\|V\d+$", data) is not None - elif zarr_format == 3: - return ( - isinstance(data, dict) - and set(data.keys()) == {"name", "configuration"} - and data["name"] == cls._zarr_v3_name - and isinstance(data["configuration"], dict) - and set(data["configuration"].keys()) == {"length_bytes"} - ) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + def check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: + # Check that the dtype is |V1, |V2, ... 
+ return isinstance(data, str) and re.match(r"^\|V\d+$", data) is not None - def to_json(self, zarr_format: ZarrFormat) -> JSON: + @classmethod + def check_json_v3(cls, data: JSON) -> TypeGuard[FixedLengthBytesJSONV3]: + return ( + isinstance(data, dict) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == cls._zarr_v3_name + and isinstance(data["configuration"], dict) + and set(data["configuration"].keys()) == {"length_bytes"} + ) + + @overload + def to_json(self, zarr_format: Literal[2]) -> str: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> FixedLengthBytesJSONV3: ... + + def to_json(self, zarr_format: ZarrFormat) -> str | FixedLengthBytesJSONV3: if zarr_format == 2: - return self.to_dtype().str + return self.to_native_dtype().str elif zarr_format == 3: return {"name": self._zarr_v3_name, "configuration": {"length_bytes": self.length}} raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + def _from_json_unchecked( + cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat + ) -> Self: if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: - return cls(length=data["configuration"]["length_bytes"]) # type: ignore[arg-type, index, call-overload] + return cls(length=data["configuration"]["length_bytes"]) # type: ignore[index, call-overload] raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod - def check_dtype(cls: type[Self], dtype: TBaseDType) -> TypeGuard[np.dtypes.VoidDType[Any]]: + def check_native_dtype( + cls: type[Self], dtype: TBaseDType + ) -> TypeGuard[np.dtypes.VoidDType[Any]]: """ Numpy void dtype comes in two forms: * If the ``fields`` attribute is ``None``, then the dtype represents N raw bytes. 
@@ -89,22 +106,22 @@ def check_dtype(cls: type[Self], dtype: TBaseDType) -> TypeGuard[np.dtypes.VoidD """ return cls.dtype_cls is type(dtype) and dtype.fields is None # type: ignore[has-type] - def default_value(self) -> np.void: - return self.to_dtype().type(("\x00" * self.length).encode("ascii")) + def default_scalar(self) -> np.void: + return self.to_native_dtype().type(("\x00" * self.length).encode("ascii")) - def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: - return base64.standard_b64encode(self.cast_value(data).tobytes()).decode("ascii") + def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: + return base64.standard_b64encode(self.cast_scalar(data).tobytes()).decode("ascii") - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: + def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: if check_json_str(data): - return self.to_dtype().type(base64.standard_b64decode(data)) + return self.to_native_dtype().type(base64.standard_b64decode(data)) raise TypeError(f"Invalid type: {data}. Expected a string.") # pragma: no cover - def check_value(self, data: object) -> bool: + def check_scalar(self, data: object) -> bool: return isinstance(data, np.bytes_ | str | bytes | np.void) - def _cast_value_unsafe(self, data: object) -> np.void: - native_dtype = self.to_dtype() + def _cast_scalar_unchecked(self, data: object) -> np.void: + native_dtype = self.to_native_dtype() # Without the second argument, numpy will return a void scalar for dtype V1. # The second argument ensures that, if native_dtype is something like V10, # the result will actually be a V10 scalar. @@ -115,17 +132,18 @@ def item_size(self) -> int: return self.length +# TODO: tighten this up, get a v3 spec in place, handle endianness, etc. 
@dataclass(frozen=True, kw_only=True) class Structured(ZDType[np.dtypes.VoidDType[int], np.void], HasItemSize): dtype_cls = np.dtypes.VoidDType # type: ignore[assignment] _zarr_v3_name = "structured" fields: tuple[tuple[str, ZDType[TBaseDType, TBaseScalar]], ...] - def default_value(self) -> np.void: - return self._cast_value_unsafe(0) + def default_scalar(self) -> np.void: + return self._cast_scalar_unchecked(0) - def _cast_value_unsafe(self, data: object) -> np.void: - na_dtype = self.to_dtype() + def _cast_scalar_unchecked(self, data: object) -> np.void: + na_dtype = self.to_native_dtype() if isinstance(data, bytes): res = np.frombuffer(data, dtype=na_dtype)[0] elif isinstance(data, list | tuple): @@ -135,7 +153,7 @@ def _cast_value_unsafe(self, data: object) -> np.void: return cast("np.void", res) @classmethod - def check_dtype(cls, dtype: TBaseDType) -> TypeGuard[np.dtypes.VoidDType[int]]: + def check_native_dtype(cls, dtype: TBaseDType) -> TypeGuard[np.dtypes.VoidDType[int]]: """ Check that this dtype is a numpy structured dtype @@ -149,10 +167,10 @@ def check_dtype(cls, dtype: TBaseDType) -> TypeGuard[np.dtypes.VoidDType[int]]: TypeGuard[np.dtypes.VoidDType] True if the dtype matches, False otherwise. """ - return super().check_dtype(dtype) and dtype.fields is not None + return super().check_native_dtype(dtype) and dtype.fields is not None @classmethod - def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: + def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: from zarr.core.dtype import get_data_type_from_native_dtype fields: list[tuple[str, ZDType[TBaseDType, TBaseScalar]]] = [] @@ -168,7 +186,13 @@ def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: return cls(fields=tuple(fields)) - def to_json(self, zarr_format: ZarrFormat) -> JSON: + @overload + def to_json(self, zarr_format: Literal[2]) -> DTypeJSON_V2: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> DTypeJSON_V3: ... 
+ + def to_json(self, zarr_format: ZarrFormat) -> DTypeJSON_V3 | DTypeJSON_V2: fields = [ (f_name, f_dtype.to_json(zarr_format=zarr_format)) for f_name, f_dtype in self.fields ] @@ -178,90 +202,94 @@ def to_json(self, zarr_format: ZarrFormat) -> JSON: v3_unstable_dtype_warning(self) base_dict = {"name": self._zarr_v3_name} base_dict["configuration"] = {"fields": fields} # type: ignore[assignment] - return cast("JSON", base_dict) + return cast("DTypeJSON_V3", base_dict) raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod - def check_json( - cls, data: JSON, zarr_format: ZarrFormat - ) -> TypeGuard[dict[str, JSON] | list[Any]]: + def check_json_v2( + cls, data: JSON, *, object_codec_id: str | None = None + ) -> TypeGuard[list[object]]: # the actual JSON form is recursive and hard to annotate, so we give up and do - # list[Any] for now - if zarr_format == 2: - return ( - not isinstance(data, str) - and isinstance(data, Sequence) - and all( - not isinstance(field, str) and isinstance(field, Sequence) and len(field) == 2 - for field in data - ) - ) - elif zarr_format == 3: - return ( - isinstance(data, dict) - and "name" in data - and "configuration" in data - and isinstance(data["configuration"], dict) - and "fields" in data["configuration"] + # list[object] for now + + return ( + not isinstance(data, str) + and isinstance(data, Sequence) + and all( + not isinstance(field, str) and isinstance(field, Sequence) and len(field) == 2 + for field in data ) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + ) + + @classmethod + def check_json_v3( + cls, data: JSON + ) -> TypeGuard[NamedConfig[Literal["structured"], dict[str, Sequence[tuple[str, JSON]]]]]: + return ( + isinstance(data, dict) + and "name" in data + and data["name"] == cls._zarr_v3_name + and "configuration" in data + and isinstance(data["configuration"], dict) + and "fields" in data["configuration"] + ) @classmethod - def 
_from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - from zarr.core.dtype import get_data_type_from_json + def _from_json_unchecked( + cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat + ) -> Self: + # avoid circular import issues by importing these functions here + from zarr.core.dtype import get_data_type_from_json_v2, get_data_type_from_json_v3 # This is a horrible mess, because this data type is recursive - if cls.check_json(data, zarr_format=zarr_format): - if zarr_format == 2: + if zarr_format == 2: + if cls.check_json_v2(data): # type: ignore[arg-type] # structured dtypes are constructed directly from a list of lists + # note that we do not handle the object codec here! this will prevent structured + # dtypes from containing object dtypes. return cls( fields=tuple( # type: ignore[misc] - (f_name, get_data_type_from_json(f_dtype, zarr_format=zarr_format)) + (f_name, get_data_type_from_json_v2(f_dtype, object_codec_id=None)) # type: ignore[has-type] for f_name, f_dtype in data ) ) - elif zarr_format == 3: - if isinstance(data, dict) and "configuration" in data: - config = data["configuration"] - if isinstance(config, dict) and "fields" in config: - meta_fields = config["fields"] - fields = tuple( - (f_name, get_data_type_from_json(f_dtype, zarr_format=zarr_format)) - for f_name, f_dtype in meta_fields - ) - return cls(fields=fields) - else: - raise TypeError( - f"Invalid type: {data}. Expected a dictionary." - ) # pragma: no cover - else: - raise TypeError( - f"Invalid type: {data}. Expected a dictionary." 
- ) # pragma: no cover + else: + raise DataTypeValidationError(f"Invalid JSON representation of data type {cls}.") + elif zarr_format == 3: + if cls.check_json_v3(data): # type: ignore[arg-type] + config = data["configuration"] + meta_fields = config["fields"] + fields = tuple( + (f_name, get_data_type_from_json_v3(f_dtype)) for f_name, f_dtype in meta_fields + ) + else: + raise DataTypeValidationError(f"Invalid JSON representation of data type {cls}.") + else: raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - raise DataTypeValidationError(f"Invalid JSON representation of data type {cls}.") - def to_dtype(self) -> np.dtypes.VoidDType[int]: + return cls(fields=fields) + + def to_native_dtype(self) -> np.dtypes.VoidDType[int]: return cast( "np.dtypes.VoidDType[int]", - np.dtype([(key, dtype.to_dtype()) for (key, dtype) in self.fields]), + np.dtype([(key, dtype.to_native_dtype()) for (key, dtype) in self.fields]), ) - def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: - return bytes_to_json(self.cast_value(data).tobytes(), zarr_format) + def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: + return bytes_to_json(self.cast_scalar(data).tobytes(), zarr_format) - def check_value(self, data: object) -> bool: + def check_scalar(self, data: object) -> bool: # TODO: implement something here! return True - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: + def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: if check_json_str(data): as_bytes = bytes_from_json(data, zarr_format=zarr_format) - dtype = self.to_dtype() + dtype = self.to_native_dtype() return cast("np.void", np.array([as_bytes]).view(dtype)[0]) raise TypeError(f"Invalid type: {data}. 
Expected a string.") # pragma: no cover @property def item_size(self) -> int: # Lets have numpy do the arithmetic here - return self.to_dtype().itemsize + return self.to_native_dtype().itemsize diff --git a/src/zarr/core/dtype/npy/string.py b/src/zarr/core/dtype/npy/string.py index b5b86ca387..2299b7aab1 100644 --- a/src/zarr/core/dtype/npy/string.py +++ b/src/zarr/core/dtype/npy/string.py @@ -3,18 +3,19 @@ import base64 import re from dataclasses import dataclass -from typing import TYPE_CHECKING, ClassVar, Self, TypeGuard, cast +from typing import TYPE_CHECKING, ClassVar, Literal, Self, TypedDict, TypeGuard, cast, overload import numpy as np -from zarr.core.dtype.common import HasEndianness, HasItemSize, HasLength +from zarr.core.common import NamedConfig +from zarr.core.dtype.common import HasEndianness, HasItemSize, HasLength, HasObjectCodec from zarr.core.dtype.npy.common import ( EndiannessNumpy, check_json_str, endianness_from_numpy_str, endianness_to_numpy_str, ) -from zarr.core.dtype.wrapper import ZDType +from zarr.core.dtype.wrapper import DTypeJSON_V2, DTypeJSON_V3, ZDType if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat @@ -23,39 +24,53 @@ _NUMPY_SUPPORTS_VLEN_STRING = hasattr(np.dtypes, "StringDType") +class LengthBytesConfig(TypedDict): + length_bytes: int + + +# TDO: Fix this terrible name +FixedLengthASCIIJSONV3 = NamedConfig[Literal["fixed_length_ascii"], LengthBytesConfig] + + @dataclass(frozen=True, kw_only=True) class FixedLengthASCII(ZDType[np.dtypes.BytesDType[int], np.bytes_], HasLength, HasItemSize): dtype_cls = np.dtypes.BytesDType - _zarr_v3_name = "numpy.fixed_length_ascii" + _zarr_v3_name: ClassVar[Literal["fixed_length_ascii"]] = "fixed_length_ascii" @classmethod - def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: + def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: return cls(length=dtype.itemsize) - def to_dtype(self) -> np.dtypes.BytesDType[int]: + def to_native_dtype(self) -> 
np.dtypes.BytesDType[int]: return self.dtype_cls(self.length) @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: + def check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: """ Check that the input is a valid JSON representation of a numpy S dtype. """ - if zarr_format == 2: - # match |S1, |S2, etc - return isinstance(data, str) and re.match(r"^\|S\d+$", data) is not None - elif zarr_format == 3: - return ( - isinstance(data, dict) - and set(data.keys()) == {"name", "configuration"} - and data["name"] == cls._zarr_v3_name - and isinstance(data["configuration"], dict) - and "length_bytes" in data["configuration"] - ) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + # match |S1, |S2, etc + return isinstance(data, str) and re.match(r"^\|S\d+$", data) is not None + + @classmethod + def check_json_v3(cls, data: JSON) -> TypeGuard[FixedLengthASCIIJSONV3]: + return ( + isinstance(data, dict) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == cls._zarr_v3_name + and isinstance(data["configuration"], dict) + and "length_bytes" in data["configuration"] + ) + + @overload + def to_json(self, zarr_format: Literal[2]) -> str: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> FixedLengthASCIIJSONV3: ... 
- def to_json(self, zarr_format: ZarrFormat) -> JSON: + def to_json(self, zarr_format: ZarrFormat) -> str | FixedLengthASCIIJSONV3: if zarr_format == 2: - return self.to_dtype().str + return self.to_native_dtype().str elif zarr_format == 3: return { "name": self._zarr_v3_name, @@ -64,29 +79,31 @@ def to_json(self, zarr_format: ZarrFormat) -> JSON: raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + def _from_json_unchecked( + cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat + ) -> Self: if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: - return cls(length=data["configuration"]["length_bytes"]) # type: ignore[arg-type, index, call-overload] + return cls(length=data["configuration"]["length_bytes"]) # type: ignore[index, call-overload] raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def default_value(self) -> np.bytes_: + def default_scalar(self) -> np.bytes_: return np.bytes_(b"") - def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: + def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: return base64.standard_b64encode(data).decode("ascii") # type: ignore[arg-type] - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: + def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: if check_json_str(data): - return self.to_dtype().type(base64.standard_b64decode(data.encode("ascii"))) + return self.to_native_dtype().type(base64.standard_b64decode(data.encode("ascii"))) raise TypeError(f"Invalid type: {data}. 
Expected a string.") # pragma: no cover - def check_value(self, data: object) -> bool: + def check_scalar(self, data: object) -> bool: # this is generous for backwards compatibility return isinstance(data, np.bytes_ | str | bytes | int) - def _cast_value_unsafe(self, data: object) -> np.bytes_: + def _cast_scalar_unchecked(self, data: object) -> np.bytes_: # We explicitly truncate the result because of the following numpy behavior: # >>> x = np.dtype('S3').type('hello world') # >>> x @@ -94,56 +111,68 @@ def _cast_value_unsafe(self, data: object) -> np.bytes_: # >>> x.dtype # dtype('S11') - return self.to_dtype().type(data[: self.length]) # type: ignore[index] + if isinstance(data, int): + return self.to_native_dtype().type(str(data)[: self.length]) + else: + return self.to_native_dtype().type(data[: self.length]) # type: ignore[index] @property def item_size(self) -> int: return self.length +# TODO: Fix this terrible name +FixedLengthUTF32JSONV3 = NamedConfig[Literal["fixed_length_utf32"], LengthBytesConfig] + + @dataclass(frozen=True, kw_only=True) class FixedLengthUTF32( ZDType[np.dtypes.StrDType[int], np.str_], HasEndianness, HasLength, HasItemSize ): dtype_cls = np.dtypes.StrDType - _zarr_v3_name = "numpy.fixed_length_utf32" + _zarr_v3_name: ClassVar[Literal["fixed_length_utf32"]] = "fixed_length_utf32" code_point_bytes: ClassVar[int] = 4 # utf32 is 4 bytes per code point @classmethod - def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: + def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls( length=dtype.itemsize // (cls.code_point_bytes), endianness=endianness_from_numpy_str(byte_order), ) - def to_dtype(self) -> np.dtypes.StrDType[int]: + def to_native_dtype(self) -> np.dtypes.StrDType[int]: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls(self.length).newbyteorder(byte_order) @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> 
TypeGuard[JSON]: + def check_json_v2(cls, data: JSON, object_codec_id: str | None = None) -> TypeGuard[str]: """ Check that the input is a valid JSON representation of a numpy S dtype. """ - if zarr_format == 2: - # match >U1, <]U\d+$", data) is not None - elif zarr_format == 3: - return ( - isinstance(data, dict) - and set(data.keys()) == {"name", "configuration"} - and data["name"] == cls._zarr_v3_name - and "configuration" in data - and isinstance(data["configuration"], dict) - and set(data["configuration"].keys()) == {"length_bytes"} - and isinstance(data["configuration"]["length_bytes"], int) - ) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + return isinstance(data, str) and re.match(r"^[><]U\d+$", data) is not None + + @classmethod + def check_json_v3(cls, data: JSON) -> TypeGuard[FixedLengthUTF32JSONV3]: + return ( + isinstance(data, dict) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == cls._zarr_v3_name + and "configuration" in data + and isinstance(data["configuration"], dict) + and set(data["configuration"].keys()) == {"length_bytes"} + and isinstance(data["configuration"]["length_bytes"], int) + ) + + @overload + def to_json(self, zarr_format: Literal[2]) -> str: ... - def to_json(self, zarr_format: ZarrFormat) -> JSON: + @overload + def to_json(self, zarr_format: Literal[3]) -> FixedLengthUTF32JSONV3: ... 
+ + def to_json(self, zarr_format: ZarrFormat) -> str | FixedLengthUTF32JSONV3: if zarr_format == 2: - return self.to_dtype().str + return self.to_native_dtype().str elif zarr_format == 3: return { "name": self._zarr_v3_name, @@ -152,29 +181,31 @@ def to_json(self, zarr_format: ZarrFormat) -> JSON: raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + def _from_json_unchecked( + cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat + ) -> Self: if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: - return cls(length=data["configuration"]["length_bytes"] // cls.code_point_bytes) # type: ignore[arg-type, index, call-overload, operator] + return cls(length=data["configuration"]["length_bytes"] // cls.code_point_bytes) # type: ignore[index, call-overload] raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def default_value(self) -> np.str_: + def default_scalar(self) -> np.str_: return np.str_("") - def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: + def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: return str(data) - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_: + def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_: if check_json_str(data): - return self.to_dtype().type(data) + return self.to_native_dtype().type(data) raise TypeError(f"Invalid type: {data}. 
Expected a string.") # pragma: no cover - def check_value(self, data: object) -> bool: + def check_scalar(self, data: object) -> bool: # this is generous for backwards compatibility return isinstance(data, str | np.str_ | bytes | int) - def _cast_value_unsafe(self, data: object) -> np.str_: + def _cast_scalar_unchecked(self, data: object) -> np.str_: # We explicitly truncate the result because of the following numpy behavior: # >>> x = np.dtype('U3').type('hello world') # >>> x @@ -182,7 +213,10 @@ def _cast_value_unsafe(self, data: object) -> np.str_: # >>> x.dtype # dtype('U11') - return self.to_dtype().type(data[: self.length]) # type: ignore[index] + if isinstance(data, int): + return self.to_native_dtype().type(str(data)[: self.length]) + else: + return self.to_native_dtype().type(data[: self.length]) # type: ignore[index] @property def item_size(self) -> int: @@ -192,32 +226,38 @@ def item_size(self) -> int: if _NUMPY_SUPPORTS_VLEN_STRING: @dataclass(frozen=True, kw_only=True) - class VariableLengthString(ZDType[np.dtypes.StringDType, str]): # type: ignore[type-var] + class VariableLengthString(ZDType[np.dtypes.StringDType, str], HasObjectCodec): # type: ignore[type-var] dtype_cls = np.dtypes.StringDType - _zarr_v3_name = "numpy.variable_length_utf8" + _zarr_v3_name: ClassVar[Literal["variable_length_utf8"]] = "variable_length_utf8" + object_codec_id = "vlen-utf8" @classmethod - def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: + def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: return cls() - def to_dtype(self) -> np.dtypes.StringDType: + def to_native_dtype(self) -> np.dtypes.StringDType: return self.dtype_cls() @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: + def check_json_v2( + cls, data: JSON, *, object_codec_id: str | None = None + ) -> TypeGuard[Literal["|O"]]: """ - Check that the input is a valid JSON representation of a numpy string dtype. 
+ Check that the input is a valid JSON representation of a numpy O dtype, and that the + object codec id is appropriate for variable-length UTF-8 strings. """ - if zarr_format == 2: - # TODO: take the entire metadata document in here, and - # check the compressors / filters for vlen-utf8 - # Note that we are checking for the object dtype name. - return data == "|O" - elif zarr_format == 3: - return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + return data == "|O" and object_codec_id == cls.object_codec_id + + @classmethod + def check_json_v3(cls, data: JSON) -> TypeGuard[Literal["variable_length_utf8"]]: + return data == cls._zarr_v3_name - def to_json(self, zarr_format: ZarrFormat) -> JSON: + @overload + def to_json(self, zarr_format: Literal[2]) -> Literal["|O"]: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> Literal["variable_length_utf8"]: ... + + def to_json(self, zarr_format: ZarrFormat) -> Literal["|O", "variable_length_utf8"]: if zarr_format == 2: # Note: unlike many other numpy data types, we don't serialize the .str attribute # of the data type to JSON. 
This is because Zarr was using `|O` for strings before the @@ -229,71 +269,83 @@ def to_json(self, zarr_format: ZarrFormat) -> JSON: raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + def _from_json_unchecked( + cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat + ) -> Self: return cls() - def default_value(self) -> str: + def default_scalar(self) -> str: return "" - def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: + def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: return str(data) - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: + def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> str: if not check_json_str(data): raise TypeError(f"Invalid type: {data}. Expected a string.") return data - def check_value(self, data: object) -> bool: + def check_scalar(self, data: object) -> bool: return isinstance(data, str) - def _cast_value_unsafe(self, data: object) -> str: + def _cast_scalar_unchecked(self, data: object) -> str: return str(data) else: # Numpy pre-2 does not have a variable length string dtype, so we use the Object dtype instead. 
@dataclass(frozen=True, kw_only=True) - class VariableLengthString(ZDType[np.dtypes.ObjectDType, str]): # type: ignore[no-redef] + class VariableLengthString(ZDType[np.dtypes.ObjectDType, str], HasObjectCodec): # type: ignore[no-redef] dtype_cls = np.dtypes.ObjectDType - _zarr_v3_name = "numpy.variable_length_utf8" + _zarr_v3_name: ClassVar[Literal["variable_length_utf8"]] = "variable_length_utf8" + object_codec_id = "vlen-utf8" @classmethod - def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: + def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: return cls() - def to_dtype(self) -> np.dtypes.ObjectDType: + def to_native_dtype(self) -> np.dtypes.ObjectDType: return self.dtype_cls() @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: + def check_json_v2( + cls, data: JSON, *, object_codec_id: str | None = None + ) -> TypeGuard[Literal["|O"]]: """ - Check that the input is a valid JSON representation of a numpy O dtype. + Check that the input is a valid JSON representation of a numpy O dtype, and that the + object codec id is appropriate for variable-length UTF-8 strings. """ - if zarr_format == 2: - # TODO: take the entire metadata document in here, and - # check the compressors / filters for vlen-utf8 - return data == "|O" - elif zarr_format == 3: - return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + return data == "|O" and object_codec_id == cls.object_codec_id - def to_json(self, zarr_format: ZarrFormat) -> JSON: + @classmethod + def check_json_v3(cls, data: JSON) -> TypeGuard[Literal["variable_length_utf8"]]: + return data == cls._zarr_v3_name + + @overload + def to_json(self, zarr_format: Literal[2]) -> Literal["|O"]: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> Literal["variable_length_utf8"]: ... 
+ + def to_json(self, zarr_format: ZarrFormat) -> Literal["|O", "variable_length_utf8"]: if zarr_format == 2: - return self.to_dtype().str + return "|O" elif zarr_format == 3: return self._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + def _from_json_unchecked( + cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat + ) -> Self: return cls() - def default_value(self) -> str: + def default_scalar(self) -> str: return "" - def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: + def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: return data # type: ignore[return-value] - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: + def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> str: """ Strings pass through """ @@ -301,8 +353,8 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: raise TypeError(f"Invalid type: {data}. 
Expected a string.") return data - def check_value(self, data: object) -> bool: + def check_scalar(self, data: object) -> bool: return isinstance(data, str) - def _cast_value_unsafe(self, data: object) -> str: + def _cast_scalar_unchecked(self, data: object) -> str: return str(data) diff --git a/src/zarr/core/dtype/npy/time.py b/src/zarr/core/dtype/npy/time.py index 1c0e0d715c..4c5ce45442 100644 --- a/src/zarr/core/dtype/npy/time.py +++ b/src/zarr/core/dtype/npy/time.py @@ -1,11 +1,9 @@ from __future__ import annotations -from collections.abc import Mapping from dataclasses import dataclass from typing import ( TYPE_CHECKING, ClassVar, - Generic, Literal, Self, TypedDict, @@ -13,10 +11,12 @@ TypeVar, cast, get_args, + overload, ) import numpy as np +from zarr.core.common import NamedConfig from zarr.core.dtype.common import HasEndianness, HasItemSize from zarr.core.dtype.npy.common import ( DateTimeUnit, @@ -25,7 +25,7 @@ endianness_from_numpy_str, endianness_to_numpy_str, ) -from zarr.core.dtype.wrapper import TBaseDType, ZDType +from zarr.core.dtype.wrapper import DTypeJSON_V2, DTypeJSON_V3, TBaseDType, ZDType if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat @@ -79,23 +79,14 @@ def datetimelike_to_int(data: np.datetime64 | np.timedelta64) -> int: ) _BaseTimeScalar = TypeVar("_BaseTimeScalar", bound=np.timedelta64 | np.datetime64) -TName = TypeVar("TName", bound=str) -TConfig = TypeVar("TConfig", bound=Mapping[str, object]) - - -class NamedConfig(TypedDict, Generic[TName, TConfig]): - name: TName - configuration: TConfig - class TimeConfig(TypedDict): unit: DateTimeUnit interval: int -# aspirational -DateTime64MetaParams = NamedConfig[Literal["numpy.datetime64"], TimeConfig] -TimeDelta64MetaParams = NamedConfig[Literal["numpy.timedelta64"], TimeConfig] +DateTime64JSONV3 = NamedConfig[Literal["numpy.datetime64"], TimeConfig] +TimeDelta64JSONV3 = NamedConfig[Literal["numpy.timedelta64"], TimeConfig] @dataclass(frozen=True, kw_only=True, slots=True) @@ 
-117,7 +108,7 @@ def __post_init__(self) -> None: raise ValueError(f"unit must be one of {get_args(DateTimeUnit)}, got {self.unit!r}.") @classmethod - def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: + def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: unit, scale_factor = np.datetime_data(dtype.name) unit = cast("DateTimeUnit", unit) byteorder = cast("EndiannessNumpy", dtype.byteorder) @@ -125,7 +116,7 @@ def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: unit=unit, scale_factor=scale_factor, endianness=endianness_from_numpy_str(byteorder) ) - def to_dtype(self) -> _BaseTimeDType_co: + def to_native_dtype(self) -> _BaseTimeDType_co: # Numpy does not allow creating datetime64 or timedelta64 via # np.dtypes.{dtype_name}() # so we use np.dtype with a formatted string. @@ -133,32 +124,42 @@ def to_dtype(self) -> _BaseTimeDType_co: return np.dtype(dtype_string).newbyteorder(endianness_to_numpy_str(self.endianness)) # type: ignore[return-value] @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + def _from_json_unchecked( + cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat + ) -> Self: if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: unit = data["configuration"]["unit"] # type: ignore[index, call-overload] scale_factor = data["configuration"]["scale_factor"] # type: ignore[index, call-overload] - return cls(unit=unit, scale_factor=scale_factor) # type: ignore[arg-type] + return cls(unit=unit, scale_factor=scale_factor) raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def to_json(self, zarr_format: ZarrFormat) -> JSON: + @overload + def to_json(self, zarr_format: Literal[2]) -> str: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> DateTime64JSONV3 | TimeDelta64JSONV3: ... 
+ + def to_json(self, zarr_format: ZarrFormat) -> str | DateTime64JSONV3 | TimeDelta64JSONV3: if zarr_format == 2: - return cast("str", self.to_dtype().str) + return cast("str", self.to_native_dtype().str) elif zarr_format == 3: - return { - "name": self._zarr_v3_name, - "configuration": {"unit": self.unit, "scale_factor": self.scale_factor}, - } + return cast( + "DateTime64JSONV3 | TimeDelta64JSONV3", + { + "name": self._zarr_v3_name, + "configuration": {"unit": self.unit, "scale_factor": self.scale_factor}, + }, + ) raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> int: + def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> int: return datetimelike_to_int(data) # type: ignore[arg-type] - def check_value(self, data: object) -> bool: + def check_scalar(self, data: object) -> bool: # TODO: decide which values we should accept for datetimes. try: - np.array([data], dtype=self.to_dtype()) + np.array([data], dtype=self.to_native_dtype()) return True # noqa: TRY300 except ValueError: return False @@ -178,91 +179,90 @@ class TimeDelta64(TimeDTypeBase[np.dtypes.TimeDelta64DType, np.timedelta64], Has """ dtype_cls = np.dtypes.TimeDelta64DType # type: ignore[assignment] - _zarr_v3_name = "numpy.timedelta64" + _zarr_v3_name: ClassVar[Literal["numpy.timedelta64"]] = "numpy.timedelta64" _zarr_v2_names = (">m8", " np.timedelta64: + def default_scalar(self) -> np.timedelta64: return np.timedelta64("NaT") - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.timedelta64: + def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.timedelta64: if check_json_int(data) or data == "NaT": - return self.to_dtype().type(data, f"{self.scale_factor}{self.unit}") # type: ignore[arg-type] + return self.to_native_dtype().type(data, f"{self.scale_factor}{self.unit}") # type: ignore[arg-type] raise TypeError(f"Invalid type: {data}. 
Expected an integer.") # pragma: no cover - def _cast_value_unsafe(self, data: object) -> np.timedelta64: - return self.to_dtype().type(data) # type: ignore[arg-type] + def _cast_scalar_unchecked(self, data: object) -> np.timedelta64: + return self.to_native_dtype().type(data) # type: ignore[arg-type] @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: - if zarr_format == 2: - # match m[M], etc - # consider making this a standalone function - if not isinstance(data, str): - return False - if not data.startswith(cls._zarr_v2_names): - return False - if len(data) == 3: - # no unit, and - # we already checked that this string is either m8 - return True - else: - return data[4:-1].endswith(get_args(DateTimeUnit)) and data[-1] == "]" - elif zarr_format == 3: - return ( - isinstance(data, dict) - and set(data.keys()) == {"name", "configuration"} - and data["name"] == cls._zarr_v3_name - and set(data.keys()) == {"name", "configuration"} - and isinstance(data["configuration"], dict) - and set(data["configuration"].keys()) == {"unit", "scale_factor"} - ) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + def check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: + # match m[M], etc + # consider making this a standalone function + if not isinstance(data, str): + return False + if not data.startswith(cls._zarr_v2_names): + return False + if len(data) == 3: + # no unit, and + # we already checked that this string is either m8 + return True + else: + return data[4:-1].endswith(get_args(DateTimeUnit)) and data[-1] == "]" + + @classmethod + def check_json_v3(cls, data: JSON) -> TypeGuard[DateTime64JSONV3]: + return ( + isinstance(data, dict) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == cls._zarr_v3_name + and isinstance(data["configuration"], dict) + and set(data["configuration"].keys()) == {"unit", "scale_factor"} + ) 
@dataclass(frozen=True, kw_only=True, slots=True) class DateTime64(TimeDTypeBase[np.dtypes.DateTime64DType, np.datetime64], HasEndianness): dtype_cls = np.dtypes.DateTime64DType # type: ignore[assignment] - _zarr_v3_name = "numpy.datetime64" + _zarr_v3_name: ClassVar[Literal["numpy.datetime64"]] = "numpy.datetime64" _zarr_v2_names = (">M8", " np.datetime64: + def default_scalar(self) -> np.datetime64: return np.datetime64("NaT") - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.datetime64: + def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.datetime64: if check_json_int(data) or data == "NaT": - return self.to_dtype().type(data, f"{self.scale_factor}{self.unit}") # type: ignore[arg-type] + return self.to_native_dtype().type(data, f"{self.scale_factor}{self.unit}") # type: ignore[arg-type] raise TypeError(f"Invalid type: {data}. Expected an integer.") # pragma: no cover - def _cast_value_unsafe(self, data: object) -> np.datetime64: - return self.to_dtype().type(data, f"{self.scale_factor}{self.unit}") # type: ignore[no-any-return, call-overload] + def _cast_scalar_unchecked(self, data: object) -> np.datetime64: + return self.to_native_dtype().type(data, f"{self.scale_factor}{self.unit}") # type: ignore[no-any-return, call-overload] @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: - if zarr_format == 2: - # match M[M], etc - # consider making this a standalone function - if not isinstance(data, str): - return False - if not data.startswith(cls._zarr_v2_names): - return False - if len(data) == 3: - # no unit, and - # we already checked that this string is either M8 - return True - else: - return data[4:-1].endswith(get_args(DateTimeUnit)) and data[-1] == "]" - elif zarr_format == 3: - return ( - isinstance(data, dict) - and set(data.keys()) == {"name", "configuration"} - and data["name"] == cls._zarr_v3_name - and set(data["configuration"].keys()) == {"unit", "scale_factor"} - 
and data["configuration"]["unit"] in get_args(DateTimeUnit) - ) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + def check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: + # match M[M], etc + # consider making this a standalone function + if not isinstance(data, str): + return False + if not data.startswith(cls._zarr_v2_names): + return False + if len(data) == 3: + # no unit, and + # we already checked that this string is either M8 + return True + else: + return data[4:-1].endswith(get_args(DateTimeUnit)) and data[-1] == "]" + + @classmethod + def check_json_v3(cls, data: JSON) -> TypeGuard[DateTime64JSONV3]: + return ( + isinstance(data, dict) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == cls._zarr_v3_name + and isinstance(data["configuration"], dict) + and set(data["configuration"].keys()) == {"unit", "scale_factor"} + ) diff --git a/src/zarr/core/dtype/registry.py b/src/zarr/core/dtype/registry.py index 047f908ac6..0423f69dbe 100644 --- a/src/zarr/core/dtype/registry.py +++ b/src/zarr/core/dtype/registry.py @@ -1,5 +1,6 @@ from __future__ import annotations +import contextlib from dataclasses import dataclass, field from typing import TYPE_CHECKING, Self @@ -10,7 +11,7 @@ if TYPE_CHECKING: from importlib.metadata import EntryPoint - from zarr.core.common import JSON, ZarrFormat + from zarr.core.common import JSON from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType @@ -22,11 +23,12 @@ class DataTypeRegistry: contents: dict[str, type[ZDType[TBaseDType, TBaseScalar]]] = field( default_factory=dict, init=False ) + lazy_load_list: list[EntryPoint] = field(default_factory=list, init=False) def lazy_load(self) -> None: for e in self.lazy_load_list: - self.register(e.name, e.load()) + self.register(e.load()._zarr_v3_name, e.load()) self.lazy_load_list.clear() @@ -35,14 +37,20 @@ def register(self: Self, key: str, cls: type[ZDType[TBaseDType, 
TBaseScalar]]) - if key not in self.contents or self.contents[key] != cls: self.contents[key] = cls + def unregister(self, key: str) -> None: + """Unregister a data type by its key.""" + if key in self.contents: + del self.contents[key] + else: + raise KeyError(f"Data type '{key}' not found in registry.") + def get(self, key: str) -> type[ZDType[TBaseDType, TBaseScalar]]: return self.contents[key] def match_dtype(self, dtype: TBaseDType) -> ZDType[TBaseDType, TBaseScalar]: - self.lazy_load() if dtype == np.dtype("O"): msg = ( - "Data type resolution failed. " + f"Zarr data type resolution from {dtype} failed. " 'Attempted to resolve a zarr data type from a numpy "Object" data type, which is ' 'ambiguous, as multiple zarr data types can be represented by the numpy "Object" ' "data type. " @@ -51,18 +59,41 @@ def match_dtype(self, dtype: TBaseDType) -> ZDType[TBaseDType, TBaseScalar]: "data type, see xxxxxxxxxxx" ) raise ValueError(msg) + matched: list[ZDType[TBaseDType, TBaseScalar]] = [] + for val in self.contents.values(): + with contextlib.suppress(DataTypeValidationError): + matched.append(val.from_native_dtype(dtype)) + if len(matched) == 1: + return matched[0] + elif len(matched) > 1: + msg = ( + f"Zarr data type resolution from {dtype} failed. " + f"Multiple data type wrappers found that match dtype '{dtype}': {matched}. " + "You should unregister one of these data types, or avoid Zarr data type inference " + "entirely by providing a specific Zarr data type when creating your array." + "For more information, see xxxxxxxxxxxxxxxxxx" + ) + raise ValueError(msg) + raise ValueError(f"No data type wrapper found that matches dtype '{dtype}'") + + def match_json_v2( + self, data: JSON, *, object_codec_id: str | None = None + ) -> ZDType[TBaseDType, TBaseScalar]: + # The dtype field in zarr v2 JSON metadata is not unique across different distinct data types. + # Specifically, multiple distinct data types all use the "|O" data type representation. 
+ # These must be disambiguated by the presence of an "object codec", which is a codec + # like variable-length utf8 encoding for strings. for val in self.contents.values(): try: - return val.from_dtype(dtype) + return val.from_json_v2(data, object_codec_id=object_codec_id) except DataTypeValidationError: pass - raise ValueError(f"No data type wrapper found that matches dtype '{dtype}'") + raise ValueError(f"No data type wrapper found that matches {data}") - def match_json(self, data: JSON, zarr_format: ZarrFormat) -> ZDType[TBaseDType, TBaseScalar]: - self.lazy_load() + def match_json_v3(self, data: JSON) -> ZDType[TBaseDType, TBaseScalar]: for val in self.contents.values(): try: - return val.from_json(data, zarr_format=zarr_format) + return val.from_json_v3(data) except DataTypeValidationError: pass raise ValueError(f"No data type wrapper found that matches {data}") diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index bd9686afc1..c9b23707e8 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -23,8 +23,18 @@ from __future__ import annotations from abc import ABC, abstractmethod +from collections.abc import Mapping, Sequence from dataclasses import dataclass -from typing import TYPE_CHECKING, ClassVar, Generic, Self, TypeGuard, TypeVar +from typing import ( + TYPE_CHECKING, + ClassVar, + Generic, + Literal, + Self, + TypeGuard, + TypeVar, + overload, +) import numpy as np @@ -46,6 +56,10 @@ TScalar_co = TypeVar("TScalar_co", bound=TBaseScalar, covariant=True) TDType_co = TypeVar("TDType_co", bound=TBaseDType, covariant=True) +# These types should include all JSON-serializable types that can be used to represent a data type. 
+DTypeJSON_V2 = str | Sequence[object] +DTypeJSON_V3 = str | Mapping[str, object] + @dataclass(frozen=True, kw_only=True, slots=True) class ZDType(Generic[TDType_co, TScalar_co], ABC): @@ -70,7 +84,7 @@ class ZDType(Generic[TDType_co, TScalar_co], ABC): _zarr_v3_name: ClassVar[str] @classmethod - def check_dtype(cls: type[Self], dtype: TBaseDType) -> TypeGuard[TDType_co]: + def check_native_dtype(cls: type[Self], dtype: TBaseDType) -> TypeGuard[TDType_co]: """ Check that a data type matches the dtype_cls class attribute. Used as a type guard. @@ -87,7 +101,7 @@ def check_dtype(cls: type[Self], dtype: TBaseDType) -> TypeGuard[TDType_co]: return type(dtype) is cls.dtype_cls @classmethod - def from_dtype(cls: type[Self], dtype: TBaseDType) -> Self: + def from_native_dtype(cls: type[Self], dtype: TBaseDType) -> Self: """ Wrap a dtype object. @@ -106,15 +120,15 @@ def from_dtype(cls: type[Self], dtype: TBaseDType) -> Self: TypeError If the dtype does not match the dtype_cls class attribute. """ - if cls.check_dtype(dtype): - return cls._from_dtype_unsafe(dtype) + if cls.check_native_dtype(dtype): + return cls._from_native_dtype_unsafe(dtype) raise DataTypeValidationError( f"Invalid dtype: {dtype}. Expected an instance of {cls.dtype_cls}." ) @classmethod @abstractmethod - def _from_dtype_unsafe(cls: type[Self], dtype: TBaseDType) -> Self: + def _from_native_dtype_unsafe(cls: type[Self], dtype: TBaseDType) -> Self: """ Wrap a native dtype without checking. @@ -131,7 +145,7 @@ def _from_dtype_unsafe(cls: type[Self], dtype: TBaseDType) -> Self: ... @abstractmethod - def to_dtype(self: Self) -> TDType_co: + def to_native_dtype(self: Self) -> TDType_co: """ Return an instance of the wrapped dtype. @@ -142,10 +156,10 @@ def to_dtype(self: Self) -> TDType_co: """ ... - def cast_value(self, data: object) -> TScalar_co: + def cast_scalar(self, data: object) -> TScalar_co: """ - Cast a value to the wrapped scalar type. The type is first checked for compatibility. 
If it's - incompatible with the associated scalar type, a ``TypeError`` will be raised. + Cast a scalar to the wrapped scalar type. The type is first checked for compatibility. If + it's incompatible with the associated scalar type, a ``TypeError`` will be raised. Parameters ---------- @@ -157,8 +171,8 @@ def cast_value(self, data: object) -> TScalar_co: TScalar The cast value. """ - if self.check_value(data): - return self._cast_value_unsafe(data) + if self.check_scalar(data): + return self._cast_scalar_unchecked(data) msg = ( f"The value {data} failed a type check. " f"It cannot be safely cast to a scalar compatible with {self.dtype_cls}. " @@ -168,9 +182,9 @@ def cast_value(self, data: object) -> TScalar_co: raise TypeError(msg) @abstractmethod - def check_value(self, data: object) -> bool: + def check_scalar(self, data: object) -> bool: """ - Check that a value is a valid value for the wrapped data type. + Check that a scalar is a valid value for the wrapped data type. Parameters ---------- @@ -185,9 +199,9 @@ def check_value(self, data: object) -> bool: ... @abstractmethod - def _cast_value_unsafe(self, data: object) -> TScalar_co: + def _cast_scalar_unchecked(self, data: object) -> TScalar_co: """ - Cast a value to the wrapped data type. This method should not perform any input validation. + Cast a scalar to the wrapped data type. This method should not perform any input validation. Parameters ---------- @@ -202,11 +216,12 @@ def _cast_value_unsafe(self, data: object) -> TScalar_co: ... @abstractmethod - def default_value(self) -> TScalar_co: + def default_scalar(self) -> TScalar_co: """ - Get the default value for the wrapped data type. This is a method, rather than an attribute, + Get the default scalar value for the wrapped data type. This is a method, rather than an attribute, because the default value for some data types may depend on parameters that are not known - until a concrete data type is wrapped. + until a concrete data type is wrapped. 
For example, data types parametrized by a length like + fixed-length strings or bytes will generate scalars consistent with that length. Returns ------- @@ -217,7 +232,35 @@ def default_value(self) -> TScalar_co: @classmethod @abstractmethod - def check_json(cls: type[Self], data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: + def check_json_v2( + cls: type[Self], data: JSON, *, object_codec_id: str | None = None + ) -> TypeGuard[DTypeJSON_V2]: + """ + Check that a JSON representation of a data type is consistent with the ZDType class. + + Parameters + ---------- + data : JSON + The JSON representation of the data type. + + object_codec_id : str | None + The object codec ID, if applicable. Object codecs are specific numcodecs codecs that + zarr-python 2.x used to serialize numpy "Object" scalars. For example, a dtype field set + to "|O" with an object codec ID of "vlen-utf8" indicates that the data type is a + variable-length string. + + Zarr V3 has no such logic, so this parameter is only used for Zarr V2 compatibility. + + Returns + ------- + Bool + True if the JSON representation matches, False otherwise. + """ + ... + + @classmethod + @abstractmethod + def check_json_v3(cls: type[Self], data: JSON) -> TypeGuard[DTypeJSON_V3]: """ Check that a JSON representation of a data type matches the dtype_cls class attribute. Used as a type guard. This base implementation checks that the input is a dictionary, @@ -229,9 +272,6 @@ def check_json(cls: type[Self], data: JSON, zarr_format: ZarrFormat) -> TypeGuar data : JSON The JSON representation of the data type. - zarr_format : ZarrFormat - The zarr format version. - Returns ------- Bool @@ -239,8 +279,14 @@ def check_json(cls: type[Self], data: JSON, zarr_format: ZarrFormat) -> TypeGuar """ ... + @overload + def to_json(self, zarr_format: Literal[2]) -> DTypeJSON_V2: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> DTypeJSON_V3: ... 
+ @abstractmethod - def to_json(self, zarr_format: ZarrFormat) -> JSON: + def to_json(self, zarr_format: ZarrFormat) -> DTypeJSON_V2 | DTypeJSON_V3: """ Convert the wrapped data type to a JSON-serializable form. @@ -251,46 +297,73 @@ def to_json(self, zarr_format: ZarrFormat) -> JSON: Returns ------- - JSON + DTypeJSON_V2 | DTypeJSON_V3 The JSON-serializable representation of the wrapped data type """ ... @classmethod - def from_json(cls: type[Self], data: JSON, zarr_format: ZarrFormat) -> Self: + def from_json_v3(cls: type[Self], data: JSON) -> Self: """ - Wrap a JSON representation of a data type. + Wrap a Zarr V3 JSON representation of a data type. Parameters ---------- data : JSON The JSON representation of the data type. - zarr_format : ZarrFormat - The zarr format version. - Returns ------- Self The wrapped data type. """ - if cls.check_json(data, zarr_format=zarr_format): - return cls._from_json_unsafe(data, zarr_format=zarr_format) + if cls.check_json_v3(data): + return cls._from_json_unchecked(data, zarr_format=3) raise DataTypeValidationError(f"Invalid JSON representation of data type {cls}: {data}") @classmethod - @abstractmethod - def _from_json_unsafe(cls: type[Self], data: JSON, zarr_format: ZarrFormat) -> Self: + def from_json_v2(cls: type[Self], data: JSON, *, object_codec_id: str | None) -> Self: """ - Wrap a JSON representation of a data type. + Wrap a Zarr V2 JSON representation of a data type. Parameters ---------- data : JSON The JSON representation of the data type. - zarr_format : ZarrFormat - The zarr format version. + Returns + ------- + Self + The wrapped data type. 
+ """ + if cls.check_json_v2(data, object_codec_id=object_codec_id): + return cls._from_json_unchecked(data, zarr_format=2) + raise DataTypeValidationError( + f"Invalid JSON representation of data type {cls}: {data!r}, object_codec_id={object_codec_id!r}" + ) + + @classmethod + @overload + def _from_json_unchecked(cls, data: DTypeJSON_V2, *, zarr_format: Literal[2]) -> Self: ... + @classmethod + @overload + def _from_json_unchecked(cls, data: DTypeJSON_V3, *, zarr_format: Literal[3]) -> Self: ... + + @classmethod + @abstractmethod + def _from_json_unchecked( + cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat + ) -> Self: + """ + Create a ZDType instance from a JSON representation of a data type. + + This method should be called after input has been type checked, and so it should not perform + any input validation. + + Parameters + ---------- + data : JSON + The JSON representation of the data type. Returns ------- @@ -300,7 +373,7 @@ def _from_json_unsafe(cls: type[Self], data: JSON, zarr_format: ZarrFormat) -> S ... @abstractmethod - def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> JSON: + def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> JSON: """ Convert a single value to JSON-serializable format. @@ -319,7 +392,7 @@ def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> JSON: ... @abstractmethod - def from_json_value(self: Self, data: JSON, *, zarr_format: ZarrFormat) -> TScalar_co: + def from_json_scalar(self: Self, data: JSON, *, zarr_format: ZarrFormat) -> TScalar_co: """ Read a JSON-serializable value as a scalar. 
diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 23a0275691..ec1ac42264 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -9,7 +9,7 @@ from zarr.abc.metadata import Metadata from zarr.core.chunk_grids import RegularChunkGrid -from zarr.core.dtype import get_data_type_from_native_dtype +from zarr.core.dtype import get_data_type_from_json_v2 from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, TDType_co, TScalar_co, ZDType if TYPE_CHECKING: @@ -45,6 +45,9 @@ class ArrayV2MetadataDict(TypedDict): # Union of acceptable types for v2 compressors CompressorLikev2: TypeAlias = dict[str, JSON] | numcodecs.abc.Codec | None +# These are the ids of the known object codecs for zarr v2. +ObjectCodecIds = ("vlen-utf8", "vlen-bytes", "vlen-array", "pickle", "json2", "msgpack2") + @dataclass(frozen=True, kw_only=True) class ArrayV2Metadata(Metadata): @@ -86,7 +89,7 @@ def __init__( filters_parsed = parse_filters(filters) fill_value_parsed: TBaseScalar | None if fill_value is not None: - fill_value_parsed = dtype.cast_value(fill_value) + fill_value_parsed = dtype.cast_scalar(fill_value) else: fill_value_parsed = fill_value attributes_parsed = parse_attributes(attributes) @@ -135,11 +138,29 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: _data = data.copy() # Check that the zarr_format attribute is correct. _ = parse_zarr_format(_data.pop("zarr_format")) - dtype = get_data_type_from_native_dtype(_data["dtype"]) + + # To resolve a numpy object dtype array, we need to search for an object codec, + # which could be in filters or as a compressor. + # we will use a hard-coded list of object codecs for this search. 
+ object_codec_id: str | None = None + maybe_object_codecs = (data.get("filters"), data.get("compressor")) + for maybe_object_codec in maybe_object_codecs: + if isinstance(maybe_object_codec, Sequence): + for codec in maybe_object_codec: + if isinstance(codec, dict) and codec.get("id") in ObjectCodecIds: + object_codec_id = codec["id"] + break + elif ( + isinstance(maybe_object_codec, dict) + and maybe_object_codec.get("id") in ObjectCodecIds + ): + object_codec_id = maybe_object_codec["id"] + break + dtype = get_data_type_from_json_v2(data["dtype"], object_codec_id=object_codec_id) _data["dtype"] = dtype fill_value_encoded = _data.get("fill_value") if fill_value_encoded is not None: - fill_value = dtype.from_json_value(fill_value_encoded, zarr_format=2) + fill_value = dtype.from_json_scalar(fill_value_encoded, zarr_format=2) _data["fill_value"] = fill_value # zarr v2 allowed arbitrary keys here. @@ -192,11 +213,11 @@ def to_dict(self) -> dict[str, JSON]: # serialize the fill value after dtype-specific JSON encoding if self.fill_value is not None: - fill_value = self.dtype.to_json_value(self.fill_value, zarr_format=2) + fill_value = self.dtype.to_json_scalar(self.fill_value, zarr_format=2) zarray_dict["fill_value"] = fill_value # serialize the dtype after fill value-specific JSON encoding - zarray_dict["dtype"] = self.dtype.to_json(zarr_format=2) + zarray_dict["dtype"] = self.dtype.to_json(zarr_format=2) # type: ignore[assignment] return zarray_dict diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 80ed722836..83b9bd7bc8 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -7,7 +7,7 @@ from zarr.core.dtype import ( VariableLengthString, ZDType, - get_data_type_from_json, + get_data_type_from_json_v3, ) if TYPE_CHECKING: @@ -175,7 +175,7 @@ def __init__( chunk_key_encoding_parsed = ChunkKeyEncoding.from_dict(chunk_key_encoding) dimension_names_parsed = parse_dimension_names(dimension_names) # Note: relying on a 
type method is numpy-specific - fill_value_parsed = data_type.cast_value(fill_value) + fill_value_parsed = data_type.cast_scalar(fill_value) attributes_parsed = parse_attributes(attributes) codecs_parsed_partial = parse_codecs(codecs) storage_transformers_parsed = parse_storage_transformers(storage_transformers) @@ -306,12 +306,12 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: _ = parse_node_type_array(_data.pop("node_type")) data_type_json = _data.pop("data_type") - data_type = get_data_type_from_json(data_type_json, zarr_format=3) + data_type = get_data_type_from_json_v3(data_type_json) # check that the fill value is consistent with the data type try: fill = _data.pop("fill_value") - fill_value_parsed = data_type.from_json_value(fill, zarr_format=3) + fill_value_parsed = data_type.from_json_scalar(fill, zarr_format=3) except ValueError as e: raise TypeError(f"Invalid fill_value: {fill!r}") from e @@ -325,7 +325,7 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: def to_dict(self) -> dict[str, JSON]: out_dict = super().to_dict() - out_dict["fill_value"] = self.data_type.to_json_value( + out_dict["fill_value"] = self.data_type.to_json_scalar( self.fill_value, zarr_format=self.zarr_format ) if not isinstance(out_dict, dict): diff --git a/tests/package_with_entrypoint/__init__.py b/tests/package_with_entrypoint/__init__.py index cf8ba4b0bb..834d5654c0 100644 --- a/tests/package_with_entrypoint/__init__.py +++ b/tests/package_with_entrypoint/__init__.py @@ -9,7 +9,7 @@ from zarr.codecs import BytesCodec from zarr.core.array_spec import ArraySpec from zarr.core.buffer import Buffer, NDBuffer -from zarr.core.common import ZarrFormat +from zarr.core.common import JSON, ZarrFormat from zarr.core.dtype.npy.bool import Bool @@ -74,13 +74,13 @@ class TestDataType(Bool): This is a "data type" that serializes to "test" """ - _zarr_v3_name = "test" + _zarr_v3_name = "test" # type: ignore[assignment] @classmethod - def from_json(cls, data: Any, zarr_format: Literal[2, 
3]) -> Self: - if data == cls._zarr_v3_name: + def from_json(cls, data: JSON, zarr_format: Literal[2, 3]) -> Self: + if data == cls._zarr_v3_name: # type: ignore[has-type] return cls() raise ValueError - def to_json(self, zarr_format: ZarrFormat) -> str: - return self._zarr_v3_name + def to_json(self, zarr_format: ZarrFormat) -> str: # type: ignore[override] + return self._zarr_v3_name # type: ignore[no-any-return, has-type] diff --git a/tests/test_array.py b/tests/test_array.py index 997470a0d3..7c500fe32b 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -163,7 +163,7 @@ def test_array_name_properties_no_group( store: LocalStore | MemoryStore, zarr_format: ZarrFormat ) -> None: arr = zarr.create_array( - store=store, shape=(100,), chunks=(10,), zarr_format=zarr_format, dtype="i4" + store=store, shape=(100,), chunks=(10,), zarr_format=zarr_format, dtype=">i4" ) assert arr.path == "" assert arr.name == "/" @@ -214,7 +214,7 @@ def test_array_fill_value_default( ) else: arr = zarr.create_array(store=store, shape=shape, dtype=zdtype, zarr_format=3, chunks=shape) - expected_fill_value = zdtype.default_value() + expected_fill_value = zdtype.default_scalar() if isinstance(expected_fill_value, np.datetime64 | np.timedelta64): if np.isnat(expected_fill_value): assert np.isnat(arr.fill_value) @@ -370,7 +370,7 @@ def test_storage_transformers(store: MemoryStore, zarr_format: ZarrFormat | str) "zarr_format": zarr_format, "shape": (10,), "chunks": (1,), - "dtype": "uint8", + "dtype": "|u1", "dimension_separator": ".", "codecs": (BytesCodec().to_dict(),), "fill_value": 0, @@ -1008,9 +1008,9 @@ def test_default_fill_value(dtype: ZDType[Any, Any], store: Store) -> None: """ a = zarr.create_array(store, shape=(5,), chunks=(5,), dtype=dtype) if isinstance(dtype, DateTime64 | TimeDelta64) and np.isnat(a.fill_value): - assert np.isnat(dtype.default_value()) + assert np.isnat(dtype.default_scalar()) else: - assert a.fill_value == dtype.default_value() + assert a.fill_value 
== dtype.default_scalar() @staticmethod @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") @@ -1029,7 +1029,7 @@ def test_dtype_forms(dtype: ZDType[Any, Any], store: Store, zarr_format: ZarrFor name="b", shape=(5,), chunks=(5,), - dtype=dtype.to_dtype(), + dtype=dtype.to_native_dtype(), zarr_format=zarr_format, ) assert a.dtype == b.dtype @@ -1044,7 +1044,7 @@ def test_dtype_forms(dtype: ZDType[Any, Any], store: Store, zarr_format: ZarrFor name="c", shape=(5,), chunks=(5,), - dtype=dtype.to_dtype().char, + dtype=dtype.to_native_dtype().char, zarr_format=zarr_format, ) else: @@ -1053,7 +1053,7 @@ def test_dtype_forms(dtype: ZDType[Any, Any], store: Store, zarr_format: ZarrFor name="c", shape=(5,), chunks=(5,), - dtype=dtype.to_dtype().str, + dtype=dtype.to_native_dtype().str, zarr_format=zarr_format, ) assert a.dtype == c.dtype @@ -1308,7 +1308,7 @@ async def test_default_filters_compressors( arr = await create_array( store=store, - dtype=dtype, + dtype=dtype, # type: ignore[arg-type] shape=(10,), zarr_format=zarr_format, ) @@ -1320,14 +1320,14 @@ async def test_default_filters_compressors( compressors=sig.parameters["compressors"].default, filters=sig.parameters["filters"].default, serializer=sig.parameters["serializer"].default, - dtype=dtype, + dtype=dtype, # type: ignore[arg-type] ) elif zarr_format == 2: default_filters, default_compressors = _parse_chunk_encoding_v2( compressor=sig.parameters["compressors"].default, filters=sig.parameters["filters"].default, - dtype=dtype, + dtype=dtype, # type: ignore[arg-type] ) if default_filters is None: expected_filters = () diff --git a/tests/test_dtype/conftest.py b/tests/test_dtype/conftest.py index 2b21a57365..b2aa89afd7 100644 --- a/tests/test_dtype/conftest.py +++ b/tests/test_dtype/conftest.py @@ -17,7 +17,7 @@ with warnings.catch_warnings(): warnings.simplefilter("ignore") zdtype_examples += ( - wrapper_cls.from_dtype(np.dtype([("a", np.float64), ("b", np.int8)])), + 
wrapper_cls.from_native_dtype(np.dtype([("a", np.float64), ("b", np.int8)])), ) elif issubclass(wrapper_cls, HasLength): zdtype_examples += (wrapper_cls(length=1),) diff --git a/tests/test_dtype/test_npy/test_bool.py b/tests/test_dtype/test_npy/test_bool.py index 1adae57f02..03dc550a9d 100644 --- a/tests/test_dtype/test_npy/test_bool.py +++ b/tests/test_dtype/test_npy/test_bool.py @@ -2,11 +2,11 @@ import numpy as np -from tests.test_dtype.test_wrapper import _TestZDType +from tests.test_dtype.test_wrapper import BaseTestZDType, V2JsonTestParams from zarr.core.dtype.npy.bool import Bool -class TestBool(_TestZDType): +class TestBool(BaseTestZDType): test_cls = Bool valid_dtype = (np.dtype(np.bool_),) @@ -15,7 +15,7 @@ class TestBool(_TestZDType): np.dtype(np.float64), np.dtype(np.uint16), ) - valid_json_v2 = ("|b1",) + valid_json_v2 = (V2JsonTestParams(dtype="|b1"),) valid_json_v3 = ("bool",) invalid_json_v2 = ( "|b1", diff --git a/tests/test_dtype/test_npy/test_common.py b/tests/test_dtype/test_npy/test_common.py index 258ab48fe1..c4a82e22b0 100644 --- a/tests/test_dtype/test_npy/test_common.py +++ b/tests/test_dtype/test_npy/test_common.py @@ -45,7 +45,7 @@ def nan_equal(a: object, b: object) -> bool: return a == b -json_float_v2_cases: list[tuple[JSONFloatV2, float | np.floating[Any]]] = [ +json_float_v2_roundtrip_cases: tuple[tuple[JSONFloatV2, float | np.floating[Any]], ...] 
= ( ("Infinity", float("inf")), ("Infinity", np.inf), ("-Infinity", float("-inf")), @@ -53,11 +53,9 @@ def nan_equal(a: object, b: object) -> bool: ("NaN", float("nan")), ("NaN", np.nan), (1.0, 1.0), -] +) -# exactly the same as v2, for now, until we get support for the special NaN encoding defined in the -# v3 spec -json_float_v3_cases = json_float_v2_cases +json_float_v3_cases = json_float_v2_roundtrip_cases @pytest.mark.parametrize( @@ -94,13 +92,15 @@ def test_endianness_to_numpy_str(data: str | None, expected: str) -> None: endianness_to_numpy_str(data) # type: ignore[arg-type] -@pytest.mark.parametrize(("data", "expected"), json_float_v2_cases + [("SHOULD_ERR", "")]) +@pytest.mark.parametrize( + ("data", "expected"), json_float_v2_roundtrip_cases + (("SHOULD_ERR", ""),) +) def test_float_from_json_v2(data: JSONFloatV2 | str, expected: float | str) -> None: """ Test that float_from_json_v2 correctly converts a JSON string representation of a float to a float. This test also checks that an invalid string input raises a ``ValueError`` """ - if data in get_args(SpecialFloatStrings) or isinstance(data, float): + if data != "SHOULD_ERR": assert nan_equal(float_from_json_v2(data), expected) # type: ignore[arg-type] else: msg = f"could not convert string to float: {data!r}" @@ -108,25 +108,35 @@ def test_float_from_json_v2(data: JSONFloatV2 | str, expected: float | str) -> N float_from_json_v2(data) # type: ignore[arg-type] -@pytest.mark.parametrize(("data", "expected"), json_float_v3_cases + [("SHOULD_ERR", "")]) +@pytest.mark.parametrize( + ("data", "expected"), json_float_v3_cases + (("SHOULD_ERR", ""), ("0x", "")) +) def test_float_from_json_v3(data: JSONFloatV2 | str, expected: float | str) -> None: """ Test that float_from_json_v3 correctly converts a JSON string representation of a float to a float. 
This test also checks that an invalid string input raises a ``ValueError`` """ - if data in get_args(SpecialFloatStrings) or isinstance(data, float): - assert nan_equal(float_from_json_v3(data), expected) - else: + if data == "SHOULD_ERR": msg = ( f"Invalid float value: {data!r}. Expected a string starting with the hex prefix" " '0x', or one of 'NaN', 'Infinity', or '-Infinity'." ) with pytest.raises(ValueError, match=msg): float_from_json_v3(data) + elif data == "0x": + msg = ( + f"Invalid hexadecimal float value: {data!r}. " + "Expected the '0x' prefix to be followed by 4, 8, or 16 numeral characters" + ) + + with pytest.raises(ValueError, match=msg): + float_from_json_v3(data) + else: + assert nan_equal(float_from_json_v3(data), expected) # note the order of parameters relative to the order of the parametrized variable. -@pytest.mark.parametrize(("expected", "data"), json_float_v2_cases) +@pytest.mark.parametrize(("expected", "data"), json_float_v2_roundtrip_cases) def test_float_to_json_v2(data: float | np.floating[Any], expected: JSONFloatV2) -> None: """ Test that floats are JSON-encoded properly for zarr v2 @@ -170,7 +180,7 @@ def test_bytes_to_json(zarr_format: ZarrFormat) -> None: # note the order of parameters relative to the order of the parametrized variable. 
-@pytest.mark.parametrize(("json_expected", "float_data"), json_float_v2_cases) +@pytest.mark.parametrize(("json_expected", "float_data"), json_float_v2_roundtrip_cases) def test_complex_to_json_v2( float_data: float | np.floating[Any], json_expected: JSONFloatV2 ) -> None: diff --git a/tests/test_dtype/test_npy/test_complex.py b/tests/test_dtype/test_npy/test_complex.py index 45a3a1480e..fd216d8415 100644 --- a/tests/test_dtype/test_npy/test_complex.py +++ b/tests/test_dtype/test_npy/test_complex.py @@ -4,11 +4,11 @@ import numpy as np -from tests.test_dtype.test_wrapper import _TestZDType +from tests.test_dtype.test_wrapper import BaseTestZDType, V2JsonTestParams from zarr.core.dtype.npy.complex import Complex64, Complex128 -class _BaseTestFloat(_TestZDType): +class _BaseTestFloat(BaseTestZDType): def scalar_equals(self, scalar1: object, scalar2: object) -> bool: if np.isnan(scalar1) and np.isnan(scalar2): # type: ignore[call-overload] return True @@ -23,7 +23,7 @@ class TestComplex64(_BaseTestFloat): np.dtype(np.float64), np.dtype(np.complex128), ) - valid_json_v2 = (">c8", "c8"), V2JsonTestParams(dtype="c16", "c16"), V2JsonTestParams(dtype=" bool: if np.isnan(scalar1) and np.isnan(scalar2): # type: ignore[call-overload] return True @@ -20,7 +20,7 @@ def test_hex_encoding(self, hex_string_params: tuple[str, float]) -> None: """ hex_string, expected = hex_string_params zdtype = self.test_cls() - observed = zdtype.from_json_value(hex_string, zarr_format=3) + observed = zdtype.from_json_scalar(hex_string, zarr_format=3) assert self.scalar_equals(observed, expected) @@ -32,8 +32,8 @@ class TestFloat16(_BaseTestFloat): np.dtype(np.uint16), np.dtype(np.float32), ) - valid_json_v2 = Float16._zarr_v2_names - valid_json_v3 = (Float16._zarr_v3_name,) + valid_json_v2 = (V2JsonTestParams(dtype=">f2"), V2JsonTestParams(dtype="f4"), V2JsonTestParams(dtype="f8"), V2JsonTestParams(dtype="i1", @@ -37,7 +37,7 @@ class TestInt8(_TestZDType): item_size_params = (Int8(),) -class 
TestInt16(_TestZDType): +class TestInt16(BaseTestZDType): test_cls = Int16 scalar_type = np.int16 valid_dtype = (np.dtype(">i2"), np.dtype("i2", "i2"), V2JsonTestParams(dtype="i4"), np.dtype("i4", "i4"), V2JsonTestParams(dtype="i8"), np.dtype("i8", "i8"), V2JsonTestParams(dtype="u2"), np.dtype("u2", "u2"), V2JsonTestParams(dtype="u4"), np.dtype("u4", "u4"), V2JsonTestParams(dtype="u8"), np.dtype("u8", "u8"), V2JsonTestParams(dtype="i4"), ("field2", ">f8")], - [("field1", ">i8"), ("field2", ">i4")], + V2JsonTestParams(dtype=[("field1", ">i4"), ("field2", ">f8")]), + V2JsonTestParams(dtype=[("field1", ">i8"), ("field2", ">i4")]), ) valid_json_v3 = ( { @@ -99,7 +99,7 @@ class TestStructured(_TestZDType): ), ( "field2", - {"name": "numpy.fixed_length_utf32", "configuration": {"length_bytes": 32}}, + {"name": "fixed_length_utf32", "configuration": {"length_bytes": 32}}, ), ] }, diff --git a/tests/test_dtype/test_npy/test_string.py b/tests/test_dtype/test_npy/test_string.py index 6620f45052..73c8612db4 100644 --- a/tests/test_dtype/test_npy/test_string.py +++ b/tests/test_dtype/test_npy/test_string.py @@ -2,13 +2,13 @@ import numpy as np -from tests.test_dtype.test_wrapper import _TestZDType +from tests.test_dtype.test_wrapper import BaseTestZDType, V2JsonTestParams from zarr.core.dtype import FixedLengthASCII, FixedLengthUTF32 from zarr.core.dtype.npy.string import _NUMPY_SUPPORTS_VLEN_STRING, VariableLengthString if _NUMPY_SUPPORTS_VLEN_STRING: - class TestVariableLengthString(_TestZDType): + class TestVariableLengthString(BaseTestZDType): test_cls = VariableLengthString # type: ignore[assignment] valid_dtype = (np.dtypes.StringDType(),) # type: ignore[assignment] invalid_dtype = ( @@ -16,15 +16,15 @@ class TestVariableLengthString(_TestZDType): np.dtype(np.float64), np.dtype("|S10"), ) - valid_json_v2 = ("|O",) - valid_json_v3 = ("numpy.variable_length_utf8",) + valid_json_v2 = (V2JsonTestParams(dtype="|O", object_codec_id="vlen-utf8"),) + valid_json_v3 = 
("variable_length_utf8",) invalid_json_v2 = ( "|S10", "|f8", "invalid", ) invalid_json_v3 = ( - {"name": "numpy.variable_length_utf8", "configuration": {"invalid_key": "value"}}, + {"name": "variable_length_utf8", "configuration": {"invalid_key": "value"}}, {"name": "invalid_name"}, ) @@ -42,7 +42,7 @@ class TestVariableLengthString(_TestZDType): else: - class TestVariableLengthString(_TestZDType): # type: ignore[no-redef] + class TestVariableLengthString(BaseTestZDType): # type: ignore[no-redef] test_cls = VariableLengthString # type: ignore[assignment] valid_dtype = (np.dtype("O"),) invalid_dtype = ( @@ -50,8 +50,8 @@ class TestVariableLengthString(_TestZDType): # type: ignore[no-redef] np.dtype(np.float64), np.dtype("|S10"), ) - valid_json_v2 = ("|O",) - valid_json_v3 = ("numpy.variable_length_utf8",) + valid_json_v2 = (V2JsonTestParams(dtype="|O", object_codec_id="vlen-utf8"),) + valid_json_v3 = ("variable_length_utf8",) invalid_json_v2 = ( "|S10", "|f8", @@ -76,7 +76,7 @@ class TestVariableLengthString(_TestZDType): # type: ignore[no-redef] item_size_params = (VariableLengthString(),) -class TestFixedLengthAscii(_TestZDType): +class TestFixedLengthAscii(BaseTestZDType): test_cls = FixedLengthASCII valid_dtype = (np.dtype("|S10"), np.dtype("|S4")) invalid_dtype = ( @@ -84,15 +84,19 @@ class TestFixedLengthAscii(_TestZDType): np.dtype(np.float64), np.dtype("|U10"), ) - valid_json_v2 = ("|S0", "|S2", "|S4") - valid_json_v3 = ({"name": "numpy.fixed_length_ascii", "configuration": {"length_bytes": 10}},) + valid_json_v2 = ( + V2JsonTestParams(dtype="|S0"), + V2JsonTestParams(dtype="|S2"), + V2JsonTestParams(dtype="|S4"), + ) + valid_json_v3 = ({"name": "fixed_length_ascii", "configuration": {"length_bytes": 10}},) invalid_json_v2 = ( "|S", "|U10", "|f8", ) invalid_json_v3 = ( - {"name": "numpy.fixed_length_ascii", "configuration": {"length_bits": 0}}, + {"name": "fixed_length_ascii", "configuration": {"length_bits": 0}}, {"name": "numpy.fixed_length_ascii", 
"configuration": {"length_bits": "invalid"}}, ) @@ -118,7 +122,7 @@ class TestFixedLengthAscii(_TestZDType): ) -class TestFixedLengthUTF32(_TestZDType): +class TestFixedLengthUTF32(BaseTestZDType): test_cls = FixedLengthUTF32 valid_dtype = (np.dtype(">U10"), np.dtype("U10", "U10"), V2JsonTestParams(dtype=" bool: # This method gets overridden here to support the equivalency between NaT and # -9223372036854775808 fill values @@ -34,7 +34,12 @@ class TestDateTime64(_TestTimeBase): np.dtype(np.float64), np.dtype("timedelta64[ns]"), ) - valid_json_v2 = (">M8", ">M8[s]", " None: """ -class _TestZDType: +@dataclass(frozen=True, kw_only=True, slots=True) +class V2JsonTestParams: + dtype: str | dict[str, object] | list[object] + object_codec_id: str | None = None + + +class BaseTestZDType: """ A base class for testing ZDType subclasses. This class works in conjunction with the custom pytest collection function ``pytest_generate_tests`` defined in conftest.py, which applies the @@ -66,7 +73,7 @@ class _TestZDType: valid_dtype: ClassVar[tuple[TBaseDType, ...]] = () invalid_dtype: ClassVar[tuple[TBaseDType, ...]] = () - valid_json_v2: ClassVar[tuple[str | dict[str, object] | list[object], ...]] = () + valid_json_v2: ClassVar[tuple[V2JsonTestParams, ...]] = () invalid_json_v2: ClassVar[tuple[str | dict[str, object] | list[object], ...]] = () valid_json_v3: ClassVar[tuple[str | dict[str, object], ...]] = () @@ -92,37 +99,40 @@ def scalar_equals(self, scalar1: object, scalar2: object) -> bool: return scalar1 == scalar2 def test_check_dtype_valid(self, valid_dtype: TBaseDType) -> None: - assert self.test_cls.check_dtype(valid_dtype) + assert self.test_cls.check_native_dtype(valid_dtype) def test_check_dtype_invalid(self, invalid_dtype: object) -> None: - assert not self.test_cls.check_dtype(invalid_dtype) # type: ignore[arg-type] + assert not self.test_cls.check_native_dtype(invalid_dtype) # type: ignore[arg-type] def test_from_dtype_roundtrip(self, valid_dtype: Any) -> None: - 
zdtype = self.test_cls.from_dtype(valid_dtype) - assert zdtype.to_dtype() == valid_dtype + zdtype = self.test_cls.from_native_dtype(valid_dtype) + assert zdtype.to_native_dtype() == valid_dtype - def test_from_json_roundtrip_v2(self, valid_json_v2: Any) -> None: - zdtype = self.test_cls.from_json(valid_json_v2, zarr_format=2) - assert zdtype.to_json(zarr_format=2) == valid_json_v2 + def test_from_json_roundtrip_v2(self, valid_json_v2: V2JsonTestParams) -> None: + zdtype = self.test_cls.from_json_v2( + valid_json_v2.dtype, # type: ignore[arg-type] + object_codec_id=valid_json_v2.object_codec_id, + ) + assert zdtype.to_json(zarr_format=2) == valid_json_v2.dtype @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") def test_from_json_roundtrip_v3(self, valid_json_v3: Any) -> None: - zdtype = self.test_cls.from_json(valid_json_v3, zarr_format=3) + zdtype = self.test_cls.from_json_v3(valid_json_v3) assert zdtype.to_json(zarr_format=3) == valid_json_v3 - def test_scalar_roundtrip_v2(self, scalar_v2_params: tuple[Any, Any]) -> None: + def test_scalar_roundtrip_v2(self, scalar_v2_params: tuple[ZDType[Any, Any], Any]) -> None: zdtype, scalar_json = scalar_v2_params - scalar = zdtype.from_json_value(scalar_json, zarr_format=2) - assert self.json_scalar_equals(scalar_json, zdtype.to_json_value(scalar, zarr_format=2)) + scalar = zdtype.from_json_scalar(scalar_json, zarr_format=2) + assert self.json_scalar_equals(scalar_json, zdtype.to_json_scalar(scalar, zarr_format=2)) - def test_scalar_roundtrip_v3(self, scalar_v3_params: tuple[Any, Any]) -> None: + def test_scalar_roundtrip_v3(self, scalar_v3_params: tuple[ZDType[Any, Any], Any]) -> None: zdtype, scalar_json = scalar_v3_params - scalar = zdtype.from_json_value(scalar_json, zarr_format=3) - assert self.json_scalar_equals(scalar_json, zdtype.to_json_value(scalar, zarr_format=3)) + scalar = zdtype.from_json_scalar(scalar_json, zarr_format=3) + assert self.json_scalar_equals(scalar_json, 
zdtype.to_json_scalar(scalar, zarr_format=3)) - def test_cast_value(self, cast_value_params: tuple[Any, Any, Any]) -> None: + def test_cast_value(self, cast_value_params: tuple[ZDType[Any, Any], Any, Any]) -> None: zdtype, value, expected = cast_value_params - observed = zdtype.cast_value(value) + observed = zdtype.cast_scalar(value) assert self.scalar_equals(expected, observed) def test_item_size(self, item_size_params: ZDType[Any, Any]) -> None: @@ -131,6 +141,6 @@ def test_item_size(self, item_size_params: ZDType[Any, Any]) -> None: with a fixed scalar size. """ if isinstance(item_size_params, HasItemSize): - assert item_size_params.item_size == item_size_params.to_dtype().itemsize + assert item_size_params.item_size == item_size_params.to_native_dtype().itemsize else: pytest.skip(f"Dtype {item_size_params} does not implement HasItemSize") diff --git a/tests/test_dtype_registry.py b/tests/test_dtype_registry.py index 0c650e5c29..c4225874a4 100644 --- a/tests/test_dtype_registry.py +++ b/tests/test_dtype_registry.py @@ -23,9 +23,10 @@ TBaseScalar, ZDType, data_type_registry, - get_data_type_from_json, + get_data_type_from_json_v3, parse_data_type, ) +from zarr.core.dtype.common import HasObjectCodec if TYPE_CHECKING: from collections.abc import Generator @@ -58,7 +59,7 @@ def test_override(data_type_registry_fixture: DataTypeRegistry) -> None: data_type_registry_fixture.register(Bool._zarr_v3_name, Bool) class NewBool(Bool): - def default_value(self) -> np.bool_: + def default_scalar(self) -> np.bool_: return np.True_ data_type_registry_fixture.register(NewBool._zarr_v3_name, NewBool) @@ -96,20 +97,36 @@ def test_unregistered_dtype(data_type_registry_fixture: DataTypeRegistry) -> Non @staticmethod @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") @pytest.mark.parametrize("zdtype", zdtype_examples) - def test_registered_dtypes( - zdtype: ZDType[TBaseDType, TBaseScalar], zarr_format: ZarrFormat - ) -> None: + def 
test_registered_dtypes_match_dtype(zdtype: ZDType[TBaseDType, TBaseScalar]) -> None: """ Test that the registered dtypes can be retrieved from the registry. """ skip_object_dtype(zdtype) - assert data_type_registry.match_dtype(zdtype.to_dtype()) == zdtype - assert ( - data_type_registry.match_json( - zdtype.to_json(zarr_format=zarr_format), zarr_format=zarr_format + assert data_type_registry.match_dtype(zdtype.to_native_dtype()) == zdtype + + @staticmethod + @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") + @pytest.mark.parametrize("zdtype", zdtype_examples) + def test_registered_dtypes_match_json( + zdtype: ZDType[TBaseDType, TBaseScalar], zarr_format: ZarrFormat + ) -> None: + if zarr_format == 2: + if isinstance(zdtype, HasObjectCodec): + object_codec_id = zdtype.object_codec_id + else: + object_codec_id = None + assert ( + data_type_registry.match_json_v2( + zdtype.to_json(zarr_format=zarr_format), # type: ignore[arg-type] + object_codec_id=object_codec_id, + ) + == zdtype + ) + else: + skip_object_dtype(zdtype) + assert ( + data_type_registry.match_json_v3(zdtype.to_json(zarr_format=zarr_format)) == zdtype # type: ignore[arg-type] ) - == zdtype - ) @staticmethod @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") @@ -129,7 +146,7 @@ def test_match_dtype_unique( if _cls is not type(zdtype): data_type_registry_fixture.register(_cls._zarr_v3_name, _cls) - dtype_instance = zdtype.to_dtype() + dtype_instance = zdtype.to_native_dtype() msg = f"No data type wrapper found that matches dtype '{dtype_instance}'" with pytest.raises(ValueError, match=re.escape(msg)): @@ -138,7 +155,7 @@ def test_match_dtype_unique( instance_dict = zdtype.to_json(zarr_format=zarr_format) msg = f"No data type wrapper found that matches {instance_dict}" with pytest.raises(ValueError, match=re.escape(msg)): - data_type_registry_fixture.match_json(instance_dict, zarr_format=zarr_format) + 
data_type_registry_fixture.match_json_v3(instance_dict) # type: ignore[arg-type] # this is copied from the registry tests -- we should deduplicate @@ -161,9 +178,11 @@ def set_path() -> Generator[None, None, None]: def test_entrypoint_dtype(zarr_format: ZarrFormat) -> None: from package_with_entrypoint import TestDataType + data_type_registry.lazy_load() instance = TestDataType() dtype_json = instance.to_json(zarr_format=zarr_format) - assert get_data_type_from_json(dtype_json, zarr_format=zarr_format) == instance + assert get_data_type_from_json_v3(dtype_json) == instance + data_type_registry.unregister(TestDataType._zarr_v3_name) @pytest.mark.parametrize( diff --git a/tests/test_group.py b/tests/test_group.py index b4dace2568..c0a40bf5f9 100644 --- a/tests/test_group.py +++ b/tests/test_group.py @@ -23,6 +23,7 @@ from zarr.core._info import GroupInfo from zarr.core.buffer import default_buffer_prototype from zarr.core.config import config as zarr_config +from zarr.core.dtype.npy.int import UInt8 from zarr.core.group import ( ConsolidatedMetadata, GroupMetadata, @@ -494,7 +495,7 @@ def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat expected_groups = list(zip(expected_group_keys, expected_group_values, strict=False)) fill_value = 3 - dtype = "uint8" + dtype = UInt8() expected_group_values[0].create_group("subgroup") expected_group_values[0].create_array( @@ -515,7 +516,7 @@ def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat metadata = { "subarray": { "attributes": {}, - "dtype": dtype, + "dtype": dtype.to_json(zarr_format=zarr_format), "fill_value": fill_value, "shape": (1,), "chunks": (1,), @@ -551,7 +552,7 @@ def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat {"configuration": {"endian": "little"}, "name": "bytes"}, {"configuration": {}, "name": "zstd"}, ), - "data_type": dtype, + "data_type": dtype.to_json(zarr_format=zarr_format), "fill_value": fill_value, "node_type": 
"array", "shape": (1,), diff --git a/tests/test_metadata/test_consolidated.py b/tests/test_metadata/test_consolidated.py index b2244c5047..cfb548cc8d 100644 --- a/tests/test_metadata/test_consolidated.py +++ b/tests/test_metadata/test_consolidated.py @@ -594,7 +594,7 @@ async def test_consolidated_metadata_encodes_special_chars( "consolidated_metadata" ]["metadata"] - expected_fill_value = _time._zdtype.to_json_value(fill_value, zarr_format=2) + expected_fill_value = _time._zdtype.to_json_scalar(fill_value, zarr_format=2) if zarr_format == 2: assert root_metadata["time/.zarray"]["fill_value"] == expected_fill_value diff --git a/tests/test_metadata/test_v2.py b/tests/test_metadata/test_v2.py index 5fd3ae8cc6..a2894529aa 100644 --- a/tests/test_metadata/test_v2.py +++ b/tests/test_metadata/test_v2.py @@ -86,7 +86,7 @@ def test_filters_empty_tuple_warns() -> None: "zarr_format": 2, "shape": (1,), "chunks": (1,), - "dtype": "uint8", + "dtype": "|u1", "order": "C", "compressor": None, "filters": (), diff --git a/tests/test_metadata/test_v3.py b/tests/test_metadata/test_v3.py index f3bd4510e5..a806a438c7 100644 --- a/tests/test_metadata/test_v3.py +++ b/tests/test_metadata/test_v3.py @@ -128,10 +128,10 @@ def test_jsonify_fill_value_complex(fill_value: Any, dtype_str: str) -> None: """ zarr_format = 3 dtype = get_data_type_from_native_dtype(dtype_str) - expected = dtype.to_dtype().type(complex(*fill_value)) - observed = dtype.from_json_value(fill_value, zarr_format=zarr_format) + expected = dtype.to_native_dtype().type(complex(*fill_value)) + observed = dtype.from_json_scalar(fill_value, zarr_format=zarr_format) assert observed == expected - assert dtype.to_json_value(observed, zarr_format=zarr_format) == tuple(fill_value) + assert dtype.to_json_scalar(observed, zarr_format=zarr_format) == tuple(fill_value) @pytest.mark.parametrize("fill_value", [{"foo": 10}]) @@ -143,7 +143,7 @@ def test_parse_fill_value_invalid_type(fill_value: Any, dtype_str: str) -> None: """ 
dtype_instance = get_data_type_from_native_dtype(dtype_str) with pytest.raises(TypeError, match=f"Invalid type: {fill_value}"): - dtype_instance.from_json_value(fill_value, zarr_format=3) + dtype_instance.from_json_scalar(fill_value, zarr_format=3) @pytest.mark.parametrize( @@ -164,7 +164,7 @@ def test_parse_fill_value_invalid_type_sequence(fill_value: Any, dtype_str: str) """ dtype_instance = get_data_type_from_native_dtype(dtype_str) with pytest.raises(TypeError, match=re.escape(f"Invalid type: {fill_value}")): - dtype_instance.from_json_value(fill_value, zarr_format=3) + dtype_instance.from_json_scalar(fill_value, zarr_format=3) @pytest.mark.parametrize("chunk_grid", ["regular"]) @@ -266,8 +266,8 @@ async def test_datetime_metadata(fill_value: int, precision: str) -> None: "data_type": dtype.to_json(zarr_format=3), "chunk_key_encoding": {"name": "default", "separator": "."}, "codecs": (BytesCodec(),), - "fill_value": dtype.to_json_value( - dtype.to_dtype().type(fill_value, dtype.unit), zarr_format=3 + "fill_value": dtype.to_json_scalar( + dtype.to_native_dtype().type(fill_value, dtype.unit), zarr_format=3 ), } metadata = ArrayV3Metadata.from_dict(metadata_dict) diff --git a/tests/test_properties.py b/tests/test_properties.py index 15dd701582..ed8aa997c0 100644 --- a/tests/test_properties.py +++ b/tests/test_properties.py @@ -316,7 +316,7 @@ def test_array_metadata_meets_spec(meta: ArrayV2Metadata | ArrayV3Metadata) -> N assert asdict_dict["zarr_format"] == 3 # version-agnostic validations - dtype_native = meta.dtype.to_dtype() + dtype_native = meta.dtype.to_native_dtype() if dtype_native.kind == "f": assert serialized_float_is_valid(asdict_dict["fill_value"]) elif dtype_native.kind == "c": diff --git a/tests/test_regression/test_regression.py b/tests/test_regression/test_regression.py index 61ff8ebfa9..a5b77d9931 100644 --- a/tests/test_regression/test_regression.py +++ b/tests/test_regression/test_regression.py @@ -34,6 +34,7 @@ def runner_installed() -> bool: 
class ArrayParams: values: np.ndarray[tuple[int], np.dtype[np.generic]] fill_value: np.generic | str | int + filters: tuple[numcodecs.abc.Codec, ...] = () compressor: numcodecs.abc.Codec @@ -62,7 +63,8 @@ class ArrayParams: ArrayParams( values=np.array(["a", "bb", "ccc", "dddd"], dtype="O"), fill_value="1", - compressor=VLenUTF8(), + filters=(VLenUTF8(),), + compressor=GZip(), ) ] array_cases = basic_array_cases + datetime_array_cases + string_array_cases + vlen_string_cases @@ -86,9 +88,9 @@ def source_array(tmp_path: Path, request: pytest.FixtureRequest) -> Array: dtype=dtype, chunks=array_params.values.shape, compressors=compressor, + filters=array_params.filters, fill_value=array_params.fill_value, order="C", - filters=None, chunk_key_encoding=chunk_key_encoding, write_data=True, zarr_format=2, diff --git a/tests/test_v2.py b/tests/test_v2.py index 4b041a9b82..fa2aa65b22 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -62,8 +62,8 @@ def test_codec_pipeline() -> None: @pytest.mark.parametrize( ("dtype", "expected_dtype", "fill_value", "fill_value_json"), [ - ("|S", "|S0", b"X", "WA=="), - ("|V", "|V0", b"X", "WA=="), + ("|S1", "|S1", b"X", "WA=="), + ("|V1", "|V1", b"X", "WA=="), ("|V10", "|V10", b"X", "WAAAAAAAAAAAAA=="), ], ) @@ -111,7 +111,7 @@ async def test_v2_encode_decode(dtype, expected_dtype, fill_value, fill_value_js ], ) def test_v2_encode_decode_with_data(dtype: ZDType[Any, Any], value: str): - expected = np.full((3,), value, dtype=dtype.to_dtype()) + expected = np.full((3,), value, dtype=dtype.to_native_dtype()) a = zarr.create( shape=(3,), zarr_format=2, @@ -278,8 +278,8 @@ def test_structured_dtype_roundtrip(fill_value, tmp_path) -> None: def test_parse_structured_fill_value_valid( fill_value: Any, dtype: np.dtype[Any], expected_result: Any ) -> None: - zdtype = Structured.from_dtype(dtype) - result = zdtype.cast_value(fill_value) + zdtype = Structured.from_native_dtype(dtype) + result = zdtype.cast_scalar(fill_value) assert result.dtype 
== expected_result.dtype assert result == expected_result if isinstance(expected_result, np.void): From d6535d65facbe6be67cd9f0ed195d5e8656ffc07 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 29 May 2025 12:55:58 +0200 Subject: [PATCH 116/130] fix storage info discrepancy in docs --- docs/user-guide/arrays.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/user-guide/arrays.rst b/docs/user-guide/arrays.rst index 13190a4689..c27f1296b9 100644 --- a/docs/user-guide/arrays.rst +++ b/docs/user-guide/arrays.rst @@ -211,8 +211,8 @@ prints additional diagnostics, e.g.:: Serializer : BytesCodec(endian=) Compressors : (BloscCodec(typesize=4, cname=, clevel=3, shuffle=, blocksize=0),) No. bytes : 400000000 (381.5M) - No. bytes stored : 9696520 - Storage ratio : 41.3 + No. bytes stored : 3558573 + Storage ratio : 112.4 Chunks Initialized : 100 .. note:: From 42e14ef8856b13e863cfa538d6f4a2d08d61d10d Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 29 May 2025 17:29:05 +0200 Subject: [PATCH 117/130] fix docstring that was troubling sphinx --- src/zarr/core/dtype/wrapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index c9b23707e8..f3d6b0adca 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -246,7 +246,7 @@ def check_json_v2( object_codec_id : str | None The object codec ID, if applicable. Object codecs are specific numcodecs codecs that zarr-python 2.x used to serialize numpy "Object" scalars. For example, a dtype field set - to "|O" with an object codec ID of "vlen-utf8" indicates that the data type is a + to ``"|O"`` with an object codec ID of "vlen-utf8" indicates that the data type is a variable-length string. Zarr V3 has no such logic, so this parameter is only used for Zarr V2 compatibility. 
From 3991406bcdac1cc76da01352a3abb54c1796e02e Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 29 May 2025 21:01:14 +0200 Subject: [PATCH 118/130] wip: add vlen-bytes --- src/zarr/core/dtype/npy/vlen_bytes.py | 75 ++++++++++++++++++++++++ tests/test_regression/test_regression.py | 18 +++++- 2 files changed, 91 insertions(+), 2 deletions(-) create mode 100644 src/zarr/core/dtype/npy/vlen_bytes.py diff --git a/src/zarr/core/dtype/npy/vlen_bytes.py b/src/zarr/core/dtype/npy/vlen_bytes.py new file mode 100644 index 0000000000..6d804cac60 --- /dev/null +++ b/src/zarr/core/dtype/npy/vlen_bytes.py @@ -0,0 +1,75 @@ +from dataclasses import dataclass +from typing import ClassVar, Literal, Self, TypeGuard, overload + +import numpy as np + +from zarr.core.common import JSON, ZarrFormat +from zarr.core.dtype.common import HasObjectCodec +from zarr.core.dtype.wrapper import TBaseDType, ZDType + + +@dataclass(frozen=True, kw_only=True) +class VariableLengthString(ZDType[np.dtypes.ObjectDType, str], HasObjectCodec): # type: ignore[no-redef] + dtype_cls = np.dtypes.ObjectDType + _zarr_v3_name: ClassVar[Literal["variable_length_bytes"]] = "variable_length_bytes" + object_codec_id = "vlen-bytes" + + @classmethod + def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: + return cls() + + def to_native_dtype(self) -> np.dtypes.ObjectDType: + return self.dtype_cls() + + @classmethod + def check_json_v2( + cls, data: JSON, *, object_codec_id: str | None = None + ) -> TypeGuard[Literal["|O"]]: + """ + Check that the input is a valid JSON representation of a numpy O dtype, and that the + object codec id is appropriate for variable-length UTF-8 strings. + """ + return data == "|O" and object_codec_id == cls.object_codec_id + + @classmethod + def check_json_v3(cls, data: JSON) -> TypeGuard[Literal["variable_length_utf8"]]: + return data == cls._zarr_v3_name + + @overload + def to_json(self, zarr_format: Literal[2]) -> Literal["|O"]: ... 
+ + @overload + def to_json(self, zarr_format: Literal[3]) -> Literal["variable_length_utf8"]: ... + + def to_json(self, zarr_format: ZarrFormat) -> Literal["|O", "variable_length_utf8"]: + if zarr_format == 2: + return "|O" + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + @classmethod + def _from_json_unchecked( + cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat + ) -> Self: + return cls() + + def default_scalar(self) -> str: + return "" + + def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: + return data # type: ignore[return-value] + + def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> str: + """ + Strings pass through + """ + if not check_json_str(data): + raise TypeError(f"Invalid type: {data}. Expected a string.") + return data + + def check_scalar(self, data: object) -> bool: + return isinstance(data, str) + + def _cast_scalar_unchecked(self, data: object) -> str: + return str(data) diff --git a/tests/test_regression/test_regression.py b/tests/test_regression/test_regression.py index a5b77d9931..83a917dee8 100644 --- a/tests/test_regression/test_regression.py +++ b/tests/test_regression/test_regression.py @@ -7,7 +7,7 @@ import numcodecs import numpy as np import pytest -from numcodecs import LZ4, LZMA, Blosc, GZip, VLenUTF8, Zstd +from numcodecs import LZ4, LZMA, Blosc, GZip, VLenBytes, VLenUTF8, Zstd import zarr from zarr.core.array import Array @@ -67,7 +67,21 @@ class ArrayParams: compressor=GZip(), ) ] -array_cases = basic_array_cases + datetime_array_cases + string_array_cases + vlen_string_cases +vlen_bytes_cases = [ + ArrayParams( + values=np.array([b"a", b"bb", b"ccc", b"dddd"], dtype="O"), + fill_value=b"1", + filters=(VLenBytes(),), + compressor=GZip(), + ) +] +array_cases = ( + basic_array_cases + + datetime_array_cases + + string_array_cases + + vlen_string_cases + + vlen_bytes_cases +) 
@pytest.fixture From d7da3d9136f44d65341c11d2f9026bbbce72e6e8 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 29 May 2025 22:03:00 +0200 Subject: [PATCH 119/130] add vlen-bytes --- src/zarr/core/dtype/__init__.py | 3 ++ src/zarr/core/dtype/npy/vlen_bytes.py | 36 ++++++++++++------------ src/zarr/core/dtype/wrapper.py | 6 ++-- tests/test_regression/test_regression.py | 7 +++-- 4 files changed, 29 insertions(+), 23 deletions(-) diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py index a8bfe2b5c4..575086cb6f 100644 --- a/src/zarr/core/dtype/__init__.py +++ b/src/zarr/core/dtype/__init__.py @@ -12,6 +12,7 @@ Structured, ) from zarr.core.dtype.npy.time import DateTime64, TimeDelta64 +from zarr.core.dtype.npy.vlen_bytes import VariableLengthBytes if TYPE_CHECKING: from zarr.core.common import ZarrFormat @@ -88,6 +89,7 @@ | FixedLengthBytes | Structured | TimeDType + | VariableLengthBytes ) # mypy has trouble inferring the type of variablelengthstring dtype, because its class definition # depends on the installed numpy version. That's why the type: ignore statement is needed here. 
@@ -100,6 +102,7 @@ FixedLengthBytes, Structured, *TIME_DTYPE, + VariableLengthBytes, ) # This type models inputs that can be coerced to a ZDType diff --git a/src/zarr/core/dtype/npy/vlen_bytes.py b/src/zarr/core/dtype/npy/vlen_bytes.py index 6d804cac60..c25523f9ed 100644 --- a/src/zarr/core/dtype/npy/vlen_bytes.py +++ b/src/zarr/core/dtype/npy/vlen_bytes.py @@ -1,15 +1,17 @@ +import base64 from dataclasses import dataclass from typing import ClassVar, Literal, Self, TypeGuard, overload import numpy as np from zarr.core.common import JSON, ZarrFormat -from zarr.core.dtype.common import HasObjectCodec -from zarr.core.dtype.wrapper import TBaseDType, ZDType +from zarr.core.dtype.common import HasObjectCodec, v3_unstable_dtype_warning +from zarr.core.dtype.npy.common import check_json_str +from zarr.core.dtype.wrapper import DTypeJSON_V2, DTypeJSON_V3, TBaseDType, ZDType @dataclass(frozen=True, kw_only=True) -class VariableLengthString(ZDType[np.dtypes.ObjectDType, str], HasObjectCodec): # type: ignore[no-redef] +class VariableLengthBytes(ZDType[np.dtypes.ObjectDType, bytes], HasObjectCodec): dtype_cls = np.dtypes.ObjectDType _zarr_v3_name: ClassVar[Literal["variable_length_bytes"]] = "variable_length_bytes" object_codec_id = "vlen-bytes" @@ -39,12 +41,13 @@ def check_json_v3(cls, data: JSON) -> TypeGuard[Literal["variable_length_utf8"]] def to_json(self, zarr_format: Literal[2]) -> Literal["|O"]: ... @overload - def to_json(self, zarr_format: Literal[3]) -> Literal["variable_length_utf8"]: ... + def to_json(self, zarr_format: Literal[3]) -> Literal["variable_length_bytes"]: ... 
- def to_json(self, zarr_format: ZarrFormat) -> Literal["|O", "variable_length_utf8"]: + def to_json(self, zarr_format: ZarrFormat) -> Literal["|O", "variable_length_bytes"]: if zarr_format == 2: return "|O" elif zarr_format == 3: + v3_unstable_dtype_warning(self) return self._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @@ -54,22 +57,19 @@ def _from_json_unchecked( ) -> Self: return cls() - def default_scalar(self) -> str: - return "" + def default_scalar(self) -> bytes: + return b"" def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: - return data # type: ignore[return-value] + return base64.standard_b64encode(data).decode("ascii") # type: ignore[arg-type] - def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> str: - """ - Strings pass through - """ - if not check_json_str(data): - raise TypeError(f"Invalid type: {data}. Expected a string.") - return data + def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> bytes: + if check_json_str(data): + return base64.standard_b64decode(data.encode("ascii")) + raise TypeError(f"Invalid type: {data}. Expected a string.") # pragma: no cover def check_scalar(self, data: object) -> bool: - return isinstance(data, str) + return isinstance(data, bytes | str) - def _cast_scalar_unchecked(self, data: object) -> str: - return str(data) + def _cast_scalar_unchecked(self, data: object) -> bytes: + return bytes(data) # type: ignore[no-any-return, call-overload] diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index f3d6b0adca..4c399bbb84 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -45,7 +45,7 @@ # This the upper bound for the scalar types we support. 
It's numpy scalars + str, # because the new variable-length string dtype in numpy does not have a corresponding scalar type -TBaseScalar = np.generic | str +TBaseScalar = np.generic | str | bytes # This is the bound for the dtypes that we support. If we support non-numpy dtypes, # then this bound will need to be widened. TBaseDType = np.dtype[np.generic] @@ -174,8 +174,8 @@ def cast_scalar(self, data: object) -> TScalar_co: if self.check_scalar(data): return self._cast_scalar_unchecked(data) msg = ( - f"The value {data} failed a type check. " - f"It cannot be safely cast to a scalar compatible with {self.dtype_cls}. " + f"The value {data!r} failed a type check. " + f"It cannot be safely cast to a scalar compatible with {self}. " f"Consult the documentation for {self} to determine the possible values that can " "be cast to scalars of the wrapped data type." ) diff --git a/tests/test_regression/test_regression.py b/tests/test_regression/test_regression.py index 83a917dee8..a1d13510c3 100644 --- a/tests/test_regression/test_regression.py +++ b/tests/test_regression/test_regression.py @@ -13,6 +13,7 @@ from zarr.core.array import Array from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding from zarr.core.dtype.npy.string import VariableLengthString +from zarr.core.dtype.npy.vlen_bytes import VariableLengthBytes from zarr.storage import LocalStore if TYPE_CHECKING: @@ -33,7 +34,7 @@ def runner_installed() -> bool: @dataclass(kw_only=True) class ArrayParams: values: np.ndarray[tuple[int], np.dtype[np.generic]] - fill_value: np.generic | str | int + fill_value: np.generic | str | int | bytes filters: tuple[numcodecs.abc.Codec, ...] 
= () compressor: numcodecs.abc.Codec @@ -92,8 +93,10 @@ def source_array(tmp_path: Path, request: pytest.FixtureRequest) -> Array: compressor = array_params.compressor chunk_key_encoding = V2ChunkKeyEncoding(separator="/") dtype: ZDTypeLike - if array_params.values.dtype == np.dtype("|O"): + if array_params.values.dtype == np.dtype("|O") and array_params.filters == (VLenUTF8(),): dtype = VariableLengthString() # type: ignore[assignment] + elif array_params.values.dtype == np.dtype("|O") and array_params.filters == (VLenBytes(),): + dtype = VariableLengthBytes() else: dtype = array_params.values.dtype z = zarr.create_array( From 1f767e45450ab758ffbe34bbead1a508ae82f9b5 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 6 Jun 2025 15:20:01 +0300 Subject: [PATCH 120/130] replace placeholder text with links to a github issue --- src/zarr/core/dtype/registry.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/zarr/core/dtype/registry.py b/src/zarr/core/dtype/registry.py index 0423f69dbe..308bde602c 100644 --- a/src/zarr/core/dtype/registry.py +++ b/src/zarr/core/dtype/registry.py @@ -56,7 +56,7 @@ def match_dtype(self, dtype: TBaseDType) -> ZDType[TBaseDType, TBaseScalar]: "data type. " "In this case you should construct your array by providing a specific Zarr data " 'type. For a list of Zarr data types that are compatible with the numpy "Object"' - "data type, see xxxxxxxxxxx" + "data type, see https://github.com/zarr-developers/zarr-python/issues/3117" ) raise ValueError(msg) matched: list[ZDType[TBaseDType, TBaseScalar]] = [] @@ -71,7 +71,7 @@ def match_dtype(self, dtype: TBaseDType) -> ZDType[TBaseDType, TBaseScalar]: f"Multiple data type wrappers found that match dtype '{dtype}': {matched}. " "You should unregister one of these data types, or avoid Zarr data type inference " "entirely by providing a specific Zarr data type when creating your array." 
- "For more information, see xxxxxxxxxxxxxxxxxx" + "For more information, see https://github.com/zarr-developers/zarr-python/issues/3117" ) raise ValueError(msg) raise ValueError(f"No data type wrapper found that matches dtype '{dtype}'") From cf5504194723d25cdbd38db5c2d6ed5b4c0022dc Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 6 Jun 2025 19:25:30 +0300 Subject: [PATCH 121/130] refactor fixed-length bytes dtypes --- src/zarr/core/config.py | 4 +- src/zarr/core/dtype/__init__.py | 25 +- src/zarr/core/dtype/npy/bool.py | 8 +- src/zarr/core/dtype/npy/bytes.py | 283 ++++++++++++++++++ src/zarr/core/dtype/npy/complex.py | 8 +- src/zarr/core/dtype/npy/float.py | 8 +- src/zarr/core/dtype/npy/int.py | 24 +- src/zarr/core/dtype/npy/string.py | 153 +++------- .../dtype/npy/{sized.py => structured.py} | 132 +------- src/zarr/core/dtype/npy/time.py | 12 +- src/zarr/core/dtype/npy/vlen_bytes.py | 75 ----- src/zarr/core/dtype/wrapper.py | 40 +-- src/zarr/core/metadata/v3.py | 4 +- tests/test_array.py | 12 +- tests/test_codecs/test_vlen.py | 1 + tests/test_config.py | 4 +- tests/test_dtype/conftest.py | 2 +- tests/test_dtype/test_npy/test_bytes.py | 138 +++++++++ tests/test_dtype/test_npy/test_string.py | 78 +---- .../{test_sized.py => test_structured.py} | 47 --- tests/test_dtype/test_wrapper.py | 4 +- tests/test_properties.py | 10 + tests/test_regression/test_regression.py | 20 +- tests/test_v2.py | 7 +- 24 files changed, 599 insertions(+), 500 deletions(-) create mode 100644 src/zarr/core/dtype/npy/bytes.py rename src/zarr/core/dtype/npy/{sized.py => structured.py} (56%) delete mode 100644 src/zarr/core/dtype/npy/vlen_bytes.py create mode 100644 tests/test_dtype/test_npy/test_bytes.py rename tests/test_dtype/test_npy/{test_sized.py => test_structured.py} (71%) diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 97df060cb8..74e9bdd8dd 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -149,8 +149,8 @@ def 
categorize_data_type(dtype: ZDType[Any, Any]) -> DTypeCategory: This is used by the config system to determine how to encode arrays with the associated data type when the user has not specified a particular serialization scheme. """ - from zarr.core.dtype import VariableLengthString + from zarr.core.dtype import VariableLengthUTF8 - if isinstance(dtype, VariableLengthString): + if isinstance(dtype, VariableLengthUTF8): return "variable-length-string" return "default" diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py index 575086cb6f..25e5163e43 100644 --- a/src/zarr/core/dtype/__init__.py +++ b/src/zarr/core/dtype/__init__.py @@ -4,15 +4,14 @@ from zarr.core.dtype.common import DataTypeValidationError from zarr.core.dtype.npy.bool import Bool +from zarr.core.dtype.npy.bytes import NullTerminatedBytes, RawBytes, VariableLengthBytes from zarr.core.dtype.npy.complex import Complex64, Complex128 from zarr.core.dtype.npy.float import Float16, Float32, Float64 from zarr.core.dtype.npy.int import Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64 -from zarr.core.dtype.npy.sized import ( - FixedLengthBytes, +from zarr.core.dtype.npy.structured import ( Structured, ) from zarr.core.dtype.npy.time import DateTime64, TimeDelta64 -from zarr.core.dtype.npy.vlen_bytes import VariableLengthBytes if TYPE_CHECKING: from zarr.core.common import ZarrFormat @@ -24,9 +23,8 @@ from zarr.core.common import JSON from zarr.core.dtype.npy.string import ( - FixedLengthASCII, FixedLengthUTF32, - VariableLengthString, + VariableLengthUTF8, ) from zarr.core.dtype.registry import DataTypeRegistry from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType @@ -38,8 +36,6 @@ "DataTypeRegistry", "DataTypeValidationError", "DateTime64", - "FixedLengthASCII", - "FixedLengthBytes", "FixedLengthUTF32", "Float16", "Float32", @@ -48,6 +44,8 @@ "Int16", "Int32", "Int64", + "NullTerminatedBytes", + "RawBytes", "Structured", "TBaseDType", "TBaseScalar", @@ -57,7 
+55,7 @@ "UInt16", "UInt32", "UInt64", - "VariableLengthString", + "VariableLengthUTF8", "ZDType", "data_type_registry", "parse_data_type", @@ -74,19 +72,22 @@ ComplexFloatDType = Complex64 | Complex128 COMPLEX_FLOAT_DTYPE: Final = Complex64, Complex128 -StringDType = FixedLengthUTF32 | VariableLengthString | FixedLengthASCII -STRING_DTYPE: Final = FixedLengthUTF32, VariableLengthString, FixedLengthASCII +StringDType = FixedLengthUTF32 | VariableLengthUTF8 +STRING_DTYPE: Final = FixedLengthUTF32, VariableLengthUTF8 TimeDType = DateTime64 | TimeDelta64 TIME_DTYPE: Final = DateTime64, TimeDelta64 +BytesDType = RawBytes | NullTerminatedBytes | VariableLengthBytes +BYTES_DTYPE: Final = RawBytes, NullTerminatedBytes, VariableLengthBytes + AnyDType = ( Bool | IntegerDType | FloatDType | ComplexFloatDType | StringDType - | FixedLengthBytes + | BytesDType | Structured | TimeDType | VariableLengthBytes @@ -99,7 +100,7 @@ *FLOAT_DTYPE, *COMPLEX_FLOAT_DTYPE, *STRING_DTYPE, - FixedLengthBytes, + *BYTES_DTYPE, Structured, *TIME_DTYPE, VariableLengthBytes, diff --git a/src/zarr/core/dtype/npy/bool.py b/src/zarr/core/dtype/npy/bool.py index b1800127e8..bee42b6a13 100644 --- a/src/zarr/core/dtype/npy/bool.py +++ b/src/zarr/core/dtype/npy/bool.py @@ -27,14 +27,14 @@ class Bool(ZDType[np.dtypes.BoolDType, np.bool_], HasItemSize): dtype_cls = np.dtypes.BoolDType @classmethod - def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: + def _from_native_dtype_unchecked(cls, dtype: TBaseDType) -> Self: return cls() def to_native_dtype(self: Self) -> np.dtypes.BoolDType: return self.dtype_cls() @classmethod - def check_json_v2( + def _check_json_v2( cls, data: JSON, *, object_codec_id: str | None = None ) -> TypeGuard[Literal["|b1"]]: """ @@ -43,7 +43,7 @@ def check_json_v2( return data in cls._zarr_v2_names @classmethod - def check_json_v3(cls, data: JSON) -> TypeGuard[Literal["bool"]]: + def _check_json_v3(cls, data: JSON) -> TypeGuard[Literal["bool"]]: return data == 
cls._zarr_v3_name @overload @@ -114,7 +114,7 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bool_: return self._cast_scalar_unchecked(data) raise TypeError(f"Invalid type: {data}. Expected a boolean.") # pragma: no cover - def check_scalar(self, data: object) -> bool: + def _check_scalar(self, data: object) -> bool: # Anything can become a bool return True diff --git a/src/zarr/core/dtype/npy/bytes.py b/src/zarr/core/dtype/npy/bytes.py new file mode 100644 index 0000000000..9d815ab849 --- /dev/null +++ b/src/zarr/core/dtype/npy/bytes.py @@ -0,0 +1,283 @@ +import base64 +import re +from dataclasses import dataclass +from typing import Any, ClassVar, Literal, Self, TypedDict, TypeGuard, cast, overload + +import numpy as np + +from zarr.core.common import JSON, NamedConfig, ZarrFormat +from zarr.core.dtype.common import HasItemSize, HasLength, HasObjectCodec, v3_unstable_dtype_warning +from zarr.core.dtype.npy.common import check_json_str +from zarr.core.dtype.wrapper import DTypeJSON_V2, DTypeJSON_V3, TBaseDType, ZDType + + +class FixedLengthBytesConfig(TypedDict): + length_bytes: int + + +NTBytesJSONV3 = NamedConfig[Literal["null_terminated_bytes"], FixedLengthBytesConfig] +RawBytesJSONV3 = NamedConfig[Literal["raw_bytes"], FixedLengthBytesConfig] + + +@dataclass(frozen=True, kw_only=True) +class NullTerminatedBytes(ZDType[np.dtypes.BytesDType[int], np.bytes_], HasLength, HasItemSize): + dtype_cls = np.dtypes.BytesDType + _zarr_v3_name: ClassVar[Literal["null_terminated_bytes"]] = "null_terminated_bytes" + + @classmethod + def _from_native_dtype_unchecked(cls, dtype: TBaseDType) -> Self: + return cls(length=dtype.itemsize) + + def to_native_dtype(self) -> np.dtypes.BytesDType[int]: + return self.dtype_cls(self.length) + + @classmethod + def _check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: + """ + Check that the input is a valid JSON representation of a numpy S dtype. 
+ """ + # match |S1, |S2, etc + return isinstance(data, str) and re.match(r"^\|S\d+$", data) is not None + + @classmethod + def _check_json_v3(cls, data: JSON) -> TypeGuard[NTBytesJSONV3]: + return ( + isinstance(data, dict) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == cls._zarr_v3_name + and isinstance(data["configuration"], dict) + and "length_bytes" in data["configuration"] + ) + + @overload + def to_json(self, zarr_format: Literal[2]) -> str: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> NTBytesJSONV3: ... + + def to_json(self, zarr_format: ZarrFormat) -> str | NTBytesJSONV3: + if zarr_format == 2: + return self.to_native_dtype().str + elif zarr_format == 3: + v3_unstable_dtype_warning(self) + return { + "name": self._zarr_v3_name, + "configuration": {"length_bytes": self.length}, + } + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + @classmethod + def _from_json_unchecked( + cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat + ) -> Self: + if zarr_format == 2: + return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls(length=data["configuration"]["length_bytes"]) # type: ignore[index, call-overload] + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + def default_scalar(self) -> np.bytes_: + return np.bytes_(b"") + + def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: + as_bytes = self.cast_scalar(data) + return base64.standard_b64encode(as_bytes).decode("ascii") + + def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: + if check_json_str(data): + return self.to_native_dtype().type(base64.standard_b64decode(data.encode("ascii"))) + raise TypeError( + f"Invalid type: {data}. Expected a base64-encoded string." 
+ ) # pragma: no cover + + def _check_scalar(self, data: object) -> bool: + # this is generous for backwards compatibility + return isinstance(data, np.bytes_ | str | bytes | int) + + def _cast_scalar_unchecked(self, data: object) -> np.bytes_: + # We explicitly truncate the result because of the following numpy behavior: + # >>> x = np.dtype('S3').type('hello world') + # >>> x + # np.bytes_(b'hello world') + # >>> x.dtype + # dtype('S11') + + if isinstance(data, int): + return self.to_native_dtype().type(str(data)[: self.length]) + else: + return self.to_native_dtype().type(data[: self.length]) # type: ignore[index] + + @property + def item_size(self) -> int: + return self.length + + +@dataclass(frozen=True, kw_only=True) +class RawBytes(ZDType[np.dtypes.VoidDType[int], np.void], HasLength, HasItemSize): + # np.dtypes.VoidDType is specified in an odd way in numpy + # it cannot be used to create instances of the dtype + # so we have to tell mypy to ignore this here + dtype_cls = np.dtypes.VoidDType # type: ignore[assignment] + _zarr_v3_name: ClassVar[Literal["raw_bytes"]] = "raw_bytes" + + @classmethod + def _from_native_dtype_unchecked(cls, dtype: TBaseDType) -> Self: + return cls(length=dtype.itemsize) + + def to_native_dtype(self) -> np.dtypes.VoidDType[int]: + # Numpy does not allow creating a void type + # by invoking np.dtypes.VoidDType directly + return cast("np.dtypes.VoidDType[int]", np.dtype(f"V{self.length}")) + + @classmethod + def _check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: + # Check that the dtype is |V1, |V2, ... 
+ return isinstance(data, str) and re.match(r"^\|V\d+$", data) is not None + + @classmethod + def _check_json_v3(cls, data: JSON) -> TypeGuard[RawBytesJSONV3]: + return ( + isinstance(data, dict) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == cls._zarr_v3_name + and isinstance(data["configuration"], dict) + and set(data["configuration"].keys()) == {"length_bytes"} + ) + + @overload + def to_json(self, zarr_format: Literal[2]) -> str: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> RawBytesJSONV3: ... + + def to_json(self, zarr_format: ZarrFormat) -> str | RawBytesJSONV3: + if zarr_format == 2: + return self.to_native_dtype().str + elif zarr_format == 3: + return {"name": self._zarr_v3_name, "configuration": {"length_bytes": self.length}} + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + @classmethod + def _from_json_unchecked( + cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat + ) -> Self: + if zarr_format == 2: + return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls(length=data["configuration"]["length_bytes"]) # type: ignore[index, call-overload] + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + @classmethod + def _check_native_dtype( + cls: type[Self], dtype: TBaseDType + ) -> TypeGuard[np.dtypes.VoidDType[Any]]: + """ + Numpy void dtype comes in two forms: + * If the ``fields`` attribute is ``None``, then the dtype represents N raw bytes. + * If the ``fields`` attribute is not ``None``, then the dtype represents a structured dtype, + + In this check we ensure that ``fields`` is ``None``. + + Parameters + ---------- + dtype : TDType + The dtype to check. + + Returns + ------- + Bool + True if the dtype matches, False otherwise. 
+        """
+        return cls.dtype_cls is type(dtype) and dtype.fields is None  # type: ignore[has-type]
+
+    def default_scalar(self) -> np.void:
+        return self.to_native_dtype().type(("\x00" * self.length).encode("ascii"))
+
+    def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str:
+        return base64.standard_b64encode(self.cast_scalar(data).tobytes()).decode("ascii")
+
+    def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void:
+        if check_json_str(data):
+            return self.to_native_dtype().type(base64.standard_b64decode(data))
+        raise TypeError(f"Invalid type: {data}. Expected a string.")  # pragma: no cover
+
+    def _check_scalar(self, data: object) -> bool:
+        return isinstance(data, np.bytes_ | str | bytes | np.void)
+
+    def _cast_scalar_unchecked(self, data: object) -> np.void:
+        native_dtype = self.to_native_dtype()
+        # Without the second argument, numpy will return a void scalar for dtype V1.
+        # The second argument ensures that, if native_dtype is something like V10,
+        # the result will actually be a V10 scalar.
+        return native_dtype.type(data, native_dtype)
+
+    @property
+    def item_size(self) -> int:
+        return self.length
+
+
+@dataclass(frozen=True, kw_only=True)
+class VariableLengthBytes(ZDType[np.dtypes.ObjectDType, bytes], HasObjectCodec):
+    dtype_cls = np.dtypes.ObjectDType
+    _zarr_v3_name: ClassVar[Literal["variable_length_bytes"]] = "variable_length_bytes"
+    object_codec_id = "vlen-bytes"
+
+    @classmethod
+    def _from_native_dtype_unchecked(cls, dtype: TBaseDType) -> Self:
+        return cls()
+
+    def to_native_dtype(self) -> np.dtypes.ObjectDType:
+        return self.dtype_cls()
+
+    @classmethod
+    def _check_json_v2(
+        cls, data: JSON, *, object_codec_id: str | None = None
+    ) -> TypeGuard[Literal["|O"]]:
+        """
+        Check that the input is a valid JSON representation of a numpy O dtype, and that the
+        object codec id is appropriate for variable-length bytes.
+ """ + return data == "|O" and object_codec_id == cls.object_codec_id + + @classmethod + def _check_json_v3(cls, data: JSON) -> TypeGuard[Literal["variable_length_bytes"]]: + return data == cls._zarr_v3_name + + @overload + def to_json(self, zarr_format: Literal[2]) -> Literal["|O"]: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> Literal["variable_length_bytes"]: ... + + def to_json(self, zarr_format: ZarrFormat) -> Literal["|O", "variable_length_bytes"]: + if zarr_format == 2: + return "|O" + elif zarr_format == 3: + v3_unstable_dtype_warning(self) + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + @classmethod + def _from_json_unchecked( + cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat + ) -> Self: + return cls() + + def default_scalar(self) -> bytes: + return b"" + + def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: + return base64.standard_b64encode(data).decode("ascii") # type: ignore[arg-type] + + def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> bytes: + if check_json_str(data): + return base64.standard_b64decode(data.encode("ascii")) + raise TypeError(f"Invalid type: {data}. 
Expected a string.") # pragma: no cover + + def _check_scalar(self, data: object) -> bool: + return isinstance(data, bytes | str) + + def _cast_scalar_unchecked(self, data: object) -> bytes: + if isinstance(data, str): + return bytes(data, encoding="utf-8") + return bytes(data) # type: ignore[no-any-return, call-overload] diff --git a/src/zarr/core/dtype/npy/complex.py b/src/zarr/core/dtype/npy/complex.py index f7db6fe94d..f68640e4ce 100644 --- a/src/zarr/core/dtype/npy/complex.py +++ b/src/zarr/core/dtype/npy/complex.py @@ -37,7 +37,7 @@ class BaseComplex(ZDType[TComplexDType_co, TComplexScalar_co], HasEndianness, Ha _zarr_v2_names: ClassVar[tuple[str, ...]] @classmethod - def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: + def _from_native_dtype_unchecked(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) @@ -76,17 +76,17 @@ def _from_json_unchecked( raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod - def check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: + def _check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: """ Check that the input is a valid JSON representation of this data type. 
""" return data in cls._zarr_v2_names @classmethod - def check_json_v3(cls, data: JSON) -> TypeGuard[str]: + def _check_json_v3(cls, data: JSON) -> TypeGuard[str]: return data == cls._zarr_v3_name - def check_scalar(self, data: object) -> bool: + def _check_scalar(self, data: object) -> bool: return isinstance(data, ComplexLike) def _cast_scalar_unchecked(self, data: object) -> TComplexScalar_co: diff --git a/src/zarr/core/dtype/npy/float.py b/src/zarr/core/dtype/npy/float.py index 174b2338ae..f87f032581 100644 --- a/src/zarr/core/dtype/npy/float.py +++ b/src/zarr/core/dtype/npy/float.py @@ -28,7 +28,7 @@ class BaseFloat(ZDType[TFloatDType_co, TFloatScalar_co], HasEndianness, HasItemS _zarr_v2_names: ClassVar[tuple[str, ...]] @classmethod - def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: + def _from_native_dtype_unchecked(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) @@ -67,17 +67,17 @@ def _from_json_unchecked( raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod - def check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: + def _check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: """ Check that the input is a valid JSON representation of this data type. 
""" return data in cls._zarr_v2_names @classmethod - def check_json_v3(cls, data: JSON) -> TypeGuard[str]: + def _check_json_v3(cls, data: JSON) -> TypeGuard[str]: return data == cls._zarr_v3_name - def check_scalar(self, data: object) -> TypeGuard[FloatLike]: + def _check_scalar(self, data: object) -> TypeGuard[FloatLike]: return isinstance(data, FloatLike) def _cast_scalar_unchecked(self, data: object) -> TFloatScalar_co: diff --git a/src/zarr/core/dtype/npy/int.py b/src/zarr/core/dtype/npy/int.py index 92705917f9..aed577ee44 100644 --- a/src/zarr/core/dtype/npy/int.py +++ b/src/zarr/core/dtype/npy/int.py @@ -47,17 +47,17 @@ class BaseInt(ZDType[TIntDType_co, TIntScalar_co], HasItemSize): _zarr_v2_names: ClassVar[tuple[str, ...]] @classmethod - def check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: + def _check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: """ Check that the input is a valid JSON representation of this data type. 
""" return data in cls._zarr_v2_names @classmethod - def check_json_v3(cls, data: JSON) -> TypeGuard[str]: + def _check_json_v3(cls, data: JSON) -> TypeGuard[str]: return data == cls._zarr_v3_name - def check_scalar(self, data: object) -> TypeGuard[IntLike]: + def _check_scalar(self, data: object) -> TypeGuard[IntLike]: return isinstance(data, IntLike) def _cast_scalar_unchecked(self, data: object) -> TIntScalar_co: @@ -146,7 +146,7 @@ def to_json(self, zarr_format: ZarrFormat) -> Literal["int8", "|i1"]: raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod - def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: + def _from_native_dtype_unchecked(cls, dtype: TBaseDType) -> Self: return cls() def to_native_dtype(self: Self) -> np.dtypes.Int8DType: @@ -196,7 +196,7 @@ def to_json(self, zarr_format: ZarrFormat) -> Literal["uint8", "|u1"]: raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod - def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: + def _from_native_dtype_unchecked(cls, dtype: TBaseDType) -> Self: return cls() def to_native_dtype(self: Self) -> np.dtypes.UInt8DType: @@ -246,7 +246,7 @@ def to_json(self, zarr_format: ZarrFormat) -> Literal["int16", ">i2", " Self: + def _from_native_dtype_unchecked(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) @@ -303,7 +303,7 @@ def to_json(self, zarr_format: ZarrFormat) -> Literal["uint16", ">u2", " Self: + def _from_native_dtype_unchecked(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) @@ -366,14 +366,14 @@ def from_native_dtype(cls: type[Self], dtype: TBaseDType) -> Self: # despite the two classes being different. 
Thus we will create an instance of `cls` with the # latter dtype, after pulling in the byte order of the input if dtype == np.dtypes.Int32DType(): - return cls._from_native_dtype_unsafe( + return cls._from_native_dtype_unchecked( np.dtypes.Int32DType().newbyteorder(dtype.byteorder) ) else: return super().from_native_dtype(dtype) @classmethod - def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: + def _from_native_dtype_unchecked(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) @@ -427,7 +427,7 @@ def to_json(self, zarr_format: ZarrFormat) -> Literal["uint32", ">u4", " Self: + def _from_native_dtype_unchecked(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) @@ -481,7 +481,7 @@ def to_json(self, zarr_format: ZarrFormat) -> Literal["int64", ">i8", " Self: + def _from_native_dtype_unchecked(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) @@ -537,7 +537,7 @@ def to_json(self, zarr_format: ZarrFormat) -> Literal["uint64", ">u8", " Self: + def _from_native_dtype_unchecked(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) diff --git a/src/zarr/core/dtype/npy/string.py b/src/zarr/core/dtype/npy/string.py index 2299b7aab1..853f32806d 100644 --- a/src/zarr/core/dtype/npy/string.py +++ b/src/zarr/core/dtype/npy/string.py @@ -1,6 +1,5 @@ from __future__ import annotations -import base64 import re from dataclasses import dataclass from typing import TYPE_CHECKING, ClassVar, Literal, Self, TypedDict, TypeGuard, cast, overload @@ -8,7 +7,13 @@ import numpy as np from zarr.core.common import NamedConfig -from zarr.core.dtype.common import HasEndianness, HasItemSize, HasLength, 
HasObjectCodec +from zarr.core.dtype.common import ( + HasEndianness, + HasItemSize, + HasLength, + HasObjectCodec, + v3_unstable_dtype_warning, +) from zarr.core.dtype.npy.common import ( EndiannessNumpy, check_json_str, @@ -29,96 +34,7 @@ class LengthBytesConfig(TypedDict): # TDO: Fix this terrible name -FixedLengthASCIIJSONV3 = NamedConfig[Literal["fixed_length_ascii"], LengthBytesConfig] - - -@dataclass(frozen=True, kw_only=True) -class FixedLengthASCII(ZDType[np.dtypes.BytesDType[int], np.bytes_], HasLength, HasItemSize): - dtype_cls = np.dtypes.BytesDType - _zarr_v3_name: ClassVar[Literal["fixed_length_ascii"]] = "fixed_length_ascii" - - @classmethod - def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: - return cls(length=dtype.itemsize) - - def to_native_dtype(self) -> np.dtypes.BytesDType[int]: - return self.dtype_cls(self.length) - - @classmethod - def check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: - """ - Check that the input is a valid JSON representation of a numpy S dtype. - """ - # match |S1, |S2, etc - return isinstance(data, str) and re.match(r"^\|S\d+$", data) is not None - - @classmethod - def check_json_v3(cls, data: JSON) -> TypeGuard[FixedLengthASCIIJSONV3]: - return ( - isinstance(data, dict) - and set(data.keys()) == {"name", "configuration"} - and data["name"] == cls._zarr_v3_name - and isinstance(data["configuration"], dict) - and "length_bytes" in data["configuration"] - ) - - @overload - def to_json(self, zarr_format: Literal[2]) -> str: ... - - @overload - def to_json(self, zarr_format: Literal[3]) -> FixedLengthASCIIJSONV3: ... 
- - def to_json(self, zarr_format: ZarrFormat) -> str | FixedLengthASCIIJSONV3: - if zarr_format == 2: - return self.to_native_dtype().str - elif zarr_format == 3: - return { - "name": self._zarr_v3_name, - "configuration": {"length_bytes": self.length}, - } - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def _from_json_unchecked( - cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat - ) -> Self: - if zarr_format == 2: - return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls(length=data["configuration"]["length_bytes"]) # type: ignore[index, call-overload] - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - def default_scalar(self) -> np.bytes_: - return np.bytes_(b"") - - def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: - return base64.standard_b64encode(data).decode("ascii") # type: ignore[arg-type] - - def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: - if check_json_str(data): - return self.to_native_dtype().type(base64.standard_b64decode(data.encode("ascii"))) - raise TypeError(f"Invalid type: {data}. 
Expected a string.") # pragma: no cover - - def check_scalar(self, data: object) -> bool: - # this is generous for backwards compatibility - return isinstance(data, np.bytes_ | str | bytes | int) - - def _cast_scalar_unchecked(self, data: object) -> np.bytes_: - # We explicitly truncate the result because of the following numpy behavior: - # >>> x = np.dtype('S3').type('hello world') - # >>> x - # np.bytes_(b'hello world') - # >>> x.dtype - # dtype('S11') - - if isinstance(data, int): - return self.to_native_dtype().type(str(data)[: self.length]) - else: - return self.to_native_dtype().type(data[: self.length]) # type: ignore[index] - - @property - def item_size(self) -> int: - return self.length +FixedLengthBytesJSONV3 = NamedConfig[Literal["fixed_length_bytes"], LengthBytesConfig] # TODO: Fix this terrible name @@ -134,7 +50,7 @@ class FixedLengthUTF32( code_point_bytes: ClassVar[int] = 4 # utf32 is 4 bytes per code point @classmethod - def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: + def _from_native_dtype_unchecked(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls( length=dtype.itemsize // (cls.code_point_bytes), @@ -146,14 +62,14 @@ def to_native_dtype(self) -> np.dtypes.StrDType[int]: return self.dtype_cls(self.length).newbyteorder(byte_order) @classmethod - def check_json_v2(cls, data: JSON, object_codec_id: str | None = None) -> TypeGuard[str]: + def _check_json_v2(cls, data: JSON, object_codec_id: str | None = None) -> TypeGuard[str]: """ Check that the input is a valid JSON representation of a numpy S dtype. 
""" return isinstance(data, str) and re.match(r"^[><]U\d+$", data) is not None @classmethod - def check_json_v3(cls, data: JSON) -> TypeGuard[FixedLengthUTF32JSONV3]: + def _check_json_v3(cls, data: JSON) -> TypeGuard[FixedLengthUTF32JSONV3]: return ( isinstance(data, dict) and set(data.keys()) == {"name", "configuration"} @@ -174,6 +90,7 @@ def to_json(self, zarr_format: ZarrFormat) -> str | FixedLengthUTF32JSONV3: if zarr_format == 2: return self.to_native_dtype().str elif zarr_format == 3: + v3_unstable_dtype_warning(self) return { "name": self._zarr_v3_name, "configuration": {"length_bytes": self.length * self.code_point_bytes}, @@ -201,7 +118,7 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_: return self.to_native_dtype().type(data) raise TypeError(f"Invalid type: {data}. Expected a string.") # pragma: no cover - def check_scalar(self, data: object) -> bool: + def _check_scalar(self, data: object) -> TypeGuard[str | np.str_ | bytes | int]: # this is generous for backwards compatibility return isinstance(data, str | np.str_ | bytes | int) @@ -223,23 +140,32 @@ def item_size(self) -> int: return self.length * self.code_point_bytes +def check_vlen_string_json_scalar(data: object) -> TypeGuard[int | str | float]: + """ + This function checks the type of JSON-encoded variable length strings. 
It is generous for + backwards compatibility, as zarr-python v2 would use ints for variable length strings + fill values + """ + return isinstance(data, int | str | float) + + if _NUMPY_SUPPORTS_VLEN_STRING: @dataclass(frozen=True, kw_only=True) - class VariableLengthString(ZDType[np.dtypes.StringDType, str], HasObjectCodec): # type: ignore[type-var] + class VariableLengthUTF8(ZDType[np.dtypes.StringDType, str], HasObjectCodec): # type: ignore[type-var] dtype_cls = np.dtypes.StringDType _zarr_v3_name: ClassVar[Literal["variable_length_utf8"]] = "variable_length_utf8" object_codec_id = "vlen-utf8" @classmethod - def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: + def _from_native_dtype_unchecked(cls, dtype: TBaseDType) -> Self: return cls() def to_native_dtype(self) -> np.dtypes.StringDType: return self.dtype_cls() @classmethod - def check_json_v2( + def _check_json_v2( cls, data: JSON, *, object_codec_id: str | None = None ) -> TypeGuard[Literal["|O"]]: """ @@ -249,7 +175,7 @@ def check_json_v2( return data == "|O" and object_codec_id == cls.object_codec_id @classmethod - def check_json_v3(cls, data: JSON) -> TypeGuard[Literal["variable_length_utf8"]]: + def _check_json_v3(cls, data: JSON) -> TypeGuard[Literal["variable_length_utf8"]]: return data == cls._zarr_v3_name @overload @@ -265,6 +191,7 @@ def to_json(self, zarr_format: ZarrFormat) -> Literal["|O", "variable_length_utf # that practice return "|O" elif zarr_format == 3: + v3_unstable_dtype_warning(self) return self._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @@ -278,14 +205,16 @@ def default_scalar(self) -> str: return "" def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: - return str(data) + if self._check_scalar(data): + return data + raise TypeError(f"Invalid type: {data}. 
Expected a string.") def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> str: - if not check_json_str(data): - raise TypeError(f"Invalid type: {data}. Expected a string.") - return data + if not check_vlen_string_json_scalar(data): + raise TypeError(f"Invalid type: {data}. Expected a string or number.") + return str(data) - def check_scalar(self, data: object) -> bool: + def _check_scalar(self, data: object) -> TypeGuard[str]: return isinstance(data, str) def _cast_scalar_unchecked(self, data: object) -> str: @@ -294,20 +223,20 @@ def _cast_scalar_unchecked(self, data: object) -> str: else: # Numpy pre-2 does not have a variable length string dtype, so we use the Object dtype instead. @dataclass(frozen=True, kw_only=True) - class VariableLengthString(ZDType[np.dtypes.ObjectDType, str], HasObjectCodec): # type: ignore[no-redef] + class VariableLengthUTF8(ZDType[np.dtypes.ObjectDType, str], HasObjectCodec): # type: ignore[no-redef] dtype_cls = np.dtypes.ObjectDType _zarr_v3_name: ClassVar[Literal["variable_length_utf8"]] = "variable_length_utf8" object_codec_id = "vlen-utf8" @classmethod - def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: + def _from_native_dtype_unchecked(cls, dtype: TBaseDType) -> Self: return cls() def to_native_dtype(self) -> np.dtypes.ObjectDType: return self.dtype_cls() @classmethod - def check_json_v2( + def _check_json_v2( cls, data: JSON, *, object_codec_id: str | None = None ) -> TypeGuard[Literal["|O"]]: """ @@ -317,7 +246,7 @@ def check_json_v2( return data == "|O" and object_codec_id == cls.object_codec_id @classmethod - def check_json_v3(cls, data: JSON) -> TypeGuard[Literal["variable_length_utf8"]]: + def _check_json_v3(cls, data: JSON) -> TypeGuard[Literal["variable_length_utf8"]]: return data == cls._zarr_v3_name @overload @@ -343,7 +272,9 @@ def default_scalar(self) -> str: return "" def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: - return data # type: ignore[return-value] + 
if self._check_scalar(data): + return data + raise TypeError(f"Invalid type: {data}. Expected a string.") def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> str: """ @@ -353,7 +284,7 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> str: raise TypeError(f"Invalid type: {data}. Expected a string.") return data - def check_scalar(self, data: object) -> bool: + def _check_scalar(self, data: object) -> TypeGuard[str]: return isinstance(data, str) def _cast_scalar_unchecked(self, data: object) -> str: diff --git a/src/zarr/core/dtype/npy/sized.py b/src/zarr/core/dtype/npy/structured.py similarity index 56% rename from src/zarr/core/dtype/npy/sized.py rename to src/zarr/core/dtype/npy/structured.py index 69d6145ad4..579e0a9e27 100644 --- a/src/zarr/core/dtype/npy/sized.py +++ b/src/zarr/core/dtype/npy/structured.py @@ -1,8 +1,6 @@ -import base64 -import re from collections.abc import Sequence from dataclasses import dataclass -from typing import Any, ClassVar, Literal, Self, TypedDict, TypeGuard, cast, overload +from typing import Literal, Self, TypeGuard, cast, overload import numpy as np @@ -10,7 +8,6 @@ from zarr.core.dtype.common import ( DataTypeValidationError, HasItemSize, - HasLength, v3_unstable_dtype_warning, ) from zarr.core.dtype.npy.common import ( @@ -21,117 +18,6 @@ from zarr.core.dtype.wrapper import DTypeJSON_V2, DTypeJSON_V3, TBaseDType, TBaseScalar, ZDType -class FixedLengthBytesConfig(TypedDict): - length_bytes: int - - -FixedLengthBytesJSONV3 = NamedConfig[Literal["fixed_length_bytes"], FixedLengthBytesConfig] - - -@dataclass(frozen=True, kw_only=True) -class FixedLengthBytes(ZDType[np.dtypes.VoidDType[int], np.void], HasLength, HasItemSize): - # np.dtypes.VoidDType is specified in an odd way in numpy - # it cannot be used to create instances of the dtype - # so we have to tell mypy to ignore this here - dtype_cls = np.dtypes.VoidDType # type: ignore[assignment] - _zarr_v3_name: 
ClassVar[Literal["fixed_length_bytes"]] = "fixed_length_bytes" - - @classmethod - def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: - return cls(length=dtype.itemsize) - - def to_native_dtype(self) -> np.dtypes.VoidDType[int]: - # Numpy does not allow creating a void type - # by invoking np.dtypes.VoidDType directly - return cast("np.dtypes.VoidDType[int]", np.dtype(f"V{self.length}")) - - @classmethod - def check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: - # Check that the dtype is |V1, |V2, ... - return isinstance(data, str) and re.match(r"^\|V\d+$", data) is not None - - @classmethod - def check_json_v3(cls, data: JSON) -> TypeGuard[FixedLengthBytesJSONV3]: - return ( - isinstance(data, dict) - and set(data.keys()) == {"name", "configuration"} - and data["name"] == cls._zarr_v3_name - and isinstance(data["configuration"], dict) - and set(data["configuration"].keys()) == {"length_bytes"} - ) - - @overload - def to_json(self, zarr_format: Literal[2]) -> str: ... - - @overload - def to_json(self, zarr_format: Literal[3]) -> FixedLengthBytesJSONV3: ... 
- - def to_json(self, zarr_format: ZarrFormat) -> str | FixedLengthBytesJSONV3: - if zarr_format == 2: - return self.to_native_dtype().str - elif zarr_format == 3: - return {"name": self._zarr_v3_name, "configuration": {"length_bytes": self.length}} - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def _from_json_unchecked( - cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat - ) -> Self: - if zarr_format == 2: - return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls(length=data["configuration"]["length_bytes"]) # type: ignore[index, call-overload] - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def check_native_dtype( - cls: type[Self], dtype: TBaseDType - ) -> TypeGuard[np.dtypes.VoidDType[Any]]: - """ - Numpy void dtype comes in two forms: - * If the ``fields`` attribute is ``None``, then the dtype represents N raw bytes. - * If the ``fields`` attribute is not ``None``, then the dtype represents a structured dtype, - - In this check we ensure that ``fields`` is ``None``. - - Parameters - ---------- - dtype : TDType - The dtype to check. - - Returns - ------- - Bool - True if the dtype matches, False otherwise. - """ - return cls.dtype_cls is type(dtype) and dtype.fields is None # type: ignore[has-type] - - def default_scalar(self) -> np.void: - return self.to_native_dtype().type(("\x00" * self.length).encode("ascii")) - - def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: - return base64.standard_b64encode(self.cast_scalar(data).tobytes()).decode("ascii") - - def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: - if check_json_str(data): - return self.to_native_dtype().type(base64.standard_b64decode(data)) - raise TypeError(f"Invalid type: {data}. 
Expected a string.") # pragma: no cover - - def check_scalar(self, data: object) -> bool: - return isinstance(data, np.bytes_ | str | bytes | np.void) - - def _cast_scalar_unchecked(self, data: object) -> np.void: - native_dtype = self.to_native_dtype() - # Without the second argument, numpy will return a void scalar for dtype V1. - # The second argument ensures that, if native_dtype is something like V10, - # the result will actually be a V10 scalar. - return native_dtype.type(data, native_dtype) - - @property - def item_size(self) -> int: - return self.length - - # TODO: tighten this up, get a v3 spec in place, handle endianness, etc. @dataclass(frozen=True, kw_only=True) class Structured(ZDType[np.dtypes.VoidDType[int], np.void], HasItemSize): @@ -153,7 +39,7 @@ def _cast_scalar_unchecked(self, data: object) -> np.void: return cast("np.void", res) @classmethod - def check_native_dtype(cls, dtype: TBaseDType) -> TypeGuard[np.dtypes.VoidDType[int]]: + def _check_native_dtype(cls, dtype: TBaseDType) -> TypeGuard[np.dtypes.VoidDType[int]]: """ Check that this dtype is a numpy structured dtype @@ -167,10 +53,10 @@ def check_native_dtype(cls, dtype: TBaseDType) -> TypeGuard[np.dtypes.VoidDType[ TypeGuard[np.dtypes.VoidDType] True if the dtype matches, False otherwise. 
""" - return super().check_native_dtype(dtype) and dtype.fields is not None + return super()._check_native_dtype(dtype) and dtype.fields is not None @classmethod - def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: + def _from_native_dtype_unchecked(cls, dtype: TBaseDType) -> Self: from zarr.core.dtype import get_data_type_from_native_dtype fields: list[tuple[str, ZDType[TBaseDType, TBaseScalar]]] = [] @@ -206,7 +92,7 @@ def to_json(self, zarr_format: ZarrFormat) -> DTypeJSON_V3 | DTypeJSON_V2: raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod - def check_json_v2( + def _check_json_v2( cls, data: JSON, *, object_codec_id: str | None = None ) -> TypeGuard[list[object]]: # the actual JSON form is recursive and hard to annotate, so we give up and do @@ -222,7 +108,7 @@ def check_json_v2( ) @classmethod - def check_json_v3( + def _check_json_v3( cls, data: JSON ) -> TypeGuard[NamedConfig[Literal["structured"], dict[str, Sequence[tuple[str, JSON]]]]]: return ( @@ -243,7 +129,7 @@ def _from_json_unchecked( # This is a horrible mess, because this data type is recursive if zarr_format == 2: - if cls.check_json_v2(data): # type: ignore[arg-type] + if cls._check_json_v2(data): # type: ignore[arg-type] # structured dtypes are constructed directly from a list of lists # note that we do not handle the object codec here! this will prevent structured # dtypes from containing object dtypes. 
@@ -256,7 +142,7 @@ def _from_json_unchecked( else: raise DataTypeValidationError(f"Invalid JSON representation of data type {cls}.") elif zarr_format == 3: - if cls.check_json_v3(data): # type: ignore[arg-type] + if cls._check_json_v3(data): # type: ignore[arg-type] config = data["configuration"] meta_fields = config["fields"] fields = tuple( @@ -278,7 +164,7 @@ def to_native_dtype(self) -> np.dtypes.VoidDType[int]: def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: return bytes_to_json(self.cast_scalar(data).tobytes(), zarr_format) - def check_scalar(self, data: object) -> bool: + def _check_scalar(self, data: object) -> bool: # TODO: implement something here! return True diff --git a/src/zarr/core/dtype/npy/time.py b/src/zarr/core/dtype/npy/time.py index 4c5ce45442..9f82d3d168 100644 --- a/src/zarr/core/dtype/npy/time.py +++ b/src/zarr/core/dtype/npy/time.py @@ -108,7 +108,7 @@ def __post_init__(self) -> None: raise ValueError(f"unit must be one of {get_args(DateTimeUnit)}, got {self.unit!r}.") @classmethod - def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: + def _from_native_dtype_unchecked(cls, dtype: TBaseDType) -> Self: unit, scale_factor = np.datetime_data(dtype.name) unit = cast("DateTimeUnit", unit) byteorder = cast("EndiannessNumpy", dtype.byteorder) @@ -156,7 +156,7 @@ def to_json(self, zarr_format: ZarrFormat) -> str | DateTime64JSONV3 | TimeDelta def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> int: return datetimelike_to_int(data) # type: ignore[arg-type] - def check_scalar(self, data: object) -> bool: + def _check_scalar(self, data: object) -> bool: # TODO: decide which values we should accept for datetimes. 
try: np.array([data], dtype=self.to_native_dtype()) @@ -197,7 +197,7 @@ def _cast_scalar_unchecked(self, data: object) -> np.timedelta64: return self.to_native_dtype().type(data) # type: ignore[arg-type] @classmethod - def check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: + def _check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: # match m[M], etc # consider making this a standalone function if not isinstance(data, str): @@ -212,7 +212,7 @@ def check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Typ return data[4:-1].endswith(get_args(DateTimeUnit)) and data[-1] == "]" @classmethod - def check_json_v3(cls, data: JSON) -> TypeGuard[DateTime64JSONV3]: + def _check_json_v3(cls, data: JSON) -> TypeGuard[DateTime64JSONV3]: return ( isinstance(data, dict) and set(data.keys()) == {"name", "configuration"} @@ -243,7 +243,7 @@ def _cast_scalar_unchecked(self, data: object) -> np.datetime64: return self.to_native_dtype().type(data, f"{self.scale_factor}{self.unit}") # type: ignore[no-any-return, call-overload] @classmethod - def check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: + def _check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: # match M[M], etc # consider making this a standalone function if not isinstance(data, str): @@ -258,7 +258,7 @@ def check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Typ return data[4:-1].endswith(get_args(DateTimeUnit)) and data[-1] == "]" @classmethod - def check_json_v3(cls, data: JSON) -> TypeGuard[DateTime64JSONV3]: + def _check_json_v3(cls, data: JSON) -> TypeGuard[DateTime64JSONV3]: return ( isinstance(data, dict) and set(data.keys()) == {"name", "configuration"} diff --git a/src/zarr/core/dtype/npy/vlen_bytes.py b/src/zarr/core/dtype/npy/vlen_bytes.py deleted file mode 100644 index c25523f9ed..0000000000 --- 
a/src/zarr/core/dtype/npy/vlen_bytes.py +++ /dev/null @@ -1,75 +0,0 @@ -import base64 -from dataclasses import dataclass -from typing import ClassVar, Literal, Self, TypeGuard, overload - -import numpy as np - -from zarr.core.common import JSON, ZarrFormat -from zarr.core.dtype.common import HasObjectCodec, v3_unstable_dtype_warning -from zarr.core.dtype.npy.common import check_json_str -from zarr.core.dtype.wrapper import DTypeJSON_V2, DTypeJSON_V3, TBaseDType, ZDType - - -@dataclass(frozen=True, kw_only=True) -class VariableLengthBytes(ZDType[np.dtypes.ObjectDType, bytes], HasObjectCodec): - dtype_cls = np.dtypes.ObjectDType - _zarr_v3_name: ClassVar[Literal["variable_length_bytes"]] = "variable_length_bytes" - object_codec_id = "vlen-bytes" - - @classmethod - def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: - return cls() - - def to_native_dtype(self) -> np.dtypes.ObjectDType: - return self.dtype_cls() - - @classmethod - def check_json_v2( - cls, data: JSON, *, object_codec_id: str | None = None - ) -> TypeGuard[Literal["|O"]]: - """ - Check that the input is a valid JSON representation of a numpy O dtype, and that the - object codec id is appropriate for variable-length UTF-8 strings. - """ - return data == "|O" and object_codec_id == cls.object_codec_id - - @classmethod - def check_json_v3(cls, data: JSON) -> TypeGuard[Literal["variable_length_utf8"]]: - return data == cls._zarr_v3_name - - @overload - def to_json(self, zarr_format: Literal[2]) -> Literal["|O"]: ... - - @overload - def to_json(self, zarr_format: Literal[3]) -> Literal["variable_length_bytes"]: ... 
- - def to_json(self, zarr_format: ZarrFormat) -> Literal["|O", "variable_length_bytes"]: - if zarr_format == 2: - return "|O" - elif zarr_format == 3: - v3_unstable_dtype_warning(self) - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def _from_json_unchecked( - cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat - ) -> Self: - return cls() - - def default_scalar(self) -> bytes: - return b"" - - def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: - return base64.standard_b64encode(data).decode("ascii") # type: ignore[arg-type] - - def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> bytes: - if check_json_str(data): - return base64.standard_b64decode(data.encode("ascii")) - raise TypeError(f"Invalid type: {data}. Expected a string.") # pragma: no cover - - def check_scalar(self, data: object) -> bool: - return isinstance(data, bytes | str) - - def _cast_scalar_unchecked(self, data: object) -> bytes: - return bytes(data) # type: ignore[no-any-return, call-overload] diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index 4c399bbb84..b117656c36 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -84,7 +84,7 @@ class ZDType(Generic[TDType_co, TScalar_co], ABC): _zarr_v3_name: ClassVar[str] @classmethod - def check_native_dtype(cls: type[Self], dtype: TBaseDType) -> TypeGuard[TDType_co]: + def _check_native_dtype(cls: type[Self], dtype: TBaseDType) -> TypeGuard[TDType_co]: """ Check that a data type matches the dtype_cls class attribute. Used as a type guard. @@ -120,15 +120,15 @@ def from_native_dtype(cls: type[Self], dtype: TBaseDType) -> Self: TypeError If the dtype does not match the dtype_cls class attribute. 
""" - if cls.check_native_dtype(dtype): - return cls._from_native_dtype_unsafe(dtype) + if cls._check_native_dtype(dtype): + return cls._from_native_dtype_unchecked(dtype) raise DataTypeValidationError( f"Invalid dtype: {dtype}. Expected an instance of {cls.dtype_cls}." ) @classmethod @abstractmethod - def _from_native_dtype_unsafe(cls: type[Self], dtype: TBaseDType) -> Self: + def _from_native_dtype_unchecked(cls: type[Self], dtype: TBaseDType) -> Self: """ Wrap a native dtype without checking. @@ -158,20 +158,21 @@ def to_native_dtype(self: Self) -> TDType_co: def cast_scalar(self, data: object) -> TScalar_co: """ - Cast a scalar to the wrapped scalar type. The type is first checked for compatibility. If - it's incompatible with the associated scalar type, a ``TypeError`` will be raised. + Cast a python object to the wrapped scalar type. + The type of the provided scalar is first checked for compatibility. + If it's incompatible with the associated scalar type, a ``TypeError`` will be raised. Parameters ---------- - data : TScalar - The scalar value to cast. + data : object + The python object to cast. Returns ------- TScalar The cast value. """ - if self.check_scalar(data): + if self._check_scalar(data): return self._cast_scalar_unchecked(data) msg = ( f"The value {data!r} failed a type check. " @@ -182,9 +183,9 @@ def cast_scalar(self, data: object) -> TScalar_co: raise TypeError(msg) @abstractmethod - def check_scalar(self, data: object) -> bool: + def _check_scalar(self, data: object) -> bool: """ - Check that a scalar is a valid value for the wrapped data type. + Check that an python object is a valid scalar value for the wrapped data type. Parameters ---------- @@ -194,19 +195,20 @@ def check_scalar(self, data: object) -> bool: Returns ------- Bool - True if the value is valid, False otherwise. + True if the object is valid, False otherwise. """ ... 
@abstractmethod def _cast_scalar_unchecked(self, data: object) -> TScalar_co: """ - Cast a scalar to the wrapped data type. This method should not perform any input validation. + Cast a python object to the wrapped data type. + This method should not perform any type checking. Parameters ---------- - data : TScalar - The scalar value to cast. + data : object + The python object to cast. Returns ------- @@ -232,7 +234,7 @@ def default_scalar(self) -> TScalar_co: @classmethod @abstractmethod - def check_json_v2( + def _check_json_v2( cls: type[Self], data: JSON, *, object_codec_id: str | None = None ) -> TypeGuard[DTypeJSON_V2]: """ @@ -260,7 +262,7 @@ def check_json_v2( @classmethod @abstractmethod - def check_json_v3(cls: type[Self], data: JSON) -> TypeGuard[DTypeJSON_V3]: + def _check_json_v3(cls: type[Self], data: JSON) -> TypeGuard[DTypeJSON_V3]: """ Check that a JSON representation of a data type matches the dtype_cls class attribute. Used as a type guard. This base implementation checks that the input is a dictionary, @@ -317,7 +319,7 @@ def from_json_v3(cls: type[Self], data: JSON) -> Self: Self The wrapped data type. """ - if cls.check_json_v3(data): + if cls._check_json_v3(data): return cls._from_json_unchecked(data, zarr_format=3) raise DataTypeValidationError(f"Invalid JSON representation of data type {cls}: {data}") @@ -336,7 +338,7 @@ def from_json_v2(cls: type[Self], data: JSON, *, object_codec_id: str | None) -> Self The wrapped data type. 
""" - if cls.check_json_v2(data, object_codec_id=object_codec_id): + if cls._check_json_v2(data, object_codec_id=object_codec_id): return cls._from_json_unchecked(data, zarr_format=2) raise DataTypeValidationError( f"Invalid JSON representation of data type {cls}: {data!r}, object_codec_id={object_codec_id!r}" diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 83b9bd7bc8..bd02a67084 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -5,7 +5,7 @@ from zarr.abc.metadata import Metadata from zarr.core.buffer.core import default_buffer_prototype from zarr.core.dtype import ( - VariableLengthString, + VariableLengthUTF8, ZDType, get_data_type_from_json_v3, ) @@ -97,7 +97,7 @@ def validate_codecs(codecs: tuple[Codec, ...], dtype: ZDType[TBaseDType, TBaseSc # TODO: use codec ID instead of class name codec_class_name = abc.__class__.__name__ # TODO: Fix typing here - if isinstance(dtype, VariableLengthString) and not codec_class_name == "VLenUTF8Codec": # type: ignore[unreachable] + if isinstance(dtype, VariableLengthUTF8) and not codec_class_name == "VLenUTF8Codec": # type: ignore[unreachable] raise ValueError( f"For string dtype, ArrayBytesCodec must be `VLenUTF8Codec`, got `{codec_class_name}`." 
) diff --git a/tests/test_array.py b/tests/test_array.py index ee0a506538..e300b70f8d 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -46,10 +46,10 @@ from zarr.core.dtype.npy.common import endianness_from_numpy_str from zarr.core.dtype.npy.float import Float32, Float64 from zarr.core.dtype.npy.int import Int16, UInt8 -from zarr.core.dtype.npy.sized import ( +from zarr.core.dtype.npy.string import VariableLengthUTF8 +from zarr.core.dtype.npy.structured import ( Structured, ) -from zarr.core.dtype.npy.string import VariableLengthString from zarr.core.dtype.npy.time import DateTime64, TimeDelta64 from zarr.core.dtype.wrapper import ZDType from zarr.core.group import AsyncGroup @@ -1036,7 +1036,7 @@ def test_dtype_forms(dtype: ZDType[Any, Any], store: Store, zarr_format: ZarrFor # Structured dtypes do not have a numpy string representation that uniquely identifies them if not isinstance(dtype, Structured): - if isinstance(dtype, VariableLengthString): + if isinstance(dtype, VariableLengthUTF8): # in numpy 2.3, StringDType().str becomes the string 'StringDType()' which numpy # does not accept as a string representation of the dtype. 
c = zarr.create_array( @@ -1073,6 +1073,7 @@ def test_dtype_roundtrip( assert a.dtype == b.dtype @staticmethod + @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") @pytest.mark.parametrize("dtype", ["uint8", "float32", "U3", "S4", "V1"]) @pytest.mark.parametrize( "compressors", @@ -1298,9 +1299,9 @@ async def test_v2_chunk_encoding( assert arr.filters == filters_expected @staticmethod - @pytest.mark.parametrize("dtype", [UInt8(), Float32(), VariableLengthString()]) + @pytest.mark.parametrize("dtype", [UInt8(), Float32(), VariableLengthUTF8()]) async def test_default_filters_compressors( - store: MemoryStore, dtype: UInt8 | Float32 | VariableLengthString, zarr_format: ZarrFormat + store: MemoryStore, dtype: UInt8 | Float32 | VariableLengthUTF8, zarr_format: ZarrFormat ) -> None: """ Test that the default ``filters`` and ``compressors`` are used when ``create_array`` is invoked with ``filters`` and ``compressors`` unspecified. @@ -1519,6 +1520,7 @@ def test_default_endianness( @pytest.mark.parametrize("value", [1, 1.4, "a", b"a", np.array(1)]) @pytest.mark.parametrize("zarr_format", [2, 3]) +@pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") def test_scalar_array(value: Any, zarr_format: ZarrFormat) -> None: arr = zarr.array(value, zarr_format=zarr_format) assert arr[...] 
== value diff --git a/tests/test_codecs/test_vlen.py b/tests/test_codecs/test_vlen.py index 9024efa7ed..6fe1863464 100644 --- a/tests/test_codecs/test_vlen.py +++ b/tests/test_codecs/test_vlen.py @@ -22,6 +22,7 @@ expected_array_string_dtype = np.dtype("O") +@pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") @pytest.mark.parametrize("store", ["memory", "local"], indirect=["store"]) @pytest.mark.parametrize("dtype", numpy_str_dtypes) @pytest.mark.parametrize("as_object_array", [False, True]) diff --git a/tests/test_config.py b/tests/test_config.py index 58f88ec806..f02bb153e4 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -25,7 +25,7 @@ from zarr.core.buffer.core import Buffer from zarr.core.codec_pipeline import BatchedCodecPipeline from zarr.core.config import BadConfigError, config -from zarr.core.dtype import Int8, VariableLengthString +from zarr.core.dtype import Int8, VariableLengthUTF8 from zarr.core.indexing import SelectorTuple from zarr.registry import ( fully_qualified_name, @@ -312,7 +312,7 @@ async def test_default_codecs(dtype_category: str) -> None: """ zdtype: ZDType[Any, Any] if dtype_category == "variable-length-string": - zdtype = VariableLengthString() + zdtype = VariableLengthUTF8() else: zdtype = Int8() expected_compressors = (GzipCodec(),) diff --git a/tests/test_dtype/conftest.py b/tests/test_dtype/conftest.py index b2aa89afd7..0be1c60088 100644 --- a/tests/test_dtype/conftest.py +++ b/tests/test_dtype/conftest.py @@ -6,7 +6,7 @@ from zarr.core.dtype import data_type_registry from zarr.core.dtype.common import HasLength -from zarr.core.dtype.npy.sized import Structured +from zarr.core.dtype.npy.structured import Structured from zarr.core.dtype.npy.time import DateTime64, TimeDelta64 from zarr.core.dtype.wrapper import ZDType diff --git a/tests/test_dtype/test_npy/test_bytes.py b/tests/test_dtype/test_npy/test_bytes.py new file mode 100644 index 0000000000..fcb43e551b --- /dev/null +++ 
b/tests/test_dtype/test_npy/test_bytes.py @@ -0,0 +1,138 @@ +import numpy as np + +from tests.test_dtype.test_wrapper import BaseTestZDType, V2JsonTestParams +from zarr.core.dtype.npy.bytes import NullTerminatedBytes, RawBytes, VariableLengthBytes + + +class TestNullTerminatedBytes(BaseTestZDType): + test_cls = NullTerminatedBytes + valid_dtype = (np.dtype("|S10"), np.dtype("|S4")) + invalid_dtype = ( + np.dtype(np.int8), + np.dtype(np.float64), + np.dtype("|U10"), + ) + valid_json_v2 = ( + V2JsonTestParams(dtype="|S0"), + V2JsonTestParams(dtype="|S2"), + V2JsonTestParams(dtype="|S4"), + ) + valid_json_v3 = ({"name": "null_terminated_bytes", "configuration": {"length_bytes": 10}},) + invalid_json_v2 = ( + "|S", + "|U10", + "|f8", + ) + invalid_json_v3 = ( + {"name": "fixed_length_ascii", "configuration": {"length_bits": 0}}, + {"name": "numpy.fixed_length_ascii", "configuration": {"length_bits": "invalid"}}, + ) + + scalar_v2_params = ( + (NullTerminatedBytes(length=0), ""), + (NullTerminatedBytes(length=2), "YWI="), + (NullTerminatedBytes(length=4), "YWJjZA=="), + ) + scalar_v3_params = ( + (NullTerminatedBytes(length=0), ""), + (NullTerminatedBytes(length=2), "YWI="), + (NullTerminatedBytes(length=4), "YWJjZA=="), + ) + cast_value_params = ( + (NullTerminatedBytes(length=0), "", np.bytes_("")), + (NullTerminatedBytes(length=2), "ab", np.bytes_("ab")), + (NullTerminatedBytes(length=4), "abcdefg", np.bytes_("abcd")), + ) + item_size_params = ( + NullTerminatedBytes(length=0), + NullTerminatedBytes(length=4), + NullTerminatedBytes(length=10), + ) + + +class TestRawBytes(BaseTestZDType): + test_cls = RawBytes + valid_dtype = (np.dtype("|V10"),) + invalid_dtype = ( + np.dtype(np.int8), + np.dtype(np.float64), + np.dtype("|S10"), + ) + valid_json_v2 = (V2JsonTestParams(dtype="|V10"),) + valid_json_v3 = ( + {"name": "raw_bytes", "configuration": {"length_bytes": 0}}, + {"name": "raw_bytes", "configuration": {"length_bytes": 8}}, + ) + + invalid_json_v2 = ( + "|V", + 
"|S10", + "|f8", + ) + invalid_json_v3 = ( + {"name": "r10"}, + {"name": "r-80"}, + ) + + scalar_v2_params = ( + (RawBytes(length=0), ""), + (RawBytes(length=2), "YWI="), + (RawBytes(length=4), "YWJjZA=="), + ) + scalar_v3_params = ( + (RawBytes(length=0), ""), + (RawBytes(length=2), "YWI="), + (RawBytes(length=4), "YWJjZA=="), + ) + cast_value_params = ( + (RawBytes(length=0), b"", np.void(b"")), + (RawBytes(length=2), b"ab", np.void(b"ab")), + (RawBytes(length=4), b"abcd", np.void(b"abcd")), + ) + item_size_params = ( + RawBytes(length=0), + RawBytes(length=4), + RawBytes(length=10), + ) + + +class TestVariableLengthBytes(BaseTestZDType): + test_cls = VariableLengthBytes + valid_dtype = (np.dtype("|O"),) + invalid_dtype = ( + np.dtype(np.int8), + np.dtype(np.float64), + np.dtype("|U10"), + ) + valid_json_v2 = (V2JsonTestParams(dtype="|O", object_codec_id="vlen-bytes"),) + valid_json_v3 = ("variable_length_bytes",) + invalid_json_v2 = ( + "|S", + "|U10", + "|f8", + ) + invalid_json_v3 = ( + {"name": "fixed_length_ascii", "configuration": {"length_bits": 0}}, + {"name": "numpy.fixed_length_ascii", "configuration": {"length_bits": "invalid"}}, + ) + + scalar_v2_params = ( + (VariableLengthBytes(), ""), + (VariableLengthBytes(), "YWI="), + (VariableLengthBytes(), "YWJjZA=="), + ) + scalar_v3_params = ( + (VariableLengthBytes(), ""), + (VariableLengthBytes(), "YWI="), + (VariableLengthBytes(), "YWJjZA=="), + ) + cast_value_params = ( + (VariableLengthBytes(), "", b""), + (VariableLengthBytes(), "ab", b"ab"), + (VariableLengthBytes(), "abcdefg", b"abcdefg"), + ) + item_size_params = ( + VariableLengthBytes(), + VariableLengthBytes(), + VariableLengthBytes(), + ) diff --git a/tests/test_dtype/test_npy/test_string.py b/tests/test_dtype/test_npy/test_string.py index 73c8612db4..66a8d8d1ff 100644 --- a/tests/test_dtype/test_npy/test_string.py +++ b/tests/test_dtype/test_npy/test_string.py @@ -3,13 +3,13 @@ import numpy as np from tests.test_dtype.test_wrapper import 
BaseTestZDType, V2JsonTestParams -from zarr.core.dtype import FixedLengthASCII, FixedLengthUTF32 -from zarr.core.dtype.npy.string import _NUMPY_SUPPORTS_VLEN_STRING, VariableLengthString +from zarr.core.dtype import FixedLengthUTF32 +from zarr.core.dtype.npy.string import _NUMPY_SUPPORTS_VLEN_STRING, VariableLengthUTF8 if _NUMPY_SUPPORTS_VLEN_STRING: class TestVariableLengthString(BaseTestZDType): - test_cls = VariableLengthString # type: ignore[assignment] + test_cls = VariableLengthUTF8 # type: ignore[assignment] valid_dtype = (np.dtypes.StringDType(),) # type: ignore[assignment] invalid_dtype = ( np.dtype(np.int8), @@ -28,22 +28,22 @@ class TestVariableLengthString(BaseTestZDType): {"name": "invalid_name"}, ) - scalar_v2_params = ((VariableLengthString(), ""), (VariableLengthString(), "hi")) + scalar_v2_params = ((VariableLengthUTF8(), ""), (VariableLengthUTF8(), "hi")) scalar_v3_params = ( - (VariableLengthString(), ""), - (VariableLengthString(), "hi"), + (VariableLengthUTF8(), ""), + (VariableLengthUTF8(), "hi"), ) cast_value_params = ( - (VariableLengthString(), "", np.str_("")), - (VariableLengthString(), "hi", np.str_("hi")), + (VariableLengthUTF8(), "", np.str_("")), + (VariableLengthUTF8(), "hi", np.str_("hi")), ) - item_size_params = (VariableLengthString(),) + item_size_params = (VariableLengthUTF8(),) else: class TestVariableLengthString(BaseTestZDType): # type: ignore[no-redef] - test_cls = VariableLengthString # type: ignore[assignment] + test_cls = VariableLengthUTF8 # type: ignore[assignment] valid_dtype = (np.dtype("O"),) invalid_dtype = ( np.dtype(np.int8), @@ -62,64 +62,18 @@ class TestVariableLengthString(BaseTestZDType): # type: ignore[no-redef] {"name": "invalid_name"}, ) - scalar_v2_params = ((VariableLengthString(), ""), (VariableLengthString(), "hi")) + scalar_v2_params = ((VariableLengthUTF8(), ""), (VariableLengthUTF8(), "hi")) scalar_v3_params = ( - (VariableLengthString(), ""), - (VariableLengthString(), "hi"), + 
(VariableLengthUTF8(), ""), + (VariableLengthUTF8(), "hi"), ) cast_value_params = ( - (VariableLengthString(), "", np.str_("")), - (VariableLengthString(), "hi", np.str_("hi")), + (VariableLengthUTF8(), "", np.str_("")), + (VariableLengthUTF8(), "hi", np.str_("hi")), ) - item_size_params = (VariableLengthString(),) - - -class TestFixedLengthAscii(BaseTestZDType): - test_cls = FixedLengthASCII - valid_dtype = (np.dtype("|S10"), np.dtype("|S4")) - invalid_dtype = ( - np.dtype(np.int8), - np.dtype(np.float64), - np.dtype("|U10"), - ) - valid_json_v2 = ( - V2JsonTestParams(dtype="|S0"), - V2JsonTestParams(dtype="|S2"), - V2JsonTestParams(dtype="|S4"), - ) - valid_json_v3 = ({"name": "fixed_length_ascii", "configuration": {"length_bytes": 10}},) - invalid_json_v2 = ( - "|S", - "|U10", - "|f8", - ) - invalid_json_v3 = ( - {"name": "fixed_length_ascii", "configuration": {"length_bits": 0}}, - {"name": "numpy.fixed_length_ascii", "configuration": {"length_bits": "invalid"}}, - ) - - scalar_v2_params = ( - (FixedLengthASCII(length=0), ""), - (FixedLengthASCII(length=2), "YWI="), - (FixedLengthASCII(length=4), "YWJjZA=="), - ) - scalar_v3_params = ( - (FixedLengthASCII(length=0), ""), - (FixedLengthASCII(length=2), "YWI="), - (FixedLengthASCII(length=4), "YWJjZA=="), - ) - cast_value_params = ( - (FixedLengthASCII(length=0), "", np.bytes_("")), - (FixedLengthASCII(length=2), "ab", np.bytes_("ab")), - (FixedLengthASCII(length=4), "abcd", np.bytes_("abcd")), - ) - item_size_params = ( - FixedLengthASCII(length=0), - FixedLengthASCII(length=4), - FixedLengthASCII(length=10), - ) + item_size_params = (VariableLengthUTF8(),) class TestFixedLengthUTF32(BaseTestZDType): diff --git a/tests/test_dtype/test_npy/test_sized.py b/tests/test_dtype/test_npy/test_structured.py similarity index 71% rename from tests/test_dtype/test_npy/test_sized.py rename to tests/test_dtype/test_npy/test_structured.py index d7aef88168..71bbcdcefb 100644 --- a/tests/test_dtype/test_npy/test_sized.py +++ 
b/tests/test_dtype/test_npy/test_structured.py @@ -6,7 +6,6 @@ from tests.test_dtype.test_wrapper import BaseTestZDType, V2JsonTestParams from zarr.core.dtype import ( - FixedLengthBytes, Float16, Float64, Int32, @@ -15,52 +14,6 @@ ) -class TestFixedLengthBytes(BaseTestZDType): - test_cls = FixedLengthBytes - valid_dtype = (np.dtype("|V10"),) - invalid_dtype = ( - np.dtype(np.int8), - np.dtype(np.float64), - np.dtype("|S10"), - ) - valid_json_v2 = (V2JsonTestParams(dtype="|V10"),) - valid_json_v3 = ( - {"name": "fixed_length_bytes", "configuration": {"length_bytes": 0}}, - {"name": "fixed_length_bytes", "configuration": {"length_bytes": 8}}, - ) - - invalid_json_v2 = ( - "|V", - "|S10", - "|f8", - ) - invalid_json_v3 = ( - {"name": "r10"}, - {"name": "r-80"}, - ) - - scalar_v2_params = ( - (FixedLengthBytes(length=0), ""), - (FixedLengthBytes(length=2), "YWI="), - (FixedLengthBytes(length=4), "YWJjZA=="), - ) - scalar_v3_params = ( - (FixedLengthBytes(length=0), ""), - (FixedLengthBytes(length=2), "YWI="), - (FixedLengthBytes(length=4), "YWJjZA=="), - ) - cast_value_params = ( - (FixedLengthBytes(length=0), b"", np.void(b"")), - (FixedLengthBytes(length=2), b"ab", np.void(b"ab")), - (FixedLengthBytes(length=4), b"abcd", np.void(b"abcd")), - ) - item_size_params = ( - FixedLengthBytes(length=0), - FixedLengthBytes(length=4), - FixedLengthBytes(length=10), - ) - - class TestStructured(BaseTestZDType): test_cls = Structured valid_dtype = ( diff --git a/tests/test_dtype/test_wrapper.py b/tests/test_dtype/test_wrapper.py index 0c3a2b106f..d359475a0d 100644 --- a/tests/test_dtype/test_wrapper.py +++ b/tests/test_dtype/test_wrapper.py @@ -99,10 +99,10 @@ def scalar_equals(self, scalar1: object, scalar2: object) -> bool: return scalar1 == scalar2 def test_check_dtype_valid(self, valid_dtype: TBaseDType) -> None: - assert self.test_cls.check_native_dtype(valid_dtype) + assert self.test_cls._check_native_dtype(valid_dtype) def test_check_dtype_invalid(self, invalid_dtype: 
object) -> None: - assert not self.test_cls.check_native_dtype(invalid_dtype) # type: ignore[arg-type] + assert not self.test_cls._check_native_dtype(invalid_dtype) # type: ignore[arg-type] def test_from_dtype_roundtrip(self, valid_dtype: Any) -> None: zdtype = self.test_cls.from_native_dtype(valid_dtype) diff --git a/tests/test_properties.py b/tests/test_properties.py index ed8aa997c0..b8d50ef0b1 100644 --- a/tests/test_properties.py +++ b/tests/test_properties.py @@ -75,6 +75,7 @@ def deep_equal(a: Any, b: Any) -> bool: return a == b +@pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") @given(data=st.data(), zarr_format=zarr_formats) def test_array_roundtrip(data: st.DataObject, zarr_format: int) -> None: nparray = data.draw(numpy_arrays(zarr_formats=st.just(zarr_format))) @@ -82,6 +83,7 @@ def test_array_roundtrip(data: st.DataObject, zarr_format: int) -> None: assert_array_equal(nparray, zarray[:]) +@pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") @given(array=arrays()) def test_array_creates_implicit_groups(array): path = array.path @@ -101,7 +103,10 @@ def test_array_creates_implicit_groups(array): # this decorator removes timeout; not ideal but it should avoid intermittent CI failures + + @settings(deadline=None) +@pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") @given(data=st.data()) def test_basic_indexing(data: st.DataObject) -> None: zarray = data.draw(simple_arrays()) @@ -117,6 +122,7 @@ def test_basic_indexing(data: st.DataObject) -> None: @given(data=st.data()) +@pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") def test_oindex(data: st.DataObject) -> None: # integer_array_indices can't handle 0-size dimensions. 
zarray = data.draw(simple_arrays(shapes=npst.array_shapes(max_dims=4, min_side=1))) @@ -138,6 +144,7 @@ def test_oindex(data: st.DataObject) -> None: @given(data=st.data()) +@pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") def test_vindex(data: st.DataObject) -> None: # integer_array_indices can't handle 0-size dimensions. zarray = data.draw(simple_arrays(shapes=npst.array_shapes(max_dims=4, min_side=1))) @@ -161,6 +168,7 @@ def test_vindex(data: st.DataObject) -> None: @given(store=stores, meta=array_metadata()) # type: ignore[misc] +@pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") async def test_roundtrip_array_metadata_from_store( store: Store, meta: ArrayV2Metadata | ArrayV3Metadata ) -> None: @@ -180,6 +188,7 @@ async def test_roundtrip_array_metadata_from_store( @given(data=st.data(), zarr_format=zarr_formats) +@pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") def test_roundtrip_array_metadata_from_json(data: st.DataObject, zarr_format: int) -> None: """ Verify that JSON serialization and deserialization of metadata is lossless. @@ -281,6 +290,7 @@ def serialized_float_is_valid(serialized: numbers.Real | str) -> bool: @given(meta=array_metadata()) # type: ignore[misc] +@pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") def test_array_metadata_meets_spec(meta: ArrayV2Metadata | ArrayV3Metadata) -> None: """ Validate that the array metadata produced by the library conforms to the relevant spec (V2 vs V3). 
diff --git a/tests/test_regression/test_regression.py b/tests/test_regression/test_regression.py index a1d13510c3..34c48a6933 100644 --- a/tests/test_regression/test_regression.py +++ b/tests/test_regression/test_regression.py @@ -12,8 +12,8 @@ import zarr from zarr.core.array import Array from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding -from zarr.core.dtype.npy.string import VariableLengthString -from zarr.core.dtype.npy.vlen_bytes import VariableLengthBytes +from zarr.core.dtype.npy.bytes import VariableLengthBytes +from zarr.core.dtype.npy.string import VariableLengthUTF8 from zarr.storage import LocalStore if TYPE_CHECKING: @@ -42,7 +42,8 @@ class ArrayParams: basic_codecs = GZip(), Blosc(), LZ4(), LZMA(), Zstd() basic_dtypes = "|b", ">i2", ">i4", ">f4", ">f8", "c8", "c16", "M8[10us]", "m8[4ps]" -string_dtypes = ">S1", "U4" +string_dtypes = "U4" +bytes_dtypes = ">S1", "V10", " Array: chunk_key_encoding = V2ChunkKeyEncoding(separator="/") dtype: ZDTypeLike if array_params.values.dtype == np.dtype("|O") and array_params.filters == (VLenUTF8(),): - dtype = VariableLengthString() # type: ignore[assignment] + dtype = VariableLengthUTF8() # type: ignore[assignment] elif array_params.values.dtype == np.dtype("|O") and array_params.filters == (VLenBytes(),): dtype = VariableLengthBytes() else: diff --git a/tests/test_v2.py b/tests/test_v2.py index fa2aa65b22..66e5a1ecfb 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -15,7 +15,8 @@ from zarr import config from zarr.abc.store import Store from zarr.core.buffer.core import default_buffer_prototype -from zarr.core.dtype import FixedLengthASCII, FixedLengthUTF32, Structured, VariableLengthString +from zarr.core.dtype import FixedLengthUTF32, Structured, VariableLengthUTF8 +from zarr.core.dtype.npy.bytes import NullTerminatedBytes from zarr.core.dtype.wrapper import ZDType from zarr.core.sync import sync from zarr.storage import MemoryStore, StorePath @@ -105,9 +106,9 @@ async def 
test_v2_encode_decode(dtype, expected_dtype, fill_value, fill_value_js @pytest.mark.parametrize( ("dtype", "value"), [ - (FixedLengthASCII(length=1), b"Y"), + (NullTerminatedBytes(length=1), b"Y"), (FixedLengthUTF32(length=1), "Y"), - (VariableLengthString(), "Y"), + (VariableLengthUTF8(), "Y"), ], ) def test_v2_encode_decode_with_data(dtype: ZDType[Any, Any], value: str): From 24b6b356a0b0164e61fe6775c584810fb6e59d4b Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 6 Jun 2025 20:38:06 +0300 Subject: [PATCH 122/130] more v3 unstable dtype warnings, and their exemptions from tests --- src/zarr/core/dtype/common.py | 4 ++-- src/zarr/core/dtype/npy/bytes.py | 1 + src/zarr/core/dtype/npy/string.py | 1 + tests/test_array.py | 1 + tests/test_config.py | 1 + tests/test_dtype/test_npy/test_bytes.py | 16 ++++++++++++++++ tests/test_dtype/test_npy/test_string.py | 12 ++++++++++++ tests/test_store/test_stateful.py | 1 + 8 files changed, 35 insertions(+), 2 deletions(-) diff --git a/src/zarr/core/dtype/common.py b/src/zarr/core/dtype/common.py index bbdc06c50d..5630f1692e 100644 --- a/src/zarr/core/dtype/common.py +++ b/src/zarr/core/dtype/common.py @@ -77,9 +77,9 @@ def v3_unstable_dtype_warning(dtype: object) -> None: """ msg = ( f"The data type ({dtype}) does not have a Zarr V3 specification. " - "That means that the representation of data saved with this data type may change without " + "That means that the representation of array saved with this data type may change without " "warning in a future version of Zarr Python. " - "Arrays stored with this data type may be unreadable by other Zarr libraries " + "Arrays stored with this data type may be unreadable by other Zarr libraries. " "Use this data type at your own risk! " "Check https://github.com/zarr-developers/zarr-extensions/tree/main/data-types for the " "status of data type specifications for Zarr V3." 
diff --git a/src/zarr/core/dtype/npy/bytes.py b/src/zarr/core/dtype/npy/bytes.py index 9d815ab849..347e058f12 100644 --- a/src/zarr/core/dtype/npy/bytes.py +++ b/src/zarr/core/dtype/npy/bytes.py @@ -154,6 +154,7 @@ def to_json(self, zarr_format: ZarrFormat) -> str | RawBytesJSONV3: if zarr_format == 2: return self.to_native_dtype().str elif zarr_format == 3: + v3_unstable_dtype_warning(self) return {"name": self._zarr_v3_name, "configuration": {"length_bytes": self.length}} raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover diff --git a/src/zarr/core/dtype/npy/string.py b/src/zarr/core/dtype/npy/string.py index 853f32806d..21727b0c8c 100644 --- a/src/zarr/core/dtype/npy/string.py +++ b/src/zarr/core/dtype/npy/string.py @@ -259,6 +259,7 @@ def to_json(self, zarr_format: ZarrFormat) -> Literal["|O", "variable_length_utf if zarr_format == 2: return "|O" elif zarr_format == 3: + v3_unstable_dtype_warning(self) return self._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover diff --git a/tests/test_array.py b/tests/test_array.py index e300b70f8d..862b49da61 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1300,6 +1300,7 @@ async def test_v2_chunk_encoding( @staticmethod @pytest.mark.parametrize("dtype", [UInt8(), Float32(), VariableLengthUTF8()]) + @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") async def test_default_filters_compressors( store: MemoryStore, dtype: UInt8 | Float32 | VariableLengthUTF8, zarr_format: ZarrFormat ) -> None: diff --git a/tests/test_config.py b/tests/test_config.py index f02bb153e4..1dc6f8bf4f 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -306,6 +306,7 @@ class NewCodec2(BytesCodec): @pytest.mark.parametrize("dtype_category", ["variable-length-string", "default"]) +@pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") async def 
test_default_codecs(dtype_category: str) -> None: """ Test that the default compressors are sensitive to the current setting of the config. diff --git a/tests/test_dtype/test_npy/test_bytes.py b/tests/test_dtype/test_npy/test_bytes.py index fcb43e551b..53636891cb 100644 --- a/tests/test_dtype/test_npy/test_bytes.py +++ b/tests/test_dtype/test_npy/test_bytes.py @@ -1,6 +1,8 @@ import numpy as np +import pytest from tests.test_dtype.test_wrapper import BaseTestZDType, V2JsonTestParams +from zarr.core.dtype.common import UnstableSpecificationWarning from zarr.core.dtype.npy.bytes import NullTerminatedBytes, RawBytes, VariableLengthBytes @@ -136,3 +138,17 @@ class TestVariableLengthBytes(BaseTestZDType): VariableLengthBytes(), VariableLengthBytes(), ) + + +@pytest.mark.parametrize( + "zdtype", [NullTerminatedBytes(length=10), RawBytes(length=10), VariableLengthBytes()] +) +def test_unstable_dtype_warning( + zdtype: NullTerminatedBytes | RawBytes | VariableLengthBytes, +) -> None: + """ + Test that we get a warning when serializing a dtype without a zarr v3 spec to json + when zarr_format is 3 + """ + with pytest.raises(UnstableSpecificationWarning): + zdtype.to_json(zarr_format=3) diff --git a/tests/test_dtype/test_npy/test_string.py b/tests/test_dtype/test_npy/test_string.py index 66a8d8d1ff..c9bcdce29f 100644 --- a/tests/test_dtype/test_npy/test_string.py +++ b/tests/test_dtype/test_npy/test_string.py @@ -1,9 +1,11 @@ from __future__ import annotations import numpy as np +import pytest from tests.test_dtype.test_wrapper import BaseTestZDType, V2JsonTestParams from zarr.core.dtype import FixedLengthUTF32 +from zarr.core.dtype.common import UnstableSpecificationWarning from zarr.core.dtype.npy.string import _NUMPY_SUPPORTS_VLEN_STRING, VariableLengthUTF8 if _NUMPY_SUPPORTS_VLEN_STRING: @@ -113,3 +115,13 @@ class TestFixedLengthUTF32(BaseTestZDType): FixedLengthUTF32(length=4), FixedLengthUTF32(length=10), ) + + +@pytest.mark.parametrize("zdtype", 
[FixedLengthUTF32(length=10), VariableLengthUTF8()]) +def test_unstable_dtype_warning(zdtype: FixedLengthUTF32 | VariableLengthUTF8) -> None: + """ + Test that we get a warning when serializing a dtype without a zarr v3 spec to json + when zarr_format is 3 + """ + with pytest.raises(UnstableSpecificationWarning): + zdtype.to_json(zarr_format=3) diff --git a/tests/test_store/test_stateful.py b/tests/test_store/test_stateful.py index a17d7a55be..c0997c3df3 100644 --- a/tests/test_store/test_stateful.py +++ b/tests/test_store/test_stateful.py @@ -15,6 +15,7 @@ ] +@pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") def test_zarr_hierarchy(sync_store: Store): def mk_test_instance_sync() -> ZarrHierarchyStateMachine: return ZarrHierarchyStateMachine(sync_store) From cbb0b0df980bb9575fda897c928d057c17b3fba6 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sat, 7 Jun 2025 14:15:53 +0300 Subject: [PATCH 123/130] clean up typeddicts --- src/zarr/core/dtype/npy/bytes.py | 8 ++++---- src/zarr/core/dtype/npy/string.py | 4 ---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/src/zarr/core/dtype/npy/bytes.py b/src/zarr/core/dtype/npy/bytes.py index 347e058f12..d489f2f610 100644 --- a/src/zarr/core/dtype/npy/bytes.py +++ b/src/zarr/core/dtype/npy/bytes.py @@ -15,7 +15,7 @@ class FixedLengthBytesConfig(TypedDict): length_bytes: int -NTBytesJSONV3 = NamedConfig[Literal["null_terminated_bytes"], FixedLengthBytesConfig] +NullTerminatedBytesJSONV3 = NamedConfig[Literal["null_terminated_bytes"], FixedLengthBytesConfig] RawBytesJSONV3 = NamedConfig[Literal["raw_bytes"], FixedLengthBytesConfig] @@ -40,7 +40,7 @@ def _check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Ty return isinstance(data, str) and re.match(r"^\|S\d+$", data) is not None @classmethod - def _check_json_v3(cls, data: JSON) -> TypeGuard[NTBytesJSONV3]: + def _check_json_v3(cls, data: JSON) -> TypeGuard[NullTerminatedBytesJSONV3]: return ( 
isinstance(data, dict) and set(data.keys()) == {"name", "configuration"} @@ -53,9 +53,9 @@ def _check_json_v3(cls, data: JSON) -> TypeGuard[NTBytesJSONV3]: def to_json(self, zarr_format: Literal[2]) -> str: ... @overload - def to_json(self, zarr_format: Literal[3]) -> NTBytesJSONV3: ... + def to_json(self, zarr_format: Literal[3]) -> NullTerminatedBytesJSONV3: ... - def to_json(self, zarr_format: ZarrFormat) -> str | NTBytesJSONV3: + def to_json(self, zarr_format: ZarrFormat) -> str | NullTerminatedBytesJSONV3: if zarr_format == 2: return self.to_native_dtype().str elif zarr_format == 3: diff --git a/src/zarr/core/dtype/npy/string.py b/src/zarr/core/dtype/npy/string.py index 21727b0c8c..377c364ca2 100644 --- a/src/zarr/core/dtype/npy/string.py +++ b/src/zarr/core/dtype/npy/string.py @@ -33,10 +33,6 @@ class LengthBytesConfig(TypedDict): length_bytes: int -# TDO: Fix this terrible name -FixedLengthBytesJSONV3 = NamedConfig[Literal["fixed_length_bytes"], LengthBytesConfig] - - # TODO: Fix this terrible name FixedLengthUTF32JSONV3 = NamedConfig[Literal["fixed_length_utf32"], LengthBytesConfig] From e8858697d2fb2d7e41065e7b0ccb53d77fab38bb Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 9 Jun 2025 12:46:07 +0300 Subject: [PATCH 124/130] update docstrings --- src/zarr/core/dtype/wrapper.py | 91 ++++++++++++++++++---------------- 1 file changed, 49 insertions(+), 42 deletions(-) diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index b117656c36..94fbe60242 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -1,8 +1,8 @@ """ Wrapper for native array data types. -The `ZDType` class is an abstract base class for wrapping native array data types, e.g. numpy dtypes. -It provides a common interface for working with data types in a way that is independent of the +The ``ZDType`` class is an abstract base class for wrapping native array data types, e.g. NumPy dtypes. 
+``ZDType`` provides a common interface for working with data types in a way that is independent of the underlying data type system. The wrapper class encapsulates a native data type. Instances of the class can be created from a @@ -10,14 +10,15 @@ wrapper class. The wrapper class is responsible for: -- Reversibly serializing a native data type to Zarr V2 or Zarr V3 metadata. +- Serializing and deserializing a native data type to Zarr V2 or Zarr V3 metadata. This ensures that the data type can be properly stored and retrieved from array metadata. -- Reversibly serializing scalar values to Zarr V2 or Zarr V3 metadata. This is important for +- Serializing and deserializing scalar values to Zarr V2 or Zarr V3 metadata. This is important for storing a fill value for an array in a manner that is valid for the data type. -To add support for a new data type in Zarr, you should subclass the wrapper class and adapt its methods +You can add support for a new data type in Zarr by subclassing ``ZDType`` wrapper class and adapt its methods to support your native data type. The wrapper class must be added to a data type registry -(defined elsewhere) before ``create_array`` can properly handle the new data type. +(defined elsewhere) before array creation routines or array reading routines can use your new data +type. """ from __future__ import annotations @@ -69,11 +70,10 @@ class ZDType(Generic[TDType_co, TScalar_co], ABC): Attributes ---------- dtype_cls : ClassVar[type[TDType]] - The wrapped dtype class. This is a class variable. Instances of this class cannot set it. + The wrapped dtype class. This is a class variable. _zarr_v3_name : ClassVar[str] - The name given to the wrapped data type by a zarr v3 data type specification. Note that this - is not necessarily the same name that will appear in metadata documents, as some data types - have names that depend on their configuration. + The name given to the data type by a Zarr v3 data type specification. 
This is a + class variable, and it should generally be unique across different data types. """ # this class will create a native data type @@ -86,7 +86,7 @@ class ZDType(Generic[TDType_co, TScalar_co], ABC): @classmethod def _check_native_dtype(cls: type[Self], dtype: TBaseDType) -> TypeGuard[TDType_co]: """ - Check that a data type matches the dtype_cls class attribute. Used as a type guard. + Check that a native data type matches the dtype_cls class attribute. Used as a type guard. Parameters ---------- @@ -103,22 +103,28 @@ def _check_native_dtype(cls: type[Self], dtype: TBaseDType) -> TypeGuard[TDType_ @classmethod def from_native_dtype(cls: type[Self], dtype: TBaseDType) -> Self: """ - Wrap a dtype object. + Create a ZDType instance from a native data type. The default implementation first performs + a type check via ``cls._check_native_dtype``. If that type check succeeds, then + ``cls._from_native_dtype_unchecked`` is called, which assumes that the incoming object + as all the properties necessary for instantiating the ZDType. + + This method is used when taking a user-provided native data type, like a NumPy data type, + and creating the corresponding ZDType instance from them. Parameters ---------- dtype : TDType - The dtype object to wrap. + The native data type object to wrap. Returns ------- Self - The wrapped dtype. + The ZDType that wraps the native data type. Raises ------ TypeError - If the dtype does not match the dtype_cls class attribute. + If the native data type is not consistent with the wrapped data type. """ if cls._check_native_dtype(dtype): return cls._from_native_dtype_unchecked(dtype) @@ -130,7 +136,8 @@ def from_native_dtype(cls: type[Self], dtype: TBaseDType) -> Self: @abstractmethod def _from_native_dtype_unchecked(cls: type[Self], dtype: TBaseDType) -> Self: """ - Wrap a native dtype without checking. + Create a ZDType instance from a native data type without performing any type checking of + that data type. 
Parameters ---------- @@ -140,19 +147,19 @@ def _from_native_dtype_unchecked(cls: type[Self], dtype: TBaseDType) -> Self: Returns ------- Self - The wrapped dtype. + A ZDType that wraps the native dtype. """ ... @abstractmethod def to_native_dtype(self: Self) -> TDType_co: """ - Return an instance of the wrapped dtype. + Return an instance of the wrapped data type. This operation inverts ``from_native_dtype``. Returns ------- TDType - The unwrapped dtype. + The native data type wrapped by this ZDType. """ ... @@ -220,10 +227,10 @@ def _cast_scalar_unchecked(self, data: object) -> TScalar_co: @abstractmethod def default_scalar(self) -> TScalar_co: """ - Get the default scalar value for the wrapped data type. This is a method, rather than an attribute, - because the default value for some data types may depend on parameters that are not known - until a concrete data type is wrapped. For example, data types parametrized by a length like - fixed-length strings or bytes will generate scalars consistent with that length. + Get the default scalar value for the wrapped data type. This is a method, rather than an + attribute, because the default value for some data types depends on parameters that are + not known until a concrete data type is wrapped. For example, data types parametrized by a + length like fixed-length strings or bytes will generate scalars consistent with that length. Returns ------- @@ -238,7 +245,7 @@ def _check_json_v2( cls: type[Self], data: JSON, *, object_codec_id: str | None = None ) -> TypeGuard[DTypeJSON_V2]: """ - Check that a JSON representation of a data type is consistent with the ZDType class. + Check that JSON data matches the Zarr V2 JSON serialization of this ZDType. Parameters ---------- @@ -246,17 +253,17 @@ def _check_json_v2( The JSON representation of the data type. object_codec_id : str | None - The object codec ID, if applicable. 
Object codecs are specific numcodecs codecs that - zarr-python 2.x used to serialize numpy "Object" scalars. For example, a dtype field set - to ``"|O"`` with an object codec ID of "vlen-utf8" indicates that the data type is a - variable-length string. + The string identifier of an object codec, if applicable. Object codecs are specific + numcodecs codecs that zarr-python 2.x used to serialize numpy "Object" scalars. + For example, a dtype field set to ``"|O"`` with an object codec ID of "vlen-utf8" + indicates that the data type is a variable-length string. Zarr V3 has no such logic, so this parameter is only used for Zarr V2 compatibility. Returns ------- Bool - True if the JSON representation matches, False otherwise. + True if the JSON representation matches this data type, False otherwise. """ ... @@ -264,10 +271,7 @@ def _check_json_v2( @abstractmethod def _check_json_v3(cls: type[Self], data: JSON) -> TypeGuard[DTypeJSON_V3]: """ - Check that a JSON representation of a data type matches the dtype_cls class attribute. Used - as a type guard. This base implementation checks that the input is a dictionary, - that the key "name" is in that dictionary, and that the value of "name" - matches the _zarr_v3_name class attribute. + Check that JSON data matches the Zarr V3 JSON serialization of this ZDType. Parameters ---------- @@ -290,7 +294,7 @@ def to_json(self, zarr_format: Literal[3]) -> DTypeJSON_V3: ... @abstractmethod def to_json(self, zarr_format: ZarrFormat) -> DTypeJSON_V2 | DTypeJSON_V3: """ - Convert the wrapped data type to a JSON-serializable form. + Serialize this ZDType to JSON. Parameters ---------- @@ -307,7 +311,7 @@ def to_json(self, zarr_format: ZarrFormat) -> DTypeJSON_V2 | DTypeJSON_V3: @classmethod def from_json_v3(cls: type[Self], data: JSON) -> Self: """ - Wrap a Zarr V3 JSON representation of a data type. + Create an instance of this ZDType from Zarr V3 JSON data. 
Parameters ---------- @@ -326,7 +330,7 @@ def from_json_v3(cls: type[Self], data: JSON) -> Self: @classmethod def from_json_v2(cls: type[Self], data: JSON, *, object_codec_id: str | None) -> Self: """ - Wrap a Zarr V2 JSON representation of a data type. + Create an instance of this ZDType from Zarr V2 JSON data. Parameters ---------- @@ -377,19 +381,21 @@ def _from_json_unchecked( @abstractmethod def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> JSON: """ - Convert a single value to JSON-serializable format. + Serialize a python object to the JSON representation of a scalar. The value will first be + cast to the scalar type associated with this ZDType, then serialized to JSON. Parameters ---------- data : object The value to convert. zarr_format : ZarrFormat - The zarr format version. + The zarr format version. This is specified because the JSON serialization of scalars + differs between Zarr V2 and Zarr V3. Returns ------- JSON - The JSON-serializable form of the scalar. + The JSON-serialized scalar. """ ... @@ -401,13 +407,14 @@ def from_json_scalar(self: Self, data: JSON, *, zarr_format: ZarrFormat) -> TSca Parameters ---------- data : JSON - The JSON-serializable value. + A JSON representation of a scalar value. zarr_format : ZarrFormat - The zarr format version. + The zarr format version. This is specified because the JSON serialization of scalars + differs between Zarr V2 and Zarr V3. Returns ------- TScalar - The native scalar value. + The deserialized scalar value. """ ... 
From 63de7c492c0cfd1b0d9ffe6b896be6a081fa0e68 Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Wed, 11 Jun 2025 19:56:08 +0300 Subject: [PATCH 125/130] Update docs/user-guide/data_types.rst Co-authored-by: Ryan Abernathey --- docs/user-guide/data_types.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index c101ae50fc..0150e025e3 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -83,9 +83,9 @@ Zarr V3 brings several key changes to how data types are represented: { "name": "numpy.datetime64", "configuration": { - "unit": "s", - "scale_factor": 10 - } + "unit": "s", + "scale_factor": 10 + } } From b069d3684789ae67f60f5603e3d69e51792a2bcd Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 13 Jun 2025 18:53:56 +0300 Subject: [PATCH 126/130] refactor wrapper to allow subclasses to freely define their own type guards for native dtype and json input --- src/zarr/codecs/bytes.py | 15 +- src/zarr/core/dtype/common.py | 8 +- src/zarr/core/dtype/npy/bool.py | 58 +++-- src/zarr/core/dtype/npy/bytes.py | 126 +++++++--- src/zarr/core/dtype/npy/common.py | 65 +++-- src/zarr/core/dtype/npy/complex.py | 58 +++-- src/zarr/core/dtype/npy/float.py | 51 +++- src/zarr/core/dtype/npy/int.py | 298 ++++++++++++++-------- src/zarr/core/dtype/npy/string.py | 308 ++++++++++++----------- src/zarr/core/dtype/npy/structured.py | 138 +++++----- src/zarr/core/dtype/npy/time.py | 157 ++++++++---- src/zarr/core/dtype/wrapper.py | 116 ++------- tests/test_array.py | 14 +- tests/test_dtype/test_npy/test_common.py | 12 +- 14 files changed, 835 insertions(+), 589 deletions(-) diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py index 6ef0fef60b..d663a3b2cc 100644 --- a/src/zarr/codecs/bytes.py +++ b/src/zarr/codecs/bytes.py @@ -3,7 +3,7 @@ import sys from dataclasses import dataclass, replace from enum import Enum -from typing import TYPE_CHECKING, cast 
+from typing import TYPE_CHECKING import numpy as np @@ -11,14 +11,12 @@ from zarr.core.buffer import Buffer, NDArrayLike, NDBuffer from zarr.core.common import JSON, parse_enum, parse_named_configuration from zarr.core.dtype.common import HasEndianness -from zarr.core.dtype.npy.common import endianness_to_numpy_str from zarr.registry import register_codec if TYPE_CHECKING: from typing import Self from zarr.core.array_spec import ArraySpec - from zarr.core.dtype.common import Endianness class Endian(Enum): @@ -75,12 +73,11 @@ async def _decode_single( ) -> NDBuffer: assert isinstance(chunk_bytes, Buffer) # TODO: remove endianness enum in favor of literal union - endian_str = cast( - "Endianness | None", self.endian.value if self.endian is not None else None - ) - new_byte_order = endianness_to_numpy_str(endian_str) - dtype = chunk_spec.dtype.to_native_dtype().newbyteorder(new_byte_order) - + endian_str = self.endian.value if self.endian is not None else None + if isinstance(chunk_spec.dtype, HasEndianness): + dtype = replace(chunk_spec.dtype, endianness=endian_str).to_native_dtype() # type: ignore[call-arg] + else: + dtype = chunk_spec.dtype.to_native_dtype() as_array_like = chunk_bytes.as_array_like() if isinstance(as_array_like, NDArrayLike): as_nd_array_like = as_array_like diff --git a/src/zarr/core/dtype/common.py b/src/zarr/core/dtype/common.py index 5630f1692e..9fabfa2737 100644 --- a/src/zarr/core/dtype/common.py +++ b/src/zarr/core/dtype/common.py @@ -4,7 +4,8 @@ from dataclasses import dataclass from typing import ClassVar, Final, Literal -Endianness = Literal["little", "big"] +EndiannessStr = Literal["little", "big"] +ENDIANNESS_STR: Final = "little", "big" SpecialFloatStrings = Literal["NaN", "Infinity", "-Infinity"] SPECIAL_FLOAT_STRINGS: Final = ("NaN", "Infinity", "-Infinity") JSONFloatV2 = float | SpecialFloatStrings @@ -14,6 +15,9 @@ class DataTypeValidationError(ValueError): ... +class ScalarTypeValidationError(ValueError): ... 
+ + @dataclass(frozen=True) class HasLength: """ @@ -30,7 +34,7 @@ class HasEndianness: A mix-in class for data types with an endianness attribute """ - endianness: Endianness | None = "little" + endianness: EndiannessStr = "little" @dataclass(frozen=True) diff --git a/src/zarr/core/dtype/npy/bool.py b/src/zarr/core/dtype/npy/bool.py index bee42b6a13..2d045ce28a 100644 --- a/src/zarr/core/dtype/npy/bool.py +++ b/src/zarr/core/dtype/npy/bool.py @@ -4,9 +4,8 @@ import numpy as np from zarr.core.common import JSON, ZarrFormat -from zarr.core.dtype.common import HasItemSize -from zarr.core.dtype.npy.common import check_json_bool -from zarr.core.dtype.wrapper import DTypeJSON_V2, DTypeJSON_V3, TBaseDType, ZDType +from zarr.core.dtype.common import DataTypeValidationError, HasItemSize +from zarr.core.dtype.wrapper import TBaseDType, ZDType @dataclass(frozen=True, kw_only=True, slots=True) @@ -23,14 +22,24 @@ class Bool(ZDType[np.dtypes.BoolDType, np.bool_], HasItemSize): """ _zarr_v3_name: ClassVar[Literal["bool"]] = "bool" - _zarr_v2_names: ClassVar[tuple[str, ...]] = ("|b1",) + _zarr_v2_name: ClassVar[Literal["|b1"]] = "|b1" dtype_cls = np.dtypes.BoolDType @classmethod - def _from_native_dtype_unchecked(cls, dtype: TBaseDType) -> Self: - return cls() + def from_native_dtype(cls, dtype: TBaseDType) -> Self: + """ + Create a Bool from a np.dtype('bool') instance. + """ + if cls._check_native_dtype(dtype): + return cls() + raise DataTypeValidationError( + f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" + ) def to_native_dtype(self: Self) -> np.dtypes.BoolDType: + """ + Create a NumPy boolean dtype instance from this ZDType + """ return self.dtype_cls() @classmethod @@ -38,14 +47,28 @@ def _check_json_v2( cls, data: JSON, *, object_codec_id: str | None = None ) -> TypeGuard[Literal["|b1"]]: """ - Check that the input is a valid JSON representation of a bool. + Check that the input is a valid JSON representation of a Bool. 
""" - return data in cls._zarr_v2_names + return data == cls._zarr_v2_name @classmethod def _check_json_v3(cls, data: JSON) -> TypeGuard[Literal["bool"]]: return data == cls._zarr_v3_name + @classmethod + def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self: + if cls._check_json_v2(data): + return cls() + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v2_name!r}" + raise DataTypeValidationError(msg) + + @classmethod + def from_json_v3(cls: type[Self], data: JSON) -> Self: + if cls._check_json_v3(data): + return cls() + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" + raise DataTypeValidationError(msg) + @overload def to_json(self, zarr_format: Literal[2]) -> Literal["|b1"]: ... @@ -54,17 +77,11 @@ def to_json(self, zarr_format: Literal[3]) -> Literal["bool"]: ... def to_json(self, zarr_format: ZarrFormat) -> Literal["|b1", "bool"]: if zarr_format == 2: - return self.to_native_dtype().str + return self._zarr_v2_name elif zarr_format == 3: return self._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - @classmethod - def _from_json_unchecked( - cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat - ) -> Self: - return cls() - def default_scalar(self) -> np.bool_: """ Get the default value for the boolean dtype. @@ -110,16 +127,19 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bool_: np.bool_ The numpy boolean scalar. """ - if check_json_bool(data): - return self._cast_scalar_unchecked(data) + if self._check_scalar(data): + return np.bool_(data) raise TypeError(f"Invalid type: {data}. 
Expected a boolean.") # pragma: no cover def _check_scalar(self, data: object) -> bool: # Anything can become a bool return True - def _cast_scalar_unchecked(self, data: object) -> np.bool_: - return np.bool_(data) + def cast_scalar(self, data: object) -> np.bool_: + if self._check_scalar(data): + return np.bool_(data) + msg = f"Cannot convert object with type {type(data)} to a numpy boolean." + raise TypeError(msg) @property def item_size(self) -> int: diff --git a/src/zarr/core/dtype/npy/bytes.py b/src/zarr/core/dtype/npy/bytes.py index d489f2f610..d98114e9e1 100644 --- a/src/zarr/core/dtype/npy/bytes.py +++ b/src/zarr/core/dtype/npy/bytes.py @@ -6,10 +6,18 @@ import numpy as np from zarr.core.common import JSON, NamedConfig, ZarrFormat -from zarr.core.dtype.common import HasItemSize, HasLength, HasObjectCodec, v3_unstable_dtype_warning +from zarr.core.dtype.common import ( + DataTypeValidationError, + HasItemSize, + HasLength, + HasObjectCodec, + v3_unstable_dtype_warning, +) from zarr.core.dtype.npy.common import check_json_str from zarr.core.dtype.wrapper import DTypeJSON_V2, DTypeJSON_V3, TBaseDType, ZDType +BytesLike = np.bytes_ | str | bytes | int + class FixedLengthBytesConfig(TypedDict): length_bytes: int @@ -25,8 +33,12 @@ class NullTerminatedBytes(ZDType[np.dtypes.BytesDType[int], np.bytes_], HasLengt _zarr_v3_name: ClassVar[Literal["null_terminated_bytes"]] = "null_terminated_bytes" @classmethod - def _from_native_dtype_unchecked(cls, dtype: TBaseDType) -> Self: - return cls(length=dtype.itemsize) + def from_native_dtype(cls, dtype: TBaseDType) -> Self: + if cls._check_native_dtype(dtype): + return cls(length=dtype.itemsize) + raise DataTypeValidationError( + f"Invalid data type: {dtype}. 
Expected an instance of {cls.dtype_cls}" + ) def to_native_dtype(self) -> np.dtypes.BytesDType[int]: return self.dtype_cls(self.length) @@ -49,6 +61,20 @@ def _check_json_v3(cls, data: JSON) -> TypeGuard[NullTerminatedBytesJSONV3]: and "length_bytes" in data["configuration"] ) + @classmethod + def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self: + if cls._check_json_v2(data): + return cls(length=int(data[2:])) + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a string like '|S1', '|S2', etc" + raise DataTypeValidationError(msg) + + @classmethod + def from_json_v3(cls, data: JSON) -> Self: + if cls._check_json_v3(data): + return cls(length=data["configuration"]["length_bytes"]) + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" + raise DataTypeValidationError(msg) + @overload def to_json(self, zarr_format: Literal[2]) -> str: ... @@ -90,11 +116,11 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: f"Invalid type: {data}. Expected a base64-encoded string." 
) # pragma: no cover - def _check_scalar(self, data: object) -> bool: + def _check_scalar(self, data: object) -> TypeGuard[BytesLike]: # this is generous for backwards compatibility - return isinstance(data, np.bytes_ | str | bytes | int) + return isinstance(data, BytesLike) - def _cast_scalar_unchecked(self, data: object) -> np.bytes_: + def _cast_scalar_unchecked(self, data: BytesLike) -> np.bytes_: # We explicitly truncate the result because of the following numpy behavior: # >>> x = np.dtype('S3').type('hello world') # >>> x @@ -105,7 +131,13 @@ def _cast_scalar_unchecked(self, data: object) -> np.bytes_: if isinstance(data, int): return self.to_native_dtype().type(str(data)[: self.length]) else: - return self.to_native_dtype().type(data[: self.length]) # type: ignore[index] + return self.to_native_dtype().type(data[: self.length]) + + def cast_scalar(self, data: object) -> np.bytes_: + if self._check_scalar(data): + return self._cast_scalar_unchecked(data) + msg = f"Cannot convert object with type {type(data)} to a numpy bytes scalar." + raise TypeError(msg) @property def item_size(self) -> int: @@ -121,8 +153,12 @@ class RawBytes(ZDType[np.dtypes.VoidDType[int], np.void], HasLength, HasItemSize _zarr_v3_name: ClassVar[Literal["raw_bytes"]] = "raw_bytes" @classmethod - def _from_native_dtype_unchecked(cls, dtype: TBaseDType) -> Self: - return cls(length=dtype.itemsize) + def from_native_dtype(cls, dtype: TBaseDType) -> Self: + if cls._check_native_dtype(dtype): + return cls(length=dtype.itemsize) + raise DataTypeValidationError( + f"Invalid data type: {dtype}. 
Expected an instance of {cls.dtype_cls}" # type: ignore[has-type] + ) def to_native_dtype(self) -> np.dtypes.VoidDType[int]: # Numpy does not allow creating a void type @@ -144,6 +180,20 @@ def _check_json_v3(cls, data: JSON) -> TypeGuard[RawBytesJSONV3]: and set(data["configuration"].keys()) == {"length_bytes"} ) + @classmethod + def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self: + if cls._check_json_v2(data): + return cls(length=int(data[2:])) + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a string like '|V1', '|V2', etc" + raise DataTypeValidationError(msg) + + @classmethod + def from_json_v3(cls, data: JSON) -> Self: + if cls._check_json_v3(data): + return cls(length=data["configuration"]["length_bytes"]) + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" + raise DataTypeValidationError(msg) + @overload def to_json(self, zarr_format: Literal[2]) -> str: ... @@ -158,16 +208,6 @@ def to_json(self, zarr_format: ZarrFormat) -> str | RawBytesJSONV3: return {"name": self._zarr_v3_name, "configuration": {"length_bytes": self.length}} raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - @classmethod - def _from_json_unchecked( - cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat - ) -> Self: - if zarr_format == 2: - return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls(length=data["configuration"]["length_bytes"]) # type: ignore[index, call-overload] - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - @classmethod def _check_native_dtype( cls: type[Self], dtype: TBaseDType @@ -212,6 +252,12 @@ def _cast_scalar_unchecked(self, data: object) -> np.void: # the result will actually be a V10 scalar. 
return native_dtype.type(data, native_dtype) + def cast_scalar(self, data: object) -> np.void: + if self._check_scalar(data): + return self._cast_scalar_unchecked(data) + msg = f"Cannot convert object with type {type(data)} to a numpy void scalar." + raise TypeError(msg) + @property def item_size(self) -> int: return self.length @@ -224,8 +270,12 @@ class VariableLengthBytes(ZDType[np.dtypes.ObjectDType, bytes], HasObjectCodec): object_codec_id = "vlen-bytes" @classmethod - def _from_native_dtype_unchecked(cls, dtype: TBaseDType) -> Self: - return cls() + def from_native_dtype(cls, dtype: TBaseDType) -> Self: + if cls._check_native_dtype(dtype): + return cls() + raise DataTypeValidationError( + f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" + ) def to_native_dtype(self) -> np.dtypes.ObjectDType: return self.dtype_cls() @@ -244,6 +294,20 @@ def _check_json_v2( def _check_json_v3(cls, data: JSON) -> TypeGuard[Literal["variable_length_bytes"]]: return data == cls._zarr_v3_name + @classmethod + def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self: + if cls._check_json_v2(data, object_codec_id=object_codec_id): + return cls() + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string '|O' and an object_codec_id of {cls.object_codec_id}" + raise DataTypeValidationError(msg) + + @classmethod + def from_json_v3(cls, data: JSON) -> Self: + if cls._check_json_v3(data): + return cls() + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" + raise DataTypeValidationError(msg) + @overload def to_json(self, zarr_format: Literal[2]) -> Literal["|O"]: ... 
@@ -258,12 +322,6 @@ def to_json(self, zarr_format: ZarrFormat) -> Literal["|O", "variable_length_byt return self._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - @classmethod - def _from_json_unchecked( - cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat - ) -> Self: - return cls() - def default_scalar(self) -> bytes: return b"" @@ -275,10 +333,16 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> bytes: return base64.standard_b64decode(data.encode("ascii")) raise TypeError(f"Invalid type: {data}. Expected a string.") # pragma: no cover - def _check_scalar(self, data: object) -> bool: - return isinstance(data, bytes | str) + def _check_scalar(self, data: object) -> TypeGuard[BytesLike]: + return isinstance(data, BytesLike) - def _cast_scalar_unchecked(self, data: object) -> bytes: + def _cast_scalar_unchecked(self, data: BytesLike) -> bytes: if isinstance(data, str): return bytes(data, encoding="utf-8") - return bytes(data) # type: ignore[no-any-return, call-overload] + return bytes(data) + + def cast_scalar(self, data: object) -> bytes: + if self._check_scalar(data): + return self._cast_scalar_unchecked(data) + msg = f"Cannot convert object with type {type(data)} to bytes." 
+ raise TypeError(msg) diff --git a/src/zarr/core/dtype/npy/common.py b/src/zarr/core/dtype/npy/common.py index 03dc194a7a..264561f25c 100644 --- a/src/zarr/core/dtype/npy/common.py +++ b/src/zarr/core/dtype/npy/common.py @@ -7,6 +7,7 @@ from typing import ( TYPE_CHECKING, Any, + Final, Literal, SupportsComplex, SupportsFloat, @@ -14,12 +15,17 @@ SupportsInt, TypeGuard, TypeVar, - get_args, ) import numpy as np -from zarr.core.dtype.common import SPECIAL_FLOAT_STRINGS, Endianness, JSONFloatV2, JSONFloatV3 +from zarr.core.dtype.common import ( + ENDIANNESS_STR, + SPECIAL_FLOAT_STRINGS, + EndiannessStr, + JSONFloatV2, + JSONFloatV3, +) if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat @@ -30,7 +36,26 @@ DateTimeUnit = Literal[ "Y", "M", "W", "D", "h", "m", "s", "ms", "us", "μs", "ns", "ps", "fs", "as", "generic" ] -EndiannessNumpy = Literal[">", "<", "|", "="] +DATETIME_UNIT: Final = ( + "Y", + "M", + "W", + "D", + "h", + "m", + "s", + "ms", + "us", + "μs", + "ns", + "ps", + "fs", + "as", + "generic", +) + +NumpyEndiannessStr = Literal[">", "<", "="] +NUMPY_ENDIANNESS_STR: Final = ">", "<", "=" TFloatDType_co = TypeVar( "TFloatDType_co", @@ -47,18 +72,18 @@ TComplexScalar_co = TypeVar("TComplexScalar_co", bound=np.complex64 | np.complex128, covariant=True) -def endianness_from_numpy_str(endianness: EndiannessNumpy) -> Endianness | None: +def endianness_from_numpy_str(endianness: NumpyEndiannessStr) -> EndiannessStr: """ Convert a numpy endianness string literal to a human-readable literal value. Parameters ---------- - endianness : Literal[">", "<", "=", "|"] + endianness : Literal[">", "<", "="] The numpy string representation of the endianness. Returns ------- - Endianness or None + Endianness The human-readable representation of the endianness. 
Raises @@ -74,26 +99,21 @@ def endianness_from_numpy_str(endianness: EndiannessNumpy) -> Endianness | None: return "little" case ">": return "big" - case "|": - # for dtypes without byte ordering semantics - return None - raise ValueError( - f"Invalid endianness: {endianness!r}. Expected one of {get_args(EndiannessNumpy)}" - ) + raise ValueError(f"Invalid endianness: {endianness!r}. Expected one of {NUMPY_ENDIANNESS_STR}") -def endianness_to_numpy_str(endianness: Endianness | None) -> EndiannessNumpy: +def endianness_to_numpy_str(endianness: EndiannessStr) -> NumpyEndiannessStr: """ Convert an endianness literal to its numpy string representation. Parameters ---------- - endianness : Endianness or None + endianness : Endianness The endianness to convert. Returns ------- - Literal[">", "<", "|"] + Literal[">", "<"] The numpy string representation of the endianness. Raises @@ -106,13 +126,22 @@ def endianness_to_numpy_str(endianness: Endianness | None) -> EndiannessNumpy: return "<" case "big": return ">" - case None: - return "|" raise ValueError( - f"Invalid endianness: {endianness!r}. Expected one of {get_args(Endianness)} or None" + f"Invalid endianness: {endianness!r}. Expected one of {ENDIANNESS_STR} or None" ) +def get_endianness_from_numpy_dtype(dtype: np.dtype[np.generic]) -> EndiannessStr: + """ + Gets the endianness from a numpy dtype that has an endianness. This function will + raise a ValueError if the numpy data type does not have a concrete endianness. + """ + endianness = dtype.byteorder + if dtype.byteorder in NUMPY_ENDIANNESS_STR: + return endianness_from_numpy_str(endianness) # type: ignore [arg-type] + raise ValueError(f"The dtype {dtype} has an unsupported endianness: {endianness}") + + def float_from_json_v2(data: JSONFloatV2) -> float: """ Convert a JSON float to a float (Zarr v2). 
diff --git a/src/zarr/core/dtype/npy/complex.py b/src/zarr/core/dtype/npy/complex.py index f68640e4ce..2df60f930b 100644 --- a/src/zarr/core/dtype/npy/complex.py +++ b/src/zarr/core/dtype/npy/complex.py @@ -1,17 +1,15 @@ from dataclasses import dataclass from typing import ( - TYPE_CHECKING, ClassVar, Literal, Self, TypeGuard, - cast, ) import numpy as np from zarr.core.common import JSON, ZarrFormat -from zarr.core.dtype.common import HasEndianness, HasItemSize +from zarr.core.dtype.common import DataTypeValidationError, HasEndianness, HasItemSize from zarr.core.dtype.npy.common import ( ComplexLike, TComplexDType_co, @@ -22,13 +20,10 @@ complex_float_from_json_v3, complex_float_to_json_v2, complex_float_to_json_v3, - endianness_from_numpy_str, endianness_to_numpy_str, + get_endianness_from_numpy_dtype, ) -from zarr.core.dtype.wrapper import DTypeJSON_V2, DTypeJSON_V3, TBaseDType, ZDType - -if TYPE_CHECKING: - from zarr.core.dtype.npy.common import EndiannessNumpy +from zarr.core.dtype.wrapper import TBaseDType, ZDType @dataclass(frozen=True) @@ -37,9 +32,12 @@ class BaseComplex(ZDType[TComplexDType_co, TComplexScalar_co], HasEndianness, Ha _zarr_v2_names: ClassVar[tuple[str, ...]] @classmethod - def _from_native_dtype_unchecked(cls, dtype: TBaseDType) -> Self: - byte_order = cast("EndiannessNumpy", dtype.byteorder) - return cls(endianness=endianness_from_numpy_str(byte_order)) + def from_native_dtype(cls, dtype: TBaseDType) -> Self: + if cls._check_native_dtype(dtype): + return cls(endianness=get_endianness_from_numpy_dtype(dtype)) + raise DataTypeValidationError( + f"Invalid data type: {dtype}. 
Expected an instance of {cls.dtype_cls}" + ) def to_native_dtype(self) -> TComplexDType_co: byte_order = endianness_to_numpy_str(self.endianness) @@ -65,16 +63,6 @@ def to_json(self, zarr_format: ZarrFormat) -> str: return self._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - @classmethod - def _from_json_unchecked( - cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat - ) -> Self: - if zarr_format == 2: - return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - @classmethod def _check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: """ @@ -86,11 +74,33 @@ def _check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Ty def _check_json_v3(cls, data: JSON) -> TypeGuard[str]: return data == cls._zarr_v3_name - def _check_scalar(self, data: object) -> bool: + @classmethod + def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self: + if cls._check_json_v2(data, object_codec_id=object_codec_id): + # Going via numpy ensures that we get the endianness correct without + # annoying string parsing. + return cls.from_native_dtype(np.dtype(data)) + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." + raise DataTypeValidationError(msg) + + @classmethod + def from_json_v3(cls, data: JSON) -> Self: + if cls._check_json_v3(data): + return cls() + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected {cls._zarr_v3_name}." 
+ raise DataTypeValidationError(msg) + + def _check_scalar(self, data: object) -> TypeGuard[ComplexLike]: return isinstance(data, ComplexLike) - def _cast_scalar_unchecked(self, data: object) -> TComplexScalar_co: - return self.to_native_dtype().type(data) # type: ignore[arg-type, return-value] + def cast_scalar(self, data: object) -> TComplexScalar_co: + if self._check_scalar(data): + return self._cast_scalar_unchecked(data) + msg = f"Cannot convert object with type {type(data)} to a numpy float scalar." + raise TypeError(msg) + + def _cast_scalar_unchecked(self, data: ComplexLike) -> TComplexScalar_co: + return self.to_native_dtype().type(data) # type: ignore[return-value] def default_scalar(self) -> TComplexScalar_co: """ diff --git a/src/zarr/core/dtype/npy/float.py b/src/zarr/core/dtype/npy/float.py index f87f032581..60a05326d5 100644 --- a/src/zarr/core/dtype/npy/float.py +++ b/src/zarr/core/dtype/npy/float.py @@ -1,23 +1,27 @@ from dataclasses import dataclass -from typing import ClassVar, Self, TypeGuard, cast +from typing import ClassVar, Self, TypeGuard import numpy as np from zarr.core.common import JSON, ZarrFormat -from zarr.core.dtype.common import HasEndianness, HasItemSize +from zarr.core.dtype.common import ( + DataTypeValidationError, + HasEndianness, + HasItemSize, + ScalarTypeValidationError, +) from zarr.core.dtype.npy.common import ( - EndiannessNumpy, FloatLike, TFloatDType_co, TFloatScalar_co, check_json_float_v2, check_json_float_v3, - endianness_from_numpy_str, endianness_to_numpy_str, float_from_json_v2, float_from_json_v3, float_to_json_v2, float_to_json_v3, + get_endianness_from_numpy_dtype, ) from zarr.core.dtype.wrapper import DTypeJSON_V2, DTypeJSON_V3, TBaseDType, ZDType @@ -28,9 +32,12 @@ class BaseFloat(ZDType[TFloatDType_co, TFloatScalar_co], HasEndianness, HasItemS _zarr_v2_names: ClassVar[tuple[str, ...]] @classmethod - def _from_native_dtype_unchecked(cls, dtype: TBaseDType) -> Self: - byte_order = cast("EndiannessNumpy", 
dtype.byteorder) - return cls(endianness=endianness_from_numpy_str(byte_order)) + def from_native_dtype(cls, dtype: TBaseDType) -> Self: + if cls._check_native_dtype(dtype): + return cls(endianness=get_endianness_from_numpy_dtype(dtype)) + raise DataTypeValidationError( + f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" + ) def to_native_dtype(self) -> TFloatDType_co: byte_order = endianness_to_numpy_str(self.endianness) @@ -77,11 +84,33 @@ def _check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Ty def _check_json_v3(cls, data: JSON) -> TypeGuard[str]: return data == cls._zarr_v3_name + @classmethod + def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self: + if cls._check_json_v2(data, object_codec_id=object_codec_id): + # Going via numpy ensures that we get the endianness correct without + # annoying string parsing. + return cls.from_native_dtype(np.dtype(data)) + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." + raise DataTypeValidationError(msg) + + @classmethod + def from_json_v3(cls, data: JSON) -> Self: + if cls._check_json_v3(data): + return cls() + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected {cls._zarr_v3_name}." + raise DataTypeValidationError(msg) + def _check_scalar(self, data: object) -> TypeGuard[FloatLike]: return isinstance(data, FloatLike) - def _cast_scalar_unchecked(self, data: object) -> TFloatScalar_co: - return self.to_native_dtype().type(data) # type: ignore[return-value, arg-type] + def cast_scalar(self, data: object) -> TFloatScalar_co: + if self._check_scalar(data): + return self._cast_scalar_unchecked(data) + msg = f"Cannot convert object with type {type(data)} to a numpy float scalar." 
+ raise ScalarTypeValidationError(msg) + + def _cast_scalar_unchecked(self, data: FloatLike) -> TFloatScalar_co: + return self.to_native_dtype().type(data) # type: ignore[return-value] def default_scalar(self) -> TFloatScalar_co: """ @@ -145,9 +174,9 @@ def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> float | st See the zarr specifications for details on the JSON encoding for floats. """ if zarr_format == 2: - return float_to_json_v2(self._cast_scalar_unchecked(data)) + return float_to_json_v2(self.cast_scalar(data)) elif zarr_format == 3: - return float_to_json_v3(self._cast_scalar_unchecked(data)) + return float_to_json_v3(self.cast_scalar(data)) else: raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover diff --git a/src/zarr/core/dtype/npy/int.py b/src/zarr/core/dtype/npy/int.py index aed577ee44..804e9e359a 100644 --- a/src/zarr/core/dtype/npy/int.py +++ b/src/zarr/core/dtype/npy/int.py @@ -7,21 +7,19 @@ SupportsInt, TypeGuard, TypeVar, - cast, overload, ) import numpy as np from zarr.core.common import JSON, ZarrFormat -from zarr.core.dtype.common import HasEndianness, HasItemSize +from zarr.core.dtype.common import DataTypeValidationError, HasEndianness, HasItemSize from zarr.core.dtype.npy.common import ( - EndiannessNumpy, check_json_int, - endianness_from_numpy_str, endianness_to_numpy_str, + get_endianness_from_numpy_dtype, ) -from zarr.core.dtype.wrapper import DTypeJSON_V2, DTypeJSON_V3, TBaseDType, ZDType +from zarr.core.dtype.wrapper import TBaseDType, ZDType _NumpyIntDType = ( np.dtypes.Int8DType @@ -43,7 +41,7 @@ @dataclass(frozen=True) class BaseInt(ZDType[TIntDType_co, TIntScalar_co], HasItemSize): - # This attribute holds the possible zarr v2 JSON names for the data type + # This attribute holds the possible zarr V2 JSON names for the data type _zarr_v2_names: ClassVar[tuple[str, ...]] @classmethod @@ -55,13 +53,28 @@ def _check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> 
Ty @classmethod def _check_json_v3(cls, data: JSON) -> TypeGuard[str]: + """ + Check that a JSON value is consistent with the zarr v3 spec for this data type. + """ return data == cls._zarr_v3_name def _check_scalar(self, data: object) -> TypeGuard[IntLike]: + """ + Check that a python object is IntLike + """ return isinstance(data, IntLike) - def _cast_scalar_unchecked(self, data: object) -> TIntScalar_co: - return self.to_native_dtype().type(data) # type: ignore[return-value, arg-type] + def _cast_scalar_unchecked(self, data: IntLike) -> TIntScalar_co: + """ + Create an integer without any type checking of the input. + """ + return self.to_native_dtype().type(data) # type: ignore[return-value] + + def cast_scalar(self, data: object) -> TIntScalar_co: + if self._check_scalar(data): + return self._cast_scalar_unchecked(data) + msg = f"Cannot convert object with type {type(data)} to a numpy integer." + raise TypeError(msg) def default_scalar(self) -> TIntScalar_co: """ @@ -117,7 +130,18 @@ def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> int: class Int8(BaseInt[np.dtypes.Int8DType, np.int8]): dtype_cls = np.dtypes.Int8DType _zarr_v3_name: ClassVar[Literal["int8"]] = "int8" - _zarr_v2_names: ClassVar[tuple[str, ...]] = ("|i1",) + _zarr_v2_names: ClassVar[tuple[Literal["|i1"]]] = ("|i1",) + + @classmethod + def from_native_dtype(cls, dtype: TBaseDType) -> Self: + """ + Create a Int8 from a np.dtype('int8') instance. + """ + if cls._check_native_dtype(dtype): + return cls() + raise DataTypeValidationError( + f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" + ) @overload def to_json(self, zarr_format: Literal[2]) -> Literal["|i1"]: ... 
@@ -140,23 +164,27 @@ def to_json(self, zarr_format: ZarrFormat) -> Literal["int8", "|i1"]: The JSON-serializable representation of the wrapped data type """ if zarr_format == 2: - return self.to_native_dtype().str + return self._zarr_v2_names[0] elif zarr_format == 3: return self._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - @classmethod - def _from_native_dtype_unchecked(cls, dtype: TBaseDType) -> Self: - return cls() - def to_native_dtype(self: Self) -> np.dtypes.Int8DType: return self.dtype_cls() @classmethod - def _from_json_unchecked( - cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat - ) -> Self: - return cls() + def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self: + if cls._check_json_v2(data, object_codec_id=object_codec_id): + return cls() + msg = f"Invalid JSON representation of Int8. Got {data!r}, expected the string {cls._zarr_v2_names[0]!r}" + raise DataTypeValidationError(msg) + + @classmethod + def from_json_v3(cls, data: JSON) -> Self: + if cls._check_json_v3(data): + return cls() + msg = f"Invalid JSON representation of Int8. Got {data!r}, expected the string {cls._zarr_v3_name!r}" + raise DataTypeValidationError(msg) @property def item_size(self) -> int: @@ -167,7 +195,18 @@ def item_size(self) -> int: class UInt8(BaseInt[np.dtypes.UInt8DType, np.uint8]): dtype_cls = np.dtypes.UInt8DType _zarr_v3_name: ClassVar[Literal["uint8"]] = "uint8" - _zarr_v2_names: ClassVar[tuple[str, ...]] = ("|u1",) + _zarr_v2_names: ClassVar[tuple[Literal["|u1"]]] = ("|u1",) + + @classmethod + def from_native_dtype(cls, dtype: TBaseDType) -> Self: + """ + Create a Bool from a np.dtype('uint8') instance. + """ + if cls._check_native_dtype(dtype): + return cls() + raise DataTypeValidationError( + f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" + ) @overload def to_json(self, zarr_format: Literal[2]) -> Literal["|u1"]: ... 
@@ -190,23 +229,27 @@ def to_json(self, zarr_format: ZarrFormat) -> Literal["uint8", "|u1"]: The JSON-serializable representation of the wrapped data type """ if zarr_format == 2: - return self.to_native_dtype().str + return self._zarr_v2_names[0] elif zarr_format == 3: return self._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - @classmethod - def _from_native_dtype_unchecked(cls, dtype: TBaseDType) -> Self: - return cls() - def to_native_dtype(self: Self) -> np.dtypes.UInt8DType: return self.dtype_cls() @classmethod - def _from_json_unchecked( - cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat - ) -> Self: - return cls() + def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self: + if cls._check_json_v2(data, object_codec_id=object_codec_id): + return cls() + msg = f"Invalid JSON representation of UInt8. Got {data!r}, expected the string {cls._zarr_v2_names[0]!r}" + raise DataTypeValidationError(msg) + + @classmethod + def from_json_v3(cls, data: JSON) -> Self: + if cls._check_json_v3(data): + return cls() + msg = f"Invalid JSON representation of UInt8. Got {data!r}, expected the string {cls._zarr_v3_name!r}" + raise DataTypeValidationError(msg) @property def item_size(self) -> int: @@ -217,7 +260,7 @@ def item_size(self) -> int: class Int16(BaseInt[np.dtypes.Int16DType, np.int16], HasEndianness): dtype_cls = np.dtypes.Int16DType _zarr_v3_name: ClassVar[Literal["int16"]] = "int16" - _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i2", "i2"], Literal["i2", " Literal[">i2", " Literal["int16", ">i2", " Self: - byte_order = cast("EndiannessNumpy", dtype.byteorder) - return cls(endianness=endianness_from_numpy_str(byte_order)) + def from_native_dtype(cls, dtype: TBaseDType) -> Self: + if cls._check_native_dtype(dtype): + return cls(endianness=get_endianness_from_numpy_dtype(dtype)) + raise DataTypeValidationError( + f"Invalid data type: {dtype}. 
Expected an instance of {cls.dtype_cls}" + ) def to_native_dtype(self) -> np.dtypes.Int16DType: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) @classmethod - def _from_json_unchecked( - cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat - ) -> Self: - if zarr_format == 2: - # This ensures that we get the endianness correct without annoying string parsing - return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: + def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self: + if cls._check_json_v2(data, object_codec_id=object_codec_id): + # Going via numpy ensures that we get the endianness correct without + # annoying string parsing. + return cls.from_native_dtype(np.dtype(data)) + msg = f"Invalid JSON representation of Int16. Got {data!r}, expected one of the strings {cls._zarr_v2_names!r}." + raise DataTypeValidationError(msg) + + @classmethod + def from_json_v3(cls, data: JSON) -> Self: + if cls._check_json_v3(data): return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + msg = f"Invalid JSON representation of Int16. 
Got {data!r}, expected the string {cls._zarr_v3_name!r}" + raise DataTypeValidationError(msg) @property def item_size(self) -> int: @@ -274,7 +325,7 @@ def item_size(self) -> int: class UInt16(BaseInt[np.dtypes.UInt16DType, np.uint16], HasEndianness): dtype_cls = np.dtypes.UInt16DType _zarr_v3_name: ClassVar[Literal["uint16"]] = "uint16" - _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u2", "u2"], Literal["u2", " Literal[">u2", " Literal["uint16", ">u2", " Self: - byte_order = cast("EndiannessNumpy", dtype.byteorder) - return cls(endianness=endianness_from_numpy_str(byte_order)) + def from_native_dtype(cls, dtype: TBaseDType) -> Self: + if cls._check_native_dtype(dtype): + return cls(endianness=get_endianness_from_numpy_dtype(dtype)) + raise DataTypeValidationError( + f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" + ) def to_native_dtype(self) -> np.dtypes.UInt16DType: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) @classmethod - def _from_json_unchecked( - cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat - ) -> Self: - if zarr_format == 2: - return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: + def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self: + if cls._check_json_v2(data, object_codec_id=object_codec_id): + # Going via numpy ensures that we get the endianness correct without + # annoying string parsing. + return cls.from_native_dtype(np.dtype(data)) + msg = f"Invalid JSON representation of UInt16. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." + raise DataTypeValidationError(msg) + + @classmethod + def from_json_v3(cls, data: JSON) -> Self: + if cls._check_json_v3(data): return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + msg = f"Invalid JSON representation of UInt16. 
Got {data!r}, expected the string {cls._zarr_v3_name!r}" + raise DataTypeValidationError(msg) @property def item_size(self) -> int: @@ -330,7 +390,7 @@ def item_size(self) -> int: class Int32(BaseInt[np.dtypes.Int32DType, np.int32], HasEndianness): dtype_cls = np.dtypes.Int32DType _zarr_v3_name: ClassVar[Literal["int32"]] = "int32" - _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i4", "i4"], Literal["i4", " Literal[">i4", " Literal["int32", ">i4", " Self: - # We override the base implementation to address a windows-specific, pre-numpy 2 issue where - # ``np.dtype('i')`` is an instance of ``np.dtypes.IntDType`` that acts like `int32` instead of ``np.dtype('int32')`` - # In this case, ``type(np.dtype('i')) == np.dtypes.Int32DType`` will evaluate to ``True``, - # despite the two classes being different. Thus we will create an instance of `cls` with the - # latter dtype, after pulling in the byte order of the input - if dtype == np.dtypes.Int32DType(): - return cls._from_native_dtype_unchecked( - np.dtypes.Int32DType().newbyteorder(dtype.byteorder) - ) - else: - return super().from_native_dtype(dtype) - - @classmethod - def _from_native_dtype_unchecked(cls, dtype: TBaseDType) -> Self: - byte_order = cast("EndiannessNumpy", dtype.byteorder) - return cls(endianness=endianness_from_numpy_str(byte_order)) + if cls._check_native_dtype(dtype): + return cls(endianness=get_endianness_from_numpy_dtype(dtype)) + raise DataTypeValidationError( + f"Invalid data type: {dtype}. 
Expected an instance of {cls.dtype_cls}" + ) def to_native_dtype(self) -> np.dtypes.Int32DType: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) @classmethod - def _from_json_unchecked( - cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat - ) -> Self: - if zarr_format == 2: - return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: + def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self: + if cls._check_json_v2(data, object_codec_id=object_codec_id): + # Going via numpy ensures that we get the endianness correct without + # annoying string parsing. + return cls.from_native_dtype(np.dtype(data)) + msg = f"Invalid JSON representation of Int32. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." + raise DataTypeValidationError(msg) + + @classmethod + def from_json_v3(cls, data: JSON) -> Self: + if cls._check_json_v3(data): return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + msg = f"Invalid JSON representation of Int32. Got {data!r}, expected the string {cls._zarr_v3_name!r}" + raise DataTypeValidationError(msg) @property def item_size(self) -> int: @@ -400,7 +455,7 @@ def item_size(self) -> int: class UInt32(BaseInt[np.dtypes.UInt32DType, np.uint32], HasEndianness): dtype_cls = np.dtypes.UInt32DType _zarr_v3_name: ClassVar[Literal["uint32"]] = "uint32" - _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u4", "u4"], Literal["u4", " Literal[">u4", " Literal["uint32", ">u4", " Self: - byte_order = cast("EndiannessNumpy", dtype.byteorder) - return cls(endianness=endianness_from_numpy_str(byte_order)) + def from_native_dtype(cls, dtype: TBaseDType) -> Self: + if cls._check_native_dtype(dtype): + return cls(endianness=get_endianness_from_numpy_dtype(dtype)) + raise DataTypeValidationError( + f"Invalid data type: {dtype}. 
Expected an instance of {cls.dtype_cls}" + ) def to_native_dtype(self) -> np.dtypes.UInt32DType: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) @classmethod - def _from_json_unchecked( - cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat - ) -> Self: - if zarr_format == 2: - return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: + def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self: + if cls._check_json_v2(data, object_codec_id=object_codec_id): + # Going via numpy ensures that we get the endianness correct without + # annoying string parsing. + return cls.from_native_dtype(np.dtype(data)) + msg = f"Invalid JSON representation of UInt32. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." + raise DataTypeValidationError(msg) + + @classmethod + def from_json_v3(cls, data: JSON) -> Self: + if cls._check_json_v3(data): return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + msg = f"Invalid JSON representation of UInt32. Got {data!r}, expected the string {cls._zarr_v3_name!r}" + raise DataTypeValidationError(msg) @property def item_size(self) -> int: @@ -454,7 +518,7 @@ def item_size(self) -> int: class Int64(BaseInt[np.dtypes.Int64DType, np.int64], HasEndianness): dtype_cls = np.dtypes.Int64DType _zarr_v3_name: ClassVar[Literal["int64"]] = "int64" - _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i8", "i8"], Literal["i8", " Literal[">i8", " Literal["int64", ">i8", " Self: - byte_order = cast("EndiannessNumpy", dtype.byteorder) - return cls(endianness=endianness_from_numpy_str(byte_order)) + def from_native_dtype(cls, dtype: TBaseDType) -> Self: + if cls._check_native_dtype(dtype): + return cls(endianness=get_endianness_from_numpy_dtype(dtype)) + raise DataTypeValidationError( + f"Invalid data type: {dtype}. 
Expected an instance of {cls.dtype_cls}" + ) def to_native_dtype(self) -> np.dtypes.Int64DType: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) @classmethod - def _from_json_unchecked( - cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat - ) -> Self: - if zarr_format == 2: - return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: + def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self: + if cls._check_json_v2(data, object_codec_id=object_codec_id): + # Going via numpy ensures that we get the endianness correct without + # annoying string parsing. + return cls.from_native_dtype(np.dtype(data)) + msg = f"Invalid JSON representation of Int64. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." + raise DataTypeValidationError(msg) + + @classmethod + def from_json_v3(cls, data: JSON) -> Self: + if cls._check_json_v3(data): return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + msg = f"Invalid JSON representation of Int64. Got {data!r}, expected the string {cls._zarr_v3_name!r}" + raise DataTypeValidationError(msg) @property def item_size(self) -> int: @@ -508,7 +581,7 @@ def item_size(self) -> int: class UInt64(BaseInt[np.dtypes.UInt64DType, np.uint64], HasEndianness): dtype_cls = np.dtypes.UInt64DType _zarr_v3_name: ClassVar[Literal["uint64"]] = "uint64" - _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u8", "u8"], Literal["u8", " Literal[">u8", " Literal["uint64", ">u8", " Self: - byte_order = cast("EndiannessNumpy", dtype.byteorder) - return cls(endianness=endianness_from_numpy_str(byte_order)) + def from_native_dtype(cls, dtype: TBaseDType) -> Self: + if cls._check_native_dtype(dtype): + return cls(endianness=get_endianness_from_numpy_dtype(dtype)) + raise DataTypeValidationError( + f"Invalid data type: {dtype}. 
Expected an instance of {cls.dtype_cls}" + ) def to_native_dtype(self) -> np.dtypes.UInt64DType: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) @classmethod - def _from_json_unchecked( - cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat - ) -> Self: - if zarr_format == 2: - return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: + def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self: + if cls._check_json_v2(data, object_codec_id=object_codec_id): + # Going via numpy ensures that we get the endianness correct without + # annoying string parsing. + return cls.from_native_dtype(np.dtype(data)) + msg = f"Invalid JSON representation of UInt64. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." + raise DataTypeValidationError(msg) + + @classmethod + def from_json_v3(cls, data: JSON) -> Self: + if cls._check_json_v3(data): return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + msg = f"Invalid JSON representation of UInt64. 
Got {data!r}, expected the string {cls._zarr_v3_name!r}" + raise DataTypeValidationError(msg) @property def item_size(self) -> int: diff --git a/src/zarr/core/dtype/npy/string.py b/src/zarr/core/dtype/npy/string.py index 377c364ca2..f811dce00a 100644 --- a/src/zarr/core/dtype/npy/string.py +++ b/src/zarr/core/dtype/npy/string.py @@ -2,12 +2,23 @@ import re from dataclasses import dataclass -from typing import TYPE_CHECKING, ClassVar, Literal, Self, TypedDict, TypeGuard, cast, overload +from typing import ( + TYPE_CHECKING, + ClassVar, + Literal, + Protocol, + Self, + TypedDict, + TypeGuard, + overload, + runtime_checkable, +) import numpy as np from zarr.core.common import NamedConfig from zarr.core.dtype.common import ( + DataTypeValidationError, HasEndianness, HasItemSize, HasLength, @@ -15,12 +26,11 @@ v3_unstable_dtype_warning, ) from zarr.core.dtype.npy.common import ( - EndiannessNumpy, check_json_str, - endianness_from_numpy_str, endianness_to_numpy_str, + get_endianness_from_numpy_dtype, ) -from zarr.core.dtype.wrapper import DTypeJSON_V2, DTypeJSON_V3, ZDType +from zarr.core.dtype.wrapper import DTypeJSON_V2, DTypeJSON_V3, TDType_co, ZDType if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat @@ -29,6 +39,11 @@ _NUMPY_SUPPORTS_VLEN_STRING = hasattr(np.dtypes, "StringDType") +@runtime_checkable +class SupportsStr(Protocol): + def __str__(self) -> str: ... 
+ + class LengthBytesConfig(TypedDict): length_bytes: int @@ -46,11 +61,15 @@ class FixedLengthUTF32( code_point_bytes: ClassVar[int] = 4 # utf32 is 4 bytes per code point @classmethod - def _from_native_dtype_unchecked(cls, dtype: TBaseDType) -> Self: - byte_order = cast("EndiannessNumpy", dtype.byteorder) - return cls( - length=dtype.itemsize // (cls.code_point_bytes), - endianness=endianness_from_numpy_str(byte_order), + def from_native_dtype(cls, dtype: TBaseDType) -> Self: + if cls._check_native_dtype(dtype): + endianness = get_endianness_from_numpy_dtype(dtype) + return cls( + length=dtype.itemsize // (cls.code_point_bytes), + endianness=endianness, + ) + raise DataTypeValidationError( + f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" ) def to_native_dtype(self) -> np.dtypes.StrDType[int]: @@ -58,9 +77,9 @@ def to_native_dtype(self) -> np.dtypes.StrDType[int]: return self.dtype_cls(self.length).newbyteorder(byte_order) @classmethod - def _check_json_v2(cls, data: JSON, object_codec_id: str | None = None) -> TypeGuard[str]: + def _check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: """ - Check that the input is a valid JSON representation of a numpy S dtype. + Check that the input is a valid JSON representation of a numpy U dtype. 
""" return isinstance(data, str) and re.match(r"^[><]U\d+$", data) is not None @@ -94,14 +113,20 @@ def to_json(self, zarr_format: ZarrFormat) -> str | FixedLengthUTF32JSONV3: raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod - def _from_json_unchecked( - cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat - ) -> Self: - if zarr_format == 2: - return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls(length=data["configuration"]["length_bytes"] // cls.code_point_bytes) # type: ignore[index, call-overload] - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self: + if cls._check_json_v2(data, object_codec_id=object_codec_id): + # Construct the numpy dtype instead of string parsing. + return cls.from_native_dtype(np.dtype(data)) + raise DataTypeValidationError( + f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a string representation of a numpy U dtype." + ) + + @classmethod + def from_json_v3(cls, data: JSON) -> Self: + if cls._check_json_v3(data): + return cls(length=data["configuration"]["length_bytes"] // cls.code_point_bytes) + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected {cls._zarr_v3_name}." 
+ raise DataTypeValidationError(msg) def default_scalar(self) -> np.str_: return np.str_("") @@ -118,18 +143,22 @@ def _check_scalar(self, data: object) -> TypeGuard[str | np.str_ | bytes | int]: # this is generous for backwards compatibility return isinstance(data, str | np.str_ | bytes | int) - def _cast_scalar_unchecked(self, data: object) -> np.str_: - # We explicitly truncate the result because of the following numpy behavior: - # >>> x = np.dtype('U3').type('hello world') - # >>> x - # np.str_('hello world') - # >>> x.dtype - # dtype('U11') - - if isinstance(data, int): - return self.to_native_dtype().type(str(data)[: self.length]) - else: - return self.to_native_dtype().type(data[: self.length]) # type: ignore[index] + def cast_scalar(self, data: object) -> np.str_: + if self._check_scalar(data): + # We explicitly truncate before casting because of the following numpy behavior: + # >>> x = np.dtype('U3').type('hello world') + # >>> x + # np.str_('hello world') + # >>> x.dtype + # dtype('U11') + + if isinstance(data, int): + return self.to_native_dtype().type(str(data)[: self.length]) + else: + return self.to_native_dtype().type(data[: self.length]) + raise TypeError( + f"Cannot convert object with type {type(data)} to a numpy unicode string scalar." + ) @property def item_size(self) -> int: @@ -145,144 +174,119 @@ def check_vlen_string_json_scalar(data: object) -> TypeGuard[int | str | float]: return isinstance(data, int | str | float) -if _NUMPY_SUPPORTS_VLEN_STRING: +# VariableLengthUTF8 is defined in two places, conditioned on the version of numpy. +# If numpy 2 is installed, then VariableLengthUTF8 is defined with the numpy variable length +# string dtype as the native dtype. Otherwise, VariableLengthUTF8 is defined with the numpy object +# dtype as the native dtype. +class UTF8Base(ZDType[TDType_co, str], HasObjectCodec): + """ + A base class for the variable length UTF-8 string data type. 
This class should not be used + as data type, but as a base class for other variable length string data types. + """ - @dataclass(frozen=True, kw_only=True) - class VariableLengthUTF8(ZDType[np.dtypes.StringDType, str], HasObjectCodec): # type: ignore[type-var] - dtype_cls = np.dtypes.StringDType - _zarr_v3_name: ClassVar[Literal["variable_length_utf8"]] = "variable_length_utf8" - object_codec_id = "vlen-utf8" + _zarr_v3_name: ClassVar[Literal["variable_length_utf8"]] = "variable_length_utf8" + object_codec_id: ClassVar[Literal["vlen-utf8"]] = "vlen-utf8" - @classmethod - def _from_native_dtype_unchecked(cls, dtype: TBaseDType) -> Self: + @classmethod + def from_native_dtype(cls, dtype: TBaseDType) -> Self: + if cls._check_native_dtype(dtype): return cls() + raise DataTypeValidationError( + f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" + ) - def to_native_dtype(self) -> np.dtypes.StringDType: - return self.dtype_cls() + @classmethod + def _check_json_v2( + cls, data: JSON, *, object_codec_id: str | None = None + ) -> TypeGuard[Literal["|O"]]: + """ + Check that the input is a valid JSON representation of a numpy O dtype, and that the + object codec id is appropriate for variable-length UTF-8 strings. + """ + return data == "|O" and object_codec_id == cls.object_codec_id + + @classmethod + def _check_json_v3(cls, data: JSON) -> TypeGuard[Literal["variable_length_utf8"]]: + return data == cls._zarr_v3_name + + @classmethod + def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self: + if cls._check_json_v2(data, object_codec_id=object_codec_id): + return cls() + msg = ( + f"Invalid JSON representation of {cls.__name__}. 
Got {data!r}, expected the string '|O'" + ) + raise DataTypeValidationError(msg) - @classmethod - def _check_json_v2( - cls, data: JSON, *, object_codec_id: str | None = None - ) -> TypeGuard[Literal["|O"]]: - """ - Check that the input is a valid JSON representation of a numpy O dtype, and that the - object codec id is appropriate for variable-length UTF-8 strings. - """ - return data == "|O" and object_codec_id == cls.object_codec_id - - @classmethod - def _check_json_v3(cls, data: JSON) -> TypeGuard[Literal["variable_length_utf8"]]: - return data == cls._zarr_v3_name - - @overload - def to_json(self, zarr_format: Literal[2]) -> Literal["|O"]: ... - @overload - def to_json(self, zarr_format: Literal[3]) -> Literal["variable_length_utf8"]: ... - - def to_json(self, zarr_format: ZarrFormat) -> Literal["|O", "variable_length_utf8"]: - if zarr_format == 2: - # Note: unlike many other numpy data types, we don't serialize the .str attribute - # of the data type to JSON. This is because Zarr was using `|O` for strings before the - # numpy variable length string data type existed, and we want to be consistent with - # that practice - return "|O" - elif zarr_format == 3: - v3_unstable_dtype_warning(self) - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def _from_json_unchecked( - cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat - ) -> Self: + @classmethod + def from_json_v3(cls, data: JSON) -> Self: + if cls._check_json_v3(data): return cls() + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected {cls._zarr_v3_name}." + raise DataTypeValidationError(msg) - def default_scalar(self) -> str: - return "" + @overload + def to_json(self, zarr_format: Literal[2]) -> Literal["|O"]: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> Literal["variable_length_utf8"]: ... 
- def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: - if self._check_scalar(data): - return data - raise TypeError(f"Invalid type: {data}. Expected a string.") + def to_json(self, zarr_format: ZarrFormat) -> Literal["|O", "variable_length_utf8"]: + if zarr_format == 2: + # Note: unlike many other numpy data types, we don't serialize the .str attribute + # of the data type to JSON. This is because Zarr was using `|O` for strings before the + # numpy variable length string data type existed, and we want to be consistent with + # that practice + return "|O" + elif zarr_format == 3: + v3_unstable_dtype_warning(self) + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> str: - if not check_vlen_string_json_scalar(data): - raise TypeError(f"Invalid type: {data}. Expected a string or number.") - return str(data) + @classmethod + def _from_json_unchecked( + cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat + ) -> Self: + return cls() - def _check_scalar(self, data: object) -> TypeGuard[str]: - return isinstance(data, str) + def default_scalar(self) -> str: + return "" - def _cast_scalar_unchecked(self, data: object) -> str: - return str(data) + def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: + if self._check_scalar(data): + return self._cast_scalar_unchecked(data) + raise TypeError(f"Invalid type: {data}. Expected a string.") -else: - # Numpy pre-2 does not have a variable length string dtype, so we use the Object dtype instead. 
- @dataclass(frozen=True, kw_only=True) - class VariableLengthUTF8(ZDType[np.dtypes.ObjectDType, str], HasObjectCodec): # type: ignore[no-redef] - dtype_cls = np.dtypes.ObjectDType - _zarr_v3_name: ClassVar[Literal["variable_length_utf8"]] = "variable_length_utf8" - object_codec_id = "vlen-utf8" + def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> str: + if not check_vlen_string_json_scalar(data): + raise TypeError(f"Invalid type: {data}. Expected a string or number.") + return str(data) - @classmethod - def _from_native_dtype_unchecked(cls, dtype: TBaseDType) -> Self: - return cls() + def _check_scalar(self, data: object) -> TypeGuard[SupportsStr]: + return isinstance(data, SupportsStr) - def to_native_dtype(self) -> np.dtypes.ObjectDType: - return self.dtype_cls() + def _cast_scalar_unchecked(self, data: SupportsStr) -> str: + return str(data) - @classmethod - def _check_json_v2( - cls, data: JSON, *, object_codec_id: str | None = None - ) -> TypeGuard[Literal["|O"]]: - """ - Check that the input is a valid JSON representation of a numpy O dtype, and that the - object codec id is appropriate for variable-length UTF-8 strings. - """ - return data == "|O" and object_codec_id == cls.object_codec_id - - @classmethod - def _check_json_v3(cls, data: JSON) -> TypeGuard[Literal["variable_length_utf8"]]: - return data == cls._zarr_v3_name - - @overload - def to_json(self, zarr_format: Literal[2]) -> Literal["|O"]: ... - - @overload - def to_json(self, zarr_format: Literal[3]) -> Literal["variable_length_utf8"]: ... 
- - def to_json(self, zarr_format: ZarrFormat) -> Literal["|O", "variable_length_utf8"]: - if zarr_format == 2: - return "|O" - elif zarr_format == 3: - v3_unstable_dtype_warning(self) - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def _from_json_unchecked( - cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat - ) -> Self: - return cls() + def cast_scalar(self, data: object) -> str: + if self._check_scalar(data): + return self._cast_scalar_unchecked(data) + raise TypeError(f"Cannot convert object with type {type(data)} to a python string.") - def default_scalar(self) -> str: - return "" - def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: - if self._check_scalar(data): - return data - raise TypeError(f"Invalid type: {data}. Expected a string.") +if _NUMPY_SUPPORTS_VLEN_STRING: + + @dataclass(frozen=True, kw_only=True) + class VariableLengthUTF8(UTF8Base[np.dtypes.StringDType]): # type: ignore[type-var] + dtype_cls = np.dtypes.StringDType - def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> str: - """ - Strings pass through - """ - if not check_json_str(data): - raise TypeError(f"Invalid type: {data}. Expected a string.") - return data + def to_native_dtype(self) -> np.dtypes.StringDType: + return self.dtype_cls() - def _check_scalar(self, data: object) -> TypeGuard[str]: - return isinstance(data, str) +else: + # Numpy pre-2 does not have a variable length string dtype, so we use the Object dtype instead. 
+ @dataclass(frozen=True, kw_only=True) + class VariableLengthUTF8(UTF8Base[np.dtypes.ObjectDType]): # type: ignore[no-redef] + dtype_cls = np.dtypes.ObjectDType - def _cast_scalar_unchecked(self, data: object) -> str: - return str(data) + def to_native_dtype(self) -> np.dtypes.ObjectDType: + return self.dtype_cls() diff --git a/src/zarr/core/dtype/npy/structured.py b/src/zarr/core/dtype/npy/structured.py index 579e0a9e27..b6196b7fed 100644 --- a/src/zarr/core/dtype/npy/structured.py +++ b/src/zarr/core/dtype/npy/structured.py @@ -17,6 +17,8 @@ ) from zarr.core.dtype.wrapper import DTypeJSON_V2, DTypeJSON_V3, TBaseDType, TBaseScalar, ZDType +StructuredScalarLike = list[object] | tuple[object, ...] | bytes | int + # TODO: tighten this up, get a v3 spec in place, handle endianness, etc. @dataclass(frozen=True, kw_only=True) @@ -25,19 +27,6 @@ class Structured(ZDType[np.dtypes.VoidDType[int], np.void], HasItemSize): _zarr_v3_name = "structured" fields: tuple[tuple[str, ZDType[TBaseDType, TBaseScalar]], ...] - def default_scalar(self) -> np.void: - return self._cast_scalar_unchecked(0) - - def _cast_scalar_unchecked(self, data: object) -> np.void: - na_dtype = self.to_native_dtype() - if isinstance(data, bytes): - res = np.frombuffer(data, dtype=na_dtype)[0] - elif isinstance(data, list | tuple): - res = np.array([tuple(data)], dtype=na_dtype)[0] - else: - res = np.array([data], dtype=na_dtype)[0] - return cast("np.void", res) - @classmethod def _check_native_dtype(cls, dtype: TBaseDType) -> TypeGuard[np.dtypes.VoidDType[int]]: """ @@ -53,24 +42,30 @@ def _check_native_dtype(cls, dtype: TBaseDType) -> TypeGuard[np.dtypes.VoidDType TypeGuard[np.dtypes.VoidDType] True if the dtype matches, False otherwise. 
""" - return super()._check_native_dtype(dtype) and dtype.fields is not None + return isinstance(dtype, cls.dtype_cls) and dtype.fields is not None # type: ignore[has-type] @classmethod - def _from_native_dtype_unchecked(cls, dtype: TBaseDType) -> Self: + def from_native_dtype(cls, dtype: TBaseDType) -> Self: from zarr.core.dtype import get_data_type_from_native_dtype fields: list[tuple[str, ZDType[TBaseDType, TBaseScalar]]] = [] + if cls._check_native_dtype(dtype): + # fields of a structured numpy dtype are either 2-tuples or 3-tuples. we only + # care about the first element in either case. + for key, (dtype_instance, *_) in dtype.fields.items(): # type: ignore[union-attr] + dtype_wrapped = get_data_type_from_native_dtype(dtype_instance) + fields.append((key, dtype_wrapped)) + + return cls(fields=tuple(fields)) + raise DataTypeValidationError( + f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" # type: ignore[has-type] + ) - if dtype.fields is None: - raise ValueError("numpy dtype has no fields") - - # fields of a structured numpy dtype are either 2-tuples or 3-tuples. we only - # care about the first element in either case. - for key, (dtype_instance, *_) in dtype.fields.items(): - dtype_wrapped = get_data_type_from_native_dtype(dtype_instance) - fields.append((key, dtype_wrapped)) - - return cls(fields=tuple(fields)) + def to_native_dtype(self) -> np.dtypes.VoidDType[int]: + return cast( + "np.dtypes.VoidDType[int]", + np.dtype([(key, dtype.to_native_dtype()) for (key, dtype) in self.fields]), + ) @overload def to_json(self, zarr_format: Literal[2]) -> DTypeJSON_V2: ... 
@@ -113,67 +108,78 @@ def _check_json_v3( ) -> TypeGuard[NamedConfig[Literal["structured"], dict[str, Sequence[tuple[str, JSON]]]]]: return ( isinstance(data, dict) - and "name" in data + and set(data.keys()) == {"name", "configuration"} and data["name"] == cls._zarr_v3_name - and "configuration" in data and isinstance(data["configuration"], dict) - and "fields" in data["configuration"] + and set(data["configuration"].keys()) == {"fields"} ) @classmethod - def _from_json_unchecked( - cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat - ) -> Self: - # avoid circular import issues by importing these functions here - from zarr.core.dtype import get_data_type_from_json_v2, get_data_type_from_json_v3 - - # This is a horrible mess, because this data type is recursive - if zarr_format == 2: - if cls._check_json_v2(data): # type: ignore[arg-type] - # structured dtypes are constructed directly from a list of lists - # note that we do not handle the object codec here! this will prevent structured - # dtypes from containing object dtypes. - return cls( - fields=tuple( # type: ignore[misc] - (f_name, get_data_type_from_json_v2(f_dtype, object_codec_id=None)) # type: ignore[has-type] - for f_name, f_dtype in data - ) + def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self: + # avoid circular import + from zarr.core.dtype import get_data_type_from_json_v2 + + if cls._check_json_v2(data): + # structured dtypes are constructed directly from a list of lists + # note that we do not handle the object codec here! this will prevent structured + # dtypes from containing object dtypes. 
+ return cls( + fields=tuple( # type: ignore[misc] + (f_name, get_data_type_from_json_v2(f_dtype, object_codec_id=None)) # type: ignore[has-type] + for f_name, f_dtype in data ) - else: - raise DataTypeValidationError(f"Invalid JSON representation of data type {cls}.") - elif zarr_format == 3: - if cls._check_json_v3(data): # type: ignore[arg-type] - config = data["configuration"] - meta_fields = config["fields"] - fields = tuple( + ) + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a JSON array of arrays" + raise DataTypeValidationError(msg) + + @classmethod + def from_json_v3(cls, data: JSON) -> Self: + # avoid circular import + from zarr.core.dtype import get_data_type_from_json_v3 + + if cls._check_json_v3(data): + config = data["configuration"] + meta_fields = config["fields"] + return cls( + fields=tuple( (f_name, get_data_type_from_json_v3(f_dtype)) for f_name, f_dtype in meta_fields ) - else: - raise DataTypeValidationError(f"Invalid JSON representation of data type {cls}.") - else: - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - return cls(fields=fields) - - def to_native_dtype(self) -> np.dtypes.VoidDType[int]: - return cast( - "np.dtypes.VoidDType[int]", - np.dtype([(key, dtype.to_native_dtype()) for (key, dtype) in self.fields]), - ) + ) + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a JSON object with the key {cls._zarr_v3_name!r}" + raise DataTypeValidationError(msg) def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: return bytes_to_json(self.cast_scalar(data).tobytes(), zarr_format) - def _check_scalar(self, data: object) -> bool: + def _check_scalar(self, data: object) -> TypeGuard[StructuredScalarLike]: # TODO: implement something here! 
return True + def default_scalar(self) -> np.void: + return self._cast_scalar_unchecked(0) + + def cast_scalar(self, data: object) -> np.void: + if self._check_scalar(data): + return self._cast_scalar_unchecked(data) + msg = f"Cannot convert object with type {type(data)} to a numpy structured scalar." + raise TypeError(msg) + + def _cast_scalar_unchecked(self, data: StructuredScalarLike) -> np.void: + na_dtype = self.to_native_dtype() + if isinstance(data, bytes): + res = np.frombuffer(data, dtype=na_dtype)[0] + elif isinstance(data, list | tuple): + res = np.array([tuple(data)], dtype=na_dtype)[0] + else: + res = np.array([data], dtype=na_dtype)[0] + return cast("np.void", res) + def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: if check_json_str(data): as_bytes = bytes_from_json(data, zarr_format=zarr_format) dtype = self.to_native_dtype() return cast("np.void", np.array([as_bytes]).view(dtype)[0]) - raise TypeError(f"Invalid type: {data}. Expected a string.") # pragma: no cover + raise TypeError(f"Invalid type: {data}. 
Expected a string.") @property def item_size(self) -> int: diff --git a/src/zarr/core/dtype/npy/time.py b/src/zarr/core/dtype/npy/time.py index 9f82d3d168..dd4f3840b1 100644 --- a/src/zarr/core/dtype/npy/time.py +++ b/src/zarr/core/dtype/npy/time.py @@ -1,6 +1,7 @@ from __future__ import annotations from dataclasses import dataclass +from datetime import datetime, timedelta from typing import ( TYPE_CHECKING, ClassVar, @@ -17,20 +18,21 @@ import numpy as np from zarr.core.common import NamedConfig -from zarr.core.dtype.common import HasEndianness, HasItemSize +from zarr.core.dtype.common import DataTypeValidationError, HasEndianness, HasItemSize from zarr.core.dtype.npy.common import ( DateTimeUnit, - EndiannessNumpy, check_json_int, - endianness_from_numpy_str, endianness_to_numpy_str, + get_endianness_from_numpy_dtype, ) -from zarr.core.dtype.wrapper import DTypeJSON_V2, DTypeJSON_V3, TBaseDType, ZDType +from zarr.core.dtype.wrapper import TBaseDType, ZDType if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat _DTypeName = Literal["datetime64", "timedelta64"] +TimeDeltaLike = str | int | bytes | np.timedelta64 | timedelta | None +DateTimeLike = str | int | bytes | np.datetime64 | datetime | None def datetime_from_int(data: int, *, unit: DateTimeUnit, scale_factor: int) -> np.datetime64: @@ -72,17 +74,27 @@ def datetimelike_to_int(data: np.datetime64 | np.timedelta64) -> int: return data.view(np.int64).item() -_BaseTimeDType_co = TypeVar( - "_BaseTimeDType_co", +def check_json_time(data: JSON) -> TypeGuard[Literal["NaT"] | int]: + """ + Type guard to check if the input JSON data is the literal string "NaT" + or an integer. 
+ """ + return check_json_int(data) or data == "NaT" + + +BaseTimeDType_co = TypeVar( + "BaseTimeDType_co", bound=np.dtypes.TimeDelta64DType | np.dtypes.DateTime64DType, covariant=True, ) -_BaseTimeScalar = TypeVar("_BaseTimeScalar", bound=np.timedelta64 | np.datetime64) +BaseTimeScalar_co = TypeVar( + "BaseTimeScalar_co", bound=np.timedelta64 | np.datetime64, covariant=True +) class TimeConfig(TypedDict): unit: DateTimeUnit - interval: int + scale_factor: int DateTime64JSONV3 = NamedConfig[Literal["numpy.datetime64"], TimeConfig] @@ -90,7 +102,7 @@ class TimeConfig(TypedDict): @dataclass(frozen=True, kw_only=True, slots=True) -class TimeDTypeBase(ZDType[_BaseTimeDType_co, _BaseTimeScalar], HasEndianness, HasItemSize): +class TimeDTypeBase(ZDType[BaseTimeDType_co, BaseTimeScalar_co], HasEndianness, HasItemSize): _zarr_v2_names: ClassVar[tuple[str, ...]] # this attribute exists so that we can programmatically create a numpy dtype instance # because the particular numpy dtype we are wrapping does not allow direct construction via @@ -108,33 +120,26 @@ def __post_init__(self) -> None: raise ValueError(f"unit must be one of {get_args(DateTimeUnit)}, got {self.unit!r}.") @classmethod - def _from_native_dtype_unchecked(cls, dtype: TBaseDType) -> Self: - unit, scale_factor = np.datetime_data(dtype.name) - unit = cast("DateTimeUnit", unit) - byteorder = cast("EndiannessNumpy", dtype.byteorder) - return cls( - unit=unit, scale_factor=scale_factor, endianness=endianness_from_numpy_str(byteorder) + def from_native_dtype(cls, dtype: TBaseDType) -> Self: + if cls._check_native_dtype(dtype): + unit, scale_factor = np.datetime_data(dtype.name) + unit = cast("DateTimeUnit", unit) + return cls( + unit=unit, + scale_factor=scale_factor, + endianness=get_endianness_from_numpy_dtype(dtype), + ) + raise DataTypeValidationError( + f"Invalid data type: {dtype}. 
Expected an instance of {cls.dtype_cls}" ) - def to_native_dtype(self) -> _BaseTimeDType_co: + def to_native_dtype(self) -> BaseTimeDType_co: # Numpy does not allow creating datetime64 or timedelta64 via # np.dtypes.{dtype_name}() # so we use np.dtype with a formatted string. dtype_string = f"{self._numpy_name}[{self.scale_factor}{self.unit}]" return np.dtype(dtype_string).newbyteorder(endianness_to_numpy_str(self.endianness)) # type: ignore[return-value] - @classmethod - def _from_json_unchecked( - cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat - ) -> Self: - if zarr_format == 2: - return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - unit = data["configuration"]["unit"] # type: ignore[index, call-overload] - scale_factor = data["configuration"]["scale_factor"] # type: ignore[index, call-overload] - return cls(unit=unit, scale_factor=scale_factor) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - @overload def to_json(self, zarr_format: Literal[2]) -> str: ... @overload @@ -156,14 +161,6 @@ def to_json(self, zarr_format: ZarrFormat) -> str | DateTime64JSONV3 | TimeDelta def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> int: return datetimelike_to_int(data) # type: ignore[arg-type] - def _check_scalar(self, data: object) -> bool: - # TODO: decide which values we should accept for datetimes. - try: - np.array([data], dtype=self.to_native_dtype()) - return True # noqa: TRY300 - except ValueError: - return False - @property def item_size(self) -> int: return 8 @@ -178,6 +175,8 @@ class TimeDelta64(TimeDTypeBase[np.dtypes.TimeDelta64DType, np.timedelta64], Has unit for ``TimeDelta64`` is optional. 
""" + # mypy infers the type of np.dtypes.TimeDelta64DType to be + # "Callable[[Literal['Y', 'M', 'W', 'D'] | Literal['h', 'm', 's', 'ms', 'us', 'ns', 'ps', 'fs', 'as']], Never]" dtype_cls = np.dtypes.TimeDelta64DType # type: ignore[assignment] _zarr_v3_name: ClassVar[Literal["numpy.timedelta64"]] = "numpy.timedelta64" _zarr_v2_names = (">m8", " np.timedelta64: return np.timedelta64("NaT") def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.timedelta64: - if check_json_int(data) or data == "NaT": - return self.to_native_dtype().type(data, f"{self.scale_factor}{self.unit}") # type: ignore[arg-type] + if check_json_time(data): + return self.to_native_dtype().type(data, f"{self.scale_factor}{self.unit}") raise TypeError(f"Invalid type: {data}. Expected an integer.") # pragma: no cover - def _cast_scalar_unchecked(self, data: object) -> np.timedelta64: - return self.to_native_dtype().type(data) # type: ignore[arg-type] + def _check_scalar(self, data: object) -> TypeGuard[TimeDeltaLike]: + if data is None: + return True + return isinstance(data, str | int | bytes | np.timedelta64 | timedelta) + + def _cast_scalar_unchecked(self, data: TimeDeltaLike) -> np.timedelta64: + return self.to_native_dtype().type(data, f"{self.scale_factor}{self.unit}") + + def cast_scalar(self, data: object) -> np.timedelta64: + if self._check_scalar(data): + return self._cast_scalar_unchecked(data) + msg = f"Cannot convert object with type {type(data)} to a numpy timedelta64 scalar." 
+ raise TypeError(msg) @classmethod def _check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: @@ -221,6 +231,30 @@ def _check_json_v3(cls, data: JSON) -> TypeGuard[DateTime64JSONV3]: and set(data["configuration"].keys()) == {"unit", "scale_factor"} ) + @classmethod + def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self: + if cls._check_json_v2(data): + return cls.from_native_dtype(np.dtype(data)) + msg = ( + f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a string " + f"representation of an instance of {cls.dtype_cls}" # type: ignore[has-type] + ) + raise DataTypeValidationError(msg) + + @classmethod + def from_json_v3(cls, data: JSON) -> Self: + if cls._check_json_v3(data): + unit = data["configuration"]["unit"] + scale_factor = data["configuration"]["scale_factor"] + return cls(unit=unit, scale_factor=scale_factor) + msg = ( + f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a dict " + f"with a 'name' key with the value 'numpy.timedelta64', " + "and a 'configuration' key with a value of a dict with a 'unit' key and a " + "'scale_factor' key" + ) + raise DataTypeValidationError(msg) + @dataclass(frozen=True, kw_only=True, slots=True) class DateTime64(TimeDTypeBase[np.dtypes.DateTime64DType, np.datetime64], HasEndianness): @@ -235,12 +269,23 @@ def default_scalar(self) -> np.datetime64: return np.datetime64("NaT") def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.datetime64: - if check_json_int(data) or data == "NaT": - return self.to_native_dtype().type(data, f"{self.scale_factor}{self.unit}") # type: ignore[arg-type] + if check_json_time(data): + return self._cast_scalar_unchecked(data) raise TypeError(f"Invalid type: {data}. 
Expected an integer.") # pragma: no cover - def _cast_scalar_unchecked(self, data: object) -> np.datetime64: - return self.to_native_dtype().type(data, f"{self.scale_factor}{self.unit}") # type: ignore[no-any-return, call-overload] + def _check_scalar(self, data: object) -> TypeGuard[DateTimeLike]: + if data is None: + return True + return isinstance(data, str | int | bytes | np.datetime64 | datetime) + + def _cast_scalar_unchecked(self, data: DateTimeLike) -> np.datetime64: + return self.to_native_dtype().type(data, f"{self.scale_factor}{self.unit}") + + def cast_scalar(self, data: object) -> np.datetime64: + if self._check_scalar(data): + return self._cast_scalar_unchecked(data) + msg = f"Cannot convert object with type {type(data)} to a numpy datetime scalar." + raise TypeError(msg) @classmethod def _check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: @@ -266,3 +311,27 @@ def _check_json_v3(cls, data: JSON) -> TypeGuard[DateTime64JSONV3]: and isinstance(data["configuration"], dict) and set(data["configuration"].keys()) == {"unit", "scale_factor"} ) + + @classmethod + def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self: + if cls._check_json_v2(data): + return cls.from_native_dtype(np.dtype(data)) + msg = ( + f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a string " + f"representation of an instance of {cls.dtype_cls}" # type: ignore[has-type] + ) + raise DataTypeValidationError(msg) + + @classmethod + def from_json_v3(cls, data: JSON) -> Self: + if cls._check_json_v3(data): + unit = data["configuration"]["unit"] + scale_factor = data["configuration"]["scale_factor"] + return cls(unit=unit, scale_factor=scale_factor) + msg = ( + f"Invalid JSON representation of {cls.__name__}. 
Got {data!r}, expected a dict " + f"with a 'name' key with the value 'numpy.datetime64', " + "and a 'configuration' key with a value of a dict with a 'unit' key and a " + "'scale_factor' key" + ) + raise DataTypeValidationError(msg) diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index 94fbe60242..557da87fcf 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -39,8 +39,6 @@ import numpy as np -from zarr.core.dtype.common import DataTypeValidationError - if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat @@ -101,12 +99,12 @@ def _check_native_dtype(cls: type[Self], dtype: TBaseDType) -> TypeGuard[TDType_ return type(dtype) is cls.dtype_cls @classmethod + @abstractmethod def from_native_dtype(cls: type[Self], dtype: TBaseDType) -> Self: """ Create a ZDType instance from a native data type. The default implementation first performs - a type check via ``cls._check_native_dtype``. If that type check succeeds, then - ``cls._from_native_dtype_unchecked`` is called, which assumes that the incoming object - as all the properties necessary for instantiating the ZDType. + a type check via ``cls._check_native_dtype``. If that type check succeeds, the ZDType class + instance is created. This method is used when taking a user-provided native data type, like a NumPy data type, and creating the corresponding ZDType instance from them. @@ -126,29 +124,6 @@ def from_native_dtype(cls: type[Self], dtype: TBaseDType) -> Self: TypeError If the native data type is not consistent with the wrapped data type. """ - if cls._check_native_dtype(dtype): - return cls._from_native_dtype_unchecked(dtype) - raise DataTypeValidationError( - f"Invalid dtype: {dtype}. Expected an instance of {cls.dtype_cls}." 
- ) - - @classmethod - @abstractmethod - def _from_native_dtype_unchecked(cls: type[Self], dtype: TBaseDType) -> Self: - """ - Create a ZDType instance from a native data type without performing any type checking of - that data type. - - Parameters - ---------- - dtype : TDType - The native dtype to wrap. - - Returns - ------- - Self - A ZDType that wraps the native dtype. - """ ... @abstractmethod @@ -163,6 +138,7 @@ def to_native_dtype(self: Self) -> TDType_co: """ ... + @abstractmethod def cast_scalar(self, data: object) -> TScalar_co: """ Cast a python object to the wrapped scalar type. @@ -179,15 +155,6 @@ def cast_scalar(self, data: object) -> TScalar_co: TScalar The cast value. """ - if self._check_scalar(data): - return self._cast_scalar_unchecked(data) - msg = ( - f"The value {data!r} failed a type check. " - f"It cannot be safely cast to a scalar compatible with {self}. " - f"Consult the documentation for {self} to determine the possible values that can " - "be cast to scalars of the wrapped data type." - ) - raise TypeError(msg) @abstractmethod def _check_scalar(self, data: object) -> bool: @@ -206,24 +173,6 @@ def _check_scalar(self, data: object) -> bool: """ ... - @abstractmethod - def _cast_scalar_unchecked(self, data: object) -> TScalar_co: - """ - Cast a python object to the wrapped data type. - This method should not perform any type checking. - - Parameters - ---------- - data : object - The python object to cast. - - Returns - ------- - TScalar - The cast value. - """ - ... - @abstractmethod def default_scalar(self) -> TScalar_co: """ @@ -309,6 +258,7 @@ def to_json(self, zarr_format: ZarrFormat) -> DTypeJSON_V2 | DTypeJSON_V3: ... @classmethod + @abstractmethod def from_json_v3(cls: type[Self], data: JSON) -> Self: """ Create an instance of this ZDType from Zarr V3 JSON data. @@ -323,48 +273,13 @@ def from_json_v3(cls: type[Self], data: JSON) -> Self: Self The wrapped data type. 
""" - if cls._check_json_v3(data): - return cls._from_json_unchecked(data, zarr_format=3) - raise DataTypeValidationError(f"Invalid JSON representation of data type {cls}: {data}") - - @classmethod - def from_json_v2(cls: type[Self], data: JSON, *, object_codec_id: str | None) -> Self: - """ - Create an instance of this ZDType from Zarr V2 JSON data. - - Parameters - ---------- - data : JSON - The JSON representation of the data type. - - Returns - ------- - Self - The wrapped data type. - """ - if cls._check_json_v2(data, object_codec_id=object_codec_id): - return cls._from_json_unchecked(data, zarr_format=2) - raise DataTypeValidationError( - f"Invalid JSON representation of data type {cls}: {data!r}, object_codec_id={object_codec_id!r}" - ) - - @classmethod - @overload - def _from_json_unchecked(cls, data: DTypeJSON_V2, *, zarr_format: Literal[2]) -> Self: ... - @classmethod - @overload - def _from_json_unchecked(cls, data: DTypeJSON_V3, *, zarr_format: Literal[3]) -> Self: ... + ... @classmethod @abstractmethod - def _from_json_unchecked( - cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat - ) -> Self: + def from_json_v2(cls: type[Self], data: JSON, *, object_codec_id: str | None = None) -> Self: """ - Create a ZDType instance from a JSON representation of a data type. - - This method should be called after input has been type checked, and so it should not perform - any input validation. + Create an instance of this ZDType from Zarr V2 JSON data. Parameters ---------- @@ -418,3 +333,18 @@ def from_json_scalar(self: Self, data: JSON, *, zarr_format: ZarrFormat) -> TSca The deserialized scalar value. """ ... + + +def scalar_failed_type_check_msg( + cls_instance: ZDType[TBaseDType, TBaseScalar], bad_scalar: object +) -> str: + """ + Generate an error message reporting that a particular value failed a type check when attempting + to cast that value to a scalar. + """ + return ( + f"The value {bad_scalar!r} failed a type check. 
" + f"It cannot be safely cast to a scalar compatible with {cls_instance}. " + f"Consult the documentation for {cls_instance} to determine the possible values that can " + "be cast to scalars of the wrapped data type." + ) diff --git a/tests/test_array.py b/tests/test_array.py index 862b49da61..28ea812967 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -7,7 +7,7 @@ import re import sys from itertools import accumulate -from typing import TYPE_CHECKING, Any, Literal, get_args +from typing import TYPE_CHECKING, Any, Literal from unittest import mock import numcodecs @@ -42,8 +42,8 @@ from zarr.core.chunk_key_encodings import ChunkKeyEncodingParams from zarr.core.common import JSON, MemoryOrder, ZarrFormat from zarr.core.dtype import get_data_type_from_native_dtype -from zarr.core.dtype.common import Endianness -from zarr.core.dtype.npy.common import endianness_from_numpy_str +from zarr.core.dtype.common import ENDIANNESS_STR, EndiannessStr +from zarr.core.dtype.npy.common import NUMPY_ENDIANNESS_STR, endianness_from_numpy_str from zarr.core.dtype.npy.float import Float32, Float64 from zarr.core.dtype.npy.int import Int16, UInt8 from zarr.core.dtype.npy.string import VariableLengthUTF8 @@ -1507,16 +1507,18 @@ async def test_name(store: Store, zarr_format: ZarrFormat, path: str | None) -> ) @staticmethod - @pytest.mark.parametrize("endianness", get_args(Endianness)) + @pytest.mark.parametrize("endianness", ENDIANNESS_STR) def test_default_endianness( - store: Store, zarr_format: ZarrFormat, endianness: Endianness + store: Store, zarr_format: ZarrFormat, endianness: EndiannessStr ) -> None: """ Test that that endianness is correctly set when creating an array when not specifying a serializer """ dtype = Int16(endianness=endianness) arr = zarr.create_array(store=store, shape=(1,), dtype=dtype, zarr_format=zarr_format) - assert endianness_from_numpy_str(arr[:].dtype.byteorder) == endianness # type: ignore[union-attr] + byte_order: str = 
arr[:].dtype.byteorder # type: ignore[union-attr] + assert byte_order in NUMPY_ENDIANNESS_STR + assert endianness_from_numpy_str(byte_order) == endianness # type: ignore[arg-type] @pytest.mark.parametrize("value", [1, 1.4, "a", b"a", np.array(1)]) diff --git a/tests/test_dtype/test_npy/test_common.py b/tests/test_dtype/test_npy/test_common.py index c4a82e22b0..d39d308112 100644 --- a/tests/test_dtype/test_npy/test_common.py +++ b/tests/test_dtype/test_npy/test_common.py @@ -9,9 +9,9 @@ import numpy as np import pytest -from zarr.core.dtype.common import Endianness, JSONFloatV2, SpecialFloatStrings +from zarr.core.dtype.common import ENDIANNESS_STR, JSONFloatV2, SpecialFloatStrings from zarr.core.dtype.npy.common import ( - EndiannessNumpy, + NumpyEndiannessStr, bytes_from_json, bytes_to_json, check_json_bool, @@ -67,10 +67,10 @@ def test_endianness_from_numpy_str(data: str, expected: str | None) -> None: Test that endianness_from_numpy_str correctly converts a numpy str literal to a human-readable literal value. This test also checks that an invalid string input raises a ``ValueError`` """ - if data in get_args(EndiannessNumpy): + if data in get_args(NumpyEndiannessStr): assert endianness_from_numpy_str(data) == expected # type: ignore[arg-type] else: - msg = f"Invalid endianness: {data!r}. Expected one of {get_args(EndiannessNumpy)}" + msg = f"Invalid endianness: {data!r}. Expected one of {get_args(NumpyEndiannessStr)}" with pytest.raises(ValueError, match=re.escape(msg)): endianness_from_numpy_str(data) # type: ignore[arg-type] @@ -84,10 +84,10 @@ def test_endianness_to_numpy_str(data: str | None, expected: str) -> None: Test that endianness_to_numpy_str correctly converts a human-readable literal value to a numpy str literal. 
This test also checks that an invalid string input raises a ``ValueError`` """ - if data in get_args(Endianness) + (None,): + if data in ENDIANNESS_STR: assert endianness_to_numpy_str(data) == expected # type: ignore[arg-type] else: - msg = f"Invalid endianness: {data!r}. Expected one of {get_args(Endianness)}" + msg = f"Invalid endianness: {data!r}. Expected one of {ENDIANNESS_STR}" with pytest.raises(ValueError, match=re.escape(msg)): endianness_to_numpy_str(data) # type: ignore[arg-type] From b2e56c8e9d342163f9d28b92188994a2985aa3c7 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sat, 14 Jun 2025 15:16:02 +0300 Subject: [PATCH 127/130] make method definition order consistent --- src/zarr/core/dtype/npy/bool.py | 20 +- src/zarr/core/dtype/npy/bytes.py | 108 +++++------ src/zarr/core/dtype/npy/complex.py | 46 ++--- src/zarr/core/dtype/npy/float.py | 58 +++--- src/zarr/core/dtype/npy/int.py | 264 +++++++++++++------------- src/zarr/core/dtype/npy/structured.py | 62 +++--- src/zarr/core/dtype/npy/time.py | 121 ++++++------ src/zarr/core/dtype/wrapper.py | 146 +++++++------- 8 files changed, 404 insertions(+), 421 deletions(-) diff --git a/src/zarr/core/dtype/npy/bool.py b/src/zarr/core/dtype/npy/bool.py index 2d045ce28a..7570dd1f4f 100644 --- a/src/zarr/core/dtype/npy/bool.py +++ b/src/zarr/core/dtype/npy/bool.py @@ -82,6 +82,16 @@ def to_json(self, zarr_format: ZarrFormat) -> Literal["|b1", "bool"]: return self._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + def _check_scalar(self, data: object) -> bool: + # Anything can become a bool + return True + + def cast_scalar(self, data: object) -> np.bool_: + if self._check_scalar(data): + return np.bool_(data) + msg = f"Cannot convert object with type {type(data)} to a numpy boolean." + raise TypeError(msg) + def default_scalar(self) -> np.bool_: """ Get the default value for the boolean dtype. 
@@ -131,16 +141,6 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bool_: return np.bool_(data) raise TypeError(f"Invalid type: {data}. Expected a boolean.") # pragma: no cover - def _check_scalar(self, data: object) -> bool: - # Anything can become a bool - return True - - def cast_scalar(self, data: object) -> np.bool_: - if self._check_scalar(data): - return np.bool_(data) - msg = f"Cannot convert object with type {type(data)} to a numpy boolean." - raise TypeError(msg) - @property def item_size(self) -> int: return 1 diff --git a/src/zarr/core/dtype/npy/bytes.py b/src/zarr/core/dtype/npy/bytes.py index d98114e9e1..9dc0bb1a68 100644 --- a/src/zarr/core/dtype/npy/bytes.py +++ b/src/zarr/core/dtype/npy/bytes.py @@ -14,7 +14,7 @@ v3_unstable_dtype_warning, ) from zarr.core.dtype.npy.common import check_json_str -from zarr.core.dtype.wrapper import DTypeJSON_V2, DTypeJSON_V3, TBaseDType, ZDType +from zarr.core.dtype.wrapper import TBaseDType, ZDType BytesLike = np.bytes_ | str | bytes | int @@ -92,30 +92,6 @@ def to_json(self, zarr_format: ZarrFormat) -> str | NullTerminatedBytesJSONV3: } raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - @classmethod - def _from_json_unchecked( - cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat - ) -> Self: - if zarr_format == 2: - return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls(length=data["configuration"]["length_bytes"]) # type: ignore[index, call-overload] - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - def default_scalar(self) -> np.bytes_: - return np.bytes_(b"") - - def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: - as_bytes = self.cast_scalar(data) - return base64.standard_b64encode(as_bytes).decode("ascii") - - def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: - if 
check_json_str(data): - return self.to_native_dtype().type(base64.standard_b64decode(data.encode("ascii"))) - raise TypeError( - f"Invalid type: {data}. Expected a base64-encoded string." - ) # pragma: no cover - def _check_scalar(self, data: object) -> TypeGuard[BytesLike]: # this is generous for backwards compatibility return isinstance(data, BytesLike) @@ -139,6 +115,20 @@ def cast_scalar(self, data: object) -> np.bytes_: msg = f"Cannot convert object with type {type(data)} to a numpy bytes scalar." raise TypeError(msg) + def default_scalar(self) -> np.bytes_: + return np.bytes_(b"") + + def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: + as_bytes = self.cast_scalar(data) + return base64.standard_b64encode(as_bytes).decode("ascii") + + def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: + if check_json_str(data): + return self.to_native_dtype().type(base64.standard_b64decode(data.encode("ascii"))) + raise TypeError( + f"Invalid type: {data}. Expected a base64-encoded string." + ) # pragma: no cover + @property def item_size(self) -> int: return self.length @@ -152,6 +142,29 @@ class RawBytes(ZDType[np.dtypes.VoidDType[int], np.void], HasLength, HasItemSize dtype_cls = np.dtypes.VoidDType # type: ignore[assignment] _zarr_v3_name: ClassVar[Literal["raw_bytes"]] = "raw_bytes" + @classmethod + def _check_native_dtype( + cls: type[Self], dtype: TBaseDType + ) -> TypeGuard[np.dtypes.VoidDType[Any]]: + """ + Numpy void dtype comes in two forms: + * If the ``fields`` attribute is ``None``, then the dtype represents N raw bytes. + * If the ``fields`` attribute is not ``None``, then the dtype represents a structured dtype, + + In this check we ensure that ``fields`` is ``None``. + + Parameters + ---------- + dtype : TDType + The dtype to check. + + Returns + ------- + Bool + True if the dtype matches, False otherwise. 
+ """ + return cls.dtype_cls is type(dtype) and dtype.fields is None # type: ignore[has-type] + @classmethod def from_native_dtype(cls, dtype: TBaseDType) -> Self: if cls._check_native_dtype(dtype): @@ -208,40 +221,6 @@ def to_json(self, zarr_format: ZarrFormat) -> str | RawBytesJSONV3: return {"name": self._zarr_v3_name, "configuration": {"length_bytes": self.length}} raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - @classmethod - def _check_native_dtype( - cls: type[Self], dtype: TBaseDType - ) -> TypeGuard[np.dtypes.VoidDType[Any]]: - """ - Numpy void dtype comes in two forms: - * If the ``fields`` attribute is ``None``, then the dtype represents N raw bytes. - * If the ``fields`` attribute is not ``None``, then the dtype represents a structured dtype, - - In this check we ensure that ``fields`` is ``None``. - - Parameters - ---------- - dtype : TDType - The dtype to check. - - Returns - ------- - Bool - True if the dtype matches, False otherwise. - """ - return cls.dtype_cls is type(dtype) and dtype.fields is None # type: ignore[has-type] - - def default_scalar(self) -> np.void: - return self.to_native_dtype().type(("\x00" * self.length).encode("ascii")) - - def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: - return base64.standard_b64encode(self.cast_scalar(data).tobytes()).decode("ascii") - - def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: - if check_json_str(data): - return self.to_native_dtype().type(base64.standard_b64decode(data)) - raise TypeError(f"Invalid type: {data}. Expected a string.") # pragma: no cover - def _check_scalar(self, data: object) -> bool: return isinstance(data, np.bytes_ | str | bytes | np.void) @@ -258,6 +237,17 @@ def cast_scalar(self, data: object) -> np.void: msg = f"Cannot convert object with type {type(data)} to a numpy void scalar." 
raise TypeError(msg) + def default_scalar(self) -> np.void: + return self.to_native_dtype().type(("\x00" * self.length).encode("ascii")) + + def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: + return base64.standard_b64encode(self.cast_scalar(data).tobytes()).decode("ascii") + + def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: + if check_json_str(data): + return self.to_native_dtype().type(base64.standard_b64decode(data)) + raise TypeError(f"Invalid type: {data}. Expected a string.") # pragma: no cover + @property def item_size(self) -> int: return self.length diff --git a/src/zarr/core/dtype/npy/complex.py b/src/zarr/core/dtype/npy/complex.py index 2df60f930b..bc06af00b5 100644 --- a/src/zarr/core/dtype/npy/complex.py +++ b/src/zarr/core/dtype/npy/complex.py @@ -43,26 +43,6 @@ def to_native_dtype(self) -> TComplexDType_co: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) # type: ignore[return-value] - def to_json(self, zarr_format: ZarrFormat) -> str: - """ - Convert the wrapped data type to a JSON-serializable form. - - Parameters - ---------- - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - str - The JSON-serializable representation of the wrapped data type - """ - if zarr_format == 2: - return self.to_native_dtype().str - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - @classmethod def _check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: """ @@ -90,18 +70,38 @@ def from_json_v3(cls, data: JSON) -> Self: msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected {cls._zarr_v3_name}." raise DataTypeValidationError(msg) + def to_json(self, zarr_format: ZarrFormat) -> str: + """ + Convert the wrapped data type to a JSON-serializable form. 
+ + Parameters + ---------- + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + str + The JSON-serializable representation of the wrapped data type + """ + if zarr_format == 2: + return self.to_native_dtype().str + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + def _check_scalar(self, data: object) -> TypeGuard[ComplexLike]: return isinstance(data, ComplexLike) + def _cast_scalar_unchecked(self, data: ComplexLike) -> TComplexScalar_co: + return self.to_native_dtype().type(data) # type: ignore[return-value] + def cast_scalar(self, data: object) -> TComplexScalar_co: if self._check_scalar(data): return self._cast_scalar_unchecked(data) msg = f"Cannot convert object with type {type(data)} to a numpy float scalar." raise TypeError(msg) - def _cast_scalar_unchecked(self, data: ComplexLike) -> TComplexScalar_co: - return self.to_native_dtype().type(data) # type: ignore[return-value] - def default_scalar(self) -> TComplexScalar_co: """ Get the default value, which is 0 cast to this dtype diff --git a/src/zarr/core/dtype/npy/float.py b/src/zarr/core/dtype/npy/float.py index 60a05326d5..9271886c20 100644 --- a/src/zarr/core/dtype/npy/float.py +++ b/src/zarr/core/dtype/npy/float.py @@ -23,7 +23,7 @@ float_to_json_v3, get_endianness_from_numpy_dtype, ) -from zarr.core.dtype.wrapper import DTypeJSON_V2, DTypeJSON_V3, TBaseDType, ZDType +from zarr.core.dtype.wrapper import TBaseDType, ZDType @dataclass(frozen=True) @@ -43,36 +43,6 @@ def to_native_dtype(self) -> TFloatDType_co: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) # type: ignore[return-value] - def to_json(self, zarr_format: ZarrFormat) -> str: - """ - Convert the wrapped data type to a JSON-serializable form. - - Parameters - ---------- - zarr_format : ZarrFormat - The zarr format version. 
- - Returns - ------- - str - The JSON-serializable representation of the wrapped data type - """ - if zarr_format == 2: - return self.to_native_dtype().str - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def _from_json_unchecked( - cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat - ) -> Self: - if zarr_format == 2: - return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - @classmethod def _check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: """ @@ -100,18 +70,38 @@ def from_json_v3(cls, data: JSON) -> Self: msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected {cls._zarr_v3_name}." raise DataTypeValidationError(msg) + def to_json(self, zarr_format: ZarrFormat) -> str: + """ + Convert the wrapped data type to a JSON-serializable form. + + Parameters + ---------- + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + str + The JSON-serializable representation of the wrapped data type + """ + if zarr_format == 2: + return self.to_native_dtype().str + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + def _check_scalar(self, data: object) -> TypeGuard[FloatLike]: return isinstance(data, FloatLike) + def _cast_scalar_unchecked(self, data: FloatLike) -> TFloatScalar_co: + return self.to_native_dtype().type(data) # type: ignore[return-value] + def cast_scalar(self, data: object) -> TFloatScalar_co: if self._check_scalar(data): return self._cast_scalar_unchecked(data) msg = f"Cannot convert object with type {type(data)} to a numpy float scalar." 
raise ScalarTypeValidationError(msg) - def _cast_scalar_unchecked(self, data: FloatLike) -> TFloatScalar_co: - return self.to_native_dtype().type(data) # type: ignore[return-value] - def default_scalar(self) -> TFloatScalar_co: """ Get the default value, which is 0 cast to this dtype diff --git a/src/zarr/core/dtype/npy/int.py b/src/zarr/core/dtype/npy/int.py index 804e9e359a..cddcb26c5e 100644 --- a/src/zarr/core/dtype/npy/int.py +++ b/src/zarr/core/dtype/npy/int.py @@ -143,6 +143,23 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" ) + def to_native_dtype(self: Self) -> np.dtypes.Int8DType: + return self.dtype_cls() + + @classmethod + def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self: + if cls._check_json_v2(data, object_codec_id=object_codec_id): + return cls() + msg = f"Invalid JSON representation of Int8. Got {data!r}, expected the string {cls._zarr_v2_names[0]!r}" + raise DataTypeValidationError(msg) + + @classmethod + def from_json_v3(cls, data: JSON) -> Self: + if cls._check_json_v3(data): + return cls() + msg = f"Invalid JSON representation of Int8. Got {data!r}, expected the string {cls._zarr_v3_name!r}" + raise DataTypeValidationError(msg) + @overload def to_json(self, zarr_format: Literal[2]) -> Literal["|i1"]: ... @@ -169,23 +186,6 @@ def to_json(self, zarr_format: ZarrFormat) -> Literal["int8", "|i1"]: return self._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def to_native_dtype(self: Self) -> np.dtypes.Int8DType: - return self.dtype_cls() - - @classmethod - def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self: - if cls._check_json_v2(data, object_codec_id=object_codec_id): - return cls() - msg = f"Invalid JSON representation of Int8. 
Got {data!r}, expected the string {cls._zarr_v2_names[0]!r}" - raise DataTypeValidationError(msg) - - @classmethod - def from_json_v3(cls, data: JSON) -> Self: - if cls._check_json_v3(data): - return cls() - msg = f"Invalid JSON representation of Int8. Got {data!r}, expected the string {cls._zarr_v3_name!r}" - raise DataTypeValidationError(msg) - @property def item_size(self) -> int: return 1 @@ -208,6 +208,23 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" ) + def to_native_dtype(self: Self) -> np.dtypes.UInt8DType: + return self.dtype_cls() + + @classmethod + def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self: + if cls._check_json_v2(data, object_codec_id=object_codec_id): + return cls() + msg = f"Invalid JSON representation of UInt8. Got {data!r}, expected the string {cls._zarr_v2_names[0]!r}" + raise DataTypeValidationError(msg) + + @classmethod + def from_json_v3(cls, data: JSON) -> Self: + if cls._check_json_v3(data): + return cls() + msg = f"Invalid JSON representation of UInt8. Got {data!r}, expected the string {cls._zarr_v3_name!r}" + raise DataTypeValidationError(msg) + @overload def to_json(self, zarr_format: Literal[2]) -> Literal["|u1"]: ... 
@@ -234,34 +251,45 @@ def to_json(self, zarr_format: ZarrFormat) -> Literal["uint8", "|u1"]: return self._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def to_native_dtype(self: Self) -> np.dtypes.UInt8DType: - return self.dtype_cls() + @property + def item_size(self) -> int: + return 1 + + +@dataclass(frozen=True, kw_only=True) +class Int16(BaseInt[np.dtypes.Int16DType, np.int16], HasEndianness): + dtype_cls = np.dtypes.Int16DType + _zarr_v3_name: ClassVar[Literal["int16"]] = "int16" + _zarr_v2_names: ClassVar[tuple[Literal[">i2"], Literal["i2", " Self: + if cls._check_native_dtype(dtype): + return cls(endianness=get_endianness_from_numpy_dtype(dtype)) + raise DataTypeValidationError( + f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" + ) + + def to_native_dtype(self) -> np.dtypes.Int16DType: + byte_order = endianness_to_numpy_str(self.endianness) + return self.dtype_cls().newbyteorder(byte_order) @classmethod def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self: if cls._check_json_v2(data, object_codec_id=object_codec_id): - return cls() - msg = f"Invalid JSON representation of UInt8. Got {data!r}, expected the string {cls._zarr_v2_names[0]!r}" + # Going via numpy ensures that we get the endianness correct without + # annoying string parsing. + return cls.from_native_dtype(np.dtype(data)) + msg = f"Invalid JSON representation of Int16. Got {data!r}, expected one of the strings {cls._zarr_v2_names!r}." raise DataTypeValidationError(msg) @classmethod def from_json_v3(cls, data: JSON) -> Self: if cls._check_json_v3(data): return cls() - msg = f"Invalid JSON representation of UInt8. Got {data!r}, expected the string {cls._zarr_v3_name!r}" + msg = f"Invalid JSON representation of Int16. 
Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @property - def item_size(self) -> int: - return 1 - - -@dataclass(frozen=True, kw_only=True) -class Int16(BaseInt[np.dtypes.Int16DType, np.int16], HasEndianness): - dtype_cls = np.dtypes.Int16DType - _zarr_v3_name: ClassVar[Literal["int16"]] = "int16" - _zarr_v2_names: ClassVar[tuple[Literal[">i2"], Literal["i2", " Literal[">i2", " Literal["int16", ">i2", " int: + return 2 + + +@dataclass(frozen=True, kw_only=True) +class UInt16(BaseInt[np.dtypes.UInt16DType, np.uint16], HasEndianness): + dtype_cls = np.dtypes.UInt16DType + _zarr_v3_name: ClassVar[Literal["uint16"]] = "uint16" + _zarr_v2_names: ClassVar[tuple[Literal[">u2"], Literal["u2", " Self: if cls._check_native_dtype(dtype): @@ -296,7 +335,7 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" ) - def to_native_dtype(self) -> np.dtypes.Int16DType: + def to_native_dtype(self) -> np.dtypes.UInt16DType: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) @@ -306,27 +345,16 @@ def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self # Going via numpy ensures that we get the endianness correct without # annoying string parsing. return cls.from_native_dtype(np.dtype(data)) - msg = f"Invalid JSON representation of Int16. Got {data!r}, expected one of the strings {cls._zarr_v2_names!r}." + msg = f"Invalid JSON representation of UInt16. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." raise DataTypeValidationError(msg) @classmethod def from_json_v3(cls, data: JSON) -> Self: if cls._check_json_v3(data): return cls() - msg = f"Invalid JSON representation of Int16. Got {data!r}, expected the string {cls._zarr_v3_name!r}" + msg = f"Invalid JSON representation of UInt16. 
Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @property - def item_size(self) -> int: - return 2 - - -@dataclass(frozen=True, kw_only=True) -class UInt16(BaseInt[np.dtypes.UInt16DType, np.uint16], HasEndianness): - dtype_cls = np.dtypes.UInt16DType - _zarr_v3_name: ClassVar[Literal["uint16"]] = "uint16" - _zarr_v2_names: ClassVar[tuple[Literal[">u2"], Literal["u2", " Literal[">u2", " Literal["uint16", ">u2", " int: + return 2 + + +@dataclass(frozen=True, kw_only=True) +class Int32(BaseInt[np.dtypes.Int32DType, np.int32], HasEndianness): + dtype_cls = np.dtypes.Int32DType + _zarr_v3_name: ClassVar[Literal["int32"]] = "int32" + _zarr_v2_names: ClassVar[tuple[Literal[">i4"], Literal["i4", " Self: + def from_native_dtype(cls: type[Self], dtype: TBaseDType) -> Self: if cls._check_native_dtype(dtype): return cls(endianness=get_endianness_from_numpy_dtype(dtype)) raise DataTypeValidationError( f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" ) - def to_native_dtype(self) -> np.dtypes.UInt16DType: + def to_native_dtype(self) -> np.dtypes.Int32DType: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) @@ -371,27 +410,16 @@ def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self # Going via numpy ensures that we get the endianness correct without # annoying string parsing. return cls.from_native_dtype(np.dtype(data)) - msg = f"Invalid JSON representation of UInt16. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." raise DataTypeValidationError(msg) @classmethod def from_json_v3(cls, data: JSON) -> Self: if cls._check_json_v3(data): return cls() - msg = f"Invalid JSON representation of UInt16. 
Got {data!r}, expected the string {cls._zarr_v3_name!r}" + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @property - def item_size(self) -> int: - return 2 - - -@dataclass(frozen=True, kw_only=True) -class Int32(BaseInt[np.dtypes.Int32DType, np.int32], HasEndianness): - dtype_cls = np.dtypes.Int32DType - _zarr_v3_name: ClassVar[Literal["int32"]] = "int32" - _zarr_v2_names: ClassVar[tuple[Literal[">i4"], Literal["i4", " Literal[">i4", " Literal["int32", ">i4", " int: + return 4 + + +@dataclass(frozen=True, kw_only=True) +class UInt32(BaseInt[np.dtypes.UInt32DType, np.uint32], HasEndianness): + dtype_cls = np.dtypes.UInt32DType + _zarr_v3_name: ClassVar[Literal["uint32"]] = "uint32" + _zarr_v2_names: ClassVar[tuple[Literal[">u4"], Literal["u4", " Self: + def from_native_dtype(cls, dtype: TBaseDType) -> Self: if cls._check_native_dtype(dtype): return cls(endianness=get_endianness_from_numpy_dtype(dtype)) raise DataTypeValidationError( f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" ) - def to_native_dtype(self) -> np.dtypes.Int32DType: + def to_native_dtype(self) -> np.dtypes.UInt32DType: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) @@ -436,27 +475,16 @@ def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self # Going via numpy ensures that we get the endianness correct without # annoying string parsing. return cls.from_native_dtype(np.dtype(data)) - msg = f"Invalid JSON representation of Int32. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." raise DataTypeValidationError(msg) @classmethod def from_json_v3(cls, data: JSON) -> Self: if cls._check_json_v3(data): return cls() - msg = f"Invalid JSON representation of Int32. 
Got {data!r}, expected the string {cls._zarr_v3_name!r}" + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @property - def item_size(self) -> int: - return 4 - - -@dataclass(frozen=True, kw_only=True) -class UInt32(BaseInt[np.dtypes.UInt32DType, np.uint32], HasEndianness): - dtype_cls = np.dtypes.UInt32DType - _zarr_v3_name: ClassVar[Literal["uint32"]] = "uint32" - _zarr_v2_names: ClassVar[tuple[Literal[">u4"], Literal["u4", " Literal[">u4", " Literal["uint32", ">u4", " int: + return 4 + + +@dataclass(frozen=True, kw_only=True) +class Int64(BaseInt[np.dtypes.Int64DType, np.int64], HasEndianness): + dtype_cls = np.dtypes.Int64DType + _zarr_v3_name: ClassVar[Literal["int64"]] = "int64" + _zarr_v2_names: ClassVar[tuple[Literal[">i8"], Literal["i8", " Self: if cls._check_native_dtype(dtype): @@ -489,7 +528,7 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" ) - def to_native_dtype(self) -> np.dtypes.UInt32DType: + def to_native_dtype(self) -> np.dtypes.Int64DType: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) @@ -499,27 +538,16 @@ def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self # Going via numpy ensures that we get the endianness correct without # annoying string parsing. return cls.from_native_dtype(np.dtype(data)) - msg = f"Invalid JSON representation of UInt32. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." raise DataTypeValidationError(msg) @classmethod def from_json_v3(cls, data: JSON) -> Self: if cls._check_json_v3(data): return cls() - msg = f"Invalid JSON representation of UInt32. 
Got {data!r}, expected the string {cls._zarr_v3_name!r}" + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @property - def item_size(self) -> int: - return 4 - - -@dataclass(frozen=True, kw_only=True) -class Int64(BaseInt[np.dtypes.Int64DType, np.int64], HasEndianness): - dtype_cls = np.dtypes.Int64DType - _zarr_v3_name: ClassVar[Literal["int64"]] = "int64" - _zarr_v2_names: ClassVar[tuple[Literal[">i8"], Literal["i8", " Literal[">i8", " Literal["int64", ">i8", " Self: - if cls._check_native_dtype(dtype): - return cls(endianness=get_endianness_from_numpy_dtype(dtype)) - raise DataTypeValidationError( - f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" - ) + @property + def item_size(self) -> int: + return 8 - def to_native_dtype(self) -> np.dtypes.Int64DType: + +@dataclass(frozen=True, kw_only=True) +class UInt64(BaseInt[np.dtypes.UInt64DType, np.uint64], HasEndianness): + dtype_cls = np.dtypes.UInt64DType + _zarr_v3_name: ClassVar[Literal["uint64"]] = "uint64" + _zarr_v2_names: ClassVar[tuple[Literal[">u8"], Literal["u8", " np.dtypes.UInt64DType: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) @@ -562,27 +593,16 @@ def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self # Going via numpy ensures that we get the endianness correct without # annoying string parsing. return cls.from_native_dtype(np.dtype(data)) - msg = f"Invalid JSON representation of Int64. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." raise DataTypeValidationError(msg) @classmethod def from_json_v3(cls, data: JSON) -> Self: if cls._check_json_v3(data): return cls() - msg = f"Invalid JSON representation of Int64. 
Got {data!r}, expected the string {cls._zarr_v3_name!r}" + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @property - def item_size(self) -> int: - return 8 - - -@dataclass(frozen=True, kw_only=True) -class UInt64(BaseInt[np.dtypes.UInt64DType, np.uint64], HasEndianness): - dtype_cls = np.dtypes.UInt64DType - _zarr_v3_name: ClassVar[Literal["uint64"]] = "uint64" - _zarr_v2_names: ClassVar[tuple[Literal[">u8"], Literal["u8", " Literal[">u8", " Self: f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" ) - def to_native_dtype(self) -> np.dtypes.UInt64DType: - byte_order = endianness_to_numpy_str(self.endianness) - return self.dtype_cls().newbyteorder(byte_order) - - @classmethod - def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self: - if cls._check_json_v2(data, object_codec_id=object_codec_id): - # Going via numpy ensures that we get the endianness correct without - # annoying string parsing. - return cls.from_native_dtype(np.dtype(data)) - msg = f"Invalid JSON representation of UInt64. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." - raise DataTypeValidationError(msg) - - @classmethod - def from_json_v3(cls, data: JSON) -> Self: - if cls._check_json_v3(data): - return cls() - msg = f"Invalid JSON representation of UInt64. Got {data!r}, expected the string {cls._zarr_v3_name!r}" - raise DataTypeValidationError(msg) - @property def item_size(self) -> int: return 8 diff --git a/src/zarr/core/dtype/npy/structured.py b/src/zarr/core/dtype/npy/structured.py index b6196b7fed..190647c1e1 100644 --- a/src/zarr/core/dtype/npy/structured.py +++ b/src/zarr/core/dtype/npy/structured.py @@ -67,25 +67,6 @@ def to_native_dtype(self) -> np.dtypes.VoidDType[int]: np.dtype([(key, dtype.to_native_dtype()) for (key, dtype) in self.fields]), ) - @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeJSON_V2: ... 
- - @overload - def to_json(self, zarr_format: Literal[3]) -> DTypeJSON_V3: ... - - def to_json(self, zarr_format: ZarrFormat) -> DTypeJSON_V3 | DTypeJSON_V2: - fields = [ - (f_name, f_dtype.to_json(zarr_format=zarr_format)) for f_name, f_dtype in self.fields - ] - if zarr_format == 2: - return fields - elif zarr_format == 3: - v3_unstable_dtype_warning(self) - base_dict = {"name": self._zarr_v3_name} - base_dict["configuration"] = {"fields": fields} # type: ignore[assignment] - return cast("DTypeJSON_V3", base_dict) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - @classmethod def _check_json_v2( cls, data: JSON, *, object_codec_id: str | None = None @@ -148,21 +129,28 @@ def from_json_v3(cls, data: JSON) -> Self: msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a JSON object with the key {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: - return bytes_to_json(self.cast_scalar(data).tobytes(), zarr_format) + @overload + def to_json(self, zarr_format: Literal[2]) -> DTypeJSON_V2: ... - def _check_scalar(self, data: object) -> TypeGuard[StructuredScalarLike]: - # TODO: implement something here! - return True + @overload + def to_json(self, zarr_format: Literal[3]) -> DTypeJSON_V3: ... 
- def default_scalar(self) -> np.void: - return self._cast_scalar_unchecked(0) + def to_json(self, zarr_format: ZarrFormat) -> DTypeJSON_V3 | DTypeJSON_V2: + fields = [ + (f_name, f_dtype.to_json(zarr_format=zarr_format)) for f_name, f_dtype in self.fields + ] + if zarr_format == 2: + return fields + elif zarr_format == 3: + v3_unstable_dtype_warning(self) + base_dict = {"name": self._zarr_v3_name} + base_dict["configuration"] = {"fields": fields} # type: ignore[assignment] + return cast("DTypeJSON_V3", base_dict) + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def cast_scalar(self, data: object) -> np.void: - if self._check_scalar(data): - return self._cast_scalar_unchecked(data) - msg = f"Cannot convert object with type {type(data)} to a numpy structured scalar." - raise TypeError(msg) + def _check_scalar(self, data: object) -> TypeGuard[StructuredScalarLike]: + # TODO: implement something more precise here! + return isinstance(data, (bytes, list, tuple, int)) def _cast_scalar_unchecked(self, data: StructuredScalarLike) -> np.void: na_dtype = self.to_native_dtype() @@ -174,6 +162,15 @@ def _cast_scalar_unchecked(self, data: StructuredScalarLike) -> np.void: res = np.array([data], dtype=na_dtype)[0] return cast("np.void", res) + def cast_scalar(self, data: object) -> np.void: + if self._check_scalar(data): + return self._cast_scalar_unchecked(data) + msg = f"Cannot convert object with type {type(data)} to a numpy structured scalar." + raise TypeError(msg) + + def default_scalar(self) -> np.void: + return self._cast_scalar_unchecked(0) + def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: if check_json_str(data): as_bytes = bytes_from_json(data, zarr_format=zarr_format) @@ -181,6 +178,9 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: return cast("np.void", np.array([as_bytes]).view(dtype)[0]) raise TypeError(f"Invalid type: {data}. 
Expected a string.") + def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: + return bytes_to_json(self.cast_scalar(data).tobytes(), zarr_format) + @property def item_size(self) -> int: # Lets have numpy do the arithmetic here diff --git a/src/zarr/core/dtype/npy/time.py b/src/zarr/core/dtype/npy/time.py index dd4f3840b1..a99703dd3e 100644 --- a/src/zarr/core/dtype/npy/time.py +++ b/src/zarr/core/dtype/npy/time.py @@ -184,28 +184,6 @@ class TimeDelta64(TimeDTypeBase[np.dtypes.TimeDelta64DType, np.timedelta64], Has scale_factor: int = 1 unit: DateTimeUnit = "generic" - def default_scalar(self) -> np.timedelta64: - return np.timedelta64("NaT") - - def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.timedelta64: - if check_json_time(data): - return self.to_native_dtype().type(data, f"{self.scale_factor}{self.unit}") - raise TypeError(f"Invalid type: {data}. Expected an integer.") # pragma: no cover - - def _check_scalar(self, data: object) -> TypeGuard[TimeDeltaLike]: - if data is None: - return True - return isinstance(data, str | int | bytes | np.timedelta64 | timedelta) - - def _cast_scalar_unchecked(self, data: TimeDeltaLike) -> np.timedelta64: - return self.to_native_dtype().type(data, f"{self.scale_factor}{self.unit}") - - def cast_scalar(self, data: object) -> np.timedelta64: - if self._check_scalar(data): - return self._cast_scalar_unchecked(data) - msg = f"Cannot convert object with type {type(data)} to a numpy timedelta64 scalar." 
- raise TypeError(msg) - @classmethod def _check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: # match m[M], etc @@ -255,52 +233,37 @@ def from_json_v3(cls, data: JSON) -> Self: ) raise DataTypeValidationError(msg) - -@dataclass(frozen=True, kw_only=True, slots=True) -class DateTime64(TimeDTypeBase[np.dtypes.DateTime64DType, np.datetime64], HasEndianness): - dtype_cls = np.dtypes.DateTime64DType # type: ignore[assignment] - _zarr_v3_name: ClassVar[Literal["numpy.datetime64"]] = "numpy.datetime64" - _zarr_v2_names = (">M8", " np.datetime64: - return np.datetime64("NaT") - - def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.datetime64: - if check_json_time(data): - return self._cast_scalar_unchecked(data) - raise TypeError(f"Invalid type: {data}. Expected an integer.") # pragma: no cover - - def _check_scalar(self, data: object) -> TypeGuard[DateTimeLike]: + def _check_scalar(self, data: object) -> TypeGuard[TimeDeltaLike]: if data is None: return True - return isinstance(data, str | int | bytes | np.datetime64 | datetime) + return isinstance(data, str | int | bytes | np.timedelta64 | timedelta) - def _cast_scalar_unchecked(self, data: DateTimeLike) -> np.datetime64: + def _cast_scalar_unchecked(self, data: TimeDeltaLike) -> np.timedelta64: return self.to_native_dtype().type(data, f"{self.scale_factor}{self.unit}") - def cast_scalar(self, data: object) -> np.datetime64: + def cast_scalar(self, data: object) -> np.timedelta64: if self._check_scalar(data): return self._cast_scalar_unchecked(data) - msg = f"Cannot convert object with type {type(data)} to a numpy datetime scalar." + msg = f"Cannot convert object with type {type(data)} to a numpy timedelta64 scalar." 
raise TypeError(msg) - @classmethod - def _check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: - # match M[M], etc - # consider making this a standalone function - if not isinstance(data, str): - return False - if not data.startswith(cls._zarr_v2_names): - return False - if len(data) == 3: - # no unit, and - # we already checked that this string is either M8 - return True - else: - return data[4:-1].endswith(get_args(DateTimeUnit)) and data[-1] == "]" + def default_scalar(self) -> np.timedelta64: + return np.timedelta64("NaT") + + def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.timedelta64: + if check_json_time(data): + return self.to_native_dtype().type(data, f"{self.scale_factor}{self.unit}") + raise TypeError(f"Invalid type: {data}. Expected an integer.") # pragma: no cover + + +@dataclass(frozen=True, kw_only=True, slots=True) +class DateTime64(TimeDTypeBase[np.dtypes.DateTime64DType, np.datetime64], HasEndianness): + dtype_cls = np.dtypes.DateTime64DType # type: ignore[assignment] + _zarr_v3_name: ClassVar[Literal["numpy.datetime64"]] = "numpy.datetime64" + _zarr_v2_names = (">M8", " TypeGuard[DateTime64JSONV3]: @@ -335,3 +298,43 @@ def from_json_v3(cls, data: JSON) -> Self: "'scale_factor' key" ) raise DataTypeValidationError(msg) + + def _check_scalar(self, data: object) -> TypeGuard[DateTimeLike]: + if data is None: + return True + return isinstance(data, str | int | bytes | np.datetime64 | datetime) + + def _cast_scalar_unchecked(self, data: DateTimeLike) -> np.datetime64: + return self.to_native_dtype().type(data, f"{self.scale_factor}{self.unit}") + + def cast_scalar(self, data: object) -> np.datetime64: + if self._check_scalar(data): + return self._cast_scalar_unchecked(data) + msg = f"Cannot convert object with type {type(data)} to a numpy datetime scalar." 
+ raise TypeError(msg) + + def default_scalar(self) -> np.datetime64: + return np.datetime64("NaT") + + def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.datetime64: + if check_json_time(data): + return self._cast_scalar_unchecked(data) + raise TypeError(f"Invalid type: {data}. Expected an integer.") # pragma: no cover + + @classmethod + def _check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: + """ + Check that JSON input is a string representation of a NumPy datetime64 data type, like "M8[10s]". This function can be used as a type guard to narrow the type of unknown JSON + input. + """ + if not isinstance(data, str): + return False + if not data.startswith(cls._zarr_v2_names): + return False + if len(data) == 3: + # no unit, and + # we already checked that this string is either M8 + return True + else: + return data[4:-1].endswith(get_args(DateTimeUnit)) and data[-1] == "]" diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index 557da87fcf..fa34dc000d 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -138,89 +138,75 @@ def to_native_dtype(self: Self) -> TDType_co: """ ... + @classmethod @abstractmethod - def cast_scalar(self, data: object) -> TScalar_co: + def _check_json_v2( + cls: type[Self], data: JSON, *, object_codec_id: str | None = None + ) -> TypeGuard[DTypeJSON_V2]: """ - Cast a python object to the wrapped scalar type. - The type of the provided scalar is first checked for compatibility. - If it's incompatible with the associated scalar type, a ``TypeError`` will be raised. + Check that JSON data matches the Zarr V2 JSON serialization of this ZDType. Parameters ---------- - data : object - The python object to cast. - - Returns - ------- - TScalar - The cast value. - """ + data : JSON + The JSON representation of the data type. 
- @abstractmethod - def _check_scalar(self, data: object) -> bool: - """ - Check that an python object is a valid scalar value for the wrapped data type. + object_codec_id : str | None + The string identifier of an object codec, if applicable. Object codecs are specific + numcodecs codecs that zarr-python 2.x used to serialize numpy "Object" scalars. + For example, a dtype field set to ``"|O"`` with an object codec ID of "vlen-utf8" + indicates that the data type is a variable-length string. - Parameters - ---------- - data : object - A value to check. + Zarr V3 has no such logic, so this parameter is only used for Zarr V2 compatibility. Returns ------- Bool - True if the object is valid, False otherwise. + True if the JSON representation matches this data type, False otherwise. """ ... + @classmethod @abstractmethod - def default_scalar(self) -> TScalar_co: + def _check_json_v3(cls: type[Self], data: JSON) -> TypeGuard[DTypeJSON_V3]: """ - Get the default scalar value for the wrapped data type. This is a method, rather than an - attribute, because the default value for some data types depends on parameters that are - not known until a concrete data type is wrapped. For example, data types parametrized by a - length like fixed-length strings or bytes will generate scalars consistent with that length. + Check that JSON data matches the Zarr V3 JSON serialization of this ZDType. + + Parameters + ---------- + data : JSON + The JSON representation of the data type. Returns ------- - TScalar - The default value for this data type. + Bool + True if the JSON representation matches, False otherwise. """ ... @classmethod @abstractmethod - def _check_json_v2( - cls: type[Self], data: JSON, *, object_codec_id: str | None = None - ) -> TypeGuard[DTypeJSON_V2]: + def from_json_v2(cls: type[Self], data: JSON, *, object_codec_id: str | None = None) -> Self: """ - Check that JSON data matches the Zarr V2 JSON serialization of this ZDType. 
+ Create an instance of this ZDType from Zarr V2 JSON data. Parameters ---------- data : JSON The JSON representation of the data type. - object_codec_id : str | None - The string identifier of an object codec, if applicable. Object codecs are specific - numcodecs codecs that zarr-python 2.x used to serialize numpy "Object" scalars. - For example, a dtype field set to ``"|O"`` with an object codec ID of "vlen-utf8" - indicates that the data type is a variable-length string. - - Zarr V3 has no such logic, so this parameter is only used for Zarr V2 compatibility. - Returns ------- - Bool - True if the JSON representation matches this data type, False otherwise. + Self + The wrapped data type. """ ... @classmethod @abstractmethod - def _check_json_v3(cls: type[Self], data: JSON) -> TypeGuard[DTypeJSON_V3]: + def from_json_v3(cls: type[Self], data: JSON) -> Self: """ - Check that JSON data matches the Zarr V3 JSON serialization of this ZDType. + Create an instance of this ZDType from Zarr V3 JSON data. Parameters ---------- @@ -229,8 +215,8 @@ def _check_json_v3(cls: type[Self], data: JSON) -> TypeGuard[DTypeJSON_V3]: Returns ------- - Bool - True if the JSON representation matches, False otherwise. + Self + The wrapped data type. """ ... @@ -257,80 +243,94 @@ def to_json(self, zarr_format: ZarrFormat) -> DTypeJSON_V2 | DTypeJSON_V3: """ ... - @classmethod @abstractmethod - def from_json_v3(cls: type[Self], data: JSON) -> Self: + def _check_scalar(self, data: object) -> bool: """ - Create an instance of this ZDType from Zarr V3 JSON data. + Check that an python object is a valid scalar value for the wrapped data type. Parameters ---------- - data : JSON - The JSON representation of the data type. + data : object + A value to check. Returns ------- - Self - The wrapped data type. + Bool + True if the object is valid, False otherwise. """ ... 
- @classmethod @abstractmethod - def from_json_v2(cls: type[Self], data: JSON, *, object_codec_id: str | None = None) -> Self: + def cast_scalar(self, data: object) -> TScalar_co: """ - Create an instance of this ZDType from Zarr V2 JSON data. + Cast a python object to the wrapped scalar type. + The type of the provided scalar is first checked for compatibility. + If it's incompatible with the associated scalar type, a ``TypeError`` will be raised. Parameters ---------- - data : JSON - The JSON representation of the data type. + data : object + The python object to cast. Returns ------- - Self - The wrapped data type. + TScalar + The cast value. + """ + + @abstractmethod + def default_scalar(self) -> TScalar_co: + """ + Get the default scalar value for the wrapped data type. This is a method, rather than an + attribute, because the default value for some data types depends on parameters that are + not known until a concrete data type is wrapped. For example, data types parametrized by a + length like fixed-length strings or bytes will generate scalars consistent with that length. + + Returns + ------- + TScalar + The default value for this data type. """ ... @abstractmethod - def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> JSON: + def from_json_scalar(self: Self, data: JSON, *, zarr_format: ZarrFormat) -> TScalar_co: """ - Serialize a python object to the JSON representation of a scalar. The value will first be - cast to the scalar type associated with this ZDType, then serialized to JSON. + Read a JSON-serializable value as a scalar. Parameters ---------- - data : object - The value to convert. + data : JSON + A JSON representation of a scalar value. zarr_format : ZarrFormat The zarr format version. This is specified because the JSON serialization of scalars differs between Zarr V2 and Zarr V3. Returns ------- - JSON - The JSON-serialized scalar. + TScalar + The deserialized scalar value. """ ... 
@abstractmethod - def from_json_scalar(self: Self, data: JSON, *, zarr_format: ZarrFormat) -> TScalar_co: + def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> JSON: """ - Read a JSON-serializable value as a scalar. + Serialize a python object to the JSON representation of a scalar. The value will first be + cast to the scalar type associated with this ZDType, then serialized to JSON. Parameters ---------- - data : JSON - A JSON representation of a scalar value. + data : object + The value to convert. zarr_format : ZarrFormat The zarr format version. This is specified because the JSON serialization of scalars differs between Zarr V2 and Zarr V3. Returns ------- - TScalar - The deserialized scalar value. + JSON + The JSON-serialized scalar. """ ... From d26b695ae27e974349cc95f5e0368abb6dcb1fcb Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sat, 14 Jun 2025 15:28:34 +0300 Subject: [PATCH 128/130] allow structured scalars to be np.void --- src/zarr/core/dtype/npy/structured.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zarr/core/dtype/npy/structured.py b/src/zarr/core/dtype/npy/structured.py index 190647c1e1..66dfed87f6 100644 --- a/src/zarr/core/dtype/npy/structured.py +++ b/src/zarr/core/dtype/npy/structured.py @@ -150,7 +150,7 @@ def to_json(self, zarr_format: ZarrFormat) -> DTypeJSON_V3 | DTypeJSON_V2: def _check_scalar(self, data: object) -> TypeGuard[StructuredScalarLike]: # TODO: implement something more precise here! 
- return isinstance(data, (bytes, list, tuple, int)) + return isinstance(data, (bytes, list, tuple, int, np.void)) def _cast_scalar_unchecked(self, data: StructuredScalarLike) -> np.void: na_dtype = self.to_native_dtype() From 49f00622ecf5d5483c8b4e58c17bffde95d449d3 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sun, 15 Jun 2025 23:09:42 +0200 Subject: [PATCH 129/130] use a common function signature for from_json by packing the object_codec_id in a typeddict for zarr v2 metadata --- src/zarr/core/dtype/__init__.py | 27 +-- src/zarr/core/dtype/common.py | 135 ++++++++++++- src/zarr/core/dtype/npy/bool.py | 43 +++-- src/zarr/core/dtype/npy/bytes.py | 101 ++++++---- src/zarr/core/dtype/npy/complex.py | 46 +++-- src/zarr/core/dtype/npy/float.py | 46 +++-- src/zarr/core/dtype/npy/int.py | 190 ++++++++++++------- src/zarr/core/dtype/npy/string.py | 76 ++++---- src/zarr/core/dtype/npy/structured.py | 87 +++++---- src/zarr/core/dtype/npy/time.py | 91 +++++---- src/zarr/core/dtype/registry.py | 29 +-- src/zarr/core/dtype/wrapper.py | 89 ++------- src/zarr/core/metadata/v2.py | 81 +++++--- src/zarr/core/metadata/v3.py | 11 +- tests/package_with_entrypoint/__init__.py | 37 ++-- tests/test_dtype/test_npy/test_bool.py | 4 +- tests/test_dtype/test_npy/test_bytes.py | 12 +- tests/test_dtype/test_npy/test_complex.py | 12 +- tests/test_dtype/test_npy/test_float.py | 17 +- tests/test_dtype/test_npy/test_int.py | 36 +++- tests/test_dtype/test_npy/test_string.py | 11 +- tests/test_dtype/test_npy/test_structured.py | 18 +- tests/test_dtype/test_npy/test_time.py | 18 +- tests/test_dtype/test_wrapper.py | 26 +-- tests/test_dtype_registry.py | 44 ++--- tests/test_group.py | 5 +- 26 files changed, 801 insertions(+), 491 deletions(-) diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py index 25e5163e43..735690d4bc 100644 --- a/src/zarr/core/dtype/__init__.py +++ b/src/zarr/core/dtype/__init__.py @@ -2,7 +2,10 @@ from typing import TYPE_CHECKING, Final, 
TypeAlias -from zarr.core.dtype.common import DataTypeValidationError +from zarr.core.dtype.common import ( + DataTypeValidationError, + DTypeJSON, +) from zarr.core.dtype.npy.bool import Bool from zarr.core.dtype.npy.bytes import NullTerminatedBytes, RawBytes, VariableLengthBytes from zarr.core.dtype.npy.complex import Complex64, Complex128 @@ -131,20 +134,20 @@ def get_data_type_from_native_dtype(dtype: npt.DTypeLike) -> ZDType[TBaseDType, return data_type_registry.match_dtype(dtype=na_dtype) -def get_data_type_from_json_v3( - dtype_spec: JSON, -) -> ZDType[TBaseDType, TBaseScalar]: - return data_type_registry.match_json_v3(dtype_spec) - - -def get_data_type_from_json_v2( - dtype_spec: JSON, *, object_codec_id: str | None = None +def get_data_type_from_json( + dtype_spec: DTypeJSON, *, zarr_format: ZarrFormat ) -> ZDType[TBaseDType, TBaseScalar]: - return data_type_registry.match_json_v2(dtype_spec, object_codec_id=object_codec_id) + """ + Given a JSON representation of a data type and a Zarr format version, + attempt to create a ZDType instance from the registered ZDType classes. + """ + return data_type_registry.match_json(dtype_spec, zarr_format=zarr_format) def parse_data_type( - dtype_spec: ZDTypeLike, *, zarr_format: ZarrFormat, object_codec_id: str | None = None + dtype_spec: ZDTypeLike, + *, + zarr_format: ZarrFormat, ) -> ZDType[TBaseDType, TBaseScalar]: """ Interpret the input as a ZDType instance. 
@@ -153,7 +156,7 @@ def parse_data_type( return dtype_spec # dict and zarr_format 3 means that we have a JSON object representation of the dtype if zarr_format == 3 and isinstance(dtype_spec, Mapping): - return get_data_type_from_json_v3(dtype_spec) # type: ignore[arg-type] + return get_data_type_from_json(dtype_spec, zarr_format=3) # otherwise, we have either a numpy dtype string, or a zarr v3 dtype string, and in either case # we can create a numpy dtype from it, and do the dtype inference from that return get_data_type_from_native_dtype(dtype_spec) # type: ignore[arg-type] diff --git a/src/zarr/core/dtype/common.py b/src/zarr/core/dtype/common.py index 9fabfa2737..6f61b6775e 100644 --- a/src/zarr/core/dtype/common.py +++ b/src/zarr/core/dtype/common.py @@ -1,16 +1,149 @@ from __future__ import annotations import warnings +from collections.abc import Mapping, Sequence from dataclasses import dataclass -from typing import ClassVar, Final, Literal +from typing import ( + ClassVar, + Final, + Generic, + Literal, + TypedDict, + TypeGuard, + TypeVar, +) + +from zarr.core.common import NamedConfig EndiannessStr = Literal["little", "big"] ENDIANNESS_STR: Final = "little", "big" + SpecialFloatStrings = Literal["NaN", "Infinity", "-Infinity"] SPECIAL_FLOAT_STRINGS: Final = ("NaN", "Infinity", "-Infinity") + JSONFloatV2 = float | SpecialFloatStrings JSONFloatV3 = float | SpecialFloatStrings | str +ObjectCodecID = Literal["vlen-utf8", "vlen-bytes", "vlen-array", "pickle", "json2", "msgpack2"] +# These are the ids of the known object codecs for zarr v2. 
+OBJECT_CODEC_IDS: Final = ("vlen-utf8", "vlen-bytes", "vlen-array", "pickle", "json2", "msgpack2") + +# This is a wider type than our standard JSON type because we need +# to work with typeddict objects which are assignable to Mapping[str, object] +DTypeJSON = str | int | float | Sequence["DTypeJSON"] | None | Mapping[str, object] + +# The DTypeJSON_V2 type exists because ZDType.from_json takes a single argument, which must contain +# all the information necessary to decode the data type. Zarr v2 supports multiple distinct +# data types that all used the "|O" data type identifier. These data types can only be +# discriminated on the basis of their "object codec", i.e. a special data type specific +# compressor or filter. So to figure out what data type a zarr v2 array has, we need the +# data type identifier from metadata, as well as an object codec id if the data type identifier +# is "|O". +# So we will pack the name of the dtype alongside the name of the object codec id, if applicable, +# in a single dict, and pass that to the data type inference logic. +# These type variables have a very wide bound because the individual zdtype +# classes can perform a very specific type check. + +# This is the JSON representation of a structured dtype in zarr v2 +StructuredName_V2 = Sequence["str | StructuredName_V2"] + +# This models the type of the name a dtype might have in zarr v2 array metadata +DTypeName_V2 = StructuredName_V2 | str + +TDTypeNameV2_co = TypeVar("TDTypeNameV2_co", bound=DTypeName_V2, covariant=True) +TObjectCodecID_co = TypeVar("TObjectCodecID_co", bound=None | str, covariant=True) + + +class DTypeConfig_V2(TypedDict, Generic[TDTypeNameV2_co, TObjectCodecID_co]): + name: TDTypeNameV2_co + object_codec_id: TObjectCodecID_co + + +DTypeSpec_V2 = DTypeConfig_V2[DTypeName_V2, None | str] + + +def check_structured_dtype_v2_inner(data: object) -> TypeGuard[StructuredName_V2]: + """ + A type guard for the inner elements of a structured dtype. 
This is a recursive check because + the type is itself recursive. + + This check ensures that all the elements are 2-element sequences beginning with a string + and ending with either another string or another 2-element sequence beginning with a string and + ending with another instance of that type. + """ + if isinstance(data, (str, Mapping)): + return False + if not isinstance(data, Sequence): + return False + if len(data) != 2: + return False + if not (isinstance(data[0], str)): + return False + if isinstance(data[-1], str): + return True + elif isinstance(data[-1], Sequence): + return check_structured_dtype_v2_inner(data[-1]) + return False + + +def check_structured_dtype_name_v2(data: Sequence[object]) -> TypeGuard[StructuredName_V2]: + return all(check_structured_dtype_v2_inner(d) for d in data) + + +def check_dtype_name_v2(data: object) -> TypeGuard[DTypeName_V2]: + """ + Type guard for narrowing the type of a python object to an valid zarr v2 dtype name. + """ + if isinstance(data, str): + return True + elif isinstance(data, Sequence): + return check_structured_dtype_name_v2(data) + return False + + +def check_dtype_spec_v2(data: object) -> TypeGuard[DTypeSpec_V2]: + """ + Type guard for narrowing a python object to an instance of DTypeSpec_V2 + """ + if not isinstance(data, Mapping): + return False + if set(data.keys()) != {"name", "object_codec_id"}: + return False + if not check_dtype_name_v2(data["name"]): + return False + return isinstance(data["object_codec_id"], str | None) + + +# By comparison, The JSON representation of a dtype in zarr v3 is much simpler. 
+# It's either a string, or a structured dict +DTypeSpec_V3 = str | NamedConfig[str, Mapping[str, object]] + + +def check_dtype_spec_v3(data: object) -> TypeGuard[DTypeSpec_V3]: + """ + Type guard for narrowing the type of a python object to an instance of + DTypeSpec_V3, i.e either a string or a dict with a "name" field that's a string and a + "configuration" field that's a mapping with string keys. + """ + if isinstance(data, str) or ( # noqa: SIM103 + isinstance(data, Mapping) + and set(data.keys()) == {"name", "configuration"} + and isinstance(data["configuration"], Mapping) + and all(isinstance(k, str) for k in data["configuration"]) + ): + return True + return False + + +def unpack_dtype_json(data: DTypeSpec_V2 | DTypeSpec_V3) -> DTypeJSON: + """ + Return the array metadata form of the dtype JSON representation. For the Zarr V3 form of dtype + metadata, this is a no-op. For the Zarr V2 form of dtype metadata, this unpacks the dtype name. + """ + if isinstance(data, Mapping) and set(data.keys()) == {"name", "object_codec_id"}: + return data["name"] + return data + class DataTypeValidationError(ValueError): ... 
diff --git a/src/zarr/core/dtype/npy/bool.py b/src/zarr/core/dtype/npy/bool.py index 7570dd1f4f..d8d52468bf 100644 --- a/src/zarr/core/dtype/npy/bool.py +++ b/src/zarr/core/dtype/npy/bool.py @@ -1,12 +1,22 @@ +from __future__ import annotations + from dataclasses import dataclass -from typing import ClassVar, Literal, Self, TypeGuard, overload +from typing import TYPE_CHECKING, ClassVar, Literal, Self, TypeGuard, overload import numpy as np -from zarr.core.common import JSON, ZarrFormat -from zarr.core.dtype.common import DataTypeValidationError, HasItemSize +from zarr.core.dtype.common import ( + DataTypeValidationError, + DTypeConfig_V2, + DTypeJSON, + HasItemSize, + check_dtype_spec_v2, +) from zarr.core.dtype.wrapper import TBaseDType, ZDType +if TYPE_CHECKING: + from zarr.core.common import JSON, ZarrFormat + @dataclass(frozen=True, kw_only=True, slots=True) class Bool(ZDType[np.dtypes.BoolDType, np.bool_], HasItemSize): @@ -44,40 +54,47 @@ def to_native_dtype(self: Self) -> np.dtypes.BoolDType: @classmethod def _check_json_v2( - cls, data: JSON, *, object_codec_id: str | None = None - ) -> TypeGuard[Literal["|b1"]]: + cls, + data: DTypeJSON, + ) -> TypeGuard[DTypeConfig_V2[Literal["|b1"], None]]: """ Check that the input is a valid JSON representation of a Bool. """ - return data == cls._zarr_v2_name + return ( + check_dtype_spec_v2(data) + and data["name"] == cls._zarr_v2_name + and data["object_codec_id"] is None + ) @classmethod - def _check_json_v3(cls, data: JSON) -> TypeGuard[Literal["bool"]]: + def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[Literal["bool"]]: return data == cls._zarr_v3_name @classmethod - def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self: + def _from_json_v2(cls, data: DTypeJSON) -> Self: if cls._check_json_v2(data): return cls() msg = f"Invalid JSON representation of {cls.__name__}. 
Got {data!r}, expected the string {cls._zarr_v2_name!r}" raise DataTypeValidationError(msg) @classmethod - def from_json_v3(cls: type[Self], data: JSON) -> Self: + def _from_json_v3(cls: type[Self], data: DTypeJSON) -> Self: if cls._check_json_v3(data): return cls() msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload - def to_json(self, zarr_format: Literal[2]) -> Literal["|b1"]: ... + @overload # type: ignore[override] + def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal["|b1"], None]: ... @overload def to_json(self, zarr_format: Literal[3]) -> Literal["bool"]: ... - def to_json(self, zarr_format: ZarrFormat) -> Literal["|b1", "bool"]: + def to_json( + self, zarr_format: ZarrFormat + ) -> DTypeConfig_V2[Literal["|b1"], None] | Literal["bool"]: if zarr_format == 2: - return self._zarr_v2_name + return {"name": self._zarr_v2_name, "object_codec_id": None} elif zarr_format == 3: return self._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover diff --git a/src/zarr/core/dtype/npy/bytes.py b/src/zarr/core/dtype/npy/bytes.py index 9dc0bb1a68..e363c75053 100644 --- a/src/zarr/core/dtype/npy/bytes.py +++ b/src/zarr/core/dtype/npy/bytes.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import base64 import re from dataclasses import dataclass @@ -8,9 +10,12 @@ from zarr.core.common import JSON, NamedConfig, ZarrFormat from zarr.core.dtype.common import ( DataTypeValidationError, + DTypeConfig_V2, + DTypeJSON, HasItemSize, HasLength, HasObjectCodec, + check_dtype_spec_v2, v3_unstable_dtype_warning, ) from zarr.core.dtype.npy.common import check_json_str @@ -44,15 +49,20 @@ def to_native_dtype(self) -> np.dtypes.BytesDType[int]: return self.dtype_cls(self.length) @classmethod - def _check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: + def _check_json_v2(cls, 
data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]]: """ - Check that the input is a valid JSON representation of a numpy S dtype. + Check that the input is a valid representation of a numpy S dtype. We expect + something like ``{"name": "|S10", "object_codec_id": None}`` """ - # match |S1, |S2, etc - return isinstance(data, str) and re.match(r"^\|S\d+$", data) is not None + return ( + check_dtype_spec_v2(data) + and isinstance(data["name"], str) + and re.match(r"^\|S\d+$", data["name"]) is not None + and data["object_codec_id"] is None + ) @classmethod - def _check_json_v3(cls, data: JSON) -> TypeGuard[NullTerminatedBytesJSONV3]: + def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[NullTerminatedBytesJSONV3]: return ( isinstance(data, dict) and set(data.keys()) == {"name", "configuration"} @@ -62,28 +72,31 @@ def _check_json_v3(cls, data: JSON) -> TypeGuard[NullTerminatedBytesJSONV3]: ) @classmethod - def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self: + def _from_json_v2(cls, data: DTypeJSON) -> Self: if cls._check_json_v2(data): - return cls(length=int(data[2:])) + name = data["name"] + return cls(length=int(name[2:])) msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a string like '|S1', '|S2', etc" raise DataTypeValidationError(msg) @classmethod - def from_json_v3(cls, data: JSON) -> Self: + def _from_json_v3(cls, data: DTypeJSON) -> Self: if cls._check_json_v3(data): return cls(length=data["configuration"]["length_bytes"]) msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload - def to_json(self, zarr_format: Literal[2]) -> str: ... + @overload # type: ignore[override] + def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[str, None]: ... @overload def to_json(self, zarr_format: Literal[3]) -> NullTerminatedBytesJSONV3: ... 
- def to_json(self, zarr_format: ZarrFormat) -> str | NullTerminatedBytesJSONV3: + def to_json( + self, zarr_format: ZarrFormat + ) -> DTypeConfig_V2[str, None] | NullTerminatedBytesJSONV3: if zarr_format == 2: - return self.to_native_dtype().str + return {"name": self.to_native_dtype().str, "object_codec_id": None} elif zarr_format == 3: v3_unstable_dtype_warning(self) return { @@ -179,12 +192,20 @@ def to_native_dtype(self) -> np.dtypes.VoidDType[int]: return cast("np.dtypes.VoidDType[int]", np.dtype(f"V{self.length}")) @classmethod - def _check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: - # Check that the dtype is |V1, |V2, ... - return isinstance(data, str) and re.match(r"^\|V\d+$", data) is not None + def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]]: + """ + Check that the input is a valid representation of a numpy S dtype. We expect + something like ``{"name": "|V10", "object_codec_id": None}`` + """ + return ( + check_dtype_spec_v2(data) + and isinstance(data["name"], str) + and re.match(r"^\|V\d+$", data["name"]) is not None + and data["object_codec_id"] is None + ) @classmethod - def _check_json_v3(cls, data: JSON) -> TypeGuard[RawBytesJSONV3]: + def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[RawBytesJSONV3]: return ( isinstance(data, dict) and set(data.keys()) == {"name", "configuration"} @@ -194,28 +215,29 @@ def _check_json_v3(cls, data: JSON) -> TypeGuard[RawBytesJSONV3]: ) @classmethod - def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self: + def _from_json_v2(cls, data: DTypeJSON) -> Self: if cls._check_json_v2(data): - return cls(length=int(data[2:])) + name = data["name"] + return cls(length=int(name[2:])) msg = f"Invalid JSON representation of {cls.__name__}. 
Got {data!r}, expected a string like '|V1', '|V2', etc" raise DataTypeValidationError(msg) @classmethod - def from_json_v3(cls, data: JSON) -> Self: + def _from_json_v3(cls, data: DTypeJSON) -> Self: if cls._check_json_v3(data): return cls(length=data["configuration"]["length_bytes"]) msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload - def to_json(self, zarr_format: Literal[2]) -> str: ... + @overload # type: ignore[override] + def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[str, None]: ... @overload def to_json(self, zarr_format: Literal[3]) -> RawBytesJSONV3: ... - def to_json(self, zarr_format: ZarrFormat) -> str | RawBytesJSONV3: + def to_json(self, zarr_format: ZarrFormat) -> DTypeConfig_V2[str, None] | RawBytesJSONV3: if zarr_format == 2: - return self.to_native_dtype().str + return {"name": self.to_native_dtype().str, "object_codec_id": None} elif zarr_format == 3: v3_unstable_dtype_warning(self) return {"name": self._zarr_v3_name, "configuration": {"length_bytes": self.length}} @@ -257,7 +279,7 @@ def item_size(self) -> int: class VariableLengthBytes(ZDType[np.dtypes.ObjectDType, bytes], HasObjectCodec): dtype_cls = np.dtypes.ObjectDType _zarr_v3_name: ClassVar[Literal["variable_length_bytes"]] = "variable_length_bytes" - object_codec_id = "vlen-bytes" + object_codec_id: ClassVar[Literal["vlen-bytes"]] = "vlen-bytes" @classmethod def from_native_dtype(cls, dtype: TBaseDType) -> Self: @@ -272,41 +294,50 @@ def to_native_dtype(self) -> np.dtypes.ObjectDType: @classmethod def _check_json_v2( - cls, data: JSON, *, object_codec_id: str | None = None - ) -> TypeGuard[Literal["|O"]]: + cls, + data: DTypeJSON, + ) -> TypeGuard[DTypeConfig_V2[Literal["|O"], Literal["vlen-bytes"]]]: """ Check that the input is a valid JSON representation of a numpy O dtype, and that the object codec id is appropriate for variable-length UTF-8 strings. 
""" - return data == "|O" and object_codec_id == cls.object_codec_id + return ( + check_dtype_spec_v2(data) + and data["name"] == "|O" + and data["object_codec_id"] == cls.object_codec_id + ) @classmethod - def _check_json_v3(cls, data: JSON) -> TypeGuard[Literal["variable_length_bytes"]]: + def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[Literal["variable_length_bytes"]]: return data == cls._zarr_v3_name @classmethod - def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self: - if cls._check_json_v2(data, object_codec_id=object_codec_id): + def _from_json_v2(cls, data: DTypeJSON) -> Self: + if cls._check_json_v2(data): return cls() msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string '|O' and an object_codec_id of {cls.object_codec_id}" raise DataTypeValidationError(msg) @classmethod - def from_json_v3(cls, data: JSON) -> Self: + def _from_json_v3(cls, data: DTypeJSON) -> Self: if cls._check_json_v3(data): return cls() msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload - def to_json(self, zarr_format: Literal[2]) -> Literal["|O"]: ... + @overload # type: ignore[override] + def to_json( + self, zarr_format: Literal[2] + ) -> DTypeConfig_V2[Literal["|O"], Literal["vlen-bytes"]]: ... @overload def to_json(self, zarr_format: Literal[3]) -> Literal["variable_length_bytes"]: ... 
- def to_json(self, zarr_format: ZarrFormat) -> Literal["|O", "variable_length_bytes"]: + def to_json( + self, zarr_format: ZarrFormat + ) -> DTypeConfig_V2[Literal["|O"], Literal["vlen-bytes"]] | Literal["variable_length_bytes"]: if zarr_format == 2: - return "|O" + return {"name": "|O", "object_codec_id": self.object_codec_id} elif zarr_format == 3: v3_unstable_dtype_warning(self) return self._zarr_v3_name diff --git a/src/zarr/core/dtype/npy/complex.py b/src/zarr/core/dtype/npy/complex.py index bc06af00b5..38e506f1bc 100644 --- a/src/zarr/core/dtype/npy/complex.py +++ b/src/zarr/core/dtype/npy/complex.py @@ -1,15 +1,25 @@ +from __future__ import annotations + from dataclasses import dataclass from typing import ( + TYPE_CHECKING, ClassVar, Literal, Self, TypeGuard, + overload, ) import numpy as np -from zarr.core.common import JSON, ZarrFormat -from zarr.core.dtype.common import DataTypeValidationError, HasEndianness, HasItemSize +from zarr.core.dtype.common import ( + DataTypeValidationError, + DTypeConfig_V2, + DTypeJSON, + HasEndianness, + HasItemSize, + check_dtype_spec_v2, +) from zarr.core.dtype.npy.common import ( ComplexLike, TComplexDType_co, @@ -25,6 +35,9 @@ ) from zarr.core.dtype.wrapper import TBaseDType, ZDType +if TYPE_CHECKING: + from zarr.core.common import JSON, ZarrFormat + @dataclass(frozen=True) class BaseComplex(ZDType[TComplexDType_co, TComplexScalar_co], HasEndianness, HasItemSize): @@ -44,33 +57,44 @@ def to_native_dtype(self) -> TComplexDType_co: return self.dtype_cls().newbyteorder(byte_order) # type: ignore[return-value] @classmethod - def _check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: + def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]]: """ Check that the input is a valid JSON representation of this data type. 
""" - return data in cls._zarr_v2_names + return ( + check_dtype_spec_v2(data) + and data["name"] in cls._zarr_v2_names + and data["object_codec_id"] is None + ) @classmethod - def _check_json_v3(cls, data: JSON) -> TypeGuard[str]: + def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[str]: return data == cls._zarr_v3_name @classmethod - def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self: - if cls._check_json_v2(data, object_codec_id=object_codec_id): + def _from_json_v2(cls, data: DTypeJSON) -> Self: + if cls._check_json_v2(data): # Going via numpy ensures that we get the endianness correct without # annoying string parsing. - return cls.from_native_dtype(np.dtype(data)) + name = data["name"] + return cls.from_native_dtype(np.dtype(name)) msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." raise DataTypeValidationError(msg) @classmethod - def from_json_v3(cls, data: JSON) -> Self: + def _from_json_v3(cls, data: DTypeJSON) -> Self: if cls._check_json_v3(data): return cls() msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected {cls._zarr_v3_name}." raise DataTypeValidationError(msg) - def to_json(self, zarr_format: ZarrFormat) -> str: + @overload # type: ignore[override] + def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[str, None]: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> str: ... + + def to_json(self, zarr_format: ZarrFormat) -> DTypeConfig_V2[str, None] | str: """ Convert the wrapped data type to a JSON-serializable form. 
@@ -85,7 +109,7 @@ def to_json(self, zarr_format: ZarrFormat) -> str: The JSON-serializable representation of the wrapped data type """ if zarr_format == 2: - return self.to_native_dtype().str + return {"name": self.to_native_dtype().str, "object_codec_id": None} elif zarr_format == 3: return self._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover diff --git a/src/zarr/core/dtype/npy/float.py b/src/zarr/core/dtype/npy/float.py index 9271886c20..7b7243993f 100644 --- a/src/zarr/core/dtype/npy/float.py +++ b/src/zarr/core/dtype/npy/float.py @@ -1,14 +1,18 @@ +from __future__ import annotations + from dataclasses import dataclass -from typing import ClassVar, Self, TypeGuard +from typing import TYPE_CHECKING, ClassVar, Literal, Self, TypeGuard, overload import numpy as np -from zarr.core.common import JSON, ZarrFormat from zarr.core.dtype.common import ( DataTypeValidationError, + DTypeConfig_V2, + DTypeJSON, HasEndianness, HasItemSize, ScalarTypeValidationError, + check_dtype_spec_v2, ) from zarr.core.dtype.npy.common import ( FloatLike, @@ -25,6 +29,9 @@ ) from zarr.core.dtype.wrapper import TBaseDType, ZDType +if TYPE_CHECKING: + from zarr.core.common import JSON, ZarrFormat + @dataclass(frozen=True) class BaseFloat(ZDType[TFloatDType_co, TFloatScalar_co], HasEndianness, HasItemSize): @@ -44,33 +51,44 @@ def to_native_dtype(self) -> TFloatDType_co: return self.dtype_cls().newbyteorder(byte_order) # type: ignore[return-value] @classmethod - def _check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: + def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]]: """ Check that the input is a valid JSON representation of this data type. 
""" - return data in cls._zarr_v2_names + return ( + check_dtype_spec_v2(data) + and data["name"] in cls._zarr_v2_names + and data["object_codec_id"] is None + ) @classmethod - def _check_json_v3(cls, data: JSON) -> TypeGuard[str]: + def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[str]: return data == cls._zarr_v3_name @classmethod - def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self: - if cls._check_json_v2(data, object_codec_id=object_codec_id): + def _from_json_v2(cls, data: DTypeJSON) -> Self: + if cls._check_json_v2(data): # Going via numpy ensures that we get the endianness correct without # annoying string parsing. - return cls.from_native_dtype(np.dtype(data)) + name = data["name"] + return cls.from_native_dtype(np.dtype(name)) msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." raise DataTypeValidationError(msg) @classmethod - def from_json_v3(cls, data: JSON) -> Self: + def _from_json_v3(cls, data: DTypeJSON) -> Self: if cls._check_json_v3(data): return cls() msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected {cls._zarr_v3_name}." raise DataTypeValidationError(msg) - def to_json(self, zarr_format: ZarrFormat) -> str: + @overload # type: ignore[override] + def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[str, None]: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> str: ... + + def to_json(self, zarr_format: ZarrFormat) -> DTypeConfig_V2[str, None] | str: """ Convert the wrapped data type to a JSON-serializable form. 
@@ -85,7 +103,7 @@ def to_json(self, zarr_format: ZarrFormat) -> str: The JSON-serializable representation of the wrapped data type """ if zarr_format == 2: - return self.to_native_dtype().str + return {"name": self.to_native_dtype().str, "object_codec_id": None} elif zarr_format == 3: return self._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @@ -175,7 +193,7 @@ def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> float | st class Float16(BaseFloat[np.dtypes.Float16DType, np.float16]): dtype_cls = np.dtypes.Float16DType _zarr_v3_name = "float16" - _zarr_v2_names: ClassVar[tuple[str, ...]] = (">f2", "f2"], Literal["f2", " int: @@ -186,7 +204,7 @@ def item_size(self) -> int: class Float32(BaseFloat[np.dtypes.Float32DType, np.float32]): dtype_cls = np.dtypes.Float32DType _zarr_v3_name = "float32" - _zarr_v2_names: ClassVar[tuple[str, ...]] = (">f4", "f4"], Literal["f4", " int: @@ -197,7 +215,7 @@ def item_size(self) -> int: class Float64(BaseFloat[np.dtypes.Float64DType, np.float64]): dtype_cls = np.dtypes.Float64DType _zarr_v3_name = "float64" - _zarr_v2_names: ClassVar[tuple[str, ...]] = (">f8", "f8"], Literal["f8", " int: diff --git a/src/zarr/core/dtype/npy/int.py b/src/zarr/core/dtype/npy/int.py index cddcb26c5e..79d3ce2d47 100644 --- a/src/zarr/core/dtype/npy/int.py +++ b/src/zarr/core/dtype/npy/int.py @@ -1,5 +1,8 @@ +from __future__ import annotations + from dataclasses import dataclass from typing import ( + TYPE_CHECKING, ClassVar, Literal, Self, @@ -12,8 +15,14 @@ import numpy as np -from zarr.core.common import JSON, ZarrFormat -from zarr.core.dtype.common import DataTypeValidationError, HasEndianness, HasItemSize +from zarr.core.dtype.common import ( + DataTypeValidationError, + DTypeConfig_V2, + DTypeJSON, + HasEndianness, + HasItemSize, + check_dtype_spec_v2, +) from zarr.core.dtype.npy.common import ( check_json_int, endianness_to_numpy_str, @@ -21,6 +30,9 @@ ) from 
zarr.core.dtype.wrapper import TBaseDType, ZDType +if TYPE_CHECKING: + from zarr.core.common import JSON, ZarrFormat + _NumpyIntDType = ( np.dtypes.Int8DType | np.dtypes.Int16DType @@ -45,14 +57,18 @@ class BaseInt(ZDType[TIntDType_co, TIntScalar_co], HasItemSize): _zarr_v2_names: ClassVar[tuple[str, ...]] @classmethod - def _check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: + def _check_json_v2(cls, data: object) -> TypeGuard[DTypeConfig_V2[str, None]]: """ Check that the input is a valid JSON representation of this data type. """ - return data in cls._zarr_v2_names + return ( + check_dtype_spec_v2(data) + and data["name"] in cls._zarr_v2_names + and data["object_codec_id"] is None + ) @classmethod - def _check_json_v3(cls, data: JSON) -> TypeGuard[str]: + def _check_json_v3(cls, data: object) -> TypeGuard[str]: """ Check that a JSON value is consistent with the zarr v3 spec for this data type. """ @@ -147,26 +163,28 @@ def to_native_dtype(self: Self) -> np.dtypes.Int8DType: return self.dtype_cls() @classmethod - def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self: - if cls._check_json_v2(data, object_codec_id=object_codec_id): + def _from_json_v2(cls, data: DTypeJSON) -> Self: + if cls._check_json_v2(data): return cls() - msg = f"Invalid JSON representation of Int8. Got {data!r}, expected the string {cls._zarr_v2_names[0]!r}" + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v2_names[0]!r}" raise DataTypeValidationError(msg) @classmethod - def from_json_v3(cls, data: JSON) -> Self: + def _from_json_v3(cls, data: DTypeJSON) -> Self: if cls._check_json_v3(data): return cls() - msg = f"Invalid JSON representation of Int8. Got {data!r}, expected the string {cls._zarr_v3_name!r}" + msg = f"Invalid JSON representation of {cls.__name__}. 
Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload - def to_json(self, zarr_format: Literal[2]) -> Literal["|i1"]: ... + @overload # type: ignore[override] + def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal["|i1"], None]: ... @overload def to_json(self, zarr_format: Literal[3]) -> Literal["int8"]: ... - def to_json(self, zarr_format: ZarrFormat) -> Literal["int8", "|i1"]: + def to_json( + self, zarr_format: ZarrFormat + ) -> DTypeConfig_V2[Literal["|i1"], None] | Literal["int8"]: """ Convert the wrapped data type to a JSON-serializable form. @@ -181,7 +199,7 @@ def to_json(self, zarr_format: ZarrFormat) -> Literal["int8", "|i1"]: The JSON-serializable representation of the wrapped data type """ if zarr_format == 2: - return self._zarr_v2_names[0] + return {"name": self._zarr_v2_names[0], "object_codec_id": None} elif zarr_format == 3: return self._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @@ -212,26 +230,28 @@ def to_native_dtype(self: Self) -> np.dtypes.UInt8DType: return self.dtype_cls() @classmethod - def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self: - if cls._check_json_v2(data, object_codec_id=object_codec_id): + def _from_json_v2(cls, data: DTypeJSON) -> Self: + if cls._check_json_v2(data): return cls() - msg = f"Invalid JSON representation of UInt8. Got {data!r}, expected the string {cls._zarr_v2_names[0]!r}" + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v2_names[0]!r}" raise DataTypeValidationError(msg) @classmethod - def from_json_v3(cls, data: JSON) -> Self: + def _from_json_v3(cls, data: DTypeJSON) -> Self: if cls._check_json_v3(data): return cls() - msg = f"Invalid JSON representation of UInt8. Got {data!r}, expected the string {cls._zarr_v3_name!r}" + msg = f"Invalid JSON representation of {cls.__name__}. 
Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload - def to_json(self, zarr_format: Literal[2]) -> Literal["|u1"]: ... + @overload # type: ignore[override] + def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal["|u1"], None]: ... @overload def to_json(self, zarr_format: Literal[3]) -> Literal["uint8"]: ... - def to_json(self, zarr_format: ZarrFormat) -> Literal["uint8", "|u1"]: + def to_json( + self, zarr_format: ZarrFormat + ) -> DTypeConfig_V2[Literal["|u1"], None] | Literal["uint8"]: """ Convert the wrapped data type to a JSON-serializable form. @@ -246,7 +266,7 @@ def to_json(self, zarr_format: ZarrFormat) -> Literal["uint8", "|u1"]: The JSON-serializable representation of the wrapped data type """ if zarr_format == 2: - return self._zarr_v2_names[0] + return {"name": self._zarr_v2_names[0], "object_codec_id": None} elif zarr_format == 3: return self._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @@ -275,28 +295,31 @@ def to_native_dtype(self) -> np.dtypes.Int16DType: return self.dtype_cls().newbyteorder(byte_order) @classmethod - def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self: - if cls._check_json_v2(data, object_codec_id=object_codec_id): + def _from_json_v2(cls, data: DTypeJSON) -> Self: + if cls._check_json_v2(data): # Going via numpy ensures that we get the endianness correct without # annoying string parsing. - return cls.from_native_dtype(np.dtype(data)) - msg = f"Invalid JSON representation of Int16. Got {data!r}, expected one of the strings {cls._zarr_v2_names!r}." + name = data["name"] + return cls.from_native_dtype(np.dtype(name)) + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected one of the strings {cls._zarr_v2_names!r}." 
raise DataTypeValidationError(msg) @classmethod - def from_json_v3(cls, data: JSON) -> Self: + def _from_json_v3(cls, data: DTypeJSON) -> Self: if cls._check_json_v3(data): return cls() - msg = f"Invalid JSON representation of Int16. Got {data!r}, expected the string {cls._zarr_v3_name!r}" + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload - def to_json(self, zarr_format: Literal[2]) -> Literal[">i2", " DTypeConfig_V2[Literal[">i2", " Literal["int16"]: ... - def to_json(self, zarr_format: ZarrFormat) -> Literal["int16", ">i2", " DTypeConfig_V2[Literal[">i2", " Literal["int16", ">i2", " np.dtypes.UInt16DType: return self.dtype_cls().newbyteorder(byte_order) @classmethod - def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self: - if cls._check_json_v2(data, object_codec_id=object_codec_id): + def _from_json_v2(cls, data: DTypeJSON) -> Self: + if cls._check_json_v2(data): # Going via numpy ensures that we get the endianness correct without # annoying string parsing. - return cls.from_native_dtype(np.dtype(data)) + name = data["name"] + return cls.from_native_dtype(np.dtype(name)) msg = f"Invalid JSON representation of UInt16. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." raise DataTypeValidationError(msg) @classmethod - def from_json_v3(cls, data: JSON) -> Self: + def _from_json_v3(cls, data: DTypeJSON) -> Self: if cls._check_json_v3(data): return cls() msg = f"Invalid JSON representation of UInt16. Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload - def to_json(self, zarr_format: Literal[2]) -> Literal[">u2", " DTypeConfig_V2[Literal[">u2", " Literal["uint16"]: ... 
- def to_json(self, zarr_format: ZarrFormat) -> Literal["uint16", ">u2", " DTypeConfig_V2[Literal[">u2", " Literal["uint16", ">u2", " np.dtypes.Int32DType: return self.dtype_cls().newbyteorder(byte_order) @classmethod - def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self: - if cls._check_json_v2(data, object_codec_id=object_codec_id): + def _from_json_v2(cls, data: DTypeJSON) -> Self: + if cls._check_json_v2(data): # Going via numpy ensures that we get the endianness correct without # annoying string parsing. - return cls.from_native_dtype(np.dtype(data)) + name = data["name"] + return cls.from_native_dtype(np.dtype(name)) msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." raise DataTypeValidationError(msg) @classmethod - def from_json_v3(cls, data: JSON) -> Self: + def _from_json_v3(cls, data: DTypeJSON) -> Self: if cls._check_json_v3(data): return cls() msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload - def to_json(self, zarr_format: Literal[2]) -> Literal[">i4", " DTypeConfig_V2[Literal[">i4", " Literal["int32"]: ... - def to_json(self, zarr_format: ZarrFormat) -> Literal["int32", ">i4", " DTypeConfig_V2[Literal[">i4", " Literal["int32", ">i4", " np.dtypes.UInt32DType: return self.dtype_cls().newbyteorder(byte_order) @classmethod - def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self: - if cls._check_json_v2(data, object_codec_id=object_codec_id): + def _from_json_v2(cls, data: DTypeJSON) -> Self: + if cls._check_json_v2(data): # Going via numpy ensures that we get the endianness correct without # annoying string parsing. - return cls.from_native_dtype(np.dtype(data)) + name = data["name"] + return cls.from_native_dtype(np.dtype(name)) msg = f"Invalid JSON representation of {cls.__name__}. 
Got {data!r}, expected one of the strings {cls._zarr_v2_names}." raise DataTypeValidationError(msg) @classmethod - def from_json_v3(cls, data: JSON) -> Self: + def _from_json_v3(cls, data: DTypeJSON) -> Self: if cls._check_json_v3(data): return cls() msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload - def to_json(self, zarr_format: Literal[2]) -> Literal[">u4", " DTypeConfig_V2[Literal[">u4", " Literal["uint32"]: ... - def to_json(self, zarr_format: ZarrFormat) -> Literal["uint32", ">u4", " DTypeConfig_V2[Literal[">u4", " Literal["uint32", ">u4", " np.dtypes.Int64DType: return self.dtype_cls().newbyteorder(byte_order) @classmethod - def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self: - if cls._check_json_v2(data, object_codec_id=object_codec_id): + def _from_json_v2(cls, data: DTypeJSON) -> Self: + if cls._check_json_v2(data): # Going via numpy ensures that we get the endianness correct without # annoying string parsing. - return cls.from_native_dtype(np.dtype(data)) + name = data["name"] + return cls.from_native_dtype(np.dtype(name)) msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." raise DataTypeValidationError(msg) @classmethod - def from_json_v3(cls, data: JSON) -> Self: + def _from_json_v3(cls, data: DTypeJSON) -> Self: if cls._check_json_v3(data): return cls() msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload - def to_json(self, zarr_format: Literal[2]) -> Literal[">i8", " DTypeConfig_V2[Literal[">i8", " Literal["int64"]: ... 
- def to_json(self, zarr_format: ZarrFormat) -> Literal["int64", ">i8", " DTypeConfig_V2[Literal[">i8", " Literal["int64", ">i8", " np.dtypes.UInt64DType: return self.dtype_cls().newbyteorder(byte_order) @classmethod - def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self: - if cls._check_json_v2(data, object_codec_id=object_codec_id): + def _from_json_v2(cls, data: DTypeJSON) -> Self: + if cls._check_json_v2(data): # Going via numpy ensures that we get the endianness correct without # annoying string parsing. - return cls.from_native_dtype(np.dtype(data)) + name = data["name"] + return cls.from_native_dtype(np.dtype(name)) msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." raise DataTypeValidationError(msg) @classmethod - def from_json_v3(cls, data: JSON) -> Self: + def _from_json_v3(cls, data: DTypeJSON) -> Self: if cls._check_json_v3(data): return cls() msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload - def to_json(self, zarr_format: Literal[2]) -> Literal[">u8", " DTypeConfig_V2[Literal[">u8", " Literal["uint64"]: ... - def to_json(self, zarr_format: ZarrFormat) -> Literal["uint64", ">u8", " DTypeConfig_V2[Literal[">u8", " Literal["uint64", ">u8", " np.dtypes.StrDType[int]: return self.dtype_cls(self.length).newbyteorder(byte_order) @classmethod - def _check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: + def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]]: """ Check that the input is a valid JSON representation of a numpy U dtype. 
""" - return isinstance(data, str) and re.match(r"^[><]U\d+$", data) is not None + return ( + check_dtype_spec_v2(data) + and isinstance(data["name"], str) + and re.match(r"^[><]U\d+$", data["name"]) is not None + and data["object_codec_id"] is None + ) @classmethod - def _check_json_v3(cls, data: JSON) -> TypeGuard[FixedLengthUTF32JSONV3]: + def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[FixedLengthUTF32JSONV3]: return ( isinstance(data, dict) and set(data.keys()) == {"name", "configuration"} @@ -95,15 +103,17 @@ def _check_json_v3(cls, data: JSON) -> TypeGuard[FixedLengthUTF32JSONV3]: and isinstance(data["configuration"]["length_bytes"], int) ) - @overload - def to_json(self, zarr_format: Literal[2]) -> str: ... + @overload # type: ignore[override] + def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[str, None]: ... @overload def to_json(self, zarr_format: Literal[3]) -> FixedLengthUTF32JSONV3: ... - def to_json(self, zarr_format: ZarrFormat) -> str | FixedLengthUTF32JSONV3: + def to_json( + self, zarr_format: ZarrFormat + ) -> DTypeConfig_V2[str, None] | FixedLengthUTF32JSONV3: if zarr_format == 2: - return self.to_native_dtype().str + return {"name": self.to_native_dtype().str, "object_codec_id": None} elif zarr_format == 3: v3_unstable_dtype_warning(self) return { @@ -113,16 +123,17 @@ def to_json(self, zarr_format: ZarrFormat) -> str | FixedLengthUTF32JSONV3: raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod - def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self: - if cls._check_json_v2(data, object_codec_id=object_codec_id): + def _from_json_v2(cls, data: DTypeJSON) -> Self: + if cls._check_json_v2(data): # Construct the numpy dtype instead of string parsing. - return cls.from_native_dtype(np.dtype(data)) + name = data["name"] + return cls.from_native_dtype(np.dtype(name)) raise DataTypeValidationError( f"Invalid JSON representation of {cls.__name__}. 
Got {data!r}, expected a string representation of a numpy U dtype." ) @classmethod - def from_json_v3(cls, data: JSON) -> Self: + def _from_json_v3(cls, data: DTypeJSON) -> Self: if cls._check_json_v3(data): return cls(length=data["configuration"]["length_bytes"] // cls.code_point_bytes) msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected {cls._zarr_v3_name}." @@ -197,21 +208,26 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: @classmethod def _check_json_v2( - cls, data: JSON, *, object_codec_id: str | None = None - ) -> TypeGuard[Literal["|O"]]: + cls, + data: DTypeJSON, + ) -> TypeGuard[DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8"]]]: """ Check that the input is a valid JSON representation of a numpy O dtype, and that the object codec id is appropriate for variable-length UTF-8 strings. """ - return data == "|O" and object_codec_id == cls.object_codec_id + return ( + check_dtype_spec_v2(data) + and data["name"] == "|O" + and data["object_codec_id"] == cls.object_codec_id + ) @classmethod - def _check_json_v3(cls, data: JSON) -> TypeGuard[Literal["variable_length_utf8"]]: + def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[Literal["variable_length_utf8"]]: return data == cls._zarr_v3_name @classmethod - def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self: - if cls._check_json_v2(data, object_codec_id=object_codec_id): + def _from_json_v2(cls, data: DTypeJSON) -> Self: + if cls._check_json_v2(data): return cls() msg = ( f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string '|O'" @@ -219,35 +235,29 @@ def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self raise DataTypeValidationError(msg) @classmethod - def from_json_v3(cls, data: JSON) -> Self: + def _from_json_v3(cls, data: DTypeJSON) -> Self: if cls._check_json_v3(data): return cls() msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected {cls._zarr_v3_name}." 
raise DataTypeValidationError(msg) - @overload - def to_json(self, zarr_format: Literal[2]) -> Literal["|O"]: ... + @overload # type: ignore[override] + def to_json( + self, zarr_format: Literal[2] + ) -> DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8"]]: ... @overload def to_json(self, zarr_format: Literal[3]) -> Literal["variable_length_utf8"]: ... - def to_json(self, zarr_format: ZarrFormat) -> Literal["|O", "variable_length_utf8"]: + def to_json( + self, zarr_format: ZarrFormat + ) -> DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8"]] | Literal["variable_length_utf8"]: if zarr_format == 2: - # Note: unlike many other numpy data types, we don't serialize the .str attribute - # of the data type to JSON. This is because Zarr was using `|O` for strings before the - # numpy variable length string data type existed, and we want to be consistent with - # that practice - return "|O" + return {"name": "|O", "object_codec_id": self.object_codec_id} elif zarr_format == 3: v3_unstable_dtype_warning(self) return self._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - @classmethod - def _from_json_unchecked( - cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat - ) -> Self: - return cls() - def default_scalar(self) -> str: return "" diff --git a/src/zarr/core/dtype/npy/structured.py b/src/zarr/core/dtype/npy/structured.py index 66dfed87f6..d9e1ff55ae 100644 --- a/src/zarr/core/dtype/npy/structured.py +++ b/src/zarr/core/dtype/npy/structured.py @@ -1,13 +1,19 @@ -from collections.abc import Sequence +from __future__ import annotations + from dataclasses import dataclass -from typing import Literal, Self, TypeGuard, cast, overload +from typing import TYPE_CHECKING, Literal, Self, TypeGuard, cast, overload import numpy as np -from zarr.core.common import JSON, NamedConfig, ZarrFormat from zarr.core.dtype.common import ( DataTypeValidationError, + DTypeConfig_V2, + DTypeJSON, + DTypeSpec_V3, HasItemSize, + 
StructuredName_V2, + check_dtype_spec_v2, + check_structured_dtype_name_v2, v3_unstable_dtype_warning, ) from zarr.core.dtype.npy.common import ( @@ -15,12 +21,16 @@ bytes_to_json, check_json_str, ) -from zarr.core.dtype.wrapper import DTypeJSON_V2, DTypeJSON_V3, TBaseDType, TBaseScalar, ZDType +from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType + +if TYPE_CHECKING: + from collections.abc import Sequence + + from zarr.core.common import JSON, NamedConfig, ZarrFormat StructuredScalarLike = list[object] | tuple[object, ...] | bytes | int -# TODO: tighten this up, get a v3 spec in place, handle endianness, etc. @dataclass(frozen=True, kw_only=True) class Structured(ZDType[np.dtypes.VoidDType[int], np.void], HasItemSize): dtype_cls = np.dtypes.VoidDType # type: ignore[assignment] @@ -69,24 +79,20 @@ def to_native_dtype(self) -> np.dtypes.VoidDType[int]: @classmethod def _check_json_v2( - cls, data: JSON, *, object_codec_id: str | None = None - ) -> TypeGuard[list[object]]: - # the actual JSON form is recursive and hard to annotate, so we give up and do - # list[object] for now - + cls, + data: DTypeJSON, + ) -> TypeGuard[DTypeConfig_V2[StructuredName_V2, None]]: return ( - not isinstance(data, str) - and isinstance(data, Sequence) - and all( - not isinstance(field, str) and isinstance(field, Sequence) and len(field) == 2 - for field in data - ) + check_dtype_spec_v2(data) + and not isinstance(data["name"], str) + and check_structured_dtype_name_v2(data["name"]) + and data["object_codec_id"] is None ) @classmethod def _check_json_v3( - cls, data: JSON - ) -> TypeGuard[NamedConfig[Literal["structured"], dict[str, Sequence[tuple[str, JSON]]]]]: + cls, data: DTypeJSON + ) -> TypeGuard[NamedConfig[Literal["structured"], dict[str, Sequence[tuple[str, DTypeJSON]]]]]: return ( isinstance(data, dict) and set(data.keys()) == {"name", "configuration"} @@ -96,9 +102,9 @@ def _check_json_v3( ) @classmethod - def from_json_v2(cls, data: JSON, *, object_codec_id: str 
| None = None) -> Self: + def _from_json_v2(cls, data: DTypeJSON) -> Self: # avoid circular import - from zarr.core.dtype import get_data_type_from_json_v2 + from zarr.core.dtype import get_data_type_from_json if cls._check_json_v2(data): # structured dtypes are constructed directly from a list of lists @@ -106,46 +112,59 @@ def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self # dtypes from containing object dtypes. return cls( fields=tuple( # type: ignore[misc] - (f_name, get_data_type_from_json_v2(f_dtype, object_codec_id=None)) # type: ignore[has-type] - for f_name, f_dtype in data + ( # type: ignore[misc] + f_name, + get_data_type_from_json( + {"name": f_dtype, "object_codec_id": None}, zarr_format=2 + ), + ) + for f_name, f_dtype in data["name"] ) ) msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a JSON array of arrays" raise DataTypeValidationError(msg) @classmethod - def from_json_v3(cls, data: JSON) -> Self: + def _from_json_v3(cls, data: DTypeJSON) -> Self: # avoid circular import - from zarr.core.dtype import get_data_type_from_json_v3 + from zarr.core.dtype import get_data_type_from_json if cls._check_json_v3(data): config = data["configuration"] meta_fields = config["fields"] return cls( fields=tuple( - (f_name, get_data_type_from_json_v3(f_dtype)) for f_name, f_dtype in meta_fields + (f_name, get_data_type_from_json(f_dtype, zarr_format=3)) + for f_name, f_dtype in meta_fields ) ) msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a JSON object with the key {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeJSON_V2: ... + @overload # type: ignore[override] + def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[StructuredName_V2, None]: ... @overload - def to_json(self, zarr_format: Literal[3]) -> DTypeJSON_V3: ... + def to_json(self, zarr_format: Literal[3]) -> DTypeSpec_V3: ... 
- def to_json(self, zarr_format: ZarrFormat) -> DTypeJSON_V3 | DTypeJSON_V2: - fields = [ - (f_name, f_dtype.to_json(zarr_format=zarr_format)) for f_name, f_dtype in self.fields - ] + def to_json( + self, zarr_format: ZarrFormat + ) -> DTypeConfig_V2[StructuredName_V2, None] | DTypeSpec_V3: if zarr_format == 2: - return fields + fields = [ + [f_name, f_dtype.to_json(zarr_format=zarr_format)["name"]] + for f_name, f_dtype in self.fields + ] + return {"name": fields, "object_codec_id": None} elif zarr_format == 3: v3_unstable_dtype_warning(self) + fields = [ + [f_name, f_dtype.to_json(zarr_format=zarr_format)] # type: ignore[list-item] + for f_name, f_dtype in self.fields + ] base_dict = {"name": self._zarr_v3_name} base_dict["configuration"] = {"fields": fields} # type: ignore[assignment] - return cast("DTypeJSON_V3", base_dict) + return cast("DTypeSpec_V3", base_dict) raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def _check_scalar(self, data: object) -> TypeGuard[StructuredScalarLike]: diff --git a/src/zarr/core/dtype/npy/time.py b/src/zarr/core/dtype/npy/time.py index a99703dd3e..1f9080475c 100644 --- a/src/zarr/core/dtype/npy/time.py +++ b/src/zarr/core/dtype/npy/time.py @@ -18,8 +18,16 @@ import numpy as np from zarr.core.common import NamedConfig -from zarr.core.dtype.common import DataTypeValidationError, HasEndianness, HasItemSize +from zarr.core.dtype.common import ( + DataTypeValidationError, + DTypeConfig_V2, + DTypeJSON, + HasEndianness, + HasItemSize, + check_dtype_spec_v2, +) from zarr.core.dtype.npy.common import ( + DATETIME_UNIT, DateTimeUnit, check_json_int, endianness_to_numpy_str, @@ -140,14 +148,17 @@ def to_native_dtype(self) -> BaseTimeDType_co: dtype_string = f"{self._numpy_name}[{self.scale_factor}{self.unit}]" return np.dtype(dtype_string).newbyteorder(endianness_to_numpy_str(self.endianness)) # type: ignore[return-value] - @overload - def to_json(self, zarr_format: Literal[2]) -> str: ... 
+ @overload # type: ignore[override] + def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[str, None]: ... @overload def to_json(self, zarr_format: Literal[3]) -> DateTime64JSONV3 | TimeDelta64JSONV3: ... - def to_json(self, zarr_format: ZarrFormat) -> str | DateTime64JSONV3 | TimeDelta64JSONV3: + def to_json( + self, zarr_format: ZarrFormat + ) -> DTypeConfig_V2[str, None] | DateTime64JSONV3 | TimeDelta64JSONV3: if zarr_format == 2: - return cast("str", self.to_native_dtype().str) + name = self.to_native_dtype().str + return {"name": name, "object_codec_id": None} elif zarr_format == 3: return cast( "DateTime64JSONV3 | TimeDelta64JSONV3", @@ -185,22 +196,25 @@ class TimeDelta64(TimeDTypeBase[np.dtypes.TimeDelta64DType, np.timedelta64], Has unit: DateTimeUnit = "generic" @classmethod - def _check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: + def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]]: + if not check_dtype_spec_v2(data): + return False + name = data["name"] # match m[M], etc # consider making this a standalone function - if not isinstance(data, str): + if not isinstance(name, str): return False - if not data.startswith(cls._zarr_v2_names): + if not name.startswith(cls._zarr_v2_names): return False - if len(data) == 3: + if len(name) == 3: # no unit, and # we already checked that this string is either m8 return True else: - return data[4:-1].endswith(get_args(DateTimeUnit)) and data[-1] == "]" + return name[4:-1].endswith(DATETIME_UNIT) and name[-1] == "]" @classmethod - def _check_json_v3(cls, data: JSON) -> TypeGuard[DateTime64JSONV3]: + def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[DateTime64JSONV3]: return ( isinstance(data, dict) and set(data.keys()) == {"name", "configuration"} @@ -210,9 +224,10 @@ def _check_json_v3(cls, data: JSON) -> TypeGuard[DateTime64JSONV3]: ) @classmethod - def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self: + def 
_from_json_v2(cls, data: DTypeJSON) -> Self: if cls._check_json_v2(data): - return cls.from_native_dtype(np.dtype(data)) + name = data["name"] + return cls.from_native_dtype(np.dtype(name)) msg = ( f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a string " f"representation of an instance of {cls.dtype_cls}" # type: ignore[has-type] @@ -220,7 +235,7 @@ def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self raise DataTypeValidationError(msg) @classmethod - def from_json_v3(cls, data: JSON) -> Self: + def _from_json_v3(cls, data: DTypeJSON) -> Self: if cls._check_json_v3(data): unit = data["configuration"]["unit"] scale_factor = data["configuration"]["scale_factor"] @@ -266,7 +281,28 @@ class DateTime64(TimeDTypeBase[np.dtypes.DateTime64DType, np.datetime64], HasEnd scale_factor: int = 1 @classmethod - def _check_json_v3(cls, data: JSON) -> TypeGuard[DateTime64JSONV3]: + def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]]: + """ + Check that JSON input is a string representation of a NumPy datetime64 data type, like "M8[10s]". This function can be used as a type guard to narrow the type of unknown JSON + input. 
+ """ + if not check_dtype_spec_v2(data): + return False + name = data["name"] + if not isinstance(name, str): + return False + if not name.startswith(cls._zarr_v2_names): + return False + if len(name) == 3: + # no unit, and + # we already checked that this string is either M8 + return True + else: + return name[4:-1].endswith(DATETIME_UNIT) and name[-1] == "]" + + @classmethod + def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[DateTime64JSONV3]: return ( isinstance(data, dict) and set(data.keys()) == {"name", "configuration"} @@ -276,9 +312,10 @@ def _check_json_v3(cls, data: JSON) -> TypeGuard[DateTime64JSONV3]: ) @classmethod - def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self: + def _from_json_v2(cls, data: DTypeJSON) -> Self: if cls._check_json_v2(data): - return cls.from_native_dtype(np.dtype(data)) + name = data["name"] + return cls.from_native_dtype(np.dtype(name)) msg = ( f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a string " f"representation of an instance of {cls.dtype_cls}" # type: ignore[has-type] @@ -286,7 +323,7 @@ def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self raise DataTypeValidationError(msg) @classmethod - def from_json_v3(cls, data: JSON) -> Self: + def _from_json_v3(cls, data: DTypeJSON) -> Self: if cls._check_json_v3(data): unit = data["configuration"]["unit"] scale_factor = data["configuration"]["scale_factor"] @@ -320,21 +357,3 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.datetim if check_json_time(data): return self._cast_scalar_unchecked(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") # pragma: no cover - - @classmethod - def _check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: - """ - Check that JSON input is a string representation of a NumPy datetime64 data type, like "M8[10s]". 
This function can be used as a type guard to narrow the type of unknown JSON - input. - """ - if not isinstance(data, str): - return False - if not data.startswith(cls._zarr_v2_names): - return False - if len(data) == 3: - # no unit, and - # we already checked that this string is either M8 - return True - else: - return data[4:-1].endswith(get_args(DateTimeUnit)) and data[-1] == "]" diff --git a/src/zarr/core/dtype/registry.py b/src/zarr/core/dtype/registry.py index 308bde602c..1d2a97a90a 100644 --- a/src/zarr/core/dtype/registry.py +++ b/src/zarr/core/dtype/registry.py @@ -6,12 +6,15 @@ import numpy as np -from zarr.core.dtype.common import DataTypeValidationError +from zarr.core.dtype.common import ( + DataTypeValidationError, + DTypeJSON, +) if TYPE_CHECKING: from importlib.metadata import EntryPoint - from zarr.core.common import JSON + from zarr.core.common import ZarrFormat from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType @@ -74,26 +77,14 @@ def match_dtype(self, dtype: TBaseDType) -> ZDType[TBaseDType, TBaseScalar]: "For more information, see https://github.com/zarr-developers/zarr-python/issues/3117" ) raise ValueError(msg) - raise ValueError(f"No data type wrapper found that matches dtype '{dtype}'") + raise ValueError(f"No Zarr data type found that matches dtype '{dtype!r}'") - def match_json_v2( - self, data: JSON, *, object_codec_id: str | None = None + def match_json( + self, data: DTypeJSON, *, zarr_format: ZarrFormat ) -> ZDType[TBaseDType, TBaseScalar]: - # The dtype field in zarr v2 JSON metadata is not unique across different distinct data types. - # Specifically, multiple distinct data types all use the "|O" data type representation. - # These must be disambiguated by the presence of an "object codec", which is a codec - # like variable-length utf8 encoding for strings. 
for val in self.contents.values(): try: - return val.from_json_v2(data, object_codec_id=object_codec_id) + return val.from_json(data, zarr_format=zarr_format) except DataTypeValidationError: pass - raise ValueError(f"No data type wrapper found that matches {data}") - - def match_json_v3(self, data: JSON) -> ZDType[TBaseDType, TBaseScalar]: - for val in self.contents.values(): - try: - return val.from_json_v3(data) - except DataTypeValidationError: - pass - raise ValueError(f"No data type wrapper found that matches {data}") + raise ValueError(f"No Zarr data type found that matches {data!r}") diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index fa34dc000d..e974712e38 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -24,7 +24,6 @@ from __future__ import annotations from abc import ABC, abstractmethod -from collections.abc import Mapping, Sequence from dataclasses import dataclass from typing import ( TYPE_CHECKING, @@ -41,6 +40,7 @@ if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat + from zarr.core.dtype.common import DTypeJSON, DTypeSpec_V2, DTypeSpec_V3 # This the upper bound for the scalar types we support. It's numpy scalars + str, # because the new variable-length string dtype in numpy does not have a corresponding scalar type @@ -55,10 +55,6 @@ TScalar_co = TypeVar("TScalar_co", bound=TBaseScalar, covariant=True) TDType_co = TypeVar("TDType_co", bound=TBaseDType, covariant=True) -# These types should include all JSON-serializable types that can be used to represent a data type. 
-DTypeJSON_V2 = str | Sequence[object] -DTypeJSON_V3 = str | Mapping[str, object] - @dataclass(frozen=True, kw_only=True, slots=True) class ZDType(Generic[TDType_co, TScalar_co], ABC): @@ -140,94 +136,45 @@ def to_native_dtype(self: Self) -> TDType_co: @classmethod @abstractmethod - def _check_json_v2( - cls: type[Self], data: JSON, *, object_codec_id: str | None = None - ) -> TypeGuard[DTypeJSON_V2]: - """ - Check that JSON data matches the Zarr V2 JSON serialization of this ZDType. - - Parameters - ---------- - data : JSON - The JSON representation of the data type. - - object_codec_id : str | None - The string identifier of an object codec, if applicable. Object codecs are specific - numcodecs codecs that zarr-python 2.x used to serialize numpy "Object" scalars. - For example, a dtype field set to ``"|O"`` with an object codec ID of "vlen-utf8" - indicates that the data type is a variable-length string. - - Zarr V3 has no such logic, so this parameter is only used for Zarr V2 compatibility. - - Returns - ------- - Bool - True if the JSON representation matches this data type, False otherwise. - """ - ... + def _from_json_v2(cls: type[Self], data: DTypeJSON) -> Self: ... @classmethod @abstractmethod - def _check_json_v3(cls: type[Self], data: JSON) -> TypeGuard[DTypeJSON_V3]: - """ - Check that JSON data matches the Zarr V3 JSON serialization of this ZDType. - - Parameters - ---------- - data : JSON - The JSON representation of the data type. - - Returns - ------- - Bool - True if the JSON representation matches, False otherwise. - """ - ... + def _from_json_v3(cls: type[Self], data: DTypeJSON) -> Self: ... @classmethod - @abstractmethod - def from_json_v2(cls: type[Self], data: JSON, *, object_codec_id: str | None = None) -> Self: + def from_json(cls: type[Self], data: DTypeJSON, *, zarr_format: ZarrFormat) -> Self: """ - Create an instance of this ZDType from Zarr V2 JSON data. + Create an instance of this ZDType from JSON data. 
Parameters ---------- - data : JSON - The JSON representation of the data type. - - Returns - ------- - Self - The wrapped data type. - """ - ... - - @classmethod - @abstractmethod - def from_json_v3(cls: type[Self], data: JSON) -> Self: - """ - Create an instance of this ZDType from Zarr V3 JSON data. + data : DTypeJSON + The JSON representation of the data type. The type annotation includes + Mapping[str, object] to accommodate typed dictionaries. - Parameters - ---------- - data : JSON - The JSON representation of the data type. + zarr_format : ZarrFormat + The zarr format version. Returns ------- Self The wrapped data type. """ - ... + if zarr_format == 2: + return cls._from_json_v2(data) + if zarr_format == 3: + return cls._from_json_v3(data) + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeJSON_V2: ... + def to_json(self, zarr_format: Literal[2]) -> DTypeSpec_V2: ... @overload - def to_json(self, zarr_format: Literal[3]) -> DTypeJSON_V3: ... + def to_json(self, zarr_format: Literal[3]) -> DTypeSpec_V3: ... @abstractmethod - def to_json(self, zarr_format: ZarrFormat) -> DTypeJSON_V2 | DTypeJSON_V3: + def to_json(self, zarr_format: ZarrFormat) -> DTypeSpec_V2 | DTypeSpec_V3: """ Serialize this ZDType to JSON. 
diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index ec1ac42264..3ac75e0418 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -3,14 +3,14 @@ import warnings from collections.abc import Iterable, Sequence from functools import cached_property -from typing import TYPE_CHECKING, Any, TypeAlias, TypedDict +from typing import TYPE_CHECKING, Any, TypeAlias, TypedDict, cast import numcodecs.abc from zarr.abc.metadata import Metadata from zarr.core.chunk_grids import RegularChunkGrid -from zarr.core.dtype import get_data_type_from_json_v2 -from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, TDType_co, TScalar_co, ZDType +from zarr.core.dtype import get_data_type_from_json +from zarr.core.dtype.common import OBJECT_CODEC_IDS, DTypeSpec_V2 if TYPE_CHECKING: from typing import Literal, Self @@ -19,6 +19,13 @@ from zarr.core.buffer import Buffer, BufferPrototype from zarr.core.common import ChunkCoords + from zarr.core.dtype.wrapper import ( + TBaseDType, + TBaseScalar, + TDType_co, + TScalar_co, + ZDType, + ) import json from dataclasses import dataclass, field, fields, replace @@ -28,7 +35,13 @@ from zarr.core.array_spec import ArrayConfig, ArraySpec from zarr.core.chunk_key_encodings import parse_separator -from zarr.core.common import JSON, ZARRAY_JSON, ZATTRS_JSON, MemoryOrder, parse_shapelike +from zarr.core.common import ( + JSON, + ZARRAY_JSON, + ZATTRS_JSON, + MemoryOrder, + parse_shapelike, +) from zarr.core.config import config, parse_indexing_order from zarr.core.metadata.common import parse_attributes @@ -45,9 +58,6 @@ class ArrayV2MetadataDict(TypedDict): # Union of acceptable types for v2 compressors CompressorLikev2: TypeAlias = dict[str, JSON] | numcodecs.abc.Codec | None -# These are the ids of the known object codecs for zarr v2. 
-ObjectCodecIds = ("vlen-utf8", "vlen-bytes", "vlen-array", "pickle", "json2", "msgpack2") - @dataclass(frozen=True, kw_only=True) class ArrayV2Metadata(Metadata): @@ -80,9 +90,6 @@ def __init__( """ shape_parsed = parse_shapelike(shape) chunks_parsed = parse_shapelike(chunks) - # TODO: remove this - if not isinstance(dtype, ZDType): - raise TypeError compressor_parsed = parse_compressor(compressor) order_parsed = parse_indexing_order(order) dimension_separator_parsed = parse_separator(dimension_separator) @@ -141,22 +148,22 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: # To resolve a numpy object dtype array, we need to search for an object codec, # which could be in filters or as a compressor. - # we will use a hard-coded list of object codecs for this search. - object_codec_id: str | None = None - maybe_object_codecs = (data.get("filters"), data.get("compressor")) - for maybe_object_codec in maybe_object_codecs: - if isinstance(maybe_object_codec, Sequence): - for codec in maybe_object_codec: - if isinstance(codec, dict) and codec.get("id") in ObjectCodecIds: - object_codec_id = codec["id"] - break - elif ( - isinstance(maybe_object_codec, dict) - and maybe_object_codec.get("id") in ObjectCodecIds - ): - object_codec_id = maybe_object_codec["id"] - break - dtype = get_data_type_from_json_v2(data["dtype"], object_codec_id=object_codec_id) + # we will reference a hard-coded collection of object codec ids for this search. 
+ + _filters, _compressor = (data.get("filters"), data.get("compressor")) + if _filters is not None: + _filters = cast("tuple[dict[str, JSON], ...]", _filters) + object_codec_id = get_object_codec_id(tuple(_filters) + (_compressor,)) + else: + object_codec_id = get_object_codec_id((_compressor,)) + # we add a layer of indirection here around the dtype attribute of the array metadata + # because we also need to know the object codec id, if any, to resolve the data type + dtype_spec: DTypeSpec_V2 = { + "name": data["dtype"], + "object_codec_id": object_codec_id, + } + dtype = get_data_type_from_json(dtype_spec, zarr_format=2) + _data["dtype"] = dtype fill_value_encoded = _data.get("fill_value") if fill_value_encoded is not None: @@ -216,8 +223,8 @@ def to_dict(self) -> dict[str, JSON]: fill_value = self.dtype.to_json_scalar(self.fill_value, zarr_format=2) zarray_dict["fill_value"] = fill_value - # serialize the dtype after fill value-specific JSON encoding - zarray_dict["dtype"] = self.dtype.to_json(zarr_format=2) # type: ignore[assignment] + # pull the "name" attribute out of the dtype spec returned by self.dtype.to_json + zarray_dict["dtype"] = self.dtype.to_json(zarr_format=2)["name"] return zarray_dict @@ -304,3 +311,21 @@ def parse_metadata(data: ArrayV2Metadata) -> ArrayV2Metadata: ) raise ValueError(msg) return data + + +def get_object_codec_id(maybe_object_codecs: Sequence[JSON]) -> str | None: + """ + Inspect a sequence of codecs / filters for an "object codec", i.e. a codec + that can serialize object arrays to contiguous bytes. Zarr python + maintains a hard-coded set of object codec ids. If any element from the input + has an id that matches one of the hard-coded object codec ids, that id + is returned immediately. 
+ """ + object_codec_id = None + for maybe_object_codec in maybe_object_codecs: + if ( + isinstance(maybe_object_codec, dict) + and maybe_object_codec.get("id") in OBJECT_CODEC_IDS + ): + return cast("str", maybe_object_codec["id"]) + return object_codec_id diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index bd02a67084..84872d3dbd 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -4,11 +4,8 @@ from zarr.abc.metadata import Metadata from zarr.core.buffer.core import default_buffer_prototype -from zarr.core.dtype import ( - VariableLengthUTF8, - ZDType, - get_data_type_from_json_v3, -) +from zarr.core.dtype import VariableLengthUTF8, ZDType, get_data_type_from_json +from zarr.core.dtype.common import check_dtype_spec_v3 if TYPE_CHECKING: from typing import Self @@ -306,7 +303,9 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: _ = parse_node_type_array(_data.pop("node_type")) data_type_json = _data.pop("data_type") - data_type = get_data_type_from_json_v3(data_type_json) + if not check_dtype_spec_v3(data_type_json): + raise ValueError(f"Invalid data_type: {data_type_json!r}") + data_type = get_data_type_from_json(data_type_json, zarr_format=3) # check that the fill value is consistent with the data type try: diff --git a/tests/package_with_entrypoint/__init__.py b/tests/package_with_entrypoint/__init__.py index 834d5654c0..e0d8a52c4d 100644 --- a/tests/package_with_entrypoint/__init__.py +++ b/tests/package_with_entrypoint/__init__.py @@ -1,5 +1,6 @@ -from collections.abc import Iterable -from typing import Any, Literal, Self +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, Literal, Self import numpy as np import numpy.typing as npt @@ -7,11 +8,17 @@ import zarr.core.buffer from zarr.abc.codec import ArrayBytesCodec, CodecInput, CodecPipeline from zarr.codecs import BytesCodec -from zarr.core.array_spec import ArraySpec from zarr.core.buffer import Buffer, NDBuffer -from 
zarr.core.common import JSON, ZarrFormat +from zarr.core.dtype.common import DataTypeValidationError, DTypeJSON, DTypeSpec_V2 from zarr.core.dtype.npy.bool import Bool +if TYPE_CHECKING: + from collections.abc import Iterable + from typing import ClassVar, Literal + + from zarr.core.array_spec import ArraySpec + from zarr.core.common import ZarrFormat + class TestEntrypointCodec(ArrayBytesCodec): is_fixed_size = True @@ -74,13 +81,21 @@ class TestDataType(Bool): This is a "data type" that serializes to "test" """ - _zarr_v3_name = "test" # type: ignore[assignment] + _zarr_v3_name: ClassVar[Literal["test"]] = "test" # type: ignore[assignment] @classmethod - def from_json(cls, data: JSON, zarr_format: Literal[2, 3]) -> Self: - if data == cls._zarr_v3_name: # type: ignore[has-type] + def from_json(cls, data: DTypeJSON, *, zarr_format: Literal[2, 3]) -> Self: + if zarr_format == 2 and data == {"name": cls._zarr_v3_name, "object_codec_id": None}: return cls() - raise ValueError - - def to_json(self, zarr_format: ZarrFormat) -> str: # type: ignore[override] - return self._zarr_v3_name # type: ignore[no-any-return, has-type] + if zarr_format == 3 and data == cls._zarr_v3_name: + return cls() + raise DataTypeValidationError( + f"Invalid JSON representation of {cls.__name__}. 
Got {data!r}" + ) + + def to_json(self, zarr_format: ZarrFormat) -> str | DTypeSpec_V2: # type: ignore[override] + if zarr_format == 2: + return {"name": self._zarr_v3_name, "object_codec_id": None} + if zarr_format == 3: + return self._zarr_v3_name + raise ValueError("zarr_format must be 2 or 3") diff --git a/tests/test_dtype/test_npy/test_bool.py b/tests/test_dtype/test_npy/test_bool.py index 03dc550a9d..010dec2e47 100644 --- a/tests/test_dtype/test_npy/test_bool.py +++ b/tests/test_dtype/test_npy/test_bool.py @@ -2,7 +2,7 @@ import numpy as np -from tests.test_dtype.test_wrapper import BaseTestZDType, V2JsonTestParams +from tests.test_dtype.test_wrapper import BaseTestZDType from zarr.core.dtype.npy.bool import Bool @@ -15,7 +15,7 @@ class TestBool(BaseTestZDType): np.dtype(np.float64), np.dtype(np.uint16), ) - valid_json_v2 = (V2JsonTestParams(dtype="|b1"),) + valid_json_v2 = ({"name": "|b1", "object_codec_id": None},) valid_json_v3 = ("bool",) invalid_json_v2 = ( "|b1", diff --git a/tests/test_dtype/test_npy/test_bytes.py b/tests/test_dtype/test_npy/test_bytes.py index 53636891cb..b7c16f573e 100644 --- a/tests/test_dtype/test_npy/test_bytes.py +++ b/tests/test_dtype/test_npy/test_bytes.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from tests.test_dtype.test_wrapper import BaseTestZDType, V2JsonTestParams +from tests.test_dtype.test_wrapper import BaseTestZDType from zarr.core.dtype.common import UnstableSpecificationWarning from zarr.core.dtype.npy.bytes import NullTerminatedBytes, RawBytes, VariableLengthBytes @@ -15,9 +15,9 @@ class TestNullTerminatedBytes(BaseTestZDType): np.dtype("|U10"), ) valid_json_v2 = ( - V2JsonTestParams(dtype="|S0"), - V2JsonTestParams(dtype="|S2"), - V2JsonTestParams(dtype="|S4"), + {"name": "|S0", "object_codec_id": None}, + {"name": "|S2", "object_codec_id": None}, + {"name": "|S4", "object_codec_id": None}, ) valid_json_v3 = ({"name": "null_terminated_bytes", "configuration": {"length_bytes": 10}},) invalid_json_v2 = ( @@ 
-60,7 +60,7 @@ class TestRawBytes(BaseTestZDType): np.dtype(np.float64), np.dtype("|S10"), ) - valid_json_v2 = (V2JsonTestParams(dtype="|V10"),) + valid_json_v2 = ({"name": "|V10", "object_codec_id": None},) valid_json_v3 = ( {"name": "raw_bytes", "configuration": {"length_bytes": 0}}, {"name": "raw_bytes", "configuration": {"length_bytes": 8}}, @@ -106,7 +106,7 @@ class TestVariableLengthBytes(BaseTestZDType): np.dtype(np.float64), np.dtype("|U10"), ) - valid_json_v2 = (V2JsonTestParams(dtype="|O", object_codec_id="vlen-bytes"),) + valid_json_v2 = ({"name": "|O", "object_codec_id": "vlen-bytes"},) valid_json_v3 = ("variable_length_bytes",) invalid_json_v2 = ( "|S", diff --git a/tests/test_dtype/test_npy/test_complex.py b/tests/test_dtype/test_npy/test_complex.py index fd216d8415..b6a1e799eb 100644 --- a/tests/test_dtype/test_npy/test_complex.py +++ b/tests/test_dtype/test_npy/test_complex.py @@ -4,7 +4,7 @@ import numpy as np -from tests.test_dtype.test_wrapper import BaseTestZDType, V2JsonTestParams +from tests.test_dtype.test_wrapper import BaseTestZDType from zarr.core.dtype.npy.complex import Complex64, Complex128 @@ -23,7 +23,10 @@ class TestComplex64(_BaseTestFloat): np.dtype(np.float64), np.dtype(np.complex128), ) - valid_json_v2 = (V2JsonTestParams(dtype=">c8"), V2JsonTestParams(dtype="c8", "object_codec_id": None}, + {"name": "c16"), V2JsonTestParams(dtype="c16", "object_codec_id": None}, + {"name": "f2"), V2JsonTestParams(dtype="f2", "object_codec_id": None}, + {"name": "f4"), V2JsonTestParams(dtype="f4", "object_codec_id": None}, + {"name": "f8"), V2JsonTestParams(dtype="f8", "object_codec_id": None}, + {"name": "i1", @@ -46,7 +46,10 @@ class TestInt16(BaseTestZDType): np.dtype(np.uint16), np.dtype(np.float64), ) - valid_json_v2 = (V2JsonTestParams(dtype=">i2"), V2JsonTestParams(dtype="i2", "object_codec_id": None}, + {"name": "i4"), V2JsonTestParams(dtype="i4", "object_codec_id": None}, + {"name": "i8"), V2JsonTestParams(dtype="i8", "object_codec_id": 
None}, + {"name": "u2"), V2JsonTestParams(dtype="u2", "object_codec_id": None}, + {"name": "u4"), V2JsonTestParams(dtype="u4", "object_codec_id": None}, + {"name": "u8"), V2JsonTestParams(dtype="u8", "object_codec_id": None}, + {"name": "U10"), V2JsonTestParams(dtype="U10", "object_codec_id": None}, + {"name": "i4"), ("field2", ">f8")]), - V2JsonTestParams(dtype=[("field1", ">i8"), ("field2", ">i4")]), + {"name": [["field1", ">i4"], ["field2", ">f8"]], "object_codec_id": None}, + {"name": [["field1", ">i8"], ["field2", ">i4"]], "object_codec_id": None}, ) valid_json_v3 = ( { "name": "structured", "configuration": { "fields": [ - ("field1", "int32"), - ("field2", "float64"), + ["field1", "int32"], + ["field2", "float64"], ] }, }, @@ -43,17 +43,17 @@ class TestStructured(BaseTestZDType): "name": "structured", "configuration": { "fields": [ - ( + [ "field1", { "name": "numpy.datetime64", "configuration": {"unit": "s", "scale_factor": 1}, }, - ), - ( + ], + [ "field2", {"name": "fixed_length_utf32", "configuration": {"length_bytes": 32}}, - ), + ], ] }, }, diff --git a/tests/test_dtype/test_npy/test_time.py b/tests/test_dtype/test_npy/test_time.py index 96281434cd..e201be5cf6 100644 --- a/tests/test_dtype/test_npy/test_time.py +++ b/tests/test_dtype/test_npy/test_time.py @@ -6,7 +6,7 @@ import numpy as np import pytest -from tests.test_dtype.test_wrapper import BaseTestZDType, V2JsonTestParams +from tests.test_dtype.test_wrapper import BaseTestZDType from zarr.core.dtype.npy.common import DateTimeUnit from zarr.core.dtype.npy.time import DateTime64, TimeDelta64, datetime_from_int @@ -35,10 +35,10 @@ class TestDateTime64(_TestTimeBase): np.dtype("timedelta64[ns]"), ) valid_json_v2 = ( - V2JsonTestParams(dtype=">M8"), - V2JsonTestParams(dtype=">M8[s]"), - V2JsonTestParams(dtype="M8", "object_codec_id": None}, + {"name": ">M8[s]", "object_codec_id": None}, + {"name": "m8", "object_codec_id": None}, + {"name": ">m8[s]", "object_codec_id": None}, + {"name": " None: """ 
-@dataclass(frozen=True, kw_only=True, slots=True) -class V2JsonTestParams: - dtype: str | dict[str, object] | list[object] - object_codec_id: str | None = None - - class BaseTestZDType: """ A base class for testing ZDType subclasses. This class works in conjunction with the custom @@ -73,10 +66,10 @@ class BaseTestZDType: valid_dtype: ClassVar[tuple[TBaseDType, ...]] = () invalid_dtype: ClassVar[tuple[TBaseDType, ...]] = () - valid_json_v2: ClassVar[tuple[V2JsonTestParams, ...]] = () + valid_json_v2: ClassVar[tuple[DTypeSpec_V2, ...]] = () invalid_json_v2: ClassVar[tuple[str | dict[str, object] | list[object], ...]] = () - valid_json_v3: ClassVar[tuple[str | dict[str, object], ...]] = () + valid_json_v3: ClassVar[tuple[DTypeSpec_V3, ...]] = () invalid_json_v3: ClassVar[tuple[str | dict[str, object], ...]] = () # for testing scalar round-trip serialization, we need a tuple of (data type json, scalar json) @@ -108,16 +101,13 @@ def test_from_dtype_roundtrip(self, valid_dtype: Any) -> None: zdtype = self.test_cls.from_native_dtype(valid_dtype) assert zdtype.to_native_dtype() == valid_dtype - def test_from_json_roundtrip_v2(self, valid_json_v2: V2JsonTestParams) -> None: - zdtype = self.test_cls.from_json_v2( - valid_json_v2.dtype, # type: ignore[arg-type] - object_codec_id=valid_json_v2.object_codec_id, - ) - assert zdtype.to_json(zarr_format=2) == valid_json_v2.dtype + def test_from_json_roundtrip_v2(self, valid_json_v2: DTypeSpec_V2) -> None: + zdtype = self.test_cls.from_json(valid_json_v2, zarr_format=2) + assert zdtype.to_json(zarr_format=2) == valid_json_v2 @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") - def test_from_json_roundtrip_v3(self, valid_json_v3: Any) -> None: - zdtype = self.test_cls.from_json_v3(valid_json_v3) + def test_from_json_roundtrip_v3(self, valid_json_v3: DTypeSpec_V3) -> None: + zdtype = self.test_cls.from_json(valid_json_v3, zarr_format=3) assert zdtype.to_json(zarr_format=3) == valid_json_v3 
def test_scalar_roundtrip_v2(self, scalar_v2_params: tuple[ZDType[Any, Any], Any]) -> None: diff --git a/tests/test_dtype_registry.py b/tests/test_dtype_registry.py index c4225874a4..c7d5f90065 100644 --- a/tests/test_dtype_registry.py +++ b/tests/test_dtype_registry.py @@ -23,10 +23,9 @@ TBaseScalar, ZDType, data_type_registry, - get_data_type_from_json_v3, + get_data_type_from_json, parse_data_type, ) -from zarr.core.dtype.common import HasObjectCodec if TYPE_CHECKING: from collections.abc import Generator @@ -85,14 +84,14 @@ def test_unregistered_dtype(data_type_registry_fixture: DataTypeRegistry) -> Non """ Test that match_dtype raises an error if the dtype is not registered. """ - outside_dtype = "int8" - with pytest.raises( - ValueError, match=f"No data type wrapper found that matches dtype '{outside_dtype}'" - ): - data_type_registry_fixture.match_dtype(np.dtype(outside_dtype)) + outside_dtype_name = "int8" + outside_dtype = np.dtype(outside_dtype_name) + msg = f"No Zarr data type found that matches dtype '{outside_dtype!r}'" + with pytest.raises(ValueError, match=re.escape(msg)): + data_type_registry_fixture.match_dtype(outside_dtype) with pytest.raises(KeyError): - data_type_registry_fixture.get(outside_dtype) + data_type_registry_fixture.get(outside_dtype_name) @staticmethod @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") @@ -110,23 +109,12 @@ def test_registered_dtypes_match_dtype(zdtype: ZDType[TBaseDType, TBaseScalar]) def test_registered_dtypes_match_json( zdtype: ZDType[TBaseDType, TBaseScalar], zarr_format: ZarrFormat ) -> None: - if zarr_format == 2: - if isinstance(zdtype, HasObjectCodec): - object_codec_id = zdtype.object_codec_id - else: - object_codec_id = None - assert ( - data_type_registry.match_json_v2( - zdtype.to_json(zarr_format=zarr_format), # type: ignore[arg-type] - object_codec_id=object_codec_id, - ) - == zdtype - ) - else: - skip_object_dtype(zdtype) - assert ( - 
data_type_registry.match_json_v3(zdtype.to_json(zarr_format=zarr_format)) == zdtype # type: ignore[arg-type] + assert ( + data_type_registry.match_json( + zdtype.to_json(zarr_format=zarr_format), zarr_format=zarr_format ) + == zdtype + ) @staticmethod @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") @@ -148,14 +136,14 @@ def test_match_dtype_unique( dtype_instance = zdtype.to_native_dtype() - msg = f"No data type wrapper found that matches dtype '{dtype_instance}'" + msg = f"No Zarr data type found that matches dtype '{dtype_instance!r}'" with pytest.raises(ValueError, match=re.escape(msg)): data_type_registry_fixture.match_dtype(dtype_instance) instance_dict = zdtype.to_json(zarr_format=zarr_format) - msg = f"No data type wrapper found that matches {instance_dict}" + msg = f"No Zarr data type found that matches {instance_dict!r}" with pytest.raises(ValueError, match=re.escape(msg)): - data_type_registry_fixture.match_json_v3(instance_dict) # type: ignore[arg-type] + data_type_registry_fixture.match_json(instance_dict, zarr_format=zarr_format) # this is copied from the registry tests -- we should deduplicate @@ -181,7 +169,7 @@ def test_entrypoint_dtype(zarr_format: ZarrFormat) -> None: data_type_registry.lazy_load() instance = TestDataType() dtype_json = instance.to_json(zarr_format=zarr_format) - assert get_data_type_from_json_v3(dtype_json) == instance + assert get_data_type_from_json(dtype_json, zarr_format=zarr_format) == instance data_type_registry.unregister(TestDataType._zarr_v3_name) diff --git a/tests/test_group.py b/tests/test_group.py index 21cbe829a5..60a1fcb9bf 100644 --- a/tests/test_group.py +++ b/tests/test_group.py @@ -23,6 +23,7 @@ from zarr.core._info import GroupInfo from zarr.core.buffer import default_buffer_prototype from zarr.core.config import config as zarr_config +from zarr.core.dtype.common import unpack_dtype_json from zarr.core.dtype.npy.int import UInt8 from zarr.core.group import ( 
ConsolidatedMetadata, @@ -516,7 +517,7 @@ def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat metadata = { "subarray": { "attributes": {}, - "dtype": dtype.to_json(zarr_format=zarr_format), + "dtype": unpack_dtype_json(dtype.to_json(zarr_format=zarr_format)), "fill_value": fill_value, "shape": (1,), "chunks": (1,), @@ -552,7 +553,7 @@ def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat {"configuration": {"endian": "little"}, "name": "bytes"}, {"configuration": {}, "name": "zstd"}, ), - "data_type": dtype.to_json(zarr_format=zarr_format), + "data_type": unpack_dtype_json(dtype.to_json(zarr_format=zarr_format)), "fill_value": fill_value, "node_type": "array", "shape": (1,), From 70da4da15c67d05070aff2d106f20e89e956de03 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sun, 15 Jun 2025 23:19:25 +0200 Subject: [PATCH 130/130] fix dtype doc example --- docs/user-guide/data_types.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index 0150e025e3..87c8efc1f5 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -151,7 +151,7 @@ Serialize to JSON for Zarr V2 and V3 >>> json_v2 = int8.to_json(zarr_format=2) >>> json_v2 - '|i1' + {'name': '|i1', 'object_codec_id': None} >>> json_v3 = int8.to_json(zarr_format=3) >>> json_v3 'int8'