From 64bbbd8fce888cb45ed74763e21381831b2f51a9 Mon Sep 17 00:00:00 2001 From: Gregory Lee Date: Mon, 15 Nov 2021 21:53:45 -0500 Subject: [PATCH 1/7] add v3 store classes Define the StoreV3 class and create v3 versions of most existing stores Add a test_storage_v3.py with test classes inheriting from their v2 counterparts. Only a subset of methods involving differences in v3 behavior were overridden. --- zarr/_storage/store.py | 251 +++++++++- zarr/meta.py | 235 +++++++++- zarr/storage.py | 731 +++++++++++++++++++++++++---- zarr/tests/test_storage.py | 18 +- zarr/tests/test_storage_v3.py | 847 ++++++++++++++++++++++++++++++++++ zarr/tests/util.py | 6 +- 6 files changed, 1996 insertions(+), 92 deletions(-) create mode 100644 zarr/tests/test_storage_v3.py diff --git a/zarr/_storage/store.py b/zarr/_storage/store.py index 6f5bf78e28..0ff9e0c043 100644 --- a/zarr/_storage/store.py +++ b/zarr/_storage/store.py @@ -1,7 +1,10 @@ +import json +import sys from collections.abc import MutableMapping +from string import ascii_letters, digits from typing import Any, List, Optional, Union -from zarr.meta import Metadata2 +from zarr.meta import Metadata2, Metadata3, _default_entry_point_metadata_v3 from zarr.util import normalize_storage_path # v2 store keys @@ -131,6 +134,169 @@ def rmdir(self, path: str = "") -> None: _rmdir_from_keys(self, path) +class StoreV3(BaseStore): + _store_version = 3 + _metadata_class = Metadata3 + + @staticmethod + def _valid_key(key: str) -> bool: + """ + Verify that a key conforms to the specification. + + A key is any string containing only character in the range a-z, A-Z, + 0-9, or in the set /.-_ it will return True if that's the case, False + otherwise. + + In addition, in spec v3, keys can only start with the prefix meta/, + data/ or be exactly zarr.json and should not end with /. This should + not be exposed to the user, and is a store implementation detail, so + this method will raise a ValueError in that case. + """ + if sys.version_info > (3, 7): + if not key.isascii(): + return False + if set(key) - set(ascii_letters + digits + "/.-_"): + return False + + if ( + not key.startswith("data/") + and (not key.startswith("meta/")) + and (not key == "zarr.json") + ): + raise ValueError("keys starts with unexpected value: `{}`".format(key)) + + if key.endswith('/'): + raise ValueError("keys may not end in /") + + return True + + def list_prefix(self, prefix): + if prefix.startswith('/'): + raise ValueError("prefix must not begin with /") + # TODO: force prefix to end with /? + return [k for k in self.list() if k.startswith(prefix)] + + def erase(self, key): + self.__delitem__(key) + + def erase_prefix(self, prefix): + assert prefix.endswith("/") + + if prefix == "/": + all_keys = self.list() + else: + all_keys = self.list_prefix(prefix) + for key in all_keys: + self.erase(key) + + def list_dir(self, prefix): + """ + Note: carefully test this with trailing/leading slashes + """ + if prefix: # allow prefix = "" ? + assert prefix.endswith("/") + + all_keys = self.list_prefix(prefix) + len_prefix = len(prefix) + keys = [] + prefixes = [] + for k in all_keys: + trail = k[len_prefix:] + if "/" not in trail: + keys.append(prefix + trail) + else: + prefixes.append(prefix + trail.split("/", maxsplit=1)[0] + "/") + return keys, list(set(prefixes)) + + def list(self): + if hasattr(self, 'keys'): + return list(self.keys()) + raise NotImplementedError( + "The list method has not been implemented for this store type." + ) + + # TODO: Remove listdir? 
This method is just to match the current V2 stores + # The v3 spec mentions: list, list_dir, list_prefix + def listdir(self, path: str = ""): + if path and not path.endswith("/"): + path = path + "/" + keys, prefixes = self.list_dir(path) + prefixes = [p[len(path):].rstrip("/") for p in prefixes] + keys = [k[len(path):] for k in keys] + return keys + prefixes + + # TODO: rmdir here is identical to the rmdir on Store so could potentially + # move to BaseStore instead. + def rmdir(self, path: str = "") -> None: + if not self.is_erasable(): + raise NotImplementedError( + f'{type(self)} is not erasable, cannot call "rmdir"' + ) # pragma: no cover + path = normalize_storage_path(path) + _rmdir_from_keys(self, path) + + def __contains__(self, key): + # TODO: re-enable this check? + # if not key.startswith(("meta/", "data/")): + # raise ValueError( + # f'Key must start with either "meta/" or "data/". ' + # f'Got {key}' + # ) + return key in self.list() + + def clear(self): + """Remove all items from store.""" + self.erase_prefix("/") + + def __eq__(self, other): + from zarr.storage import KVStoreV3 # avoid circular import + if isinstance(other, KVStoreV3): + return self._mutable_mapping == other._mutable_mapping + else: + return NotImplemented + + @staticmethod + def _ensure_store(store): + """ + We want to make sure internally that zarr stores are always a class + with a specific interface derived from ``Store``, which is slightly + different than ``MutableMapping``. + + We'll do this conversion in a few places automatically + """ + from zarr.storage import KVStoreV3 # avoid circular import + if store is None: + return None + elif isinstance(store, StoreV3): + return store + elif isinstance(store, MutableMapping): + return KVStoreV3(store) + else: + for attr in [ + "keys", + "values", + "get", + "__setitem__", + "__getitem__", + "__delitem__", + "__contains__", + ]: + if not hasattr(store, attr): + break + else: + return KVStoreV3(store) + + raise ValueError( + "v3 stores must be subclasses of StoreV3, " + "if your store exposes the MutableMapping interface wrap it in " + f"Zarr.storage.KVStoreV3. Got {store}" + ) + + +# allow MutableMapping for backwards compatibility +StoreLike = Union[BaseStore, MutableMapping] + + def _path_to_prefix(path: Optional[str]) -> str: # assume path already normalized if path: @@ -140,17 +306,49 @@ def _path_to_prefix(path: Optional[str]) -> str: return prefix +# TODO: Should this return default metadata or raise an Error if zarr.json +# is absent? 
+def _get_hierarchy_metadata(store=None): + meta = _default_entry_point_metadata_v3 + if store is not None: + version = getattr(store, '_store_version', 2) + if version < 3: + raise ValueError("zarr.json hierarchy metadata not stored for " + f"zarr v{version} stores") + if 'zarr.json' in store: + meta = store._metadata_class.decode_hierarchy_metadata(store['zarr.json']) + return meta + + def _rename_from_keys(store: BaseStore, src_path: str, dst_path: str) -> None: # assume path already normalized src_prefix = _path_to_prefix(src_path) dst_prefix = _path_to_prefix(dst_path) - for key in list(store.keys()): - if key.startswith(src_prefix): - new_key = dst_prefix + key.lstrip(src_prefix) - store[new_key] = store.pop(key) - - -def _rmdir_from_keys(store: Union[BaseStore, MutableMapping], path: Optional[str] = None) -> None: + version = getattr(store, '_store_version', 2) + if version == 2: + root_prefixes = [''] + elif version == 3: + root_prefixes = ['meta/root/', 'data/root/'] + for root_prefix in root_prefixes: + _src_prefix = root_prefix + src_prefix + _dst_prefix = root_prefix + dst_prefix + for key in list(store.keys()): + if key.startswith(_src_prefix): + new_key = _dst_prefix + key.lstrip(_src_prefix) + store[new_key] = store.pop(key) + if version == 3: + sfx = _get_hierarchy_metadata(store)['metadata_key_suffix'] + _src_array_json = 'meta/root/' + src_prefix[:-1] + '.array' + sfx + if _src_array_json in store: + new_key = 'meta/root/' + dst_prefix[:-1] + '.array' + sfx + store[new_key] = store.pop(_src_array_json) + _src_group_json = 'meta/root/' + src_prefix[:-1] + '.group' + sfx + if _src_group_json in store: + new_key = 'meta/root/' + dst_prefix[:-1] + '.group' + sfx + store[new_key] = store.pop(_src_group_json) + + +def _rmdir_from_keys(store: StoreLike, path: Optional[str] = None) -> None: # assume path already normalized prefix = _path_to_prefix(path) for key in list(store.keys()): @@ -168,3 +366,40 @@ def _listdir_from_keys(store: BaseStore, path: Optional[str] = None) -> List[str child = suffix.split('/')[0] children.add(child) return sorted(children) + + +def _prefix_to_array_key(store: StoreLike, prefix: str) -> str: + if getattr(store, "_store_version", 2) == 3: + if prefix: + sfx = _get_hierarchy_metadata(store)['metadata_key_suffix'] + key = "meta/root/" + prefix.rstrip("/") + ".array" + sfx + else: + raise ValueError("prefix must be supplied to get a v3 array key") + else: + key = prefix + array_meta_key + return key + + +def _prefix_to_group_key(store: StoreLike, prefix: str) -> str: + if getattr(store, "_store_version", 2) == 3: + if prefix: + sfx = _get_hierarchy_metadata(store)['metadata_key_suffix'] + key = "meta/root/" + prefix.rstrip('/') + ".group" + sfx + else: + raise ValueError("prefix must be supplied to get a v3 group key") + else: + key = prefix + group_meta_key + return key + + +def _prefix_to_attrs_key(store: StoreLike, prefix: str) -> str: + if getattr(store, "_store_version", 2) == 3: + # for v3, attributes are stored in the array metadata + sfx = _get_hierarchy_metadata(store)['metadata_key_suffix'] + if prefix: + key = "meta/root/" + prefix.rstrip('/') + ".array" + sfx + else: + raise ValueError("prefix must be supplied to get a v3 array key") + else: + key = prefix + attrs_key + return key diff --git a/zarr/meta.py b/zarr/meta.py index c292b09a14..07fbdcb7d4 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -1,4 +1,6 @@ import base64 +import itertools +import os from collections.abc import Mapping import numpy as np @@ -9,6 +11,37 @@ from typing 
import cast, Union, Any, List, Mapping as MappingType ZARR_FORMAT = 2 +ZARR_FORMAT_v3 = 3 + +FLOAT_FILLS = {"NaN": np.nan, "Infinity": np.PINF, "-Infinity": np.NINF} + + +_v3_core_type = set( + "".join(d) + for d in itertools.product("<>", ("u", "i", "f"), ("2", "4", "8")) +) +_v3_core_type = {"bool", "i1", "u1"} | _v3_core_type + +ZARR_V3_CORE_DTYPES_ONLY = int(os.environ.get("ZARR_V3_CORE_DTYPES_ONLY", False)) +ZARR_V3_ALLOW_COMPLEX = int(os.environ.get("ZARR_V3_ALLOW_COMPLEX", + not ZARR_V3_CORE_DTYPES_ONLY)) +ZARR_V3_ALLOW_DATETIME = int(os.environ.get("ZARR_V3_ALLOW_DATETIME", + not ZARR_V3_CORE_DTYPES_ONLY)) +ZARR_V3_ALLOW_STRUCTURED = int(os.environ.get("ZARR_V3_ALLOW_STRUCTURED", + not ZARR_V3_CORE_DTYPES_ONLY)) +ZARR_V3_ALLOW_OBJECTARRAY = int(os.environ.get("ZARR_V3_ALLOW_OBJECTARRAY", + not ZARR_V3_CORE_DTYPES_ONLY)) +ZARR_V3_ALLOW_BYTES_ARRAY = int(os.environ.get("ZARR_V3_ALLOW_BYTES_ARRAY", + not ZARR_V3_CORE_DTYPES_ONLY)) +ZARR_V3_ALLOW_UNICODE_ARRAY = int(os.environ.get("ZARR_V3_ALLOW_UNICODE_ARRAY", + not ZARR_V3_CORE_DTYPES_ONLY)) + +_default_entry_point_metadata_v3 = { + 'zarr_format': "https://purl.org/zarr/spec/protocol/core/3.0", + 'metadata_encoding': "https://purl.org/zarr/spec/protocol/core/3.0", + 'metadata_key_suffix': '.json', + "extensions": [], +} class Metadata2: @@ -228,7 +261,207 @@ def encode_fill_value(cls, v: Any, dtype: np.dtype, object_codec: Any = None) -> return v -# expose class methods for backwards compatibility +class Metadata3(Metadata2): + ZARR_FORMAT = ZARR_FORMAT_v3 + + @classmethod + def decode_dtype(cls, d): + d = cls._decode_dtype_descr(d) + dtype = np.dtype(d) + if dtype.kind == 'c': + if not ZARR_V3_ALLOW_COMPLEX: + raise ValueError("complex-valued arrays not supported") + elif dtype.kind in 'mM': + if not ZARR_V3_ALLOW_DATETIME: + raise ValueError( + "datetime64 and timedelta64 arrays not supported" + ) + elif dtype.kind == 'O': + if not ZARR_V3_ALLOW_OBJECTARRAY: + raise ValueError("object arrays not supported") + elif dtype.kind == 'V': + if not ZARR_V3_ALLOW_STRUCTURED: + raise ValueError("structured arrays not supported") + elif dtype.kind == 'U': + if not ZARR_V3_ALLOW_UNICODE_ARRAY: + raise ValueError("unicode arrays not supported") + elif dtype.kind == 'S': + if not ZARR_V3_ALLOW_BYTES_ARRAY: + raise ValueError("bytes arrays not supported") + else: + assert d in _v3_core_type + return dtype + + @classmethod + def encode_dtype(cls, d): + s = Metadata2.encode_dtype(d) + if s == "|b1": + return "bool" + elif s == "|u1": + return "u1" + elif s == "|i1": + return "i1" + dtype = np.dtype(d) + if dtype.kind == "c": + if not ZARR_V3_ALLOW_COMPLEX: + raise ValueError( + "complex-valued arrays not part of the base v3 spec" + ) + elif dtype.kind in "mM": + if not ZARR_V3_ALLOW_DATETIME: + raise ValueError( + "datetime64 and timedelta64 not part of the base v3 " + "spec" + ) + elif dtype.kind == "O": + if not ZARR_V3_ALLOW_OBJECTARRAY: + raise ValueError( + "object dtypes are not part of the base v3 spec" + ) + elif dtype.kind == "V": + if not ZARR_V3_ALLOW_STRUCTURED: + raise ValueError( + "structured arrays are not part of the base v3 spec" + ) + elif dtype.kind == 'U': + if not ZARR_V3_ALLOW_UNICODE_ARRAY: + raise ValueError("unicode dtypes are not part of the base v3 " + "spec") + elif dtype.kind == 'S': + if not ZARR_V3_ALLOW_BYTES_ARRAY: + raise ValueError("bytes dtypes are not part of the base v3 " + "spec") + else: + assert s in _v3_core_type + return s + + @classmethod + def decode_group_metadata(cls, s: Union[MappingType, str]) -> 
MappingType[str, Any]: + meta = cls.parse_metadata(s) + # 1 / 0 + # # check metadata format version + # zarr_format = meta.get("zarr_format", None) + # if zarr_format != cls.ZARR_FORMAT: + # raise MetadataError("unsupported zarr format: %s" % zarr_format) + + assert 'attributes' in meta + # meta = dict(attributes=meta['attributes']) + return meta + + # return json.loads(s) + + @classmethod + def encode_group_metadata(cls, meta=None) -> bytes: + # The ZARR_FORMAT should not be in the group metadata, but in the + # entry point metadata instead + # meta = dict(zarr_format=cls.ZARR_FORMAT) + if meta is None: + meta = {'attributes': {}} + meta = dict(attributes=meta.get("attributes", {})) + return json_dumps(meta) + + @classmethod + def encode_hierarchy_metadata(cls, meta=None) -> bytes: + if meta is None: + meta = _default_entry_point_metadata_v3 + elif set(meta.keys()) != { + "zarr_format", + "metadata_encoding", + "metadata_key_suffix", + "extensions", + }: + raise ValueError(f"Unexpected keys in metadata. meta={meta}") + return json_dumps(meta) + + @classmethod + def decode_hierarchy_metadata(cls, s: Union[MappingType, str]) -> MappingType[str, Any]: + meta = cls.parse_metadata(s) + # check metadata format + # zarr_format = meta.get("zarr_format", None) + # if zarr_format != "https://purl.org/zarr/spec/protocol/core/3.0": + # raise MetadataError("unsupported zarr format: %s" % zarr_format) + if set(meta.keys()) != { + "zarr_format", + "metadata_encoding", + "metadata_key_suffix", + "extensions", + }: + raise ValueError(f"Unexpected keys in metdata. meta={meta}") + return meta + + @classmethod + def decode_array_metadata(cls, s: Union[MappingType, str]) -> MappingType[str, Any]: + meta = cls.parse_metadata(s) + + # check metadata format + zarr_format = meta.get("zarr_format", None) + if zarr_format != cls.ZARR_FORMAT: + raise MetadataError("unsupported zarr format: %s" % zarr_format) + + # extract array metadata fields + try: + dtype = cls.decode_dtype(meta["data_type"]) + if dtype.hasobject: + import numcodecs + object_codec = numcodecs.get_codec(meta['attributes']['filters'][0]) + else: + object_codec = None + fill_value = cls.decode_fill_value(meta["fill_value"], dtype, object_codec) + # TODO: remove dimension_separator? 
+ meta = dict( + zarr_format=meta["zarr_format"], + shape=tuple(meta["shape"]), + chunk_grid=dict( + type=meta["chunk_grid"]["type"], + chunk_shape=tuple(meta["chunk_grid"]["chunk_shape"]), + separator=meta["chunk_grid"]["separator"], + ), + data_type=dtype, + compressor=meta["compressor"], + fill_value=fill_value, + chunk_memory_layout=meta["chunk_memory_layout"], + dimension_separator=meta.get("dimension_separator", "/"), + attributes=meta["attributes"], + ) + # dimension_separator = meta.get("dimension_separator", None) + # if dimension_separator: + # meta["dimension_separator"] = dimension_separator + except Exception as e: + raise MetadataError("error decoding metadata: %s" % e) + else: + return meta + + @classmethod + def encode_array_metadata(cls, meta: MappingType[str, Any]) -> bytes: + dtype = meta["data_type"] + sdshape = () + if dtype.subdtype is not None: + dtype, sdshape = dtype.subdtype + dimension_separator = meta.get("dimension_separator") + if dtype.hasobject: + import numcodecs + object_codec = numcodecs.get_codec(meta['attributes']['filters'][0]) + else: + object_codec = None + meta = dict( + zarr_format=cls.ZARR_FORMAT, + shape=meta["shape"] + sdshape, + chunk_grid=dict( + type=meta["chunk_grid"]["type"], + chunk_shape=tuple(meta["chunk_grid"]["chunk_shape"]), + separator=meta["chunk_grid"]["separator"], + ), + data_type=cls.encode_dtype(dtype), + compressor=meta["compressor"], + fill_value=encode_fill_value(meta["fill_value"], dtype, object_codec), + chunk_memory_layout=meta["chunk_memory_layout"], + attributes=meta.get("attributes", {}), + ) + if dimension_separator: + meta["dimension_separator"] = dimension_separator + return json_dumps(meta) + + parse_metadata = Metadata2.parse_metadata decode_array_metadata = Metadata2.decode_array_metadata encode_array_metadata = Metadata2.encode_array_metadata diff --git a/zarr/storage.py b/zarr/storage.py index 7170eeaf23..00ca4591b4 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -57,15 +57,19 @@ normalize_shape, normalize_storage_path, retry_call) from zarr._storage.absstore import ABSStore # noqa: F401 -from zarr._storage.store import (_listdir_from_keys, - _path_to_prefix, +from zarr._storage.store import (_get_hierarchy_metadata, + _listdir_from_keys, _rename_from_keys, _rmdir_from_keys, + _path_to_prefix, + _prefix_to_array_key, + _prefix_to_group_key, array_meta_key, group_meta_key, attrs_key, BaseStore, - Store) + Store, + StoreV3) __doctest_requires__ = { ('RedisStore', 'RedisStore.*'): ['redis'], @@ -92,40 +96,95 @@ def contains_array(store: StoreLike, path: Path = None) -> bool: """Return True if the store contains an array at the given logical path.""" path = normalize_storage_path(path) prefix = _path_to_prefix(path) - key = prefix + array_meta_key + key = _prefix_to_array_key(store, prefix) return key in store -def contains_group(store: StoreLike, path: Path = None) -> bool: +def contains_group(store: StoreLike, path: Path = None, explicit_only=True) -> bool: """Return True if the store contains a group at the given logical path.""" path = normalize_storage_path(path) prefix = _path_to_prefix(path) - key = prefix + group_meta_key - return key in store + key = _prefix_to_group_key(store, prefix) + store_version = getattr(store, '_store_version', 2) + if store_version == 2 or explicit_only: + return key in store + else: + if key in store: + return True + # for v3, need to also handle implicit groups + sfx = _get_hierarchy_metadata(store)['metadata_key_suffix'] + implicit_prefix = key.replace('.group' + sfx, '') 
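+        # e.g. with the default '.json' metadata suffix, a group key such as
+        # 'meta/root/foo.group.json' (illustrative path) maps to the implicit
+        # prefix 'meta/root/foo'; the lines below normalize it to end in '/'
+        # and treat any stored key under that prefix as evidence of an
+        # implicit group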
+ if not implicit_prefix.endswith('/'): + implicit_prefix += '/' + if store.list_prefix(implicit_prefix): # type: ignore + return True + return False -def normalize_store_arg(store: Any, clobber=False, storage_options=None, mode="w") -> BaseStore: +def normalize_store_arg(store, clobber=False, storage_options=None, mode="w", + *, zarr_version=None) -> Store: + if zarr_version is None: + # default to v2 store for backward compatibility + zarr_version = getattr(store, '_store_version', 2) + if zarr_version not in [2, 3]: + raise ValueError("zarr_version must be 2 or 3") if store is None: - return BaseStore._ensure_store(dict()) - elif isinstance(store, os.PathLike): + if zarr_version == 2: + store = KVStore(dict()) + else: + store = KVStoreV3(dict()) + # add default zarr.json metadata + store['zarr.json'] = store._metadata_class.encode_hierarchy_metadata(None) + return store + if isinstance(store, os.PathLike): store = os.fspath(store) if isinstance(store, str): mode = mode if clobber else "r" - if "://" in store or "::" in store: - return FSStore(store, mode=mode, **(storage_options or {})) - elif storage_options: - raise ValueError("storage_options passed with non-fsspec path") - if store.endswith('.zip'): - return ZipStore(store, mode=mode) - elif store.endswith('.n5'): - from zarr.n5 import N5Store - return N5Store(store) - else: - return DirectoryStore(store) - else: - if not isinstance(store, BaseStore) and isinstance(store, MutableMapping): - store = BaseStore._ensure_store(store) - return store + if zarr_version == 2: + if "://" in store or "::" in store: + return FSStore(store, mode=mode, **(storage_options or {})) + elif storage_options: + raise ValueError("storage_options passed with non-fsspec path") + if store.endswith('.zip'): + return ZipStore(store, mode=mode) + elif store.endswith('.n5'): + from zarr.n5 import N5Store + return N5Store(store) + else: + return DirectoryStore(store) + elif zarr_version == 3: + if "://" in store or "::" in store: + store = FSStoreV3(store, mode=mode, **(storage_options or {})) + elif storage_options: + store = ValueError("storage_options passed with non-fsspec path") + if store.endswith('.zip'): + store = ZipStoreV3(store, mode=mode) + elif store.endswith('.n5'): + raise NotImplementedError("N5Store not yet implemented for V3") + # return N5StoreV3(store) + else: + store = DirectoryStoreV3(store) + # add default zarr.json metadata + store['zarr.json'] = store._metadata_class.encode_hierarchy_metadata(None) + return store + elif zarr_version == 2: + store = Store._ensure_store(store) + if getattr(store, '_store_version', 2) != 2: + raise ValueError( + "provided store does not match the specified zarr version.") + # if not isinstance(store, Store) and isinstance(store, MutableMapping): + # store = KVStore(store) + elif zarr_version == 3: + store = StoreV3._ensure_store(store) + if getattr(store, '_store_version', 2) != 3: + raise ValueError( + "provided store does not match the specified zarr version.") + # if not isinstance(store, StoreV3) and isinstance(store, MutableMapping): + # store = KVStoreV3(store) + if 'zarr.json' not in store: + # add default zarr.json metadata + store['zarr.json'] = store._metadata_class.encode_hierarchy_metadata(None) + return store def rmdir(store: StoreLike, path: Path = None): @@ -133,15 +192,36 @@ def rmdir(store: StoreLike, path: Path = None): this will be called, otherwise will fall back to implementation via the `Store` interface.""" path = normalize_storage_path(path) - if hasattr(store, "rmdir") and 
store.is_erasable(): # type: ignore - # pass through - store.rmdir(path) # type: ignore + if getattr(store, '_store_version', 2) == 2: + if hasattr(store, "rmdir") and store.is_erasable(): # type: ignore + # pass through + store.rmdir(path) # type: ignore + else: + # slow version, delete one key at a time + _rmdir_from_keys(store, path) else: - # slow version, delete one key at a time - _rmdir_from_keys(store, path) + # TODO: check behavior for v3 and fix in the Store class, deferring to + # those by default + + # remove metadata folder + meta_dir = 'meta/root/' + path + _rmdir_from_keys(store, meta_dir) + + # remove data folder + data_dir = 'data/root/' + path + _rmdir_from_keys(store, data_dir) + # remove metadata files + sfx = _get_hierarchy_metadata(store)['metadata_key_suffix'] + array_meta_file = meta_dir + '.array' + sfx + if array_meta_file in store: + store.erase(array_meta_file) # type: ignore + group_meta_file = meta_dir + '.group' + sfx + if group_meta_file in store: + store.erase(group_meta_file) # type: ignore -def rename(store: BaseStore, src_path: Path, dst_path: Path): + +def rename(store: Store, src_path: Path, dst_path: Path): """Rename all items under the given path. If `store` provides a `rename` method, this will be called, otherwise will fall back to implementation via the `Store` interface.""" @@ -163,6 +243,27 @@ def listdir(store: BaseStore, path: Path = None): if hasattr(store, 'listdir'): # pass through return store.listdir(path) # type: ignore + elif getattr(store, "_store_version", None) == 3: + meta_prefix = 'meta/root/' + dir_path = meta_prefix + path + path_start = len(meta_prefix) + meta_keys = [] + include_meta_keys = False + if include_meta_keys: + sfx = _get_hierarchy_metadata(store)['metadata_key_suffix'] + group_meta_key = dir_path + '.group' + sfx + if group_meta_key in store: + meta_keys.append(group_meta_key[path_start:]) + array_meta_key = dir_path + '.array' + sfx + if array_meta_key in store: + meta_keys.append(array_meta_key[path_start:]) + if not dir_path.endswith('/'): + dir_path += '/' + keys, prefixes = store.list_dir(dir_path) # type: ignore + keys = [k[path_start:] for k in keys] + prefixes = [p[path_start:] for p in prefixes] + return meta_keys + keys + prefixes + else: # slow version, iterate through all keys warnings.warn( @@ -173,33 +274,45 @@ def listdir(store: BaseStore, path: Path = None): return _listdir_from_keys(store, path) +def _getsize(store: BaseStore, path: Path = None) -> int: + # compute from size of values + if path and path in store: + v = store[path] + size = buffer_size(v) + else: + path = '' if path is None else normalize_storage_path(path) + size = 0 + store_version = getattr(store, '_store_version', 2) + if store_version == 3: + members = store.list_prefix('data/root/' + path) # type: ignore + members += store.list_prefix('meta/root/' + path) # type: ignore + # members += ['zarr.json'] + else: + members = listdir(store, path) + prefix = _path_to_prefix(path) + members = [prefix + k for k in members] + for k in members: + try: + v = store[k] + except KeyError: + pass + else: + try: + size += buffer_size(v) + except TypeError: + return -1 + return size + + def getsize(store: BaseStore, path: Path = None) -> int: """Compute size of stored items for a given path. 
If `store` provides a `getsize` method, this will be called, otherwise will return -1.""" - path = normalize_storage_path(path) if hasattr(store, 'getsize'): # pass through + path = normalize_storage_path(path) return store.getsize(path) # type: ignore elif isinstance(store, MutableMapping): - # compute from size of values - if path in store: - v = store[path] - size = buffer_size(v) - else: - members = listdir(store, path) - prefix = _path_to_prefix(path) - size = 0 - for k in members: - try: - v = store[prefix + k] - except KeyError: - pass - else: - try: - size += buffer_size(v) - except TypeError: - return -1 - return size + return _getsize(store, path) else: return -1 @@ -346,7 +459,14 @@ def init_array( path = normalize_storage_path(path) # ensure parent group initialized - _require_parent_group(path, store=store, chunk_store=chunk_store, overwrite=overwrite) + store_version = getattr(store, "_store_version", 2) + if store_version < 3: + _require_parent_group(path, store=store, chunk_store=chunk_store, + overwrite=overwrite) + + if store_version == 3 and 'zarr.json' not in store: + # initialize with default zarr.json entry level metadata + store['zarr.json'] = store._metadata_class.encode_hierarchy_metadata(None) # type: ignore _init_array_metadata(store, shape=shape, chunks=chunks, dtype=dtype, compressor=compressor, fill_value=fill_value, @@ -372,16 +492,50 @@ def _init_array_metadata( dimension_separator=None, ): + store_version = getattr(store, '_store_version', 2) + + path = normalize_storage_path(path) + # guard conditions if overwrite: - # attempt to delete any pre-existing items in store - rmdir(store, path) - if chunk_store is not None: - rmdir(chunk_store, path) - elif contains_array(store, path): - raise ContainsArrayError(path) - elif contains_group(store, path): - raise ContainsGroupError(path) + if store_version == 2: + # attempt to delete any pre-existing array in store + rmdir(store, path) + if chunk_store is not None: + rmdir(chunk_store, path) + else: + group_meta_key = _prefix_to_group_key(store, _path_to_prefix(path)) + array_meta_key = _prefix_to_array_key(store, _path_to_prefix(path)) + data_prefix = 'data/root/' + _path_to_prefix(path) + + # attempt to delete any pre-existing array in store + if array_meta_key in store: + store.erase(array_meta_key) # type: ignore + if group_meta_key in store: + store.erase(group_meta_key) # type: ignore + store.erase_prefix(data_prefix) # type: ignore + if chunk_store is not None: + chunk_store.erase_prefix(data_prefix) # type: ignore + + if '/' in path: + # path is a subfolder of an existing array, remove that array + parent_path = '/'.join(path.split('/')[:-1]) + sfx = _get_hierarchy_metadata(store)['metadata_key_suffix'] + array_key = 'meta/root/' + parent_path + '.array' + sfx + if array_key in store: + store.erase(array_key) # type: ignore + + if not overwrite: + if contains_array(store, path): + raise ContainsArrayError(path) + elif contains_group(store, path, explicit_only=False): + raise ContainsGroupError(path) + elif store_version == 3: + if '/' in path: + # cannot create an array within an existing array path + parent_path = '/'.join(path.split('/')[:-1]) + if contains_array(store, parent_path): + raise ContainsArrayError(path) # normalize metadata dtype, object_codec = normalize_dtype(dtype, object_codec) @@ -392,7 +546,7 @@ def _init_array_metadata( fill_value = normalize_fill_value(fill_value, dtype) # optional array metadata - if dimension_separator is None: + if dimension_separator is None and store_version == 
2: dimension_separator = getattr(store, "_dimension_separator", None) dimension_separator = normalize_dimension_separator(dimension_separator) @@ -416,6 +570,8 @@ def _init_array_metadata( # obtain filters config if filters: + # TODO: filters was removed from the metadata in v3 + # raise error here if store_version > 2? filters_config = [f.get_config() for f in filters] else: filters_config = [] @@ -441,11 +597,30 @@ def _init_array_metadata( filters_config = None # type: ignore # initialize metadata - meta = dict(shape=shape, chunks=chunks, dtype=dtype, - compressor=compressor_config, fill_value=fill_value, - order=order, filters=filters_config, + # TODO: don't store redundant dimension_separator for v3? + meta = dict(shape=shape, compressor=compressor_config, + fill_value=fill_value, dimension_separator=dimension_separator) - key = _path_to_prefix(path) + array_meta_key + if store_version < 3: + meta.update(dict(chunks=chunks, dtype=dtype, order=order, + filters=filters_config)) + else: + if dimension_separator is None: + dimension_separator = "/" + if filters_config: + attributes = {'filters': filters_config} + else: + attributes = {} + meta.update( + dict(chunk_grid=dict(type="regular", + chunk_shape=chunks, + separator=dimension_separator), + chunk_memory_layout=order, + data_type=dtype, + attributes=attributes) + ) + + key = _prefix_to_array_key(store, _path_to_prefix(path)) if hasattr(store, '_metadata_class'): store[key] = store._metadata_class.encode_array_metadata(meta) # type: ignore else: @@ -482,14 +657,26 @@ def init_group( # normalize path path = normalize_storage_path(path) - # ensure parent group initialized - _require_parent_group(path, store=store, chunk_store=chunk_store, - overwrite=overwrite) + store_version = getattr(store, '_store_version', 2) + if store_version < 3: + # ensure parent group initialized + _require_parent_group(path, store=store, chunk_store=chunk_store, + overwrite=overwrite) + + if store_version == 3 and 'zarr.json' not in store: + # initialize with default zarr.json entry level metadata + store['zarr.json'] = store._metadata_class.encode_hierarchy_metadata(None) # type: ignore # initialise metadata _init_group_metadata(store=store, overwrite=overwrite, path=path, chunk_store=chunk_store) + if store_version == 3: + # TODO: Should initializing a v3 group also create a corresponding + # empty folder under data/root/? I think probably not until there + # is actual data written there. 
+ pass + def _init_group_metadata( store: StoreLike, @@ -498,22 +685,51 @@ def _init_group_metadata( chunk_store: StoreLike = None, ): + store_version = getattr(store, '_store_version', 2) + path = normalize_storage_path(path) + # guard conditions if overwrite: - # attempt to delete any pre-existing items in store - rmdir(store, path) - if chunk_store is not None: - rmdir(chunk_store, path) - elif contains_array(store, path): - raise ContainsArrayError(path) - elif contains_group(store, path): - raise ContainsGroupError(path) + if store_version == 2: + # attempt to delete any pre-existing items in store + rmdir(store, path) + if chunk_store is not None: + rmdir(chunk_store, path) + else: + group_meta_key = _prefix_to_group_key(store, _path_to_prefix(path)) + array_meta_key = _prefix_to_array_key(store, _path_to_prefix(path)) + data_prefix = 'data/root/' + _path_to_prefix(path) + meta_prefix = 'meta/root/' + _path_to_prefix(path) + + # attempt to delete any pre-existing array in store + if array_meta_key in store: + store.erase(array_meta_key) # type: ignore + if group_meta_key in store: + store.erase(group_meta_key) # type: ignore + store.erase_prefix(data_prefix) # type: ignore + store.erase_prefix(meta_prefix) # type: ignore + if chunk_store is not None: + chunk_store.erase_prefix(data_prefix) # type: ignore + + if not overwrite: + if contains_array(store, path): + raise ContainsArrayError(path) + elif contains_group(store, path): + raise ContainsGroupError(path) + elif store_version == 3 and '/' in path: + # cannot create a group overlapping with an existing array name + parent_path = '/'.join(path.split('/')[:-1]) + if contains_array(store, parent_path): + raise ContainsArrayError(path) # initialize metadata # N.B., currently no metadata properties are needed, however there may # be in future - meta = dict() # type: ignore - key = _path_to_prefix(path) + group_meta_key + if store_version == 3: + meta = {'attributes': {}} # type: ignore + else: + meta = {} # type: ignore + key = _prefix_to_group_key(store, _path_to_prefix(path)) if hasattr(store, '_metadata_class'): store[key] = store._metadata_class.encode_group_metadata(meta) # type: ignore else: @@ -1139,14 +1355,17 @@ def __init__(self, url, normalize_keys=False, key_separator=None, dimension_separator = key_separator self.key_separator = dimension_separator - if self.key_separator is None: - self.key_separator = "." + self._default_key_separator() # Pass attributes to array creation self._dimension_separator = dimension_separator if self.fs.exists(self.path) and not self.fs.isdir(self.path): raise FSPathExistNotDir(url) + def _default_key_separator(self): + if self.key_separator is None: + self.key_separator = "." 
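+
+    # Note: subclasses can override ``_default_key_separator`` to change the
+    # fallback used when no separator was given; FSStoreV3 (defined later in
+    # this file) overrides it so the default becomes "/".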
+ def _normalize_key(self, key): key = normalize_storage_path(key).lstrip('/') if key: @@ -2647,6 +2866,10 @@ class ConsolidatedMetadataStore(Store): def __init__(self, store: StoreLike, metadata_key=".zmetadata"): self.store = Store._ensure_store(store) + if getattr(store, '_store_version', 2) != 2: + raise ValueError("Can only consolidate stores corresponding to " + "the Zarr v2 spec.") + # retrieve consolidated metadata meta = json_loads(store[metadata_key]) @@ -2682,3 +2905,351 @@ def getsize(self, path): def listdir(self, path): return listdir(self.meta_store, path) + + +""" versions of stores following the v3 protocol """ + + +def _get_files_and_dirs_from_path(store, path): + path = normalize_storage_path(path) + + files = [] + # add array metadata file if present + array_key = _prefix_to_array_key(store, path) + if array_key in store: + files.append(os.path.join(store.path, array_key)) + + # add group metadata file if present + group_key = _prefix_to_group_key(store, path) + if group_key in store: + files.append(os.path.join(store.path, group_key)) + + dirs = [] + # add array and group folders if present + for d in ['data/root/' + path, 'meta/root/' + path]: + dir_path = os.path.join(store.path, d) + if os.path.exists(dir_path): + dirs.append(dir_path) + return files, dirs + + +class KVStoreV3(KVStore, StoreV3): + + def list(self): + return list(self._mutable_mapping.keys()) + + +KVStoreV3.__doc__ = KVStore.__doc__ + + +class FSStoreV3(FSStore, StoreV3): + + # FSStoreV3 doesn't use this (FSStore uses it within _normalize_key) + _META_KEYS = () + + def _default_key_separator(self): + if self.key_separator is None: + self.key_separator = "/" + + def list(self): + return list(self.keys()) + + def _normalize_key(self, key): + key = normalize_storage_path(key).lstrip('/') + return key.lower() if self.normalize_keys else key + + def getsize(self, path=None): + size = 0 + if path is None or path == '': + # size of both the data and meta subdirs + dirs = [] + for d in ['data/root', 'meta/root']: + dir_path = os.path.join(self.path, d) + if os.path.exists(dir_path): + dirs.append(dir_path) + else: + files, dirs = _get_files_and_dirs_from_path(self, path) + for file in files: + size += os.path.getsize(file) + for d in dirs: + size += self.fs.du(d, total=True, maxdepth=None) + return size + + def setitems(self, values): + if self.mode == 'r': + raise ReadOnlyError() + values = {self._normalize_key(key): val for key, val in values.items()} + + # initialize the /data/root/... folder corresponding to the array! + # Note: zarr.tests.test_core_v3.TestArrayWithFSStoreV3PartialRead fails + # without this explicit creation of directories + subdirectories = set([os.path.dirname(v) for v in values.keys()]) + for subdirectory in subdirectories: + data_dir = os.path.join(self.path, subdirectory) + if not self.fs.exists(data_dir): + self.fs.mkdir(data_dir) + + self.map.setitems(values) + + +class MemoryStoreV3(MemoryStore, StoreV3): + + def __init__(self, root=None, cls=dict, dimension_separator=None): + if root is None: + self.root = cls() + else: + self.root = root + self.cls = cls + self.write_mutex = Lock() + self._dimension_separator = dimension_separator # TODO: modify for v3? 
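+        # (for v3 the chunk key separator is recorded in the array metadata
+        # itself, under chunk_grid['separator']; _init_array_metadata only
+        # falls back to this store-level attribute for v2 stores)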
+ + def __eq__(self, other): + return ( + isinstance(other, MemoryStoreV3) and + self.root == other.root and + self.cls == other.cls + ) + + def list(self): + return list(self.keys()) + + def getsize(self, path: Path = None): + size = 0 + path = normalize_storage_path(path) + members = self.list_prefix('data/root/' + path) + members += self.list_prefix('meta/root/' + path) + for k in members: + try: + v = self[k] + except KeyError: + pass + else: + try: + size += buffer_size(v) + except TypeError: + return -1 + return size + + +MemoryStoreV3.__doc__ = MemoryStore.__doc__ + + +class DirectoryStoreV3(DirectoryStore, StoreV3): + + def list(self): + return list(self.keys()) + + def __eq__(self, other): + return ( + isinstance(other, DirectoryStoreV3) and + self.path == other.path + ) + + # def getsize(self, path=None): + # size = 0 + # if path is None or path == '': + # # add array and group folders if present + # dirs = [] + # for d in ['data/root', 'meta/root']: + # dir_path = os.path.join(self.path, d) + # if os.path.exists(dir_path): + # dirs.append(dir_path) + # print(f"dirs={dirs}") + # else: + # files, dirs = _get_files_and_dirs_from_path(self, path) + # for file in files: + # size += os.path.getsize(file) + # for d in dirs: + # for child in scandir(d): + # print(f"child={child}") + # if child.is_file(): + # size += child.stat().st_size + # return size + + def getsize(self, path: Path = None): + size = 0 + path = normalize_storage_path(path) + members = self.list_prefix('data/root/' + path) + members += self.list_prefix('meta/root/' + path) + for k in members: + try: + v = self[k] + except KeyError: + pass + else: + try: + size += buffer_size(v) + except TypeError: + return -1 + return size + + def rename(self, src_path, dst_path, metadata_key_suffix='.json'): + store_src_path = normalize_storage_path(src_path) + store_dst_path = normalize_storage_path(dst_path) + + dir_path = self.path + any_existed = False + for root_prefix in ['meta', 'data']: + src_path = os.path.join(dir_path, root_prefix, 'root', store_src_path) + if os.path.exists(src_path): + any_existed = True + dst_path = os.path.join(dir_path, root_prefix, 'root', store_dst_path) + os.renames(src_path, dst_path) + + for suffix in ['.array' + metadata_key_suffix, + '.group' + metadata_key_suffix]: + src_meta = os.path.join(dir_path, 'meta', 'root', store_src_path + suffix) + if os.path.exists(src_meta): + any_existed = True + dst_meta = os.path.join(dir_path, 'meta', 'root', store_dst_path + suffix) + dst_dir = os.path.dirname(dst_meta) + if not os.path.exists(dst_dir): + os.makedirs(dst_dir) + os.rename(src_meta, dst_meta) + if not any_existed: + raise FileNotFoundError("nothing found at src_path") + + +DirectoryStoreV3.__doc__ = DirectoryStore.__doc__ + + +class ZipStoreV3(ZipStore, StoreV3): + + def list(self): + return list(self.keys()) + + def __eq__(self, other): + return ( + isinstance(other, ZipStore) and + self.path == other.path and + self.compression == other.compression and + self.allowZip64 == other.allowZip64 + ) + + def getsize(self, path=None): + path = normalize_storage_path(path) + with self.mutex: + children = self.list_prefix('data/root/' + path) + children += self.list_prefix('meta/root/' + path) + if children: + size = 0 + for name in children: + try: + info = self.zf.getinfo(name) + except KeyError: + pass + else: + size += info.compress_size + return size + elif path: + try: + info = self.zf.getinfo(path) + return info.compress_size + except KeyError: + return 0 + else: + return 0 + + 
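+# Several of the v3 stores in this file simply combine an existing v2 store
+# class with StoreV3 and add a ``list`` method.  Illustrative sketch only
+# (``SomeStore`` stands in for any v2 store class, it is not a real class):
+#
+#     class SomeStoreV3(SomeStore, StoreV3):
+#
+#         def list(self):
+#             return list(self.keys())
+#
+# Stores that need more than this (e.g. a v3-aware ``getsize`` or ``rename``)
+# also override those methods.
+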
+ZipStoreV3.__doc__ = ZipStore.__doc__ + + +class NestedDirectoryStoreV3(NestedDirectoryStore, DirectoryStoreV3): + + def list(self): + return list(self.keys()) + + def __eq__(self, other): + return ( + isinstance(other, NestedDirectoryStoreV3) and + self.path == other.path + ) + + +NestedDirectoryStoreV3.__doc__ = NestedDirectoryStore.__doc__ + + +class RedisStoreV3(RedisStore, StoreV3): + + def list(self): + return list(self.keys()) + + +RedisStoreV3.__doc__ = RedisStore.__doc__ + + +class MongoDBStoreV3(MongoDBStore, StoreV3): + + def list(self): + return list(self.keys()) + + +MongoDBStoreV3.__doc__ = MongoDBStore.__doc__ + + +class DBMStoreV3(DBMStore, StoreV3): + + def list(self): + return list(self.keys()) + + +DBMStoreV3.__doc__ = DBMStore.__doc__ + + +class LMDBStoreV3(LMDBStore, StoreV3): + + def list(self): + return list(self.keys()) + + +LMDBStoreV3.__doc__ = LMDBStore.__doc__ + + +class SQLiteStoreV3(SQLiteStore, StoreV3): + + def list(self): + return list(self.keys()) + + def getsize(self, path=None): + if path is None or path == '': + # TODO: why does the query below not work in this case? + # For now fall back to the default _getsize implementation + return _getsize(self, path) + else: + path = normalize_storage_path(path) + size = 0 + for _path in ['data/root/' + path, 'meta/root/' + path]: + c = self.cursor.execute( + ''' + SELECT COALESCE(SUM(LENGTH(v)), 0) FROM zarr + WHERE k LIKE (? || "%") AND + 0 == INSTR(LTRIM(SUBSTR(k, LENGTH(?) + 1), "/"), "/") + ''', + (_path, _path) + ) + for item_size, in c: + size += item_size + return size + + +SQLiteStoreV3.__doc__ = SQLiteStore.__doc__ + + +class LRUStoreCacheV3(LRUStoreCache, StoreV3): + + def __init__(self, store, max_size: int): + self._store = StoreV3._ensure_store(store) + self._max_size = max_size + self._current_size = 0 + self._keys_cache = None + self._contains_cache = None + self._listdir_cache: Dict[Path, Any] = dict() + self._values_cache: Dict[Path, Any] = OrderedDict() + self._mutex = Lock() + self.hits = self.misses = 0 + + def list(self): + return list(self.keys()) + + +LRUStoreCacheV3.__doc__ = LRUStoreCache.__doc__ diff --git a/zarr/tests/test_storage.py b/zarr/tests/test_storage.py index 3438e60691..0865917926 100644 --- a/zarr/tests/test_storage.py +++ b/zarr/tests/test_storage.py @@ -18,9 +18,9 @@ from numcodecs.compat import ensure_bytes from zarr.codecs import BZ2, AsType, Blosc, Zlib -from zarr.errors import MetadataError +from zarr.errors import ContainsArrayError, ContainsGroupError, MetadataError from zarr.hierarchy import group -from zarr.meta import ZARR_FORMAT, decode_array_metadata +from zarr.meta import ZARR_FORMAT, ZARR_FORMAT_v3, decode_array_metadata from zarr.n5 import N5Store, N5FSStore from zarr.storage import (ABSStore, ConsolidatedMetadataStore, DBMStore, DictStore, DirectoryStore, KVStore, LMDBStore, @@ -31,7 +31,12 @@ attrs_key, default_compressor, getsize, group_meta_key, init_array, init_group, migrate_1to2) from zarr.storage import FSStore, rename, listdir +from zarr.storage import (KVStoreV3, MemoryStoreV3, ZipStoreV3, FSStoreV3, + DirectoryStoreV3, NestedDirectoryStoreV3, + RedisStoreV3, MongoDBStoreV3, DBMStoreV3, + LMDBStoreV3, SQLiteStoreV3, LRUStoreCacheV3) from zarr.tests.util import CountingDict, have_fsspec, skip_test_env_var, abs_container +from zarr.tests.util import CountingDictV3 @contextmanager @@ -48,6 +53,15 @@ def dimension_separator_fixture(request): return request.param +@pytest.fixture(params=[ + (None, "/"), + (".", "."), + ("/", "/"), +]) +def 
dimension_separator_fixture_v3(request): + return request.param + + def skip_if_nested_chunks(**kwargs): if kwargs.get("dimension_separator") == "/": pytest.skip("nested chunks are unsupported") diff --git a/zarr/tests/test_storage_v3.py b/zarr/tests/test_storage_v3.py new file mode 100644 index 0000000000..9118bc513c --- /dev/null +++ b/zarr/tests/test_storage_v3.py @@ -0,0 +1,847 @@ +import array +import atexit +import os +import tempfile + +import numpy as np +import pytest + +from numcodecs.compat import ensure_bytes + +from zarr.codecs import Zlib +from zarr.errors import ContainsArrayError, ContainsGroupError +from zarr.meta import ZARR_FORMAT, ZARR_FORMAT_v3 +from zarr.storage import (array_meta_key, atexit_rmglob, atexit_rmtree, + default_compressor, getsize, init_array, init_group) +from zarr.storage import (KVStoreV3, MemoryStoreV3, ZipStoreV3, FSStoreV3, + DirectoryStoreV3, NestedDirectoryStoreV3, + RedisStoreV3, MongoDBStoreV3, DBMStoreV3, + LMDBStoreV3, SQLiteStoreV3, LRUStoreCacheV3) +from zarr.tests.util import CountingDictV3, have_fsspec, skip_test_env_var + +from .test_storage import (StoreTests, TestMemoryStore, TestDirectoryStore, + TestFSStore, TestNestedDirectoryStore, TestZipStore, + TestDBMStore, TestDBMStoreDumb, TestDBMStoreGnu, + TestDBMStoreNDBM, TestDBMStoreBerkeleyDB, + TestLMDBStore, TestSQLiteStore, + TestSQLiteStoreInMemory, TestLRUStoreCache, + dimension_separator_fixture, s3, + skip_if_nested_chunks) + + +@pytest.fixture(params=[ + (None, "/"), + (".", "."), + ("/", "/"), +]) +def dimension_separator_fixture_v3(request): + return request.param + + +class StoreV3Tests(StoreTests): + + def test_getsize(self): + # TODO: determine proper getsize() behavior for v3 + + # Currently returns the combined size of entries under + # meta/root/path and data/root/path. + # Any path not under meta/root/ or data/root/ (including zarr.json) + # returns size 0. + + store = self.create_store() + if isinstance(store, dict) or hasattr(store, 'getsize'): + assert 0 == getsize(store, 'zarr.json') + store['meta/root/foo/a'] = b'x' + assert 1 == getsize(store) + assert 1 == getsize(store, 'foo') + store['meta/root/foo/b'] = b'x' + assert 2 == getsize(store, 'foo') + assert 1 == getsize(store, 'foo/b') + store['meta/root/bar/a'] = b'yy' + assert 2 == getsize(store, 'bar') + store['data/root/bar/a'] = b'zzz' + assert 5 == getsize(store, 'bar') + store['data/root/baz/a'] = b'zzz' + assert 3 == getsize(store, 'baz') + assert 10 == getsize(store) + store['data/root/quux'] = array.array('B', b'zzzz') + assert 14 == getsize(store) + assert 4 == getsize(store, 'quux') + store['data/root/spong'] = np.frombuffer(b'zzzzz', dtype='u1') + assert 19 == getsize(store) + assert 5 == getsize(store, 'spong') + + store.close() + + # noinspection PyStatementEffect + def test_hierarchy(self): + pytest.skip("TODO: adapt v2 test_hierarchy tests to v3") + + def test_init_array(self, dimension_separator_fixture_v3): + + pass_dim_sep, want_dim_sep = dimension_separator_fixture_v3 + + store = self.create_store() + path = 'arr1' + init_array(store, path=path, shape=1000, chunks=100, + dimension_separator=pass_dim_sep) + + # check metadata + mkey = 'meta/root/' + path + '.array.json' + assert mkey in store + meta = store._metadata_class.decode_array_metadata(store[mkey]) + # TODO: zarr_format already stored at the heirarchy level should we + # also keep it in the .array.json? 
+ assert ZARR_FORMAT_v3 == meta['zarr_format'] + assert (1000,) == meta['shape'] + assert (100,) == meta['chunk_grid']['chunk_shape'] + assert np.dtype(None) == meta['data_type'] + assert default_compressor.get_config() == meta['compressor'] + assert meta['fill_value'] is None + # Missing MUST be assumed to be "/" + assert meta.get('dimension_separator', "/") is want_dim_sep + assert meta['chunk_grid']['separator'] is want_dim_sep + store.close() + + def _test_init_array_overwrite(self, order): + # setup + store = self.create_store() + + if store._store_version < 3: + path = None + mkey = array_meta_key + else: + path = 'arr1' # no default, have to specify for v3 + mkey = 'meta/root/' + path + '.array.json' + store[mkey] = store._metadata_class.encode_array_metadata( + dict(shape=(2000,), + chunk_grid=dict(type='regular', + chunk_shape=(200,), + separator=('/')), + data_type=np.dtype('u1'), + compressor=Zlib(1).get_config(), + fill_value=0, + chunk_memory_layout=order, + filters=None) + ) + + # don't overwrite (default) + with pytest.raises(ContainsArrayError): + init_array(store, path=path, shape=1000, chunks=100) + + # do overwrite + try: + init_array(store, path=path, shape=1000, chunks=100, + dtype='i4', overwrite=True) + except NotImplementedError: + pass + else: + assert mkey in store + meta = store._metadata_class.decode_array_metadata( + store[mkey] + ) + assert (1000,) == meta['shape'] + if store._store_version == 2: + assert ZARR_FORMAT == meta['zarr_format'] + assert (100,) == meta['chunks'] + assert np.dtype('i4') == meta['dtype'] + elif store._store_version == 3: + assert ZARR_FORMAT_v3 == meta['zarr_format'] + assert (100,) == meta['chunk_grid']['chunk_shape'] + assert np.dtype('i4') == meta['data_type'] + else: + raise ValueError( + "unexpected store version: {store._store_version}" + ) + store.close() + + def test_init_array_path(self): + path = 'foo/bar' + store = self.create_store() + init_array(store, shape=1000, chunks=100, path=path) + + # check metadata + mkey = 'meta/root/' + path + '.array.json' + assert mkey in store + meta = store._metadata_class.decode_array_metadata(store[mkey]) + assert ZARR_FORMAT_v3 == meta['zarr_format'] + assert (1000,) == meta['shape'] + assert (100,) == meta['chunk_grid']['chunk_shape'] + assert np.dtype(None) == meta['data_type'] + assert default_compressor.get_config() == meta['compressor'] + assert meta['fill_value'] is None + + store.close() + + def _test_init_array_overwrite_path(self, order): + # setup + path = 'foo/bar' + store = self.create_store() + meta = dict(shape=(2000,), + chunk_grid=dict(type='regular', + chunk_shape=(200,), + separator=('/')), + data_type=np.dtype('u1'), + compressor=Zlib(1).get_config(), + fill_value=0, + chunk_memory_layout=order, + filters=None) + mkey = 'meta/root/' + path + '.array.json' + store[mkey] = store._metadata_class.encode_array_metadata(meta) + + # don't overwrite + with pytest.raises(ContainsArrayError): + init_array(store, shape=1000, chunks=100, path=path) + + # do overwrite + try: + init_array(store, shape=1000, chunks=100, dtype='i4', path=path, + overwrite=True) + except NotImplementedError: + pass + else: + assert mkey in store + # should have been overwritten + meta = store._metadata_class.decode_array_metadata(store[mkey]) + assert ZARR_FORMAT_v3 == meta['zarr_format'] + assert (1000,) == meta['shape'] + assert (100,) == meta['chunk_grid']['chunk_shape'] + assert np.dtype('i4') == meta['data_type'] + + store.close() + + def test_init_array_overwrite_group(self): + # setup + path = 
'foo/bar' + store = self.create_store() + array_key = 'meta/root/' + path + '.array.json' + group_key = 'meta/root/' + path + '.group.json' + store[group_key] = store._metadata_class.encode_group_metadata() + + with pytest.raises(ContainsGroupError): + init_array(store, shape=1000, chunks=100, path=path) + + # do overwrite + try: + init_array(store, shape=1000, chunks=100, dtype='i4', path=path, + overwrite=True) + except NotImplementedError: + pass + else: + assert group_key not in store + assert array_key in store + meta = store._metadata_class.decode_array_metadata( + store[array_key] + ) + assert ZARR_FORMAT_v3 == meta['zarr_format'] + assert (1000,) == meta['shape'] + assert (100,) == meta['chunk_grid']['chunk_shape'] + assert np.dtype('i4') == meta['data_type'] + + store.close() + + def _test_init_array_overwrite_chunk_store(self, order): + # setup + store = self.create_store() + chunk_store = self.create_store() + path = 'arr1' + mkey = 'meta/root/' + path + '.array.json' + store[mkey] = store._metadata_class.encode_array_metadata( + dict(shape=(2000,), + chunk_grid=dict(type='regular', + chunk_shape=(200,), + separator=('/')), + data_type=np.dtype('u1'), + compressor=None, + fill_value=0, + filters=None, + chunk_memory_layout=order) + ) + + chunk_store['data/root/arr1/0'] = b'aaa' + chunk_store['data/root/arr1/1'] = b'bbb' + + assert 'data/root/arr1/0' in chunk_store + assert 'data/root/arr1/1' in chunk_store + + # don't overwrite (default) + with pytest.raises(ValueError): + init_array(store, path=path, shape=1000, chunks=100, chunk_store=chunk_store) + + # do overwrite + try: + init_array(store, path=path, shape=1000, chunks=100, dtype='i4', + overwrite=True, chunk_store=chunk_store) + except NotImplementedError: + pass + else: + assert mkey in store + meta = store._metadata_class.decode_array_metadata(store[mkey]) + assert ZARR_FORMAT_v3 == meta['zarr_format'] + assert (1000,) == meta['shape'] + assert (100,) == meta['chunk_grid']['chunk_shape'] + assert np.dtype('i4') == meta['data_type'] + assert 'data/root/arr1/0' not in chunk_store + assert 'data/root/arr1/1' not in chunk_store + + store.close() + chunk_store.close() + + def test_init_array_compat(self): + store = self.create_store() + path = 'arr1' + init_array(store, path=path, shape=1000, chunks=100, compressor='none') + mkey = 'meta/root/' + path + '.array.json' + meta = store._metadata_class.decode_array_metadata( + store[mkey] + ) + assert meta['compressor'] is None + + store.close() + + def test_init_group(self): + store = self.create_store() + path = "meta/root/foo" + init_group(store, path=path) + + # check metadata + mkey = 'meta/root/' + path + '.group.json' + assert mkey in store + meta = store._metadata_class.decode_group_metadata(store[mkey]) + assert meta == {'attributes': {}} + + store.close() + + def _test_init_group_overwrite(self, order): + pytest.skip( + "In v3 array and group names cannot overlap" + ) + + def _test_init_group_overwrite_path(self, order): + # setup + path = 'foo/bar' + store = self.create_store() + meta = dict( + shape=(2000,), + chunk_grid=dict(type='regular', + chunk_shape=(200,), + separator=('/')), + data_type=np.dtype('u1'), + compressor=None, + fill_value=0, + filters=None, + chunk_memory_layout=order, + ) + array_key = 'meta/root/' + path + '.array.json' + group_key = 'meta/root/' + path + '.group.json' + store[array_key] = store._metadata_class.encode_array_metadata(meta) + + # don't overwrite + with pytest.raises(ContainsArrayError): + init_group(store, path=path) + + # do 
overwrite + try: + init_group(store, overwrite=True, path=path) + except NotImplementedError: + pass + else: + assert array_key not in store + assert group_key in store + # should have been overwritten + meta = store._metadata_class.decode_group_metadata(store[group_key]) + # assert ZARR_FORMAT == meta['zarr_format'] + assert meta == {'attributes': {}} + + store.close() + + def _test_init_group_overwrite_chunk_store(self, order): + pytest.skip( + "In v3 array and group names cannot overlap" + ) + + +class TestMappingStoreV3(StoreV3Tests): + + def create_store(self, **kwargs): + return KVStoreV3(dict()) + + def test_set_invalid_content(self): + # Generic mappings support non-buffer types + pass + + +class TestMemoryStoreV3(TestMemoryStore, StoreV3Tests): + + def create_store(self, **kwargs): + skip_if_nested_chunks(**kwargs) + return MemoryStoreV3(**kwargs) + + +class TestDirectoryStoreV3(TestDirectoryStore, StoreV3Tests): + + def create_store(self, normalize_keys=False, **kwargs): + # For v3, don't have to skip if nested. + # skip_if_nested_chunks(**kwargs) + + path = tempfile.mkdtemp() + atexit.register(atexit_rmtree, path) + store = DirectoryStoreV3(path, normalize_keys=normalize_keys, **kwargs) + return store + + +@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") +class TestFSStoreV3(TestFSStore, StoreV3Tests): + + def create_store(self, normalize_keys=False, + dimension_separator=".", + path=None, + **kwargs): + + if path is None: + path = tempfile.mkdtemp() + atexit.register(atexit_rmtree, path) + + store = FSStoreV3( + path, + normalize_keys=normalize_keys, + dimension_separator=dimension_separator, + **kwargs) + return store + + def test_init_array(self): + store = self.create_store() + path = 'arr1' + init_array(store, path=path, shape=1000, chunks=100) + + # check metadata + array_meta_key = 'meta/root/' + path + '.array.json' + assert array_meta_key in store + meta = store._metadata_class.decode_array_metadata(store[array_meta_key]) + assert ZARR_FORMAT_v3 == meta['zarr_format'] + assert (1000,) == meta['shape'] + assert (100,) == meta['chunk_grid']['chunk_shape'] + assert np.dtype(None) == meta['data_type'] + assert meta['chunk_grid']['separator'] == "/" + + # TODO: remove this skip once v3 support is added to hierarchy.Group + @pytest.mark.skipif(True, reason="need v3 support in zarr.hierarchy.Group") + def test_deep_ndim(self): + import zarr + + store = self.create_store() + foo = zarr.open_group(store=store, path='group1') + bar = foo.create_group("bar") + baz = bar.create_dataset("baz", + shape=(4, 4, 4), + chunks=(2, 2, 2), + dtype="i8") + baz[:] = 1 + assert set(store.listdir()) == set(["data", "meta", "zarr.json"]) + assert set(store.listdir("meta/root/group1")) == set(["bar", "bar.group.json"]) + assert set(store.listdir("data/root/group1")) == set(["bar"]) + assert foo["bar"]["baz"][(0, 0, 0)] == 1 + + +@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") +class TestFSStoreV3WithKeySeparator(StoreV3Tests): + + def create_store(self, normalize_keys=False, key_separator=".", **kwargs): + + # Since the user is passing key_separator, that will take priority. + skip_if_nested_chunks(**kwargs) + + path = tempfile.mkdtemp() + atexit.register(atexit_rmtree, path) + return FSStoreV3( + path, + normalize_keys=normalize_keys, + key_separator=key_separator) + + +# TODO: remove NestedDirectoryStoreV3? 
+class TestNestedDirectoryStoreV3(TestNestedDirectoryStore, + TestDirectoryStoreV3): + + def create_store(self, normalize_keys=False, **kwargs): + path = tempfile.mkdtemp() + atexit.register(atexit_rmtree, path) + store = NestedDirectoryStoreV3(path, normalize_keys=normalize_keys, **kwargs) + return store + + def test_init_array(self): + store = self.create_store() + # assert store._dimension_separator == "/" + path = 'arr1' + init_array(store, path=path, shape=1000, chunks=100) + + # check metadata + array_meta_key = 'meta/root/' + path + '.array.json' + assert array_meta_key in store + meta = store._metadata_class.decode_array_metadata(store[array_meta_key]) + assert ZARR_FORMAT_v3 == meta['zarr_format'] + assert (1000,) == meta['shape'] + assert (100,) == meta['chunk_grid']['chunk_shape'] + assert np.dtype(None) == meta['data_type'] + # assert meta['dimension_separator'] == "/" + assert meta['chunk_grid']['separator'] == "/" + +# TODO: enable once N5StoreV3 has been implemented +# @pytest.mark.skipif(True, reason="N5StoreV3 not yet fully implemented") +# class TestN5StoreV3(TestN5Store, TestNestedDirectoryStoreV3, StoreV3Tests): + + +class TestZipStoreV3(TestZipStore, StoreV3Tests): + + def create_store(self, **kwargs): + path = tempfile.mktemp(suffix='.zip') + atexit.register(os.remove, path) + store = ZipStoreV3(path, mode='w', **kwargs) + return store + + def test_mode(self): + with ZipStoreV3('data/store.zip', mode='w') as store: + store['foo'] = b'bar' + store = ZipStoreV3('data/store.zip', mode='r') + with pytest.raises(PermissionError): + store['foo'] = b'bar' + with pytest.raises(PermissionError): + store.clear() + + +class TestDBMStoreV3(TestDBMStore, StoreV3Tests): + + def create_store(self, dimension_separator=None): + path = tempfile.mktemp(suffix='.anydbm') + atexit.register(atexit_rmglob, path + '*') + # create store using default dbm implementation + store = DBMStoreV3(path, flag='n', dimension_separator=dimension_separator) + return store + + +class TestDBMStoreV3Dumb(TestDBMStoreDumb, StoreV3Tests): + + def create_store(self, **kwargs): + path = tempfile.mktemp(suffix='.dumbdbm') + atexit.register(atexit_rmglob, path + '*') + + import dbm.dumb as dumbdbm + store = DBMStoreV3(path, flag='n', open=dumbdbm.open, **kwargs) + return store + + +class TestDBMStoreV3Gnu(TestDBMStoreGnu, StoreV3Tests): + + def create_store(self, **kwargs): + gdbm = pytest.importorskip("dbm.gnu") + path = tempfile.mktemp(suffix=".gdbm") # pragma: no cover + atexit.register(os.remove, path) # pragma: no cover + store = DBMStoreV3( + path, flag="n", open=gdbm.open, write_lock=False, **kwargs + ) # pragma: no cover + return store # pragma: no cover + + +class TestDBMStoreV3NDBM(TestDBMStoreNDBM, StoreV3Tests): + + def create_store(self, **kwargs): + ndbm = pytest.importorskip("dbm.ndbm") + path = tempfile.mktemp(suffix=".ndbm") # pragma: no cover + atexit.register(atexit_rmglob, path + "*") # pragma: no cover + store = DBMStoreV3(path, flag="n", open=ndbm.open, **kwargs) # pragma: no cover + return store # pragma: no cover + + +class TestDBMStoreV3BerkeleyDB(TestDBMStoreBerkeleyDB, StoreV3Tests): + + def create_store(self, **kwargs): + bsddb3 = pytest.importorskip("bsddb3") + path = tempfile.mktemp(suffix='.dbm') + atexit.register(os.remove, path) + store = DBMStoreV3(path, flag='n', open=bsddb3.btopen, write_lock=False, **kwargs) + return store + + +class TestLMDBStoreV3(TestLMDBStore, StoreV3Tests): + + def create_store(self, **kwargs): + pytest.importorskip("lmdb") + path = 
tempfile.mktemp(suffix='.lmdb') + atexit.register(atexit_rmtree, path) + buffers = True + store = LMDBStoreV3(path, buffers=buffers, **kwargs) + return store + + +class TestSQLiteStoreV3(TestSQLiteStore, StoreV3Tests): + + def create_store(self, **kwargs): + pytest.importorskip("sqlite3") + path = tempfile.mktemp(suffix='.db') + atexit.register(atexit_rmtree, path) + store = SQLiteStoreV3(path, **kwargs) + return store + + +class TestSQLiteStoreV3InMemory(TestSQLiteStoreInMemory, StoreV3Tests): + + def create_store(self, **kwargs): + pytest.importorskip("sqlite3") + store = SQLiteStoreV3(':memory:', **kwargs) + return store + + +@skip_test_env_var("ZARR_TEST_MONGO") +class TestMongoDBStoreV3(StoreV3Tests): + + def create_store(self, **kwargs): + pytest.importorskip("pymongo") + store = MongoDBStoreV3(host='127.0.0.1', database='zarr_tests', + collection='zarr_tests', **kwargs) + # start with an empty store + store.clear() + return store + + +@skip_test_env_var("ZARR_TEST_REDIS") +class TestRedisStoreV3(StoreV3Tests): + + def create_store(self, **kwargs): + # TODO: this is the default host for Redis on Travis, + # we probably want to generalize this though + pytest.importorskip("redis") + store = RedisStoreV3(host='localhost', port=6379, **kwargs) + # start with an empty store + store.clear() + return store + + +class TestLRUStoreCacheV3(TestLRUStoreCache, StoreV3Tests): + + def create_store(self, **kwargs): + # wrapper therefore no dimension_separator argument + skip_if_nested_chunks(**kwargs) + return LRUStoreCacheV3(dict(), max_size=2**27) + + def test_cache_values_no_max_size(self): + + # setup store + store = CountingDictV3() + store['foo'] = b'xxx' + store['bar'] = b'yyy' + assert 0 == store.counter['__getitem__', 'foo'] + assert 1 == store.counter['__setitem__', 'foo'] + assert 0 == store.counter['__getitem__', 'bar'] + assert 1 == store.counter['__setitem__', 'bar'] + + # setup cache + cache = LRUStoreCacheV3(store, max_size=None) + assert 0 == cache.hits + assert 0 == cache.misses + + # test first __getitem__, cache miss + assert b'xxx' == cache['foo'] + assert 1 == store.counter['__getitem__', 'foo'] + assert 1 == store.counter['__setitem__', 'foo'] + assert 0 == cache.hits + assert 1 == cache.misses + + # test second __getitem__, cache hit + assert b'xxx' == cache['foo'] + assert 1 == store.counter['__getitem__', 'foo'] + assert 1 == store.counter['__setitem__', 'foo'] + assert 1 == cache.hits + assert 1 == cache.misses + + # test __setitem__, __getitem__ + cache['foo'] = b'zzz' + assert 1 == store.counter['__getitem__', 'foo'] + assert 2 == store.counter['__setitem__', 'foo'] + # should be a cache hit + assert b'zzz' == cache['foo'] + assert 1 == store.counter['__getitem__', 'foo'] + assert 2 == store.counter['__setitem__', 'foo'] + assert 2 == cache.hits + assert 1 == cache.misses + + # manually invalidate all cached values + cache.invalidate_values() + assert b'zzz' == cache['foo'] + assert 2 == store.counter['__getitem__', 'foo'] + assert 2 == store.counter['__setitem__', 'foo'] + cache.invalidate() + assert b'zzz' == cache['foo'] + assert 3 == store.counter['__getitem__', 'foo'] + assert 2 == store.counter['__setitem__', 'foo'] + + # test __delitem__ + del cache['foo'] + with pytest.raises(KeyError): + # noinspection PyStatementEffect + cache['foo'] + with pytest.raises(KeyError): + # noinspection PyStatementEffect + store['foo'] + + # verify other keys untouched + assert 0 == store.counter['__getitem__', 'bar'] + assert 1 == store.counter['__setitem__', 'bar'] + + def 
test_cache_values_with_max_size(self): + + # setup store + store = CountingDictV3() + store['foo'] = b'xxx' + store['bar'] = b'yyy' + assert 0 == store.counter['__getitem__', 'foo'] + assert 0 == store.counter['__getitem__', 'bar'] + # setup cache - can only hold one item + cache = LRUStoreCacheV3(store, max_size=5) + assert 0 == cache.hits + assert 0 == cache.misses + + # test first 'foo' __getitem__, cache miss + assert b'xxx' == cache['foo'] + assert 1 == store.counter['__getitem__', 'foo'] + assert 0 == cache.hits + assert 1 == cache.misses + + # test second 'foo' __getitem__, cache hit + assert b'xxx' == cache['foo'] + assert 1 == store.counter['__getitem__', 'foo'] + assert 1 == cache.hits + assert 1 == cache.misses + + # test first 'bar' __getitem__, cache miss + assert b'yyy' == cache['bar'] + assert 1 == store.counter['__getitem__', 'bar'] + assert 1 == cache.hits + assert 2 == cache.misses + + # test second 'bar' __getitem__, cache hit + assert b'yyy' == cache['bar'] + assert 1 == store.counter['__getitem__', 'bar'] + assert 2 == cache.hits + assert 2 == cache.misses + + # test 'foo' __getitem__, should have been evicted, cache miss + assert b'xxx' == cache['foo'] + assert 2 == store.counter['__getitem__', 'foo'] + assert 2 == cache.hits + assert 3 == cache.misses + + # test 'bar' __getitem__, should have been evicted, cache miss + assert b'yyy' == cache['bar'] + assert 2 == store.counter['__getitem__', 'bar'] + assert 2 == cache.hits + assert 4 == cache.misses + + # setup store + store = CountingDictV3() + store['foo'] = b'xxx' + store['bar'] = b'yyy' + assert 0 == store.counter['__getitem__', 'foo'] + assert 0 == store.counter['__getitem__', 'bar'] + # setup cache - can hold two items + cache = LRUStoreCacheV3(store, max_size=6) + assert 0 == cache.hits + assert 0 == cache.misses + + # test first 'foo' __getitem__, cache miss + assert b'xxx' == cache['foo'] + assert 1 == store.counter['__getitem__', 'foo'] + assert 0 == cache.hits + assert 1 == cache.misses + + # test second 'foo' __getitem__, cache hit + assert b'xxx' == cache['foo'] + assert 1 == store.counter['__getitem__', 'foo'] + assert 1 == cache.hits + assert 1 == cache.misses + + # test first 'bar' __getitem__, cache miss + assert b'yyy' == cache['bar'] + assert 1 == store.counter['__getitem__', 'bar'] + assert 1 == cache.hits + assert 2 == cache.misses + + # test second 'bar' __getitem__, cache hit + assert b'yyy' == cache['bar'] + assert 1 == store.counter['__getitem__', 'bar'] + assert 2 == cache.hits + assert 2 == cache.misses + + # test 'foo' __getitem__, should still be cached + assert b'xxx' == cache['foo'] + assert 1 == store.counter['__getitem__', 'foo'] + assert 3 == cache.hits + assert 2 == cache.misses + + # test 'bar' __getitem__, should still be cached + assert b'yyy' == cache['bar'] + assert 1 == store.counter['__getitem__', 'bar'] + assert 4 == cache.hits + assert 2 == cache.misses + + def test_cache_keys(self): + + # setup + store = CountingDictV3() + store['foo'] = b'xxx' + store['bar'] = b'yyy' + assert 0 == store.counter['__contains__', 'foo'] + assert 0 == store.counter['__iter__'] + assert 0 == store.counter['keys'] + cache = LRUStoreCacheV3(store, max_size=None) + + # keys should be cached on first call + keys = sorted(cache.keys()) + assert keys == ['bar', 'foo'] + assert 1 == store.counter['keys'] + # keys should now be cached + assert keys == sorted(cache.keys()) + assert 1 == store.counter['keys'] + assert 'foo' in cache + assert 0 == store.counter['__contains__', 'foo'] + assert keys == 
sorted(cache) + assert 0 == store.counter['__iter__'] + assert 1 == store.counter['keys'] + + # cache should be cleared if store is modified - crude but simple for now + cache['baz'] = b'zzz' + keys = sorted(cache.keys()) + assert keys == ['bar', 'baz', 'foo'] + assert 2 == store.counter['keys'] + # keys should now be cached + assert keys == sorted(cache.keys()) + assert 2 == store.counter['keys'] + + # manually invalidate keys + cache.invalidate_keys() + keys = sorted(cache.keys()) + assert keys == ['bar', 'baz', 'foo'] + assert 3 == store.counter['keys'] + assert 0 == store.counter['__contains__', 'foo'] + assert 0 == store.counter['__iter__'] + cache.invalidate_keys() + keys = sorted(cache) + assert keys == ['bar', 'baz', 'foo'] + assert 4 == store.counter['keys'] + assert 0 == store.counter['__contains__', 'foo'] + assert 0 == store.counter['__iter__'] + cache.invalidate_keys() + assert 'foo' in cache + assert 5 == store.counter['keys'] + assert 0 == store.counter['__contains__', 'foo'] + assert 0 == store.counter['__iter__'] + + # check these would get counted if called directly + assert 'foo' in store + assert 1 == store.counter['__contains__', 'foo'] + assert keys == sorted(store) + assert 1 == store.counter['__iter__'] + + +# TODO: implement ABSStoreV3 +# @skip_test_env_var("ZARR_TEST_ABS") +# class TestABSStoreV3(TestABSStore, StoreV3Tests): diff --git a/zarr/tests/util.py b/zarr/tests/util.py index e0f11d72ad..bb4df90d1b 100644 --- a/zarr/tests/util.py +++ b/zarr/tests/util.py @@ -1,7 +1,7 @@ import collections import os -from zarr.storage import Store +from zarr.storage import Store, StoreV3 import pytest @@ -41,6 +41,10 @@ def __delitem__(self, key): del self.wrapped[key] +class CountingDictV3(CountingDict, StoreV3): + pass + + def skip_test_env_var(name): """ Checks for environment variables indicating whether tests requiring services should be run """ From 85b8e2322025a367a9f47329cfef9e8a095b0e87 Mon Sep 17 00:00:00 2001 From: Gregory Lee Date: Wed, 17 Nov 2021 13:58:52 -0500 Subject: [PATCH 2/7] add TODO comment to meta.py --- zarr/meta.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/zarr/meta.py b/zarr/meta.py index 07fbdcb7d4..d3f4ec50d5 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -22,6 +22,12 @@ ) _v3_core_type = {"bool", "i1", "u1"} | _v3_core_type +# TODO: How do we want to handle dtypes not officially in the v3 spec? +# Those in _v3_core_type above are the only ones defined in the spec. +# However we currently support many other dtypes for v2. For now, I also +# allow all of these for v3 unless the user sets an environment variable +# ZARR_V3_CORE_DTYPES_ONLY=1, etc. 
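Concretely, the opt-out described in the comment above works through the module-level toggles defined just below (a later commit in this series drops them again in favour of the spec's extension dtypes). A rough usage sketch, with the names exactly as they appear in this module:

    import os

    # The ZARR_V3_* values are read once, when zarr.meta is imported, so the
    # environment has to be set up before that import happens.
    os.environ["ZARR_V3_CORE_DTYPES_ONLY"] = "1"  # restrict v3 to the core dtypes
    os.environ["ZARR_V3_ALLOW_DATETIME"] = "1"    # ...but re-enable datetime64/timedelta64

    import zarr.meta
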
+ ZARR_V3_CORE_DTYPES_ONLY = int(os.environ.get("ZARR_V3_CORE_DTYPES_ONLY", False)) ZARR_V3_ALLOW_COMPLEX = int(os.environ.get("ZARR_V3_ALLOW_COMPLEX", not ZARR_V3_CORE_DTYPES_ONLY)) From 983d190dd7be8fb9b7db95cbb3d92c43517a71d3 Mon Sep 17 00:00:00 2001 From: Gregory Lee Date: Tue, 30 Nov 2021 14:37:47 -0500 Subject: [PATCH 3/7] fix flake8 errors --- zarr/_storage/store.py | 1 - zarr/tests/test_storage.py | 9 ++------- zarr/tests/test_storage_v3.py | 6 +++--- 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/zarr/_storage/store.py b/zarr/_storage/store.py index 0ff9e0c043..2c6d7b3978 100644 --- a/zarr/_storage/store.py +++ b/zarr/_storage/store.py @@ -1,4 +1,3 @@ -import json import sys from collections.abc import MutableMapping from string import ascii_letters, digits diff --git a/zarr/tests/test_storage.py b/zarr/tests/test_storage.py index 0865917926..1cad7f459f 100644 --- a/zarr/tests/test_storage.py +++ b/zarr/tests/test_storage.py @@ -18,9 +18,9 @@ from numcodecs.compat import ensure_bytes from zarr.codecs import BZ2, AsType, Blosc, Zlib -from zarr.errors import ContainsArrayError, ContainsGroupError, MetadataError +from zarr.errors import MetadataError from zarr.hierarchy import group -from zarr.meta import ZARR_FORMAT, ZARR_FORMAT_v3, decode_array_metadata +from zarr.meta import ZARR_FORMAT, decode_array_metadata from zarr.n5 import N5Store, N5FSStore from zarr.storage import (ABSStore, ConsolidatedMetadataStore, DBMStore, DictStore, DirectoryStore, KVStore, LMDBStore, @@ -31,12 +31,7 @@ attrs_key, default_compressor, getsize, group_meta_key, init_array, init_group, migrate_1to2) from zarr.storage import FSStore, rename, listdir -from zarr.storage import (KVStoreV3, MemoryStoreV3, ZipStoreV3, FSStoreV3, - DirectoryStoreV3, NestedDirectoryStoreV3, - RedisStoreV3, MongoDBStoreV3, DBMStoreV3, - LMDBStoreV3, SQLiteStoreV3, LRUStoreCacheV3) from zarr.tests.util import CountingDict, have_fsspec, skip_test_env_var, abs_container -from zarr.tests.util import CountingDictV3 @contextmanager diff --git a/zarr/tests/test_storage_v3.py b/zarr/tests/test_storage_v3.py index 9118bc513c..24f358c350 100644 --- a/zarr/tests/test_storage_v3.py +++ b/zarr/tests/test_storage_v3.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from numcodecs.compat import ensure_bytes - from zarr.codecs import Zlib from zarr.errors import ContainsArrayError, ContainsGroupError from zarr.meta import ZARR_FORMAT, ZARR_FORMAT_v3 @@ -25,9 +23,11 @@ TestDBMStoreNDBM, TestDBMStoreBerkeleyDB, TestLMDBStore, TestSQLiteStore, TestSQLiteStoreInMemory, TestLRUStoreCache, - dimension_separator_fixture, s3, skip_if_nested_chunks) +# pytest will fail to run if the following fixtures aren't imported here +from .test_storage import dimension_separator_fixture, s3 # noqa + @pytest.fixture(params=[ (None, "/"), From 9ed6181c41567fa99ef6fe1c3f73d89d62280706 Mon Sep 17 00:00:00 2001 From: Gregory Lee Date: Tue, 14 Dec 2021 19:10:04 -0500 Subject: [PATCH 4/7] follow zarr v3 spec when dealing with extension data types --- zarr/meta.py | 224 +++++++++++++++++++++++++-------------------------- 1 file changed, 111 insertions(+), 113 deletions(-) diff --git a/zarr/meta.py b/zarr/meta.py index d3f4ec50d5..72c6cfc869 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -1,6 +1,7 @@ import base64 import itertools import os +from collections import namedtuple from collections.abc import Mapping import numpy as np @@ -13,42 +14,72 @@ ZARR_FORMAT = 2 ZARR_FORMAT_v3 = 3 -FLOAT_FILLS = {"NaN": np.nan, "Infinity": np.PINF, "-Infinity": 
np.NINF} - - -_v3_core_type = set( - "".join(d) - for d in itertools.product("<>", ("u", "i", "f"), ("2", "4", "8")) -) -_v3_core_type = {"bool", "i1", "u1"} | _v3_core_type - -# TODO: How do we want to handle dtypes not officially in the v3 spec? -# Those in _v3_core_type above are the only ones defined in the spec. -# However we currently support many other dtypes for v2. For now, I also -# allow all of these for v3 unless the user sets an environment variable -# ZARR_V3_CORE_DTYPES_ONLY=1, etc. - -ZARR_V3_CORE_DTYPES_ONLY = int(os.environ.get("ZARR_V3_CORE_DTYPES_ONLY", False)) -ZARR_V3_ALLOW_COMPLEX = int(os.environ.get("ZARR_V3_ALLOW_COMPLEX", - not ZARR_V3_CORE_DTYPES_ONLY)) -ZARR_V3_ALLOW_DATETIME = int(os.environ.get("ZARR_V3_ALLOW_DATETIME", - not ZARR_V3_CORE_DTYPES_ONLY)) -ZARR_V3_ALLOW_STRUCTURED = int(os.environ.get("ZARR_V3_ALLOW_STRUCTURED", - not ZARR_V3_CORE_DTYPES_ONLY)) -ZARR_V3_ALLOW_OBJECTARRAY = int(os.environ.get("ZARR_V3_ALLOW_OBJECTARRAY", - not ZARR_V3_CORE_DTYPES_ONLY)) -ZARR_V3_ALLOW_BYTES_ARRAY = int(os.environ.get("ZARR_V3_ALLOW_BYTES_ARRAY", - not ZARR_V3_CORE_DTYPES_ONLY)) -ZARR_V3_ALLOW_UNICODE_ARRAY = int(os.environ.get("ZARR_V3_ALLOW_UNICODE_ARRAY", - not ZARR_V3_CORE_DTYPES_ONLY)) +# FLOAT_FILLS = {"NaN": np.nan, "Infinity": np.PINF, "-Infinity": np.NINF} _default_entry_point_metadata_v3 = { - 'zarr_format': "https://purl.org/zarr/spec/protocol/core/3.0", - 'metadata_encoding': "https://purl.org/zarr/spec/protocol/core/3.0", - 'metadata_key_suffix': '.json', + "zarr_format": "https://purl.org/zarr/spec/protocol/core/3.0", + "metadata_encoding": "https://purl.org/zarr/spec/protocol/core/3.0", + "metadata_key_suffix": ".json", "extensions": [], } +_v3_core_types = set( + "".join(d) for d in itertools.product("<>", ("u", "i", "f"), ("2", "4", "8")) +) +_v3_core_types = {"bool", "i1", "u1"} | _v3_core_types + +# The set of complex types allowed ({"c4", ">c8"}) +_v3_complex_types = set( + f"{end}c{_bytes}" for end, _bytes in itertools.product("<>", ("4", "8")) +) + +# All dtype.str values corresponding to datetime64 and timedelta64 +# see: https://numpy.org/doc/stable/reference/arrays.datetime.html#datetime-units +_date_units = ["Y", "M", "W", "D"] +_time_units = ["h", "m", "s", "ms", "us", "μs", "ns", "ps", "fs", "as"] +_v3_datetime_types = set(f"{end}{kind}8[{unit}]" for end, unit, kind in itertools.product("<>", _date_units + _time_units, ('m', 'M'))) + + +def get_extended_dtype_info(dtype): + if dtype.str in _v3_complex_types: + return dict( + extension="https://zarr-specs.readthedocs.io/en/core-protocol-v3.0-dev/protocol/extensions/complex-dtypes/v1.0.html", # noqa + type=dtype.str, + fallback=None, + ) + elif dtype.str == "|O": + return dict( + extension="TODO: object array protocol URL", # noqa + type=dtype.str, + fallback=None, + ) + elif dtype.str.startswith("|S"): + return dict( + extension="TODO: bytestring array protocol URL", # noqa + type=dtype.str, + fallback=None, + ) + elif dtype.str.startswith("|U"): + return dict( + extension="TODO: unicode array protocol URL", # noqa + type=dtype.str, + fallback=None, + ) + elif dtype.str.startswith("|V"): + return dict( + extension="TODO: structured array protocol URL", # noqa + type=dtype.descr, + fallback=None, + ) + elif dtype.str in _v3_datetime_types: + return dict( + extension="https://zarr-specs.readthedocs.io/en/core-protocol-v3.0-dev/protocol/extensions/datetime-dtypes/v1.0.html", # noqa + type=dtype.str, + fallback=None, + ) + else: + raise ValueError(f"Unsupport dtype: {dtype}") + class 
Metadata2: ZARR_FORMAT = ZARR_FORMAT @@ -85,12 +116,13 @@ def decode_array_metadata(cls, s: Union[MappingType, str]) -> MappingType[str, A dtype = cls.decode_dtype(meta["dtype"]) if dtype.hasobject: import numcodecs - object_codec = numcodecs.get_codec(meta['filters'][0]) + + object_codec = numcodecs.get_codec(meta["filters"][0]) else: object_codec = None dimension_separator = meta.get("dimension_separator", None) - fill_value = cls.decode_fill_value(meta['fill_value'], dtype, object_codec) + fill_value = cls.decode_fill_value(meta["fill_value"], dtype, object_codec) meta = dict( zarr_format=meta["zarr_format"], shape=tuple(meta["shape"]), @@ -102,7 +134,7 @@ def decode_array_metadata(cls, s: Union[MappingType, str]) -> MappingType[str, A filters=meta["filters"], ) if dimension_separator: - meta['dimension_separator'] = dimension_separator + meta["dimension_separator"] = dimension_separator except Exception as e: raise MetadataError("error decoding metadata") from e else: @@ -118,7 +150,8 @@ def encode_array_metadata(cls, meta: MappingType[str, Any]) -> bytes: dimension_separator = meta.get("dimension_separator") if dtype.hasobject: import numcodecs - object_codec = numcodecs.get_codec(meta['filters'][0]) + + object_codec = numcodecs.get_codec(meta["filters"][0]) else: object_codec = None @@ -133,7 +166,7 @@ def encode_array_metadata(cls, meta: MappingType[str, Any]) -> bytes: filters=meta["filters"], ) if dimension_separator: - meta['dimension_separator'] = dimension_separator + meta["dimension_separator"] = dimension_separator if dimension_separator: meta["dimension_separator"] = dimension_separator @@ -180,13 +213,15 @@ def encode_group_metadata(cls, meta=None) -> bytes: return json_dumps(meta) @classmethod - def decode_fill_value(cls, v: Any, dtype: np.dtype, object_codec: Any = None) -> Any: + def decode_fill_value( + cls, v: Any, dtype: np.dtype, object_codec: Any = None + ) -> Any: # early out if v is None: return v - if dtype.kind == 'V' and dtype.hasobject: + if dtype.kind == "V" and dtype.hasobject: if object_codec is None: - raise ValueError('missing object_codec for object array') + raise ValueError("missing object_codec for object array") v = base64.standard_b64decode(v) v = object_codec.decode(v) v = np.array(v, dtype=dtype)[()] @@ -228,15 +263,17 @@ def decode_fill_value(cls, v: Any, dtype: np.dtype, object_codec: Any = None) -> return np.array(v, dtype=dtype)[()] @classmethod - def encode_fill_value(cls, v: Any, dtype: np.dtype, object_codec: Any = None) -> Any: + def encode_fill_value( + cls, v: Any, dtype: np.dtype, object_codec: Any = None + ) -> Any: # early out if v is None: return v - if dtype.kind == 'V' and dtype.hasobject: + if dtype.kind == "V" and dtype.hasobject: if object_codec is None: - raise ValueError('missing object_codec for object array') + raise ValueError("missing object_codec for object array") v = object_codec.encode(v) - v = str(base64.standard_b64encode(v), 'ascii') + v = str(base64.standard_b64encode(v), "ascii") return v if dtype.kind == "f": if np.isnan(v): @@ -253,8 +290,10 @@ def encode_fill_value(cls, v: Any, dtype: np.dtype, object_codec: Any = None) -> return bool(v) elif dtype.kind in "c": c = cast(np.complex128, np.dtype(complex).type()) - v = (cls.encode_fill_value(v.real, c.real.dtype, object_codec), - cls.encode_fill_value(v.imag, c.imag.dtype, object_codec)) + v = ( + cls.encode_fill_value(v.real, c.real.dtype, object_codec), + cls.encode_fill_value(v.imag, c.imag.dtype, object_codec), + ) return v elif dtype.kind in "SV": v = 
str(base64.standard_b64encode(v), "ascii") @@ -272,74 +311,29 @@ class Metadata3(Metadata2): @classmethod def decode_dtype(cls, d): + if isinstance(d, dict): + # extract the type from the extension info + info = get_extended_dtype_info(d) + d = info['type'] d = cls._decode_dtype_descr(d) dtype = np.dtype(d) - if dtype.kind == 'c': - if not ZARR_V3_ALLOW_COMPLEX: - raise ValueError("complex-valued arrays not supported") - elif dtype.kind in 'mM': - if not ZARR_V3_ALLOW_DATETIME: - raise ValueError( - "datetime64 and timedelta64 arrays not supported" - ) - elif dtype.kind == 'O': - if not ZARR_V3_ALLOW_OBJECTARRAY: - raise ValueError("object arrays not supported") - elif dtype.kind == 'V': - if not ZARR_V3_ALLOW_STRUCTURED: - raise ValueError("structured arrays not supported") - elif dtype.kind == 'U': - if not ZARR_V3_ALLOW_UNICODE_ARRAY: - raise ValueError("unicode arrays not supported") - elif dtype.kind == 'S': - if not ZARR_V3_ALLOW_BYTES_ARRAY: - raise ValueError("bytes arrays not supported") - else: - assert d in _v3_core_type return dtype @classmethod def encode_dtype(cls, d): - s = Metadata2.encode_dtype(d) + s = d.str if s == "|b1": return "bool" elif s == "|u1": return "u1" elif s == "|i1": return "i1" - dtype = np.dtype(d) - if dtype.kind == "c": - if not ZARR_V3_ALLOW_COMPLEX: - raise ValueError( - "complex-valued arrays not part of the base v3 spec" - ) - elif dtype.kind in "mM": - if not ZARR_V3_ALLOW_DATETIME: - raise ValueError( - "datetime64 and timedelta64 not part of the base v3 " - "spec" - ) - elif dtype.kind == "O": - if not ZARR_V3_ALLOW_OBJECTARRAY: - raise ValueError( - "object dtypes are not part of the base v3 spec" - ) - elif dtype.kind == "V": - if not ZARR_V3_ALLOW_STRUCTURED: - raise ValueError( - "structured arrays are not part of the base v3 spec" - ) - elif dtype.kind == 'U': - if not ZARR_V3_ALLOW_UNICODE_ARRAY: - raise ValueError("unicode dtypes are not part of the base v3 " - "spec") - elif dtype.kind == 'S': - if not ZARR_V3_ALLOW_BYTES_ARRAY: - raise ValueError("bytes dtypes are not part of the base v3 " - "spec") + elif s in _v3_core_types: + return Metadata2.encode_dtype(d) else: - assert s in _v3_core_type - return s + # Check if this dtype corresponds to a supported extension to + # the v3 protocol. + return get_extended_dtype_info(np.dtype(d)) @classmethod def decode_group_metadata(cls, s: Union[MappingType, str]) -> MappingType[str, Any]: @@ -350,7 +344,7 @@ def decode_group_metadata(cls, s: Union[MappingType, str]) -> MappingType[str, A # if zarr_format != cls.ZARR_FORMAT: # raise MetadataError("unsupported zarr format: %s" % zarr_format) - assert 'attributes' in meta + assert "attributes" in meta # meta = dict(attributes=meta['attributes']) return meta @@ -362,7 +356,7 @@ def encode_group_metadata(cls, meta=None) -> bytes: # entry point metadata instead # meta = dict(zarr_format=cls.ZARR_FORMAT) if meta is None: - meta = {'attributes': {}} + meta = {"attributes": {}} meta = dict(attributes=meta.get("attributes", {})) return json_dumps(meta) @@ -371,26 +365,28 @@ def encode_hierarchy_metadata(cls, meta=None) -> bytes: if meta is None: meta = _default_entry_point_metadata_v3 elif set(meta.keys()) != { - "zarr_format", - "metadata_encoding", - "metadata_key_suffix", - "extensions", + "zarr_format", + "metadata_encoding", + "metadata_key_suffix", + "extensions", }: raise ValueError(f"Unexpected keys in metadata. 
meta={meta}") return json_dumps(meta) @classmethod - def decode_hierarchy_metadata(cls, s: Union[MappingType, str]) -> MappingType[str, Any]: + def decode_hierarchy_metadata( + cls, s: Union[MappingType, str] + ) -> MappingType[str, Any]: meta = cls.parse_metadata(s) # check metadata format # zarr_format = meta.get("zarr_format", None) # if zarr_format != "https://purl.org/zarr/spec/protocol/core/3.0": # raise MetadataError("unsupported zarr format: %s" % zarr_format) if set(meta.keys()) != { - "zarr_format", - "metadata_encoding", - "metadata_key_suffix", - "extensions", + "zarr_format", + "metadata_encoding", + "metadata_key_suffix", + "extensions", }: raise ValueError(f"Unexpected keys in metdata. meta={meta}") return meta @@ -409,7 +405,8 @@ def decode_array_metadata(cls, s: Union[MappingType, str]) -> MappingType[str, A dtype = cls.decode_dtype(meta["data_type"]) if dtype.hasobject: import numcodecs - object_codec = numcodecs.get_codec(meta['attributes']['filters'][0]) + + object_codec = numcodecs.get_codec(meta["attributes"]["filters"][0]) else: object_codec = None fill_value = cls.decode_fill_value(meta["fill_value"], dtype, object_codec) @@ -446,7 +443,8 @@ def encode_array_metadata(cls, meta: MappingType[str, Any]) -> bytes: dimension_separator = meta.get("dimension_separator") if dtype.hasobject: import numcodecs - object_codec = numcodecs.get_codec(meta['attributes']['filters'][0]) + + object_codec = numcodecs.get_codec(meta["attributes"]["filters"][0]) else: object_codec = None meta = dict( From 662e310fb5fe01500a10c5f02c358ebda559a5ec Mon Sep 17 00:00:00 2001 From: Gregory Lee Date: Tue, 14 Dec 2021 19:40:07 -0500 Subject: [PATCH 5/7] fixes to v3 dtype handling --- zarr/meta.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/zarr/meta.py b/zarr/meta.py index 72c6cfc869..bdb7dd9702 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -28,9 +28,9 @@ ) _v3_core_types = {"bool", "i1", "u1"} | _v3_core_types -# The set of complex types allowed ({"c4", ">c8"}) +# The set of complex types allowed ({"c8", ">c16"}) _v3_complex_types = set( - f"{end}c{_bytes}" for end, _bytes in itertools.product("<>", ("4", "8")) + f"{end}c{_bytes}" for end, _bytes in itertools.product("<>", ("8", "16")) ) # All dtype.str values corresponding to datetime64 and timedelta64 @@ -310,13 +310,24 @@ class Metadata3(Metadata2): ZARR_FORMAT = ZARR_FORMAT_v3 @classmethod - def decode_dtype(cls, d): + def decode_dtype(cls, d, validate=True): if isinstance(d, dict): # extract the type from the extension info - info = get_extended_dtype_info(d) - d = info['type'] + try: + d = d['type'] + except KeyError: + raise KeyError( + "Extended dtype info must provide a key named 'type'." 
+ ) d = cls._decode_dtype_descr(d) dtype = np.dtype(d) + if validate: + if dtype.str in (_v3_core_types | {"|b1", "|u1", "|i1"}): + # it is a core dtype of the v3 spec + pass + else: + # will raise if this is not a recognized extended dtype + get_extended_dtype_info(dtype) return dtype @classmethod From 450c57506a7427692a55acf098a57a10152dda88 Mon Sep 17 00:00:00 2001 From: Gregory Lee Date: Wed, 15 Dec 2021 09:40:50 -0500 Subject: [PATCH 6/7] flake8 cleanup --- zarr/meta.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/zarr/meta.py b/zarr/meta.py index bdb7dd9702..3d56e16fd3 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -1,7 +1,5 @@ import base64 import itertools -import os -from collections import namedtuple from collections.abc import Mapping import numpy as np @@ -37,7 +35,10 @@ # see: https://numpy.org/doc/stable/reference/arrays.datetime.html#datetime-units _date_units = ["Y", "M", "W", "D"] _time_units = ["h", "m", "s", "ms", "us", "μs", "ns", "ps", "fs", "as"] -_v3_datetime_types = set(f"{end}{kind}8[{unit}]" for end, unit, kind in itertools.product("<>", _date_units + _time_units, ('m', 'M'))) +_v3_datetime_types = set( + f"{end}{kind}8[{unit}]" + for end, unit, kind in itertools.product("<>", _date_units + _time_units, ('m', 'M')) +) def get_extended_dtype_info(dtype): @@ -322,7 +323,7 @@ def decode_dtype(cls, d, validate=True): d = cls._decode_dtype_descr(d) dtype = np.dtype(d) if validate: - if dtype.str in (_v3_core_types | {"|b1", "|u1", "|i1"}): + if dtype.str in (_v3_core_types | {"|b1", "|u1", "|i1"}): # it is a core dtype of the v3 spec pass else: From 27579c3645464d47b1d3cf0ef2a3b10446d9d080 Mon Sep 17 00:00:00 2001 From: Gregory Lee Date: Tue, 30 Nov 2021 14:59:43 -0500 Subject: [PATCH 7/7] update Attributes, adding StoreV3 support avoid pytest error about missing fixture fix flake8 error related to zarr_version fixture --- zarr/attrs.py | 68 +++++++++++++++++---- zarr/tests/test_attrs.py | 125 +++++++++++++++++++++++++-------------- zarr/tests/test_sync.py | 3 +- 3 files changed, 141 insertions(+), 55 deletions(-) diff --git a/zarr/attrs.py b/zarr/attrs.py index eff1237db1..78c26461c4 100644 --- a/zarr/attrs.py +++ b/zarr/attrs.py @@ -1,6 +1,6 @@ from collections.abc import MutableMapping -from zarr._storage.store import Store +from zarr._storage.store import Store, StoreV3 from zarr.util import json_dumps @@ -26,7 +26,15 @@ class Attributes(MutableMapping): def __init__(self, store, key='.zattrs', read_only=False, cache=True, synchronizer=None): - self.store = Store._ensure_store(store) + + self._version = getattr(store, '_store_version', 2) + assert key + + if self._version == 3 and '.z' in key: + raise ValueError('invalid v3 key') + + _Store = Store if self._version == 2 else StoreV3 + self.store = _Store._ensure_store(store) self.key = key self.read_only = read_only self.cache = cache @@ -38,6 +46,8 @@ def _get_nosync(self): data = self.store[self.key] except KeyError: d = dict() + if self._version > 2: + d['attributes'] = {} else: d = self.store._metadata_class.parse_metadata(data) return d @@ -47,6 +57,8 @@ def asdict(self): if self.cache and self._cached_asdict is not None: return self._cached_asdict d = self._get_nosync() + if self._version == 3: + d = d['attributes'] if self.cache: self._cached_asdict = d return d @@ -54,7 +66,10 @@ def asdict(self): def refresh(self): """Refresh cached attributes from the store.""" if self.cache: - self._cached_asdict = self._get_nosync() + if self._version == 3: + self._cached_asdict = 
self._get_nosync()['attributes'] + else: + self._cached_asdict = self._get_nosync() def __contains__(self, x): return x in self.asdict() @@ -84,7 +99,10 @@ def _setitem_nosync(self, item, value): d = self._get_nosync() # set key value - d[item] = value + if self._version == 2: + d[item] = value + else: + d['attributes'][item] = value # _put modified data self._put_nosync(d) @@ -98,7 +116,10 @@ def _delitem_nosync(self, key): d = self._get_nosync() # delete key value - del d[key] + if self._version == 2: + del d[key] + else: + del d['attributes'][key] # _put modified data self._put_nosync(d) @@ -106,12 +127,34 @@ def _delitem_nosync(self, key): def put(self, d): """Overwrite all attributes with the key/value pairs in the provided dictionary `d` in a single operation.""" - self._write_op(self._put_nosync, d) + if self._version == 2: + self._write_op(self._put_nosync, d) + else: + self._write_op(self._put_nosync, dict(attributes=d)) def _put_nosync(self, d): - self.store[self.key] = json_dumps(d) - if self.cache: - self._cached_asdict = d + if self._version == 2: + self.store[self.key] = json_dumps(d) + if self.cache: + self._cached_asdict = d + else: + if self.key in self.store: + # Cannot write the attributes directly to JSON, but have to + # store it within the pre-existing attributes key of the v3 + # metadata. + + # Note: this changes the store.counter result in test_caching_on! + + meta = self.store._metadata_class.parse_metadata(self.store[self.key]) + if 'attributes' in meta and 'filters' in meta['attributes']: + # need to preserve any existing "filters" attribute + d['attributes']['filters'] = meta['attributes']['filters'] + meta['attributes'] = d['attributes'] + else: + meta = d + self.store[self.key] = json_dumps(meta) + if self.cache: + self._cached_asdict = d['attributes'] # noinspection PyMethodOverriding def update(self, *args, **kwargs): @@ -124,7 +167,12 @@ def _update_nosync(self, *args, **kwargs): d = self._get_nosync() # update - d.update(*args, **kwargs) + if self._version == 2: + d.update(*args, **kwargs) + else: + if 'attributes' not in d: + d['attributes'] = {} + d['attributes'].update(*args, **kwargs) # _put modified data self._put_nosync(d) diff --git a/zarr/tests/test_attrs.py b/zarr/tests/test_attrs.py index b2de736d4a..62faf662da 100644 --- a/zarr/tests/test_attrs.py +++ b/zarr/tests/test_attrs.py @@ -3,8 +3,20 @@ import pytest from zarr.attrs import Attributes -from zarr.tests.util import CountingDict -from zarr.storage import KVStore +from zarr.storage import KVStore, KVStoreV3 +from zarr.tests.util import CountingDict, CountingDictV3 + + +@pytest.fixture(params=[2, 3]) +def zarr_version(request): + return request.param + + +def _init_store(version): + """Use a plain dict() for v2, but KVStoreV3 otherwise.""" + if version == 2: + return dict() + return KVStoreV3(dict()) class TestAttributes(): @@ -12,13 +24,9 @@ class TestAttributes(): def init_attributes(self, store, read_only=False, cache=True): return Attributes(store, key='attrs', read_only=read_only, cache=cache) - @pytest.mark.parametrize('store_from_dict', [False, True]) - def test_storage(self, store_from_dict): + def test_storage(self, zarr_version): - if store_from_dict: - store = dict() - else: - store = KVStore(dict()) + store = _init_store(zarr_version) a = Attributes(store=store, key='attrs') assert isinstance(a.store, KVStore) assert 'foo' not in a @@ -30,11 +38,14 @@ def test_storage(self, store_from_dict): assert 'attrs' in store assert isinstance(store['attrs'], bytes) d = 
json.loads(str(store['attrs'], 'ascii')) + if zarr_version == 3: + d = d['attributes'] assert dict(foo='bar', baz=42) == d - def test_get_set_del_contains(self): + def test_get_set_del_contains(self, zarr_version): - a = self.init_attributes(dict()) + store = _init_store(zarr_version) + a = self.init_attributes(store) assert 'foo' not in a a['foo'] = 'bar' a['baz'] = 42 @@ -48,9 +59,10 @@ def test_get_set_del_contains(self): # noinspection PyStatementEffect a['foo'] - def test_update_put(self): + def test_update_put(self, zarr_version): - a = self.init_attributes(dict()) + store = _init_store(zarr_version) + a = self.init_attributes(store) assert 'foo' not in a assert 'bar' not in a assert 'baz' not in a @@ -65,9 +77,10 @@ def test_update_put(self): assert a['bar'] == 84 assert 'baz' not in a - def test_iterators(self): + def test_iterators(self, zarr_version): - a = self.init_attributes(dict()) + store = _init_store(zarr_version) + a = self.init_attributes(store) assert 0 == len(a) assert set() == set(a) assert set() == set(a.keys()) @@ -83,10 +96,13 @@ def test_iterators(self): assert {'bar', 42} == set(a.values()) assert {('foo', 'bar'), ('baz', 42)} == set(a.items()) - def test_read_only(self): - store = dict() + def test_read_only(self, zarr_version): + store = _init_store(zarr_version) a = self.init_attributes(store, read_only=True) - store['attrs'] = json.dumps(dict(foo='bar', baz=42)).encode('ascii') + if zarr_version == 2: + store['attrs'] = json.dumps(dict(foo='bar', baz=42)).encode('ascii') + else: + store['attrs'] = json.dumps(dict(attributes=dict(foo='bar', baz=42))).encode('ascii') assert a['foo'] == 'bar' assert a['baz'] == 42 with pytest.raises(PermissionError): @@ -96,8 +112,9 @@ def test_read_only(self): with pytest.raises(PermissionError): a.update(foo='quux') - def test_key_completions(self): - a = self.init_attributes(dict()) + def test_key_completions(self, zarr_version): + store = _init_store(zarr_version) + a = self.init_attributes(store) d = a._ipython_key_completions_() assert 'foo' not in d assert '123' not in d @@ -112,14 +129,17 @@ def test_key_completions(self): assert 'asdf;' in d assert 'baz' not in d - def test_caching_on(self): + def test_caching_on(self, zarr_version): # caching is turned on by default # setup store - store = CountingDict() + store = CountingDict() if zarr_version == 2 else CountingDictV3() assert 0 == store.counter['__getitem__', 'attrs'] assert 0 == store.counter['__setitem__', 'attrs'] - store['attrs'] = json.dumps(dict(foo='xxx', bar=42)).encode('ascii') + if zarr_version == 2: + store['attrs'] = json.dumps(dict(foo='xxx', bar=42)).encode('ascii') + else: + store['attrs'] = json.dumps(dict(attributes=dict(foo='xxx', bar=42))).encode('ascii') assert 0 == store.counter['__getitem__', 'attrs'] assert 1 == store.counter['__setitem__', 'attrs'] @@ -136,54 +156,65 @@ def test_caching_on(self): # test __setitem__ updates the cache a['foo'] = 'yyy' - assert 2 == store.counter['__getitem__', 'attrs'] + get_cnt = 2 if zarr_version == 2 else 3 + assert get_cnt == store.counter['__getitem__', 'attrs'] assert 2 == store.counter['__setitem__', 'attrs'] assert a['foo'] == 'yyy' - assert 2 == store.counter['__getitem__', 'attrs'] + assert get_cnt == store.counter['__getitem__', 'attrs'] assert 2 == store.counter['__setitem__', 'attrs'] # test update() updates the cache a.update(foo='zzz', bar=84) - assert 3 == store.counter['__getitem__', 'attrs'] + get_cnt = 3 if zarr_version == 2 else 5 + assert get_cnt == store.counter['__getitem__', 'attrs'] 
assert 3 == store.counter['__setitem__', 'attrs'] assert a['foo'] == 'zzz' assert a['bar'] == 84 - assert 3 == store.counter['__getitem__', 'attrs'] + assert get_cnt == store.counter['__getitem__', 'attrs'] assert 3 == store.counter['__setitem__', 'attrs'] # test __contains__ uses the cache assert 'foo' in a - assert 3 == store.counter['__getitem__', 'attrs'] + assert get_cnt == store.counter['__getitem__', 'attrs'] assert 3 == store.counter['__setitem__', 'attrs'] assert 'spam' not in a - assert 3 == store.counter['__getitem__', 'attrs'] + assert get_cnt == store.counter['__getitem__', 'attrs'] assert 3 == store.counter['__setitem__', 'attrs'] # test __delitem__ updates the cache del a['bar'] - assert 4 == store.counter['__getitem__', 'attrs'] + get_cnt = 4 if zarr_version == 2 else 7 + assert get_cnt == store.counter['__getitem__', 'attrs'] assert 4 == store.counter['__setitem__', 'attrs'] assert 'bar' not in a - assert 4 == store.counter['__getitem__', 'attrs'] + assert get_cnt == store.counter['__getitem__', 'attrs'] assert 4 == store.counter['__setitem__', 'attrs'] # test refresh() - store['attrs'] = json.dumps(dict(foo='xxx', bar=42)).encode('ascii') - assert 4 == store.counter['__getitem__', 'attrs'] + if zarr_version == 2: + store['attrs'] = json.dumps(dict(foo='xxx', bar=42)).encode('ascii') + else: + store['attrs'] = json.dumps(dict(attributes=dict(foo='xxx', bar=42))).encode('ascii') + assert get_cnt == store.counter['__getitem__', 'attrs'] a.refresh() - assert 5 == store.counter['__getitem__', 'attrs'] + get_cnt = 5 if zarr_version == 2 else 8 + assert get_cnt == store.counter['__getitem__', 'attrs'] assert a['foo'] == 'xxx' - assert 5 == store.counter['__getitem__', 'attrs'] + assert get_cnt == store.counter['__getitem__', 'attrs'] assert a['bar'] == 42 - assert 5 == store.counter['__getitem__', 'attrs'] + assert get_cnt == store.counter['__getitem__', 'attrs'] - def test_caching_off(self): + def test_caching_off(self, zarr_version): # setup store - store = CountingDict() + store = CountingDict() if zarr_version == 2 else CountingDictV3() assert 0 == store.counter['__getitem__', 'attrs'] assert 0 == store.counter['__setitem__', 'attrs'] - store['attrs'] = json.dumps(dict(foo='xxx', bar=42)).encode('ascii') + + if zarr_version == 2: + store['attrs'] = json.dumps(dict(foo='xxx', bar=42)).encode('ascii') + else: + store['attrs'] = json.dumps(dict(attributes=dict(foo='xxx', bar=42))).encode('ascii') assert 0 == store.counter['__getitem__', 'attrs'] assert 1 == store.counter['__setitem__', 'attrs'] @@ -200,25 +231,31 @@ def test_caching_off(self): # test __setitem__ a['foo'] = 'yyy' - assert 4 == store.counter['__getitem__', 'attrs'] + get_cnt = 4 if zarr_version == 2 else 5 + assert get_cnt == store.counter['__getitem__', 'attrs'] assert 2 == store.counter['__setitem__', 'attrs'] assert a['foo'] == 'yyy' - assert 5 == store.counter['__getitem__', 'attrs'] + get_cnt = 5 if zarr_version == 2 else 6 + assert get_cnt == store.counter['__getitem__', 'attrs'] assert 2 == store.counter['__setitem__', 'attrs'] # test update() a.update(foo='zzz', bar=84) - assert 6 == store.counter['__getitem__', 'attrs'] + get_cnt = 6 if zarr_version == 2 else 8 + assert get_cnt == store.counter['__getitem__', 'attrs'] assert 3 == store.counter['__setitem__', 'attrs'] assert a['foo'] == 'zzz' assert a['bar'] == 84 - assert 8 == store.counter['__getitem__', 'attrs'] + get_cnt = 8 if zarr_version == 2 else 10 + assert get_cnt == store.counter['__getitem__', 'attrs'] assert 3 == store.counter['__setitem__', 
'attrs'] # test __contains__ assert 'foo' in a - assert 9 == store.counter['__getitem__', 'attrs'] + get_cnt = 9 if zarr_version == 2 else 11 + assert get_cnt == store.counter['__getitem__', 'attrs'] assert 3 == store.counter['__setitem__', 'attrs'] assert 'spam' not in a - assert 10 == store.counter['__getitem__', 'attrs'] + get_cnt = 10 if zarr_version == 2 else 12 + assert get_cnt == store.counter['__getitem__', 'attrs'] assert 3 == store.counter['__setitem__', 'attrs'] diff --git a/zarr/tests/test_sync.py b/zarr/tests/test_sync.py index 69fc0d7708..1a763dc7f7 100644 --- a/zarr/tests/test_sync.py +++ b/zarr/tests/test_sync.py @@ -15,7 +15,8 @@ from zarr.storage import (DirectoryStore, KVStore, atexit_rmtree, init_array, init_group) from zarr.sync import ProcessSynchronizer, ThreadSynchronizer -from zarr.tests.test_attrs import TestAttributes +# zarr_version fixture must be imported although not used directly here +from zarr.tests.test_attrs import TestAttributes, zarr_version # noqa from zarr.tests.test_core import TestArray from zarr.tests.test_hierarchy import TestGroup
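
For reference, the v3 behaviour of Attributes that the updated tests pin down amounts to the following minimal sketch (assuming a plain in-memory KVStoreV3, as in the _init_store helper above):

    import json
    from zarr.attrs import Attributes
    from zarr.storage import KVStoreV3

    store = KVStoreV3(dict())
    a = Attributes(store, key='attrs')
    a['foo'] = 'bar'

    # With a v3 store the user-visible attributes are nested under an
    # "attributes" key of the stored JSON document, rather than forming the
    # document itself as they do for v2 stores.
    assert json.loads(str(store['attrs'], 'ascii')) == {'attributes': {'foo': 'bar'}}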