From 0edc39b51941d136365a8535b55621db3e246e70 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 17 Oct 2024 23:28:28 +0200 Subject: [PATCH 01/22] move v3/tests to tests and fix various mypy issues --- pyproject.toml | 23 +++++++++---------- src/zarr/testing/utils.py | 17 ++++++++++---- tests/{v3 => }/conftest.py | 0 .../entry_points.txt | 0 .../package_with_entrypoint/__init__.py | 0 tests/{v3 => }/test_api.py | 0 tests/{v3 => }/test_array.py | 0 tests/{v3 => }/test_attributes.py | 0 tests/{v3 => }/test_buffer.py | 0 tests/{v3 => }/test_chunk_grids.py | 0 tests/{v3 => }/test_codec_entrypoints.py | 0 tests/{v3 => test_codecs}/__init__.py | 0 tests/{v3 => }/test_codecs/test_blosc.py | 0 tests/{v3 => }/test_codecs/test_codecs.py | 0 tests/{v3 => }/test_codecs/test_endian.py | 0 tests/{v3 => }/test_codecs/test_gzip.py | 0 tests/{v3 => }/test_codecs/test_sharding.py | 0 tests/{v3 => }/test_codecs/test_transpose.py | 0 tests/{v3 => }/test_codecs/test_vlen.py | 0 tests/{v3 => }/test_codecs/test_zstd.py | 0 tests/{v3 => }/test_common.py | 0 tests/{v3 => }/test_config.py | 0 tests/{v3 => }/test_group.py | 0 tests/{v3 => }/test_indexing.py | 0 .../test_codecs => test_metadata}/__init__.py | 0 .../test_metadata/test_consolidated.py | 0 tests/{v3 => }/test_metadata/test_v2.py | 0 tests/{v3 => }/test_metadata/test_v3.py | 0 tests/{v3 => }/test_properties.py | 0 .../test_metadata => test_store}/__init__.py | 0 tests/{v3 => }/test_store/test_core.py | 0 tests/{v3 => }/test_store/test_local.py | 0 tests/{v3 => }/test_store/test_logging.py | 0 tests/{v3 => }/test_store/test_memory.py | 0 tests/{v3 => }/test_store/test_remote.py | 0 .../test_store/test_stateful_store.py | 0 tests/{v3 => }/test_store/test_zip.py | 0 tests/{v3 => }/test_sync.py | 0 tests/{v3 => }/test_v2.py | 0 tests/v3/test_store/__init__.py | 0 40 files changed, 24 insertions(+), 16 deletions(-) rename tests/{v3 => }/conftest.py (100%) rename tests/{v3 => 
}/package_with_entrypoint-0.1.dist-info/entry_points.txt (100%) rename tests/{v3 => }/package_with_entrypoint/__init__.py (100%) rename tests/{v3 => }/test_api.py (100%) rename tests/{v3 => }/test_array.py (100%) rename tests/{v3 => }/test_attributes.py (100%) rename tests/{v3 => }/test_buffer.py (100%) rename tests/{v3 => }/test_chunk_grids.py (100%) rename tests/{v3 => }/test_codec_entrypoints.py (100%) rename tests/{v3 => test_codecs}/__init__.py (100%) rename tests/{v3 => }/test_codecs/test_blosc.py (100%) rename tests/{v3 => }/test_codecs/test_codecs.py (100%) rename tests/{v3 => }/test_codecs/test_endian.py (100%) rename tests/{v3 => }/test_codecs/test_gzip.py (100%) rename tests/{v3 => }/test_codecs/test_sharding.py (100%) rename tests/{v3 => }/test_codecs/test_transpose.py (100%) rename tests/{v3 => }/test_codecs/test_vlen.py (100%) rename tests/{v3 => }/test_codecs/test_zstd.py (100%) rename tests/{v3 => }/test_common.py (100%) rename tests/{v3 => }/test_config.py (100%) rename tests/{v3 => }/test_group.py (100%) rename tests/{v3 => }/test_indexing.py (100%) rename tests/{v3/test_codecs => test_metadata}/__init__.py (100%) rename tests/{v3 => }/test_metadata/test_consolidated.py (100%) rename tests/{v3 => }/test_metadata/test_v2.py (100%) rename tests/{v3 => }/test_metadata/test_v3.py (100%) rename tests/{v3 => }/test_properties.py (100%) rename tests/{v3/test_metadata => test_store}/__init__.py (100%) rename tests/{v3 => }/test_store/test_core.py (100%) rename tests/{v3 => }/test_store/test_local.py (100%) rename tests/{v3 => }/test_store/test_logging.py (100%) rename tests/{v3 => }/test_store/test_memory.py (100%) rename tests/{v3 => }/test_store/test_remote.py (100%) rename tests/{v3 => }/test_store/test_stateful_store.py (100%) rename tests/{v3 => }/test_store/test_zip.py (100%) rename tests/{v3 => }/test_sync.py (100%) rename tests/{v3 => }/test_v2.py (100%) delete mode 100644 tests/v3/test_store/__init__.py diff --git a/pyproject.toml 
b/pyproject.toml index 593be36b75..e40b6bf23b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -275,18 +275,17 @@ ignore_errors = true [[tool.mypy.overrides]] module = [ - "tests.v2.*", - "tests.v3.package_with_entrypoint.*", - "tests.v3.test_codecs.test_codecs", - "tests.v3.test_codecs.test_transpose", - "tests.v3.test_metadata.*", - "tests.v3.test_store.*", - "tests.v3.test_config", - "tests.v3.test_group", - "tests.v3.test_indexing", - "tests.v3.test_properties", - "tests.v3.test_sync", - "tests.v3.test_v2", + "tests.package_with_entrypoint.*", + "tests.test_codecs.test_codecs", + "tests.test_codecs.test_transpose", + "tests.test_metadata.*", + "tests.test_store.*", + "tests.test_config", + "tests.test_group", + "tests.test_indexing", + "tests.test_properties", + "tests.test_sync", + "tests.test_v2", ] ignore_errors = true diff --git a/src/zarr/testing/utils.py b/src/zarr/testing/utils.py index 9d6dfa7e18..c7b6e7939c 100644 --- a/src/zarr/testing/utils.py +++ b/src/zarr/testing/utils.py @@ -1,6 +1,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, cast +from collections.abc import Callable, Coroutine +from typing import TYPE_CHECKING, Any, TypeVar, cast import pytest @@ -37,8 +38,16 @@ def has_cupy() -> bool: return False +T_Callable = TypeVar("T_Callable", bound=Callable[[], Coroutine[Any, Any, None]]) + + # Decorator for GPU tests -def gpu_test(func: Any) -> Any: - return pytest.mark.gpu( - pytest.mark.skipif(not has_cupy(), reason="CuPy not installed or no GPU available")(func) +def gpu_test(func: T_Callable) -> T_Callable: + return cast( + T_Callable, + pytest.mark.gpu( + pytest.mark.skipif(not has_cupy(), reason="CuPy not installed or no GPU available")( + func + ) + ), ) diff --git a/tests/v3/conftest.py b/tests/conftest.py similarity index 100% rename from tests/v3/conftest.py rename to tests/conftest.py diff --git a/tests/v3/package_with_entrypoint-0.1.dist-info/entry_points.txt 
b/tests/package_with_entrypoint-0.1.dist-info/entry_points.txt similarity index 100% rename from tests/v3/package_with_entrypoint-0.1.dist-info/entry_points.txt rename to tests/package_with_entrypoint-0.1.dist-info/entry_points.txt diff --git a/tests/v3/package_with_entrypoint/__init__.py b/tests/package_with_entrypoint/__init__.py similarity index 100% rename from tests/v3/package_with_entrypoint/__init__.py rename to tests/package_with_entrypoint/__init__.py diff --git a/tests/v3/test_api.py b/tests/test_api.py similarity index 100% rename from tests/v3/test_api.py rename to tests/test_api.py diff --git a/tests/v3/test_array.py b/tests/test_array.py similarity index 100% rename from tests/v3/test_array.py rename to tests/test_array.py diff --git a/tests/v3/test_attributes.py b/tests/test_attributes.py similarity index 100% rename from tests/v3/test_attributes.py rename to tests/test_attributes.py diff --git a/tests/v3/test_buffer.py b/tests/test_buffer.py similarity index 100% rename from tests/v3/test_buffer.py rename to tests/test_buffer.py diff --git a/tests/v3/test_chunk_grids.py b/tests/test_chunk_grids.py similarity index 100% rename from tests/v3/test_chunk_grids.py rename to tests/test_chunk_grids.py diff --git a/tests/v3/test_codec_entrypoints.py b/tests/test_codec_entrypoints.py similarity index 100% rename from tests/v3/test_codec_entrypoints.py rename to tests/test_codec_entrypoints.py diff --git a/tests/v3/__init__.py b/tests/test_codecs/__init__.py similarity index 100% rename from tests/v3/__init__.py rename to tests/test_codecs/__init__.py diff --git a/tests/v3/test_codecs/test_blosc.py b/tests/test_codecs/test_blosc.py similarity index 100% rename from tests/v3/test_codecs/test_blosc.py rename to tests/test_codecs/test_blosc.py diff --git a/tests/v3/test_codecs/test_codecs.py b/tests/test_codecs/test_codecs.py similarity index 100% rename from tests/v3/test_codecs/test_codecs.py rename to tests/test_codecs/test_codecs.py diff --git 
a/tests/v3/test_codecs/test_endian.py b/tests/test_codecs/test_endian.py similarity index 100% rename from tests/v3/test_codecs/test_endian.py rename to tests/test_codecs/test_endian.py diff --git a/tests/v3/test_codecs/test_gzip.py b/tests/test_codecs/test_gzip.py similarity index 100% rename from tests/v3/test_codecs/test_gzip.py rename to tests/test_codecs/test_gzip.py diff --git a/tests/v3/test_codecs/test_sharding.py b/tests/test_codecs/test_sharding.py similarity index 100% rename from tests/v3/test_codecs/test_sharding.py rename to tests/test_codecs/test_sharding.py diff --git a/tests/v3/test_codecs/test_transpose.py b/tests/test_codecs/test_transpose.py similarity index 100% rename from tests/v3/test_codecs/test_transpose.py rename to tests/test_codecs/test_transpose.py diff --git a/tests/v3/test_codecs/test_vlen.py b/tests/test_codecs/test_vlen.py similarity index 100% rename from tests/v3/test_codecs/test_vlen.py rename to tests/test_codecs/test_vlen.py diff --git a/tests/v3/test_codecs/test_zstd.py b/tests/test_codecs/test_zstd.py similarity index 100% rename from tests/v3/test_codecs/test_zstd.py rename to tests/test_codecs/test_zstd.py diff --git a/tests/v3/test_common.py b/tests/test_common.py similarity index 100% rename from tests/v3/test_common.py rename to tests/test_common.py diff --git a/tests/v3/test_config.py b/tests/test_config.py similarity index 100% rename from tests/v3/test_config.py rename to tests/test_config.py diff --git a/tests/v3/test_group.py b/tests/test_group.py similarity index 100% rename from tests/v3/test_group.py rename to tests/test_group.py diff --git a/tests/v3/test_indexing.py b/tests/test_indexing.py similarity index 100% rename from tests/v3/test_indexing.py rename to tests/test_indexing.py diff --git a/tests/v3/test_codecs/__init__.py b/tests/test_metadata/__init__.py similarity index 100% rename from tests/v3/test_codecs/__init__.py rename to tests/test_metadata/__init__.py diff --git 
a/tests/v3/test_metadata/test_consolidated.py b/tests/test_metadata/test_consolidated.py similarity index 100% rename from tests/v3/test_metadata/test_consolidated.py rename to tests/test_metadata/test_consolidated.py diff --git a/tests/v3/test_metadata/test_v2.py b/tests/test_metadata/test_v2.py similarity index 100% rename from tests/v3/test_metadata/test_v2.py rename to tests/test_metadata/test_v2.py diff --git a/tests/v3/test_metadata/test_v3.py b/tests/test_metadata/test_v3.py similarity index 100% rename from tests/v3/test_metadata/test_v3.py rename to tests/test_metadata/test_v3.py diff --git a/tests/v3/test_properties.py b/tests/test_properties.py similarity index 100% rename from tests/v3/test_properties.py rename to tests/test_properties.py diff --git a/tests/v3/test_metadata/__init__.py b/tests/test_store/__init__.py similarity index 100% rename from tests/v3/test_metadata/__init__.py rename to tests/test_store/__init__.py diff --git a/tests/v3/test_store/test_core.py b/tests/test_store/test_core.py similarity index 100% rename from tests/v3/test_store/test_core.py rename to tests/test_store/test_core.py diff --git a/tests/v3/test_store/test_local.py b/tests/test_store/test_local.py similarity index 100% rename from tests/v3/test_store/test_local.py rename to tests/test_store/test_local.py diff --git a/tests/v3/test_store/test_logging.py b/tests/test_store/test_logging.py similarity index 100% rename from tests/v3/test_store/test_logging.py rename to tests/test_store/test_logging.py diff --git a/tests/v3/test_store/test_memory.py b/tests/test_store/test_memory.py similarity index 100% rename from tests/v3/test_store/test_memory.py rename to tests/test_store/test_memory.py diff --git a/tests/v3/test_store/test_remote.py b/tests/test_store/test_remote.py similarity index 100% rename from tests/v3/test_store/test_remote.py rename to tests/test_store/test_remote.py diff --git a/tests/v3/test_store/test_stateful_store.py 
b/tests/test_store/test_stateful_store.py similarity index 100% rename from tests/v3/test_store/test_stateful_store.py rename to tests/test_store/test_stateful_store.py diff --git a/tests/v3/test_store/test_zip.py b/tests/test_store/test_zip.py similarity index 100% rename from tests/v3/test_store/test_zip.py rename to tests/test_store/test_zip.py diff --git a/tests/v3/test_sync.py b/tests/test_sync.py similarity index 100% rename from tests/v3/test_sync.py rename to tests/test_sync.py diff --git a/tests/v3/test_v2.py b/tests/test_v2.py similarity index 100% rename from tests/v3/test_v2.py rename to tests/test_v2.py diff --git a/tests/v3/test_store/__init__.py b/tests/v3/test_store/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 From b9fca093a2eb6e3dae0e4b8edd0aa4c639889be7 Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Tue, 15 Oct 2024 08:52:19 -0700 Subject: [PATCH 02/22] test(ci): change branch name in v3 workflows (#2368) --- .github/workflows/gpu_test.yml | 6 +++--- .github/workflows/hypothesis.yaml | 2 -- .github/workflows/test.yml | 6 +++--- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/.github/workflows/gpu_test.yml b/.github/workflows/gpu_test.yml index e0be897532..f9e36e9c2e 100644 --- a/.github/workflows/gpu_test.yml +++ b/.github/workflows/gpu_test.yml @@ -1,13 +1,13 @@ # This workflow will install Python dependencies, run tests and lint with a variety of Python versions # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions -name: GPU Test V3 +name: GPU Test on: push: - branches: [ v3 ] + branches: [ main ] pull_request: - branches: [ v3 ] + branches: [ main ] workflow_dispatch: env: diff --git a/.github/workflows/hypothesis.yaml b/.github/workflows/hypothesis.yaml index c5a239c274..85d48bddb1 100644 --- a/.github/workflows/hypothesis.yaml +++ b/.github/workflows/hypothesis.yaml @@ -3,11 +3,9 @@ on: push: branches: - "main" - - "v3" pull_request: 
branches: - "main" - - "v3" types: [opened, reopened, synchronize, labeled] schedule: - cron: "0 0 * * *" # Daily “At 00:00” UTC diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 5683b62dff..ae24fca6f5 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -1,13 +1,13 @@ # This workflow will install Python dependencies, run tests and lint with a variety of Python versions # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions -name: Test V3 +name: Test on: push: - branches: [ v3 ] + branches: [ main ] pull_request: - branches: [ v3 ] + branches: [ main ] workflow_dispatch: concurrency: From 0172dd6ebb3e76e854c8751d3f6ebd2b340f05f3 Mon Sep 17 00:00:00 2001 From: Dimitri Papadopoulos Orfanos <3234522+DimitriPapadopoulos@users.noreply.github.com> Date: Tue, 15 Oct 2024 18:20:36 +0200 Subject: [PATCH 03/22] Use lazy % formatting in logging functions (#2366) * Use lazy % formatting in logging functions * f-string should be more efficient * Space before unit symbol From "SI Unit rules and style conventions": https://physics.nist.gov/cuu/Units/checklist.html There is a space between the numerical value and unit symbol, even when the value is used in an adjectival sense, except in the case of superscript units for plane angle. 
* Enforce ruff/flake8-logging-format rules (G) --------- Co-authored-by: Joe Hamman --- pyproject.toml | 1 + src/zarr/core/sync.py | 2 +- src/zarr/storage/logging.py | 6 +++--- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e40b6bf23b..281baee1c3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -211,6 +211,7 @@ extend-select = [ "B", # flake8-bugbear "C4", # flake8-comprehensions "FLY", # flynt + "G", # flake8-logging-format "I", # isort "ISC", # flake8-implicit-str-concat "PGH", # pygrep-hooks diff --git a/src/zarr/core/sync.py b/src/zarr/core/sync.py index 20f04f543b..8c5bc9c397 100644 --- a/src/zarr/core/sync.py +++ b/src/zarr/core/sync.py @@ -133,7 +133,7 @@ def sync( finished, unfinished = wait([future], return_when=asyncio.ALL_COMPLETED, timeout=timeout) if len(unfinished) > 0: - raise TimeoutError(f"Coroutine {coro} failed to finish in within {timeout}s") + raise TimeoutError(f"Coroutine {coro} failed to finish in within {timeout} s") assert len(finished) == 1 return_result = next(iter(finished)).result() diff --git a/src/zarr/storage/logging.py b/src/zarr/storage/logging.py index 59a796dc18..a29661729f 100644 --- a/src/zarr/storage/logging.py +++ b/src/zarr/storage/logging.py @@ -83,15 +83,15 @@ def log(self, hint: Any = "") -> Generator[None, None, None]: method = inspect.stack()[2].function op = f"{type(self._store).__name__}.{method}" if hint: - op += f"({hint})" - self.logger.info(f"Calling {op}") + op = f"{op}({hint})" + self.logger.info("Calling %s", op) start_time = time.time() try: self.counter[method] += 1 yield finally: end_time = time.time() - self.logger.info(f"Finished {op} [{end_time - start_time:.2f}s]") + self.logger.info("Finished %s [%.2f s]", op, end_time - start_time) @property def supports_writes(self) -> bool: From f76f3a31fd4b9b62f484301696624425ced8d798 Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Tue, 15 Oct 2024 11:02:30 -0700 Subject: [PATCH 04/22] Move roadmap and 
v3-design documument to docs (#2354) * move roadmap to docs * formatting and minor copy editing --- docs/index.rst | 1 + docs/roadmap.rst | 696 +++++++++++++++++++++++++++++++++++++++ v3-roadmap-and-design.md | 429 ------------------------ 3 files changed, 697 insertions(+), 429 deletions(-) create mode 100644 docs/roadmap.rst delete mode 100644 v3-roadmap-and-design.md diff --git a/docs/index.rst b/docs/index.rst index 6b90b5a773..d0b41ed634 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -16,6 +16,7 @@ Zarr-Python release license contributing + roadmap **Version**: |version| diff --git a/docs/roadmap.rst b/docs/roadmap.rst new file mode 100644 index 0000000000..93f2a26896 --- /dev/null +++ b/docs/roadmap.rst @@ -0,0 +1,696 @@ +Roadmap +======= + +- Status: active +- Author: Joe Hamman +- Created On: October 31, 2023 +- Input from: + + - Davis Bennett / @d-v-b + - Norman Rzepka / @normanrz + - Deepak Cherian @dcherian + - Brian Davis / @monodeldiablo + - Oliver McCormack / @olimcc + - Ryan Abernathey / @rabernat + - Jack Kelly / @JackKelly + - Martin Durrant / @martindurant + +.. note:: + + This document was written in the early stages of the 3.0 refactor. Some + aspects of the design have changed since this was originally written. + Questions and discussion about the contents of this document should be directed to + `this GitHub Discussion `__. + +Introduction +------------ + +This document lays out a design proposal for version 3.0 of the +`Zarr-Python `__ package. A +specific focus of the design is to bring Zarr-Python’s API up to date +with the `Zarr V3 +specification `__, +with the hope of enabling the development of the many features and +extensions that motivated the V3 Spec. The ideas presented here are +expected to result in a major release of Zarr-Python (version 3.0) +including significant a number of breaking API changes. 
For clarity, +“V3” will be used to describe the version of the Zarr specification and +“3.0” will be used to describe the release tag of the Zarr-Python +project. + +Current status of V3 in Zarr-Python +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +During the development of the V3 Specification, a `prototype +implementation `__ +was added to the Zarr-Python library. Since that implementation, the V3 +spec evolved in significant ways and as a result, the Zarr-Python +library is now out of sync with the approved spec. Downstream libraries +(e.g. `Xarray `__) have added support +for this implementation and will need to migrate to the accepted spec +when its available in Zarr-Python. + +Goals +----- + +- Provide a complete implementation of Zarr V3 through the Zarr-Python + API +- Clear the way for exciting extensions / ZEPs + (i.e. `sharding `__, + `variable chunking `__, + etc.) +- Provide a developer API that can be used to implement and register V3 + extensions +- Improve the performance of Zarr-Python by streamlining the interface + between the Store layer and higher level APIs (e.g. Groups and + Arrays) +- Clean up the internal and user facing APIs +- Improve code quality and robustness (e.g. achieve 100% type hint + coverage) +- Align the Zarr-Python array API with the `array API + Standard `__ + +Examples of what 3.0 will enable? +--------------------------------- + +1. Reading and writing V3 spec-compliant groups and arrays +2. V3 extensions including sharding and variable chunking. +3. Improved performance by leveraging concurrency when + creating/reading/writing to stores (imagine a + ``create_hierarchy(zarr_objects)`` function). +4. User-developed extensions (e.g. storage-transformers) can be + registered with Zarr-Python at runtime + +Non-goals (of this document) +---------------------------- + +- Implementation of any unaccepted Zarr V3 extensions +- Major revisions to the Zarr V3 spec + +Requirements +------------ + +1. 
Read and write spec compliant V2 and V3 data +2. Limit unnecessary traffic to/from the store +3. Cleanly define the Array/Group/Store abstractions +4. Cleanly define how V2 will be supported going forward +5. Provide a clear roadmap to help users upgrade to 3.0 +6. Developer tools / hooks for registering extensions + +Design +------ + +Async API +~~~~~~~~~ + +Zarr-Python is an IO library. As such, supporting concurrent action +against the storage layer is critical to achieving acceptable +performance. The Zarr-Python 2 was not designed with asynchronous +computation in mind and as a result has struggled to effectively +leverage the benefits of concurrency. At one point, ``getitems`` and +``setitems`` support was added to the Zarr store model but that is only +used for operating on a set of chunks in a single variable. + +With Zarr-Python 3.0, we have the opportunity to revisit this design. +The proposal here is as follows: + +1. The ``Store`` interface will be entirely async. +2. On top of the async ``Store`` interface, we will provide an + ``AsyncArray`` and ``AsyncGroup`` interface. +3. Finally, the primary user facing API will be synchronous ``Array`` + and ``Group`` classes that wrap the async equivalents. + +**Examples** + +- **Store** + + .. code:: python + + class Store: + ... + async def get(self, key: str) -> bytes: + ... + async def get_partial_values(self, key_ranges: List[Tuple[str, Tuple[int, Optional[int]]]]) -> bytes: + ... + # (no sync interface here) + +- **Array** + + .. code:: python + + class AsyncArray: + ... + + async def getitem(self, selection: Selection) -> np.ndarray: + # the core logic for getitem goes here + + class Array: + _async_array: AsyncArray + + def __getitem__(self, selection: Selection) -> np.ndarray: + return sync(self._async_array.getitem(selection)) + +- **Group** + + .. code:: python + + class AsyncGroup: + ... 
+ + async def create_group(self, path: str, **kwargs) -> AsyncGroup: + # the core logic for create_group goes here + + class Group: + _async_group: AsyncGroup + + def create_group(self, path: str, **kwargs) -> Group: + return sync(self._async_group.create_group(path, **kwargs)) + + **Internal Synchronization API** + +With the ``Store`` and core ``AsyncArray``/ ``AsyncGroup`` classes being +predominantly async, Zarr-Python will need an internal API to provide a +synchronous API. The proposal here is to use the approach in +`fsspec `__ +to provide a high-level ``sync`` function that takes an ``awaitable`` +and runs it in its managed IO Loop / thread. + +| **FAQ** 1. Why two levels of Arrays/groups? a. First, this is an + intentional decision and departure from the current Zarrita + implementation b. The idea is that users rarely want to mix + interfaces. Either they are working within an async context (currently + quite rare) or they are in a typical synchronous context. c. Splitting + the two will allow us to clearly define behavior on the ``AsyncObj`` + and simply wrap it in the ``SyncObj``. 2. What if a store is only has + a synchronous backend? a. First off, this is expected to be a fairly + rare occurrence. Most storage backends have async interfaces. b. But + in the event a storage backend doesn’t have a async interface, there + is nothing wrong with putting synchronous code in ``async`` methods. + There are approaches to enabling concurrent action through wrappers + like AsyncIO’s ``loop.run_in_executor`` (`ref + 1 `__, + `ref 2 `__, `ref + 3 `__, + `ref + 4 `__. +| 3. Will Zarr help manage the async contexts encouraged by some + libraries + (e.g. `AioBotoCore `__)? + a. Many async IO libraries require entering an async context before + interacting with the API. We expect some experimentation to be needed + here but the initial design will follow something close to what fsspec + does (`example in + s3fs `__). + 4. Why not provide a synchronous Store interface? 
a. We could but this + design is simpler. It would mean supporting it in the ``AsyncGroup`` + and ``AsyncArray`` classes which, may be more trouble than its worth. + Storage backends that do not have an async API will be encouraged to + wrap blocking calls in an async wrapper + (e.g. ``loop.run_in_executor``). + +Store API +~~~~~~~~~ + +The ``Store`` API is specified directly in the V3 specification. All V3 +stores should implement this abstract API, omitting Write and List +support as needed. As described above, all stores will be expected to +expose the required methods as async methods. + +**Example** + +.. code:: python + + class ReadWriteStore: + ... + async def get(self, key: str) -> bytes: + ... + + async def get_partial_values(self, key_ranges: List[Tuple[str, int, int]) -> bytes: + ... + + async def set(self, key: str, value: Union[bytes, bytearray, memoryview]) -> None: + ... # required for writable stores + + async def set_partial_values(self, key_start_values: List[Tuple[str, int, Union[bytes, bytearray, memoryview]]]) -> None: + ... # required for writable stores + + async def list(self) -> List[str]: + ... # required for listable stores + + async def list_prefix(self, prefix: str) -> List[str]: + ... # required for listable stores + + async def list_dir(self, prefix: str) -> List[str]: + ... # required for listable stores + + # additional (optional methods) + async def getsize(self, prefix: str) -> int: + ... + + async def rename(self, src: str, dest: str) -> None + ... + + +Recognizing that there are many Zarr applications today that rely on the +``MutableMapping`` interface supported by Zarr-Python 2, a wrapper store +will be developed to allow existing stores to plug directly into this +API. + +Array API +~~~~~~~~~ + +The user facing array interface will implement a subset of the `Array +API Standard `__. Most of the +computational parts of the Array API Standard don’t fit into Zarr right +now. That’s okay. 
What matters most is that we ensure we can give +downstream applications a compliant API. + +*Note, Zarr already does most of this so this is more about formalizing +the relationship than a substantial change in API.* + ++------------------------+------------------------+-------------------------+-------------------------+ +| | Included | Not Included | Unknown / Maybe Possible| ++========================+========================+=========================+=========================+ +| **Attributes** | ``dtype`` | ``mT`` | ``device`` | ++------------------------+------------------------+-------------------------+-------------------------+ +| | ``ndim`` | ``T`` | | ++------------------------+------------------------+-------------------------+-------------------------+ +| | ``shape`` | | | ++------------------------+------------------------+-------------------------+-------------------------+ +| | ``size`` | | | ++------------------------+------------------------+-------------------------+-------------------------+ +| **Methods** | ``__getitem__`` | ``__array_namespace__`` | ``to_device`` | ++------------------------+------------------------+-------------------------+-------------------------+ +| | ``__setitem__`` | ``__abs__`` | ``__bool__`` | ++------------------------+------------------------+-------------------------+-------------------------+ +| | ``__eq__`` | ``__add__`` | ``__complex__`` | ++------------------------+------------------------+-------------------------+-------------------------+ +| | ``__bool__`` | ``__and__`` | ``__dlpack__`` | ++------------------------+------------------------+-------------------------+-------------------------+ +| | | ``__floordiv__`` | ``__dlpack_device__`` | ++------------------------+------------------------+-------------------------+-------------------------+ +| | | ``__ge__`` | ``__float__`` | ++------------------------+------------------------+-------------------------+-------------------------+ +| | | ``__gt__`` | 
``__index__`` | ++------------------------+------------------------+-------------------------+-------------------------+ +| | | ``__invert__`` | ``__int__`` | ++------------------------+------------------------+-------------------------+-------------------------+ +| | | ``__le__`` | | ++------------------------+------------------------+-------------------------+-------------------------+ +| | | ``__lshift__`` | | ++------------------------+------------------------+-------------------------+-------------------------+ +| | | ``__lt__`` | | ++------------------------+------------------------+-------------------------+-------------------------+ +| | | ``__matmul__`` | | ++------------------------+------------------------+-------------------------+-------------------------+ +| | | ``__mod__`` | | ++------------------------+------------------------+-------------------------+-------------------------+ +| | | ``__mul__`` | | ++------------------------+------------------------+-------------------------+-------------------------+ +| | | ``__ne__`` | | ++------------------------+------------------------+-------------------------+-------------------------+ +| | | ``__neg__`` | | ++------------------------+------------------------+-------------------------+-------------------------+ +| | | ``__or__`` | | ++------------------------+------------------------+-------------------------+-------------------------+ +| | | ``__pos__`` | | ++------------------------+------------------------+-------------------------+-------------------------+ +| | | ``__pow__`` | | ++------------------------+------------------------+-------------------------+-------------------------+ +| | | ``__rshift__`` | | ++------------------------+------------------------+-------------------------+-------------------------+ +| | | ``__sub__`` | | ++------------------------+------------------------+-------------------------+-------------------------+ +| | | ``__truediv__`` | | 
++------------------------+------------------------+-------------------------+-------------------------+ +| | | ``__xor__`` | | ++------------------------+------------------------+-------------------------+-------------------------+ +| **Creation functions** | ``zeros`` | | ``arange`` | +| (``zarr.creation``) | | | | ++------------------------+------------------------+-------------------------+-------------------------+ +| | ``zeros_like`` | | ``asarray`` | ++------------------------+------------------------+-------------------------+-------------------------+ +| | ``ones`` | | ``eye`` | ++------------------------+------------------------+-------------------------+-------------------------+ +| | ``ones_like`` | | ``from_dlpack`` | ++------------------------+------------------------+-------------------------+-------------------------+ +| | ``full`` | | ``linspace`` | ++------------------------+------------------------+-------------------------+-------------------------+ +| | ``full_like`` | | ``meshgrid`` | ++------------------------+------------------------+-------------------------+-------------------------+ +| | ``empty`` | | ``tril`` | ++------------------------+------------------------+-------------------------+-------------------------+ +| | ``empty_like`` | | ``triu`` | ++------------------------+------------------------+-------------------------+-------------------------+ + +In addition to the core array API defined above, the Array class should +have the following Zarr specific properties: + +- ``.metadata`` (see Metadata Interface below) +- ``.attrs`` - (pulled from metadata object) +- ``.info`` - (repolicated from existing property †) + +*† In Zarr-Python 2, the info property listed the store to identify +initialized chunks. By default this will be turned off in 3.0 but will +be configurable.* + +**Indexing** + +Zarr-Python currently supports ``__getitem__`` style indexing and the +special ``oindex`` and ``vindex`` indexers. 
These are not part of the +current Array API standard (see +`data-apis/array-api#669 <https://github.com/data-apis/array-api/issues/669>`__) +but they have been `proposed as a +NEP <https://numpy.org/neps/nep-0021-advanced-indexing.html>`__. +Zarr-Python will maintain these in 3.0. + +We are also exploring a new high-level indexing API that will enable +optimized batch/concurrent loading of many chunks. We expect this to be +important to enable performant loading of data in the context of +sharding. See `this +discussion <https://github.com/zarr-developers/zarr-python/discussions/1569>`__ +for more detail. + +Concurrent indexing across multiple arrays will be possible using the +AsyncArray API. + +**Async and Sync Array APIs** + +Most of the logic to support Zarr Arrays will live in the ``AsyncArray`` +class. There are a few notable differences that should be called out. + +=============== ============ +Sync Method     Async Method +=============== ============ +``__getitem__`` ``getitem`` +``__setitem__`` ``setitem`` +``__eq__``      ``equals`` +=============== ============ + +**Metadata interface** + +Zarr-Python 2.\* closely mirrors the V2 spec metadata schema in the +Array and Group classes. In 3.0, we plan to move the underlying metadata +representation to a separate interface (e.g. ``Array.metadata``). This +interface will return either a ``V2ArrayMetadata`` or +``V3ArrayMetadata`` object (both will inherit from a parent +``ArrayMetadataABC`` class). The ``V2ArrayMetadata`` and +``V3ArrayMetadata`` classes will be responsible for producing valid JSON +representations of their metadata, and yielding a consistent view to the +``Array`` or ``Group`` class. + +Group API +~~~~~~~~~ + +The main question is how closely we should follow the existing +Zarr-Python implementation / ``MutableMapping`` interface. The table +below shows the primary ``Group`` methods in Zarr-Python 2 and attempts +to identify if and how they would be implemented in 3.0. 
+ ++---------------------+------------------+------------------+-----------------------+ +| V2 Group Methods | ``AsyncGroup`` | ``Group`` | ``h5py_compat.Group`` | ++=====================+==================+==================+=======================+ +| ``__len__`` | ``length`` | ``__len__`` | ``__len__`` | ++---------------------+------------------+------------------+-----------------------+ +| ``__iter__`` | ``__aiter__`` | ``__iter__`` | ``__iter__`` | ++---------------------+------------------+------------------+-----------------------+ +| ``__contains__`` | ``contains`` | ``__contains__`` | ``__contains__`` | ++---------------------+------------------+------------------+-----------------------+ +| ``__getitem__`` | ``getitem`` | ``__getitem__`` | ``__getitem__`` | ++---------------------+------------------+------------------+-----------------------+ +| ``__enter__`` | N/A | N/A | ``__enter__`` | ++---------------------+------------------+------------------+-----------------------+ +| ``__exit__`` | N/A | N/A | ``__exit__`` | ++---------------------+------------------+------------------+-----------------------+ +| ``group_keys`` | ``group_keys`` | ``group_keys`` | N/A | ++---------------------+------------------+------------------+-----------------------+ +| ``groups`` | ``groups`` | ``groups`` | N/A | ++---------------------+------------------+------------------+-----------------------+ +| ``array_keys`` | ``array_key`` | ``array_keys`` | N/A | ++---------------------+------------------+------------------+-----------------------+ +| ``arrays`` | ``arrays`` | ``arrays`` | N/A | ++---------------------+------------------+------------------+-----------------------+ +| ``visit`` | ? | ? | ``visit`` | ++---------------------+------------------+------------------+-----------------------+ +| ``visitkeys`` | ? | ? | ? | ++---------------------+------------------+------------------+-----------------------+ +| ``visitvalues`` | ? | ? | ? 
| ++---------------------+------------------+------------------+-----------------------+ +| ``visititems`` | ? | ? | ``visititems`` | ++---------------------+------------------+------------------+-----------------------+ +| ``tree`` | ``tree`` | ``tree`` | ``Both`` | ++---------------------+------------------+------------------+-----------------------+ +| ``create_group`` | ``create_group`` | ``create_group`` | ``create_group`` | ++---------------------+------------------+------------------+-----------------------+ +| ``require_group`` | N/A | N/A | ``require_group`` | ++---------------------+------------------+------------------+-----------------------+ +| ``create_groups`` | ? | ? | N/A | ++---------------------+------------------+------------------+-----------------------+ +| ``require_groups`` | ? | ? | ? | ++---------------------+------------------+------------------+-----------------------+ +| ``create_dataset`` | N/A | N/A | ``create_dataset`` | ++---------------------+------------------+------------------+-----------------------+ +| ``require_dataset`` | N/A | N/A | ``require_dataset`` | ++---------------------+------------------+------------------+-----------------------+ +| ``create`` | ``create_array`` | ``create_array`` | N/A | ++---------------------+------------------+------------------+-----------------------+ +| ``empty`` | ``empty`` | ``empty`` | N/A | ++---------------------+------------------+------------------+-----------------------+ +| ``zeros`` | ``zeros`` | ``zeros`` | N/A | ++---------------------+------------------+------------------+-----------------------+ +| ``ones`` | ``ones`` | ``ones`` | N/A | ++---------------------+------------------+------------------+-----------------------+ +| ``full`` | ``full`` | ``full`` | N/A | ++---------------------+------------------+------------------+-----------------------+ +| ``array`` | ``create_array`` | ``create_array`` | N/A | 
++---------------------+------------------+------------------+-----------------------+ +| ``empty_like`` | ``empty_like`` | ``empty_like`` | N/A | ++---------------------+------------------+------------------+-----------------------+ +| ``zeros_like`` | ``zeros_like`` | ``zeros_like`` | N/A | ++---------------------+------------------+------------------+-----------------------+ +| ``ones_like`` | ``ones_like`` | ``ones_like`` | N/A | ++---------------------+------------------+------------------+-----------------------+ +| ``full_like`` | ``full_like`` | ``full_like`` | N/A | ++---------------------+------------------+------------------+-----------------------+ +| ``move`` | ``move`` | ``move`` | ``move`` | ++---------------------+------------------+------------------+-----------------------+ + +**``zarr.h5compat.Group``** +-- +Zarr-Python 2.\* made an attempt to align its API with that of +`h5py `__. With 3.0, we will +relax this alignment in favor of providing an explicit compatibility +module (``zarr.h5py_compat``). This module will expose the ``Group`` and +``Dataset`` APIs that map to Zarr-Python’s ``Group`` and ``Array`` +objects. + +Creation API +~~~~~~~~~~~~ + +Zarr-Python 2.\* bundles together the creation and serialization of Zarr +objects. Zarr-Python 3.\* will make it possible to create objects in +memory separate from serializing them. This will specifically enable +writing hierarchies of Zarr objects in a single batch step. For example: + +.. code:: python + + + arr1 = Array(shape=(10, 10), path="foo/bar", dtype="i4", store=store) + arr2 = Array(shape=(10, 10), path="foo/spam", dtype="f8", store=store) + + arr1.save() + arr2.save() + + # or equivalently + + zarr.save_many([arr1 ,arr2]) + +*Note: this batch creation API likely needs additional design effort +prior to implementation.* + +Plugin API +~~~~~~~~~~ + +Zarr V3 was designed to be extensible at multiple layers. 
Zarr-Python +will support these extensions through a combination of `Abstract Base +Classes `__ (ABCs) and +`Entrypoints `__. + +**ABCs** + +Zarr V3 will expose Abstract base classes for the following objects: + +- ``Store``, ``ReadStore``, ``ReadWriteStore``, ``ReadListStore``, and + ``ReadWriteListStore`` +- ``BaseArray``, ``SynchronousArray``, and ``AsynchronousArray`` +- ``BaseGroup``, ``SynchronousGroup``, and ``AsynchronousGroup`` +- ``Codec``, ``ArrayArrayCodec``, ``ArrayBytesCodec``, + ``BytesBytesCodec`` + +**Entrypoints** + +Lots more thinking here but the idea here is to provide entrypoints for +``data type``, ``chunk grid``, ``chunk key encoding``, ``codecs``, +``storage_transformers`` and ``stores``. These might look something +like: + +:: + + entry_points=""" + [zarr.codecs] + blosc_codec=codec_plugin:make_blosc_codec + zlib_codec=codec_plugin:make_zlib_codec + """ + +Python type hints and static analysis +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Target 100% Mypy coverage in 3.0 source. + +Observability +~~~~~~~~~~~~~ + +A persistent problem in Zarr-Python is diagnosing problems that span +many parts of the stack. To address this in 3.0, we will add a basic +logging framework that can be used to debug behavior at various levels +of the stack. We propose to add the separate loggers for the following +namespaces: + +- ``array`` +- ``group`` +- ``store`` +- ``codec`` + +These should be documented such that users know how to activate them and +developers know how to use them when developing extensions. + +Dependencies +~~~~~~~~~~~~ + +Today, Zarr-Python has the following required dependencies: + +.. code:: python + + dependencies = [ + 'asciitree', + 'numpy>=1.20,!=1.21.0', + 'fasteners', + 'numcodecs>=0.10.0', + ] + +What other dependencies should be considered? + +1. Attrs - Zarrita makes extensive use of the Attrs library +2. Fsspec - Zarrita has a hard dependency on Fsspec. This could be + easily relaxed though. 
+ +Breaking changes relative to Zarr-Python 2.\* +--------------------------------------------- + +1. H5py compat moved to a stand alone module? +2. ``Group.__getitem__`` support moved to ``Group.members.__getitem__``? +3. Others? + +Open questions +-------------- + +1. How to treat V2 + + a. Note: Zarrita currently implements a separate ``V2Array`` and + ``V3Array`` classes. This feels less than ideal. + b. We could easily convert metadata from v2 to the V3 Array, but what + about writing? + c. Ideally, we don’t have completely separate code paths. But if its + too complicated to support both within one interface, its probably + better. + +2. How and when to remove the current implementation of V3. + + a. It’s hidden behind a hard-to-use feature flag so we probably don’t + need to do anything. + +3. How to model runtime configuration? +4. Which extensions belong in Zarr-Python and which belong in separate + packages? + + a. We don’t need to take a strong position on this here. It’s likely + that someone will want to put Sharding in. That will be useful to + develop in parallel because it will give us a good test case for + the plugin interface. + +Testing +------- + +Zarr-python 3.0 adds a major new dimension to Zarr: Async support. This +also comes with a compatibility risk, we will need to thoroughly test +support in key execution environments. Testing plan: - Reuse the +existing test suite for testing the ``v3`` API. - ``xfail`` tests that +expose breaking changes with ``3.0 - breaking change`` description. This +will help identify additional and/or unintentional breaking changes - +Rework tests that were only testing internal APIs. - Add a set of +functional / integration tests targeting real-world workflows in various +contexts (e.g. w/ Dask) + +Development process +------------------- + +Zarr-Python 3.0 will introduce a number of new APIs and breaking changes +to existing APIs. 
In order to facilitate ongoing support for Zarr-Python +2.*, we will take on the following development process: + +- Create a ``v3`` branch that can be used for developing the core + functionality apart from the ``main`` branch. This will allow us to + support ongoing work and bug fixes on the ``main`` branch. +- Put the ``3.0`` APIs inside a ``zarr.v3`` module. Imports from this + namespace will all be new APIs that users can develop and test + against once the ``v3`` branch is merged to ``main``. +- Kickstart the process by pulling in the current state of ``zarrita`` + - which has many of the features described in this design. +- Release a series of 2.\* releases with the ``v3`` namespace +- When ``v3`` is complete, move contents of ``v3`` to the package root + +**Milestones** + +Below are a set of specific milestones leading toward the completion of +this process. As work begins, we expect this list to grow in +specificity. + +1. Port current version of Zarrita to Zarr-Python +2. Formalize Async interface by splitting ``Array`` and ``Group`` + objects into Sync and Async versions +3. Implement “fancy” indexing operations on the ``AsyncArray`` +4. Implement an abstract base class for the ``Store`` interface and a + wrapper ``Store`` to make use of existing ``MutableMapping`` stores. +5. Rework the existing unit test suite to use the ``v3`` namespace. +6. Develop a plugin interface for extensions +7. Develop a set of functional and integration tests +8. Work with downstream libraries (Xarray, Dask, etc.) to test new APIs + +TODOs +----- + +The following subjects are not covered in detail above but perhaps +should be. Including them here so they are not forgotten. + +1. [Store] Should Zarr provide an API for caching objects after first + read/list/etc. Read only stores? +2. [Array] buffer protocol support +3. [Array] ``meta_array`` support +4. [Extensions] Define how Zarr-Python will consume the various plugin + types +5. 
[Misc] H5py compatibility requires a bit more work and a champion to + drive it forward. +6. [Misc] Define ``chunk_store`` API in 3.0 +7. [Misc] Define ``synchronizer`` API in 3.0 + +References +---------- + +1. `Zarr-Python + repository `__ +2. `Zarr core specification (version 3.0) — Zarr specs + documentation `__ +3. `Zarrita repository `__ +4. `Async-Zarr `__ +5. `Zarr-Python Discussion + Topic `__ diff --git a/v3-roadmap-and-design.md b/v3-roadmap-and-design.md deleted file mode 100644 index 696799e56f..0000000000 --- a/v3-roadmap-and-design.md +++ /dev/null @@ -1,429 +0,0 @@ -# Zarr Python Roadmap - -- Status: draft -- Author: Joe Hamman -- Created On: October 31, 2023 -- Input from: - - Davis Bennett / @d-v-b - - Norman Rzepka / @normanrz - - Deepak Cherian @dcherian - - Brian Davis / @monodeldiablo - - Oliver McCormack / @olimcc - - Ryan Abernathey / @rabernat - - Jack Kelly / @JackKelly - - Martin Durrant / @martindurant - -## Introduction - -This document lays out a design proposal for version 3.0 of the [Zarr-Python](https://zarr.readthedocs.io/en/stable/) package. A specific focus of the design is to bring Zarr-Python's API up to date with the [Zarr V3 specification](https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html), with the hope of enabling the development of the many features and extensions that motivated the V3 Spec. The ideas presented here are expected to result in a major release of Zarr-Python (version 3.0) including significant a number of breaking API changes. -For clarity, “V3” will be used to describe the version of the Zarr specification and “3.0” will be used to describe the release tag of the Zarr-Python project. - -### Current status of V3 in Zarr-Python - -During the development of the V3 Specification, a [prototype implementation](https://github.com/zarr-developers/zarr-python/pull/898) was added to the Zarr-Python library. 
Since that implementation, the V3 spec evolved in significant ways and as a result, the Zarr-Python library is now out of sync with the approved spec. Downstream libraries (e.g. [Xarray](https://github.com/pydata/xarray)) have added support for this implementation and will need to migrate to the accepted spec when its available in Zarr-Python. - -## Goals - -- Provide a complete implementation of Zarr V3 through the Zarr-Python API -- Clear the way for exciting extensions / ZEPs (i.e. [sharding](https://zarr-specs.readthedocs.io/en/latest/v3/codecs/sharding-indexed/v1.0.html), [variable chunking](https://zarr.dev/zeps/draft/ZEP0003.html), etc.) -- Provide a developer API that can be used to implement and register V3 extensions -- Improve the performance of Zarr-Python by streamlining the interface between the Store layer and higher level APIs (e.g. Groups and Arrays) -- Clean up the internal and user facing APIs -- Improve code quality and robustness (e.g. achieve 100% type hint coverage) -- Align the Zarr-Python array API with the [array API Standard](https://data-apis.org/array-api/latest/) - -## Examples of what 3.0 will enable? -1. Reading and writing V3 spec-compliant groups and arrays -2. V3 extensions including sharding and variable chunking. -3. Improved performance by leveraging concurrency when creating/reading/writing to stores (imagine a `create_hierarchy(zarr_objects)` function). -4. User-developed extensions (e.g. storage-transformers) can be registered with Zarr-Python at runtime - -## Non-goals (of this document) - -- Implementation of any unaccepted Zarr V3 extensions -- Major revisions to the Zarr V3 spec - -## Requirements - -1. Read and write spec compliant V2 and V3 data -2. Limit unnecessary traffic to/from the store -3. Cleanly define the Array/Group/Store abstractions -4. Cleanly define how V2 will be supported going forward -5. Provide a clear roadmap to help users upgrade to 3.0 -6. 
Developer tools / hooks for registering extensions - -## Design - -### Async API - -Zarr-Python is an IO library. As such, supporting concurrent action against the storage layer is critical to achieving acceptable performance. The Zarr-Python 2 was not designed with asynchronous computation in mind and as a result has struggled to effectively leverage the benefits of concurrency. At one point, `getitems` and `setitems` support was added to the Zarr store model but that is only used for operating on a set of chunks in a single variable. - -With Zarr-Python 3.0, we have the opportunity to revisit this design. The proposal here is as follows: - -1. The `Store` interface will be entirely async. -2. On top of the async `Store` interface, we will provide an `AsyncArray` and `AsyncGroup` interface. -3. Finally, the primary user facing API will be synchronous `Array` and `Group` classes that wrap the async equivalents. - -**Examples** - -- **Store** - - ```python - class Store: - ... - async def get(self, key: str) -> bytes: - ... - async def get_partial_values(self, key_ranges: List[Tuple[str, Tuple[int, Optional[int]]]]) -> bytes: - ... - # (no sync interface here) - ``` -- **Array** - - ```python - class AsyncArray: - ... - - async def getitem(self, selection: Selection) -> np.ndarray: - # the core logic for getitem goes here - - class Array: - _async_array: AsyncArray - - def __getitem__(self, selection: Selection) -> np.ndarray: - return sync(self._async_array.getitem(selection)) - ``` -- **Group** - - ```python - class AsyncGroup: - ... 
- - async def create_group(self, path: str, **kwargs) -> AsyncGroup: - # the core logic for create_group goes here - - class Group: - _async_group: AsyncGroup - - def create_group(self, path: str, **kwargs) -> Group: - return sync(self._async_group.create_group(path, **kwargs)) - ``` -**Internal Synchronization API** - -With the `Store` and core `AsyncArray`/ `AsyncGroup` classes being predominantly async, Zarr-Python will need an internal API to provide a synchronous API. The proposal here is to use the approach in [fsspec](https://github.com/fsspec/filesystem_spec/blob/master/fsspec/asyn.py) to provide a high-level `sync` function that takes an `awaitable` and runs it in its managed IO Loop / thread. - -**FAQ** -1. Why two levels of Arrays/groups? - a. First, this is an intentional decision and departure from the current Zarrita implementation - b. The idea is that users rarely want to mix interfaces. Either they are working within an async context (currently quite rare) or they are in a typical synchronous context. - c. Splitting the two will allow us to clearly define behavior on the `AsyncObj` and simply wrap it in the `SyncObj`. -2. What if a store is only has a synchronous backend? - a. First off, this is expected to be a fairly rare occurrence. Most storage backends have async interfaces. - b. But in the event a storage backend doesn’t have a async interface, there is nothing wrong with putting synchronous code in `async` methods. There are approaches to enabling concurrent action through wrappers like AsyncIO's `loop.run_in_executor` ([ref 1](https://stackoverflow.com/questions/38865050/is-await-in-python3-cooperative-multitasking ), [ref 2](https://stackoverflow.com/a/43263397/732596), [ref 3](https://bbc.github.io/cloudfit-public-docs/asyncio/asyncio-part-5.html), [ref 4](https://docs.python.org/3/library/asyncio-eventloop.html#asyncio.loop.run_in_executor). -3. Will Zarr help manage the async contexts encouraged by some libraries (e.g. 
[AioBotoCore](https://aiobotocore.readthedocs.io/en/latest/tutorial.html#using-botocore))? - a. Many async IO libraries require entering an async context before interacting with the API. We expect some experimentation to be needed here but the initial design will follow something close to what fsspec does ([example in s3fs](https://github.com/fsspec/s3fs/blob/949442693ec940b35cda3420c17a864fbe426567/s3fs/core.py#L527)). -4. Why not provide a synchronous Store interface? - a. We could but this design is simpler. It would mean supporting it in the `AsyncGroup` and `AsyncArray` classes which, may be more trouble than its worth. Storage backends that do not have an async API will be encouraged to wrap blocking calls in an async wrapper (e.g. `loop.run_in_executor`). - -### Store API - -The `Store` API is specified directly in the V3 specification. All V3 stores should implement this abstract API, omitting Write and List support as needed. As described above, all stores will be expected to expose the required methods as async methods. - -**Example** - -```python -class ReadWriteStore: - ... - async def get(self, key: str) -> bytes: - ... - - async def get_partial_values(self, key_ranges: List[Tuple[str, int, int]) -> bytes: - ... - - async def set(self, key: str, value: Union[bytes, bytearray, memoryview]) -> None: - ... # required for writable stores - - async def set_partial_values(self, key_start_values: List[Tuple[str, int, Union[bytes, bytearray, memoryview]]]) -> None: - ... # required for writable stores - - async def list(self) -> List[str]: - ... # required for listable stores - - async def list_prefix(self, prefix: str) -> List[str]: - ... # required for listable stores - - async def list_dir(self, prefix: str) -> List[str]: - ... # required for listable stores - - # additional (optional methods) - async def getsize(self, prefix: str) -> int: - ... - - async def rename(self, src: str, dest: str) -> None - ... 
- -``` - -Recognizing that there are many Zarr applications today that rely on the `MutableMapping` interface supported by Zarr-Python 2, a wrapper store will be developed to allow existing stores to plug directly into this API. - -### Array API - -The user facing array interface will implement a subset of the [Array API Standard](https://data-apis.org/array-api/latest/). Most of the computational parts of the Array API Standard don’t fit into Zarr right now. That’s okay. What matters most is that we ensure we can give downstream applications a compliant API. - -*Note, Zarr already does most of this so this is more about formalizing the relationship than a substantial change in API.* - -| | Included | Not Included | Unknown / Maybe possible? | -| --- | --- | --- | --- | -| Attributes | `dtype` | `mT` | `device` | -| | `ndim` | `T` | | -| | `shape` | | | -| | `size` | | | -| Methods | `__getitem__` | `__array_namespace__` | `to_device` | -| | `__setitem__` | `__abs__` | `__bool__` | -| | `__eq__` | `__add__` | `__complex__` | -| | `__bool__` | `__and__` | `__dlpack__` | -| | | `__floordiv__` | `__dlpack_device__` | -| | | `__ge__` | `__float__` | -| | | `__gt__` | `__index__` | -| | | `__invert__` | `__int__` | -| | | `__le__` | | -| | | `__lshift__` | | -| | | `__lt__` | | -| | | `__matmul__` | | -| | | `__mod__` | | -| | | `__mul__` | | -| | | `__ne__` | | -| | | `__neg__` | | -| | | `__or__` | | -| | | `__pos__` | | -| | | `__pow__` | | -| | | `__rshift__` | | -| | | `__sub__` | | -| | | `__truediv__` | | -| | | `__xor__` | | -| Creation functions (`zarr.creation`) | `zeros` | | `arange` | -| | `zeros_like` | | `asarray` | -| | `ones` | | `eye` | -| | `ones_like` | | `from_dlpack` | -| | `full` | | `linspace` | -| | `full_like` | | `meshgrid` | -| | `empty` | | `tril` | -| | `empty_like` | | `triu` | - -In addition to the core array API defined above, the Array class should have the following Zarr specific properties: - -- `.metadata` (see Metadata Interface 
below) -- `.attrs` - (pull from metadata object) -- `.info` - (pull from existing property †) - -*† In Zarr-Python 2, the info property lists the store to identify initialized chunks. By default this will be turned off in 3.0 but will be configurable.* - -**Indexing** - -Zarr-Python currently supports `__getitem__` style indexing and the special `oindex` and `vindex` indexers. These are not part of the current Array API standard (see [data-apis/array-api\#669](https://github.com/data-apis/array-api/issues/669)) but they have been [proposed as a NEP](https://numpy.org/neps/nep-0021-advanced-indexing.html). Zarr-Python will maintain these in 3.0. - -We are also exploring a new high-level indexing API that will enabled optimized batch/concurrent loading of many chunks. We expect this to be important to enable performant loading of data in the context of sharding. See [this discussion](https://github.com/zarr-developers/zarr-python/discussions/1569) for more detail. - -Concurrent indexing across multiple arrays will be possible using the AsyncArray API. - -**Async and Sync Array APIs** - -Most the logic to support Zarr Arrays will live in the `AsyncArray` class. There are a few notable differences that should be called out. - -| Sync Method | Async Method | -| --- | --- | -| `__getitem__` | `getitem` | -| `__setitem__` | `setitem` | -| `__eq__` | `equals` | - -**Metadata interface** - -Zarr-Python 2.* closely mirrors the V2 spec metadata schema in the Array and Group classes. In 3.0, we plan to move the underlying metadata representation to a separate interface (e.g. `Array.metadata`). This interface will return either a `V2ArrayMetadata` or `V3ArrayMetadata` object (both will inherit from a parent `ArrayMetadataABC` class. The `V2ArrayMetadata` and `V3ArrayMetadata` classes will be responsible for producing valid JSON representations of their metadata, and yielding a consistent view to the `Array` or `Group` class. 
- -### Group API - -The main question is how closely we should follow the existing Zarr-Python implementation / `MutableMapping` interface. The table below shows the primary `Group` methods in Zarr-Python 2 and attempts to identify if and how they would be implemented in 3.0. - -| V2 Group Methods | `AsyncGroup` | `Group` | `h5py_compat.Group`` | -| --- | --- | --- | --- | -| `__len__` | `length` | `__len__` | `__len__` | -| `__iter__` | `__aiter__` | `__iter__` | `__iter__` | -| `__contains__` | `contains` | `__contains__` | `__contains__` | -| `__getitem__` | `getitem` | `__getitem__` | `__getitem__` | -| `__enter__` | N/A | N/A | `__enter__` | -| `__exit__` | N/A | N/A | `__exit__` | -| `group_keys` | `group_keys` | `group_keys` | N/A | -| `groups` | `groups` | `groups` | N/A | -| `array_keys` | `array_key` | `array_keys` | N/A | -| `arrays` | `arrays`* | `arrays` | N/A | -| `visit` | ? | ? | `visit` | -| `visitkeys` | ? | ? | ? | -| `visitvalues` | ? | ? | ? | -| `visititems` | ? | ? | `visititems` | -| `tree` | `tree` | `tree` | `Both` | -| `create_group` | `create_group` | `create_group` | `create_group` | -| `require_group` | N/A | N/A | `require_group` | -| `create_groups` | ? | ? | N/A | -| `require_groups` | ? | ? | ? 
| -| `create_dataset` | N/A | N/A | `create_dataset` | -| `require_dataset` | N/A | N/A | `require_dataset` | -| `create` | `create_array` | `create_array` | N/A | -| `empty` | `empty` | `empty` | N/A | -| `zeros` | `zeros` | `zeros` | N/A | -| `ones` | `ones` | `ones` | N/A | -| `full` | `full` | `full` | N/A | -| `array` | `create_array` | `create_array` | N/A | -| `empty_like` | `empty_like` | `empty_like` | N/A | -| `zeros_like` | `zeros_like` | `zeros_like` | N/A | -| `ones_like` | `ones_like` | `ones_like` | N/A | -| `full_like` | `full_like` | `full_like` | N/A | -| `move` | `move` | `move` | `move` | - -**`zarr.h5compat.Group`** - -Zarr-Python 2.* made an attempt to align its API with that of [h5py](https://docs.h5py.org/en/stable/index.html). With 3.0, we will relax this alignment in favor of providing an explicit compatibility module (`zarr.h5py_compat`). This module will expose the `Group` and `Dataset` APIs that map to Zarr-Python’s `Group` and `Array` objects. - -### Creation API - -Zarr-Python 2.* bundles together the creation and serialization of Zarr objects. Zarr-Python 3.* will make it possible to create objects in memory separate from serializing them. This will specifically enable writing hierarchies of Zarr objects in a single batch step. For example: - -```python - -arr1 = Array(shape=(10, 10), path="foo/bar", dtype="i4", store=store) -arr2 = Array(shape=(10, 10), path="foo/spam", dtype="f8", store=store) - -arr1.save() -arr2.save() - -# or equivalently - -zarr.save_many([arr1 ,arr2]) -``` - -*Note: this batch creation API likely needs additional design effort prior to implementation.* - -### Plugin API - -Zarr V3 was designed to be extensible at multiple layers. Zarr-Python will support these extensions through a combination of [Abstract Base Classes](https://docs.python.org/3/library/abc.html) (ABCs) and [Entrypoints](https://packaging.python.org/en/latest/specifications/entry-points/). 
- -**ABCs** - -Zarr V3 will expose Abstract base classes for the following objects: - -- `Store`, `ReadStore`, `ReadWriteStore`, `ReadListStore`, and `ReadWriteListStore` -- `BaseArray`, `SynchronousArray`, and `AsynchronousArray` -- `BaseGroup`, `SynchronousGroup`, and `AsynchronousGroup` -- `Codec`, `ArrayArrayCodec`, `ArrayBytesCodec`, `BytesBytesCodec` - -**Entrypoints** - -Lots more thinking here but the idea here is to provide entrypoints for `data type`, `chunk grid`, `chunk key encoding`, `codecs`, `storage_transformers` and `stores`. These might look something like: - -``` -entry_points=""" - [zarr.codecs] - blosc_codec=codec_plugin:make_blosc_codec - zlib_codec=codec_plugin:make_zlib_codec -""" -``` - -### Python type hints and static analysis - -Target 100% Mypy coverage in 3.0 source. - -### Observability - -A persistent problem in Zarr-Python is diagnosing problems that span many parts of the stack. To address this in 3.0, we will add a basic logging framework that can be used to debug behavior at various levels of the stack. We propose to add the separate loggers for the following namespaces: - -- `array` -- `group` -- `store` -- `codec` - -These should be documented such that users know how to activate them and developers know how to use them when developing extensions. - -### Dependencies - -Today, Zarr-Python has the following required dependencies: - -```python -dependencies = [ - 'asciitree', - 'numpy>=1.20,!=1.21.0', - 'fasteners', - 'numcodecs>=0.10.0', -] -``` - -What other dependencies should be considered? - -1. Attrs - Zarrita makes extensive use of the Attrs library -2. Fsspec - Zarrita has a hard dependency on Fsspec. This could be easily relaxed though. - -## Breaking changes relative to Zarr-Python 2.* - -1. H5py compat moved to a stand alone module? -2. `Group.__getitem__` support moved to `Group.members.__getitem__`? -3. Others? - -## Open questions - -1. How to treat V2 - a. 
Note: Zarrita currently implements a separate `V2Array` and `V3Array` classes. This feels less than ideal. - b. We could easily convert metadata from v2 to the V3 Array, but what about writing? - c. Ideally, we don’t have completely separate code paths. But if its too complicated to support both within one interface, its probably better. -2. How and when to remove the current implementation of V3. - a. It's hidden behind a hard-to-use feature flag so we probably don't need to do anything. -4. How to model runtime configuration? -5. Which extensions belong in Zarr-Python and which belong in separate packages? - a. We don't need to take a strong position on this here. It's likely that someone will want to put Sharding in. That will be useful to develop in parallel because it will give us a good test case for the plugin interface. - -## Testing - -Zarr-python 3.0 adds a major new dimension to Zarr: Async support. This also comes with a compatibility risk, we will need to thoroughly test support in key execution environments. Testing plan: -- Reuse the existing test suite for testing the `v3` API. - - `xfail` tests that expose breaking changes with `3.0 - breaking change` description. This will help identify additional and/or unintentional breaking changes - - Rework tests that were only testing internal APIs. -- Add a set of functional / integration tests targeting real-world workflows in various contexts (e.g. w/ Dask) - -## Development process - -Zarr-Python 3.0 will introduce a number of new APIs and breaking changes to existing APIs. In order to facilitate ongoing support for Zarr-Python 2.*, we will take on the following development process: - -- Create a `v3` branch that can be use for developing the core functionality apart from the `main` branch. This will allow us to support ongoing work and bug fixes on the `main` branch. -- Put the `3.0` APIs inside a `zarr.v3` module. 
Imports from this namespace will all be new APIs that users can develop and test against once the `v3` branch is merged to `main`. -- Kickstart the process by pulling in the current state of `zarrita` - which has many of the features described in this design. -- Release a series of 2.* releases with the `v3` namespace -- When `v3` is complete, move contents of `v3` to the package root - -**Milestones** - -Below are a set of specific milestones leading toward the completion of this process. As work begins, we expect this list to grow in specificity. - -1. Port current version of Zarrita to Zarr-Python -2. Formalize Async interface by splitting `Array` and `Group` objects into Sync and Async versions -4. Implement "fancy" indexing operations on the `AsyncArray` -6. Implement an abstract base class for the `Store` interface and a wrapper `Store` to make use of existing `MutableMapping` stores. -7. Rework the existing unit test suite to use the `v3` namespace. -8. Develop a plugin interface for extensions -9. Develop a set of functional and integration tests -10. Work with downstream libraries (Xarray, Dask, etc.) to test new APIs - -## TODOs - -The following subjects are not covered in detail above but perhaps should be. Including them here so they are not forgotten. - -1. [Store] Should Zarr provide an API for caching objects after first read/list/etc. Read only stores? -2. [Array] buffer protocol support -3. [Array] `meta_array` support -4. [Extensions] Define how Zarr-Python will consume the various plugin types -5. [Misc] H5py compatibility requires a bit more work and a champion to drive it forward. -6. [Misc] Define `chunk_store` API in 3.0 -7. [Misc] Define `synchronizer` API in 3.0 - -## References - -1. [Zarr-Python repository](https://github.com/zarr-developers/zarr-python) -2. [Zarr core specification (version 3.0) — Zarr specs documentation](https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#) -3. 
[Zarrita repository](https://github.com/scalableminds/zarrita) -4. [Async-Zarr](https://github.com/martindurant/async-zarr) -5. [Zarr-Python Discussion Topic](https://github.com/zarr-developers/zarr-python/discussions/1569) From c68c8190afbf20849594aac4355f365dc03737ef Mon Sep 17 00:00:00 2001 From: Dimitri Papadopoulos Orfanos <3234522+DimitriPapadopoulos@users.noreply.github.com> Date: Tue, 15 Oct 2024 20:21:19 +0200 Subject: [PATCH 05/22] Multiple imports for an import name (#2367) Co-authored-by: Joe Hamman --- src/zarr/abc/store.py | 1 - src/zarr/storage/remote.py | 1 - 2 files changed, 2 deletions(-) diff --git a/src/zarr/abc/store.py b/src/zarr/abc/store.py index 85e335089d..40c8129afe 100644 --- a/src/zarr/abc/store.py +++ b/src/zarr/abc/store.py @@ -2,7 +2,6 @@ from abc import ABC, abstractmethod from asyncio import gather -from types import TracebackType from typing import TYPE_CHECKING, NamedTuple, Protocol, runtime_checkable if TYPE_CHECKING: diff --git a/src/zarr/storage/remote.py b/src/zarr/storage/remote.py index 6945f129ed..0a0ec7f7cc 100644 --- a/src/zarr/storage/remote.py +++ b/src/zarr/storage/remote.py @@ -5,7 +5,6 @@ import fsspec from zarr.abc.store import ByteRangeRequest, Store -from zarr.core.buffer import Buffer from zarr.storage.common import _dereference_path if TYPE_CHECKING: From 74f4a9e5e156a4acd202a1b598618726b75499e2 Mon Sep 17 00:00:00 2001 From: Dimitri Papadopoulos Orfanos <3234522+DimitriPapadopoulos@users.noreply.github.com> Date: Tue, 15 Oct 2024 20:30:29 +0200 Subject: [PATCH 06/22] Enforce ruff/pycodestyle warnings (W) (#2369) * Apply ruff/pycodestyle rule W291 W291 Trailing whitespace * Enforce ruff/pycodestyle warnings (W) It looks like `ruff format` does not catch all trailing spaces. 
--------- Co-authored-by: Joe Hamman --- pyproject.toml | 1 + src/zarr/registry.py | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 281baee1c3..46c957968d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -223,6 +223,7 @@ extend-select = [ "TCH", # flake8-type-checking "TRY", # tryceratops "UP", # pyupgrade + "W", # pycodestyle warnings ] ignore = [ "ANN003", diff --git a/src/zarr/registry.py b/src/zarr/registry.py index fcea834f04..12b0738016 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -48,9 +48,9 @@ def register(self, cls: type[T]) -> None: __ndbuffer_registry: Registry[NDBuffer] = Registry() """ -The registry module is responsible for managing implementations of codecs, pipelines, buffers and ndbuffers and -collecting them from entrypoints. -The implementation used is determined by the config +The registry module is responsible for managing implementations of codecs, +pipelines, buffers and ndbuffers and collecting them from entrypoints. +The implementation used is determined by the config. 
""" From 39e68aa59901c67d10e844bd02259793b7b99d19 Mon Sep 17 00:00:00 2001 From: Dimitri Papadopoulos Orfanos <3234522+DimitriPapadopoulos@users.noreply.github.com> Date: Wed, 16 Oct 2024 00:04:10 +0200 Subject: [PATCH 07/22] Apply ruff/pycodestyle preview rule E262 (#2370) E262 Inline comment should start with `# ` Co-authored-by: Joe Hamman --- tests/test_indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_indexing.py b/tests/test_indexing.py index 0ea9cda39d..b3a1990686 100644 --- a/tests/test_indexing.py +++ b/tests/test_indexing.py @@ -683,7 +683,7 @@ def test_get_orthogonal_selection_2d(store: StorePath) -> None: with pytest.raises(IndexError): z.get_orthogonal_selection(selection_2d_bad) # type: ignore[arg-type] with pytest.raises(IndexError): - z.oindex[selection_2d_bad] # type: ignore[index] + z.oindex[selection_2d_bad] # type: ignore[index] def _test_get_orthogonal_selection_3d( From fc96b91c4a84158fa214ffae80e1e211c7d4634d Mon Sep 17 00:00:00 2001 From: Dimitri Papadopoulos Orfanos <3234522+DimitriPapadopoulos@users.noreply.github.com> Date: Wed, 16 Oct 2024 00:19:39 +0200 Subject: [PATCH 08/22] Fix typo (#2382) Co-authored-by: Joe Hamman --- src/zarr/core/sync.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zarr/core/sync.py b/src/zarr/core/sync.py index 8c5bc9c397..f7d4529478 100644 --- a/src/zarr/core/sync.py +++ b/src/zarr/core/sync.py @@ -133,7 +133,7 @@ def sync( finished, unfinished = wait([future], return_when=asyncio.ALL_COMPLETED, timeout=timeout) if len(unfinished) > 0: - raise TimeoutError(f"Coroutine {coro} failed to finish in within {timeout} s") + raise TimeoutError(f"Coroutine {coro} failed to finish within {timeout} s") assert len(finished) == 1 return_result = next(iter(finished)).result() From 9a9c989fd85efd506899c48f802f010d1ef2e8ad Mon Sep 17 00:00:00 2001 From: Dimitri Papadopoulos Orfanos <3234522+DimitriPapadopoulos@users.noreply.github.com> Date: Wed, 16 Oct 2024 
00:21:27 +0200 Subject: [PATCH 09/22] Imported name is not used anywhere in the module (#2379) --- docs/conf.py | 1 - tests/test_array.py | 1 - tests/test_v2.py | 2 -- 3 files changed, 4 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 0a328ac25f..72c6130a16 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -21,7 +21,6 @@ from importlib.metadata import version as get_version -import zarr import sphinx # If extensions (or modules to document with autodoc) are in another directory, diff --git a/tests/test_array.py b/tests/test_array.py index b558c826d6..829a04d304 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -6,7 +6,6 @@ import pytest import zarr.api.asynchronous -import zarr.storage from zarr import Array, AsyncArray, Group from zarr.codecs import BytesCodec, VLenBytesCodec from zarr.core.array import chunks_initialized diff --git a/tests/test_v2.py b/tests/test_v2.py index 729ed0533f..439b15b64c 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -9,8 +9,6 @@ from numcodecs.blosc import Blosc import zarr -import zarr.core.buffer.cpu -import zarr.core.metadata import zarr.storage from zarr import Array from zarr.storage import MemoryStore, StorePath From df4ab1b2af4db91c41fd5ac85a20d5c537b61053 Mon Sep 17 00:00:00 2001 From: Dimitri Papadopoulos Orfanos <3234522+DimitriPapadopoulos@users.noreply.github.com> Date: Wed, 16 Oct 2024 00:44:01 +0200 Subject: [PATCH 10/22] Missing mandatory keyword argument `shape` (#2376) --- src/zarr/core/group.py | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index e85057e2f6..e25f70eef6 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -932,7 +932,11 @@ async def create_array( @deprecated("Use AsyncGroup.create_array instead.") async def create_dataset( - self, name: str, **kwargs: Any + self, + name: str, + *, + shape: ShapeLike, + **kwargs: Any, ) -> AsyncArray[ArrayV2Metadata] | 
AsyncArray[ArrayV3Metadata]: """Create an array. @@ -943,6 +947,8 @@ async def create_dataset( ---------- name : str Array name. + shape : int or tuple of ints + Array shape. kwargs : dict Additional arguments passed to :func:`zarr.AsyncGroup.create_array`. @@ -953,7 +959,7 @@ async def create_dataset( .. deprecated:: 3.0.0 The h5py compatibility methods will be removed in 3.1.0. Use `AsyncGroup.create_array` instead. """ - return await self.create_array(name, **kwargs) + return await self.create_array(name, shape=shape, **kwargs) @deprecated("Use AsyncGroup.require_array instead.") async def require_dataset( @@ -1621,7 +1627,13 @@ def create_dataset(self, name: str, **kwargs: Any) -> Array: return Array(self._sync(self._async_group.create_dataset(name, **kwargs))) @deprecated("Use Group.require_array instead.") - def require_dataset(self, name: str, **kwargs: Any) -> Array: + def require_dataset( + self, + name: str, + *, + shape: ShapeLike, + **kwargs: Any, + ) -> Array: """Obtain an array, creating if it doesn't exist. Arrays are known as "datasets" in HDF5 terminology. For compatibility @@ -1648,9 +1660,15 @@ def require_dataset(self, name: str, **kwargs: Any) -> Array: .. deprecated:: 3.0.0 The h5py compatibility methods will be removed in 3.1.0. Use `Group.require_array` instead. """ - return Array(self._sync(self._async_group.require_array(name, **kwargs))) + return Array(self._sync(self._async_group.require_array(name, shape=shape, **kwargs))) - def require_array(self, name: str, **kwargs: Any) -> Array: + def require_array( + self, + name: str, + *, + shape: ShapeLike, + **kwargs: Any, + ) -> Array: """Obtain an array, creating if it doesn't exist. 
@@ -1672,7 +1690,7 @@ def require_array(self, name: str, **kwargs: Any) -> Array: ------- a : Array """ - return Array(self._sync(self._async_group.require_array(name, **kwargs))) + return Array(self._sync(self._async_group.require_array(name, shape=shape, **kwargs))) def empty(self, *, name: str, shape: ChunkCoords, **kwargs: Any) -> Array: return Array(self._sync(self._async_group.empty(name=name, shape=shape, **kwargs))) From 568853f04370dfb0af65611f678dbc0d566fd304 Mon Sep 17 00:00:00 2001 From: Dimitri Papadopoulos Orfanos <3234522+DimitriPapadopoulos@users.noreply.github.com> Date: Wed, 16 Oct 2024 00:44:18 +0200 Subject: [PATCH 11/22] Update ruff rules to ignore (#2374) Co-authored-by: Joe Hamman --- pyproject.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 46c957968d..4013f9ca22 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -231,6 +231,7 @@ ignore = [ "ANN102", "ANN401", "PT004", # deprecated + "PT005", # deprecated "PT011", # TODO: apply this rule "PT012", # TODO: apply this rule "PYI013", @@ -238,6 +239,8 @@ ignore = [ "RET506", "RUF005", "TRY003", + "UP027", # deprecated + "UP038", # https://github.com/astral-sh/ruff/issues/7871 # https://docs.astral.sh/ruff/formatter/#conflicting-lint-rules "W191", "E111", From 3f676ff400e02062d3b1b21d5379efae3ddd86fe Mon Sep 17 00:00:00 2001 From: Emma Marshall <55526386+e-marshall@users.noreply.github.com> Date: Tue, 15 Oct 2024 16:58:21 -0600 Subject: [PATCH 12/22] Docstrings for arraymodule (#2276) * start to docstrings for arraymodule * incorporating toms edits, overriding mypy error... 
* fix attrs * Update src/zarr/core/array.py Co-authored-by: Sanket Verma * fix store -> storage * remove properties from asyncarray docstring --------- Co-authored-by: Sanket Verma Co-authored-by: Joe Hamman --- src/zarr/core/array.py | 339 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 336 insertions(+), 3 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index ca405842e0..0e73d44563 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -175,6 +175,32 @@ async def get_array_metadata( @dataclass(frozen=True) class AsyncArray(Generic[T_ArrayMetadata]): + """ + An asynchronous array class representing a chunked array stored in a Zarr store. + + Parameters + ---------- + metadata : ArrayMetadata + The metadata of the array. + store_path : StorePath + The path to the Zarr store. + codec_pipeline : CodecPipeline, optional + The codec pipeline used for encoding and decoding chunks, by default None. + order : {'C', 'F'}, optional + The order of the array data in memory, by default None. + + Attributes + ---------- + metadata : ArrayMetadata + The metadata of the array. + store_path : StorePath + The path to the Zarr store. + codec_pipeline : CodecPipeline + The codec pipeline used for encoding and decoding chunks. + order : {'C', 'F'} + The order of the array data in memory. + """ + metadata: T_ArrayMetadata store_path: StorePath codec_pipeline: CodecPipeline = field(init=False) @@ -364,6 +390,69 @@ async def create( exists_ok: bool = False, data: npt.ArrayLike | None = None, ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: + """ + Method to create a new asynchronous array instance. + + Parameters + ---------- + store : StoreLike + The store where the array will be created. + shape : ShapeLike + The shape of the array. + dtype : npt.DTypeLike + The data type of the array. + zarr_format : ZarrFormat, optional + The Zarr format version (default is 3). 
+ fill_value : Any, optional + The fill value of the array (default is None). + attributes : dict[str, JSON], optional + The attributes of the array (default is None). + chunk_shape : ChunkCoords, optional + The shape of the array's chunks (default is None). + chunk_key_encoding : ChunkKeyEncoding, optional + The chunk key encoding (default is None). + codecs : Iterable[Codec | dict[str, JSON]], optional + The codecs used to encode the data (default is None). + dimension_names : Iterable[str], optional + The names of the dimensions (default is None). + chunks : ShapeLike, optional + The shape of the array's chunks (default is None). + V2 only. V3 arrays should not have 'chunks' parameter. + dimension_separator : Literal[".", "/"], optional + The dimension separator (default is None). + V2 only. V3 arrays cannot have a dimension separator. + order : Literal["C", "F"], optional + The order of the array (default is None). + V2 only. V3 arrays should not have 'order' parameter. + filters : list[dict[str, JSON]], optional + The filters used to compress the data (default is None). + V2 only. V3 arrays should not have 'filters' parameter. + compressor : dict[str, JSON], optional + The compressor used to compress the data (default is None). + V2 only. V3 arrays should not have 'compressor' parameter. + exists_ok : bool, optional + Whether to raise an error if the store already exists (default is False). + data : npt.ArrayLike, optional + The data to be inserted into the array (default is None). + + Returns + ------- + AsyncArray + The created asynchronous array instance. 
+ + Examples + -------- + >>> import zarr + >>> store = zarr.storage.MemoryStore(mode='w') + >>> async_arr = await zarr.core.array.AsyncArray.create( + >>> store=store, + >>> shape=(100,100), + >>> chunks=(10,10), + >>> dtype='i4', + >>> fill_value=0) + + + """ store_path = await make_store_path(store) dtype_parsed = parse_dtype(dtype, zarr_format) @@ -558,6 +647,28 @@ async def open( store: StoreLike, zarr_format: ZarrFormat | None = 3, ) -> AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata]: + """ + Async method to open an existing Zarr array from a given store. + + Parameters + ---------- + store : StoreLike + The store containing the Zarr array. + zarr_format : ZarrFormat | None, optional + The Zarr format version (default is 3). + + Returns + ------- + AsyncArray + The opened Zarr array. + + Examples + -------- + >>> import zarr + >>> store = zarr.storage.MemoryStore(mode='w') + >>> async_arr = await AsyncArray.open(store) # doctest: +ELLIPSIS + + """ store_path = await make_store_path(store) metadata_dict = await get_array_metadata(store_path, zarr_format=zarr_format) # TODO: remove this cast when we have better type hints @@ -570,14 +681,38 @@ def store(self) -> Store: @property def ndim(self) -> int: + """Returns the number of dimensions in the Array. + + Returns + ------- + int + The number of dimensions in the Array. + """ return len(self.metadata.shape) @property def shape(self) -> ChunkCoords: + """Returns the shape of the Array. + + Returns + ------- + tuple + The shape of the Array. + """ return self.metadata.shape @property def chunks(self) -> ChunkCoords: + """Returns the chunk shape of the Array. + + Only defined for arrays using using `RegularChunkGrid`. + If array doesn't use `RegularChunkGrid`, `NotImplementedError` is raised. + + Returns + ------- + ChunkCoords: + The chunk shape of the Array. 
+ """ if isinstance(self.metadata.chunk_grid, RegularChunkGrid): return self.metadata.chunk_grid.chunk_shape @@ -589,28 +724,69 @@ def chunks(self) -> ChunkCoords: @property def size(self) -> int: + """Returns the total number of elements in the array + + Returns + ------- + int + Total number of elements in the array + """ return np.prod(self.metadata.shape).item() @property def dtype(self) -> np.dtype[Any]: + """Returns the data type of the array. + + Returns + ------- + np.dtype + Data type of the array + """ return self.metadata.dtype @property def attrs(self) -> dict[str, JSON]: + """Returns the attributes of the array. + + Returns + ------- + dict + Attributes of the array + """ return self.metadata.attributes @property def read_only(self) -> bool: + """Returns True if the array is read-only. + + Returns + ------- + bool + True if the array is read-only + """ + # Backwards compatibility for 2.x return self.store_path.store.mode.readonly @property def path(self) -> str: - """Storage path.""" + """Storage path. + + Returns + ------- + str + The path to the array in the Zarr store. + """ return self.store_path.path @property def name(self) -> str | None: - """Array name following h5py convention.""" + """Array name following h5py convention. + + Returns + ------- + str + The name of the array. + """ if self.path: # follow h5py convention: add leading slash name = self.path @@ -621,7 +797,13 @@ def name(self) -> str | None: @property def basename(self) -> str | None: - """Final component of name.""" + """Final component of name. + + Returns + ------- + str + The basename or final component of the array name. + """ if self.name is not None: return self.name.split("/")[-1] return None @@ -630,6 +812,11 @@ def basename(self) -> str | None: def cdata_shape(self) -> ChunkCoords: """ The shape of the chunk grid for this array. + + Returns + ------- + Tuple[int] + The shape of the chunk grid for this array. 
""" return tuple(ceildiv(s, c) for s, c in zip(self.shape, self.chunks, strict=False)) @@ -637,6 +824,11 @@ def cdata_shape(self) -> ChunkCoords: def nchunks(self) -> int: """ The number of chunks in the stored representation of this array. + + Returns + ------- + int + The total number of chunks in the array. """ return product(self.cdata_shape) @@ -644,6 +836,11 @@ def nchunks(self) -> int: def nchunks_initialized(self) -> int: """ The number of chunks that have been persisted in storage. + + Returns + ------- + int + The number of initialized chunks in the array. """ return nchunks_initialized(self) @@ -782,6 +979,36 @@ async def getitem( *, prototype: BufferPrototype | None = None, ) -> NDArrayLike: + """ + Asynchronous function that retrieves a subset of the array's data based on the provided selection. + + Parameters + ---------- + selection : BasicSelection + A selection object specifying the subset of data to retrieve. + prototype : BufferPrototype, optional + A buffer prototype to use for the retrieved data (default is None). + + Returns + ------- + NDArrayLike + The retrieved subset of the array's data. + + Examples + -------- + >>> import zarr + >>> store = zarr.storage.MemoryStore(mode='w') + >>> async_arr = await zarr.core.array.AsyncArray.create( + ... store=store, + ... shape=(100,100), + ... chunks=(10,10), + ... dtype='i4', + ... fill_value=0) + + >>> await async_arr.getitem((0,1)) # doctest: +ELLIPSIS + array(0, dtype=int32) + + """ if prototype is None: prototype = default_buffer_prototype() indexer = BasicIndexer( @@ -924,6 +1151,18 @@ async def info(self) -> None: @dataclass(frozen=True) class Array: + """Instantiate an array from an initialized store. + + Parameters + ---------- + store : StoreLike + The array store that has already been initialized. + shape : ChunkCoords + The shape of the array. + dtype : npt.DTypeLike + The dtype of the array. 
+ """ + _async_array: AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata] @classmethod @@ -957,6 +1196,42 @@ def create( # runtime exists_ok: bool = False, ) -> Array: + """Creates a new Array instance from an initialized store. + + Parameters + ---------- + store : StoreLike + The array store that has already been initialized. + shape : ChunkCoords + The shape of the array. + dtype : npt.DTypeLike + The data type of the array. + chunk_shape : ChunkCoords, optional + The shape of the Array's chunks (default is None). + chunk_key_encoding : ChunkKeyEncoding, optional + The chunk key encoding (default is None). + codecs : Iterable[Codec | dict[str, JSON]], optional + The codecs used to encode the data (default is None). + dimension_names : Iterable[str], optional + The names of the dimensions (default is None). + chunks : ChunkCoords, optional + The shape of the Array's chunks (default is None). + dimension_separator : Literal[".", "/"], optional + The dimension separator (default is None). + order : Literal["C", "F"], optional + The order of the array (default is None). + filters : list[dict[str, JSON]], optional + The filters used to compress the data (default is None). + compressor : dict[str, JSON], optional + The compressor used to compress the data (default is None). + exists_ok : bool, optional + Whether to raise an error if the store already exists (default is False). + + Returns + ------- + Array + Array created from the store. + """ async_array = sync( AsyncArray.create( store=store, @@ -993,6 +1268,18 @@ def open( cls, store: StoreLike, ) -> Array: + """Opens an existing Array from a store. + + Parameters + ---------- + store : Store + Store containing the Array. + + Returns + ------- + Array + Array opened from the store. + """ async_array = sync(AsyncArray.open(store)) return cls(async_array) @@ -1002,26 +1289,72 @@ def store(self) -> Store: @property def ndim(self) -> int: + """Returns the number of dimensions in the array. 
+ + Returns + ------- + int + The number of dimensions in the array. + """ return self._async_array.ndim @property def shape(self) -> ChunkCoords: + """Returns the shape of the array. + + Returns + ------- + ChunkCoords + The shape of the array. + """ return self._async_array.shape @property def chunks(self) -> ChunkCoords: + """Returns a tuple of integers describing the length of each dimension of a chunk of the array. + + Returns + ------- + tuple + A tuple of integers representing the length of each dimension of a chunk. + """ return self._async_array.chunks @property def size(self) -> int: + """Returns the total number of elements in the array. + + Returns + ------- + int + Total number of elements in the array. + """ return self._async_array.size @property def dtype(self) -> np.dtype[Any]: + """Returns the NumPy data type. + + Returns + ------- + np.dtype + The NumPy data type. + """ return self._async_array.dtype @property def attrs(self) -> Attributes: + """Returns a MutableMapping containing user-defined attributes. + + Returns + ------- + attrs : MutableMapping + A MutableMapping object containing user-defined attributes. + + Notes + ----- + Note that attribute values must be JSON serializable. 
+ """ return Attributes(self) @property From 59350df64b259ca56b40d51b9b4e9a2ddbe1abd7 Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Wed, 16 Oct 2024 19:04:15 +0200 Subject: [PATCH 13/22] fix/normalize storage paths (#2384) * bring in path normalization function from v2, and add a failing test * rephrase comment * simplify storepath creation * Update tests/v3/test_api.py Co-authored-by: Joe Hamman * refactor: remove redundant zarr format fixture * replace assertion with an informative error message * fix incorrect path concatenation in make_store_path, and refactor store_path tests * remove upath import because we don't need it * apply suggestions from code review --------- Co-authored-by: Joe Hamman --- src/zarr/api/asynchronous.py | 33 ++++--------- src/zarr/storage/_utils.py | 38 +++++++++++++++ src/zarr/storage/common.py | 34 ++++++++----- src/zarr/storage/local.py | 5 +- tests/conftest.py | 2 +- tests/test_api.py | 27 +++++++++- tests/test_group.py | 14 ++---- tests/test_store/test_core.py | 92 ++++++++++++++++++++++++++++------- 8 files changed, 180 insertions(+), 65 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 559049ae4f..e500562c4c 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -170,10 +170,7 @@ async def consolidate_metadata( The group, with the ``consolidated_metadata`` field set to include the metadata of each child node. 
""" - store_path = await make_store_path(store) - - if path is not None: - store_path = store_path / path + store_path = await make_store_path(store, path=path) group = await AsyncGroup.open(store_path, zarr_format=zarr_format, use_consolidated=False) group.store_path.store._check_writable() @@ -291,10 +288,7 @@ async def open( """ zarr_format = _handle_zarr_version_or_format(zarr_version=zarr_version, zarr_format=zarr_format) - store_path = await make_store_path(store, mode=mode, storage_options=storage_options) - - if path is not None: - store_path = store_path / path + store_path = await make_store_path(store, mode=mode, path=path, storage_options=storage_options) if "shape" not in kwargs and mode in {"a", "w", "w-"}: try: @@ -401,9 +395,7 @@ async def save_array( ) mode = kwargs.pop("mode", None) - store_path = await make_store_path(store, mode=mode, storage_options=storage_options) - if path is not None: - store_path = store_path / path + store_path = await make_store_path(store, path=path, mode=mode, storage_options=storage_options) new = await AsyncArray.create( store_path, zarr_format=zarr_format, @@ -582,9 +574,7 @@ async def group( mode = None if isinstance(store, Store) else cast(AccessModeLiteral, "a") - store_path = await make_store_path(store, mode=mode, storage_options=storage_options) - if path is not None: - store_path = store_path / path + store_path = await make_store_path(store, path=path, mode=mode, storage_options=storage_options) if chunk_store is not None: warnings.warn("chunk_store is not yet implemented", RuntimeWarning, stacklevel=2) @@ -697,9 +687,7 @@ async def open_group( if chunk_store is not None: warnings.warn("chunk_store is not yet implemented", RuntimeWarning, stacklevel=2) - store_path = await make_store_path(store, mode=mode, storage_options=storage_options) - if path is not None: - store_path = store_path / path + store_path = await make_store_path(store, mode=mode, storage_options=storage_options, path=path) if attributes is 
None: attributes = {} @@ -883,9 +871,7 @@ async def create( if not isinstance(store, Store | StorePath): mode = "a" - store_path = await make_store_path(store, mode=mode, storage_options=storage_options) - if path is not None: - store_path = store_path / path + store_path = await make_store_path(store, path=path, mode=mode, storage_options=storage_options) return await AsyncArray.create( store_path, @@ -925,6 +911,7 @@ async def empty( retrieve data from an empty Zarr array, any values may be returned, and these are not guaranteed to be stable from one access to the next. """ + return await create(shape=shape, fill_value=None, **kwargs) @@ -1044,7 +1031,7 @@ async def open_array( store: StoreLike | None = None, zarr_version: ZarrFormat | None = None, # deprecated zarr_format: ZarrFormat | None = None, - path: PathLike | None = None, + path: PathLike = "", storage_options: dict[str, Any] | None = None, **kwargs: Any, # TODO: type kwargs as valid args to save ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: @@ -1071,9 +1058,7 @@ async def open_array( """ mode = kwargs.pop("mode", None) - store_path = await make_store_path(store, mode=mode) - if path is not None: - store_path = store_path / path + store_path = await make_store_path(store, path=path, mode=mode) zarr_format = _handle_zarr_version_or_format(zarr_version=zarr_version, zarr_format=zarr_format) diff --git a/src/zarr/storage/_utils.py b/src/zarr/storage/_utils.py index cbc9c42bbd..ae39468897 100644 --- a/src/zarr/storage/_utils.py +++ b/src/zarr/storage/_utils.py @@ -1,11 +1,49 @@ from __future__ import annotations +import re +from pathlib import Path from typing import TYPE_CHECKING if TYPE_CHECKING: from zarr.core.buffer import Buffer +def normalize_path(path: str | bytes | Path | None) -> str: + if path is None: + result = "" + elif isinstance(path, bytes): + result = str(path, "ascii") + + # handle pathlib.Path + elif isinstance(path, Path): + result = str(path) + + elif isinstance(path, 
str): + result = path + + else: + raise TypeError(f'Object {path} has an invalid type for "path": {type(path).__name__}') + + # convert backslash to forward slash + result = result.replace("\\", "/") + + # remove leading and trailing slashes + result = result.strip("/") + + # collapse any repeated slashes + pat = re.compile(r"//+") + result = pat.sub("/", result) + + # disallow path segments with just '.' or '..' + segments = result.split("/") + if any(s in {".", ".."} for s in segments): + raise ValueError( + f"The path {path!r} is invalid because its string representation contains '.' or '..' segments." + ) + + return result + + def _normalize_interval_index( data: Buffer, interval: None | tuple[int | None, int | None] ) -> tuple[int, int]: diff --git a/src/zarr/storage/common.py b/src/zarr/storage/common.py index 101e8f38af..b640a7729b 100644 --- a/src/zarr/storage/common.py +++ b/src/zarr/storage/common.py @@ -8,6 +8,7 @@ from zarr.core.buffer import Buffer, default_buffer_prototype from zarr.core.common import ZARR_JSON, ZARRAY_JSON, ZGROUP_JSON, ZarrFormat from zarr.errors import ContainsArrayAndGroupError, ContainsArrayError, ContainsGroupError +from zarr.storage._utils import normalize_path from zarr.storage.local import LocalStore from zarr.storage.memory import MemoryStore @@ -41,9 +42,9 @@ class StorePath: store: Store path: str - def __init__(self, store: Store, path: str | None = None) -> None: + def __init__(self, store: Store, path: str = "") -> None: self.store = store - self.path = path or "" + self.path = path async def get( self, @@ -159,6 +160,7 @@ def __eq__(self, other: object) -> bool: async def make_store_path( store_like: StoreLike | None, *, + path: str | None = "", mode: AccessModeLiteral | None = None, storage_options: dict[str, Any] | None = None, ) -> StorePath: @@ -189,6 +191,9 @@ async def make_store_path( ---------- store_like : StoreLike | None The object to convert to a `StorePath` object. 
+ path: str | None, optional + The path to use when creating the `StorePath` object. If None, the + default path is the empty string. mode : AccessModeLiteral | None, optional The mode to use when creating the `StorePath` object. If None, the default mode is 'r'. @@ -209,37 +214,44 @@ async def make_store_path( from zarr.storage.remote import RemoteStore # circular import used_storage_options = False - + path_normalized = normalize_path(path) if isinstance(store_like, StorePath): if mode is not None and mode != store_like.store.mode.str: _store = store_like.store.with_mode(mode) await _store._ensure_open() - store_like = StorePath(_store) - result = store_like + store_like = StorePath(_store, path=store_like.path) + result = store_like / path_normalized elif isinstance(store_like, Store): if mode is not None and mode != store_like.mode.str: store_like = store_like.with_mode(mode) await store_like._ensure_open() - result = StorePath(store_like) + result = StorePath(store_like, path=path_normalized) elif store_like is None: # mode = "w" is an exception to the default mode = 'r' - result = StorePath(await MemoryStore.open(mode=mode or "w")) + result = StorePath(await MemoryStore.open(mode=mode or "w"), path=path_normalized) elif isinstance(store_like, Path): - result = StorePath(await LocalStore.open(root=store_like, mode=mode or "r")) + result = StorePath( + await LocalStore.open(root=store_like, mode=mode or "r"), path=path_normalized + ) elif isinstance(store_like, str): storage_options = storage_options or {} if _is_fsspec_uri(store_like): used_storage_options = True result = StorePath( - RemoteStore.from_url(store_like, storage_options=storage_options, mode=mode or "r") + RemoteStore.from_url(store_like, storage_options=storage_options, mode=mode or "r"), + path=path_normalized, ) else: - result = StorePath(await LocalStore.open(root=Path(store_like), mode=mode or "r")) + result = StorePath( + await LocalStore.open(root=Path(store_like), mode=mode or "r"), 
path=path_normalized + ) elif isinstance(store_like, dict): # We deliberate only consider dict[str, Buffer] here, and not arbitrary mutable mappings. # By only allowing dictionaries, which are in-memory, we know that MemoryStore appropriate. - result = StorePath(await MemoryStore.open(store_dict=store_like, mode=mode or "r")) + result = StorePath( + await MemoryStore.open(store_dict=store_like, mode=mode or "r"), path=path_normalized + ) else: msg = f"Unsupported type for store_like: '{type(store_like).__name__}'" # type: ignore[unreachable] raise TypeError(msg) diff --git a/src/zarr/storage/local.py b/src/zarr/storage/local.py index b80b04e1d0..5c03009a97 100644 --- a/src/zarr/storage/local.py +++ b/src/zarr/storage/local.py @@ -97,7 +97,10 @@ def __init__(self, root: Path | str, *, mode: AccessModeLiteral = "r") -> None: super().__init__(mode=mode) if isinstance(root, str): root = Path(root) - assert isinstance(root, Path) + if not isinstance(root, Path): + raise TypeError( + f'"root" must be a string or Path instance. Got an object with type {type(root)} instead.' 
+ ) self.root = root async def _open(self) -> None: diff --git a/tests/conftest.py b/tests/conftest.py index ad3552ad65..8c66406c9b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -138,7 +138,7 @@ def array_fixture(request: pytest.FixtureRequest) -> npt.NDArray[Any]: ) -@pytest.fixture(params=(2, 3)) +@pytest.fixture(params=(2, 3), ids=["zarr2", "zarr3"]) def zarr_format(request: pytest.FixtureRequest) -> ZarrFormat: if request.param == 2: return 2 diff --git a/tests/test_api.py b/tests/test_api.py index 0614185f68..4952254f65 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -1,5 +1,6 @@ import pathlib import warnings +from typing import Literal import numpy as np import pytest @@ -10,9 +11,19 @@ import zarr.core.group from zarr import Array, Group from zarr.abc.store import Store -from zarr.api.synchronous import create, group, load, open, open_group, save, save_array, save_group +from zarr.api.synchronous import ( + create, + group, + load, + open, + open_group, + save, + save_array, + save_group, +) from zarr.core.common import ZarrFormat from zarr.errors import MetadataValidationError +from zarr.storage._utils import normalize_path from zarr.storage.memory import MemoryStore @@ -37,6 +48,20 @@ def test_create_array(memory_store: Store) -> None: assert z.chunks == (40,) +@pytest.mark.parametrize("path", ["foo", "/", "/foo", "///foo/bar"]) +@pytest.mark.parametrize("node_type", ["array", "group"]) +def test_open_normalized_path( + memory_store: MemoryStore, path: str, node_type: Literal["array", "group"] +) -> None: + node: Group | Array + if node_type == "group": + node = group(store=memory_store, path=path) + elif node_type == "array": + node = create(store=memory_store, path=path, shape=(2,)) + + assert node.path == normalize_path(path) + + async def test_open_array(memory_store: MemoryStore) -> None: store = memory_store diff --git a/tests/test_group.py b/tests/test_group.py index 20960f0346..4f062d5316 100644 --- a/tests/test_group.py +++ 
b/tests/test_group.py @@ -3,7 +3,7 @@ import contextlib import pickle import warnings -from typing import TYPE_CHECKING, Any, Literal, cast +from typing import TYPE_CHECKING, Any, Literal import numpy as np import pytest @@ -14,7 +14,6 @@ from zarr import Array, AsyncArray, AsyncGroup, Group from zarr.abc.store import Store from zarr.core.buffer import default_buffer_prototype -from zarr.core.common import JSON, ZarrFormat from zarr.core.group import ConsolidatedMetadata, GroupMetadata from zarr.core.sync import sync from zarr.errors import ContainsArrayError, ContainsGroupError @@ -26,6 +25,8 @@ if TYPE_CHECKING: from _pytest.compat import LEGACY_PATH + from zarr.core.common import JSON, ZarrFormat + @pytest.fixture(params=["local", "memory", "zip"]) async def store(request: pytest.FixtureRequest, tmpdir: LEGACY_PATH) -> Store: @@ -43,14 +44,6 @@ def exists_ok(request: pytest.FixtureRequest) -> bool: return result -@pytest.fixture(params=[2, 3], ids=["zarr2", "zarr3"]) -def zarr_format(request: pytest.FixtureRequest) -> ZarrFormat: - result = request.param - if result not in (2, 3): - raise ValueError("Wrong value returned from test fixture.") - return cast(ZarrFormat, result) - - def test_group_init(store: Store, zarr_format: ZarrFormat) -> None: """ Test that initializing a group from an asyncgroup works. 
@@ -587,6 +580,7 @@ def test_group_array_creation( assert empty_array.fill_value == 0 assert empty_array.shape == shape assert empty_array.store_path.store == store + assert empty_array.store_path.path == "empty" empty_like_array = group.empty_like(name="empty_like", data=empty_array) assert isinstance(empty_like_array, Array) diff --git a/tests/test_store/test_core.py b/tests/test_store/test_core.py index b2a8292ea9..771dc3c43e 100644 --- a/tests/test_store/test_core.py +++ b/tests/test_store/test_core.py @@ -1,39 +1,71 @@ import tempfile from pathlib import Path +from typing import Literal import pytest +from _pytest.compat import LEGACY_PATH +from upath import UPath +from zarr.storage._utils import normalize_path from zarr.storage.common import StoreLike, StorePath, make_store_path from zarr.storage.local import LocalStore from zarr.storage.memory import MemoryStore from zarr.storage.remote import RemoteStore -async def test_make_store_path(tmpdir: str) -> None: - # None - store_path = await make_store_path(None) +@pytest.mark.parametrize("path", [None, "", "bar"]) +async def test_make_store_path_none(path: str) -> None: + """ + Test that creating a store_path with None creates a memorystore + """ + store_path = await make_store_path(None, path=path) assert isinstance(store_path.store, MemoryStore) - - # str - store_path = await make_store_path(str(tmpdir)) + assert store_path.path == normalize_path(path) + + +@pytest.mark.parametrize("path", [None, "", "bar"]) +@pytest.mark.parametrize("store_type", [str, Path, LocalStore]) +@pytest.mark.parametrize("mode", ["r", "w", "a"]) +async def test_make_store_path_local( + tmpdir: LEGACY_PATH, + store_type: type[str] | type[Path] | type[LocalStore], + path: str, + mode: Literal["r", "w", "a"], +) -> None: + """ + Test the various ways of invoking make_store_path that create a LocalStore + """ + store_like = store_type(str(tmpdir)) + store_path = await make_store_path(store_like, path=path, mode=mode) assert 
isinstance(store_path.store, LocalStore) assert Path(store_path.store.root) == Path(tmpdir) - - # Path - store_path = await make_store_path(Path(tmpdir)) + assert store_path.path == normalize_path(path) + assert store_path.store.mode.str == mode + + +@pytest.mark.parametrize("path", [None, "", "bar"]) +@pytest.mark.parametrize("mode", ["r", "w", "a"]) +async def test_make_store_path_store_path( + tmpdir: LEGACY_PATH, path: str, mode: Literal["r", "w", "a"] +) -> None: + """ + Test invoking make_store_path when the input is another store_path. In particular we want to ensure + that a new path is handled correctly. + """ + store_like = StorePath(LocalStore(str(tmpdir)), path="root") + store_path = await make_store_path(store_like, path=path, mode=mode) assert isinstance(store_path.store, LocalStore) assert Path(store_path.store.root) == Path(tmpdir) + path_normalized = normalize_path(path) + assert store_path.path == (store_like / path_normalized).path - # Store - store_path = await make_store_path(store_path.store) - assert isinstance(store_path.store, LocalStore) - assert Path(store_path.store.root) == Path(tmpdir) + assert store_path.store.mode.str == mode - # StorePath - store_path = await make_store_path(store_path) - assert isinstance(store_path.store, LocalStore) - assert Path(store_path.store.root) == Path(tmpdir) +async def test_make_store_path_invalid() -> None: + """ + Test that invalid types raise TypeError + """ with pytest.raises(TypeError): await make_store_path(1) # type: ignore[arg-type] @@ -65,3 +97,29 @@ async def test_make_store_path_storage_options_raises(store_like: StoreLike) -> async def test_unsupported() -> None: with pytest.raises(TypeError, match="Unsupported type for store_like: 'int'"): await make_store_path(1) # type: ignore[arg-type] + + +@pytest.mark.parametrize( + "path", + [ + "/foo/bar", + "//foo/bar", + "foo///bar", + "foo/bar///", + Path("foo/bar"), + b"foo/bar", + UPath("foo/bar"), + ], +) +def test_normalize_path_valid(path: 
str | bytes | Path | UPath) -> None: + assert normalize_path(path) == "foo/bar" + + +def test_normalize_path_none(): + assert normalize_path(None) == "" + + +@pytest.mark.parametrize("path", [".", ".."]) +def test_normalize_path_invalid(path: str): + with pytest.raises(ValueError): + normalize_path(path) From 7837e2e8c1c4349c5cf2f3f5f4e071d6c84fd05f Mon Sep 17 00:00:00 2001 From: Dimitri Papadopoulos Orfanos <3234522+DimitriPapadopoulos@users.noreply.github.com> Date: Thu, 17 Oct 2024 05:50:21 +0200 Subject: [PATCH 14/22] Enforce ruff/flake8-pyi rule PYI013 (#2389) PYI013 Non-empty class body must not contain `...` Note that documentation is enough to fill the class body. --- pyproject.toml | 1 - src/zarr/abc/codec.py | 6 ------ 2 files changed, 7 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4013f9ca22..13d249eec8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -234,7 +234,6 @@ ignore = [ "PT005", # deprecated "PT011", # TODO: apply this rule "PT012", # TODO: apply this rule - "PYI013", "RET505", "RET506", "RUF005", diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index 73b1a598b9..3548874409 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -156,20 +156,14 @@ async def encode( class ArrayArrayCodec(BaseCodec[NDBuffer, NDBuffer]): """Base class for array-to-array codecs.""" - ... - class ArrayBytesCodec(BaseCodec[NDBuffer, Buffer]): """Base class for array-to-bytes codecs.""" - ... - class BytesBytesCodec(BaseCodec[Buffer, Buffer]): """Base class for bytes-to-bytes codecs.""" - ... 
- Codec = ArrayArrayCodec | ArrayBytesCodec | BytesBytesCodec From cffced1130caf8d14d7456f3461cf2f9ed60a78a Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Wed, 16 Oct 2024 20:52:55 -0700 Subject: [PATCH 15/22] deps: remove fasteners from list of dependencies (#2386) --- .pre-commit-config.yaml | 1 - pyproject.toml | 1 - 2 files changed, 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0dd5bd73df..4667b20de1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -31,7 +31,6 @@ repos: - asciitree - crc32c - donfig - - fasteners - numcodecs - numpy - typing_extensions diff --git a/pyproject.toml b/pyproject.toml index 13d249eec8..7f0515e1db 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,6 @@ requires-python = ">=3.11" dependencies = [ 'asciitree', 'numpy>=1.25', - 'fasteners', 'numcodecs>=0.10.2', 'fsspec>2024', 'crc32c', From 0b3aee6fc42797b6c1353c2ec744937faee15ec8 Mon Sep 17 00:00:00 2001 From: Dimitri Papadopoulos Orfanos <3234522+DimitriPapadopoulos@users.noreply.github.com> Date: Thu, 17 Oct 2024 05:55:55 +0200 Subject: [PATCH 16/22] Enforce ruff/flake8-annotations rule ANN003 (#2388) ANN003 Missing type annotation Co-authored-by: Joe Hamman --- pyproject.toml | 5 ++--- src/zarr/testing/strategies.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 7f0515e1db..4ce69175a8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -225,9 +225,8 @@ extend-select = [ "W", # pycodestyle warnings ] ignore = [ - "ANN003", - "ANN101", - "ANN102", + "ANN101", # deprecated + "ANN102", # deprecated "ANN401", "PT004", # deprecated "PT005", # deprecated diff --git a/src/zarr/testing/strategies.py b/src/zarr/testing/strategies.py index bb9fda65a1..2c17fbf79d 100644 --- a/src/zarr/testing/strategies.py +++ b/src/zarr/testing/strategies.py @@ -159,7 +159,7 @@ def is_negative_slice(idx: Any) -> bool: @st.composite # type: ignore[misc] -def basic_indices(draw: 
st.DrawFn, *, shape: tuple[int], **kwargs) -> Any: # type: ignore[no-untyped-def] +def basic_indices(draw: st.DrawFn, *, shape: tuple[int], **kwargs: Any) -> Any: """Basic indices without unsupported negative slices.""" return draw( npst.basic_indices(shape=shape, **kwargs).filter( From 56c6a6ba7c71ef6db309428e23c1612cd4e2588d Mon Sep 17 00:00:00 2001 From: Dimitri Papadopoulos Orfanos <3234522+DimitriPapadopoulos@users.noreply.github.com> Date: Thu, 17 Oct 2024 05:56:54 +0200 Subject: [PATCH 17/22] Enforce ruff/Perflint rules (PERF) (#2372) * Apply ruff/Perflint rule PERF401 PERF401 Use a list comprehension to create a transformed list * Enforce ruff/Perflint rules (PERF) --- pyproject.toml | 33 +++++++++++++++++---------------- src/zarr/core/array.py | 8 +------- src/zarr/core/metadata/v3.py | 5 +---- 3 files changed, 19 insertions(+), 27 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4ce69175a8..9130576351 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -206,23 +206,24 @@ extend-exclude = [ [tool.ruff.lint] extend-select = [ - "ANN", # flake8-annotations - "B", # flake8-bugbear - "C4", # flake8-comprehensions - "FLY", # flynt - "G", # flake8-logging-format - "I", # isort - "ISC", # flake8-implicit-str-concat - "PGH", # pygrep-hooks - "PT", # flake8-pytest-style - "PYI", # flake8-pyi - "RSE", # flake8-raise - "RET", # flake8-return + "ANN", # flake8-annotations + "B", # flake8-bugbear + "C4", # flake8-comprehensions + "FLY", # flynt + "G", # flake8-logging-format + "I", # isort + "ISC", # flake8-implicit-str-concat + "PERF", # Perflint + "PGH", # pygrep-hooks + "PT", # flake8-pytest-style + "PYI", # flake8-pyi + "RSE", # flake8-raise + "RET", # flake8-return "RUF", - "TCH", # flake8-type-checking - "TRY", # tryceratops - "UP", # pyupgrade - "W", # pycodestyle warnings + "TCH", # flake8-type-checking + "TRY", # tryceratops + "UP", # pyupgrade + "W", # pycodestyle warnings ] ignore = [ "ANN101", # deprecated diff --git a/src/zarr/core/array.py 
b/src/zarr/core/array.py index 0e73d44563..da477056ee 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -2873,13 +2873,7 @@ def chunks_initialized( store_contents = list( collect_aiterator(array.store_path.store.list_prefix(prefix=array.store_path.path)) ) - out: list[str] = [] - - for chunk_key in array._iter_chunk_keys(): - if chunk_key in store_contents: - out.append(chunk_key) - - return tuple(out) + return tuple(chunk_key for chunk_key in array._iter_chunk_keys() if chunk_key in store_contents) def _build_parents( diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index b85932ef82..8aedd2b7b6 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -77,10 +77,7 @@ def validate_codecs(codecs: tuple[Codec, ...], dtype: DataType) -> None: """Check that the codecs are valid for the given dtype""" # ensure that we have at least one ArrayBytesCodec - abcs: list[ArrayBytesCodec] = [] - for codec in codecs: - if isinstance(codec, ArrayBytesCodec): - abcs.append(codec) + abcs: list[ArrayBytesCodec] = [codec for codec in codecs if isinstance(codec, ArrayBytesCodec)] if len(abcs) == 0: raise ValueError("At least one ArrayBytesCodec is required.") elif len(abcs) > 1: From bd96afbc67985aed2bb96fba0675cbbe6226e0a3 Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Thu, 17 Oct 2024 05:49:29 -0700 Subject: [PATCH 18/22] chore: update package maintainers (#2387) * chore: update package maintainers * Update pyproject.toml Co-authored-by: David Stansby --------- Co-authored-by: David Stansby --- pyproject.toml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 9130576351..beb6236545 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,10 @@ maintainers = [ { name = "Juan Nunez-Iglesias", email = "juan.nunez-iglesias@monash.edu" }, { name = "Martin Durant", email = "mdurant@anaconda.com" }, { name = "Norman Rzepka" }, - { name = "Ryan Abernathey" } + { 
name = "Ryan Abernathey" }, + { name = "David Stansby" }, + { name = "Tom Augspurger", email = "tom.w.augspurger@gmail.com" }, + { name = "Deepak Cherian" } ] requires-python = ">=3.11" # If you add a new dependency here, please also add it to .pre-commit-config.yml From c8c7889b34e867d215314e43f1fc2094f13b1f8b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 17 Oct 2024 09:27:02 -0500 Subject: [PATCH 19/22] Fixed consolidated Group getitem with multi-part key (#2363) * Fixed consolidated Group getitem with multi-part key This fixes `Group.__getitem__` when indexing with a key like 'subgroup/array'. The basic idea is to rewrite the indexing operation as `group['subgroup']['array']` by splitting the key and doing each operation independently. Closes https://github.com/zarr-developers/zarr-python/issues/2358 --------- Co-authored-by: Joe Hamman --- src/zarr/core/group.py | 88 ++++++++++++++++++++++++------------------ tests/test_group.py | 52 ++++++++++++++++++++++++- 2 files changed, 101 insertions(+), 39 deletions(-) diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index e25f70eef6..d797ed7370 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -572,7 +572,10 @@ def _from_bytes_v2( @classmethod def _from_bytes_v3( - cls, store_path: StorePath, zarr_json_bytes: Buffer, use_consolidated: bool | None + cls, + store_path: StorePath, + zarr_json_bytes: Buffer, + use_consolidated: bool | None, ) -> AsyncGroup: group_metadata = json.loads(zarr_json_bytes.to_bytes()) if use_consolidated and group_metadata.get("consolidated_metadata") is None: @@ -666,14 +669,33 @@ def _getitem_consolidated( # the caller needs to verify this! assert self.metadata.consolidated_metadata is not None - try: - metadata = self.metadata.consolidated_metadata.metadata[key] - except KeyError as e: - # The Group Metadata has consolidated metadata, but the key - # isn't present. 
We trust this to mean that the key isn't in - # the hierarchy, and *don't* fall back to checking the store. - msg = f"'{key}' not found in consolidated metadata." - raise KeyError(msg) from e + # we support nested getitems like group/subgroup/array + indexers = key.split("/") + indexers.reverse() + metadata: ArrayV2Metadata | ArrayV3Metadata | GroupMetadata = self.metadata + + while indexers: + indexer = indexers.pop() + if isinstance(metadata, ArrayV2Metadata | ArrayV3Metadata): + # we've indexed into an array with group["array/subarray"]. Invalid. + raise KeyError(key) + if metadata.consolidated_metadata is None: + # we've indexed into a group without consolidated metadata. + # This isn't normal; typically, consolidated metadata + # will include explicit markers for when there are no child + # nodes as metadata={}. + # We have some freedom in exactly how we interpret this case. + # For now, we treat None as the same as {}, i.e. we don't + # have any children. + raise KeyError(key) + try: + metadata = metadata.consolidated_metadata.metadata[indexer] + except KeyError as e: + # The Group Metadata has consolidated metadata, but the key + # isn't present. We trust this to mean that the key isn't in + # the hierarchy, and *don't* fall back to checking the store. + msg = f"'{key}' not found in consolidated metadata." + raise KeyError(msg) from e # update store_path to ensure that AsyncArray/Group.name is correct if prefix != "/": @@ -932,11 +954,7 @@ async def create_array( @deprecated("Use AsyncGroup.create_array instead.") async def create_dataset( - self, - name: str, - *, - shape: ShapeLike, - **kwargs: Any, + self, name: str, **kwargs: Any ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: """Create an array. @@ -947,8 +965,6 @@ async def create_dataset( ---------- name : str Array name. - shape : int or tuple of ints - Array shape. kwargs : dict Additional arguments passed to :func:`zarr.AsyncGroup.create_array`. 
@@ -959,7 +975,7 @@ async def create_dataset( .. deprecated:: 3.0.0 The h5py compatibility methods will be removed in 3.1.0. Use `AsyncGroup.create_array` instead. """ - return await self.create_array(name, shape=shape, **kwargs) + return await self.create_array(name, **kwargs) @deprecated("Use AsyncGroup.require_array instead.") async def require_dataset( @@ -1081,6 +1097,8 @@ async def nmembers( ------- count : int """ + # check if we can use consolidated metadata, which requires that we have non-None + # consolidated metadata at all points in the hierarchy. if self.metadata.consolidated_metadata is not None: return len(self.metadata.consolidated_metadata.flattened_metadata) # TODO: consider using aioitertools.builtins.sum for this @@ -1094,7 +1112,8 @@ async def members( self, max_depth: int | None = 0, ) -> AsyncGenerator[ - tuple[str, AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata] | AsyncGroup], None + tuple[str, AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata] | AsyncGroup], + None, ]: """ Returns an AsyncGenerator over the arrays and groups contained in this group. 
@@ -1125,12 +1144,12 @@ async def members( async def _members( self, max_depth: int | None, current_depth: int ) -> AsyncGenerator[ - tuple[str, AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata] | AsyncGroup], None + tuple[str, AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata] | AsyncGroup], + None, ]: if self.metadata.consolidated_metadata is not None: # we should be able to do members without any additional I/O members = self._members_consolidated(max_depth, current_depth) - for member in members: yield member return @@ -1186,7 +1205,8 @@ async def _members( def _members_consolidated( self, max_depth: int | None, current_depth: int, prefix: str = "" ) -> Generator[ - tuple[str, AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata] | AsyncGroup], None + tuple[str, AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata] | AsyncGroup], + None, ]: consolidated_metadata = self.metadata.consolidated_metadata @@ -1271,7 +1291,11 @@ async def full( self, *, name: str, shape: ChunkCoords, fill_value: Any | None, **kwargs: Any ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: return await async_api.full( - shape=shape, fill_value=fill_value, store=self.store_path, path=name, **kwargs + shape=shape, + fill_value=fill_value, + store=self.store_path, + path=name, + **kwargs, ) async def empty_like( @@ -1627,13 +1651,7 @@ def create_dataset(self, name: str, **kwargs: Any) -> Array: return Array(self._sync(self._async_group.create_dataset(name, **kwargs))) @deprecated("Use Group.require_array instead.") - def require_dataset( - self, - name: str, - *, - shape: ShapeLike, - **kwargs: Any, - ) -> Array: + def require_dataset(self, name: str, **kwargs: Any) -> Array: """Obtain an array, creating if it doesn't exist. Arrays are known as "datasets" in HDF5 terminology. For compatibility @@ -1660,15 +1678,9 @@ def require_dataset( .. deprecated:: 3.0.0 The h5py compatibility methods will be removed in 3.1.0. Use `Group.require_array` instead. 
""" - return Array(self._sync(self._async_group.require_array(name, shape=shape, **kwargs))) + return Array(self._sync(self._async_group.require_array(name, **kwargs))) - def require_array( - self, - name: str, - *, - shape: ShapeLike, - **kwargs: Any, - ) -> Array: + def require_array(self, name: str, **kwargs: Any) -> Array: """Obtain an array, creating if it doesn't exist. @@ -1690,7 +1702,7 @@ def require_array( ------- a : Array """ - return Array(self._sync(self._async_group.require_array(name, shape=shape, **kwargs))) + return Array(self._sync(self._async_group.require_array(name, **kwargs))) def empty(self, *, name: str, shape: ChunkCoords, **kwargs: Any) -> Array: return Array(self._sync(self._async_group.empty(name=name, shape=shape, **kwargs))) diff --git a/tests/test_group.py b/tests/test_group.py index 4f062d5316..2530f64ff4 100644 --- a/tests/test_group.py +++ b/tests/test_group.py @@ -306,18 +306,53 @@ def test_group_getitem(store: Store, zarr_format: ZarrFormat, consolidated: bool group = Group.from_store(store, zarr_format=zarr_format) subgroup = group.create_group(name="subgroup") subarray = group.create_array(name="subarray", shape=(10,), chunk_shape=(10,)) + subsubarray = subgroup.create_array(name="subarray", shape=(10,), chunk_shape=(10,)) if consolidated: group = zarr.api.synchronous.consolidate_metadata(store=store, zarr_format=zarr_format) + # we're going to assume that `group.metadata` is correct, and reuse that to focus + # on indexing in this test. 
Other tests verify the correctness of group.metadata object.__setattr__( - subgroup.metadata, "consolidated_metadata", ConsolidatedMetadata(metadata={}) + subgroup.metadata, + "consolidated_metadata", + ConsolidatedMetadata( + metadata={"subarray": group.metadata.consolidated_metadata.metadata["subarray"]} + ), ) assert group["subgroup"] == subgroup assert group["subarray"] == subarray + assert group["subgroup"]["subarray"] == subsubarray + assert group["subgroup/subarray"] == subsubarray + with pytest.raises(KeyError): group["nope"] + with pytest.raises(KeyError, match="subarray/subsubarray"): + group["subarray/subsubarray"] + + # Now test the mixed case + if consolidated: + object.__setattr__( + group.metadata.consolidated_metadata.metadata["subgroup"], + "consolidated_metadata", + None, + ) + + # test the implementation directly + with pytest.raises(KeyError): + group._async_group._getitem_consolidated( + group.store_path, "subgroup/subarray", prefix="/" + ) + + with pytest.raises(KeyError): + # We've chosen to trust the consolidated metadata, which doesn't + # contain this array + group["subgroup/subarray"] + + with pytest.raises(KeyError, match="subarray/subsubarray"): + group["subarray/subsubarray"] + def test_group_get_with_default(store: Store, zarr_format: ZarrFormat) -> None: group = Group.from_store(store, zarr_format=zarr_format) @@ -1008,6 +1043,21 @@ async def test_group_members_async(store: Store, consolidated_metadata: bool) -> with pytest.raises(ValueError, match="max_depth"): [x async for x in group.members(max_depth=-1)] + if consolidated_metadata: + # test for mixed known and unknown metadata. + # For now, we trust the consolidated metadata. 
+ object.__setattr__( + group.metadata.consolidated_metadata.metadata["g0"].consolidated_metadata.metadata[ + "g1" + ], + "consolidated_metadata", + None, + ) + all_children = sorted([x async for x in group.members(max_depth=None)], key=lambda x: x[0]) + assert len(all_children) == 4 + nmembers = await group.nmembers(max_depth=None) + assert nmembers == 4 + async def test_require_group(store: LocalStore | MemoryStore, zarr_format: ZarrFormat) -> None: root = await AsyncGroup.from_store(store=store, zarr_format=zarr_format) From d1f481a1bcbe5cce9f7787bc52e0572ef20ae5a2 Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Thu, 17 Oct 2024 11:36:00 -0700 Subject: [PATCH 20/22] chore: add python 3.13 to ci / pyproject.toml (#2385) * chore: add python 3.13 to ci / pyproject.toml * update hatch matrix --- .github/workflows/test.yml | 2 +- pyproject.toml | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ae24fca6f5..3bd6226922 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -21,7 +21,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.11', '3.12'] + python-version: ['3.11', '3.12', '3.13'] numpy-version: ['1.25', '1.26', '2.0'] dependency-set: ["minimal", "optional"] diff --git a/pyproject.toml b/pyproject.toml index beb6236545..3764b3e36f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,6 +49,7 @@ classifiers = [ 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.11', 'Programming Language :: Python :: 3.12', + 'Programming Language :: Python :: 3.13', ] license = {text = "MIT License"} keywords = ["Python", "compressed", "ndimensional-arrays", "zarr"] @@ -132,17 +133,17 @@ dependencies = [ features = ["test", "extra"] [[tool.hatch.envs.test.matrix]] -python = ["3.11", "3.12"] +python = ["3.11", "3.12", "3.13"] numpy = ["1.25", "1.26", "2.0"] version = ["minimal"] [[tool.hatch.envs.test.matrix]] -python = 
["3.11", "3.12"] +python = ["3.11", "3.12", "3.13"] numpy = ["1.25", "1.26", "2.0"] features = ["optional"] [[tool.hatch.envs.test.matrix]] -python = ["3.11", "3.12"] +python = ["3.11", "3.12", "3.13"] numpy = ["1.25", "1.26", "2.0"] features = ["gpu"] @@ -163,7 +164,7 @@ dependencies = [ features = ["test", "extra", "gpu"] [[tool.hatch.envs.gputest.matrix]] -python = ["3.11", "3.12"] +python = ["3.11", "3.12", "3.13"] numpy = ["1.25", "1.26", "2.0"] version = ["minimal"] From 605a3a645b61b286c8b1077297a08fb9e82a023e Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 18 Oct 2024 16:14:55 +0200 Subject: [PATCH 21/22] remove references to dead test dir in pyproject.toml --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3764b3e36f..574b09b076 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -153,7 +153,7 @@ run-coverage-gpu = "pip install cupy-cuda12x && pytest -m gpu --cov-config=pypro run = "run-coverage --no-cov" run-verbose = "run-coverage --verbose" run-mypy = "mypy src" -run-hypothesis = "pytest --hypothesis-profile ci tests/v3/test_properties.py tests/v3/test_store/test_stateful*" +run-hypothesis = "pytest --hypothesis-profile ci tests/test_properties.py tests/test_store/test_stateful*" list-env = "pip list" [tool.hatch.envs.gputest] @@ -173,7 +173,7 @@ run-coverage = "pytest -m gpu --cov-config=pyproject.toml --cov=pkg --cov=tests" run = "run-coverage --no-cov" run-verbose = "run-coverage --verbose" run-mypy = "mypy src" -run-hypothesis = "pytest --hypothesis-profile ci tests/v3/test_properties.py tests/v3/test_store/test_stateful*" +run-hypothesis = "pytest --hypothesis-profile ci tests/test_properties.py tests/test_store/test_stateful*" list-env = "pip list" [tool.hatch.envs.docs] From 5052ad3f33e59388fd079581a6e1a7d7b21eb535 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 18 Oct 2024 16:36:38 +0200 Subject: [PATCH 22/22] remove v3 reference in test --- 
tests/test_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_config.py b/tests/test_config.py index 62907588c7..c4cf794c5f 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -87,7 +87,7 @@ class MockClass: assert ( fully_qualified_name(MockClass) - == "tests.v3.test_config.test_fully_qualified_name..MockClass" + == "tests.test_config.test_fully_qualified_name..MockClass" )