From 4263325a6ccf6ac9a4c9fa1cd9101017379d1894 Mon Sep 17 00:00:00 2001
From: Deepak Cherian
Date: Fri, 21 Feb 2025 14:30:45 -0700
Subject: [PATCH 1/5] Partially Revert "Always skip reads when completely
 overwriting chunks (#2784)"

This reverts commit feeb08f4e49f6574d712fe5ceb42ce80ab6ceb3f.
---
 src/zarr/abc/codec.py           |  4 +--
 src/zarr/codecs/sharding.py     | 14 +++-----
 src/zarr/core/array.py          |  6 ++--
 src/zarr/core/codec_pipeline.py | 60 ++++++++++++++------------------
 src/zarr/core/indexing.py       | 61 +++++++++++++++++++++------------
 src/zarr/storage/_logging.py    |  2 +-
 tests/test_array.py             | 11 +-----
 tests/test_indexing.py          |  6 ++++
 8 files changed, 82 insertions(+), 82 deletions(-)

diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py
index 16400f5f4b..fabd042dbe 100644
--- a/src/zarr/abc/codec.py
+++ b/src/zarr/abc/codec.py
@@ -357,7 +357,7 @@ async def encode(
     @abstractmethod
     async def read(
         self,
-        batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
+        batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple]],
         out: NDBuffer,
         drop_axes: tuple[int, ...] = (),
     ) -> None:
@@ -379,7 +379,7 @@ async def read(
     @abstractmethod
     async def write(
         self,
-        batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
+        batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple]],
         value: NDBuffer,
         drop_axes: tuple[int, ...] = (),
     ) -> None:
diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py
index 42b1313fac..c287159216 100644
--- a/src/zarr/codecs/sharding.py
+++ b/src/zarr/codecs/sharding.py
@@ -455,9 +455,8 @@ async def _decode_single(
                     chunk_spec,
                     chunk_selection,
                     out_selection,
-                    is_complete_shard,
                 )
-                for chunk_coords, chunk_selection, out_selection, is_complete_shard in indexer
+                for chunk_coords, chunk_selection, out_selection in indexer
             ],
             out,
         )
@@ -487,7 +486,7 @@ async def _decode_partial_single(
         )
 
         indexed_chunks = list(indexer)
-        all_chunk_coords = {chunk_coords for chunk_coords, *_ in indexed_chunks}
+        all_chunk_coords = {chunk_coords for chunk_coords, _, _ in indexed_chunks}
 
         # reading bytes of all requested chunks
         shard_dict: ShardMapping = {}
@@ -525,9 +524,8 @@ async def _decode_partial_single(
                     chunk_spec,
                     chunk_selection,
                     out_selection,
-                    is_complete_shard,
                 )
-                for chunk_coords, chunk_selection, out_selection, is_complete_shard in indexer
+                for chunk_coords, chunk_selection, out_selection in indexer
             ],
             out,
         )
@@ -564,9 +562,8 @@ async def _encode_single(
                     chunk_spec,
                     chunk_selection,
                     out_selection,
-                    is_complete_shard,
                 )
-                for chunk_coords, chunk_selection, out_selection, is_complete_shard in indexer
+                for chunk_coords, chunk_selection, out_selection in indexer
             ],
             shard_array,
         )
@@ -608,9 +605,8 @@ async def _encode_partial_single(
                     chunk_spec,
                     chunk_selection,
                     out_selection,
-                    is_complete_shard,
                 )
-                for chunk_coords, chunk_selection, out_selection, is_complete_shard in indexer
+                for chunk_coords, chunk_selection, out_selection in indexer
             ],
             shard_array,
         )
diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py
index 9c2f8a7260..ba163c1529 100644
--- a/src/zarr/core/array.py
+++ b/src/zarr/core/array.py
@@ -1291,9 +1291,8 @@ async def _get_selection(
                     self.metadata.get_chunk_spec(chunk_coords, _config, prototype=prototype),
                     chunk_selection,
                     out_selection,
-                    is_complete_chunk,
                 )
-                for chunk_coords, chunk_selection, out_selection, is_complete_chunk in indexer
+                for chunk_coords, chunk_selection, out_selection in indexer
             ],
             out_buffer,
             drop_axes=indexer.drop_axes,
@@ -1419,9 +1418,8 @@ async def _set_selection(
                     self.metadata.get_chunk_spec(chunk_coords, _config, prototype),
                     chunk_selection,
                     out_selection,
-                    is_complete_chunk,
                 )
-                for chunk_coords, chunk_selection, out_selection, is_complete_chunk in indexer
+                for chunk_coords, chunk_selection, out_selection in indexer
             ],
             value_buffer,
             drop_axes=indexer.drop_axes,
diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py
index 0c53cda96c..a35c5ca210 100644
--- a/src/zarr/core/codec_pipeline.py
+++ b/src/zarr/core/codec_pipeline.py
@@ -16,7 +16,7 @@
 )
 from zarr.core.common import ChunkCoords, concurrent_map
 from zarr.core.config import config
-from zarr.core.indexing import SelectorTuple, is_scalar
+from zarr.core.indexing import SelectorTuple, is_scalar, is_total_slice
 from zarr.core.metadata.v2 import _default_fill_value
 from zarr.registry import register_pipeline
@@ -243,7 +243,7 @@ async def encode_partial_batch(
 
     async def read_batch(
         self,
-        batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
+        batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple]],
         out: NDBuffer,
         drop_axes: tuple[int, ...] = (),
     ) -> None:
@@ -251,10 +251,10 @@ async def read_batch(
             chunk_array_batch = await self.decode_partial_batch(
                 [
                     (byte_getter, chunk_selection, chunk_spec)
-                    for byte_getter, chunk_spec, chunk_selection, *_ in batch_info
+                    for byte_getter, chunk_spec, chunk_selection, _ in batch_info
                 ]
             )
-            for chunk_array, (_, chunk_spec, _, out_selection, _) in zip(
+            for chunk_array, (_, chunk_spec, _, out_selection) in zip(
                 chunk_array_batch, batch_info, strict=False
             ):
                 if chunk_array is not None:
@@ -263,19 +263,22 @@ async def read_batch(
                     out[out_selection] = fill_value_or_default(chunk_spec)
         else:
             chunk_bytes_batch = await concurrent_map(
-                [(byte_getter, array_spec.prototype) for byte_getter, array_spec, *_ in batch_info],
+                [
+                    (byte_getter, array_spec.prototype)
+                    for byte_getter, array_spec, _, _ in batch_info
+                ],
                 lambda byte_getter, prototype: byte_getter.get(prototype),
                 config.get("async.concurrency"),
             )
             chunk_array_batch = await self.decode_batch(
                 [
                     (chunk_bytes, chunk_spec)
-                    for chunk_bytes, (_, chunk_spec, *_) in zip(
+                    for chunk_bytes, (_, chunk_spec, _, _) in zip(
                         chunk_bytes_batch, batch_info, strict=False
                     )
                 ],
             )
-            for chunk_array, (_, chunk_spec, chunk_selection, out_selection, _) in zip(
+            for chunk_array, (_, chunk_spec, chunk_selection, out_selection) in zip(
                 chunk_array_batch, batch_info, strict=False
             ):
                 if chunk_array is not None:
@@ -293,10 +296,9 @@ def _merge_chunk_array(
         out_selection: SelectorTuple,
         chunk_spec: ArraySpec,
         chunk_selection: SelectorTuple,
-        is_complete_chunk: bool,
         drop_axes: tuple[int, ...],
     ) -> NDBuffer:
-        if is_complete_chunk and value.shape == chunk_spec.shape:
+        if is_total_slice(chunk_selection, chunk_spec.shape) and value.shape == chunk_spec.shape:
             return value
         if existing_chunk_array is None:
             chunk_array = chunk_spec.prototype.nd_buffer.create(
@@ -325,7 +327,7 @@ def _merge_chunk_array(
 
     async def write_batch(
         self,
-        batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
+        batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple]],
         value: NDBuffer,
         drop_axes: tuple[int, ...] = (),
     ) -> None:
@@ -335,14 +337,14 @@ async def write_batch(
             await self.encode_partial_batch(
                 [
                     (byte_setter, value, chunk_selection, chunk_spec)
-                    for byte_setter, chunk_spec, chunk_selection, out_selection, _ in batch_info
+                    for byte_setter, chunk_spec, chunk_selection, out_selection in batch_info
                 ],
             )
         else:
             await self.encode_partial_batch(
                 [
                     (byte_setter, value[out_selection], chunk_selection, chunk_spec)
-                    for byte_setter, chunk_spec, chunk_selection, out_selection, _ in batch_info
+                    for byte_setter, chunk_spec, chunk_selection, out_selection in batch_info
                 ],
             )
@@ -359,10 +361,10 @@ async def _read_key(
             chunk_bytes_batch = await concurrent_map(
                 [
                     (
-                        None if is_complete_chunk else byte_setter,
+                        None if is_total_slice(chunk_selection, chunk_spec.shape) else byte_setter,
                         chunk_spec.prototype,
                     )
-                    for byte_setter, chunk_spec, chunk_selection, _, is_complete_chunk in batch_info
+                    for byte_setter, chunk_spec, chunk_selection, _ in batch_info
                 ],
                 _read_key,
                 config.get("async.concurrency"),
@@ -370,7 +372,7 @@
             chunk_array_decoded = await self.decode_batch(
                 [
                     (chunk_bytes, chunk_spec)
-                    for chunk_bytes, (_, chunk_spec, *_) in zip(
+                    for chunk_bytes, (_, chunk_spec, _, _) in zip(
                         chunk_bytes_batch, batch_info, strict=False
                     )
                 ],
             )
 
             chunk_array_merged = [
                 self._merge_chunk_array(
-                    chunk_array,
-                    value,
-                    out_selection,
-                    chunk_spec,
-                    chunk_selection,
-                    is_complete_chunk,
-                    drop_axes,
+                    chunk_array, value, out_selection, chunk_spec, chunk_selection, drop_axes
+                )
+                for chunk_array, (_, chunk_spec, chunk_selection, out_selection) in zip(
+                    chunk_array_decoded, batch_info, strict=False
                 )
-                for chunk_array, (
-                    _,
-                    chunk_spec,
-                    chunk_selection,
-                    out_selection,
-                    is_complete_chunk,
-                ) in zip(chunk_array_decoded, batch_info, strict=False)
             ]
             chunk_array_batch: list[NDBuffer | None] = []
-            for chunk_array, (_, chunk_spec, *_) in zip(
+            for chunk_array, (_, chunk_spec, _, _) in zip(
                 chunk_array_merged, batch_info, strict=False
             ):
                 if chunk_array is None:
@@ -411,7 +403,7 @@
             chunk_bytes_batch = await self.encode_batch(
                 [
                     (chunk_array, chunk_spec)
-                    for chunk_array, (_, chunk_spec, *_) in zip(
+                    for chunk_array, (_, chunk_spec, _, _) in zip(
                         chunk_array_batch, batch_info, strict=False
                     )
                 ],
             )
@@ -426,7 +418,7 @@ async def _write_key(byte_setter: ByteSetter, chunk_bytes: Buffer | None) -> Non
         await concurrent_map(
             [
                 (byte_setter, chunk_bytes)
-                for chunk_bytes, (byte_setter, *_) in zip(
+                for chunk_bytes, (byte_setter, _, _, _) in zip(
                     chunk_bytes_batch, batch_info, strict=False
                 )
             ],
@@ -454,7 +446,7 @@ async def encode(
 
     async def read(
         self,
-        batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
+        batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple]],
        out: NDBuffer,
         drop_axes: tuple[int, ...] = (),
     ) -> None:
@@ -469,7 +461,7 @@ async def read(
 
     async def write(
         self,
-        batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
+        batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple]],
         value: NDBuffer,
         drop_axes: tuple[int, ...] = (),
     ) -> None:
diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py
index c197f6f397..733b2464ac 100644
--- a/src/zarr/core/indexing.py
+++ b/src/zarr/core/indexing.py
@@ -321,12 +321,12 @@ class ChunkDimProjection(NamedTuple):
         Selection of items from chunk array.
     dim_out_sel
         Selection of items in target (output) array.
+
     """
 
     dim_chunk_ix: int
     dim_chunk_sel: Selector
     dim_out_sel: Selector | None
-    is_complete_chunk: bool
 
 
 @dataclass(frozen=True)
@@ -346,8 +346,7 @@ def __iter__(self) -> Iterator[ChunkDimProjection]:
         dim_offset = dim_chunk_ix * self.dim_chunk_len
         dim_chunk_sel = self.dim_sel - dim_offset
         dim_out_sel = None
-        is_complete_chunk = self.dim_chunk_len == 1
-        yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel, is_complete_chunk)
+        yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel)
 
 
 @dataclass(frozen=True)
@@ -421,10 +420,7 @@ def __iter__(self) -> Iterator[ChunkDimProjection]:
 
             dim_out_sel = slice(dim_out_offset, dim_out_offset + dim_chunk_nitems)
 
-            is_complete_chunk = (
-                dim_chunk_sel_start == 0 and (self.stop >= dim_limit) and self.step in [1, None]
-            )
-            yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel, is_complete_chunk)
+            yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel)
 
 
 def check_selection_length(selection: SelectionNormalized, shape: ChunkCoords) -> None:
@@ -497,14 +493,12 @@ class ChunkProjection(NamedTuple):
         Selection of items from chunk array.
     out_selection
         Selection of items in target (output) array.
-    is_complete_chunk:
-        True if a complete chunk is indexed
+
     """
 
     chunk_coords: ChunkCoords
     chunk_selection: tuple[Selector, ...] | npt.NDArray[np.intp]
     out_selection: tuple[Selector, ...] | npt.NDArray[np.intp] | slice
-    is_complete_chunk: bool
 
 
 def is_slice(s: Any) -> TypeGuard[slice]:
@@ -580,8 +574,8 @@ def __iter__(self) -> Iterator[ChunkProjection]:
             out_selection = tuple(
                 p.dim_out_sel for p in dim_projections if p.dim_out_sel is not None
             )
-            is_complete_chunk = all(p.is_complete_chunk for p in dim_projections)
-            yield ChunkProjection(chunk_coords, chunk_selection, out_selection, is_complete_chunk)
+
+            yield ChunkProjection(chunk_coords, chunk_selection, out_selection)
 
 
 @dataclass(frozen=True)
@@ -649,9 +643,8 @@ def __iter__(self) -> Iterator[ChunkDimProjection]:
                 start = self.chunk_nitems_cumsum[dim_chunk_ix - 1]
                 stop = self.chunk_nitems_cumsum[dim_chunk_ix]
                 dim_out_sel = slice(start, stop)
-            is_complete_chunk = False  # TODO
 
-            yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel, is_complete_chunk)
+            yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel)
 
 
 class Order(Enum):
@@ -790,8 +783,8 @@ def __iter__(self) -> Iterator[ChunkDimProjection]:
             # find region in chunk
             dim_offset = dim_chunk_ix * self.dim_chunk_len
             dim_chunk_sel = self.dim_sel[start:stop] - dim_offset
-            is_complete_chunk = False  # TODO
-            yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel, is_complete_chunk)
+
+            yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel)
 
 
 def slice_to_range(s: slice, length: int) -> range:
@@ -928,8 +921,7 @@ def __iter__(self) -> Iterator[ChunkProjection]:
             if not is_basic_selection(out_selection):
                 out_selection = ix_(out_selection, self.shape)
 
-            is_complete_chunk = all(p.is_complete_chunk for p in dim_projections)
-            yield ChunkProjection(chunk_coords, chunk_selection, out_selection, is_complete_chunk)
+            yield ChunkProjection(chunk_coords, chunk_selection, out_selection)
 
 
 @dataclass(frozen=True)
@@ -1038,8 +1030,8 @@ def __iter__(self) -> Iterator[ChunkProjection]:
             out_selection = tuple(
                 p.dim_out_sel for p in dim_projections if p.dim_out_sel is not None
             )
-            is_complete_chunk = all(p.is_complete_chunk for p in dim_projections)
-            yield ChunkProjection(chunk_coords, chunk_selection, out_selection, is_complete_chunk)
+
+            yield ChunkProjection(chunk_coords, chunk_selection, out_selection)
 
 
 @dataclass(frozen=True)
@@ -1206,8 +1198,7 @@ def __iter__(self) -> Iterator[ChunkProjection]:
                 for (dim_sel, dim_chunk_offset) in zip(self.selection, chunk_offsets, strict=True)
             )
 
-            is_complete_chunk = False  # TODO
-            yield ChunkProjection(chunk_coords, chunk_selection, out_selection, is_complete_chunk)
+            yield ChunkProjection(chunk_coords, chunk_selection, out_selection)
 
 
 @dataclass(frozen=True)
@@ -1370,6 +1361,32 @@ def c_order_iter(chunks_per_shard: ChunkCoords) -> Iterator[ChunkCoords]:
     return itertools.product(*(range(x) for x in chunks_per_shard))
 
 
+def is_total_slice(item: Selection, shape: ChunkCoords) -> bool:
+    """Determine whether `item` specifies a complete slice of array with the
+    given `shape`. Used to optimize __setitem__ operations on the Chunk
+    class."""
+
+    # N.B., assume shape is normalized
+    if item == slice(None):
+        return True
+    if isinstance(item, slice):
+        item = (item,)
+    if isinstance(item, tuple):
+        return all(
+            (isinstance(dim_sel, int) and dim_len == 1)
+            or (
+                isinstance(dim_sel, slice)
+                and (
+                    (dim_sel == slice(None))
+                    or ((dim_sel.stop - dim_sel.start == dim_len) and (dim_sel.step in [1, None]))
+                )
+            )
+            for dim_sel, dim_len in zip(item, shape, strict=False)
+        )
+    else:
+        raise TypeError(f"expected slice or tuple of slices, found {item!r}")
+
+
 def get_indexer(
     selection: SelectionWithFields, shape: ChunkCoords, chunk_grid: ChunkGrid
 ) -> Indexer:
diff --git a/src/zarr/storage/_logging.py b/src/zarr/storage/_logging.py
index 5f1a97acd9..e9d6211588 100644
--- a/src/zarr/storage/_logging.py
+++ b/src/zarr/storage/_logging.py
@@ -88,7 +88,7 @@ def log(self, hint: Any = "") -> Generator[None, None, None]:
         op = f"{type(self._store).__name__}.{method}"
         if hint:
             op = f"{op}({hint})"
-        self.logger.info(" Calling %s", op)
+        self.logger.info("Calling %s", op)
         start_time = time.time()
         try:
             self.counter[method] += 1
diff --git a/tests/test_array.py b/tests/test_array.py
index efcf8a6bf9..0b09ef2a58 100644
--- a/tests/test_array.py
+++ b/tests/test_array.py
@@ -1330,18 +1330,9 @@ async def test_orthogonal_set_total_slice() -> None:
     """Ensure that a whole chunk overwrite does not read chunks"""
     store = MemoryStore()
     array = zarr.create_array(store, shape=(20, 20), chunks=(1, 2), dtype=int, fill_value=-1)
-    with mock.patch("zarr.storage.MemoryStore.get", side_effect=RuntimeError):
+    with mock.patch("zarr.storage.MemoryStore.get", side_effect=ValueError):
         array[0, slice(4, 10)] = np.arange(6)
 
-    array = zarr.create_array(
-        store, shape=(20, 21), chunks=(1, 2), dtype=int, fill_value=-1, overwrite=True
-    )
-    with mock.patch("zarr.storage.MemoryStore.get", side_effect=RuntimeError):
-        array[0, :] = np.arange(21)
-
-    with mock.patch("zarr.storage.MemoryStore.get", side_effect=RuntimeError):
-        array[:] = 1
-
 
 @pytest.mark.skipif(
     Version(numcodecs.__version__) < Version("0.15.1"),
diff --git a/tests/test_indexing.py b/tests/test_indexing.py
index 30d0d75f22..932c32f1ae 100644
--- a/tests/test_indexing.py
+++ b/tests/test_indexing.py
@@ -19,6 +19,7 @@
     OrthogonalSelection,
     Selection,
     _iter_grid,
+    is_total_slice,
     make_slice_selection,
     normalize_integer_selection,
     oindex,
@@ -1953,3 +1954,8 @@ def test_vectorized_indexing_incompatible_shape(store) -> None:
     )
     with pytest.raises(ValueError, match="Attempting to set"):
         arr[np.array([1, 2]), np.array([1, 2])] = np.array([[-1, -2], [-3, -4]])
+
+
+def test_is_total_slice():
+    assert is_total_slice((0, slice(4, 6)), (1, 2))
+    assert is_total_slice((slice(0, 1, None), slice(4, 6)), (1, 2))

From ebc5743f84c51a102b0b433578777a1ee0feffd8 Mon Sep 17 00:00:00 2001
From: Deepak Cherian
Date: Fri, 21 Feb 2025 14:32:02 -0700
Subject: [PATCH 2/5] Greatly improve property tests

---
 src/zarr/testing/stateful.py   |  8 ++--
 src/zarr/testing/strategies.py | 80 ++++++++++++++++++++++++++++------
 tests/test_indexing.py         | 30 +++++++++++++
 tests/test_properties.py       | 10 +++--
 4 files changed, 107 insertions(+), 21 deletions(-)

diff --git a/src/zarr/testing/stateful.py b/src/zarr/testing/stateful.py
index 3e8dbcdf04..ede83201ae 100644
--- a/src/zarr/testing/stateful.py
+++ b/src/zarr/testing/stateful.py
@@ -325,7 +325,7 @@ def __init__(self, store: Store) -> None:
     def init_store(self) -> None:
        self.store.clear()
 
-    @rule(key=zarr_keys, data=st.binary(min_size=0, max_size=MAX_BINARY_SIZE))
+    @rule(key=zarr_keys(), data=st.binary(min_size=0, max_size=MAX_BINARY_SIZE))
     def set(self, key: str, data: DataObject) -> None:
         note(f"(set) Setting {key!r} with {data}")
         assert not self.store.read_only
@@ -334,7 +334,7 @@ def set(self, key: str, data: DataObject) -> None:
         self.model[key] = data_buf
 
     @precondition(lambda self: len(self.model.keys()) > 0)
-    @rule(key=zarr_keys, data=st.data())
+    @rule(key=zarr_keys(), data=st.data())
     def get(self, key: str, data: DataObject) -> None:
         key = data.draw(
             st.sampled_from(sorted(self.model.keys()))
         )
@@ -344,7 +344,7 @@ def get(self, key: str, data: DataObject) -> None:
         # to bytes here necessary because data_buf set to model in set()
         assert self.model[key] == store_value
 
-    @rule(key=zarr_keys, data=st.data())
+    @rule(key=zarr_keys(), data=st.data())
     def get_invalid_zarr_keys(self, key: str, data: DataObject) -> None:
         note("(get_invalid)")
         assume(key not in self.model)
@@ -408,7 +408,7 @@ def is_empty(self) -> None:
         # make sure they either both are or both aren't empty (same state)
         assert self.store.is_empty("") == (not self.model)
 
-    @rule(key=zarr_keys)
+    @rule(key=zarr_keys())
     def exists(self, key: str) -> None:
         note("(exists)")
diff --git a/src/zarr/testing/strategies.py b/src/zarr/testing/strategies.py
index 0e25e44592..90e2726dda 100644
--- a/src/zarr/testing/strategies.py
+++ b/src/zarr/testing/strategies.py
@@ -1,10 +1,11 @@
+import math
 import sys
 from typing import Any, Literal
 
 import hypothesis.extra.numpy as npst
 import hypothesis.strategies as st
 import numpy as np
-from hypothesis import given, settings  # noqa: F401
+from hypothesis import event, given, settings  # noqa: F401
 from hypothesis.strategies import SearchStrategy
 
 import zarr
@@ -28,6 +29,16 @@
 )
 
 
+@st.composite  # type: ignore[misc]
+def keys(draw: st.DrawFn, *, max_num_nodes: int | None = None) -> Any:
+    return draw(st.lists(node_names, min_size=1, max_size=max_num_nodes).map("/".join))
+
+
+@st.composite  # type: ignore[misc]
+def paths(draw: st.DrawFn, *, max_num_nodes: int | None = None) -> Any:
+    return draw(st.just("/") | keys(max_num_nodes=max_num_nodes))
+
+
 def v3_dtypes() -> st.SearchStrategy[np.dtype]:
     return (
         npst.boolean_dtypes()
@@ -87,17 +98,19 @@ def clear_store(x: Store) -> Store:
 node_names = st.text(zarr_key_chars, min_size=1).filter(
     lambda t: t not in (".", "..") and not t.startswith("__")
 )
+short_node_names = st.text(zarr_key_chars, max_size=3, min_size=1).filter(
+    lambda t: t not in (".", "..") and not t.startswith("__")
+)
 array_names = node_names
 attrs = st.none() | st.dictionaries(_attr_keys, _attr_values)
-keys = st.lists(node_names, min_size=1).map("/".join)
-paths = st.just("/") | keys
 # st.builds will only call a new store constructor for different keyword arguments
 # i.e. stores.examples() will always return the same object per Store class.
 # So we map a clear to reset the store.
 stores = st.builds(MemoryStore, st.just({})).map(clear_store)
 compressors = st.sampled_from([None, "default"])
 zarr_formats: st.SearchStrategy[ZarrFormat] = st.sampled_from([3, 2])
-array_shapes = npst.array_shapes(max_dims=4, min_side=0)
+# We de-prioritize arrays having dim sizes 0, 1, 2
+array_shapes = npst.array_shapes(max_dims=4, min_side=3) | npst.array_shapes(max_dims=4, min_side=0)
 
 
 @st.composite  # type: ignore[misc]
@@ -174,11 +187,19 @@ def chunk_shapes(draw: st.DrawFn, *, shape: tuple[int, ...]) -> tuple[int, ...]:
         st.tuples(*[st.integers(min_value=0 if size == 0 else 1, max_value=size) for size in shape])
     )
     # 2. and now generate the chunks tuple
-    return tuple(
+    chunks = tuple(
         size // nchunks if nchunks > 0 else 0
         for size, nchunks in zip(shape, numchunks, strict=True)
     )
+    for c in chunks:
+        event("chunk size", c)
+
+    if any((c != 0 and s % c != 0) for s, c in zip(shape, chunks, strict=True)):
+        event("smaller last chunk")
+
+    return chunks
+
 
 @st.composite  # type: ignore[misc]
 def shard_shapes(
@@ -211,7 +232,7 @@ def arrays(
     shapes: st.SearchStrategy[tuple[int, ...]] = array_shapes,
     compressors: st.SearchStrategy = compressors,
     stores: st.SearchStrategy[StoreLike] = stores,
-    paths: st.SearchStrategy[str | None] = paths,
+    paths: st.SearchStrategy[str | None] = paths(),  # noqa: B008
     array_names: st.SearchStrategy = array_names,
     arrays: st.SearchStrategy | None = None,
     attrs: st.SearchStrategy = attrs,
@@ -267,23 +288,56 @@ def arrays(
     return a
 
 
+@st.composite  # type: ignore[misc]
+def simple_arrays(
+    draw: st.DrawFn,
+    *,
+    shapes: st.SearchStrategy[tuple[int, ...]] = array_shapes,
+) -> Any:
+    return draw(
+        arrays(
+            shapes=shapes,
+            paths=paths(max_num_nodes=2),
+            array_names=short_node_names,
+            attrs=st.none(),
+            compressors=st.sampled_from([None, "default"]),
+        )
+    )
+
+
 def is_negative_slice(idx: Any) -> bool:
     return isinstance(idx, slice) and idx.step is not None and idx.step < 0
 
 
+@st.composite  # type: ignore[misc]
+def end_slices(draw: st.DrawFn, *, shape: tuple[int]) -> Any:
+    """
+    A strategy that slices ranges that include the last chunk.
+    This is intended to stress-test handling of a possibly smaller last chunk.
+    """
+    slicers = []
+    for size in shape:
+        start = draw(st.integers(min_value=size // 2, max_value=size - 1))
+        length = draw(st.integers(min_value=0, max_value=size - start))
+        slicers.append(slice(start, start + length))
+    event("drawing end slice")
+    return tuple(slicers)
+
+
 @st.composite  # type: ignore[misc]
 def basic_indices(draw: st.DrawFn, *, shape: tuple[int], **kwargs: Any) -> Any:
     """Basic indices without unsupported negative slices."""
-    return draw(
-        npst.basic_indices(shape=shape, **kwargs).filter(
-            lambda idxr: (
-                not (
-                    is_negative_slice(idxr)
-                    or (isinstance(idxr, tuple) and any(is_negative_slice(idx) for idx in idxr))
-                )
+    strategy = npst.basic_indices(shape=shape, **kwargs).filter(
+        lambda idxr: (
+            not (
+                is_negative_slice(idxr)
+                or (isinstance(idxr, tuple) and any(is_negative_slice(idx) for idx in idxr))
             )
         )
     )
+    if math.prod(shape) >= 3:
+        strategy = end_slices(shape=shape) | strategy
+    return draw(strategy)
 
 
 @st.composite  # type: ignore[misc]
diff --git a/tests/test_indexing.py b/tests/test_indexing.py
index 932c32f1ae..05adcfe8a3 100644
--- a/tests/test_indexing.py
+++ b/tests/test_indexing.py
@@ -425,6 +425,17 @@ def test_orthogonal_indexing_fallback_on_getitem_2d(
     np.testing.assert_array_equal(z[index], expected_result)
 
 
+def test_setitem_repeated_index():
+    array = zarr.array(data=np.zeros((4,)), chunks=(1,))
+    indexer = np.array([-1, -1, 0, 0])
+    array.oindex[(indexer,)] = [0, 1, 2, 3]
+    np.testing.assert_array_equal(array[:], np.array([3, 0, 0, 1]))
+
+    indexer = np.array([-1, 0, 0, -1])
+    array.oindex[(indexer,)] = [0, 1, 2, 3]
+    np.testing.assert_array_equal(array[:], np.array([2, 0, 0, 3]))
+
+
 Index = list[int] | tuple[slice | int | list[int], ...]
@@ -816,6 +827,25 @@ def test_set_orthogonal_selection_1d(store: StorePath) -> None:
         _test_set_orthogonal_selection(v, a, z, selection)
 
 
+def test_set_item_1d_last_two_chunks():
+    # regression test for GH2849
+    g = zarr.open_group("foo.zarr", zarr_format=3, mode="w")
+    a = g.create_array("bar", shape=(10,), chunks=(3,), dtype=int)
+    data = np.array([7, 8, 9])
+    a[slice(7, 10)] = data
+    np.testing.assert_array_equal(a[slice(7, 10)], data)
+
+    z = zarr.open_group("foo.zarr", mode="w")
+    z.create_array("zoo", dtype=float, shape=())
+    z["zoo"][...] = np.array(1)  # why doesn't [:] work?
+    np.testing.assert_equal(z["zoo"][()], np.array(1))
+
+    z = zarr.open_group("foo.zarr", mode="w")
+    z.create_array("zoo", dtype=float, shape=())
+    z["zoo"][...] = 1  # why doesn't [:] work?
+    np.testing.assert_equal(z["zoo"][()], np.array(1))
+
+
 def _test_set_orthogonal_selection_2d(
     v: npt.NDArray[np.int_],
     a: npt.NDArray[np.int_],
diff --git a/tests/test_properties.py b/tests/test_properties.py
index 5643cf3853..62cfd9e3fc 100644
--- a/tests/test_properties.py
+++ b/tests/test_properties.py
@@ -7,7 +7,7 @@
 
 import hypothesis.extra.numpy as npst
 import hypothesis.strategies as st
-from hypothesis import assume, given
+from hypothesis import assume, given, note
 
 from zarr.abc.store import Store
 from zarr.core.metadata import ArrayV2Metadata, ArrayV3Metadata
@@ -18,6 +18,7 @@
     basic_indices,
     numpy_arrays,
     orthogonal_indices,
+    simple_arrays,
     stores,
     zarr_formats,
 )
@@ -50,7 +51,7 @@ def test_array_creates_implicit_groups(array):
 
 @given(data=st.data())
 def test_basic_indexing(data: st.DataObject) -> None:
-    zarray = data.draw(arrays())
+    zarray = data.draw(simple_arrays())
     nparray = zarray[:]
     indexer = data.draw(basic_indices(shape=nparray.shape))
     actual = zarray[indexer]
@@ -65,7 +66,7 @@ def test_basic_indexing(data: st.DataObject) -> None:
 @given(data=st.data())
 def test_oindex(data: st.DataObject) -> None:
     # integer_array_indices can't handle 0-size dimensions.
-    zarray = data.draw(arrays(shapes=npst.array_shapes(max_dims=4, min_side=1)))
+    zarray = data.draw(simple_arrays(shapes=npst.array_shapes(max_dims=4, min_side=1)))
     nparray = zarray[:]
 
     zindexer, npindexer = data.draw(orthogonal_indices(shape=nparray.shape))
@@ -76,13 +77,14 @@ def test_oindex(data: st.DataObject) -> None:
     new_data = data.draw(npst.arrays(shape=st.just(actual.shape), dtype=nparray.dtype))
     nparray[npindexer] = new_data
     zarray.oindex[zindexer] = new_data
+    note((new_data, npindexer, nparray, zindexer, zarray[:]))
     assert_array_equal(nparray, zarray[:])
 
 
 @given(data=st.data())
 def test_vindex(data: st.DataObject) -> None:
     # integer_array_indices can't handle 0-size dimensions.
-    zarray = data.draw(arrays(shapes=npst.array_shapes(max_dims=4, min_side=1)))
+    zarray = data.draw(simple_arrays(shapes=npst.array_shapes(max_dims=4, min_side=1)))
     nparray = zarray[:]
 
     indexer = data.draw(

From 1b7cf9a722ebdd86c56b5e0e3ed2f35d0e464945 Mon Sep 17 00:00:00 2001
From: Deepak Cherian
Date: Fri, 21 Feb 2025 14:35:39 -0700
Subject: [PATCH 3/5] cleanup

---
 src/zarr/storage/_logging.py | 2 +-
 tests/test_properties.py     | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/zarr/storage/_logging.py b/src/zarr/storage/_logging.py
index e9d6211588..5f1a97acd9 100644
--- a/src/zarr/storage/_logging.py
+++ b/src/zarr/storage/_logging.py
@@ -88,7 +88,7 @@ def log(self, hint: Any = "") -> Generator[None, None, None]:
         op = f"{type(self._store).__name__}.{method}"
         if hint:
             op = f"{op}({hint})"
-        self.logger.info("Calling %s", op)
+        self.logger.info(" Calling %s", op)
         start_time = time.time()
         try:
             self.counter[method] += 1
diff --git a/tests/test_properties.py b/tests/test_properties.py
index 62cfd9e3fc..90acbdb4b9 100644
--- a/tests/test_properties.py
+++ b/tests/test_properties.py
@@ -7,7 +7,7 @@
 
 import hypothesis.extra.numpy as npst
 import hypothesis.strategies as st
-from hypothesis import assume, given, note
+from hypothesis import assume, given
 
 from zarr.abc.store import Store
 from zarr.core.metadata import ArrayV2Metadata, ArrayV3Metadata
@@ -77,7 +77,6 @@ def test_oindex(data: st.DataObject) -> None:
     new_data = data.draw(npst.arrays(shape=st.just(actual.shape), dtype=nparray.dtype))
     nparray[npindexer] = new_data
     zarray.oindex[zindexer] = new_data
-    note((new_data, npindexer, nparray, zindexer, zarray[:]))
     assert_array_equal(nparray, zarray[:])
 
 

From 82a272db92874bacd50533f1151d6bc369b90ebf Mon Sep 17 00:00:00 2001
From: Deepak Cherian
Date: Fri, 21 Feb 2025 14:42:40 -0700
Subject: [PATCH 4/5] Fix test

---
 src/zarr/testing/strategies.py | 6 ++++--
 tests/test_properties.py       | 4 ++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/zarr/testing/strategies.py b/src/zarr/testing/strategies.py
index 90e2726dda..96d664f5aa 100644
--- a/src/zarr/testing/strategies.py
+++ b/src/zarr/testing/strategies.py
@@ -165,13 +165,15 @@ def numpy_arrays(
     draw: st.DrawFn,
     *,
     shapes: st.SearchStrategy[tuple[int, ...]] = array_shapes,
-    zarr_formats: st.SearchStrategy[ZarrFormat] = zarr_formats,
+    dtype: np.dtype[Any] | None = None,
+    zarr_formats: st.SearchStrategy[ZarrFormat] | None = zarr_formats,
 ) -> Any:
     """
     Generate numpy arrays that can be saved in the provided Zarr format.
""" zarr_format = draw(zarr_formats) - dtype = draw(v3_dtypes() if zarr_format == 3 else v2_dtypes()) + if dtype is None: + dtype = draw(v3_dtypes() if zarr_format == 3 else v2_dtypes()) if np.issubdtype(dtype, np.str_): safe_unicode_strings = safe_unicode_for_dtype(dtype) return draw(npst.arrays(dtype=dtype, shape=shapes, elements=safe_unicode_strings)) diff --git a/tests/test_properties.py b/tests/test_properties.py index 90acbdb4b9..67c13d0b17 100644 --- a/tests/test_properties.py +++ b/tests/test_properties.py @@ -57,7 +57,7 @@ def test_basic_indexing(data: st.DataObject) -> None: actual = zarray[indexer] assert_array_equal(nparray[indexer], actual) - new_data = data.draw(npst.arrays(shape=st.just(actual.shape), dtype=nparray.dtype)) + new_data = data.draw(numpy_arrays(shapes=st.just(actual.shape), dtype=nparray.dtype)) zarray[indexer] = new_data nparray[indexer] = new_data assert_array_equal(nparray, zarray[:]) @@ -74,7 +74,7 @@ def test_oindex(data: st.DataObject) -> None: assert_array_equal(nparray[npindexer], actual) assume(zarray.shards is None) # GH2834 - new_data = data.draw(npst.arrays(shape=st.just(actual.shape), dtype=nparray.dtype)) + new_data = data.draw(numpy_arrays(shapes=st.just(actual.shape), dtype=nparray.dtype)) nparray[npindexer] = new_data zarray.oindex[zindexer] = new_data assert_array_equal(nparray, zarray[:]) From df2c99385701dbd34af15c100bdf81b1f4f9703c Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 21 Feb 2025 14:48:12 -0700 Subject: [PATCH 5/5] add release note --- changes/2845.bugfix.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 changes/2845.bugfix.rst diff --git a/changes/2845.bugfix.rst b/changes/2845.bugfix.rst new file mode 100644 index 0000000000..4b075936c3 --- /dev/null +++ b/changes/2845.bugfix.rst @@ -0,0 +1 @@ +Revert "Always skip reads when completely overwriting chunks (PR #2784, :issue:`2849`)