From 85c49d3d6995a78f2cb337bf017307d5050d19d8 Mon Sep 17 00:00:00 2001 From: Joseph Gonzalez Date: Wed, 18 Sep 2024 08:19:59 -0400 Subject: [PATCH 01/17] fix safe chunks validation --- xarray/backends/zarr.py | 111 ++++++++++++++++++++-------------- xarray/tests/test_backends.py | 68 ++++++++++++++++++--- 2 files changed, 128 insertions(+), 51 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 52d2175621f..52de392e85d 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -112,7 +112,7 @@ def __getitem__(self, key): # could possibly have a work-around for 0d data here -def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name, safe_chunks): +def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name, safe_chunks, region): """ Given encoding chunks (possibly None or []) and variable chunks (possibly None or []). @@ -163,7 +163,7 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name, safe_chunks): if len(enc_chunks_tuple) != ndim: # throw away encoding chunks, start over - return _determine_zarr_chunks(None, var_chunks, ndim, name, safe_chunks) + return _determine_zarr_chunks(None, var_chunks, ndim, name, safe_chunks, region) for x in enc_chunks_tuple: if not isinstance(x, int): @@ -189,20 +189,36 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name, safe_chunks): # TODO: incorporate synchronizer to allow writes from multiple dask # threads if var_chunks and enc_chunks_tuple: - for zchunk, dchunks in zip(enc_chunks_tuple, var_chunks, strict=True): - for dchunk in dchunks[:-1]: + base_error = ( + f"Specified zarr chunks encoding['chunks']={enc_chunks_tuple!r} for " + f"variable named {name!r} would overlap multiple dask chunks {var_chunks!r}. " + f"Writing this array in parallel with dask could lead to corrupted data." + f"Consider either rechunking using `chunk()`, deleting " + f"or modifying `encoding['chunks']`, or specify `safe_chunks=False`." + ) + + for zchunk, dchunks, interval in zip(enc_chunks_tuple, var_chunks, region, strict=True): + if not safe_chunks or len(dchunks) <= 1: + # It is not necessary to perform any additional validation if the + # safe_chunks is False, or there are less than two dchunks + continue + + start = 0 + if interval.start: + # If the start of the interval is not None or 0, it means that the data + # is being appended or updated, and in both cases it is mandatory that + # the residue of the division between the first dchunk and the zchunk + # being equal to the border size + border_size = zchunk - interval.start % zchunk + if dchunks[0] % zchunk != border_size: + raise ValueError(base_error) + # Avoid validating the first chunk inside the loop + start = 1 + + for dchunk in dchunks[start:-1]: if dchunk % zchunk: - base_error = ( - f"Specified zarr chunks encoding['chunks']={enc_chunks_tuple!r} for " - f"variable named {name!r} would overlap multiple dask chunks {var_chunks!r}. " - f"Writing this array in parallel with dask could lead to corrupted data." - ) - if safe_chunks: - raise ValueError( - base_error - + " Consider either rechunking using `chunk()`, deleting " - "or modifying `encoding['chunks']`, or specify `safe_chunks=False`." - ) + raise ValueError(base_error) + return enc_chunks_tuple raise AssertionError("We should never get here. 
Function logic must be wrong.") @@ -243,7 +259,7 @@ def _get_zarr_dims_and_attrs(zarr_obj, dimension_key, try_nczarr): def extract_zarr_variable_encoding( - variable, raise_on_invalid=False, name=None, safe_chunks=True + variable, region, raise_on_invalid=False, name=None, safe_chunks=True ): """ Extract zarr encoding dictionary from xarray Variable @@ -251,6 +267,7 @@ def extract_zarr_variable_encoding( Parameters ---------- variable : Variable + region: tuple[slice] raise_on_invalid : bool, optional Returns @@ -285,7 +302,7 @@ def extract_zarr_variable_encoding( del encoding[k] chunks = _determine_zarr_chunks( - encoding.get("chunks"), variable.chunks, variable.ndim, name, safe_chunks + encoding.get("chunks"), variable.chunks, variable.ndim, name, safe_chunks, region ) encoding["chunks"] = chunks return encoding @@ -762,16 +779,9 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No if v.encoding == {"_FillValue": None} and fill_value is None: v.encoding = {} - # We need to do this for both new and existing variables to ensure we're not - # writing to a partial chunk, even though we don't use the `encoding` value - # when writing to an existing variable. See - # https://github.com/pydata/xarray/issues/8371 for details. - encoding = extract_zarr_variable_encoding( - v, - raise_on_invalid=vn in check_encoding_set, - name=vn, - safe_chunks=self._safe_chunks, - ) + zarr_array = None + write_region = self._write_region if self._write_region is not None else {} + write_region = {dim: write_region.get(dim, slice(None)) for dim in dims} if name in existing_keys: # existing variable @@ -801,7 +811,36 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No ) else: zarr_array = self.zarr_group[name] - else: + + if self._append_dim is not None and self._append_dim in dims: + # resize existing variable + append_axis = dims.index(self._append_dim) + assert write_region[self._append_dim] == slice(None) + write_region[self._append_dim] = slice( + zarr_array.shape[append_axis], None + ) + + new_shape = list(zarr_array.shape) + new_shape[append_axis] += v.shape[append_axis] + zarr_array.resize(new_shape) + + region = tuple(write_region[dim] for dim in dims) + + # We need to do this for both new and existing variables to ensure we're not + # writing to a partial chunk, even though we don't use the `encoding` value + # when writing to an existing variable. See + # https://github.com/pydata/xarray/issues/8371 for details. + # Note: Ideally there should be two functions, one for validating the chunks and + # another one for extracting the encoding. 
+ encoding = extract_zarr_variable_encoding( + v, + region=region, + raise_on_invalid=vn in check_encoding_set, + name=vn, + safe_chunks=self._safe_chunks, + ) + + if name not in existing_keys: # new variable encoded_attrs = {} # the magic for storing the hidden dimension data @@ -833,22 +872,6 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No ) zarr_array = _put_attrs(zarr_array, encoded_attrs) - write_region = self._write_region if self._write_region is not None else {} - write_region = {dim: write_region.get(dim, slice(None)) for dim in dims} - - if self._append_dim is not None and self._append_dim in dims: - # resize existing variable - append_axis = dims.index(self._append_dim) - assert write_region[self._append_dim] == slice(None) - write_region[self._append_dim] = slice( - zarr_array.shape[append_axis], None - ) - - new_shape = list(zarr_array.shape) - new_shape[append_axis] += v.shape[append_axis] - zarr_array.resize(new_shape) - - region = tuple(write_region[dim] for dim in dims) writer.add(v.data, zarr_array, region) def close(self) -> None: diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 13258fcf6ea..a78b583598b 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -5496,24 +5496,26 @@ def test_encode_zarr_attr_value() -> None: @requires_zarr def test_extract_zarr_variable_encoding() -> None: + # The region is not useful in these cases, but I still think that it must be mandatory + # because the validation of the chunks is in the same function var = xr.Variable("x", [1, 2]) - actual = backends.zarr.extract_zarr_variable_encoding(var) + actual = backends.zarr.extract_zarr_variable_encoding(var, region=tuple()) assert "chunks" in actual assert actual["chunks"] is None var = xr.Variable("x", [1, 2], encoding={"chunks": (1,)}) - actual = backends.zarr.extract_zarr_variable_encoding(var) + actual = backends.zarr.extract_zarr_variable_encoding(var, region=tuple()) assert actual["chunks"] == (1,) # does not raise on invalid var = xr.Variable("x", [1, 2], encoding={"foo": (1,)}) - actual = backends.zarr.extract_zarr_variable_encoding(var) + actual = backends.zarr.extract_zarr_variable_encoding(var, region=tuple()) # raises on invalid var = xr.Variable("x", [1, 2], encoding={"foo": (1,)}) with pytest.raises(ValueError, match=r"unexpected encoding parameters"): actual = backends.zarr.extract_zarr_variable_encoding( - var, raise_on_invalid=True + var, raise_on_invalid=True, region=tuple() ) @@ -6096,6 +6098,58 @@ def test_zarr_region_chunk_partial_offset(tmp_path): store, safe_chunks=False, region="auto" ) - # This write is unsafe, and should raise an error, but does not. 
- # with pytest.raises(ValueError): - # da.isel(x=slice(5, 25)).chunk(x=(10, 10)).to_zarr(store, region="auto") + with pytest.raises(ValueError): + da.isel(x=slice(5, 25)).chunk(x=(10, 10)).to_zarr(store, region="auto") + + +@requires_zarr +@requires_dask +def test_zarr_safe_chunk(tmp_path): + # https://github.com/pydata/xarray/pull/8459#issuecomment-1819417545 + store = tmp_path / "foo.zarr" + data = np.ones((20,)) + da = xr.DataArray(data, dims=["x"], coords={"x": range(20)}, name="foo").chunk(x=5) + + da.isel(x=slice(0, 7)).to_zarr(store, safe_chunks=True, mode="w") + with pytest.raises(ValueError): + # If the first chunk is smaller than the border size then raise an error + da.isel(x=slice(7, 11)).chunk(x=(2, 2)).to_zarr( + store, append_dim="x", safe_chunks=True + ) + + da.isel(x=slice(0, 7)).to_zarr(store, safe_chunks=True, mode="w") + # If the first chunk is of the size of the border size then it is valid + da.isel(x=slice(7, 11)).chunk(x=(3, 1)).to_zarr( + store, safe_chunks=True, append_dim="x" + ) + assert xr.open_zarr(store)["foo"].equals(da.isel(x=slice(0, 11))) + + da.isel(x=slice(0, 7)).to_zarr(store, safe_chunks=True, mode="w") + # If the first chunk is of the size of the border size + N * zchunk then it is valid + da.isel(x=slice(7, 17)).chunk(x=(8, 2)).to_zarr( + store, safe_chunks=True, append_dim="x" + ) + assert xr.open_zarr(store)["foo"].equals(da.isel(x=slice(0, 17))) + + da.isel(x=slice(0, 7)).to_zarr(store, safe_chunks=True, mode="w") + with pytest.raises(ValueError): + # If the first chunk is valid but the other are not then raise an error + da.isel(x=slice(7, 14)).chunk(x=(3, 3, 1)).to_zarr( + store, append_dim="x", safe_chunks=True + ) + + da.isel(x=slice(0, 7)).to_zarr(store, safe_chunks=True, mode="w") + with pytest.raises(ValueError): + # If the first chunk have a size bigger than the border size but not enough + # to complete the size of the next chunk then an error must be raised + da.isel(x=slice(7, 14)).chunk(x=(4, 3)).to_zarr( + store, append_dim="x", safe_chunks=True + ) + + da.isel(x=slice(0, 7)).to_zarr(store, safe_chunks=True, mode="w") + # Append with a single chunk it's totally valid, + # and it does not matter the size of the chunk + da.isel(x=slice(7, 19)).chunk(x=-1).to_zarr( + store, append_dim="x", safe_chunks=True + ) + assert xr.open_zarr(store)["foo"].equals(da.isel(x=slice(0, 19))) From 0160d48ee35153f26e96515f887affca61a89348 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 18 Sep 2024 12:25:29 +0000 Subject: [PATCH 02/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/backends/zarr.py | 11 +++++++++-- xarray/tests/test_backends.py | 4 +--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 52de392e85d..c4099f1f5fe 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -197,7 +197,9 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name, safe_chunks, regi f"or modifying `encoding['chunks']`, or specify `safe_chunks=False`." 
) - for zchunk, dchunks, interval in zip(enc_chunks_tuple, var_chunks, region, strict=True): + for zchunk, dchunks, interval in zip( + enc_chunks_tuple, var_chunks, region, strict=True + ): if not safe_chunks or len(dchunks) <= 1: # It is not necessary to perform any additional validation if the # safe_chunks is False, or there are less than two dchunks @@ -302,7 +304,12 @@ def extract_zarr_variable_encoding( del encoding[k] chunks = _determine_zarr_chunks( - encoding.get("chunks"), variable.chunks, variable.ndim, name, safe_chunks, region + encoding.get("chunks"), + variable.chunks, + variable.ndim, + name, + safe_chunks, + region, ) encoding["chunks"] = chunks return encoding diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index a78b583598b..06646e6ec4a 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6149,7 +6149,5 @@ def test_zarr_safe_chunk(tmp_path): da.isel(x=slice(0, 7)).to_zarr(store, safe_chunks=True, mode="w") # Append with a single chunk it's totally valid, # and it does not matter the size of the chunk - da.isel(x=slice(7, 19)).chunk(x=-1).to_zarr( - store, append_dim="x", safe_chunks=True - ) + da.isel(x=slice(7, 19)).chunk(x=-1).to_zarr(store, append_dim="x", safe_chunks=True) assert xr.open_zarr(store)["foo"].equals(da.isel(x=slice(0, 19))) From 60a7a3f18e2b450590d311141ca2ee4b79df6dc8 Mon Sep 17 00:00:00 2001 From: Joseph Gonzalez Date: Wed, 18 Sep 2024 08:55:26 -0400 Subject: [PATCH 03/17] fix safe chunks validation --- doc/whats-new.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 264c07f562b..56f4dda4cca 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -51,7 +51,9 @@ Bug fixes the non-missing times could in theory be encoded with integers (:issue:`9488`, :pull:`9497`). By `Spencer Clark `_. - +- Fix the safe_chunks validation option on the to_zarr method + (:issue:`5511`, :pull:`9513`). By `Joseph Nowak + `_. 
Documentation ~~~~~~~~~~~~~ From 6c41f4beb059d4ac0a8c04cda117177284e3fd62 Mon Sep 17 00:00:00 2001 From: joseph nowak Date: Wed, 18 Sep 2024 15:26:00 -0400 Subject: [PATCH 04/17] Update xarray/tests/test_backends.py Co-authored-by: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> --- xarray/tests/test_backends.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 06646e6ec4a..a2419cf9145 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6104,7 +6104,7 @@ def test_zarr_region_chunk_partial_offset(tmp_path): @requires_zarr @requires_dask -def test_zarr_safe_chunk(tmp_path): +def test_zarr_safe_chunk_append_dim(tmp_path): # https://github.com/pydata/xarray/pull/8459#issuecomment-1819417545 store = tmp_path / "foo.zarr" data = np.ones((20,)) From a2a786bcbf0bd0692dcbab2e9196cb0379c70d0a Mon Sep 17 00:00:00 2001 From: Joseph Gonzalez Date: Fri, 20 Sep 2024 16:15:50 -0400 Subject: [PATCH 05/17] The validation of the chunks now is able to detect full or partial chunk and raise a proper error based on the mode selected, it is also possible to use the auto region detection with the mode "a" --- xarray/backends/zarr.py | 76 ++++++++++++++++++++------------ xarray/core/dataarray.py | 8 ++++ xarray/core/dataset.py | 8 ++++ xarray/tests/test_backends.py | 83 +++++++++++++++++++++++++++++++++-- 4 files changed, 143 insertions(+), 32 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index af289d2ea7b..98936aae31a 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -112,7 +112,7 @@ def __getitem__(self, key): # could possibly have a work-around for 0d data here -def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name, safe_chunks, region): +def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name, safe_chunks, region, mode): """ Given encoding chunks (possibly None or []) and variable chunks (possibly None or []). @@ -163,7 +163,7 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name, safe_chunks, regi if len(enc_chunks_tuple) != ndim: # throw away encoding chunks, start over - return _determine_zarr_chunks(None, var_chunks, ndim, name, safe_chunks, region) + return _determine_zarr_chunks(None, var_chunks, ndim, name, safe_chunks, region, mode) for x in enc_chunks_tuple: if not isinstance(x, int): @@ -189,9 +189,19 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name, safe_chunks, regi # TODO: incorporate synchronizer to allow writes from multiple dask # threads if var_chunks and enc_chunks_tuple: + # If it is possible to write on partial chunks then it is not necessary to check + # the last one contained on the region + allow_partial_chunks = True + end = -1 + if mode == "r+": + # This mode forces to write only on full chunks, even on the last one + allow_partial_chunks = False + end = None + base_error = ( f"Specified zarr chunks encoding['chunks']={enc_chunks_tuple!r} for " - f"variable named {name!r} would overlap multiple dask chunks {var_chunks!r}. " + f"variable named {name!r} would overlap multiple dask chunks {var_chunks!r} " + f"on the region {region}. " f"Writing this array in parallel with dask could lead to corrupted data." f"Consider either rechunking using `chunk()`, deleting " f"or modifying `encoding['chunks']`, or specify `safe_chunks=False`." 
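
To make the alignment rule introduced in this hunk and the next one concrete, here is a minimal standalone sketch of the check being added (a sketch only: the helper name `check_alignment` and its signature are illustrative and are not part of xarray's API, and later patches in this series further refine how the final zarr chunk is handled using the on-disk array shape, which this sketch omits):

def check_alignment(zchunk, dchunks, region_start, allow_partial_chunks):
    # Size of the piece of the first zarr chunk that the region starts in
    # (the "border"); if the region starts on a chunk boundary there is no
    # border and the first dask chunk must cover whole zarr chunks.
    border = zchunk - region_start % zchunk if region_start % zchunk else zchunk
    if not allow_partial_chunks and border < zchunk:
        # "r+" mode: the region must start exactly on a zarr chunk boundary.
        return False
    # Every dask chunk boundary after the border must land on a zarr chunk
    # boundary; the trailing dask chunk may be partial only when partial
    # chunk writes are allowed (mode "a" / append).
    to_check = dchunks[:-1] if allow_partial_chunks else dchunks
    for dchunk in to_check:
        if (dchunk - border) % zchunk:
            return False
        border = 0  # the border only applies to the first dask chunk
    return True

# Appending after 7 elements to an array stored with zarr chunks of 5:
# border == 5 - 7 % 5 == 3, so a first dask chunk of 3 is safe and one of 2 is not.
assert check_alignment(5, (3, 1), region_start=7, allow_partial_chunks=True)
assert not check_alignment(5, (2, 2), region_start=7, allow_partial_chunks=True)

This is the behaviour exercised by test_zarr_safe_chunk_append_dim earlier in the series and by test_zarr_safe_chunk_region added below.
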
@@ -200,27 +210,27 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name, safe_chunks, regi for zchunk, dchunks, interval in zip( enc_chunks_tuple, var_chunks, region, strict=True ): - if not safe_chunks or len(dchunks) <= 1: - # It is not necessary to perform any additional validation if the - # safe_chunks is False, or there are less than two dchunks + if not safe_chunks: continue - start = 0 + # The first border size is the amount of data that needs to be updated on the + # first chunk taking into account the region slice. + first_border_size = zchunk if interval.start: - # If the start of the interval is not None or 0, it means that the data - # is being appended or updated, and in both cases it is mandatory that - # the residue of the division between the first dchunk and the zchunk - # being equal to the border size - border_size = zchunk - interval.start % zchunk - if dchunks[0] % zchunk != border_size: - raise ValueError(base_error) - # Avoid validating the first chunk inside the loop - start = 1 + first_border_size = zchunk - interval.start % zchunk - for dchunk in dchunks[start:-1]: - if dchunk % zchunk: + if not allow_partial_chunks and first_border_size < zchunk: + # If the border is smaller than zchunk, then it is a partial chunk write + raise ValueError(first_border_size) + + for dchunk in dchunks[:end]: + if (dchunk - first_border_size) % zchunk: raise ValueError(base_error) + # The first border is only useful during the first iteration, + # so ignore it in the next validations + first_border_size = 0 + return enc_chunks_tuple raise AssertionError("We should never get here. Function logic must be wrong.") @@ -261,7 +271,12 @@ def _get_zarr_dims_and_attrs(zarr_obj, dimension_key, try_nczarr): def extract_zarr_variable_encoding( - variable, region, raise_on_invalid=False, name=None, safe_chunks=True + variable, + raise_on_invalid=False, + name=None, + safe_chunks=True, + region=None, + mode=None ): """ Extract zarr encoding dictionary from xarray Variable @@ -269,8 +284,11 @@ def extract_zarr_variable_encoding( Parameters ---------- variable : Variable - region: tuple[slice] + region: tuple[slice], optional raise_on_invalid : bool, optional + safe_chunks: bool, optional + name: str | Hashable, optional + mode: str, optional Returns ------- @@ -304,12 +322,13 @@ def extract_zarr_variable_encoding( del encoding[k] chunks = _determine_zarr_chunks( - encoding.get("chunks"), - variable.chunks, - variable.ndim, - name, - safe_chunks, - region, + enc_chunks=encoding.get("chunks"), + var_chunks=variable.chunks, + ndim=variable.ndim, + name=name, + safe_chunks=safe_chunks, + region=region, + mode=mode ) encoding["chunks"] = chunks return encoding @@ -845,6 +864,7 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No raise_on_invalid=vn in check_encoding_set, name=vn, safe_chunks=self._safe_chunks, + mode=self._mode ) if name not in existing_keys: @@ -927,9 +947,9 @@ def _validate_and_autodetect_region(self, ds) -> None: if not isinstance(region, dict): raise TypeError(f"``region`` must be a dict, got {type(region)}") if any(v == "auto" for v in region.values()): - if self._mode != "r+": + if self._mode not in ["r+", "a"]: raise ValueError( - f"``mode`` must be 'r+' when using ``region='auto'``, got {self._mode!r}" + f"``mode`` must be 'r+' or 'a' when using ``region='auto'``, got {self._mode!r}" ) region = self._auto_detect_regions(ds, region) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 37369afbf96..1a308213ab3 100644 
--- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -4304,6 +4304,14 @@ def to_zarr( if Zarr arrays are written in parallel. This option may be useful in combination with ``compute=False`` to initialize a Zarr store from an existing DataArray with arbitrary chunk structure. + In addition to the many-to-one relationship validation, it also detects partial + chunks writes when using the region parameter, + these partial chunks are considered unsafe in the mode "r+" but safe in + the mode "a". + Note: Even with these validations it can still be unsafe to write + two or more chunked arrays in the same location in parallel if they are + not writing in independent regions, for those cases it is better to use + a synchronizer. storage_options : dict, optional Any additional parameters for the storage backend (ignored for local paths). diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 7b9b4819245..b1ce264cbc8 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2509,6 +2509,14 @@ def to_zarr( if Zarr arrays are written in parallel. This option may be useful in combination with ``compute=False`` to initialize a Zarr from an existing Dataset with arbitrary chunk structure. + In addition to the many-to-one relationship validation, it also detects partial + chunks writes when using the region parameter, + these partial chunks are considered unsafe in the mode "r+" but safe in + the mode "a". + Note: Even with these validations it can still be unsafe to write + two or more chunked arrays in the same location in parallel if they are + not writing in independent regions, for those cases it is better to use + a synchronizer. storage_options : dict, optional Any additional parameters for the storage backend (ignored for local paths). diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index beaf22826ec..a7f13c12f8a 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -5991,9 +5991,10 @@ def test_zarr_region_append(self, tmp_path): } ) - # Don't allow auto region detection in append mode due to complexities in - # implementing the overlap logic and lack of safety with parallel writes - with pytest.raises(ValueError): + # Now it is valid to use auto region detection with the append mode, + # but it is still unsafe to modify dimensions or metadata using the region + # parameter. 
+ with pytest.raises(KeyError): ds_new.to_zarr( tmp_path / "test.zarr", mode="a", append_dim="x", region="auto" ) @@ -6105,7 +6106,6 @@ def test_zarr_region_chunk_partial_offset(tmp_path): @requires_zarr @requires_dask def test_zarr_safe_chunk_append_dim(tmp_path): - # https://github.com/pydata/xarray/pull/8459#issuecomment-1819417545 store = tmp_path / "foo.zarr" data = np.ones((20,)) da = xr.DataArray(data, dims=["x"], coords={"x": range(20)}, name="foo").chunk(x=5) @@ -6151,3 +6151,78 @@ def test_zarr_safe_chunk_append_dim(tmp_path): # and it does not matter the size of the chunk da.isel(x=slice(7, 19)).chunk(x=-1).to_zarr(store, append_dim="x", safe_chunks=True) assert xr.open_zarr(store)["foo"].equals(da.isel(x=slice(0, 19))) + + +@requires_zarr +@requires_dask +def test_zarr_safe_chunk_region(tmp_path): + store = tmp_path / "foo.zarr" + + arr = xr.DataArray( + list(range(10)), + dims=["a"], + coords={"a": list(range(10))}, + name="foo" + ).chunk(a=3) + arr.to_zarr(store, mode="w") + + for mode in ["r+", "a"]: + with pytest.raises(ValueError): + # There are two Dask chunks on the same Zarr chunk, + # which means that it is unsafe in any mode + arr.isel(a=slice(0, 3)).chunk(a=(2, 1)).to_zarr(store, region="auto", mode=mode) + + with pytest.raises(ValueError): + # the first chunk is covering the border size, but it is not + # completely covering the second chunk, which means that it is + # unsafe in any mode + arr.isel(a=slice(1, 5)).chunk(a=(3, 1)).to_zarr(store, region="auto", mode=mode) + + with pytest.raises(ValueError): + # The first chunk is safe but the other two chunks are overlapping with + # the same Zarr chunk + arr.isel(a=slice(0, 5)).chunk(a=(3, 1, 1)).to_zarr(store, region="auto", mode=mode) + + # Fully update two contiguous chunks is safe in any mode + arr.isel(a=slice(3, 9)).to_zarr(store, region="auto", mode=mode) + + # Write the last chunk partially is safe in "a" mode + arr.isel(a=slice(3, 8)).to_zarr(store, region="auto", mode="a") + with pytest.raises(ValueError): + # with "r+" mode it is invalid to write partial chunk even on the last one + arr.isel(a=slice(3, 8)).to_zarr(store, region="auto", mode="r+") + + # This is safe with mode "a", the border size is covered by the first chunk of Dask + arr.isel(a=slice(1, 4)).chunk(a=(2, 1)).to_zarr(store, region="auto", mode="a") + + with pytest.raises(ValueError): + # This is considered unsafe in mode "r+" because it is writing in a partial chunk + arr.isel(a=slice(1, 4)).chunk(a=(2, 1)).to_zarr(store, region="auto", mode="r+") + + # This is safe on mode "a" because there is a single dask chunk + arr.isel(a=slice(1, 5)).chunk(a=(4,)).to_zarr(store, region="auto", mode="a") + + with pytest.raises(ValueError): + # This is unsafe on mode "r+", because there is a single dask + # chunk smaller than the Zarr chunk + arr.isel(a=slice(1, 5)).chunk(a=(4,)).to_zarr(store, region="auto", mode="r+") + + # The first chunk is completely covering the first Zarr chunk + # and the last chunk is a partial chunk + arr.isel(a=slice(0, 5)).chunk(a=(3, 2)).to_zarr(store, region="auto", mode="a") + + with pytest.raises(ValueError): + # The last chunk is partial, so it is considered unsafe on mode "r+" + arr.isel(a=slice(0, 5)).chunk(a=(3, 2)).to_zarr(store, region="auto", mode="r+") + + # The first chunk is covering the border size (2 elements) + # and also the second chunk (3 elements), so it is valid + arr.isel(a=slice(1, 8)).chunk(a=(5, 2)).to_zarr(store, region="auto", mode="a") + + with pytest.raises(ValueError): + # The first chunk is not 
fully covering the first zarr chunk + arr.isel(a=slice(1, 8)).chunk(a=(5, 2)).to_zarr(store, region="auto", mode="r+") + + with pytest.raises(ValueError): + # Validate that the border condition is not affecting the "r+" mode + arr.isel(a=slice(1, 9)).to_zarr(store, region="auto", mode="r+") From 604b8e16bcdc1f8565bf561b10e19185183e6efd Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 20 Sep 2024 20:20:55 +0000 Subject: [PATCH 06/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/backends/zarr.py | 14 +++++++++----- xarray/tests/test_backends.py | 17 ++++++++++------- 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 98936aae31a..c66cd65e4ad 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -112,7 +112,9 @@ def __getitem__(self, key): # could possibly have a work-around for 0d data here -def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name, safe_chunks, region, mode): +def _determine_zarr_chunks( + enc_chunks, var_chunks, ndim, name, safe_chunks, region, mode +): """ Given encoding chunks (possibly None or []) and variable chunks (possibly None or []). @@ -163,7 +165,9 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name, safe_chunks, regi if len(enc_chunks_tuple) != ndim: # throw away encoding chunks, start over - return _determine_zarr_chunks(None, var_chunks, ndim, name, safe_chunks, region, mode) + return _determine_zarr_chunks( + None, var_chunks, ndim, name, safe_chunks, region, mode + ) for x in enc_chunks_tuple: if not isinstance(x, int): @@ -276,7 +280,7 @@ def extract_zarr_variable_encoding( name=None, safe_chunks=True, region=None, - mode=None + mode=None, ): """ Extract zarr encoding dictionary from xarray Variable @@ -328,7 +332,7 @@ def extract_zarr_variable_encoding( name=name, safe_chunks=safe_chunks, region=region, - mode=mode + mode=mode, ) encoding["chunks"] = chunks return encoding @@ -864,7 +868,7 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No raise_on_invalid=vn in check_encoding_set, name=vn, safe_chunks=self._safe_chunks, - mode=self._mode + mode=self._mode, ) if name not in existing_keys: diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index a7f13c12f8a..3a3e16afe93 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6159,10 +6159,7 @@ def test_zarr_safe_chunk_region(tmp_path): store = tmp_path / "foo.zarr" arr = xr.DataArray( - list(range(10)), - dims=["a"], - coords={"a": list(range(10))}, - name="foo" + list(range(10)), dims=["a"], coords={"a": list(range(10))}, name="foo" ).chunk(a=3) arr.to_zarr(store, mode="w") @@ -6170,18 +6167,24 @@ def test_zarr_safe_chunk_region(tmp_path): with pytest.raises(ValueError): # There are two Dask chunks on the same Zarr chunk, # which means that it is unsafe in any mode - arr.isel(a=slice(0, 3)).chunk(a=(2, 1)).to_zarr(store, region="auto", mode=mode) + arr.isel(a=slice(0, 3)).chunk(a=(2, 1)).to_zarr( + store, region="auto", mode=mode + ) with pytest.raises(ValueError): # the first chunk is covering the border size, but it is not # completely covering the second chunk, which means that it is # unsafe in any mode - arr.isel(a=slice(1, 5)).chunk(a=(3, 1)).to_zarr(store, region="auto", mode=mode) + arr.isel(a=slice(1, 5)).chunk(a=(3, 1)).to_zarr( + store, region="auto", mode=mode + ) with 
pytest.raises(ValueError): # The first chunk is safe but the other two chunks are overlapping with # the same Zarr chunk - arr.isel(a=slice(0, 5)).chunk(a=(3, 1, 1)).to_zarr(store, region="auto", mode=mode) + arr.isel(a=slice(0, 5)).chunk(a=(3, 1, 1)).to_zarr( + store, region="auto", mode=mode + ) # Fully update two contiguous chunks is safe in any mode arr.isel(a=slice(3, 9)).to_zarr(store, region="auto", mode=mode) From a30b1e07df9fff306c247e69e421b6ac4de1598c Mon Sep 17 00:00:00 2001 From: Joseph Gonzalez Date: Fri, 20 Sep 2024 16:22:54 -0400 Subject: [PATCH 07/17] The test_extract_zarr_variable_encoding does not need to use the region parameter --- xarray/tests/test_backends.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index a7f13c12f8a..032a24c037c 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -5496,26 +5496,24 @@ def test_encode_zarr_attr_value() -> None: @requires_zarr def test_extract_zarr_variable_encoding() -> None: - # The region is not useful in these cases, but I still think that it must be mandatory - # because the validation of the chunks is in the same function var = xr.Variable("x", [1, 2]) - actual = backends.zarr.extract_zarr_variable_encoding(var, region=tuple()) + actual = backends.zarr.extract_zarr_variable_encoding(var) assert "chunks" in actual assert actual["chunks"] is None var = xr.Variable("x", [1, 2], encoding={"chunks": (1,)}) - actual = backends.zarr.extract_zarr_variable_encoding(var, region=tuple()) + actual = backends.zarr.extract_zarr_variable_encoding(var) assert actual["chunks"] == (1,) # does not raise on invalid var = xr.Variable("x", [1, 2], encoding={"foo": (1,)}) - actual = backends.zarr.extract_zarr_variable_encoding(var, region=tuple()) + actual = backends.zarr.extract_zarr_variable_encoding(var) # raises on invalid var = xr.Variable("x", [1, 2], encoding={"foo": (1,)}) with pytest.raises(ValueError, match=r"unexpected encoding parameters"): actual = backends.zarr.extract_zarr_variable_encoding( - var, raise_on_invalid=True, region=tuple() + var, raise_on_invalid=True ) From c781042a1250731ed26e3a674075813f2def4091 Mon Sep 17 00:00:00 2001 From: Joseph Gonzalez Date: Fri, 20 Sep 2024 17:15:06 -0400 Subject: [PATCH 08/17] Inline the code of the allow_partial_chunks and end, document the parameter in order on the extract_zarr_variable_encoding method, raise the correct error if the border size is smaller than the zchunk on mode equal to r+ --- xarray/backends/zarr.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index c66cd65e4ad..756ce21bc9b 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -195,12 +195,9 @@ def _determine_zarr_chunks( if var_chunks and enc_chunks_tuple: # If it is possible to write on partial chunks then it is not necessary to check # the last one contained on the region - allow_partial_chunks = True - end = -1 - if mode == "r+": - # This mode forces to write only on full chunks, even on the last one - allow_partial_chunks = False - end = None + allow_partial_chunks = mode != "r+" + # The r+ mode force to write only on full chunks, even on the last one + end = None if mode == "r+" else -1 base_error = ( f"Specified zarr chunks encoding['chunks']={enc_chunks_tuple!r} for " @@ -225,7 +222,7 @@ def _determine_zarr_chunks( if not allow_partial_chunks and first_border_size < zchunk: # If the border 
is smaller than zchunk, then it is a partial chunk write - raise ValueError(first_border_size) + raise ValueError(base_error) for dchunk in dchunks[:end]: if (dchunk - first_border_size) % zchunk: @@ -278,6 +275,7 @@ def extract_zarr_variable_encoding( variable, raise_on_invalid=False, name=None, + *, safe_chunks=True, region=None, mode=None, @@ -288,10 +286,10 @@ def extract_zarr_variable_encoding( Parameters ---------- variable : Variable - region: tuple[slice], optional + name: str | Hashable, optional raise_on_invalid : bool, optional safe_chunks: bool, optional - name: str | Hashable, optional + region: tuple[slice], optional mode: str, optional Returns From c454cfef842f9f795460c5994f8bd0ccf0ad3cf4 Mon Sep 17 00:00:00 2001 From: Joseph Gonzalez Date: Fri, 20 Sep 2024 17:17:18 -0400 Subject: [PATCH 09/17] Inline the code of the allow_partial_chunks and end, document the parameter in order on the extract_zarr_variable_encoding method, raise the correct error if the border size is smaller than the zchunk on mode equal to r+ --- xarray/backends/zarr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 756ce21bc9b..b10f3c8da94 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -286,8 +286,8 @@ def extract_zarr_variable_encoding( Parameters ---------- variable : Variable - name: str | Hashable, optional raise_on_invalid : bool, optional + name: str | Hashable, optional safe_chunks: bool, optional region: tuple[slice], optional mode: str, optional From cc585d0fb4b7003822d75c8199b30a0afb47b278 Mon Sep 17 00:00:00 2001 From: Joseph Gonzalez Date: Sat, 21 Sep 2024 18:14:24 -0400 Subject: [PATCH 10/17] Now the mode r+ is able to update the last chunk of Zarr even if it is not "complete" --- xarray/backends/zarr.py | 55 ++++++++++++++++++++++------------- xarray/tests/test_backends.py | 41 ++++++++++++++++++++++---- 2 files changed, 70 insertions(+), 26 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index b10f3c8da94..e6fe93a398a 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -113,7 +113,7 @@ def __getitem__(self, key): def _determine_zarr_chunks( - enc_chunks, var_chunks, ndim, name, safe_chunks, region, mode + enc_chunks, var_chunks, ndim, name, safe_chunks, region, mode, shape ): """ Given encoding chunks (possibly None or []) and variable chunks @@ -166,7 +166,7 @@ def _determine_zarr_chunks( if len(enc_chunks_tuple) != ndim: # throw away encoding chunks, start over return _determine_zarr_chunks( - None, var_chunks, ndim, name, safe_chunks, region, mode + None, var_chunks, ndim, name, safe_chunks, region, mode, shape ) for x in enc_chunks_tuple: @@ -208,29 +208,38 @@ def _determine_zarr_chunks( f"or modifying `encoding['chunks']`, or specify `safe_chunks=False`." ) - for zchunk, dchunks, interval in zip( - enc_chunks_tuple, var_chunks, region, strict=True + for zchunk, dchunks, interval, size in zip( + enc_chunks_tuple, var_chunks, region, shape, strict=True ): if not safe_chunks: continue - # The first border size is the amount of data that needs to be updated on the - # first chunk taking into account the region slice. 
- first_border_size = zchunk - if interval.start: - first_border_size = zchunk - interval.start % zchunk + for dchunk in dchunks[1:-1]: + if dchunk % zchunk: + raise ValueError(base_error) + + region_start = interval.start if interval.start else 0 - if not allow_partial_chunks and first_border_size < zchunk: - # If the border is smaller than zchunk, then it is a partial chunk write - raise ValueError(base_error) + if len(dchunks) > 1: + # The first border size is the amount of data that needs to be updated on the + # first chunk taking into account the region slice. + first_border_size = zchunk + if allow_partial_chunks: + first_border_size = zchunk - region_start % zchunk - for dchunk in dchunks[:end]: - if (dchunk - first_border_size) % zchunk: + if (dchunks[0] - first_border_size) % zchunk: raise ValueError(base_error) - # The first border is only useful during the first iteration, - # so ignore it in the next validations - first_border_size = 0 + if not allow_partial_chunks: + region_stop = interval.stop if interval.stop else size + cover_last_chunk = region_stop > size - size % zchunk + + if not cover_last_chunk: + if dchunks[-1] % zchunk: + raise ValueError(base_error) + elif dchunks[-1] % zchunk != size % zchunk: + # The remainder must be equal to the size of the last Zarr chunk + raise ValueError(base_error) return enc_chunks_tuple @@ -279,6 +288,7 @@ def extract_zarr_variable_encoding( safe_chunks=True, region=None, mode=None, + shape=None ): """ Extract zarr encoding dictionary from xarray Variable @@ -289,9 +299,9 @@ def extract_zarr_variable_encoding( raise_on_invalid : bool, optional name: str | Hashable, optional safe_chunks: bool, optional - region: tuple[slice], optional + region: tuple[slice, ...], optional mode: str, optional - + shape: tuple[int, ...], optional Returns ------- encoding : dict @@ -331,6 +341,7 @@ def extract_zarr_variable_encoding( safe_chunks=safe_chunks, region=region, mode=mode, + shape=shape ) encoding["chunks"] = chunks return encoding @@ -808,6 +819,7 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No v.encoding = {} zarr_array = None + zarr_shape = None write_region = self._write_region if self._write_region is not None else {} write_region = {dim: write_region.get(dim, slice(None)) for dim in dims} @@ -852,6 +864,8 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No new_shape[append_axis] += v.shape[append_axis] zarr_array.resize(new_shape) + zarr_shape = zarr_array.shape + region = tuple(write_region[dim] for dim in dims) # We need to do this for both new and existing variables to ensure we're not @@ -862,11 +876,12 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No # another one for extracting the encoding. 
encoding = extract_zarr_variable_encoding( v, - region=region, raise_on_invalid=vn in check_encoding_set, name=vn, safe_chunks=self._safe_chunks, + region=region, mode=self._mode, + shape=zarr_shape ) if name not in existing_keys: diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 78d50fcbdac..c04f71ae61c 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6157,7 +6157,7 @@ def test_zarr_safe_chunk_region(tmp_path): store = tmp_path / "foo.zarr" arr = xr.DataArray( - list(range(10)), dims=["a"], coords={"a": list(range(10))}, name="foo" + list(range(11)), dims=["a"], coords={"a": list(range(11))}, name="foo" ).chunk(a=3) arr.to_zarr(store, mode="w") @@ -6187,10 +6187,14 @@ def test_zarr_safe_chunk_region(tmp_path): # Fully update two contiguous chunks is safe in any mode arr.isel(a=slice(3, 9)).to_zarr(store, region="auto", mode=mode) - # Write the last chunk partially is safe in "a" mode + # The last chunk is considered full based on their current size (2) + arr.isel(a=slice(9, 11)).to_zarr(store, region="auto", mode=mode) + arr.isel(a=slice(6, None)).chunk(a=-1).to_zarr(store, region="auto", mode=mode) + + # Write the last chunk of a region partially is safe in "a" mode arr.isel(a=slice(3, 8)).to_zarr(store, region="auto", mode="a") with pytest.raises(ValueError): - # with "r+" mode it is invalid to write partial chunk even on the last one + # with "r+" mode it is invalid to write partial chunk arr.isel(a=slice(3, 8)).to_zarr(store, region="auto", mode="r+") # This is safe with mode "a", the border size is covered by the first chunk of Dask @@ -6204,12 +6208,12 @@ def test_zarr_safe_chunk_region(tmp_path): arr.isel(a=slice(1, 5)).chunk(a=(4,)).to_zarr(store, region="auto", mode="a") with pytest.raises(ValueError): - # This is unsafe on mode "r+", because there is a single dask - # chunk smaller than the Zarr chunk + # This is unsafe on mode "r+", because the Dask chunk is partially writing + # in the first chunk of Zarr arr.isel(a=slice(1, 5)).chunk(a=(4,)).to_zarr(store, region="auto", mode="r+") # The first chunk is completely covering the first Zarr chunk - # and the last chunk is a partial chunk + # and the last chunk is a partial one arr.isel(a=slice(0, 5)).chunk(a=(3, 2)).to_zarr(store, region="auto", mode="a") with pytest.raises(ValueError): @@ -6227,3 +6231,28 @@ def test_zarr_safe_chunk_region(tmp_path): with pytest.raises(ValueError): # Validate that the border condition is not affecting the "r+" mode arr.isel(a=slice(1, 9)).to_zarr(store, region="auto", mode="r+") + + arr.isel(a=slice(10, 11)).to_zarr(store, region="auto", mode="a") + with pytest.raises(ValueError): + # Validate that even if we write with a single Dask chunk on the last Zarr + # chunk it is still unsafe if it is not fully covering it + # (the last Zarr chunk has size 2) + arr.isel(a=slice(10, 11)).to_zarr(store, region="auto", mode="r+") + + # Validate the same than the above test but in the beginning of the last chunk + arr.isel(a=slice(9, 10)).to_zarr(store, region="auto", mode="a") + with pytest.raises(ValueError): + arr.isel(a=slice(9, 10)).to_zarr(store, region="auto", mode="r+") + + arr.isel(a=slice(7, None)).chunk(a=-1).to_zarr(store, region="auto", mode="a") + with pytest.raises(ValueError): + # Test that even a Dask chunk that covers the last Zarr chunk can be unsafe + # if it is partial covering other Zarr chunks + arr.isel(a=slice(7, None)).chunk(a=-1).to_zarr(store, region="auto", mode="r+") + + with pytest.raises(ValueError): + # If the 
chunk is of size equal to the one in the Zarr encoding, but + # it is partially writing in the last chunk then raise an error + arr.isel(a=slice(8, None)).chunk(a=3).to_zarr(store, region="auto", mode="r+") + + From 9302036426847f3fbde31915660e23e826684633 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 21 Sep 2024 22:15:01 +0000 Subject: [PATCH 11/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/backends/zarr.py | 6 +++--- xarray/tests/test_backends.py | 2 -- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index e6fe93a398a..775bd1e6d80 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -288,7 +288,7 @@ def extract_zarr_variable_encoding( safe_chunks=True, region=None, mode=None, - shape=None + shape=None, ): """ Extract zarr encoding dictionary from xarray Variable @@ -341,7 +341,7 @@ def extract_zarr_variable_encoding( safe_chunks=safe_chunks, region=region, mode=mode, - shape=shape + shape=shape, ) encoding["chunks"] = chunks return encoding @@ -881,7 +881,7 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No safe_chunks=self._safe_chunks, region=region, mode=self._mode, - shape=zarr_shape + shape=zarr_shape, ) if name not in existing_keys: diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index c04f71ae61c..6529dd74c21 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6254,5 +6254,3 @@ def test_zarr_safe_chunk_region(tmp_path): # If the chunk is of size equal to the one in the Zarr encoding, but # it is partially writing in the last chunk then raise an error arr.isel(a=slice(8, None)).chunk(a=3).to_zarr(store, region="auto", mode="r+") - - From 0b4b9b1f9bb61becc39ad3bea6da9775cb82a72d Mon Sep 17 00:00:00 2001 From: Joseph Gonzalez Date: Sat, 21 Sep 2024 19:54:22 -0400 Subject: [PATCH 12/17] Now the mode r+ is able to update the last chunk of Zarr even if it is not "complete" --- xarray/backends/zarr.py | 21 ++++++++++++++------- xarray/tests/test_backends.py | 9 ++++----- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index e6fe93a398a..197f735f950 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -196,8 +196,6 @@ def _determine_zarr_chunks( # If it is possible to write on partial chunks then it is not necessary to check # the last one contained on the region allow_partial_chunks = mode != "r+" - # The r+ mode force to write only on full chunks, even on the last one - end = None if mode == "r+" else -1 base_error = ( f"Specified zarr chunks encoding['chunks']={enc_chunks_tuple!r} for " @@ -231,14 +229,21 @@ def _determine_zarr_chunks( raise ValueError(base_error) if not allow_partial_chunks: + chunk_start = sum(dchunks[:-1]) + region_start + if chunk_start % zchunk: + # The last chunk which can also be the only one is a partial chunk + # if it is not aligned at the beginning + raise ValueError(base_error) + region_stop = interval.stop if interval.stop else size - cover_last_chunk = region_stop > size - size % zchunk - if not cover_last_chunk: - if dchunks[-1] % zchunk: + if size - region_stop + 1 < zchunk: + # If the region is covering the last chunk then check + # if the reminder with the default chunk size + # is equal to the size of the last chunk + if dchunks[-1] % zchunk != size % zchunk: 
raise ValueError(base_error) - elif dchunks[-1] % zchunk != size % zchunk: - # The remainder must be equal to the size of the last Zarr chunk + elif dchunks[-1] % zchunk: raise ValueError(base_error) return enc_chunks_tuple @@ -307,6 +312,8 @@ def extract_zarr_variable_encoding( encoding : dict Zarr encoding for `variable` """ + + shape = shape if shape else variable.shape encoding = variable.encoding.copy() safe_to_drop = {"source", "original_shape"} diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index c04f71ae61c..919317fb0d0 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6199,14 +6199,12 @@ def test_zarr_safe_chunk_region(tmp_path): # This is safe with mode "a", the border size is covered by the first chunk of Dask arr.isel(a=slice(1, 4)).chunk(a=(2, 1)).to_zarr(store, region="auto", mode="a") - with pytest.raises(ValueError): # This is considered unsafe in mode "r+" because it is writing in a partial chunk arr.isel(a=slice(1, 4)).chunk(a=(2, 1)).to_zarr(store, region="auto", mode="r+") # This is safe on mode "a" because there is a single dask chunk arr.isel(a=slice(1, 5)).chunk(a=(4,)).to_zarr(store, region="auto", mode="a") - with pytest.raises(ValueError): # This is unsafe on mode "r+", because the Dask chunk is partially writing # in the first chunk of Zarr @@ -6239,7 +6237,7 @@ def test_zarr_safe_chunk_region(tmp_path): # (the last Zarr chunk has size 2) arr.isel(a=slice(10, 11)).to_zarr(store, region="auto", mode="r+") - # Validate the same than the above test but in the beginning of the last chunk + # Validate the same as the above test but in the beginning of the last chunk arr.isel(a=slice(9, 10)).to_zarr(store, region="auto", mode="a") with pytest.raises(ValueError): arr.isel(a=slice(9, 10)).to_zarr(store, region="auto", mode="r+") @@ -6252,7 +6250,8 @@ def test_zarr_safe_chunk_region(tmp_path): with pytest.raises(ValueError): # If the chunk is of size equal to the one in the Zarr encoding, but - # it is partially writing in the last chunk then raise an error + # it is partially writing in the first chunk then raise an error arr.isel(a=slice(8, None)).chunk(a=3).to_zarr(store, region="auto", mode="r+") - + with pytest.raises(ValueError): + arr.isel(a=slice(5, -1)).chunk(a=5).to_zarr(store, region="auto", mode="r+") From 23a864aa9b1ce298be58506203c31abed6499d76 Mon Sep 17 00:00:00 2001 From: Joseph Gonzalez Date: Sat, 21 Sep 2024 20:19:55 -0400 Subject: [PATCH 13/17] Add a typehint to the modes to avoid issues with mypy --- xarray/tests/test_backends.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 919317fb0d0..ccf1bc73dd6 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6161,7 +6161,8 @@ def test_zarr_safe_chunk_region(tmp_path): ).chunk(a=3) arr.to_zarr(store, mode="w") - for mode in ["r+", "a"]: + modes: list[Literal["r+", "a"]] = ["r+", "a"] + for mode in modes: with pytest.raises(ValueError): # There are two Dask chunks on the same Zarr chunk, # which means that it is unsafe in any mode From 1825af355760009ee65026277906046f98631ff4 Mon Sep 17 00:00:00 2001 From: Joseph Gonzalez Date: Mon, 30 Sep 2024 15:36:11 -0400 Subject: [PATCH 14/17] Fix the detection of the last chunk --- xarray/backends/zarr.py | 9 ++++----- xarray/tests/test_backends.py | 10 ++++++++++ 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 
2c6b50b3589..c048ea63419 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -229,15 +229,14 @@ def _determine_zarr_chunks( raise ValueError(base_error) if not allow_partial_chunks: - chunk_start = sum(dchunks[:-1]) + region_start - if chunk_start % zchunk: + region_stop = interval.stop if interval.stop else size + + if region_start % zchunk: # The last chunk which can also be the only one is a partial chunk # if it is not aligned at the beginning raise ValueError(base_error) - region_stop = interval.stop if interval.stop else size - - if size - region_stop + 1 < zchunk: + if np.ceil(region_stop / zchunk) == np.ceil(size / zchunk): # If the region is covering the last chunk then check # if the reminder with the default chunk size # is equal to the size of the last chunk diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index ccf1bc73dd6..430cbb0b011 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6256,3 +6256,13 @@ def test_zarr_safe_chunk_region(tmp_path): with pytest.raises(ValueError): arr.isel(a=slice(5, -1)).chunk(a=5).to_zarr(store, region="auto", mode="r+") + + # Test if the code is detecting the last chunk correctly + data = np.random.RandomState(0).randn(2920, 25, 53) + ds = xr.Dataset({'temperature': (('time', 'lat', 'lon'), data)}) + chunks = {'time': 1000, 'lat': 25, 'lon': 53} + ds.chunk(chunks).to_zarr(store, compute=False) + region = {'time': slice(1000, 2000, 1)} + chunk = ds.isel(region) + chunk = chunk.chunk() + chunk.chunk().to_zarr(store, region=region) From 81a27060093e90a10cdcdc2bbfc183fbf8d908dd Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 30 Sep 2024 19:38:32 +0000 Subject: [PATCH 15/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/tests/test_backends.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 430cbb0b011..68470b201d6 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6259,10 +6259,10 @@ def test_zarr_safe_chunk_region(tmp_path): # Test if the code is detecting the last chunk correctly data = np.random.RandomState(0).randn(2920, 25, 53) - ds = xr.Dataset({'temperature': (('time', 'lat', 'lon'), data)}) - chunks = {'time': 1000, 'lat': 25, 'lon': 53} + ds = xr.Dataset({"temperature": (("time", "lat", "lon"), data)}) + chunks = {"time": 1000, "lat": 25, "lon": 53} ds.chunk(chunks).to_zarr(store, compute=False) - region = {'time': slice(1000, 2000, 1)} + region = {"time": slice(1000, 2000, 1)} chunk = ds.isel(region) chunk = chunk.chunk() chunk.chunk().to_zarr(store, region=region) From 4924776e77ffce112f63062d0d6877fc54106515 Mon Sep 17 00:00:00 2001 From: Joseph Gonzalez Date: Mon, 30 Sep 2024 15:50:04 -0400 Subject: [PATCH 16/17] Fix the whats-new and add mode="w" to the new test case --- doc/whats-new.rst | 2 +- xarray/tests/test_backends.py | 418 +++++++++++++++++----------------- 2 files changed, 210 insertions(+), 210 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 89c8d3b4599..72e49a983e3 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -59,7 +59,7 @@ Bug fixes - Fix a few bugs affecting groupby reductions with `flox`. (:issue:`8090`, :issue:`9398`). By `Deepak Cherian `_. - Fix the safe_chunks validation option on the to_zarr method - (:issue:`5511`, :pull:`9513`). 
By `Joseph Nowak + (:issue:`5511`, :pull:`9559`). By `Joseph Nowak `_. Documentation diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 68470b201d6..0e5db458116 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -41,11 +41,11 @@ ) from xarray.backends.common import robust_getitem from xarray.backends.h5netcdf_ import H5netcdfBackendEntrypoint -from xarray.backends.netcdf3 import _nc3_dtype_coercions from xarray.backends.netCDF4_ import ( NetCDF4BackendEntrypoint, _extract_nc4_variable_encoding, ) +from xarray.backends.netcdf3 import _nc3_dtype_coercions from xarray.backends.pydap_ import PydapDataStore from xarray.backends.scipy_ import ScipyBackendEntrypoint from xarray.coding.cftime_offsets import cftime_range @@ -306,7 +306,7 @@ class NetCDF3Only: def test_dtype_coercion_error(self) -> None: """Failing dtype coercion should lead to an error""" for dtype, format in itertools.product( - _nc3_dtype_coercions, self.netcdf3_formats + _nc3_dtype_coercions, self.netcdf3_formats ): if dtype == "bool": # coerced upcast (bool to int8) ==> can never fail @@ -332,7 +332,7 @@ def create_store(self): @contextlib.contextmanager def roundtrip( - self, data, save_kwargs=None, open_kwargs=None, allow_cleanup_failure=False + self, data, save_kwargs=None, open_kwargs=None, allow_cleanup_failure=False ): if save_kwargs is None: save_kwargs = {} @@ -345,7 +345,7 @@ def roundtrip( @contextlib.contextmanager def roundtrip_append( - self, data, save_kwargs=None, open_kwargs=None, allow_cleanup_failure=False + self, data, save_kwargs=None, open_kwargs=None, allow_cleanup_failure=False ): if save_kwargs is None: save_kwargs = {} @@ -392,8 +392,8 @@ def check_dtypes_roundtripped(self, expected, actual): # For NetCDF3, the backend should perform dtype coercion if ( - isinstance(self, NetCDF3Only) - and str(expected_dtype) in _nc3_dtype_coercions + isinstance(self, NetCDF3Only) + and str(expected_dtype) in _nc3_dtype_coercions ): expected_dtype = np.dtype(_nc3_dtype_coercions[str(expected_dtype)]) @@ -401,8 +401,8 @@ def check_dtypes_roundtripped(self, expected, actual): # TODO: check expected behavior for string dtypes more carefully string_kinds = {"O", "S", "U"} assert expected_dtype == actual_dtype or ( - expected_dtype.kind in string_kinds - and actual_dtype.kind in string_kinds + expected_dtype.kind in string_kinds + and actual_dtype.kind in string_kinds ) def test_roundtrip_test_data(self) -> None: @@ -584,8 +584,8 @@ def test_roundtrip_cftime_datetime_data(self) -> None: abs_diff = abs(actual.t.values - expected_decoded_t) assert (abs_diff <= np.timedelta64(1, "s")).all() assert ( - actual.t.encoding["units"] - == "days since 0001-01-01 00:00:00.000000" + actual.t.encoding["units"] + == "days since 0001-01-01 00:00:00.000000" ) assert actual.t.encoding["calendar"] == expected_calendar @@ -626,7 +626,7 @@ def test_roundtrip_coordinates(self) -> None: with self.roundtrip(original, open_kwargs={"decode_coords": False}) as expected: # check roundtripping when decode_coords=False with self.roundtrip( - expected, open_kwargs={"decode_coords": False} + expected, open_kwargs={"decode_coords": False} ) as actual: assert_identical(expected, actual) @@ -905,8 +905,8 @@ def test_roundtrip_empty_vlen_string_array(self) -> None: "decoded_fn, encoded_fn", [ ( - create_unsigned_masked_scaled_data, - create_encoded_unsigned_masked_scaled_data, + create_unsigned_masked_scaled_data, + create_encoded_unsigned_masked_scaled_data, ), pytest.param( 
create_bad_unsigned_masked_scaled_data, @@ -914,12 +914,12 @@ def test_roundtrip_empty_vlen_string_array(self) -> None: marks=pytest.mark.xfail(reason="Bad _Unsigned attribute."), ), ( - create_signed_masked_scaled_data, - create_encoded_signed_masked_scaled_data, + create_signed_masked_scaled_data, + create_encoded_signed_masked_scaled_data, ), ( - create_unsigned_false_masked_scaled_data, - create_encoded_unsigned_false_masked_scaled_data, + create_unsigned_false_masked_scaled_data, + create_encoded_unsigned_false_masked_scaled_data, ), (create_masked_and_scaled_data, create_encoded_masked_and_scaled_data), ], @@ -931,9 +931,9 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn, dtype) -> None: decoded = decoded_fn(dtype) encoded = encoded_fn(dtype) if decoded["x"].encoding["dtype"] == "u1" and not ( - self.engine == "netcdf4" - and self.file_format is None - or self.file_format == "NETCDF4" + self.engine == "netcdf4" + and self.file_format is None + or self.file_format == "NETCDF4" ): pytest.skip("uint8 data can't be written to non-NetCDF4 data") @@ -942,8 +942,8 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn, dtype) -> None: assert decoded.variables[k].dtype == actual.variables[k].dtype # CF _FillValue is always on-disk type assert ( - decoded.variables[k].encoding["_FillValue"] - == actual.variables[k].encoding["_FillValue"] + decoded.variables[k].encoding["_FillValue"] + == actual.variables[k].encoding["_FillValue"] ) assert_allclose(decoded, actual, decode_bytes=False) @@ -954,8 +954,8 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn, dtype) -> None: assert encoded.variables[k].dtype == actual.variables[k].dtype # CF _FillValue is always on-disk type assert ( - decoded.variables[k].encoding["_FillValue"] - == actual.variables[k].attrs["_FillValue"] + decoded.variables[k].encoding["_FillValue"] + == actual.variables[k].attrs["_FillValue"] ) assert_allclose(encoded, actual, decode_bytes=False) @@ -964,8 +964,8 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn, dtype) -> None: assert encoded.variables[k].dtype == actual.variables[k].dtype # CF _FillValue is always on-disk type assert ( - encoded.variables[k].attrs["_FillValue"] - == actual.variables[k].attrs["_FillValue"] + encoded.variables[k].attrs["_FillValue"] + == actual.variables[k].attrs["_FillValue"] ) assert_allclose(encoded, actual, decode_bytes=False) @@ -1030,7 +1030,7 @@ def _roundtrip_with_warnings(*args, **kwargs): assert_allclose(decoded, actual, decode_bytes=False) with _roundtrip_with_warnings( - decoded, open_kwargs=dict(decode_cf=False) + decoded, open_kwargs=dict(decode_cf=False) ) as actual: for k in encoded.variables: assert encoded.variables[k].dtype == actual.variables[k].dtype @@ -1120,7 +1120,7 @@ def test_coordinate_variables_after_dataset_roundtrip(self) -> None: assert_equal(actual, expected) def test_grid_mapping_and_bounds_are_coordinates_after_dataarray_roundtrip( - self, + self, ) -> None: original = self._create_cf_dataset() # The DataArray roundtrip should have the same warnings as the @@ -1132,14 +1132,14 @@ def test_grid_mapping_and_bounds_are_coordinates_after_dataarray_roundtrip( # needs the to_dataset. The other backends should be fine # without it. 
with pytest.warns( - UserWarning, - match=( - r"Variable\(s\) referenced in bounds not in variables: " - r"\['l(at|ong)itude_bnds'\]" - ), + UserWarning, + match=( + r"Variable\(s\) referenced in bounds not in variables: " + r"\['l(at|ong)itude_bnds'\]" + ), ): with self.roundtrip( - original["variable"].to_dataset(), open_kwargs={"decode_coords": "all"} + original["variable"].to_dataset(), open_kwargs={"decode_coords": "all"} ) as actual: assert_identical(actual, original["variable"].to_dataset()) @@ -1224,7 +1224,7 @@ def test_invalid_dataarray_names_raise(self) -> None: data = np.random.random((2, 2)) da = xr.DataArray(data) for name, (error, msg) in zip( - [0, (4, 5), True, ""], [te, te, te, ve], strict=True + [0, (4, 5), True, ""], [te, te, te, ve], strict=True ): ds = Dataset({name: da}) with pytest.raises(error) as excinfo: @@ -1357,7 +1357,7 @@ def test_append_with_invalid_dim_raises(self) -> None: data["var9"] = data["var2"] * 3 data = data.isel(dim1=slice(2, 6)) # modify one dimension with pytest.raises( - ValueError, match=r"Unable to update size for existing dimension" + ValueError, match=r"Unable to update size for existing dimension" ): self.save(data, tmp_file, mode="a") @@ -1419,7 +1419,7 @@ def test_byte_attrs(self, byte_attrs_dataset: dict[str, Any]) -> None: @contextlib.contextmanager def create_tmp_file( - suffix: str = ".nc", allow_cleanup_failure: bool = False + suffix: str = ".nc", allow_cleanup_failure: bool = False ) -> Iterator[str]: temp_dir = tempfile.mkdtemp() path = os.path.join(temp_dir, f"temp-{next(_counter)}{suffix}") @@ -1435,7 +1435,7 @@ def create_tmp_file( @contextlib.contextmanager def create_tmp_files( - nfiles: int, suffix: str = ".nc", allow_cleanup_failure: bool = False + nfiles: int, suffix: str = ".nc", allow_cleanup_failure: bool = False ) -> Iterator[list[str]]: with ExitStack() as stack: files = [ @@ -1517,7 +1517,7 @@ def test_write_groups(self) -> None: ], ) def test_encoding_kwarg_vlen_string( - self, input_strings: list[str], is_bytes: bool + self, input_strings: list[str], is_bytes: bool ) -> None: original = Dataset({"x": input_strings}) @@ -1689,9 +1689,9 @@ def test_auto_chunking_is_based_on_disk_chunk_sizes(self) -> None: with dask.config.set({"array.chunk-size": "100KiB"}): with self.chunked_roundtrip( - (1, y_size, x_size), - (1, y_chunksize, x_chunksize), - open_kwargs={"chunks": "auto"}, + (1, y_size, x_size), + (1, y_chunksize, x_chunksize), + open_kwargs={"chunks": "auto"}, ) as ds: t_chunks, y_chunks, x_chunks = ds["image"].data.chunks assert all(np.asanyarray(y_chunks) == y_chunksize) @@ -1705,21 +1705,21 @@ def test_base_chunking_uses_disk_chunk_sizes(self) -> None: x_chunksize = 10 with self.chunked_roundtrip( - (1, y_size, x_size), - (1, y_chunksize, x_chunksize), - open_kwargs={"chunks": {}}, + (1, y_size, x_size), + (1, y_chunksize, x_chunksize), + open_kwargs={"chunks": {}}, ) as ds: for chunksizes, expected in zip( - ds["image"].data.chunks, (1, y_chunksize, x_chunksize), strict=True + ds["image"].data.chunks, (1, y_chunksize, x_chunksize), strict=True ): assert all(np.asanyarray(chunksizes) == expected) @contextlib.contextmanager def chunked_roundtrip( - self, - array_shape: tuple[int, int, int], - chunk_sizes: tuple[int, int, int], - open_kwargs: dict[str, Any] | None = None, + self, + array_shape: tuple[int, int, int], + chunk_sizes: tuple[int, int, int], + open_kwargs: dict[str, Any] | None = None, ) -> Generator[Dataset, None, None]: t_size, y_size, x_size = array_shape t_chunksize, y_chunksize, x_chunksize = 
chunk_sizes @@ -1742,7 +1742,7 @@ def test_preferred_chunks_are_disk_chunk_sizes(self) -> None: x_chunksize = 10 with self.chunked_roundtrip( - (1, y_size, x_size), (1, y_chunksize, x_chunksize) + (1, y_size, x_size), (1, y_chunksize, x_chunksize) ) as ds: assert ds["image"].encoding["preferred_chunks"] == { "t": 1, @@ -1759,7 +1759,7 @@ def test_encoding_chunksizes_unlimited(self) -> None: "complevel": 0, "fletcher32": False, "contiguous": False, - "chunksizes": (2**20,), + "chunksizes": (2 ** 20,), "original_shape": (3,), } with self.roundtrip(ds) as actual: @@ -1862,14 +1862,14 @@ def test_encoding_enum__no_fill_value(self): with self.roundtrip(original, save_kwargs=save_kwargs) as actual: assert_equal(original, actual) assert ( - actual.clouds.encoding["dtype"].metadata["enum"] - == cloud_type_dict + actual.clouds.encoding["dtype"].metadata["enum"] + == cloud_type_dict ) if self.engine != "h5netcdf": # not implemented in h5netcdf yet assert ( - actual.clouds.encoding["dtype"].metadata["enum_name"] - == "cloud_type" + actual.clouds.encoding["dtype"].metadata["enum_name"] + == "cloud_type" ) @requires_netCDF4 @@ -1898,21 +1898,21 @@ def test_encoding_enum__multiple_variable_with_enum(self): with self.roundtrip(original, save_kwargs=save_kwargs) as actual: assert_equal(original, actual) assert ( - actual.clouds.encoding["dtype"] == actual.tifa.encoding["dtype"] + actual.clouds.encoding["dtype"] == actual.tifa.encoding["dtype"] ) assert ( - actual.clouds.encoding["dtype"].metadata - == actual.tifa.encoding["dtype"].metadata + actual.clouds.encoding["dtype"].metadata + == actual.tifa.encoding["dtype"].metadata ) assert ( - actual.clouds.encoding["dtype"].metadata["enum"] - == cloud_type_dict + actual.clouds.encoding["dtype"].metadata["enum"] + == cloud_type_dict ) if self.engine != "h5netcdf": # not implemented in h5netcdf yet assert ( - actual.clouds.encoding["dtype"].metadata["enum_name"] - == "cloud_type" + actual.clouds.encoding["dtype"].metadata["enum_name"] + == "cloud_type" ) @requires_netCDF4 @@ -1940,8 +1940,8 @@ def test_encoding_enum__error_multiple_variable_with_changing_enum(self): ) with open_dataset(tmp_file) as original: assert ( - original.clouds.encoding["dtype"].metadata - == original.tifa.encoding["dtype"].metadata + original.clouds.encoding["dtype"].metadata + == original.tifa.encoding["dtype"].metadata ) modified_enum = original.clouds.encoding["dtype"].metadata["enum"] modified_enum.update({"neblig": 2}) @@ -1952,11 +1952,11 @@ def test_encoding_enum__error_multiple_variable_with_changing_enum(self): if self.engine != "h5netcdf": # not implemented yet in h5netcdf with pytest.raises( - ValueError, - match=( - "Cannot save variable .*" - " because an enum `cloud_type` already exists in the Dataset .*" - ), + ValueError, + match=( + "Cannot save variable .*" + " because an enum `cloud_type` already exists in the Dataset .*" + ), ): with self.roundtrip(original): pass @@ -2081,8 +2081,8 @@ def test_compression_encoding(self, compression: str | None) -> None: actual_encoding = actual["var2"].encoding assert expected_encoding.items() <= actual_encoding.items() if ( - encoding_params["compression"] is not None - and "blosc" not in encoding_params["compression"] + encoding_params["compression"] is not None + and "blosc" not in encoding_params["compression"] ): # regression test for #156 expected = data.isel(dim1=0) @@ -2153,7 +2153,7 @@ def test_deepcopy(self) -> None: class TestNetCDF4ViaDaskData(TestNetCDF4Data): @contextlib.contextmanager def roundtrip( - self, data, 
save_kwargs=None, open_kwargs=None, allow_cleanup_failure=False + self, data, save_kwargs=None, open_kwargs=None, allow_cleanup_failure=False ): if open_kwargs is None: open_kwargs = {} @@ -2161,7 +2161,7 @@ def roundtrip( save_kwargs = {} open_kwargs.setdefault("chunks", -1) with TestNetCDF4Data.roundtrip( - self, data, save_kwargs, open_kwargs, allow_cleanup_failure + self, data, save_kwargs, open_kwargs, allow_cleanup_failure ) as ds: yield ds @@ -2219,13 +2219,13 @@ def save(self, dataset, store_target, **kwargs): # type: ignore[override] @contextlib.contextmanager def open(self, store_target, **kwargs): with xr.open_dataset( - store_target, engine="zarr", **kwargs, **self.version_kwargs + store_target, engine="zarr", **kwargs, **self.version_kwargs ) as ds: yield ds @contextlib.contextmanager def roundtrip( - self, data, save_kwargs=None, open_kwargs=None, allow_cleanup_failure=False + self, data, save_kwargs=None, open_kwargs=None, allow_cleanup_failure=False ): if save_kwargs is None: save_kwargs = {} @@ -2242,9 +2242,9 @@ def test_roundtrip_consolidated(self, consolidated) -> None: pytest.xfail("consolidated metadata is not supported for zarr v3 yet") expected = create_test_data() with self.roundtrip( - expected, - save_kwargs={"consolidated": consolidated}, - open_kwargs={"backend_kwargs": {"consolidated": consolidated}}, + expected, + save_kwargs={"consolidated": consolidated}, + open_kwargs={"backend_kwargs": {"consolidated": consolidated}}, ) as actual: self.check_dtypes_roundtripped(expected, actual) assert_identical(expected, actual) @@ -2257,8 +2257,8 @@ def test_read_non_consolidated_warning(self) -> None: with self.create_zarr_target() as store: expected.to_zarr(store, consolidated=False, **self.version_kwargs) with pytest.warns( - RuntimeWarning, - match="Failed to open Zarr store with consolidated", + RuntimeWarning, + match="Failed to open Zarr store with consolidated", ): with xr.open_zarr(store, **self.version_kwargs) as ds: assert_identical(ds, expected) @@ -2529,17 +2529,17 @@ def test_write_persistence_modes(self, group) -> None: # overwrite mode with self.roundtrip( - original, - save_kwargs={"mode": "w", "group": group}, - open_kwargs={"group": group}, + original, + save_kwargs={"mode": "w", "group": group}, + open_kwargs={"group": group}, ) as actual: assert_identical(original, actual) # don't overwrite mode with self.roundtrip( - original, - save_kwargs={"mode": "w-", "group": group}, - open_kwargs={"group": group}, + original, + save_kwargs={"mode": "w-", "group": group}, + open_kwargs={"group": group}, ) as actual: assert_identical(original, actual) @@ -2555,9 +2555,9 @@ def test_write_persistence_modes(self, group) -> None: # check append mode for normal write with self.roundtrip( - original, - save_kwargs={"mode": "a", "group": group}, - open_kwargs={"group": group}, + original, + save_kwargs={"mode": "a", "group": group}, + open_kwargs={"group": group}, ) as actual: assert_identical(original, actual) @@ -2590,7 +2590,7 @@ def test_group(self) -> None: original = create_test_data() group = "some/random/path" with self.roundtrip( - original, save_kwargs={"group": group}, open_kwargs={"group": group} + original, save_kwargs={"group": group}, open_kwargs={"group": group} ) as actual: assert_identical(original, actual) @@ -2640,7 +2640,7 @@ def test_append_with_mode_rplus_fails(self) -> None: with self.create_zarr_target() as store: original.to_zarr(store, **self.version_kwargs) with pytest.raises( - ValueError, match="dataset contains non-pre-existing 
variables" + ValueError, match="dataset contains non-pre-existing variables" ): modified.to_zarr(store, mode="r+", **self.version_kwargs) @@ -2649,7 +2649,7 @@ def test_append_with_invalid_dim_raises(self) -> None: with self.create_zarr_target() as store_target: ds.to_zarr(store_target, mode="w", **self.version_kwargs) with pytest.raises( - ValueError, match="does not match any existing dataset dimensions" + ValueError, match="does not match any existing dataset dimensions" ): ds_to_append.to_zarr( store_target, append_dim="notvalid", **self.version_kwargs @@ -2870,7 +2870,7 @@ def test_write_region(self, consolidated, compute, use_dask, write_empty) -> Non ) if compute: with xr.open_zarr( - store, consolidated=consolidated, **self.version_kwargs + store, consolidated=consolidated, **self.version_kwargs ) as actual: assert_identical(actual, zeros) for i in range(0, 10, 2): @@ -2883,7 +2883,7 @@ def test_write_region(self, consolidated, compute, use_dask, write_empty) -> Non **self.version_kwargs, ) with xr.open_zarr( - store, consolidated=consolidated, **self.version_kwargs + store, consolidated=consolidated, **self.version_kwargs ) as actual: assert_identical(actual, nonzeros) @@ -2963,10 +2963,10 @@ def setup_and_verify_store(expected=data): with setup_and_verify_store() as store: with pytest.raises( - ValueError, - match=re.escape( - "cannot set region unless mode='a', mode='a-', mode='r+' or mode=None" - ), + ValueError, + match=re.escape( + "cannot set region unless mode='a', mode='a-', mode='r+' or mode=None" + ), ): data.to_zarr( store, region={"x": slice(None)}, mode="w", **self.version_kwargs @@ -2988,15 +2988,15 @@ def setup_and_verify_store(expected=data): with setup_and_verify_store() as store: with pytest.raises( - ValueError, - match=r"all keys in ``region`` are not in Dataset dimensions", + ValueError, + match=r"all keys in ``region`` are not in Dataset dimensions", ): data.to_zarr(store, region={"y": slice(None)}, **self.version_kwargs) with setup_and_verify_store() as store: with pytest.raises( - ValueError, - match=r"all variables in the dataset to write must have at least one dimension in common", + ValueError, + match=r"all variables in the dataset to write must have at least one dimension in common", ): data2.assign(v=2).to_zarr( store, region={"x": slice(2)}, **self.version_kwargs @@ -3004,7 +3004,7 @@ def setup_and_verify_store(expected=data): with setup_and_verify_store() as store: with pytest.raises( - ValueError, match=r"cannot list the same dimension in both" + ValueError, match=r"cannot list the same dimension in both" ): data.to_zarr( store, @@ -3015,8 +3015,8 @@ def setup_and_verify_store(expected=data): with setup_and_verify_store() as store: with pytest.raises( - ValueError, - match=r"variable 'u' already exists with different dimension sizes", + ValueError, + match=r"variable 'u' already exists with different dimension sizes", ): data2.to_zarr(store, region={"x": slice(3)}, **self.version_kwargs) @@ -3043,7 +3043,7 @@ def test_chunk_encoding_with_partial_dask_chunks(self) -> None: ).chunk({"a": 3}) with self.roundtrip( - original, save_kwargs={"encoding": {"x": {"chunks": [3, 2]}}} + original, save_kwargs={"encoding": {"x": {"chunks": [3, 2]}}} ) as ds1: assert_equal(ds1, original) @@ -3052,7 +3052,7 @@ def test_chunk_encoding_with_larger_dask_chunks(self) -> None: original = xr.Dataset({"a": ("x", [1, 2, 3, 4])}).chunk({"x": 2}) with self.roundtrip( - original, save_kwargs={"encoding": {"a": {"chunks": [1]}}} + original, save_kwargs={"encoding": {"a": 
{"chunks": [1]}}} ) as ds1: assert_equal(ds1, original) @@ -3322,12 +3322,12 @@ def temp_dir(self) -> Iterator[tuple[str, str]]: @contextlib.contextmanager def roundtrip_dir( - self, - data, - store, - save_kwargs=None, - open_kwargs=None, - allow_cleanup_failure=False, + self, + data, + store, + save_kwargs=None, + open_kwargs=None, + allow_cleanup_failure=False, ) -> Iterator[Dataset]: if save_kwargs is None: save_kwargs = {} @@ -3336,14 +3336,14 @@ def roundtrip_dir( data.to_zarr(store, **save_kwargs, **self.version_kwargs) with xr.open_dataset( - store, engine="zarr", **open_kwargs, **self.version_kwargs + store, engine="zarr", **open_kwargs, **self.version_kwargs ) as ds: yield ds @pytest.mark.parametrize("consolidated", [True, False, None]) @pytest.mark.parametrize("write_empty", [True, False, None]) def test_write_empty( - self, consolidated: bool | None, write_empty: bool | None + self, consolidated: bool | None, write_empty: bool | None ) -> None: if write_empty is False: expected = ["0.1.0", "1.1.0"] @@ -3383,9 +3383,9 @@ def test_write_empty( ) with self.roundtrip_dir( - ds, - store, - {"mode": "a", "append_dim": "Z", "write_empty_chunks": write_empty}, + ds, + store, + {"mode": "a", "append_dim": "Z", "write_empty_chunks": write_empty}, ) as a_ds: expected_ds = xr.concat([ds, ds], dim="Z") @@ -3514,7 +3514,7 @@ def create_store(self): @contextlib.contextmanager def roundtrip( - self, data, save_kwargs=None, open_kwargs=None, allow_cleanup_failure=False + self, data, save_kwargs=None, open_kwargs=None, allow_cleanup_failure=False ): if save_kwargs is None: save_kwargs = {} @@ -3582,7 +3582,7 @@ class TestNetCDF3ViaNetCDF4Data(CFEncodedBase, NetCDF3Only): def create_store(self): with create_tmp_file() as tmp_file: with backends.NetCDF4DataStore.open( - tmp_file, mode="w", format="NETCDF3_CLASSIC" + tmp_file, mode="w", format="NETCDF3_CLASSIC" ) as store: yield store @@ -3603,7 +3603,7 @@ class TestNetCDF4ClassicViaNetCDF4Data(CFEncodedBase, NetCDF3Only): def create_store(self): with create_tmp_file() as tmp_file: with backends.NetCDF4DataStore.open( - tmp_file, mode="w", format="NETCDF4_CLASSIC" + tmp_file, mode="w", format="NETCDF4_CLASSIC" ) as store: yield store @@ -3706,7 +3706,7 @@ def test_complex_error(self, invalid_netcdf) -> None: expected = Dataset({"x": ("y", np.ones(5) + 1j * np.ones(5))}) save_kwargs = {"invalid_netcdf": invalid_netcdf} with pytest.raises( - h5netcdf.CompatibilityError, match="are not a supported NetCDF feature" + h5netcdf.CompatibilityError, match="are not a supported NetCDF feature" ): with self.roundtrip(expected, save_kwargs=save_kwargs) as actual: assert_equal(expected, actual) @@ -3821,7 +3821,7 @@ def test_compression_check_encoding_h5py(self) -> None: # Incompatible encodings cause a crash with create_tmp_file() as tmp_file: with pytest.raises( - ValueError, match=r"'zlib' and 'compression' encodings mismatch" + ValueError, match=r"'zlib' and 'compression' encodings mismatch" ): data.to_netcdf( tmp_file, @@ -3831,8 +3831,8 @@ def test_compression_check_encoding_h5py(self) -> None: with create_tmp_file() as tmp_file: with pytest.raises( - ValueError, - match=r"'complevel' and 'compression_opts' encodings mismatch", + ValueError, + match=r"'complevel' and 'compression_opts' encodings mismatch", ): data.to_netcdf( tmp_file, @@ -3929,7 +3929,7 @@ def test_open_badbytes(self) -> None: with open_dataset(b"\211HDF\r\n\032\n", engine="h5netcdf"): # type: ignore[arg-type] pass with pytest.raises( - ValueError, match=r"match in any of xarray's 
currently installed IO" + ValueError, match=r"match in any of xarray's currently installed IO" ): with open_dataset(b"garbage"): # type: ignore[arg-type] pass @@ -3937,7 +3937,7 @@ def test_open_badbytes(self) -> None: with open_dataset(b"garbage", engine="netcdf4"): # type: ignore[arg-type] pass with pytest.raises( - ValueError, match=r"not the signature of a valid netCDF4 file" + ValueError, match=r"not the signature of a valid netCDF4 file" ): with open_dataset(BytesIO(b"garbage"), engine="h5netcdf"): pass @@ -3991,7 +3991,7 @@ def test_open_fileobj(self) -> None: class TestH5NetCDFViaDaskData(TestH5NetCDFData): @contextlib.contextmanager def roundtrip( - self, data, save_kwargs=None, open_kwargs=None, allow_cleanup_failure=False + self, data, save_kwargs=None, open_kwargs=None, allow_cleanup_failure=False ): if save_kwargs is None: save_kwargs = {} @@ -3999,7 +3999,7 @@ def roundtrip( open_kwargs = {} open_kwargs.setdefault("chunks", -1) with TestH5NetCDFData.roundtrip( - self, data, save_kwargs, open_kwargs, allow_cleanup_failure + self, data, save_kwargs, open_kwargs, allow_cleanup_failure ) as ds: yield ds @@ -4035,9 +4035,9 @@ class TestH5NetCDFDataRos3Driver(TestCommon): @pytest.mark.filterwarnings("ignore:Duplicate dimension names") def test_get_variable_list(self) -> None: with open_dataset( - self.test_remote_dataset, - engine="h5netcdf", - backend_kwargs={"driver": "ros3"}, + self.test_remote_dataset, + engine="h5netcdf", + backend_kwargs={"driver": "ros3"}, ) as actual: assert "Temperature" in list(actual) @@ -4050,7 +4050,7 @@ def test_get_variable_list_empty_driver_kwds(self) -> None: backend_kwargs = {"driver": "ros3", "driver_kwds": driver_kwds} with open_dataset( - self.test_remote_dataset, engine="h5netcdf", backend_kwargs=backend_kwargs + self.test_remote_dataset, engine="h5netcdf", backend_kwargs=backend_kwargs ) as actual: assert "Temperature" in list(actual) @@ -4114,7 +4114,7 @@ def skip_if_not_engine(engine): reason="Flaky test which can cause the worker to crash (so don't xfail). 
Very open to contributions fixing this" ) def test_open_mfdataset_manyfiles( - readengine, nfiles, parallel, chunks, file_cache_maxsize + readengine, nfiles, parallel, chunks, file_cache_maxsize ): # skip certain combinations skip_if_not_engine(readengine) @@ -4133,12 +4133,12 @@ def test_open_mfdataset_manyfiles( # check that calculation on opened datasets works properly with open_mfdataset( - tmpfiles, - combine="nested", - concat_dim="x", - engine=readengine, - parallel=parallel, - chunks=chunks if (not chunks and readengine != "zarr") else "auto", + tmpfiles, + combine="nested", + concat_dim="x", + engine=readengine, + parallel=parallel, + chunks=chunks if (not chunks and readengine != "zarr") else "auto", ) as actual: # check that using open_mfdataset returns dask arrays for variables assert isinstance(actual["foo"].data, dask_array_type) @@ -4175,7 +4175,7 @@ def test_open_mfdataset_list_attr() -> None: with open_dataset(nfiles[1]) as ds2: original = xr.concat([ds1, ds2], dim="x") with xr.open_mfdataset( - [nfiles[0], nfiles[1]], combine="nested", concat_dim="x" + [nfiles[0], nfiles[1]], combine="nested", concat_dim="x" ) as actual: assert_identical(actual, original) @@ -4230,13 +4230,13 @@ def gen_datasets_with_common_coord_and_time(self): @pytest.mark.parametrize("opt", ["all", "minimal", "different"]) @pytest.mark.parametrize("join", ["outer", "inner", "left", "right"]) def test_open_mfdataset_does_same_as_concat( - self, combine, concat_dim, opt, join + self, combine, concat_dim, opt, join ) -> None: with self.setup_files_and_datasets() as (files, [ds1, ds2]): if combine == "by_coords": files.reverse() with open_mfdataset( - files, data_vars=opt, combine=combine, concat_dim=concat_dim, join=join + files, data_vars=opt, combine=combine, concat_dim=concat_dim, join=join ) as ds: ds_expect = xr.concat([ds1, ds2], data_vars=opt, dim="t", join=join) assert_identical(ds, ds_expect) @@ -4244,31 +4244,31 @@ def test_open_mfdataset_does_same_as_concat( @pytest.mark.parametrize( ["combine_attrs", "attrs", "expected", "expect_error"], ( - pytest.param("drop", [{"a": 1}, {"a": 2}], {}, False, id="drop"), - pytest.param( - "override", [{"a": 1}, {"a": 2}], {"a": 1}, False, id="override" - ), - pytest.param( - "no_conflicts", [{"a": 1}, {"a": 2}], None, True, id="no_conflicts" - ), - pytest.param( - "identical", - [{"a": 1, "b": 2}, {"a": 1, "c": 3}], - None, - True, - id="identical", - ), - pytest.param( - "drop_conflicts", - [{"a": 1, "b": 2}, {"b": -1, "c": 3}], - {"a": 1, "c": 3}, - False, - id="drop_conflicts", - ), + pytest.param("drop", [{"a": 1}, {"a": 2}], {}, False, id="drop"), + pytest.param( + "override", [{"a": 1}, {"a": 2}], {"a": 1}, False, id="override" + ), + pytest.param( + "no_conflicts", [{"a": 1}, {"a": 2}], None, True, id="no_conflicts" + ), + pytest.param( + "identical", + [{"a": 1, "b": 2}, {"a": 1, "c": 3}], + None, + True, + id="identical", + ), + pytest.param( + "drop_conflicts", + [{"a": 1, "b": 2}, {"b": -1, "c": 3}], + {"a": 1, "c": 3}, + False, + id="drop_conflicts", + ), ), ) def test_open_mfdataset_dataset_combine_attrs( - self, combine_attrs, attrs, expected, expect_error + self, combine_attrs, attrs, expected, expect_error ): with self.setup_files_and_datasets() as (files, [ds1, ds2]): # Give the files an inconsistent attribute @@ -4288,10 +4288,10 @@ def test_open_mfdataset_dataset_combine_attrs( ) else: with xr.open_mfdataset( - files, - combine="nested", - concat_dim="t", - combine_attrs=combine_attrs, + files, + combine="nested", + concat_dim="t", + 
combine_attrs=combine_attrs, ) as ds: assert ds.attrs == expected @@ -4330,13 +4330,13 @@ def test_open_mfdataset_dataarray_attr_by_coords(self) -> None: ) @pytest.mark.parametrize("opt", ["all", "minimal", "different"]) def test_open_mfdataset_exact_join_raises_error( - self, combine, concat_dim, opt + self, combine, concat_dim, opt ) -> None: with self.setup_files_and_datasets(fuzz=0.1) as (files, [ds1, ds2]): if combine == "by_coords": files.reverse() with pytest.raises( - ValueError, match=r"cannot align objects.*join.*exact.*" + ValueError, match=r"cannot align objects.*join.*exact.*" ): open_mfdataset( files, @@ -4352,7 +4352,7 @@ def test_common_coord_when_datavars_all(self) -> None: with self.setup_files_and_datasets() as (files, [ds1, ds2]): # open the files with the data_var option with open_mfdataset( - files, data_vars=opt, combine="nested", concat_dim="t" + files, data_vars=opt, combine="nested", concat_dim="t" ) as ds: coord_shape = ds[self.coord_name].shape coord_shape1 = ds1[self.coord_name].shape @@ -4370,7 +4370,7 @@ def test_common_coord_when_datavars_minimal(self) -> None: with self.setup_files_and_datasets() as (files, [ds1, ds2]): # open the files using data_vars option with open_mfdataset( - files, data_vars=opt, combine="nested", concat_dim="t" + files, data_vars=opt, combine="nested", concat_dim="t" ) as ds: coord_shape = ds[self.coord_name].shape coord_shape1 = ds1[self.coord_name].shape @@ -4404,7 +4404,7 @@ def create_store(self): @contextlib.contextmanager def roundtrip( - self, data, save_kwargs=None, open_kwargs=None, allow_cleanup_failure=False + self, data, save_kwargs=None, open_kwargs=None, allow_cleanup_failure=False ): yield data.chunk() @@ -4460,13 +4460,13 @@ def test_open_mfdataset(self) -> None: original.isel(x=slice(5)).to_netcdf(tmp1) original.isel(x=slice(5, 10)).to_netcdf(tmp2) with open_mfdataset( - [tmp1, tmp2], concat_dim="x", combine="nested" + [tmp1, tmp2], concat_dim="x", combine="nested" ) as actual: assert isinstance(actual.foo.variable.data, da.Array) assert actual.foo.variable.data.chunks == ((5, 5),) assert_identical(original, actual) with open_mfdataset( - [tmp1, tmp2], concat_dim="x", combine="nested", chunks={"x": 3} + [tmp1, tmp2], concat_dim="x", combine="nested", chunks={"x": 3} ) as actual: assert actual.foo.variable.data.chunks == ((3, 2, 3, 2),) @@ -4494,18 +4494,18 @@ def test_open_mfdataset_2d(self) -> None: original.isel(x=slice(5), y=slice(4, 8)).to_netcdf(tmp3) original.isel(x=slice(5, 10), y=slice(4, 8)).to_netcdf(tmp4) with open_mfdataset( - [[tmp1, tmp2], [tmp3, tmp4]], - combine="nested", - concat_dim=["y", "x"], + [[tmp1, tmp2], [tmp3, tmp4]], + combine="nested", + concat_dim=["y", "x"], ) as actual: assert isinstance(actual.foo.variable.data, da.Array) assert actual.foo.variable.data.chunks == ((5, 5), (4, 4)) assert_identical(original, actual) with open_mfdataset( - [[tmp1, tmp2], [tmp3, tmp4]], - combine="nested", - concat_dim=["y", "x"], - chunks={"x": 3, "y": 2}, + [[tmp1, tmp2], [tmp3, tmp4]], + combine="nested", + concat_dim=["y", "x"], + chunks={"x": 3, "y": 2}, ) as actual: assert actual.foo.variable.data.chunks == ( (3, 2, 3, 2), @@ -4521,7 +4521,7 @@ def test_open_mfdataset_pathlib(self) -> None: original.isel(x=slice(5)).to_netcdf(tmp1) original.isel(x=slice(5, 10)).to_netcdf(tmp2) with open_mfdataset( - [tmp1, tmp2], concat_dim="x", combine="nested" + [tmp1, tmp2], concat_dim="x", combine="nested" ) as actual: assert_identical(original, actual) @@ -4540,9 +4540,9 @@ def 
test_open_mfdataset_2d_pathlib(self) -> None: original.isel(x=slice(5), y=slice(4, 8)).to_netcdf(tmp3) original.isel(x=slice(5, 10), y=slice(4, 8)).to_netcdf(tmp4) with open_mfdataset( - [[tmp1, tmp2], [tmp3, tmp4]], - combine="nested", - concat_dim=["y", "x"], + [[tmp1, tmp2], [tmp3, tmp4]], + combine="nested", + concat_dim=["y", "x"], ) as actual: assert_identical(original, actual) @@ -4554,7 +4554,7 @@ def test_open_mfdataset_2(self) -> None: original.isel(x=slice(5, 10)).to_netcdf(tmp2) with open_mfdataset( - [tmp1, tmp2], concat_dim="x", combine="nested" + [tmp1, tmp2], concat_dim="x", combine="nested" ) as actual: assert_identical(original, actual) @@ -4569,7 +4569,7 @@ def test_attrs_mfdataset(self) -> None: ds1.to_netcdf(tmp1) ds2.to_netcdf(tmp2) with open_mfdataset( - [tmp1, tmp2], concat_dim="x", combine="nested" + [tmp1, tmp2], concat_dim="x", combine="nested" ) as actual: # presumes that attributes inherited from # first dataset loaded @@ -4588,7 +4588,7 @@ def test_open_mfdataset_attrs_file(self) -> None: ds1.to_netcdf(tmp1) ds2.to_netcdf(tmp2) with open_mfdataset( - [tmp1, tmp2], concat_dim="x", combine="nested", attrs_file=tmp2 + [tmp1, tmp2], concat_dim="x", combine="nested", attrs_file=tmp2 ) as actual: # attributes are inherited from the master file assert actual.attrs["test2"] == ds2.attrs["test2"] @@ -4607,7 +4607,7 @@ def test_open_mfdataset_attrs_file_path(self) -> None: ds1.to_netcdf(tmp1) ds2.to_netcdf(tmp2) with open_mfdataset( - [tmp1, tmp2], concat_dim="x", combine="nested", attrs_file=tmp2 + [tmp1, tmp2], concat_dim="x", combine="nested", attrs_file=tmp2 ) as actual: # attributes are inherited from the master file assert actual.attrs["test2"] == ds2.attrs["test2"] @@ -4666,7 +4666,7 @@ def preprocess(ds): expected = preprocess(original) with open_mfdataset( - tmp, preprocess=preprocess, combine="by_coords" + tmp, preprocess=preprocess, combine="by_coords" ) as actual: assert_identical(expected, actual) @@ -4677,7 +4677,7 @@ def test_save_mfdataset_roundtrip(self) -> None: with create_tmp_file() as tmp2: save_mfdataset(datasets, [tmp1, tmp2]) with open_mfdataset( - [tmp1, tmp2], concat_dim="x", combine="nested" + [tmp1, tmp2], concat_dim="x", combine="nested" ) as actual: assert_identical(actual, original) @@ -4703,7 +4703,7 @@ def test_save_mfdataset_pathlib_roundtrip(self) -> None: tmp2 = Path(tmps2) save_mfdataset(datasets, [tmp1, tmp2]) with open_mfdataset( - [tmp1, tmp2], concat_dim="x", combine="nested" + [tmp1, tmp2], concat_dim="x", combine="nested" ) as actual: assert_identical(actual, original) @@ -4745,7 +4745,7 @@ def test_open_mfdataset_concat_dim_none(self) -> None: data.to_netcdf(tmp1) Dataset({"x": np.nan}).to_netcdf(tmp2) with open_mfdataset( - [tmp1, tmp2], concat_dim=None, combine="nested" + [tmp1, tmp2], concat_dim=None, combine="nested" ) as actual: assert_identical(data, actual) @@ -4807,7 +4807,7 @@ def test_open_multi_dataset(self) -> None: original.to_netcdf(tmp1) original.to_netcdf(tmp2) with open_mfdataset( - [tmp1, tmp2], concat_dim=dim, combine="nested" + [tmp1, tmp2], concat_dim=dim, combine="nested" ) as actual: assert_identical(expected, actual) @@ -4861,7 +4861,7 @@ def test_save_mfdataset_compute_false_roundtrip(self) -> None: assert isinstance(delayed_obj, Delayed) delayed_obj.compute() with open_mfdataset( - [tmp1, tmp2], combine="nested", concat_dim="x" + [tmp1, tmp2], combine="nested", concat_dim="x" ) as actual: assert_identical(actual, original) @@ -5339,7 +5339,7 @@ def 
test_use_cftime_standard_calendar_default_in_range(calendar) -> None: @pytest.mark.parametrize("calendar", _STANDARD_CALENDARS) @pytest.mark.parametrize("units_year", [1500, 2500]) def test_use_cftime_standard_calendar_default_out_of_range( - calendar, units_year + calendar, units_year ) -> None: import cftime @@ -5525,7 +5525,7 @@ def test_open_fsspec() -> None: import zarr if not hasattr(zarr.storage, "FSStore") or not hasattr( - zarr.storage.FSStore, "getitems" + zarr.storage.FSStore, "getitems" ): pytest.skip("zarr too old") @@ -5608,7 +5608,7 @@ def test_open_dataset_chunking_zarr(chunks, tmp_path: Path) -> None: with dask.config.set({"array.chunk-size": "1MiB"}): expected = ds.chunk(chunks) with open_dataset( - tmp_path / "test.zarr", engine="zarr", chunks=chunks + tmp_path / "test.zarr", engine="zarr", chunks=chunks ) as actual: xr.testing.assert_chunks_equal(actual, expected) @@ -5639,7 +5639,7 @@ def test_chunking_consintency(chunks, tmp_path: Path) -> None: with dask.config.set({"array.chunk-size": "1MiB"}): expected = ds.chunk(chunks) with xr.open_dataset( - tmp_path / "test.zarr", engine="zarr", chunks=chunks + tmp_path / "test.zarr", engine="zarr", chunks=chunks ) as actual: xr.testing.assert_chunks_equal(actual, expected) @@ -5733,7 +5733,7 @@ def test_h5netcdf_entrypoint(tmp_path: Path) -> None: @requires_netCDF4 @pytest.mark.parametrize("str_type", (str, np.str_)) def test_write_file_from_np_str( - str_type: type[str] | type[np.str_], tmpdir: str + str_type: type[str] | type[np.str_], tmpdir: str ) -> None: # https://github.com/pydata/xarray/pull/5264 scenarios = [str_type(v) for v in ["scenario_a", "scenario_b", "scenario_c"]] @@ -5799,7 +5799,7 @@ def test_raise_writing_to_nczarr(self, mode) -> None: with create_tmp_file(suffix=".zarr") as tmp: ds = self._create_nczarr(tmp) with pytest.raises( - KeyError, match="missing the attribute `_ARRAY_DIMENSIONS`," + KeyError, match="missing the attribute `_ARRAY_DIMENSIONS`," ): ds.to_zarr(tmp, mode=mode) @@ -5948,10 +5948,10 @@ def test_zarr_region_index_write(self, tmp_path): region: Mapping[str, slice] | Literal["auto"] for region in [region_slice, "auto"]: # type: ignore[assignment] with patch.object( - ZarrStore, - "set_variables", - side_effect=ZarrStore.set_variables, - autospec=True, + ZarrStore, + "set_variables", + side_effect=ZarrStore.set_variables, + autospec=True, ) as mock: ds_region.to_zarr(tmp_path / "test.zarr", region=region, mode="r+") @@ -6261,7 +6261,7 @@ def test_zarr_safe_chunk_region(tmp_path): data = np.random.RandomState(0).randn(2920, 25, 53) ds = xr.Dataset({"temperature": (("time", "lat", "lon"), data)}) chunks = {"time": 1000, "lat": 25, "lon": 53} - ds.chunk(chunks).to_zarr(store, compute=False) + ds.chunk(chunks).to_zarr(store, compute=False, mode="w") region = {"time": slice(1000, 2000, 1)} chunk = ds.isel(region) chunk = chunk.chunk() From 58f1866c584bf29abc64f2ef93d5c66844b91c2f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 30 Sep 2024 19:50:51 +0000 Subject: [PATCH 17/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/tests/test_backends.py | 416 +++++++++++++++++----------------- 1 file changed, 208 insertions(+), 208 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 0e5db458116..cc8dbd4e02c 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -41,11 +41,11 @@ ) from 
xarray.backends.common import robust_getitem from xarray.backends.h5netcdf_ import H5netcdfBackendEntrypoint +from xarray.backends.netcdf3 import _nc3_dtype_coercions from xarray.backends.netCDF4_ import ( NetCDF4BackendEntrypoint, _extract_nc4_variable_encoding, ) -from xarray.backends.netcdf3 import _nc3_dtype_coercions from xarray.backends.pydap_ import PydapDataStore from xarray.backends.scipy_ import ScipyBackendEntrypoint from xarray.coding.cftime_offsets import cftime_range @@ -306,7 +306,7 @@ class NetCDF3Only: def test_dtype_coercion_error(self) -> None: """Failing dtype coercion should lead to an error""" for dtype, format in itertools.product( - _nc3_dtype_coercions, self.netcdf3_formats + _nc3_dtype_coercions, self.netcdf3_formats ): if dtype == "bool": # coerced upcast (bool to int8) ==> can never fail @@ -332,7 +332,7 @@ def create_store(self): @contextlib.contextmanager def roundtrip( - self, data, save_kwargs=None, open_kwargs=None, allow_cleanup_failure=False + self, data, save_kwargs=None, open_kwargs=None, allow_cleanup_failure=False ): if save_kwargs is None: save_kwargs = {} @@ -345,7 +345,7 @@ def roundtrip( @contextlib.contextmanager def roundtrip_append( - self, data, save_kwargs=None, open_kwargs=None, allow_cleanup_failure=False + self, data, save_kwargs=None, open_kwargs=None, allow_cleanup_failure=False ): if save_kwargs is None: save_kwargs = {} @@ -392,8 +392,8 @@ def check_dtypes_roundtripped(self, expected, actual): # For NetCDF3, the backend should perform dtype coercion if ( - isinstance(self, NetCDF3Only) - and str(expected_dtype) in _nc3_dtype_coercions + isinstance(self, NetCDF3Only) + and str(expected_dtype) in _nc3_dtype_coercions ): expected_dtype = np.dtype(_nc3_dtype_coercions[str(expected_dtype)]) @@ -401,8 +401,8 @@ def check_dtypes_roundtripped(self, expected, actual): # TODO: check expected behavior for string dtypes more carefully string_kinds = {"O", "S", "U"} assert expected_dtype == actual_dtype or ( - expected_dtype.kind in string_kinds - and actual_dtype.kind in string_kinds + expected_dtype.kind in string_kinds + and actual_dtype.kind in string_kinds ) def test_roundtrip_test_data(self) -> None: @@ -584,8 +584,8 @@ def test_roundtrip_cftime_datetime_data(self) -> None: abs_diff = abs(actual.t.values - expected_decoded_t) assert (abs_diff <= np.timedelta64(1, "s")).all() assert ( - actual.t.encoding["units"] - == "days since 0001-01-01 00:00:00.000000" + actual.t.encoding["units"] + == "days since 0001-01-01 00:00:00.000000" ) assert actual.t.encoding["calendar"] == expected_calendar @@ -626,7 +626,7 @@ def test_roundtrip_coordinates(self) -> None: with self.roundtrip(original, open_kwargs={"decode_coords": False}) as expected: # check roundtripping when decode_coords=False with self.roundtrip( - expected, open_kwargs={"decode_coords": False} + expected, open_kwargs={"decode_coords": False} ) as actual: assert_identical(expected, actual) @@ -905,8 +905,8 @@ def test_roundtrip_empty_vlen_string_array(self) -> None: "decoded_fn, encoded_fn", [ ( - create_unsigned_masked_scaled_data, - create_encoded_unsigned_masked_scaled_data, + create_unsigned_masked_scaled_data, + create_encoded_unsigned_masked_scaled_data, ), pytest.param( create_bad_unsigned_masked_scaled_data, @@ -914,12 +914,12 @@ def test_roundtrip_empty_vlen_string_array(self) -> None: marks=pytest.mark.xfail(reason="Bad _Unsigned attribute."), ), ( - create_signed_masked_scaled_data, - create_encoded_signed_masked_scaled_data, + create_signed_masked_scaled_data, + 
create_encoded_signed_masked_scaled_data, ), ( - create_unsigned_false_masked_scaled_data, - create_encoded_unsigned_false_masked_scaled_data, + create_unsigned_false_masked_scaled_data, + create_encoded_unsigned_false_masked_scaled_data, ), (create_masked_and_scaled_data, create_encoded_masked_and_scaled_data), ], @@ -931,9 +931,9 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn, dtype) -> None: decoded = decoded_fn(dtype) encoded = encoded_fn(dtype) if decoded["x"].encoding["dtype"] == "u1" and not ( - self.engine == "netcdf4" - and self.file_format is None - or self.file_format == "NETCDF4" + self.engine == "netcdf4" + and self.file_format is None + or self.file_format == "NETCDF4" ): pytest.skip("uint8 data can't be written to non-NetCDF4 data") @@ -942,8 +942,8 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn, dtype) -> None: assert decoded.variables[k].dtype == actual.variables[k].dtype # CF _FillValue is always on-disk type assert ( - decoded.variables[k].encoding["_FillValue"] - == actual.variables[k].encoding["_FillValue"] + decoded.variables[k].encoding["_FillValue"] + == actual.variables[k].encoding["_FillValue"] ) assert_allclose(decoded, actual, decode_bytes=False) @@ -954,8 +954,8 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn, dtype) -> None: assert encoded.variables[k].dtype == actual.variables[k].dtype # CF _FillValue is always on-disk type assert ( - decoded.variables[k].encoding["_FillValue"] - == actual.variables[k].attrs["_FillValue"] + decoded.variables[k].encoding["_FillValue"] + == actual.variables[k].attrs["_FillValue"] ) assert_allclose(encoded, actual, decode_bytes=False) @@ -964,8 +964,8 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn, dtype) -> None: assert encoded.variables[k].dtype == actual.variables[k].dtype # CF _FillValue is always on-disk type assert ( - encoded.variables[k].attrs["_FillValue"] - == actual.variables[k].attrs["_FillValue"] + encoded.variables[k].attrs["_FillValue"] + == actual.variables[k].attrs["_FillValue"] ) assert_allclose(encoded, actual, decode_bytes=False) @@ -1030,7 +1030,7 @@ def _roundtrip_with_warnings(*args, **kwargs): assert_allclose(decoded, actual, decode_bytes=False) with _roundtrip_with_warnings( - decoded, open_kwargs=dict(decode_cf=False) + decoded, open_kwargs=dict(decode_cf=False) ) as actual: for k in encoded.variables: assert encoded.variables[k].dtype == actual.variables[k].dtype @@ -1120,7 +1120,7 @@ def test_coordinate_variables_after_dataset_roundtrip(self) -> None: assert_equal(actual, expected) def test_grid_mapping_and_bounds_are_coordinates_after_dataarray_roundtrip( - self, + self, ) -> None: original = self._create_cf_dataset() # The DataArray roundtrip should have the same warnings as the @@ -1132,14 +1132,14 @@ def test_grid_mapping_and_bounds_are_coordinates_after_dataarray_roundtrip( # needs the to_dataset. The other backends should be fine # without it. 
with pytest.warns( - UserWarning, - match=( - r"Variable\(s\) referenced in bounds not in variables: " - r"\['l(at|ong)itude_bnds'\]" - ), + UserWarning, + match=( + r"Variable\(s\) referenced in bounds not in variables: " + r"\['l(at|ong)itude_bnds'\]" + ), ): with self.roundtrip( - original["variable"].to_dataset(), open_kwargs={"decode_coords": "all"} + original["variable"].to_dataset(), open_kwargs={"decode_coords": "all"} ) as actual: assert_identical(actual, original["variable"].to_dataset()) @@ -1224,7 +1224,7 @@ def test_invalid_dataarray_names_raise(self) -> None: data = np.random.random((2, 2)) da = xr.DataArray(data) for name, (error, msg) in zip( - [0, (4, 5), True, ""], [te, te, te, ve], strict=True + [0, (4, 5), True, ""], [te, te, te, ve], strict=True ): ds = Dataset({name: da}) with pytest.raises(error) as excinfo: @@ -1357,7 +1357,7 @@ def test_append_with_invalid_dim_raises(self) -> None: data["var9"] = data["var2"] * 3 data = data.isel(dim1=slice(2, 6)) # modify one dimension with pytest.raises( - ValueError, match=r"Unable to update size for existing dimension" + ValueError, match=r"Unable to update size for existing dimension" ): self.save(data, tmp_file, mode="a") @@ -1419,7 +1419,7 @@ def test_byte_attrs(self, byte_attrs_dataset: dict[str, Any]) -> None: @contextlib.contextmanager def create_tmp_file( - suffix: str = ".nc", allow_cleanup_failure: bool = False + suffix: str = ".nc", allow_cleanup_failure: bool = False ) -> Iterator[str]: temp_dir = tempfile.mkdtemp() path = os.path.join(temp_dir, f"temp-{next(_counter)}{suffix}") @@ -1435,7 +1435,7 @@ def create_tmp_file( @contextlib.contextmanager def create_tmp_files( - nfiles: int, suffix: str = ".nc", allow_cleanup_failure: bool = False + nfiles: int, suffix: str = ".nc", allow_cleanup_failure: bool = False ) -> Iterator[list[str]]: with ExitStack() as stack: files = [ @@ -1517,7 +1517,7 @@ def test_write_groups(self) -> None: ], ) def test_encoding_kwarg_vlen_string( - self, input_strings: list[str], is_bytes: bool + self, input_strings: list[str], is_bytes: bool ) -> None: original = Dataset({"x": input_strings}) @@ -1689,9 +1689,9 @@ def test_auto_chunking_is_based_on_disk_chunk_sizes(self) -> None: with dask.config.set({"array.chunk-size": "100KiB"}): with self.chunked_roundtrip( - (1, y_size, x_size), - (1, y_chunksize, x_chunksize), - open_kwargs={"chunks": "auto"}, + (1, y_size, x_size), + (1, y_chunksize, x_chunksize), + open_kwargs={"chunks": "auto"}, ) as ds: t_chunks, y_chunks, x_chunks = ds["image"].data.chunks assert all(np.asanyarray(y_chunks) == y_chunksize) @@ -1705,21 +1705,21 @@ def test_base_chunking_uses_disk_chunk_sizes(self) -> None: x_chunksize = 10 with self.chunked_roundtrip( - (1, y_size, x_size), - (1, y_chunksize, x_chunksize), - open_kwargs={"chunks": {}}, + (1, y_size, x_size), + (1, y_chunksize, x_chunksize), + open_kwargs={"chunks": {}}, ) as ds: for chunksizes, expected in zip( - ds["image"].data.chunks, (1, y_chunksize, x_chunksize), strict=True + ds["image"].data.chunks, (1, y_chunksize, x_chunksize), strict=True ): assert all(np.asanyarray(chunksizes) == expected) @contextlib.contextmanager def chunked_roundtrip( - self, - array_shape: tuple[int, int, int], - chunk_sizes: tuple[int, int, int], - open_kwargs: dict[str, Any] | None = None, + self, + array_shape: tuple[int, int, int], + chunk_sizes: tuple[int, int, int], + open_kwargs: dict[str, Any] | None = None, ) -> Generator[Dataset, None, None]: t_size, y_size, x_size = array_shape t_chunksize, y_chunksize, x_chunksize = 
chunk_sizes @@ -1742,7 +1742,7 @@ def test_preferred_chunks_are_disk_chunk_sizes(self) -> None: x_chunksize = 10 with self.chunked_roundtrip( - (1, y_size, x_size), (1, y_chunksize, x_chunksize) + (1, y_size, x_size), (1, y_chunksize, x_chunksize) ) as ds: assert ds["image"].encoding["preferred_chunks"] == { "t": 1, @@ -1759,7 +1759,7 @@ def test_encoding_chunksizes_unlimited(self) -> None: "complevel": 0, "fletcher32": False, "contiguous": False, - "chunksizes": (2 ** 20,), + "chunksizes": (2**20,), "original_shape": (3,), } with self.roundtrip(ds) as actual: @@ -1862,14 +1862,14 @@ def test_encoding_enum__no_fill_value(self): with self.roundtrip(original, save_kwargs=save_kwargs) as actual: assert_equal(original, actual) assert ( - actual.clouds.encoding["dtype"].metadata["enum"] - == cloud_type_dict + actual.clouds.encoding["dtype"].metadata["enum"] + == cloud_type_dict ) if self.engine != "h5netcdf": # not implemented in h5netcdf yet assert ( - actual.clouds.encoding["dtype"].metadata["enum_name"] - == "cloud_type" + actual.clouds.encoding["dtype"].metadata["enum_name"] + == "cloud_type" ) @requires_netCDF4 @@ -1898,21 +1898,21 @@ def test_encoding_enum__multiple_variable_with_enum(self): with self.roundtrip(original, save_kwargs=save_kwargs) as actual: assert_equal(original, actual) assert ( - actual.clouds.encoding["dtype"] == actual.tifa.encoding["dtype"] + actual.clouds.encoding["dtype"] == actual.tifa.encoding["dtype"] ) assert ( - actual.clouds.encoding["dtype"].metadata - == actual.tifa.encoding["dtype"].metadata + actual.clouds.encoding["dtype"].metadata + == actual.tifa.encoding["dtype"].metadata ) assert ( - actual.clouds.encoding["dtype"].metadata["enum"] - == cloud_type_dict + actual.clouds.encoding["dtype"].metadata["enum"] + == cloud_type_dict ) if self.engine != "h5netcdf": # not implemented in h5netcdf yet assert ( - actual.clouds.encoding["dtype"].metadata["enum_name"] - == "cloud_type" + actual.clouds.encoding["dtype"].metadata["enum_name"] + == "cloud_type" ) @requires_netCDF4 @@ -1940,8 +1940,8 @@ def test_encoding_enum__error_multiple_variable_with_changing_enum(self): ) with open_dataset(tmp_file) as original: assert ( - original.clouds.encoding["dtype"].metadata - == original.tifa.encoding["dtype"].metadata + original.clouds.encoding["dtype"].metadata + == original.tifa.encoding["dtype"].metadata ) modified_enum = original.clouds.encoding["dtype"].metadata["enum"] modified_enum.update({"neblig": 2}) @@ -1952,11 +1952,11 @@ def test_encoding_enum__error_multiple_variable_with_changing_enum(self): if self.engine != "h5netcdf": # not implemented yet in h5netcdf with pytest.raises( - ValueError, - match=( - "Cannot save variable .*" - " because an enum `cloud_type` already exists in the Dataset .*" - ), + ValueError, + match=( + "Cannot save variable .*" + " because an enum `cloud_type` already exists in the Dataset .*" + ), ): with self.roundtrip(original): pass @@ -2081,8 +2081,8 @@ def test_compression_encoding(self, compression: str | None) -> None: actual_encoding = actual["var2"].encoding assert expected_encoding.items() <= actual_encoding.items() if ( - encoding_params["compression"] is not None - and "blosc" not in encoding_params["compression"] + encoding_params["compression"] is not None + and "blosc" not in encoding_params["compression"] ): # regression test for #156 expected = data.isel(dim1=0) @@ -2153,7 +2153,7 @@ def test_deepcopy(self) -> None: class TestNetCDF4ViaDaskData(TestNetCDF4Data): @contextlib.contextmanager def roundtrip( - self, data, 
save_kwargs=None, open_kwargs=None, allow_cleanup_failure=False + self, data, save_kwargs=None, open_kwargs=None, allow_cleanup_failure=False ): if open_kwargs is None: open_kwargs = {} @@ -2161,7 +2161,7 @@ def roundtrip( save_kwargs = {} open_kwargs.setdefault("chunks", -1) with TestNetCDF4Data.roundtrip( - self, data, save_kwargs, open_kwargs, allow_cleanup_failure + self, data, save_kwargs, open_kwargs, allow_cleanup_failure ) as ds: yield ds @@ -2219,13 +2219,13 @@ def save(self, dataset, store_target, **kwargs): # type: ignore[override] @contextlib.contextmanager def open(self, store_target, **kwargs): with xr.open_dataset( - store_target, engine="zarr", **kwargs, **self.version_kwargs + store_target, engine="zarr", **kwargs, **self.version_kwargs ) as ds: yield ds @contextlib.contextmanager def roundtrip( - self, data, save_kwargs=None, open_kwargs=None, allow_cleanup_failure=False + self, data, save_kwargs=None, open_kwargs=None, allow_cleanup_failure=False ): if save_kwargs is None: save_kwargs = {} @@ -2242,9 +2242,9 @@ def test_roundtrip_consolidated(self, consolidated) -> None: pytest.xfail("consolidated metadata is not supported for zarr v3 yet") expected = create_test_data() with self.roundtrip( - expected, - save_kwargs={"consolidated": consolidated}, - open_kwargs={"backend_kwargs": {"consolidated": consolidated}}, + expected, + save_kwargs={"consolidated": consolidated}, + open_kwargs={"backend_kwargs": {"consolidated": consolidated}}, ) as actual: self.check_dtypes_roundtripped(expected, actual) assert_identical(expected, actual) @@ -2257,8 +2257,8 @@ def test_read_non_consolidated_warning(self) -> None: with self.create_zarr_target() as store: expected.to_zarr(store, consolidated=False, **self.version_kwargs) with pytest.warns( - RuntimeWarning, - match="Failed to open Zarr store with consolidated", + RuntimeWarning, + match="Failed to open Zarr store with consolidated", ): with xr.open_zarr(store, **self.version_kwargs) as ds: assert_identical(ds, expected) @@ -2529,17 +2529,17 @@ def test_write_persistence_modes(self, group) -> None: # overwrite mode with self.roundtrip( - original, - save_kwargs={"mode": "w", "group": group}, - open_kwargs={"group": group}, + original, + save_kwargs={"mode": "w", "group": group}, + open_kwargs={"group": group}, ) as actual: assert_identical(original, actual) # don't overwrite mode with self.roundtrip( - original, - save_kwargs={"mode": "w-", "group": group}, - open_kwargs={"group": group}, + original, + save_kwargs={"mode": "w-", "group": group}, + open_kwargs={"group": group}, ) as actual: assert_identical(original, actual) @@ -2555,9 +2555,9 @@ def test_write_persistence_modes(self, group) -> None: # check append mode for normal write with self.roundtrip( - original, - save_kwargs={"mode": "a", "group": group}, - open_kwargs={"group": group}, + original, + save_kwargs={"mode": "a", "group": group}, + open_kwargs={"group": group}, ) as actual: assert_identical(original, actual) @@ -2590,7 +2590,7 @@ def test_group(self) -> None: original = create_test_data() group = "some/random/path" with self.roundtrip( - original, save_kwargs={"group": group}, open_kwargs={"group": group} + original, save_kwargs={"group": group}, open_kwargs={"group": group} ) as actual: assert_identical(original, actual) @@ -2640,7 +2640,7 @@ def test_append_with_mode_rplus_fails(self) -> None: with self.create_zarr_target() as store: original.to_zarr(store, **self.version_kwargs) with pytest.raises( - ValueError, match="dataset contains non-pre-existing 
variables" + ValueError, match="dataset contains non-pre-existing variables" ): modified.to_zarr(store, mode="r+", **self.version_kwargs) @@ -2649,7 +2649,7 @@ def test_append_with_invalid_dim_raises(self) -> None: with self.create_zarr_target() as store_target: ds.to_zarr(store_target, mode="w", **self.version_kwargs) with pytest.raises( - ValueError, match="does not match any existing dataset dimensions" + ValueError, match="does not match any existing dataset dimensions" ): ds_to_append.to_zarr( store_target, append_dim="notvalid", **self.version_kwargs @@ -2870,7 +2870,7 @@ def test_write_region(self, consolidated, compute, use_dask, write_empty) -> Non ) if compute: with xr.open_zarr( - store, consolidated=consolidated, **self.version_kwargs + store, consolidated=consolidated, **self.version_kwargs ) as actual: assert_identical(actual, zeros) for i in range(0, 10, 2): @@ -2883,7 +2883,7 @@ def test_write_region(self, consolidated, compute, use_dask, write_empty) -> Non **self.version_kwargs, ) with xr.open_zarr( - store, consolidated=consolidated, **self.version_kwargs + store, consolidated=consolidated, **self.version_kwargs ) as actual: assert_identical(actual, nonzeros) @@ -2963,10 +2963,10 @@ def setup_and_verify_store(expected=data): with setup_and_verify_store() as store: with pytest.raises( - ValueError, - match=re.escape( - "cannot set region unless mode='a', mode='a-', mode='r+' or mode=None" - ), + ValueError, + match=re.escape( + "cannot set region unless mode='a', mode='a-', mode='r+' or mode=None" + ), ): data.to_zarr( store, region={"x": slice(None)}, mode="w", **self.version_kwargs @@ -2988,15 +2988,15 @@ def setup_and_verify_store(expected=data): with setup_and_verify_store() as store: with pytest.raises( - ValueError, - match=r"all keys in ``region`` are not in Dataset dimensions", + ValueError, + match=r"all keys in ``region`` are not in Dataset dimensions", ): data.to_zarr(store, region={"y": slice(None)}, **self.version_kwargs) with setup_and_verify_store() as store: with pytest.raises( - ValueError, - match=r"all variables in the dataset to write must have at least one dimension in common", + ValueError, + match=r"all variables in the dataset to write must have at least one dimension in common", ): data2.assign(v=2).to_zarr( store, region={"x": slice(2)}, **self.version_kwargs @@ -3004,7 +3004,7 @@ def setup_and_verify_store(expected=data): with setup_and_verify_store() as store: with pytest.raises( - ValueError, match=r"cannot list the same dimension in both" + ValueError, match=r"cannot list the same dimension in both" ): data.to_zarr( store, @@ -3015,8 +3015,8 @@ def setup_and_verify_store(expected=data): with setup_and_verify_store() as store: with pytest.raises( - ValueError, - match=r"variable 'u' already exists with different dimension sizes", + ValueError, + match=r"variable 'u' already exists with different dimension sizes", ): data2.to_zarr(store, region={"x": slice(3)}, **self.version_kwargs) @@ -3043,7 +3043,7 @@ def test_chunk_encoding_with_partial_dask_chunks(self) -> None: ).chunk({"a": 3}) with self.roundtrip( - original, save_kwargs={"encoding": {"x": {"chunks": [3, 2]}}} + original, save_kwargs={"encoding": {"x": {"chunks": [3, 2]}}} ) as ds1: assert_equal(ds1, original) @@ -3052,7 +3052,7 @@ def test_chunk_encoding_with_larger_dask_chunks(self) -> None: original = xr.Dataset({"a": ("x", [1, 2, 3, 4])}).chunk({"x": 2}) with self.roundtrip( - original, save_kwargs={"encoding": {"a": {"chunks": [1]}}} + original, save_kwargs={"encoding": {"a": 
{"chunks": [1]}}} ) as ds1: assert_equal(ds1, original) @@ -3322,12 +3322,12 @@ def temp_dir(self) -> Iterator[tuple[str, str]]: @contextlib.contextmanager def roundtrip_dir( - self, - data, - store, - save_kwargs=None, - open_kwargs=None, - allow_cleanup_failure=False, + self, + data, + store, + save_kwargs=None, + open_kwargs=None, + allow_cleanup_failure=False, ) -> Iterator[Dataset]: if save_kwargs is None: save_kwargs = {} @@ -3336,14 +3336,14 @@ def roundtrip_dir( data.to_zarr(store, **save_kwargs, **self.version_kwargs) with xr.open_dataset( - store, engine="zarr", **open_kwargs, **self.version_kwargs + store, engine="zarr", **open_kwargs, **self.version_kwargs ) as ds: yield ds @pytest.mark.parametrize("consolidated", [True, False, None]) @pytest.mark.parametrize("write_empty", [True, False, None]) def test_write_empty( - self, consolidated: bool | None, write_empty: bool | None + self, consolidated: bool | None, write_empty: bool | None ) -> None: if write_empty is False: expected = ["0.1.0", "1.1.0"] @@ -3383,9 +3383,9 @@ def test_write_empty( ) with self.roundtrip_dir( - ds, - store, - {"mode": "a", "append_dim": "Z", "write_empty_chunks": write_empty}, + ds, + store, + {"mode": "a", "append_dim": "Z", "write_empty_chunks": write_empty}, ) as a_ds: expected_ds = xr.concat([ds, ds], dim="Z") @@ -3514,7 +3514,7 @@ def create_store(self): @contextlib.contextmanager def roundtrip( - self, data, save_kwargs=None, open_kwargs=None, allow_cleanup_failure=False + self, data, save_kwargs=None, open_kwargs=None, allow_cleanup_failure=False ): if save_kwargs is None: save_kwargs = {} @@ -3582,7 +3582,7 @@ class TestNetCDF3ViaNetCDF4Data(CFEncodedBase, NetCDF3Only): def create_store(self): with create_tmp_file() as tmp_file: with backends.NetCDF4DataStore.open( - tmp_file, mode="w", format="NETCDF3_CLASSIC" + tmp_file, mode="w", format="NETCDF3_CLASSIC" ) as store: yield store @@ -3603,7 +3603,7 @@ class TestNetCDF4ClassicViaNetCDF4Data(CFEncodedBase, NetCDF3Only): def create_store(self): with create_tmp_file() as tmp_file: with backends.NetCDF4DataStore.open( - tmp_file, mode="w", format="NETCDF4_CLASSIC" + tmp_file, mode="w", format="NETCDF4_CLASSIC" ) as store: yield store @@ -3706,7 +3706,7 @@ def test_complex_error(self, invalid_netcdf) -> None: expected = Dataset({"x": ("y", np.ones(5) + 1j * np.ones(5))}) save_kwargs = {"invalid_netcdf": invalid_netcdf} with pytest.raises( - h5netcdf.CompatibilityError, match="are not a supported NetCDF feature" + h5netcdf.CompatibilityError, match="are not a supported NetCDF feature" ): with self.roundtrip(expected, save_kwargs=save_kwargs) as actual: assert_equal(expected, actual) @@ -3821,7 +3821,7 @@ def test_compression_check_encoding_h5py(self) -> None: # Incompatible encodings cause a crash with create_tmp_file() as tmp_file: with pytest.raises( - ValueError, match=r"'zlib' and 'compression' encodings mismatch" + ValueError, match=r"'zlib' and 'compression' encodings mismatch" ): data.to_netcdf( tmp_file, @@ -3831,8 +3831,8 @@ def test_compression_check_encoding_h5py(self) -> None: with create_tmp_file() as tmp_file: with pytest.raises( - ValueError, - match=r"'complevel' and 'compression_opts' encodings mismatch", + ValueError, + match=r"'complevel' and 'compression_opts' encodings mismatch", ): data.to_netcdf( tmp_file, @@ -3929,7 +3929,7 @@ def test_open_badbytes(self) -> None: with open_dataset(b"\211HDF\r\n\032\n", engine="h5netcdf"): # type: ignore[arg-type] pass with pytest.raises( - ValueError, match=r"match in any of xarray's 
currently installed IO" + ValueError, match=r"match in any of xarray's currently installed IO" ): with open_dataset(b"garbage"): # type: ignore[arg-type] pass @@ -3937,7 +3937,7 @@ def test_open_badbytes(self) -> None: with open_dataset(b"garbage", engine="netcdf4"): # type: ignore[arg-type] pass with pytest.raises( - ValueError, match=r"not the signature of a valid netCDF4 file" + ValueError, match=r"not the signature of a valid netCDF4 file" ): with open_dataset(BytesIO(b"garbage"), engine="h5netcdf"): pass @@ -3991,7 +3991,7 @@ def test_open_fileobj(self) -> None: class TestH5NetCDFViaDaskData(TestH5NetCDFData): @contextlib.contextmanager def roundtrip( - self, data, save_kwargs=None, open_kwargs=None, allow_cleanup_failure=False + self, data, save_kwargs=None, open_kwargs=None, allow_cleanup_failure=False ): if save_kwargs is None: save_kwargs = {} @@ -3999,7 +3999,7 @@ def roundtrip( open_kwargs = {} open_kwargs.setdefault("chunks", -1) with TestH5NetCDFData.roundtrip( - self, data, save_kwargs, open_kwargs, allow_cleanup_failure + self, data, save_kwargs, open_kwargs, allow_cleanup_failure ) as ds: yield ds @@ -4035,9 +4035,9 @@ class TestH5NetCDFDataRos3Driver(TestCommon): @pytest.mark.filterwarnings("ignore:Duplicate dimension names") def test_get_variable_list(self) -> None: with open_dataset( - self.test_remote_dataset, - engine="h5netcdf", - backend_kwargs={"driver": "ros3"}, + self.test_remote_dataset, + engine="h5netcdf", + backend_kwargs={"driver": "ros3"}, ) as actual: assert "Temperature" in list(actual) @@ -4050,7 +4050,7 @@ def test_get_variable_list_empty_driver_kwds(self) -> None: backend_kwargs = {"driver": "ros3", "driver_kwds": driver_kwds} with open_dataset( - self.test_remote_dataset, engine="h5netcdf", backend_kwargs=backend_kwargs + self.test_remote_dataset, engine="h5netcdf", backend_kwargs=backend_kwargs ) as actual: assert "Temperature" in list(actual) @@ -4114,7 +4114,7 @@ def skip_if_not_engine(engine): reason="Flaky test which can cause the worker to crash (so don't xfail). 
Very open to contributions fixing this" ) def test_open_mfdataset_manyfiles( - readengine, nfiles, parallel, chunks, file_cache_maxsize + readengine, nfiles, parallel, chunks, file_cache_maxsize ): # skip certain combinations skip_if_not_engine(readengine) @@ -4133,12 +4133,12 @@ def test_open_mfdataset_manyfiles( # check that calculation on opened datasets works properly with open_mfdataset( - tmpfiles, - combine="nested", - concat_dim="x", - engine=readengine, - parallel=parallel, - chunks=chunks if (not chunks and readengine != "zarr") else "auto", + tmpfiles, + combine="nested", + concat_dim="x", + engine=readengine, + parallel=parallel, + chunks=chunks if (not chunks and readengine != "zarr") else "auto", ) as actual: # check that using open_mfdataset returns dask arrays for variables assert isinstance(actual["foo"].data, dask_array_type) @@ -4175,7 +4175,7 @@ def test_open_mfdataset_list_attr() -> None: with open_dataset(nfiles[1]) as ds2: original = xr.concat([ds1, ds2], dim="x") with xr.open_mfdataset( - [nfiles[0], nfiles[1]], combine="nested", concat_dim="x" + [nfiles[0], nfiles[1]], combine="nested", concat_dim="x" ) as actual: assert_identical(actual, original) @@ -4230,13 +4230,13 @@ def gen_datasets_with_common_coord_and_time(self): @pytest.mark.parametrize("opt", ["all", "minimal", "different"]) @pytest.mark.parametrize("join", ["outer", "inner", "left", "right"]) def test_open_mfdataset_does_same_as_concat( - self, combine, concat_dim, opt, join + self, combine, concat_dim, opt, join ) -> None: with self.setup_files_and_datasets() as (files, [ds1, ds2]): if combine == "by_coords": files.reverse() with open_mfdataset( - files, data_vars=opt, combine=combine, concat_dim=concat_dim, join=join + files, data_vars=opt, combine=combine, concat_dim=concat_dim, join=join ) as ds: ds_expect = xr.concat([ds1, ds2], data_vars=opt, dim="t", join=join) assert_identical(ds, ds_expect) @@ -4244,31 +4244,31 @@ def test_open_mfdataset_does_same_as_concat( @pytest.mark.parametrize( ["combine_attrs", "attrs", "expected", "expect_error"], ( - pytest.param("drop", [{"a": 1}, {"a": 2}], {}, False, id="drop"), - pytest.param( - "override", [{"a": 1}, {"a": 2}], {"a": 1}, False, id="override" - ), - pytest.param( - "no_conflicts", [{"a": 1}, {"a": 2}], None, True, id="no_conflicts" - ), - pytest.param( - "identical", - [{"a": 1, "b": 2}, {"a": 1, "c": 3}], - None, - True, - id="identical", - ), - pytest.param( - "drop_conflicts", - [{"a": 1, "b": 2}, {"b": -1, "c": 3}], - {"a": 1, "c": 3}, - False, - id="drop_conflicts", - ), + pytest.param("drop", [{"a": 1}, {"a": 2}], {}, False, id="drop"), + pytest.param( + "override", [{"a": 1}, {"a": 2}], {"a": 1}, False, id="override" + ), + pytest.param( + "no_conflicts", [{"a": 1}, {"a": 2}], None, True, id="no_conflicts" + ), + pytest.param( + "identical", + [{"a": 1, "b": 2}, {"a": 1, "c": 3}], + None, + True, + id="identical", + ), + pytest.param( + "drop_conflicts", + [{"a": 1, "b": 2}, {"b": -1, "c": 3}], + {"a": 1, "c": 3}, + False, + id="drop_conflicts", + ), ), ) def test_open_mfdataset_dataset_combine_attrs( - self, combine_attrs, attrs, expected, expect_error + self, combine_attrs, attrs, expected, expect_error ): with self.setup_files_and_datasets() as (files, [ds1, ds2]): # Give the files an inconsistent attribute @@ -4288,10 +4288,10 @@ def test_open_mfdataset_dataset_combine_attrs( ) else: with xr.open_mfdataset( - files, - combine="nested", - concat_dim="t", - combine_attrs=combine_attrs, + files, + combine="nested", + concat_dim="t", + 
combine_attrs=combine_attrs, ) as ds: assert ds.attrs == expected @@ -4330,13 +4330,13 @@ def test_open_mfdataset_dataarray_attr_by_coords(self) -> None: ) @pytest.mark.parametrize("opt", ["all", "minimal", "different"]) def test_open_mfdataset_exact_join_raises_error( - self, combine, concat_dim, opt + self, combine, concat_dim, opt ) -> None: with self.setup_files_and_datasets(fuzz=0.1) as (files, [ds1, ds2]): if combine == "by_coords": files.reverse() with pytest.raises( - ValueError, match=r"cannot align objects.*join.*exact.*" + ValueError, match=r"cannot align objects.*join.*exact.*" ): open_mfdataset( files, @@ -4352,7 +4352,7 @@ def test_common_coord_when_datavars_all(self) -> None: with self.setup_files_and_datasets() as (files, [ds1, ds2]): # open the files with the data_var option with open_mfdataset( - files, data_vars=opt, combine="nested", concat_dim="t" + files, data_vars=opt, combine="nested", concat_dim="t" ) as ds: coord_shape = ds[self.coord_name].shape coord_shape1 = ds1[self.coord_name].shape @@ -4370,7 +4370,7 @@ def test_common_coord_when_datavars_minimal(self) -> None: with self.setup_files_and_datasets() as (files, [ds1, ds2]): # open the files using data_vars option with open_mfdataset( - files, data_vars=opt, combine="nested", concat_dim="t" + files, data_vars=opt, combine="nested", concat_dim="t" ) as ds: coord_shape = ds[self.coord_name].shape coord_shape1 = ds1[self.coord_name].shape @@ -4404,7 +4404,7 @@ def create_store(self): @contextlib.contextmanager def roundtrip( - self, data, save_kwargs=None, open_kwargs=None, allow_cleanup_failure=False + self, data, save_kwargs=None, open_kwargs=None, allow_cleanup_failure=False ): yield data.chunk() @@ -4460,13 +4460,13 @@ def test_open_mfdataset(self) -> None: original.isel(x=slice(5)).to_netcdf(tmp1) original.isel(x=slice(5, 10)).to_netcdf(tmp2) with open_mfdataset( - [tmp1, tmp2], concat_dim="x", combine="nested" + [tmp1, tmp2], concat_dim="x", combine="nested" ) as actual: assert isinstance(actual.foo.variable.data, da.Array) assert actual.foo.variable.data.chunks == ((5, 5),) assert_identical(original, actual) with open_mfdataset( - [tmp1, tmp2], concat_dim="x", combine="nested", chunks={"x": 3} + [tmp1, tmp2], concat_dim="x", combine="nested", chunks={"x": 3} ) as actual: assert actual.foo.variable.data.chunks == ((3, 2, 3, 2),) @@ -4494,18 +4494,18 @@ def test_open_mfdataset_2d(self) -> None: original.isel(x=slice(5), y=slice(4, 8)).to_netcdf(tmp3) original.isel(x=slice(5, 10), y=slice(4, 8)).to_netcdf(tmp4) with open_mfdataset( - [[tmp1, tmp2], [tmp3, tmp4]], - combine="nested", - concat_dim=["y", "x"], + [[tmp1, tmp2], [tmp3, tmp4]], + combine="nested", + concat_dim=["y", "x"], ) as actual: assert isinstance(actual.foo.variable.data, da.Array) assert actual.foo.variable.data.chunks == ((5, 5), (4, 4)) assert_identical(original, actual) with open_mfdataset( - [[tmp1, tmp2], [tmp3, tmp4]], - combine="nested", - concat_dim=["y", "x"], - chunks={"x": 3, "y": 2}, + [[tmp1, tmp2], [tmp3, tmp4]], + combine="nested", + concat_dim=["y", "x"], + chunks={"x": 3, "y": 2}, ) as actual: assert actual.foo.variable.data.chunks == ( (3, 2, 3, 2), @@ -4521,7 +4521,7 @@ def test_open_mfdataset_pathlib(self) -> None: original.isel(x=slice(5)).to_netcdf(tmp1) original.isel(x=slice(5, 10)).to_netcdf(tmp2) with open_mfdataset( - [tmp1, tmp2], concat_dim="x", combine="nested" + [tmp1, tmp2], concat_dim="x", combine="nested" ) as actual: assert_identical(original, actual) @@ -4540,9 +4540,9 @@ def 
test_open_mfdataset_2d_pathlib(self) -> None: original.isel(x=slice(5), y=slice(4, 8)).to_netcdf(tmp3) original.isel(x=slice(5, 10), y=slice(4, 8)).to_netcdf(tmp4) with open_mfdataset( - [[tmp1, tmp2], [tmp3, tmp4]], - combine="nested", - concat_dim=["y", "x"], + [[tmp1, tmp2], [tmp3, tmp4]], + combine="nested", + concat_dim=["y", "x"], ) as actual: assert_identical(original, actual) @@ -4554,7 +4554,7 @@ def test_open_mfdataset_2(self) -> None: original.isel(x=slice(5, 10)).to_netcdf(tmp2) with open_mfdataset( - [tmp1, tmp2], concat_dim="x", combine="nested" + [tmp1, tmp2], concat_dim="x", combine="nested" ) as actual: assert_identical(original, actual) @@ -4569,7 +4569,7 @@ def test_attrs_mfdataset(self) -> None: ds1.to_netcdf(tmp1) ds2.to_netcdf(tmp2) with open_mfdataset( - [tmp1, tmp2], concat_dim="x", combine="nested" + [tmp1, tmp2], concat_dim="x", combine="nested" ) as actual: # presumes that attributes inherited from # first dataset loaded @@ -4588,7 +4588,7 @@ def test_open_mfdataset_attrs_file(self) -> None: ds1.to_netcdf(tmp1) ds2.to_netcdf(tmp2) with open_mfdataset( - [tmp1, tmp2], concat_dim="x", combine="nested", attrs_file=tmp2 + [tmp1, tmp2], concat_dim="x", combine="nested", attrs_file=tmp2 ) as actual: # attributes are inherited from the master file assert actual.attrs["test2"] == ds2.attrs["test2"] @@ -4607,7 +4607,7 @@ def test_open_mfdataset_attrs_file_path(self) -> None: ds1.to_netcdf(tmp1) ds2.to_netcdf(tmp2) with open_mfdataset( - [tmp1, tmp2], concat_dim="x", combine="nested", attrs_file=tmp2 + [tmp1, tmp2], concat_dim="x", combine="nested", attrs_file=tmp2 ) as actual: # attributes are inherited from the master file assert actual.attrs["test2"] == ds2.attrs["test2"] @@ -4666,7 +4666,7 @@ def preprocess(ds): expected = preprocess(original) with open_mfdataset( - tmp, preprocess=preprocess, combine="by_coords" + tmp, preprocess=preprocess, combine="by_coords" ) as actual: assert_identical(expected, actual) @@ -4677,7 +4677,7 @@ def test_save_mfdataset_roundtrip(self) -> None: with create_tmp_file() as tmp2: save_mfdataset(datasets, [tmp1, tmp2]) with open_mfdataset( - [tmp1, tmp2], concat_dim="x", combine="nested" + [tmp1, tmp2], concat_dim="x", combine="nested" ) as actual: assert_identical(actual, original) @@ -4703,7 +4703,7 @@ def test_save_mfdataset_pathlib_roundtrip(self) -> None: tmp2 = Path(tmps2) save_mfdataset(datasets, [tmp1, tmp2]) with open_mfdataset( - [tmp1, tmp2], concat_dim="x", combine="nested" + [tmp1, tmp2], concat_dim="x", combine="nested" ) as actual: assert_identical(actual, original) @@ -4745,7 +4745,7 @@ def test_open_mfdataset_concat_dim_none(self) -> None: data.to_netcdf(tmp1) Dataset({"x": np.nan}).to_netcdf(tmp2) with open_mfdataset( - [tmp1, tmp2], concat_dim=None, combine="nested" + [tmp1, tmp2], concat_dim=None, combine="nested" ) as actual: assert_identical(data, actual) @@ -4807,7 +4807,7 @@ def test_open_multi_dataset(self) -> None: original.to_netcdf(tmp1) original.to_netcdf(tmp2) with open_mfdataset( - [tmp1, tmp2], concat_dim=dim, combine="nested" + [tmp1, tmp2], concat_dim=dim, combine="nested" ) as actual: assert_identical(expected, actual) @@ -4861,7 +4861,7 @@ def test_save_mfdataset_compute_false_roundtrip(self) -> None: assert isinstance(delayed_obj, Delayed) delayed_obj.compute() with open_mfdataset( - [tmp1, tmp2], combine="nested", concat_dim="x" + [tmp1, tmp2], combine="nested", concat_dim="x" ) as actual: assert_identical(actual, original) @@ -5339,7 +5339,7 @@ def 
test_use_cftime_standard_calendar_default_in_range(calendar) -> None: @pytest.mark.parametrize("calendar", _STANDARD_CALENDARS) @pytest.mark.parametrize("units_year", [1500, 2500]) def test_use_cftime_standard_calendar_default_out_of_range( - calendar, units_year + calendar, units_year ) -> None: import cftime @@ -5525,7 +5525,7 @@ def test_open_fsspec() -> None: import zarr if not hasattr(zarr.storage, "FSStore") or not hasattr( - zarr.storage.FSStore, "getitems" + zarr.storage.FSStore, "getitems" ): pytest.skip("zarr too old") @@ -5608,7 +5608,7 @@ def test_open_dataset_chunking_zarr(chunks, tmp_path: Path) -> None: with dask.config.set({"array.chunk-size": "1MiB"}): expected = ds.chunk(chunks) with open_dataset( - tmp_path / "test.zarr", engine="zarr", chunks=chunks + tmp_path / "test.zarr", engine="zarr", chunks=chunks ) as actual: xr.testing.assert_chunks_equal(actual, expected) @@ -5639,7 +5639,7 @@ def test_chunking_consintency(chunks, tmp_path: Path) -> None: with dask.config.set({"array.chunk-size": "1MiB"}): expected = ds.chunk(chunks) with xr.open_dataset( - tmp_path / "test.zarr", engine="zarr", chunks=chunks + tmp_path / "test.zarr", engine="zarr", chunks=chunks ) as actual: xr.testing.assert_chunks_equal(actual, expected) @@ -5733,7 +5733,7 @@ def test_h5netcdf_entrypoint(tmp_path: Path) -> None: @requires_netCDF4 @pytest.mark.parametrize("str_type", (str, np.str_)) def test_write_file_from_np_str( - str_type: type[str] | type[np.str_], tmpdir: str + str_type: type[str] | type[np.str_], tmpdir: str ) -> None: # https://github.com/pydata/xarray/pull/5264 scenarios = [str_type(v) for v in ["scenario_a", "scenario_b", "scenario_c"]] @@ -5799,7 +5799,7 @@ def test_raise_writing_to_nczarr(self, mode) -> None: with create_tmp_file(suffix=".zarr") as tmp: ds = self._create_nczarr(tmp) with pytest.raises( - KeyError, match="missing the attribute `_ARRAY_DIMENSIONS`," + KeyError, match="missing the attribute `_ARRAY_DIMENSIONS`," ): ds.to_zarr(tmp, mode=mode) @@ -5948,10 +5948,10 @@ def test_zarr_region_index_write(self, tmp_path): region: Mapping[str, slice] | Literal["auto"] for region in [region_slice, "auto"]: # type: ignore[assignment] with patch.object( - ZarrStore, - "set_variables", - side_effect=ZarrStore.set_variables, - autospec=True, + ZarrStore, + "set_variables", + side_effect=ZarrStore.set_variables, + autospec=True, ) as mock: ds_region.to_zarr(tmp_path / "test.zarr", region=region, mode="r+")
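As a rough illustration of the region-write behaviour the zarr tests above exercise (this sketch is not part of the patch; the store path, array sizes and chunking below are invented for the example), the user-facing API looks roughly like this:

    import numpy as np
    import xarray as xr

    store = "example.zarr"  # hypothetical path, chosen only for the sketch

    # Create a store whose zarr chunks are 3 elements long.
    base = xr.Dataset({"var": ("x", np.zeros(10))}).chunk({"x": 3})
    base.to_zarr(store, mode="w", encoding={"var": {"chunks": (3,)}})

    # Overwrite a region that starts inside a zarr chunk. With the default
    # safe_chunks=True such a write is expected to be validated (and rejected
    # when the dask chunks of the region straddle zarr chunk borders);
    # passing safe_chunks=False opts out of that validation at the cost of
    # possible data corruption under parallel writes.
    update = xr.Dataset({"var": ("x", np.ones(5))}).chunk({"x": 5})
    update.to_zarr(store, region={"x": slice(2, 7)}, safe_chunks=False)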