From 120cfb6dd4a9aa8dac5d27a23af38273b9447210 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Mon, 4 May 2020 17:58:44 -0700 Subject: [PATCH 1/2] Support overriding existing variables in to_zarr() without appending This should be useful for cases where users want to update values in existing Zarr datasets. --- doc/whats-new.rst | 4 ++- xarray/backends/api.py | 35 ++++++++++++++----- xarray/backends/zarr.py | 31 ++++++++++------- xarray/tests/test_backends.py | 65 +++++++++++++---------------------- 4 files changed, 72 insertions(+), 63 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 051a41a57e5..d2087bc7d1f 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -49,7 +49,9 @@ New Features By `Todd Jennings `_ - More support for unit aware arrays with pint (:pull:`3643`) By `Justus Magin `_. - +- Support overriding existing variables in ``to_zarr()`` with ``mode='a'`` even + without ``append_dim``, as long as dimension sizes do not change. + By `Stephan Hoyer `_. - Allow plotting of boolean arrays. 
(:pull:`3766`) By `Marek Jacob `_ - A ``days_in_month`` accessor for :py:class:`xarray.CFTimeIndex`, analogous to diff --git a/xarray/backends/api.py b/xarray/backends/api.py index c7481e22b59..184aad579a2 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -1279,18 +1279,35 @@ def _validate_append_dim_and_encoding( return if append_dim: if append_dim not in ds.dims: - raise ValueError(f"{append_dim} not a valid dimension in the Dataset") - for data_var in ds_to_append: - if data_var in ds: - if append_dim is None: + raise ValueError( + f"append_dim={append_dim!r} does not match any existing " + f"dataset dimensions {ds.dims}" + ) + for var_name in ds_to_append: + if var_name in ds: + if ds_to_append[var_name].dims != ds[var_name].dims: + raise ValueError( + f"variable {var_name!r} already exists with different " + f"dimension names {ds[var_name].dims} != " + f"{ds_to_append[var_name].dims}, but changing variable " + "dimensions is not supported by to_zarr()." + ) + existing_sizes = { + k: v for k, v in ds[var_name].sizes.items() if k != append_dim + } + new_sizes = { + k: v for k, v in ds_to_append[var_name].sizes.items() if k != append_dim + } + if existing_sizes != new_sizes: raise ValueError( - "variable '{}' already exists, but append_dim " - "was not set".format(data_var) + f"variable {var_name!r} already exists with different " + f"dimension sizes: {existing_sizes} != {new_sizes}. " + "to_zarr() only supports changing dimension sizes when " + f"explicitly appending, but append_dim={append_dim!r}." 
) - if data_var in encoding.keys(): + if var_name in encoding.keys(): raise ValueError( - "variable '{}' already exists, but encoding was" - "provided".format(data_var) + f"variable {var_name!r} already exists, but encoding was provided" ) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 973c167911e..de6b627447e 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -445,18 +445,23 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No fill_value = attrs.pop("_FillValue", None) if v.encoding == {"_FillValue": None} and fill_value is None: v.encoding = {} - if name in self.ds: + + if self.append_dim is not None and self.append_dim in dims: + # resize existing variable zarr_array = self.ds[name] - if self.append_dim in dims: - # this is the DataArray that has append_dim as a - # dimension - append_axis = dims.index(self.append_dim) - new_shape = list(zarr_array.shape) - new_shape[append_axis] += v.shape[append_axis] - new_region = [slice(None)] * len(new_shape) - new_region[append_axis] = slice(zarr_array.shape[append_axis], None) - zarr_array.resize(new_shape) - writer.add(v.data, zarr_array, region=tuple(new_region)) + append_axis = dims.index(self.append_dim) + + new_region = [slice(None)] * len(dims) + new_region[append_axis] = slice(zarr_array.shape[append_axis], None) + region = tuple(new_region) + + new_shape = list(zarr_array.shape) + new_shape[append_axis] += v.shape[append_axis] + zarr_array.resize(new_shape) + elif name in self.ds: + # override existing variable + zarr_array = self.ds[name] + region = None else: # new variable encoding = extract_zarr_variable_encoding( @@ -474,7 +479,9 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No name, shape=shape, dtype=dtype, fill_value=fill_value, **encoding ) zarr_array.attrs.put(encoded_attrs) - writer.add(v.data, zarr_array) + region = None + + writer.add(v.data, zarr_array, region=region) def close(self): if 
self._consolidate_on_close: diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 916c29ba7bd..90deea51d2a 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1526,12 +1526,6 @@ def roundtrip( with self.open(store_target, **open_kwargs) as ds: yield ds - @contextlib.contextmanager - def roundtrip_append( - self, data, save_kwargs=None, open_kwargs=None, allow_cleanup_failure=False - ): - pytest.skip("zarr backend does not support appending") - def test_roundtrip_consolidated(self): pytest.importorskip("zarr", minversion="2.2.1.dev2") expected = create_test_data() @@ -1826,7 +1820,7 @@ def test_encoding_kwarg_fixed_width_string(self): # not relevant for zarr, since we don't use EncodedStringCoder pass - # TODO: someone who understand caching figure out whether chaching + # TODO: someone who understand caching figure out whether caching # makes sense for Zarr backend @pytest.mark.xfail(reason="Zarr caching not implemented") def test_dataset_caching(self): @@ -1834,55 +1828,44 @@ def test_dataset_caching(self): @pytest.mark.skipif(LooseVersion(dask_version) < "2.4", reason="dask GH5334") def test_append_write(self): - ds, ds_to_append, _ = create_append_test_data() - with self.create_zarr_target() as store_target: - ds.to_zarr(store_target, mode="w") - ds_to_append.to_zarr(store_target, append_dim="time") - original = xr.concat([ds, ds_to_append], dim="time") - assert_identical(original, xr.open_zarr(store_target)) - - @pytest.mark.xfail(reason="Zarr stores can not be appended to") - def test_append_overwrite_values(self): - super().test_append_overwrite_values() + super().test_append_write() def test_append_with_invalid_dim_raises(self): - ds, ds_to_append, _ = create_append_test_data() - - # check failure when append_dim not valid - with pytest.raises(ValueError): - with self.create_zarr_target() as store_target: - ds.to_zarr(store_target, mode="w") + with self.create_zarr_target() as store_target: + 
ds.to_zarr(store_target, mode="w") + with pytest.raises( + ValueError, match="does not match any existing dataset dimensions" + ): ds_to_append.to_zarr(store_target, append_dim="notvalid") - def test_append_with_append_dim_not_set_raises(self): + def test_append_with_no_dims_raises(self): + with self.create_zarr_target() as store_target: + Dataset({"foo": ("x", [1])}).to_zarr(store_target, mode="w") + with pytest.raises(ValueError, match="different dimension names"): + Dataset({"foo": ("y", [2])}).to_zarr(store_target, mode="a") + def test_append_with_append_dim_not_set_raises(self): ds, ds_to_append, _ = create_append_test_data() - - # check failure when append_dim not set - with pytest.raises(ValueError): - with self.create_zarr_target() as store_target: - ds.to_zarr(store_target, mode="w") + with self.create_zarr_target() as store_target: + ds.to_zarr(store_target, mode="w") + with pytest.raises(ValueError, match="different dimension sizes"): ds_to_append.to_zarr(store_target, mode="a") def test_append_with_mode_not_a_raises(self): - ds, ds_to_append, _ = create_append_test_data() - - # check failure when append_dim is set and mode != 'a' - with pytest.raises(ValueError): - with self.create_zarr_target() as store_target: - ds.to_zarr(store_target, mode="w") + with self.create_zarr_target() as store_target: + ds.to_zarr(store_target, mode="w") + with pytest.raises( + ValueError, match="append_dim was set along with mode='w'" + ): ds_to_append.to_zarr(store_target, mode="w", append_dim="time") def test_append_with_existing_encoding_raises(self): - ds, ds_to_append, _ = create_append_test_data() - - # check failure when providing encoding to existing variable - with pytest.raises(ValueError): - with self.create_zarr_target() as store_target: - ds.to_zarr(store_target, mode="w") + with self.create_zarr_target() as store_target: + ds.to_zarr(store_target, mode="w") + with pytest.raises(ValueError, match="but encoding was provided"): ds_to_append.to_zarr( 
store_target, append_dim="time", From 104f9514a704d259f077132de47b6d8c89ab1d93 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Tue, 5 May 2020 09:46:01 -0700 Subject: [PATCH 2/2] Update docstring for to_zarr --- xarray/core/dataset.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 01dda828d8a..2a8b7bdbb9a 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1579,7 +1579,7 @@ def to_zarr( mode : {'w', 'w-', 'a', None} Persistence mode: 'w' means create (overwrite if exists); 'w-' means create (fail if exists); - 'a' means append (create if does not exist). + 'a' means override existing variables (create if does not exist). If ``append_dim`` is set, ``mode`` can be omitted as it is internally set to ``'a'``. Otherwise, ``mode`` will default to `w-` if not set. @@ -1598,7 +1598,8 @@ def to_zarr( If True, apply zarr's `consolidate_metadata` function to the store after writing. append_dim: hashable, optional - If set, the dimension on which the data will be appended. + If set, the dimension along which the data will be appended. All + other dimensions on overridden variables must remain the same size. References ---------- @@ -1766,7 +1767,7 @@ def maybe_chunk(name, var, chunks): return self._replace(variables) def _validate_indexers( - self, indexers: Mapping[Hashable, Any], missing_dims: str = "raise", + self, indexers: Mapping[Hashable, Any], missing_dims: str = "raise" ) -> Iterator[Tuple[Hashable, Union[int, slice, np.ndarray, Variable]]]: """ Here we make sure + indexer has a valid keys @@ -5933,7 +5934,7 @@ def polyfit( "The number of data points must exceed order to scale the covariance matrix." 
) fac = residuals / (x.shape[0] - order) - covariance = xr.DataArray(Vbase, dims=("cov_i", "cov_j"),) * fac + covariance = xr.DataArray(Vbase, dims=("cov_i", "cov_j")) * fac variables[name + "polyfit_covariance"] = covariance return Dataset(data_vars=variables, attrs=self.attrs.copy()) @@ -6199,7 +6200,7 @@ def idxmin( skipna=skipna, fill_value=fill_value, keep_attrs=keep_attrs, - ), + ) ) def idxmax( @@ -6297,7 +6298,7 @@ def idxmax( skipna=skipna, fill_value=fill_value, keep_attrs=keep_attrs, - ), + ) )