Support overriding existing variables in to_zarr() without appending #4029

Merged (2 commits, May 5, 2020)
4 changes: 3 additions & 1 deletion doc/whats-new.rst
@@ -49,7 +49,9 @@ New Features
By `Todd Jennings <https://github.com/toddrjen>`_
- More support for unit aware arrays with pint (:pull:`3643`)
By `Justus Magin <https://github.com/keewis>`_.

- Support overriding existing variables in ``to_zarr()`` with ``mode='a'`` even
without ``append_dim``, as long as dimension sizes do not change.
By `Stephan Hoyer <https://github.com/shoyer>`_.
- Allow plotting of boolean arrays. (:pull:`3766`)
By `Marek Jacob <https://github.com/MeraX>`_
- A ``days_in_month`` accessor for :py:class:`xarray.CFTimeIndex`, analogous to
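As a quick illustration of the entry above, here is a minimal sketch of the new override behaviour. The store path and variable names are made up for the example; only the to_zarr() calls reflect the documented API.

import numpy as np
import xarray as xr

# Write an initial dataset to a zarr store.
xr.Dataset({"foo": ("x", np.arange(4))}).to_zarr("example.zarr", mode="w")

# With mode="a", an existing variable can now be overridden in place,
# without append_dim, as long as its dimension sizes are unchanged.
updated = xr.Dataset({"foo": ("x", np.arange(4) * 10)})
updated.to_zarr("example.zarr", mode="a")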
35 changes: 26 additions & 9 deletions xarray/backends/api.py
@@ -1279,18 +1279,35 @@ def _validate_append_dim_and_encoding(
return
if append_dim:
if append_dim not in ds.dims:
raise ValueError(f"{append_dim} not a valid dimension in the Dataset")
for data_var in ds_to_append:
if data_var in ds:
if append_dim is None:
raise ValueError(
f"append_dim={append_dim!r} does not match any existing "
f"dataset dimensions {ds.dims}"
)
for var_name in ds_to_append:
if var_name in ds:
if ds_to_append[var_name].dims != ds[var_name].dims:
raise ValueError(
f"variable {var_name!r} already exists with different "
f"dimension names {ds[var_name].dims} != "
f"{ds_to_append[var_name].dims}, but changing variable "
"dimensions is not supported by to_zarr()."
)
existing_sizes = {
k: v for k, v in ds[var_name].sizes.items() if k != append_dim
}
new_sizes = {
k: v for k, v in ds_to_append[var_name].sizes.items() if k != append_dim
}
if existing_sizes != new_sizes:
raise ValueError(
"variable '{}' already exists, but append_dim "
"was not set".format(data_var)
f"variable {var_name!r} already exists with different "
"dimension sizes: {existing_sizes} != {new_sizes}. "
"to_zarr() only supports changing dimension sizes when "
f"explicitly appending, but append_dim={append_dim!r}."
)
if data_var in encoding.keys():
if var_name in encoding.keys():
raise ValueError(
"variable '{}' already exists, but encoding was"
"provided".format(data_var)
f"variable {var_name!r} already exists, but encoding was provided"
)


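A hedged sketch of what the stricter validation in _validate_append_dim_and_encoding rejects. The store path and datasets are illustrative, and the comments paraphrase the error messages added above.

import numpy as np
import xarray as xr

xr.Dataset({"foo": ("x", np.arange(3))}).to_zarr("store.zarr", mode="w")

# Same variable, same dimension name, different size, no append_dim:
# raises ValueError("... already exists with different dimension sizes ...").
try:
    xr.Dataset({"foo": ("x", np.arange(5))}).to_zarr("store.zarr", mode="a")
except ValueError as err:
    print(err)

# Same variable but a different dimension name:
# raises ValueError("... already exists with different dimension names ...").
try:
    xr.Dataset({"foo": ("y", np.arange(3))}).to_zarr("store.zarr", mode="a")
except ValueError as err:
    print(err)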
31 changes: 19 additions & 12 deletions xarray/backends/zarr.py
@@ -445,18 +445,23 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=None):
fill_value = attrs.pop("_FillValue", None)
if v.encoding == {"_FillValue": None} and fill_value is None:
v.encoding = {}
if name in self.ds:

if self.append_dim is not None and self.append_dim in dims:
# resize existing variable
zarr_array = self.ds[name]
if self.append_dim in dims:
# this is the DataArray that has append_dim as a
# dimension
append_axis = dims.index(self.append_dim)
new_shape = list(zarr_array.shape)
new_shape[append_axis] += v.shape[append_axis]
new_region = [slice(None)] * len(new_shape)
new_region[append_axis] = slice(zarr_array.shape[append_axis], None)
zarr_array.resize(new_shape)
writer.add(v.data, zarr_array, region=tuple(new_region))
append_axis = dims.index(self.append_dim)

new_region = [slice(None)] * len(dims)
new_region[append_axis] = slice(zarr_array.shape[append_axis], None)
region = tuple(new_region)

new_shape = list(zarr_array.shape)
new_shape[append_axis] += v.shape[append_axis]
zarr_array.resize(new_shape)
elif name in self.ds:
# override existing variable
zarr_array = self.ds[name]
region = None
else:
# new variable
encoding = extract_zarr_variable_encoding(
@@ -474,7 +479,9 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=None):
name, shape=shape, dtype=dtype, fill_value=fill_value, **encoding
)
zarr_array.attrs.put(encoded_attrs)
writer.add(v.data, zarr_array)
region = None

writer.add(v.data, zarr_array, region=region)

def close(self):
if self._consolidate_on_close:
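For orientation, the branch above now separates three cases: when append_dim is one of the variable's dimensions, the zarr array is resized and only the appended region is written; when the name already exists but no append is requested, the whole array is overwritten (the region=None path); otherwise a new array is created. The following standalone snippet mimics the resize-and-region step with the zarr 2.x API directly; the array name, data, and chunking are assumptions for illustration, not code from this PR.

import numpy as np
import zarr

group = zarr.group()  # in-memory group
arr = group.create_dataset("t", data=np.arange(4), chunks=(2,))

# Append: grow the array along axis 0, then write only the new slice.
new_data = np.arange(4, 8)
region = (slice(arr.shape[0], None),)
arr.resize((arr.shape[0] + new_data.shape[0],))
arr[region] = new_data

# Override: write over the full, unchanged extent (the region=None path).
arr[:4] = np.arange(4) * 10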
13 changes: 7 additions & 6 deletions xarray/core/dataset.py
@@ -1579,7 +1579,7 @@ def to_zarr(
mode : {'w', 'w-', 'a', None}
Persistence mode: 'w' means create (overwrite if exists);
'w-' means create (fail if exists);
'a' means append (create if does not exist).
'a' means override existing variables (create if does not exist).
If ``append_dim`` is set, ``mode`` can be omitted as it is
internally set to ``'a'``. Otherwise, ``mode`` will default to
`w-` if not set.
@@ -1598,7 +1598,8 @@
If True, apply zarr's `consolidate_metadata` function to the store
after writing.
append_dim: hashable, optional
If set, the dimension on which the data will be appended.
If set, the dimension along which the data will be appended. All
other dimensions on overridden variables must remain the same size.

References
----------
@@ -1766,7 +1767,7 @@ def maybe_chunk(name, var, chunks):
return self._replace(variables)

def _validate_indexers(
self, indexers: Mapping[Hashable, Any], missing_dims: str = "raise",
self, indexers: Mapping[Hashable, Any], missing_dims: str = "raise"
) -> Iterator[Tuple[Hashable, Union[int, slice, np.ndarray, Variable]]]:
""" Here we make sure
+ indexer has a valid keys
@@ -5933,7 +5934,7 @@ def polyfit(
"The number of data points must exceed order to scale the covariance matrix."
)
fac = residuals / (x.shape[0] - order)
covariance = xr.DataArray(Vbase, dims=("cov_i", "cov_j"),) * fac
covariance = xr.DataArray(Vbase, dims=("cov_i", "cov_j")) * fac
variables[name + "polyfit_covariance"] = covariance

return Dataset(data_vars=variables, attrs=self.attrs.copy())
@@ -6199,7 +6200,7 @@ def idxmin(
skipna=skipna,
fill_value=fill_value,
keep_attrs=keep_attrs,
),
)
)

def idxmax(
@@ -6297,7 +6298,7 @@ def idxmax(
skipna=skipna,
fill_value=fill_value,
keep_attrs=keep_attrs,
),
)
)


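To complement the docstring changes above: when append_dim is given, mode='a' is implied and the data is concatenated along that dimension, while all other dimension sizes must match. A small sketch with made-up times and a local store path:

import numpy as np
import pandas as pd
import xarray as xr

first = xr.Dataset(
    {"foo": ("time", np.arange(3))},
    coords={"time": pd.date_range("2020-01-01", periods=3)},
)
second = xr.Dataset(
    {"foo": ("time", np.arange(3, 6))},
    coords={"time": pd.date_range("2020-01-04", periods=3)},
)

first.to_zarr("series.zarr", mode="w")
second.to_zarr("series.zarr", append_dim="time")  # mode="a" is implied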
65 changes: 24 additions & 41 deletions xarray/tests/test_backends.py
@@ -1526,12 +1526,6 @@ def roundtrip(
with self.open(store_target, **open_kwargs) as ds:
yield ds

@contextlib.contextmanager
def roundtrip_append(
self, data, save_kwargs=None, open_kwargs=None, allow_cleanup_failure=False
):
pytest.skip("zarr backend does not support appending")

def test_roundtrip_consolidated(self):
pytest.importorskip("zarr", minversion="2.2.1.dev2")
expected = create_test_data()
@@ -1826,63 +1820,52 @@ def test_encoding_kwarg_fixed_width_string(self):
# not relevant for zarr, since we don't use EncodedStringCoder
pass

# TODO: someone who understand caching figure out whether chaching
# TODO: someone who understand caching figure out whether caching
# makes sense for Zarr backend
@pytest.mark.xfail(reason="Zarr caching not implemented")
def test_dataset_caching(self):
super().test_dataset_caching()

@pytest.mark.skipif(LooseVersion(dask_version) < "2.4", reason="dask GH5334")
def test_append_write(self):
ds, ds_to_append, _ = create_append_test_data()
with self.create_zarr_target() as store_target:
ds.to_zarr(store_target, mode="w")
ds_to_append.to_zarr(store_target, append_dim="time")
original = xr.concat([ds, ds_to_append], dim="time")
assert_identical(original, xr.open_zarr(store_target))

@pytest.mark.xfail(reason="Zarr stores can not be appended to")
def test_append_overwrite_values(self):
super().test_append_overwrite_values()
super().test_append_write()

def test_append_with_invalid_dim_raises(self):

ds, ds_to_append, _ = create_append_test_data()

# check failure when append_dim not valid
with pytest.raises(ValueError):
with self.create_zarr_target() as store_target:
ds.to_zarr(store_target, mode="w")
with self.create_zarr_target() as store_target:
ds.to_zarr(store_target, mode="w")
with pytest.raises(
ValueError, match="does not match any existing dataset dimensions"
):
ds_to_append.to_zarr(store_target, append_dim="notvalid")

def test_append_with_append_dim_not_set_raises(self):
def test_append_with_no_dims_raises(self):
with self.create_zarr_target() as store_target:
Dataset({"foo": ("x", [1])}).to_zarr(store_target, mode="w")
with pytest.raises(ValueError, match="different dimension names"):
Dataset({"foo": ("y", [2])}).to_zarr(store_target, mode="a")

def test_append_with_append_dim_not_set_raises(self):
ds, ds_to_append, _ = create_append_test_data()

# check failure when append_dim not set
with pytest.raises(ValueError):
with self.create_zarr_target() as store_target:
ds.to_zarr(store_target, mode="w")
with self.create_zarr_target() as store_target:
ds.to_zarr(store_target, mode="w")
with pytest.raises(ValueError, match="different dimension sizes"):
ds_to_append.to_zarr(store_target, mode="a")

def test_append_with_mode_not_a_raises(self):

ds, ds_to_append, _ = create_append_test_data()

# check failure when append_dim is set and mode != 'a'
with pytest.raises(ValueError):
with self.create_zarr_target() as store_target:
ds.to_zarr(store_target, mode="w")
with self.create_zarr_target() as store_target:
ds.to_zarr(store_target, mode="w")
with pytest.raises(
ValueError, match="append_dim was set along with mode='w'"
):
ds_to_append.to_zarr(store_target, mode="w", append_dim="time")

def test_append_with_existing_encoding_raises(self):

ds, ds_to_append, _ = create_append_test_data()

# check failure when providing encoding to existing variable
with pytest.raises(ValueError):
with self.create_zarr_target() as store_target:
ds.to_zarr(store_target, mode="w")
with self.create_zarr_target() as store_target:
ds.to_zarr(store_target, mode="w")
with pytest.raises(ValueError, match="but encoding was provided"):
ds_to_append.to_zarr(
store_target,
append_dim="time",
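A standalone pytest-style sketch (not part of this PR's test suite) of the new override round trip, using a tmp_path store rather than the suite's create_zarr_target helper:

import numpy as np
import xarray as xr
from xarray.testing import assert_identical


def test_override_without_append_dim(tmp_path):
    store = str(tmp_path / "store.zarr")
    xr.Dataset({"foo": ("x", np.arange(4))}).to_zarr(store, mode="w")

    # Override the existing variable without setting append_dim.
    updated = xr.Dataset({"foo": ("x", np.arange(4) * 2)})
    updated.to_zarr(store, mode="a")

    with xr.open_zarr(store) as actual:
        assert_identical(updated, actual)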