Skip to content

preserve vlen string dtypes, allow vlen string fill_values #7869

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Nov 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions ci/requirements/min-all-deps.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@ dependencies:
- dask-core=2022.7
- distributed=2022.7
- flox=0.5
- h5netcdf=1.0
- h5netcdf=1.1
# h5py and hdf5 tend to cause conflicts
# for e.g. hdf5 1.12 conflicts with h5py=3.1
# prioritize bumping other packages instead
- h5py=3.6
- h5py=3.7
- hdf5=1.12
- hypothesis
- iris=3.2
Expand Down
5 changes: 4 additions & 1 deletion doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,20 +32,23 @@ New Features
By `Sam Levang <https://github.com/slevang>`_.
- Allow the usage of h5py drivers (eg: ros3) via h5netcdf (:pull:`8360`).
By `Ezequiel Cimadevilla <https://github.com/zequihg50>`_.
- Enable VLEN string fill_values, preserve VLEN string dtypes (:issue:`1647`, :issue:`7652`, :issue:`7868`, :pull:`7869`).
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.

Breaking changes
~~~~~~~~~~~~~~~~
- drop support for `cdms2 <https://github.com/CDAT/cdms>`_. Please use
`xcdat <https://github.com/xCDAT/xcdat>`_ instead (:pull:`8441`).
By `Justus Magin <https://github.com/keewis>`_.

- Following pandas, :py:meth:`infer_freq` will return ``"Y"``, ``"YS"``,
``"QE"``, ``"ME"``, ``"h"``, ``"min"``, ``"s"``, ``"ms"``, ``"us"``, or
``"ns"`` instead of ``"A"``, ``"AS"``, ``"Q"``, ``"M"``, ``"H"``, ``"T"``,
``"S"``, ``"L"``, ``"U"``, or ``"N"``. This is to be consistent with the
deprecation of the latter frequency strings (:issue:`8394`, :pull:`8415`). By
`Spencer Clark <https://github.com/spencerkclark>`_.
- Bump minimum tested pint version to ``>=0.22``. By `Deepak Cherian <https://github.com/dcherian>`_.
- Minimum supported versions for the following packages have changed: ``h5py >=3.7``, ``h5netcdf>=1.1``.
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.

Deprecations
~~~~~~~~~~~~
Expand Down
9 changes: 0 additions & 9 deletions xarray/backends/h5netcdf_.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,15 +271,6 @@ def prepare_variable(
dtype = _get_datatype(variable, raise_on_invalid_encoding=check_encoding)

fillvalue = attrs.pop("_FillValue", None)
if dtype is str and fillvalue is not None:
raise NotImplementedError(
"h5netcdf does not yet support setting a fill value for "
"variable-length strings "
"(https://github.com/h5netcdf/h5netcdf/issues/37). "
f"Either remove '_FillValue' from encoding on variable {name!r} "
"or set {'dtype': 'S1'} in encoding to use the fixed width "
"NC_CHAR type."
)

if dtype is str:
dtype = h5py.special_dtype(vlen=str)
Expand Down
10 changes: 0 additions & 10 deletions xarray/backends/netCDF4_.py
Original file line number Diff line number Diff line change
Expand Up @@ -490,16 +490,6 @@ def prepare_variable(

fill_value = attrs.pop("_FillValue", None)

if datatype is str and fill_value is not None:
raise NotImplementedError(
"netCDF4 does not yet support setting a fill value for "
"variable-length strings "
"(https://github.com/Unidata/netcdf4-python/issues/730). "
f"Either remove '_FillValue' from encoding on variable {name!r} "
"or set {'dtype': 'S1'} in encoding to use the fixed width "
"NC_CHAR type."
)

encoding = _extract_nc4_variable_encoding(
variable, raise_on_invalid=check_encoding, unlimited_dims=unlimited_dims
)
Expand Down
12 changes: 12 additions & 0 deletions xarray/coding/variables.py
Original file line number Diff line number Diff line change
Expand Up @@ -562,3 +562,15 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable:

def decode(self):
raise NotImplementedError()


class ObjectVLenStringCoder(VariableCoder):
def encode(self):
return NotImplementedError

def decode(self, variable: Variable, name: T_Name = None) -> Variable:
if variable.dtype == object and variable.encoding.get("dtype", False) == str:
variable = variable.astype(variable.encoding["dtype"])
return variable
else:
return variable
4 changes: 4 additions & 0 deletions xarray/conventions.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,10 @@ def decode_cf_variable(
var = strings.CharacterArrayCoder().decode(var, name=name)
var = strings.EncodedStringCoder().decode(var)

if original_dtype == object:
var = variables.ObjectVLenStringCoder().decode(var)
original_dtype = var.dtype

if mask_and_scale:
for coder in [
variables.UnsignedIntegerCoder(),
Expand Down
62 changes: 35 additions & 27 deletions xarray/tests/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -864,12 +864,13 @@ def test_roundtrip_empty_vlen_string_array(self) -> None:
assert check_vlen_dtype(original["a"].dtype) == str
with self.roundtrip(original) as actual:
assert_identical(original, actual)
assert object == actual["a"].dtype
assert actual["a"].dtype == original["a"].dtype
# only check metadata for capable backends
# eg. NETCDF3 based backends do not roundtrip metadata
if actual["a"].dtype.metadata is not None:
assert check_vlen_dtype(actual["a"].dtype) == str
if np.issubdtype(actual["a"].dtype, object):
# only check metadata for capable backends
# eg. NETCDF3 based backends do not roundtrip metadata
if actual["a"].dtype.metadata is not None:
assert check_vlen_dtype(actual["a"].dtype) == str
else:
assert actual["a"].dtype == np.dtype("<U1")

@pytest.mark.parametrize(
"decoded_fn, encoded_fn",
Expand Down Expand Up @@ -1374,32 +1375,39 @@ def test_write_groups(self) -> None:
with self.open(tmp_file, group="data/2") as actual2:
assert_identical(data2, actual2)

def test_encoding_kwarg_vlen_string(self) -> None:
for input_strings in [[b"foo", b"bar", b"baz"], ["foo", "bar", "baz"]]:
original = Dataset({"x": input_strings})
expected = Dataset({"x": ["foo", "bar", "baz"]})
kwargs = dict(encoding={"x": {"dtype": str}})
with self.roundtrip(original, save_kwargs=kwargs) as actual:
assert actual["x"].encoding["dtype"] is str
assert_identical(actual, expected)

def test_roundtrip_string_with_fill_value_vlen(self) -> None:
@pytest.mark.parametrize(
"input_strings, is_bytes",
[
([b"foo", b"bar", b"baz"], True),
(["foo", "bar", "baz"], False),
(["foó", "bár", "baź"], False),
],
)
def test_encoding_kwarg_vlen_string(
self, input_strings: list[str], is_bytes: bool
) -> None:
original = Dataset({"x": input_strings})

expected_string = ["foo", "bar", "baz"] if is_bytes else input_strings
expected = Dataset({"x": expected_string})
kwargs = dict(encoding={"x": {"dtype": str}})
with self.roundtrip(original, save_kwargs=kwargs) as actual:
assert actual["x"].encoding["dtype"] == "<U3"
assert actual["x"].dtype == "<U3"
assert_identical(actual, expected)

@pytest.mark.parametrize("fill_value", ["XXX", "", "bár"])
def test_roundtrip_string_with_fill_value_vlen(self, fill_value: str) -> None:
values = np.array(["ab", "cdef", np.nan], dtype=object)
expected = Dataset({"x": ("t", values)})

# netCDF4-based backends don't support an explicit fillvalue
# for variable length strings yet.
# https://github.com/Unidata/netcdf4-python/issues/730
# https://github.com/h5netcdf/h5netcdf/issues/37
original = Dataset({"x": ("t", values, {}, {"_FillValue": "XXX"})})
with pytest.raises(NotImplementedError):
with self.roundtrip(original) as actual:
assert_identical(expected, actual)
original = Dataset({"x": ("t", values, {}, {"_FillValue": fill_value})})
with self.roundtrip(original) as actual:
assert_identical(expected, actual)

original = Dataset({"x": ("t", values, {}, {"_FillValue": ""})})
with pytest.raises(NotImplementedError):
with self.roundtrip(original) as actual:
assert_identical(expected, actual)
with self.roundtrip(original) as actual:
assert_identical(expected, actual)

def test_roundtrip_character_array(self) -> None:
with create_tmp_file() as tmp_file:
Expand Down