From 74dd88c75124164e855a0c0099e7af5b4868f0b0 Mon Sep 17 00:00:00 2001 From: Tom White Date: Mon, 22 May 2023 17:35:24 +0100 Subject: [PATCH 01/10] CF encoding should preserve vlen dtype for empty arrays --- xarray/conventions.py | 4 ++++ xarray/tests/test_conventions.py | 13 +++++++++++++ 2 files changed, 17 insertions(+) diff --git a/xarray/conventions.py b/xarray/conventions.py index 1506efc31e8..053863ace2a 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -108,6 +108,10 @@ def ensure_dtype_not_object(var: Variable, name: T_Name = None) -> Variable: if var.dtype.kind == "O": dims, data, attrs, encoding = _var_as_tuple(var) + # leave vlen dtypes unchanged + if strings.check_vlen_dtype(data.dtype) is not None: + return var + if is_duck_dask_array(data): warnings.warn( "variable {} has data in the form of a dask array with " diff --git a/xarray/tests/test_conventions.py b/xarray/tests/test_conventions.py index acdf9c8846e..5ba42f5cd43 100644 --- a/xarray/tests/test_conventions.py +++ b/xarray/tests/test_conventions.py @@ -487,3 +487,16 @@ def test_decode_cf_error_includes_variable_name(): ds = Dataset({"invalid": ([], 1e36, {"units": "days since 2000-01-01"})}) with pytest.raises(ValueError, match="Failed to decode variable 'invalid'"): decode_cf(ds) + + +def test_encode_cf_variable_with_vlen_dtype() -> None: + v = Variable(["x"], np.array(["a", "b"], dtype=coding.strings.create_vlen_dtype(str))) + encoded_v = conventions.encode_cf_variable(v) + assert encoded_v.data.dtype.kind == "O" + assert coding.strings.check_vlen_dtype(encoded_v.data.dtype) == str + + # empty array + v = Variable(["x"], np.array([], dtype=coding.strings.create_vlen_dtype(str))) + encoded_v = conventions.encode_cf_variable(v) + assert encoded_v.data.dtype.kind == "O" + assert coding.strings.check_vlen_dtype(encoded_v.data.dtype) == str From 8238000ea0d42963e4ff6da072d0ea530dd8b197 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 22 May 2023 16:42:01 +0000 Subject: [PATCH 02/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/tests/test_conventions.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/xarray/tests/test_conventions.py b/xarray/tests/test_conventions.py index 5ba42f5cd43..424b7db5ac4 100644 --- a/xarray/tests/test_conventions.py +++ b/xarray/tests/test_conventions.py @@ -490,7 +490,9 @@ def test_decode_cf_error_includes_variable_name(): def test_encode_cf_variable_with_vlen_dtype() -> None: - v = Variable(["x"], np.array(["a", "b"], dtype=coding.strings.create_vlen_dtype(str))) + v = Variable( + ["x"], np.array(["a", "b"], dtype=coding.strings.create_vlen_dtype(str)) + ) encoded_v = conventions.encode_cf_variable(v) assert encoded_v.data.dtype.kind == "O" assert coding.strings.check_vlen_dtype(encoded_v.data.dtype) == str From 61ada468e09554d130bd07cd931c1e70b1192105 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Wed, 24 May 2023 13:42:38 +0200 Subject: [PATCH 03/10] preserve vlen string dtype in netcdf4 and zarr backends --- xarray/backends/netCDF4_.py | 2 +- xarray/backends/zarr.py | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py index d3866e90de6..419f86680ae 100644 --- a/xarray/backends/netCDF4_.py +++ b/xarray/backends/netCDF4_.py @@ -68,7 +68,7 @@ def __init__(self, variable_name, datastore): # use object dtype because that's the only way in numpy to # represent variable length strings; it also prevents automatic # string concatenation via conventions.decode_cf_variable - dtype = np.dtype("O") + dtype = coding.strings.create_vlen_dtype(str) self.dtype = dtype def __setitem__(self, key, value): diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index a4012a8a733..c4f47fba7eb 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -70,7 +70,14 @@ def __init__(self, variable_name, datastore): array = self.get_array() self.shape = array.shape - dtype = array.dtype + # preserve vlen string object dtype (GH 7328) + if array.filters is not None and any( + [filt["id"] == "vlen-utf8" for filt in array._meta["filters"]] + ): + dtype = coding.strings.create_vlen_dtype(str) + else: + dtype = array.dtype + self.dtype = dtype def get_array(self): From b7d4cf506953cea0004e4885f35620eb2d9d2900 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Thu, 1 Jun 2023 14:06:43 +0200 Subject: [PATCH 04/10] check for h5py-variant ("vlen") in coding.strings.check_vlen_dtype --- xarray/coding/strings.py | 3 ++- xarray/tests/test_coding_strings.py | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/xarray/coding/strings.py b/xarray/coding/strings.py index ffe1b1a8d50..d0bfb1a7a63 100644 --- a/xarray/coding/strings.py +++ b/xarray/coding/strings.py @@ -29,7 +29,8 @@ def check_vlen_dtype(dtype): if dtype.kind != "O" or dtype.metadata is None: return None else: - return dtype.metadata.get("element_type") + # check xarray (element_type) as well as h5py (vlen) + return dtype.metadata.get("element_type", dtype.metadata.get("vlen")) def is_unicode_dtype(dtype): diff --git a/xarray/tests/test_coding_strings.py b/xarray/tests/test_coding_strings.py index cb9595f4a64..fdc05daf4be 100644 --- a/xarray/tests/test_coding_strings.py +++ b/xarray/tests/test_coding_strings.py @@ -32,6 +32,10 @@ def test_vlen_dtype() -> None: assert strings.is_bytes_dtype(dtype) assert strings.check_vlen_dtype(dtype) is bytes + # check h5py variant ("vlen") + dtype = np.dtype("O", metadata={"vlen": str}) + assert strings.check_vlen_dtype(dtype) is str + assert strings.check_vlen_dtype(np.dtype(object)) is None From a56108e235716a23c129ce334410c58891b4116b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Thu, 1 Jun 2023 14:29:24 +0200 Subject: [PATCH 05/10] add test to check preserving vlen dtype for empty vlen string arrays --- xarray/tests/test_backends.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 190adab3d19..e0a2262a339 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -46,6 +46,7 @@ ) from xarray.backends.pydap_ import PydapDataStore from xarray.backends.scipy_ import ScipyBackendEntrypoint +from xarray.coding.strings import check_vlen_dtype, create_vlen_dtype from xarray.coding.variables import SerializationWarning from xarray.conventions import encode_dataset_coordinates from xarray.core import indexing @@ -859,6 +860,20 @@ def test_roundtrip_string_with_fill_value_nchar(self) -> None: with self.roundtrip(original) as actual: assert_identical(expected, actual) + def test_roundtrip_empty_vlen_string_array(self) -> None: + # checks preserving vlen dtype for empty arrays GH7862 + dtype = create_vlen_dtype(str) + original = Dataset({"a": np.array([], dtype=dtype)}) + assert check_vlen_dtype(original["a"].dtype) == str + with self.roundtrip(original) as actual: + assert_identical(original, actual) + assert object == actual["a"].dtype + assert actual["a"].dtype == original["a"].dtype + # only check metadata for capable backends + # eg. NETCDF3 based backends do not roundtrip metadata + if actual["a"].dtype.metadata is not None: + assert check_vlen_dtype(actual["a"].dtype) == str + @pytest.mark.parametrize( "decoded_fn, encoded_fn", [ From ee3dac552d38679c6c78392733b68257bb133276 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 6 Jun 2023 11:13:50 +0200 Subject: [PATCH 06/10] ignore call_overload error for np.dtype("O", metadata={"vlen": str}) --- xarray/tests/test_coding_strings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_coding_strings.py b/xarray/tests/test_coding_strings.py index fdc05daf4be..0c9f67e77ad 100644 --- a/xarray/tests/test_coding_strings.py +++ b/xarray/tests/test_coding_strings.py @@ -33,7 +33,7 @@ def test_vlen_dtype() -> None: assert strings.check_vlen_dtype(dtype) is bytes # check h5py variant ("vlen") - dtype = np.dtype("O", metadata={"vlen": str}) + dtype = np.dtype("O", metadata={"vlen": str}) # type: ignore[call-overload] assert strings.check_vlen_dtype(dtype) is str assert strings.check_vlen_dtype(np.dtype(object)) is None From 2d7ed905fc0df656d5e7d4943e310cc92bb08901 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Fri, 9 Jun 2023 12:03:49 +0200 Subject: [PATCH 07/10] use filter.codec_id instead of private filter._meta as suggested in review --- xarray/backends/zarr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index c4f47fba7eb..5c3d5781e35 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -72,7 +72,7 @@ def __init__(self, variable_name, datastore): # preserve vlen string object dtype (GH 7328) if array.filters is not None and any( - [filt["id"] == "vlen-utf8" for filt in array._meta["filters"]] + [filt.codec_id == "vlen-utf8" for filt in array.filters] ): dtype = coding.strings.create_vlen_dtype(str) else: From d0cea4477706e48c16e8324b6674969190818536 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 12 Jun 2023 08:07:02 +0200 Subject: [PATCH 08/10] update comment and add whats-new.rst entry --- doc/whats-new.rst | 2 ++ xarray/backends/netCDF4_.py | 8 +++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 2c3f0716953..0f1625defa1 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -42,6 +42,8 @@ Performance Bug fixes ~~~~~~~~~ +- Preserve vlen dtype for empty string arrays (:issue:`7328`, :pull:`7862`). + By `Tom White `_ and `Kai Mühlbauer `_ Documentation diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py index 419f86680ae..8a5d48c8c1e 100644 --- a/xarray/backends/netCDF4_.py +++ b/xarray/backends/netCDF4_.py @@ -65,9 +65,11 @@ def __init__(self, variable_name, datastore): dtype = array.dtype if dtype is str: - # use object dtype because that's the only way in numpy to - # represent variable length strings; it also prevents automatic - # string concatenation via conventions.decode_cf_variable + # use object dtype (with additional vlen string metadata) because that's + # the only way in numpy to represent variable length strings and to + # check vlen string dtype in further steps + # it also prevents automatic string concatenation via + # conventions.decode_cf_variable dtype = coding.strings.create_vlen_dtype(str) self.dtype = dtype From 8afa51cf8dc8e47a80f354736dd34a87ec9882e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 12 Jun 2023 08:13:36 +0200 Subject: [PATCH 09/10] fix whats-new.rst --- doc/whats-new.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 9eae8f54f2b..128fe964545 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -42,13 +42,12 @@ Performance Bug fixes ~~~~~~~~~ -- Preserve vlen dtype for empty string arrays (:issue:`7328`, :pull:`7862`). - By `Tom White `_ and `Kai Mühlbauer `_ - - Fix bug where weighted ``polyfit`` were changing the original object (:issue:`5644`, :pull:`7900`). By `Mattia Almansi `_. - Don't call ``CachingFileManager.__del__`` on interpreter shutdown (:issue:`7814`, :pull:`7880`). By `Justus Magin `_. +- Preserve vlen dtype for empty string arrays (:issue:`7328`, :pull:`7862`). + By `Tom White `_ and `Kai Mühlbauer `_ Documentation ~~~~~~~~~~~~~ From 8428a8a55713ed05aa055023409ff10d88a7ff29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 12 Jun 2023 08:15:49 +0200 Subject: [PATCH 10/10] fix whats-new.rst (missing dot) --- doc/whats-new.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 128fe964545..931f7dc9ae4 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -47,7 +47,7 @@ Bug fixes - Don't call ``CachingFileManager.__del__`` on interpreter shutdown (:issue:`7814`, :pull:`7880`). By `Justus Magin `_. - Preserve vlen dtype for empty string arrays (:issue:`7328`, :pull:`7862`). - By `Tom White `_ and `Kai Mühlbauer `_ + By `Tom White `_ and `Kai Mühlbauer `_. Documentation ~~~~~~~~~~~~~