From 37db2b06063c48049098fe06967736a2121602bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 18 Sep 2023 15:13:55 +0200 Subject: [PATCH 01/16] remove `dtype` from encoding for datetime64/timedelta64 variables to prevent unnecessary casts --- xarray/coding/times.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 79efbecfb7c..293847f463e 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -777,6 +777,11 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable: safe_setitem(attrs, "units", units, name=name) safe_setitem(attrs, "calendar", calendar, name=name) + # drop dtype in encoding if encoded as float64 + # to prevent unnecessary casts, see GH #1064 + if np.issubdtype(data.dtype, np.float64): + encoding.pop("dtype", None) + return Variable(dims, data, attrs, encoding, fastpath=True) else: return variable @@ -810,6 +815,11 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable: data, units = encode_cf_timedelta(data, encoding.pop("units", None)) safe_setitem(attrs, "units", units, name=name) + # drop dtype in encoding if encoded as float64 + # to prevent unnecessary casts, see GH #1064 + if np.issubdtype(data.dtype, np.float64): + encoding.pop("dtype", None) + return Variable(dims, data, attrs, encoding, fastpath=True) else: return variable From aff4e63db2fbbf392f26d638e15461290bfa314f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 19 Sep 2023 09:56:54 +0200 Subject: [PATCH 02/16] adapt tests --- xarray/tests/test_coding_times.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 079e432b565..070317b4491 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -30,7 +30,7 @@ from xarray.coding.variables import SerializationWarning from xarray.conventions import _update_bounds_attributes, cf_encoder from xarray.core.common import contains_cftime_datetimes -from xarray.testing import assert_allclose, assert_equal, assert_identical +from xarray.testing import assert_equal, assert_identical from xarray.tests import ( FirstElementAccessibleArray, arm_xfail, @@ -1311,12 +1311,16 @@ def test_roundtrip_timedelta64_nanosecond_precision_warning() -> None: f"Timedeltas can't be serialized faithfully with requested units {units!r}. " f"Resolution of {needed_units!r} needed. " ) - encoding = dict(_FillValue=20, units=units) + encoding = dict(dtype=np.int64, _FillValue=20, units=units) var = Variable(["time"], timedelta_values, encoding=encoding) with pytest.warns(UserWarning, match=wmsg): encoded_var = conventions.encode_cf_variable(var) + assert encoded_var.dtype == np.float64 + assert encoded_var.attrs["units"] == units + assert encoded_var.attrs["_FillValue"] == 20.0 decoded_var = conventions.decode_cf_variable("foo", encoded_var) - assert_allclose(var, decoded_var) + assert_identical(var, decoded_var) + assert decoded_var.encoding["dtype"] == np.float64 def test_roundtrip_float_times() -> None: From f6a954c493eb791fa3716665b1a3bea6eaf581a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 19 Sep 2023 10:01:04 +0200 Subject: [PATCH 03/16] add whats-new.rst entry --- doc/whats-new.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index ee74411a004..d527e9e9860 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -87,6 +87,8 @@ Bug fixes - ``.rolling_exp`` functions no longer mistakenly lose non-dimensioned coords (:issue:`6528`, :pull:`8114`) By `Maximilian Roos `_. +- Fixed a bug where datetime64/timedelta64 encoded as float64 are recast to int64 with precision loss (:issue:`1064`, :pull:`8201`). + By `Kai Mühlbauer `_. Documentation ~~~~~~~~~~~~~ From e8de234d910d660214514ee1877fb5c33ec471b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 19 Sep 2023 15:25:04 +0200 Subject: [PATCH 04/16] Update xarray/coding/times.py Co-authored-by: Spencer Clark --- xarray/coding/times.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 293847f463e..761c01dff7a 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -779,7 +779,8 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable: # drop dtype in encoding if encoded as float64 # to prevent unnecessary casts, see GH #1064 - if np.issubdtype(data.dtype, np.float64): + encoding_dtype = encoding.get("dtype", data.dtype) + if np.issubdtype(data.dtype, np.float64) and encoding_dtype.kind in "iu": encoding.pop("dtype", None) return Variable(dims, data, attrs, encoding, fastpath=True) From b19a0e20029158e71b2beeebfe621cbc9036124c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 19 Sep 2023 15:25:20 +0200 Subject: [PATCH 05/16] Update doc/whats-new.rst Co-authored-by: Spencer Clark --- doc/whats-new.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index d527e9e9860..8561bf4723f 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -87,7 +87,7 @@ Bug fixes - ``.rolling_exp`` functions no longer mistakenly lose non-dimensioned coords (:issue:`6528`, :pull:`8114`) By `Maximilian Roos `_. -- Fixed a bug where datetime64/timedelta64 encoded as float64 are recast to int64 with precision loss (:issue:`1064`, :pull:`8201`). +- Override the user-provided encoding dtype for datetime64/timedelta64 values if it is an integer dtype, but the units encoding would require floating point values for most faithful serialization to disk (:issue:`1064`, :pull:`8201`). By `Kai Mühlbauer `_. Documentation From fc63ed9c041cf6d7187b7b4e6cefd5f3f70c590a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 19 Sep 2023 16:14:42 +0200 Subject: [PATCH 06/16] add test per review suggestion, replace .kind-check with np.issubdtype-check --- xarray/coding/times.py | 4 +++- xarray/tests/test_coding_times.py | 6 +++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 761c01dff7a..6dcd2506034 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -780,7 +780,9 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable: # drop dtype in encoding if encoded as float64 # to prevent unnecessary casts, see GH #1064 encoding_dtype = encoding.get("dtype", data.dtype) - if np.issubdtype(data.dtype, np.float64) and encoding_dtype.kind in "iu": + if np.issubdtype(data.dtype, np.float64) and np.issubdtype( + encoding_dtype, np.integer + ): encoding.pop("dtype", None) return Variable(dims, data, attrs, encoding, fastpath=True) diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 070317b4491..4cdfed10753 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -1212,6 +1212,7 @@ def test_contains_cftime_lazy() -> None: ("1677-09-21T00:12:43.145224193", "ns", np.int64, None, False), ("1677-09-21T00:12:43.145225", "us", np.int64, None, False), ("1970-01-01T00:00:01.000001", "us", np.int64, None, False), + ("1677-09-21T00:21:52.901038080", "ns", np.float32, 20.0, True), ], ) def test_roundtrip_datetime64_nanosecond_precision( @@ -1261,7 +1262,7 @@ def test_roundtrip_datetime64_nanosecond_precision_warning() -> None: ] units = "days since 1970-01-10T01:01:00" needed_units = "hours" - encoding = dict(_FillValue=20, units=units) + encoding = dict(dtype=np.int64, _FillValue=20, units=units) var = Variable(["time"], times, encoding=encoding) wmsg = ( f"Times can't be serialized faithfully with requested units {units!r}. " @@ -1269,6 +1270,9 @@ def test_roundtrip_datetime64_nanosecond_precision_warning() -> None: ) with pytest.warns(UserWarning, match=wmsg): encoded_var = conventions.encode_cf_variable(var) + assert encoded_var.dtype == np.float64 + assert encoded_var.attrs["units"] == units + assert encoded_var.attrs["_FillValue"] == 20.0 decoded_var = conventions.decode_cf_variable("foo", encoded_var) assert_identical(var, decoded_var) From 35b77d6a322554a94631310ecffb3e99023b15ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Wed, 20 Sep 2023 08:16:24 +0200 Subject: [PATCH 07/16] align timedelta64 check with datetime64 check --- xarray/coding/times.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 6dcd2506034..40e1bf5fe4f 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -820,7 +820,10 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable: # drop dtype in encoding if encoded as float64 # to prevent unnecessary casts, see GH #1064 - if np.issubdtype(data.dtype, np.float64): + encoding_dtype = encoding.get("dtype", data.dtype) + if np.issubdtype(data.dtype, np.float64) and np.issubdtype( + encoding_dtype, np.integer + ): encoding.pop("dtype", None) return Variable(dims, data, attrs, encoding, fastpath=True) From 64e20877a7949ab1043e8e4ccd7139bf1a92d8ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Fri, 22 Sep 2023 12:43:02 +0200 Subject: [PATCH 08/16] override units instead of dtype --- doc/whats-new.rst | 2 +- xarray/coding/times.py | 107 ++++++++++++++++-------------- xarray/tests/test_coding_times.py | 24 ++++--- 3 files changed, 71 insertions(+), 62 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 5011f73bd4b..5d0b05961cc 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -90,7 +90,7 @@ Bug fixes - ``.rolling_exp`` functions no longer mistakenly lose non-dimensioned coords (:issue:`6528`, :pull:`8114`) By `Maximilian Roos `_. -- Override the user-provided encoding dtype for datetime64/timedelta64 values if it is an integer dtype, but the units encoding would require floating point values for most faithful serialization to disk (:issue:`1064`, :pull:`8201`). +- Override the user-provided encoding units for datetime64/timedelta64 values to preserve an integer dtype for most faithful serialization to disk (:issue:`1064`, :pull:`8201`). By `Kai Mühlbauer `_. Documentation diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 40e1bf5fe4f..9ebf45221f1 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -656,8 +656,22 @@ def cast_to_int_if_safe(num) -> np.ndarray: return num +def _division(deltas, delta, floor): + if floor: + # calculate int64 floor division + # to preserve integer dtype if possible (GH 4045, GH7817). + num = deltas // delta.astype(np.int64) + num = num.astype(np.int64, copy=False) + else: + num = deltas / delta + return num + + def encode_cf_datetime( - dates, units: str | None = None, calendar: str | None = None + dates, + units: str | None = None, + calendar: str | None = None, + dtype: np.dtype | None = None, ) -> tuple[np.ndarray, str, str]: """Given an array of datetime objects, returns the tuple `(num, units, calendar)` suitable for a CF compliant time variable. @@ -689,6 +703,12 @@ def encode_cf_datetime( time_units, ref_date = _unpack_time_units_and_ref_date(units) time_delta = _time_units_to_timedelta64(time_units) + # Wrap the dates in a DatetimeIndex to do the subtraction to ensure + # an OverflowError is raised if the ref_date is too far away from + # dates to be encoded (GH 2272). + dates_as_index = pd.DatetimeIndex(dates.ravel()) + time_deltas = dates_as_index - ref_date + # retrieve needed units to faithfully encode to int64 needed_units, data_ref_date = _unpack_time_units_and_ref_date(data_units) if data_units != units: @@ -697,26 +717,23 @@ def encode_cf_datetime( if ref_delta > np.timedelta64(0, "ns"): needed_units = _infer_time_units_from_diff(ref_delta) - # Wrap the dates in a DatetimeIndex to do the subtraction to ensure - # an OverflowError is raised if the ref_date is too far away from - # dates to be encoded (GH 2272). - dates_as_index = pd.DatetimeIndex(dates.ravel()) - time_deltas = dates_as_index - ref_date - # needed time delta to encode faithfully to int64 needed_time_delta = _time_units_to_timedelta64(needed_units) - if time_delta <= needed_time_delta: - # calculate int64 floor division - # to preserve integer dtype if possible (GH 4045, GH7817). - num = time_deltas // time_delta.astype(np.int64) - num = num.astype(np.int64, copy=False) - else: - emit_user_level_warning( - f"Times can't be serialized faithfully with requested units {units!r}. " - f"Resolution of {needed_units!r} needed. " - f"Serializing timeseries to floating point." - ) - num = time_deltas / time_delta + + floor_division = True + if time_delta > needed_time_delta: + if np.issubdtype(dtype, np.integer): + new_units = f"{needed_units} since {format_timestamp(ref_date)}" + emit_user_level_warning( + f"Times can't be serialized faithfully with requested units {units!r}. " + f"Serializing with units {new_units!r} instead." + ) + units = new_units + time_delta = needed_time_delta + else: + floor_division = False + + num = _division(time_deltas, time_delta, floor_division) num = num.values.reshape(dates.shape) except (OutOfBoundsDatetime, OverflowError, ValueError): @@ -728,7 +745,9 @@ def encode_cf_datetime( return (num, units, calendar) -def encode_cf_timedelta(timedeltas, units: str | None = None) -> tuple[np.ndarray, str]: +def encode_cf_timedelta( + timedeltas, units: str | None = None, dtype: np.dtype | None = None +) -> tuple[np.ndarray, str]: data_units = infer_timedelta_units(timedeltas) if units is None: @@ -744,18 +763,19 @@ def encode_cf_timedelta(timedeltas, units: str | None = None) -> tuple[np.ndarra # needed time delta to encode faithfully to int64 needed_time_delta = _time_units_to_timedelta64(needed_units) - if time_delta <= needed_time_delta: - # calculate int64 floor division - # to preserve integer dtype if possible - num = time_deltas // time_delta.astype(np.int64) - num = num.astype(np.int64, copy=False) - else: - emit_user_level_warning( - f"Timedeltas can't be serialized faithfully with requested units {units!r}. " - f"Resolution of {needed_units!r} needed. " - f"Serializing timedeltas to floating point." - ) - num = time_deltas / time_delta + + floor_division = True + if time_delta > needed_time_delta: + if np.issubdtype(dtype, np.integer): + emit_user_level_warning( + f"Timedeltas can't be serialized faithfully with requested units {units!r}. " + f"Serializing with units {needed_units!r} instead." + ) + units = needed_units + time_delta = needed_time_delta + else: + floor_division = False + num = _division(time_deltas, time_delta, floor_division) num = num.values.reshape(timedeltas.shape) return (num, units) @@ -772,19 +792,12 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable: units = encoding.pop("units", None) calendar = encoding.pop("calendar", None) - (data, units, calendar) = encode_cf_datetime(data, units, calendar) + dtype = encoding.get("dtype", None) + (data, units, calendar) = encode_cf_datetime(data, units, calendar, dtype) safe_setitem(attrs, "units", units, name=name) safe_setitem(attrs, "calendar", calendar, name=name) - # drop dtype in encoding if encoded as float64 - # to prevent unnecessary casts, see GH #1064 - encoding_dtype = encoding.get("dtype", data.dtype) - if np.issubdtype(data.dtype, np.float64) and np.issubdtype( - encoding_dtype, np.integer - ): - encoding.pop("dtype", None) - return Variable(dims, data, attrs, encoding, fastpath=True) else: return variable @@ -815,17 +828,11 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable: if np.issubdtype(variable.data.dtype, np.timedelta64): dims, data, attrs, encoding = unpack_for_encoding(variable) - data, units = encode_cf_timedelta(data, encoding.pop("units", None)) + data, units = encode_cf_timedelta( + data, encoding.pop("units", None), encoding.get("dtype", None) + ) safe_setitem(attrs, "units", units, name=name) - # drop dtype in encoding if encoded as float64 - # to prevent unnecessary casts, see GH #1064 - encoding_dtype = encoding.get("dtype", data.dtype) - if np.issubdtype(data.dtype, np.float64) and np.issubdtype( - encoding_dtype, np.integer - ): - encoding.pop("dtype", None) - return Variable(dims, data, attrs, encoding, fastpath=True) else: return variable diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 4cdfed10753..11c8fa635c3 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -1036,7 +1036,9 @@ def test_encode_cf_datetime_defaults_to_correct_dtype( pytest.skip("Nanosecond frequency is not valid for cftime dates.") times = date_range("2000", periods=3, freq=freq) units = f"{encoding_units} since 2000-01-01" - encoded, _, _ = coding.times.encode_cf_datetime(times, units) + print(times, units) + encoded, _units, _ = coding.times.encode_cf_datetime(times, units) + print(encoded, _units) numpy_timeunit = coding.times._netcdf_to_numpy_timeunit(encoding_units) encoding_units_as_timedelta = np.timedelta64(1, numpy_timeunit) @@ -1261,18 +1263,18 @@ def test_roundtrip_datetime64_nanosecond_precision_warning() -> None: np.datetime64("1970-01-02T00:01:00", "ns"), ] units = "days since 1970-01-10T01:01:00" - needed_units = "hours" + needed_units = "hours since 1970-01-10T01:01:00" encoding = dict(dtype=np.int64, _FillValue=20, units=units) var = Variable(["time"], times, encoding=encoding) wmsg = ( f"Times can't be serialized faithfully with requested units {units!r}. " - f"Resolution of {needed_units!r} needed. " + f"Serializing with units {needed_units!r} instead." ) with pytest.warns(UserWarning, match=wmsg): encoded_var = conventions.encode_cf_variable(var) - assert encoded_var.dtype == np.float64 - assert encoded_var.attrs["units"] == units - assert encoded_var.attrs["_FillValue"] == 20.0 + assert encoded_var.dtype == np.int64 + assert encoded_var.attrs["units"] == needed_units + assert encoded_var.attrs["_FillValue"] == 20 decoded_var = conventions.decode_cf_variable("foo", encoded_var) assert_identical(var, decoded_var) @@ -1313,18 +1315,18 @@ def test_roundtrip_timedelta64_nanosecond_precision_warning() -> None: needed_units = "hours" wmsg = ( f"Timedeltas can't be serialized faithfully with requested units {units!r}. " - f"Resolution of {needed_units!r} needed. " + f"Serializing with units {needed_units!r} instead." ) encoding = dict(dtype=np.int64, _FillValue=20, units=units) var = Variable(["time"], timedelta_values, encoding=encoding) with pytest.warns(UserWarning, match=wmsg): encoded_var = conventions.encode_cf_variable(var) - assert encoded_var.dtype == np.float64 - assert encoded_var.attrs["units"] == units - assert encoded_var.attrs["_FillValue"] == 20.0 + assert encoded_var.dtype == np.int64 + assert encoded_var.attrs["units"] == needed_units + assert encoded_var.attrs["_FillValue"] == 20 decoded_var = conventions.decode_cf_variable("foo", encoded_var) assert_identical(var, decoded_var) - assert decoded_var.encoding["dtype"] == np.float64 + assert decoded_var.encoding["dtype"] == np.int64 def test_roundtrip_float_times() -> None: From f3ca8bf490653a6d5b8d856470b7bdc72c15bbfb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Fri, 22 Sep 2023 13:05:48 +0200 Subject: [PATCH 09/16] remove print statement --- xarray/tests/test_coding_times.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 11c8fa635c3..b987efb61e4 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -1036,9 +1036,7 @@ def test_encode_cf_datetime_defaults_to_correct_dtype( pytest.skip("Nanosecond frequency is not valid for cftime dates.") times = date_range("2000", periods=3, freq=freq) units = f"{encoding_units} since 2000-01-01" - print(times, units) encoded, _units, _ = coding.times.encode_cf_datetime(times, units) - print(encoded, _units) numpy_timeunit = coding.times._netcdf_to_numpy_timeunit(encoding_units) encoding_units_as_timedelta = np.timedelta64(1, numpy_timeunit) From 90d4e0c85b776f87864b7a05600b16fd6338c311 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Fri, 22 Sep 2023 13:24:19 +0200 Subject: [PATCH 10/16] warn in case of serialization to floating point, too --- xarray/coding/times.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 9ebf45221f1..9ede057cf11 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -731,6 +731,11 @@ def encode_cf_datetime( units = new_units time_delta = needed_time_delta else: + emit_user_level_warning( + f"Times can't be serialized faithfully with requested units {units!r}. " + f"Resolution of {needed_units!r} needed. " + f"Serializing timeseries to floating point." + ) floor_division = False num = _division(time_deltas, time_delta, floor_division) @@ -774,6 +779,11 @@ def encode_cf_timedelta( units = needed_units time_delta = needed_time_delta else: + emit_user_level_warning( + f"Timedeltas can't be serialized faithfully with requested units {units!r}. " + f"Resolution of {needed_units!r} needed. " + f"Serializing timeseries to floating point." + ) floor_division = False num = _division(time_deltas, time_delta, floor_division) num = num.values.reshape(timedeltas.shape) From 43408849a0db71bedc6ab084e055dd37875fe2d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Fri, 22 Sep 2023 13:46:07 +0200 Subject: [PATCH 11/16] align if-else --- xarray/coding/times.py | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 9ede057cf11..03e6a9ebb02 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -722,7 +722,14 @@ def encode_cf_datetime( floor_division = True if time_delta > needed_time_delta: - if np.issubdtype(dtype, np.integer): + floor_division = False + if dtype is None: + emit_user_level_warning( + f"Times can't be serialized faithfully with requested units {units!r}. " + f"Resolution of {needed_units!r} needed. " + f"Serializing timeseries to floating point." + ) + elif np.issubdtype(dtype, np.integer): new_units = f"{needed_units} since {format_timestamp(ref_date)}" emit_user_level_warning( f"Times can't be serialized faithfully with requested units {units!r}. " @@ -730,13 +737,7 @@ def encode_cf_datetime( ) units = new_units time_delta = needed_time_delta - else: - emit_user_level_warning( - f"Times can't be serialized faithfully with requested units {units!r}. " - f"Resolution of {needed_units!r} needed. " - f"Serializing timeseries to floating point." - ) - floor_division = False + floor_division = True num = _division(time_deltas, time_delta, floor_division) num = num.values.reshape(dates.shape) @@ -771,20 +772,22 @@ def encode_cf_timedelta( floor_division = True if time_delta > needed_time_delta: - if np.issubdtype(dtype, np.integer): + floor_division = False + if dtype is None: emit_user_level_warning( f"Timedeltas can't be serialized faithfully with requested units {units!r}. " - f"Serializing with units {needed_units!r} instead." + f"Resolution of {needed_units!r} needed. " + f"Serializing timeseries to floating point." ) - units = needed_units - time_delta = needed_time_delta - else: + elif np.issubdtype(dtype, np.integer): emit_user_level_warning( f"Timedeltas can't be serialized faithfully with requested units {units!r}. " - f"Resolution of {needed_units!r} needed. " - f"Serializing timeseries to floating point." + f"Serializing with units {needed_units!r} instead." ) - floor_division = False + units = needed_units + time_delta = needed_time_delta + floor_division = True + num = _division(time_deltas, time_delta, floor_division) num = num.values.reshape(timedeltas.shape) return (num, units) From a7ba341dff6554d2b99b2df5489bc3cab78d837c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sun, 24 Sep 2023 14:59:16 +0200 Subject: [PATCH 12/16] Add instructions to warnings --- xarray/coding/times.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 03e6a9ebb02..5c670f8ec0c 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -725,15 +725,18 @@ def encode_cf_datetime( floor_division = False if dtype is None: emit_user_level_warning( - f"Times can't be serialized faithfully with requested units {units!r}. " - f"Resolution of {needed_units!r} needed. " - f"Serializing timeseries to floating point." + f"Times can't be serialized faithfully to int64 with requested units {units!r}. " + f"Resolution of {needed_units!r} needed. Serializing timeseries to floating point instead." + f"Set encoding['dtype'] to integer dtype to serialize to int64. " + f"Set encoding['dtype'] to floating point dtype to silence this warning." ) elif np.issubdtype(dtype, np.integer): new_units = f"{needed_units} since {format_timestamp(ref_date)}" emit_user_level_warning( - f"Times can't be serialized faithfully with requested units {units!r}. " - f"Serializing with units {new_units!r} instead." + f"Times can't be serialized faithfully to int64 with requested units {units!r}. " + f"Serializing with units {new_units!r} instead. " + f"Set encoding['dtype'] to floating point dtype to serialize with units {units!r}. " + f"Set encoding['units'] to {new_units!r} to silence this warning ." ) units = new_units time_delta = needed_time_delta @@ -775,14 +778,17 @@ def encode_cf_timedelta( floor_division = False if dtype is None: emit_user_level_warning( - f"Timedeltas can't be serialized faithfully with requested units {units!r}. " - f"Resolution of {needed_units!r} needed. " - f"Serializing timeseries to floating point." + f"Timedeltas can't be serialized faithfully to int64 with requested units {units!r}. " + f"Resolution of {needed_units!r} needed. Serializing timeseries to floating point instead. " + f"Set encoding['dtype'] to integer dtype to serialize to int64. " + f"Set encoding['dtype'] to floating point dtype to silence this warning." ) elif np.issubdtype(dtype, np.integer): emit_user_level_warning( f"Timedeltas can't be serialized faithfully with requested units {units!r}. " - f"Serializing with units {needed_units!r} instead." + f"Serializing with units {needed_units!r} instead. " + f"Set encoding['dtype'] to floating point dtype to serialize with units {units!r}. " + f"Set encoding['units'] to {needed_units!r} to silence this warning ." ) units = needed_units time_delta = needed_time_delta From f4b31537a3e45159f4e5b3657c9aff4775521820 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sun, 24 Sep 2023 16:17:11 +0200 Subject: [PATCH 13/16] Fix test --- xarray/coding/times.py | 2 +- xarray/tests/test_coding_times.py | 47 +++++++++++++++++++++++++------ 2 files changed, 40 insertions(+), 9 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 5c670f8ec0c..2822f02dd8d 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -726,7 +726,7 @@ def encode_cf_datetime( if dtype is None: emit_user_level_warning( f"Times can't be serialized faithfully to int64 with requested units {units!r}. " - f"Resolution of {needed_units!r} needed. Serializing timeseries to floating point instead." + f"Resolution of {needed_units!r} needed. Serializing times to floating point instead. " f"Set encoding['dtype'] to integer dtype to serialize to int64. " f"Set encoding['dtype'] to floating point dtype to silence this warning." ) diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index b987efb61e4..da0482cf8a2 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -1261,22 +1261,53 @@ def test_roundtrip_datetime64_nanosecond_precision_warning() -> None: np.datetime64("1970-01-02T00:01:00", "ns"), ] units = "days since 1970-01-10T01:01:00" - needed_units = "hours since 1970-01-10T01:01:00" - encoding = dict(dtype=np.int64, _FillValue=20, units=units) + needed_units = "hours" + new_units = f"{needed_units} since 1970-01-10T01:01:00" + + encoding = dict(dtype=None, _FillValue=20, units=units) var = Variable(["time"], times, encoding=encoding) - wmsg = ( - f"Times can't be serialized faithfully with requested units {units!r}. " - f"Serializing with units {needed_units!r} instead." - ) - with pytest.warns(UserWarning, match=wmsg): + with pytest.warns(UserWarning, match=f"Resolution of {needed_units!r} needed."): + encoded_var = conventions.encode_cf_variable(var) + assert encoded_var.dtype == np.float64 + assert encoded_var.attrs["units"] == units + assert encoded_var.attrs["_FillValue"] == 20. + + decoded_var = conventions.decode_cf_variable("foo", encoded_var) + assert_identical(var, decoded_var) + + encoding = dict(dtype="int64", _FillValue=20, units=units) + var = Variable(["time"], times, encoding=encoding) + with pytest.warns(UserWarning, match=f"Serializing with units {new_units!r} instead."): encoded_var = conventions.encode_cf_variable(var) assert encoded_var.dtype == np.int64 - assert encoded_var.attrs["units"] == needed_units + assert encoded_var.attrs["units"] == new_units assert encoded_var.attrs["_FillValue"] == 20 decoded_var = conventions.decode_cf_variable("foo", encoded_var) assert_identical(var, decoded_var) + encoding = dict(dtype="float64", _FillValue=20, units=units) + var = Variable(["time"], times, encoding=encoding) + with pytest.warns(None): + encoded_var = conventions.encode_cf_variable(var) + assert encoded_var.dtype == np.float64 + assert encoded_var.attrs["units"] == units + assert encoded_var.attrs["_FillValue"] == 20. + + decoded_var = conventions.decode_cf_variable("foo", encoded_var) + assert_identical(var, decoded_var) + + + encoding = dict(dtype="int64", _FillValue=20, units=new_units) + var = Variable(["time"], times, encoding=encoding) + with pytest.warns(None): + encoded_var = conventions.encode_cf_variable(var) + assert encoded_var.dtype == np.int64 + assert encoded_var.attrs["units"] == new_units + assert encoded_var.attrs["_FillValue"] == 20 + + decoded_var = conventions.decode_cf_variable("foo", encoded_var) + assert_identical(var, decoded_var) @pytest.mark.parametrize( "dtype, fill_value", From 32461c0af3e360879cee210dfea284aba2cd99bf Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 24 Sep 2023 14:18:01 +0000 Subject: [PATCH 14/16] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/tests/test_coding_times.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index da0482cf8a2..39fbd499798 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -1270,14 +1270,16 @@ def test_roundtrip_datetime64_nanosecond_precision_warning() -> None: encoded_var = conventions.encode_cf_variable(var) assert encoded_var.dtype == np.float64 assert encoded_var.attrs["units"] == units - assert encoded_var.attrs["_FillValue"] == 20. + assert encoded_var.attrs["_FillValue"] == 20.0 decoded_var = conventions.decode_cf_variable("foo", encoded_var) assert_identical(var, decoded_var) encoding = dict(dtype="int64", _FillValue=20, units=units) var = Variable(["time"], times, encoding=encoding) - with pytest.warns(UserWarning, match=f"Serializing with units {new_units!r} instead."): + with pytest.warns( + UserWarning, match=f"Serializing with units {new_units!r} instead." + ): encoded_var = conventions.encode_cf_variable(var) assert encoded_var.dtype == np.int64 assert encoded_var.attrs["units"] == new_units @@ -1292,12 +1294,11 @@ def test_roundtrip_datetime64_nanosecond_precision_warning() -> None: encoded_var = conventions.encode_cf_variable(var) assert encoded_var.dtype == np.float64 assert encoded_var.attrs["units"] == units - assert encoded_var.attrs["_FillValue"] == 20. + assert encoded_var.attrs["_FillValue"] == 20.0 decoded_var = conventions.decode_cf_variable("foo", encoded_var) assert_identical(var, decoded_var) - encoding = dict(dtype="int64", _FillValue=20, units=new_units) var = Variable(["time"], times, encoding=encoding) with pytest.warns(None): @@ -1309,6 +1310,7 @@ def test_roundtrip_datetime64_nanosecond_precision_warning() -> None: decoded_var = conventions.decode_cf_variable("foo", encoded_var) assert_identical(var, decoded_var) + @pytest.mark.parametrize( "dtype, fill_value", [(np.int64, 20), (np.int64, np.iinfo(np.int64).min), (np.float64, 1e30)], From 43664c9ef03f47969bb36327af74cde4a1bc4822 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sun, 24 Sep 2023 16:25:34 +0200 Subject: [PATCH 15/16] use warnings.catch_warnings --- xarray/tests/test_coding_times.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 39fbd499798..5f76a4a2ca8 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -1290,7 +1290,8 @@ def test_roundtrip_datetime64_nanosecond_precision_warning() -> None: encoding = dict(dtype="float64", _FillValue=20, units=units) var = Variable(["time"], times, encoding=encoding) - with pytest.warns(None): + with warnings.catch_warnings(): + warnings.simplefilter("error") encoded_var = conventions.encode_cf_variable(var) assert encoded_var.dtype == np.float64 assert encoded_var.attrs["units"] == units @@ -1301,7 +1302,8 @@ def test_roundtrip_datetime64_nanosecond_precision_warning() -> None: encoding = dict(dtype="int64", _FillValue=20, units=new_units) var = Variable(["time"], times, encoding=encoding) - with pytest.warns(None): + with warnings.catch_warnings(): + warnings.simplefilter("error") encoded_var = conventions.encode_cf_variable(var) assert encoded_var.dtype == np.int64 assert encoded_var.attrs["units"] == new_units From 33a3e1a0083a18fa78530a8d45b86a261a7fd05e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sun, 24 Sep 2023 16:37:18 +0200 Subject: [PATCH 16/16] Update doc/whats-new.rst Co-authored-by: Spencer Clark --- doc/whats-new.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 5d0b05961cc..5f18e999cc0 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -90,7 +90,7 @@ Bug fixes - ``.rolling_exp`` functions no longer mistakenly lose non-dimensioned coords (:issue:`6528`, :pull:`8114`) By `Maximilian Roos `_. -- Override the user-provided encoding units for datetime64/timedelta64 values to preserve an integer dtype for most faithful serialization to disk (:issue:`1064`, :pull:`8201`). +- In the event that user-provided datetime64/timedelta64 units and integer dtype encoding parameters conflict with each other, override the units to preserve an integer dtype for most faithful serialization to disk (:issue:`1064`, :pull:`8201`). By `Kai Mühlbauer `_. Documentation