From 9df67299724312555c550f4fa1f6ff8633ba52c7 Mon Sep 17 00:00:00 2001 From: spencerkclark Date: Sun, 21 Aug 2022 14:52:30 -0400 Subject: [PATCH 1/8] Enable taking the mean of dask-backed cftime arrays --- doc/whats-new.rst | 8 +++- xarray/core/duck_array_ops.py | 12 ++---- xarray/tests/test_duck_array_ops.py | 62 ++++++++++++++++++++++------- 3 files changed, 59 insertions(+), 23 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 6d78a47f1ea..b1a035c8f53 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -21,7 +21,10 @@ v2022.07.0 (unreleased) New Features ~~~~~~~~~~~~ - +- Enable taking the mean of dask-backed :py:class:`cftime.datetime` arrays + (:pull:`6556`, :pull:`6940`). By `Deepak Cherian + `_ and `Spencer Clark + `_. Breaking changes ~~~~~~~~~~~~~~~~ @@ -49,6 +52,9 @@ Bug fixes - Make FacetGrid.set_titles send kwargs correctly using `handle.udpate(kwargs)`. (:issue:`6839`, :pull:`6843`) By `Oliver Lopez `_. +- Allow taking the mean over non-time dimensions of datasets containing + dask-backed cftime arrays (:issue:`5897`, :pull:`6950`). By `Spencer Clark + `_. Documentation ~~~~~~~~~~~~~ diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index 2cd2fb3af04..2bf05abb96a 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -426,7 +426,6 @@ def datetime_to_numeric(array, offset=None, datetime_unit=None, dtype=float): though some calendars would allow for them (e.g. no_leap). This is because there is no `cftime.timedelta` object. """ - # TODO: make this function dask-compatible? # Set offset to minimum if not given if offset is None: if array.dtype.kind in "Mm": @@ -531,7 +530,10 @@ def pd_timedelta_to_float(value, datetime_unit): def _timedelta_to_seconds(array): - return np.reshape([a.total_seconds() for a in array.ravel()], array.shape) * 1e6 + if isinstance(array, datetime.timedelta): + return array.total_seconds() * 1e6 + else: + return np.reshape([a.total_seconds() for a in array.ravel()], array.shape) * 1e6 def py_timedelta_to_float(array, datetime_unit): @@ -565,12 +567,6 @@ def mean(array, axis=None, skipna=None, **kwargs): + offset ) elif _contains_cftime_datetimes(array): - if is_duck_dask_array(array): - raise NotImplementedError( - "Computing the mean of an array containing " - "cftime.datetime objects is not yet implemented on " - "dask arrays." - ) offset = min(array) timedeltas = datetime_to_numeric(array, offset, datetime_unit="us") mean_timedeltas = _mean(timedeltas, axis=axis, skipna=skipna, **kwargs) diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index e8550bb12b2..401d969ff73 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -323,18 +323,50 @@ def test_datetime_mean(dask): @requires_cftime -def test_cftime_datetime_mean(): +@pytest.mark.parametrize("dask", [False, True]) +def test_cftime_datetime_mean(dask): + if dask and not has_dask: + pytest.skip("requires dask") + times = cftime_range("2000", periods=4) da = DataArray(times, dims=["time"]) + da_2d = DataArray(times.values.reshape(2, 2)) - assert da.isel(time=0).mean() == da.isel(time=0) + if dask: + da = da.chunk({"time": 2}) + da_2d = da_2d.chunk({"dim_0": 2}) + + expected = da.isel(time=0) + # one compute needed to check the array contains cftime datetimes + with raise_if_dask_computes(max_computes=1): + result = da.isel(time=0).mean() + assert_dask_array(result, dask) + assert_equal(result, expected) expected = DataArray(times.date_type(2000, 1, 2, 12)) - result = da.mean() + with raise_if_dask_computes(max_computes=1): + result = da.mean() + assert_dask_array(result, dask) assert_equal(result, expected) - da_2d = DataArray(times.values.reshape(2, 2)) - result = da_2d.mean() + with raise_if_dask_computes(max_computes=1): + result = da_2d.mean() + assert_dask_array(result, dask) + assert_equal(result, expected) + + +@requires_cftime +def test_mean_over_non_time_dim_of_dataset_with_dask_backed_cftime_data(): + # Regression test for part two of GH issue 5897: averaging over a non-time + # dimension still fails if the time variable is dask-backed. + ds = Dataset( + { + "var1": (("time",), cftime_range("2021-10-31", periods=10, freq="D")), + "var2": (("x",), list(range(10))), + } + ) + expected = ds.mean("x") + result = ds.chunk({}).mean("x") assert_equal(result, expected) @@ -372,15 +404,6 @@ def test_cftime_datetime_mean_long_time_period(): assert_equal(result, expected) -@requires_cftime -@requires_dask -def test_cftime_datetime_mean_dask_error(): - times = cftime_range("2000", periods=4) - da = DataArray(times, dims=["time"]).chunk() - with pytest.raises(NotImplementedError): - da.mean() - - def test_empty_axis_dtype(): ds = Dataset() ds["pos"] = [1, 2, 3] @@ -742,6 +765,17 @@ def test_datetime_to_numeric_cftime(dask): expected = 24 * np.arange(0, 35, 7).astype(dtype) np.testing.assert_array_equal(result, expected) + with raise_if_dask_computes(): + if dask: + time = dask.array.asarray(times[1]) + else: + time = np.asarray(times[1]) + result = duck_array_ops.datetime_to_numeric( + time, offset=times[0], datetime_unit="h", dtype=int + ) + expected = np.array(24 * 7).astype(int) + np.testing.assert_array_equal(result, expected) + @requires_cftime def test_datetime_to_numeric_potential_overflow(): From c3e8bf97996a1d373d529a9a582f232f1af570dc Mon Sep 17 00:00:00 2001 From: spencerkclark Date: Sun, 21 Aug 2022 19:25:37 -0400 Subject: [PATCH 2/8] Add requires_dask decorator --- xarray/tests/test_duck_array_ops.py | 1 + 1 file changed, 1 insertion(+) diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index 401d969ff73..16aff59a6d5 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -356,6 +356,7 @@ def test_cftime_datetime_mean(dask): @requires_cftime +@requires_dask def test_mean_over_non_time_dim_of_dataset_with_dask_backed_cftime_data(): # Regression test for part two of GH issue 5897: averaging over a non-time # dimension still fails if the time variable is dask-backed. From 47d5596d9a62d457528c895c4ed777578c175553 Mon Sep 17 00:00:00 2001 From: spencerkclark Date: Sun, 21 Aug 2022 19:44:57 -0400 Subject: [PATCH 3/8] Address dask compatibility issue --- xarray/core/duck_array_ops.py | 10 ++++++++++ xarray/tests/__init__.py | 3 +++ xarray/tests/test_duck_array_ops.py | 8 +++++--- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index 2bf05abb96a..ccc8dd25342 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -13,6 +13,7 @@ import numpy as np import pandas as pd +from packaging.version import Version from numpy import all as array_all # noqa from numpy import any as array_any # noqa from numpy import zeros_like # noqa @@ -567,6 +568,15 @@ def mean(array, axis=None, skipna=None, **kwargs): + offset ) elif _contains_cftime_datetimes(array): + if is_duck_dask_array(array): + import dask + + if Version(dask.__version__) < Version("2021.07.0"): + raise NotImplementedError( + "Computing the mean of an array containing " + "cftime.datetime objects requires at least dask " + "version 2021.07.0." + ) offset = min(array) timedeltas = datetime_to_numeric(array, offset, datetime_unit="us") mean_timedeltas = _mean(timedeltas, axis=axis, skipna=skipna, **kwargs) diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index ff477a40891..d9bd612d09e 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -67,6 +67,9 @@ def _importorskip(modname: str, minversion: str | None = None) -> tuple[bool, An has_cftime, requires_cftime = _importorskip("cftime") has_cftime_1_4_1, requires_cftime_1_4_1 = _importorskip("cftime", minversion="1.4.1") has_dask, requires_dask = _importorskip("dask") +has_dask_2021_07_0, requires_dask_2021_07_0 = _importorskip( + "dask", minversion="2021.07.0" +) has_bottleneck, requires_bottleneck = _importorskip("bottleneck") has_nc_time_axis, requires_nc_time_axis = _importorskip("nc_time_axis") has_rasterio, requires_rasterio = _importorskip("rasterio") diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index 16aff59a6d5..59b17bda96e 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -34,11 +34,13 @@ arm_xfail, assert_array_equal, has_dask, + has_dask_2021_07_0, has_scipy, raise_if_dask_computes, requires_bottleneck, requires_cftime, requires_dask, + requires_dask_2021_07_0, ) @@ -325,8 +327,8 @@ def test_datetime_mean(dask): @requires_cftime @pytest.mark.parametrize("dask", [False, True]) def test_cftime_datetime_mean(dask): - if dask and not has_dask: - pytest.skip("requires dask") + # if dask and not has_dask_2021_07_0: + # pytest.skip("requires dask >= 2021.07.0") times = cftime_range("2000", periods=4) da = DataArray(times, dims=["time"]) @@ -356,7 +358,7 @@ def test_cftime_datetime_mean(dask): @requires_cftime -@requires_dask +@requires_dask_2021_07_0 def test_mean_over_non_time_dim_of_dataset_with_dask_backed_cftime_data(): # Regression test for part two of GH issue 5897: averaging over a non-time # dimension still fails if the time variable is dask-backed. From e2b46f42fa3fd84fd3829bab0e6331e2bae9136d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 21 Aug 2022 23:46:37 +0000 Subject: [PATCH 4/8] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/core/duck_array_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index ccc8dd25342..aaf9c008ceb 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -13,7 +13,6 @@ import numpy as np import pandas as pd -from packaging.version import Version from numpy import all as array_all # noqa from numpy import any as array_any # noqa from numpy import zeros_like # noqa @@ -23,6 +22,7 @@ from numpy import stack as _stack from numpy import take, tensordot, transpose, unravel_index # noqa from numpy import where as _where +from packaging.version import Version from . import dask_array_compat, dask_array_ops, dtypes, npcompat, nputils from .nputils import nanfirst, nanlast From 09cc08741c964fc45b835620994d139c31795edd Mon Sep 17 00:00:00 2001 From: spencerkclark Date: Sun, 21 Aug 2022 20:00:59 -0400 Subject: [PATCH 5/8] Uncomment pytest.skip --- xarray/tests/test_duck_array_ops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index 59b17bda96e..8dc659bba87 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -327,8 +327,8 @@ def test_datetime_mean(dask): @requires_cftime @pytest.mark.parametrize("dask", [False, True]) def test_cftime_datetime_mean(dask): - # if dask and not has_dask_2021_07_0: - # pytest.skip("requires dask >= 2021.07.0") + if dask and not has_dask_2021_07_0: + pytest.skip("requires dask >= 2021.07.0") times = cftime_range("2000", periods=4) da = DataArray(times, dims=["time"]) From 8b1d5cd181cf16eb9efe29a3db9db4b91902a6b9 Mon Sep 17 00:00:00 2001 From: spencerkclark Date: Tue, 23 Aug 2022 20:37:09 -0400 Subject: [PATCH 6/8] Bump minimum dask version; remove compatibility code --- ci/requirements/min-all-deps.yml | 2 +- xarray/core/duck_array_ops.py | 9 --------- xarray/tests/__init__.py | 3 --- xarray/tests/test_duck_array_ops.py | 8 +++----- 4 files changed, 4 insertions(+), 18 deletions(-) diff --git a/ci/requirements/min-all-deps.yml b/ci/requirements/min-all-deps.yml index 34879af730b..140392e0de9 100644 --- a/ci/requirements/min-all-deps.yml +++ b/ci/requirements/min-all-deps.yml @@ -15,7 +15,7 @@ dependencies: - cfgrib=0.9 - cftime=1.4 - coveralls - - dask-core=2021.04 + - dask-core=2021.08.0 - distributed=2021.04 - flox=0.5 - h5netcdf=0.11 diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index aaf9c008ceb..8b40d38543d 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -568,15 +568,6 @@ def mean(array, axis=None, skipna=None, **kwargs): + offset ) elif _contains_cftime_datetimes(array): - if is_duck_dask_array(array): - import dask - - if Version(dask.__version__) < Version("2021.07.0"): - raise NotImplementedError( - "Computing the mean of an array containing " - "cftime.datetime objects requires at least dask " - "version 2021.07.0." - ) offset = min(array) timedeltas = datetime_to_numeric(array, offset, datetime_unit="us") mean_timedeltas = _mean(timedeltas, axis=axis, skipna=skipna, **kwargs) diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index d9bd612d09e..ff477a40891 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -67,9 +67,6 @@ def _importorskip(modname: str, minversion: str | None = None) -> tuple[bool, An has_cftime, requires_cftime = _importorskip("cftime") has_cftime_1_4_1, requires_cftime_1_4_1 = _importorskip("cftime", minversion="1.4.1") has_dask, requires_dask = _importorskip("dask") -has_dask_2021_07_0, requires_dask_2021_07_0 = _importorskip( - "dask", minversion="2021.07.0" -) has_bottleneck, requires_bottleneck = _importorskip("bottleneck") has_nc_time_axis, requires_nc_time_axis = _importorskip("nc_time_axis") has_rasterio, requires_rasterio = _importorskip("rasterio") diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index 8dc659bba87..16aff59a6d5 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -34,13 +34,11 @@ arm_xfail, assert_array_equal, has_dask, - has_dask_2021_07_0, has_scipy, raise_if_dask_computes, requires_bottleneck, requires_cftime, requires_dask, - requires_dask_2021_07_0, ) @@ -327,8 +325,8 @@ def test_datetime_mean(dask): @requires_cftime @pytest.mark.parametrize("dask", [False, True]) def test_cftime_datetime_mean(dask): - if dask and not has_dask_2021_07_0: - pytest.skip("requires dask >= 2021.07.0") + if dask and not has_dask: + pytest.skip("requires dask") times = cftime_range("2000", periods=4) da = DataArray(times, dims=["time"]) @@ -358,7 +356,7 @@ def test_cftime_datetime_mean(dask): @requires_cftime -@requires_dask_2021_07_0 +@requires_dask def test_mean_over_non_time_dim_of_dataset_with_dask_backed_cftime_data(): # Regression test for part two of GH issue 5897: averaging over a non-time # dimension still fails if the time variable is dask-backed. From e8a3eb38e7ea4dd7939296c979fc0a3ab6f70ea3 Mon Sep 17 00:00:00 2001 From: spencerkclark Date: Tue, 23 Aug 2022 20:40:50 -0400 Subject: [PATCH 7/8] Remove unused import --- doc/whats-new.rst | 4 ++-- xarray/core/duck_array_ops.py | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 82fb5536eaf..7b6a9c2e9f1 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -52,11 +52,11 @@ Bug fixes - Make FacetGrid.set_titles send kwargs correctly using `handle.udpate(kwargs)`. (:issue:`6839`, :pull:`6843`) By `Oliver Lopez `_. +- Fix bug where index variables would be changed inplace (:issue:`6931`, :pull:`6938`) + By `Michael Niklas `_. - Allow taking the mean over non-time dimensions of datasets containing dask-backed cftime arrays (:issue:`5897`, :pull:`6950`). By `Spencer Clark `_. -- Fix bug where index variables would be changed inplace (:issue:`6931`, :pull:`6938`) - By `Michael Niklas `_. Documentation ~~~~~~~~~~~~~ diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index 8b40d38543d..2bf05abb96a 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -22,7 +22,6 @@ from numpy import stack as _stack from numpy import take, tensordot, transpose, unravel_index # noqa from numpy import where as _where -from packaging.version import Version from . import dask_array_compat, dask_array_ops, dtypes, npcompat, nputils from .nputils import nanfirst, nanlast From 1c59580bd3694dec94e3e95f93196a419675353a Mon Sep 17 00:00:00 2001 From: Spencer Clark Date: Tue, 23 Aug 2022 21:15:59 -0400 Subject: [PATCH 8/8] Update min-all-deps.yml --- ci/requirements/min-all-deps.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/requirements/min-all-deps.yml b/ci/requirements/min-all-deps.yml index 140392e0de9..8ba7e901973 100644 --- a/ci/requirements/min-all-deps.yml +++ b/ci/requirements/min-all-deps.yml @@ -16,7 +16,7 @@ dependencies: - cftime=1.4 - coveralls - dask-core=2021.08.0 - - distributed=2021.04 + - distributed=2021.08.0 - flox=0.5 - h5netcdf=0.11 - h5py=3.1