From 79f9cd4f934d8a4c7d02ab941087ca74e9a7cdc3 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 16 Dec 2022 14:43:01 +0100 Subject: [PATCH 01/10] Implement ea accumulate for datetimelike --- .../array_algos/datetimelike_accumulations.py | 72 +++++++++++++++++++ pandas/core/arrays/datetimelike.py | 25 +++---- pandas/core/arrays/timedeltas.py | 5 +- pandas/tests/series/test_cumulative.py | 4 +- 4 files changed, 84 insertions(+), 22 deletions(-) create mode 100644 pandas/core/array_algos/datetimelike_accumulations.py diff --git a/pandas/core/array_algos/datetimelike_accumulations.py b/pandas/core/array_algos/datetimelike_accumulations.py new file mode 100644 index 0000000000000..ff1b795b7364a --- /dev/null +++ b/pandas/core/array_algos/datetimelike_accumulations.py @@ -0,0 +1,72 @@ +from __future__ import annotations + +from typing import Callable + +import numpy as np + +from pandas._libs import iNaT + +from pandas.core.dtypes.missing import isna + +""" +datetimelke_accumulations.py is for accumulations of datetimelike extension arrays +""" + + +def _cum_func( + func: Callable, + values: np.ndarray, + *, + skipna: bool = True, +): + """ + Accumulations for 1D datetimelike arrays. + + Parameters + ---------- + func : np.cumsum, np.cumprod, np.maximum.accumulate, np.minimum.accumulate + values : np.ndarray + Numpy array with the values (can be of any dtype that support the + operation). + skipna : bool, default True + Whether to skip NA. + """ + try: + fill_value = { + np.cumprod: 1, + np.maximum.accumulate: np.iinfo(np.int64).min, + np.cumsum: 0, + np.minimum.accumulate: np.iinfo(np.int64).max, + }[func] + except KeyError: + raise ValueError(f"No accumulation for {func} implemented on BaseMaskedArray") + + mask = isna(values) + y = values.view("i8") + y[mask] = fill_value + + if not skipna: + mask = np.maximum.accumulate(mask) + + result = func(y) + result[mask] = iNaT + + if values.dtype.kind in ["m", "M"]: + return result.view(values.dtype) + return result + + +def cumsum(values: np.ndarray, *, skipna: bool = True): + return _cum_func(np.cumsum, values, skipna=skipna) + + +def cumprod(values: np.ndarray, *, skipna: bool = True): + return _cum_func(np.cumprod, values, skipna=skipna) + + +def cummin(values: np.ndarray, *, skipna: bool = True): + return _cum_func(np.minimum.accumulate, values, skipna=skipna) + + +def cummax(values: np.ndarray, *, skipna: bool = True): + return _cum_func(np.maximum.accumulate, values, skipna=skipna) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 63940741c3fe3..a7e2669015f65 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -122,6 +122,7 @@ isin, unique1d, ) +from pandas.core.array_algos import datetimelike_accumulations from pandas.core.arraylike import OpsMixin from pandas.core.arrays._mixins import ( NDArrayBackedExtensionArray, @@ -1380,25 +1381,15 @@ def _addsub_object_array(self, other: np.ndarray, op): return result def _accumulate(self, name: str, *, skipna: bool = True, **kwargs): + if name not in {"cummin", "cummax"}: + raise TypeError(f"Accumulation {name} not supported for {type(self)}") - if is_period_dtype(self.dtype): - data = self - else: - # Incompatible types in assignment (expression has type - # "ndarray[Any, Any]", variable has type "DatetimeLikeArrayMixin" - data = self._ndarray.copy() # type: ignore[assignment] - - if name in {"cummin", "cummax"}: - func = np.minimum.accumulate if name == "cummin" else np.maximum.accumulate - result = cast(np.ndarray, nanops.na_accum_func(data, func, skipna=skipna)) - - # error: Unexpected keyword argument "freq" for - # "_simple_new" of "NDArrayBacked" [call-arg] - return type(self)._simple_new( - result, freq=self.freq, dtype=self.dtype # type: ignore[call-arg] - ) + op = getattr(datetimelike_accumulations, name) + result = op(self, skipna=skipna, **kwargs) - raise TypeError(f"Accumulation {name} not supported for {type(self)}") + return type(self)._simple_new( + result, freq=self.freq, dtype=self.dtype # type: ignore[call-arg] + ) @unpack_zerodim_and_defer("__add__") def __add__(self, other): diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index aa1b826ef0876..28f969ed4f7fb 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -414,10 +414,9 @@ def std( # Accumulations def _accumulate(self, name: str, *, skipna: bool = True, **kwargs): - - data = self._ndarray.copy() - if name == "cumsum": + data = self._ndarray.copy() + func = np.cumsum result = cast(np.ndarray, nanops.na_accum_func(data, func, skipna=skipna)) diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py index 6fbc78bf3ff0f..85b365ab74c13 100644 --- a/pandas/tests/series/test_cumulative.py +++ b/pandas/tests/series/test_cumulative.py @@ -70,12 +70,12 @@ def test_cummin_cummax(self, datetime_series, method): [ "cummax", False, - ["NaT", "2 days", "2 days", "2 days", "2 days", "3 days"], + ["NaT", "NaT", "NaT", "NaT", "NaT", "NaT"], ], [ "cummin", False, - ["NaT", "2 days", "2 days", "1 days", "1 days", "1 days"], + ["NaT", "NaT", "NaT", "NaT", "NaT", "NaT"], ], ], ) From 6fd82e349ca640adfca50dd5f4d13eb4e1331272 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 16 Dec 2022 14:46:48 +0100 Subject: [PATCH 02/10] Fix dtype cast --- pandas/core/array_algos/datetimelike_accumulations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/array_algos/datetimelike_accumulations.py b/pandas/core/array_algos/datetimelike_accumulations.py index ff1b795b7364a..79a459dff9678 100644 --- a/pandas/core/array_algos/datetimelike_accumulations.py +++ b/pandas/core/array_algos/datetimelike_accumulations.py @@ -52,7 +52,7 @@ def _cum_func( result[mask] = iNaT if values.dtype.kind in ["m", "M"]: - return result.view(values.dtype) + return result.view(values.dtype.base) return result From 4aa501903467e26224a51bfd8a4f9341d4469518 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 16 Dec 2022 14:49:53 +0100 Subject: [PATCH 03/10] Remove from nanops --- pandas/core/nanops.py | 47 +------------------------------------------ 1 file changed, 1 insertion(+), 46 deletions(-) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index b8e2b1fafe326..32604b3b5b3b5 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1712,52 +1712,7 @@ def na_accum_func(values: ArrayLike, accum_func, *, skipna: bool) -> ArrayLike: }[accum_func] # We will be applying this function to block values - if values.dtype.kind in ["m", "M"]: - # GH#30460, GH#29058 - # numpy 1.18 started sorting NaTs at the end instead of beginning, - # so we need to work around to maintain backwards-consistency. - orig_dtype = values.dtype - - # We need to define mask before masking NaTs - mask = isna(values) - - y = values.view("i8") - # Note: the accum_func comparison fails as an "is" comparison - changed = accum_func == np.minimum.accumulate - - try: - if changed: - y[mask] = lib.i8max - - result = accum_func(y, axis=0) - finally: - if changed: - # restore NaT elements - y[mask] = iNaT - - if skipna: - result[mask] = iNaT - elif accum_func == np.minimum.accumulate: - # Restore NaTs that we masked previously - nz = (~np.asarray(mask)).nonzero()[0] - if len(nz): - # everything up to the first non-na entry stays NaT - result[: nz[0]] = iNaT - - if isinstance(values.dtype, np.dtype): - result = result.view(orig_dtype) - else: - # DatetimeArray/TimedeltaArray - # TODO: have this case go through a DTA method? - # For DatetimeTZDtype, view result as M8[ns] - npdtype = orig_dtype if isinstance(orig_dtype, np.dtype) else "M8[ns]" - # Item "type" of "Union[Type[ExtensionArray], Type[ndarray[Any, Any]]]" - # has no attribute "_simple_new" - result = type(values)._simple_new( # type: ignore[union-attr] - result.view(npdtype), dtype=orig_dtype - ) - - elif skipna and not issubclass(values.dtype.type, (np.integer, np.bool_)): + if skipna and not issubclass(values.dtype.type, (np.integer, np.bool_)): vals = values.copy() mask = isna(vals) vals[mask] = mask_a From d9653c0c9aaf7578d60f54362a16b66393e39966 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 16 Dec 2022 14:55:00 +0100 Subject: [PATCH 04/10] Add period tests --- pandas/core/arrays/datetimelike.py | 2 +- pandas/tests/series/test_cumulative.py | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index a7e2669015f65..0e48b3e1a0227 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1385,7 +1385,7 @@ def _accumulate(self, name: str, *, skipna: bool = True, **kwargs): raise TypeError(f"Accumulation {name} not supported for {type(self)}") op = getattr(datetimelike_accumulations, name) - result = op(self, skipna=skipna, **kwargs) + result = op(self.copy(), skipna=skipna, **kwargs) return type(self)._simple_new( result, freq=self.freq, dtype=self.dtype # type: ignore[call-arg] diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py index 85b365ab74c13..4c5fd2d44e4f4 100644 --- a/pandas/tests/series/test_cumulative.py +++ b/pandas/tests/series/test_cumulative.py @@ -91,6 +91,26 @@ def test_cummin_cummax_datetimelike(self, ts, method, skipna, exp_tdi): result = getattr(ser, method)(skipna=skipna) tm.assert_series_equal(expected, result) + @pytest.mark.parametrize( + "func, exp", + [ + ("cummin", pd.Period("2012-1-1", freq="D")), + ("cummax", pd.Period("2012-1-2", freq="D")), + ], + ) + def test_cummin_cummax_period(self, func, exp): + # GH#28385 + ser = pd.Series( + [pd.Period("2012-1-1", freq="D"), pd.NaT, pd.Period("2012-1-2", freq="D")] + ) + result = getattr(ser, func)(skipna=False) + expected = pd.Series([pd.Period("2012-1-1", freq="D"), pd.NaT, pd.NaT]) + tm.assert_series_equal(result, expected) + + result = getattr(ser, func)(skipna=True) + expected = pd.Series([pd.Period("2012-1-1", freq="D"), pd.NaT, exp]) + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( "arg", [ From 4ffa49731814f24067a9e248ce978494799f905a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 16 Dec 2022 15:54:56 +0100 Subject: [PATCH 05/10] Move comment --- pandas/core/array_algos/datetimelike_accumulations.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/core/array_algos/datetimelike_accumulations.py b/pandas/core/array_algos/datetimelike_accumulations.py index 79a459dff9678..ec84bdb93749e 100644 --- a/pandas/core/array_algos/datetimelike_accumulations.py +++ b/pandas/core/array_algos/datetimelike_accumulations.py @@ -1,3 +1,6 @@ +""" +datetimelke_accumulations.py is for accumulations of datetimelike extension arrays +""" from __future__ import annotations from typing import Callable @@ -8,10 +11,6 @@ from pandas.core.dtypes.missing import isna -""" -datetimelke_accumulations.py is for accumulations of datetimelike extension arrays -""" - def _cum_func( func: Callable, From 8305cf3e29df9bfd4c50cf92725edb5b43ef05df Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 16 Dec 2022 15:55:07 +0100 Subject: [PATCH 06/10] Move comment --- pandas/core/array_algos/datetimelike_accumulations.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/array_algos/datetimelike_accumulations.py b/pandas/core/array_algos/datetimelike_accumulations.py index ec84bdb93749e..0a8a204ad61c3 100644 --- a/pandas/core/array_algos/datetimelike_accumulations.py +++ b/pandas/core/array_algos/datetimelike_accumulations.py @@ -1,6 +1,7 @@ """ datetimelke_accumulations.py is for accumulations of datetimelike extension arrays """ + from __future__ import annotations from typing import Callable From 9487b86f448d5ddc1c5d4cf56dc725eaf8bb1985 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 16 Dec 2022 22:06:13 +0100 Subject: [PATCH 07/10] Address review --- pandas/core/array_algos/datetimelike_accumulations.py | 9 ++------- pandas/core/arrays/timedeltas.py | 7 +++---- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/pandas/core/array_algos/datetimelike_accumulations.py b/pandas/core/array_algos/datetimelike_accumulations.py index 0a8a204ad61c3..b28fd9a0a3345 100644 --- a/pandas/core/array_algos/datetimelike_accumulations.py +++ b/pandas/core/array_algos/datetimelike_accumulations.py @@ -27,13 +27,12 @@ def _cum_func( func : np.cumsum, np.cumprod, np.maximum.accumulate, np.minimum.accumulate values : np.ndarray Numpy array with the values (can be of any dtype that support the - operation). + operation). Values is changed is modified inplace. skipna : bool, default True Whether to skip NA. """ try: fill_value = { - np.cumprod: 1, np.maximum.accumulate: np.iinfo(np.int64).min, np.cumsum: 0, np.minimum.accumulate: np.iinfo(np.int64).max, @@ -56,14 +55,10 @@ def _cum_func( return result -def cumsum(values: np.ndarray, *, skipna: bool = True): +def cumsum(values: np.ndarray, *, skipna: bool = True) -> np.ndarray: return _cum_func(np.cumsum, values, skipna=skipna) -def cumprod(values: np.ndarray, *, skipna: bool = True): - return _cum_func(np.cumprod, values, skipna=skipna) - - def cummin(values: np.ndarray, *, skipna: bool = True): return _cum_func(np.minimum.accumulate, values, skipna=skipna) diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 28f969ed4f7fb..2508c618269cc 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -60,6 +60,7 @@ from pandas.core.dtypes.missing import isna from pandas.core import nanops +from pandas.core.array_algos import datetimelike_accumulations from pandas.core.arrays import datetimelike as dtl from pandas.core.arrays._ranges import generate_regular_range import pandas.core.common as com @@ -415,10 +416,8 @@ def std( def _accumulate(self, name: str, *, skipna: bool = True, **kwargs): if name == "cumsum": - data = self._ndarray.copy() - - func = np.cumsum - result = cast(np.ndarray, nanops.na_accum_func(data, func, skipna=skipna)) + op = getattr(datetimelike_accumulations, name) + result = op(self._ndarray.copy(), skipna=skipna, **kwargs) return type(self)._simple_new(result, freq=None, dtype=self.dtype) elif name == "cumprod": From eb55b3cd07c73a6328677f70ff2dd0714d573986 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 17 Dec 2022 20:26:47 +0100 Subject: [PATCH 08/10] Dont retain freq --- pandas/core/arrays/datetimelike.py | 2 +- .../arrays/datetimes/test_accumulator.py | 31 +++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) create mode 100644 pandas/tests/arrays/datetimes/test_accumulator.py diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 0e48b3e1a0227..03b656821a83d 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1388,7 +1388,7 @@ def _accumulate(self, name: str, *, skipna: bool = True, **kwargs): result = op(self.copy(), skipna=skipna, **kwargs) return type(self)._simple_new( - result, freq=self.freq, dtype=self.dtype # type: ignore[call-arg] + result, freq=None, dtype=self.dtype # type: ignore[call-arg] ) @unpack_zerodim_and_defer("__add__") diff --git a/pandas/tests/arrays/datetimes/test_accumulator.py b/pandas/tests/arrays/datetimes/test_accumulator.py new file mode 100644 index 0000000000000..ecb37ea71e079 --- /dev/null +++ b/pandas/tests/arrays/datetimes/test_accumulator.py @@ -0,0 +1,31 @@ +import pandas._testing as tm +from pandas.core.arrays import DatetimeArray + + +class TestAccumulator: + def test_accumulators_freq(self): + # GH#50297 + arr = DatetimeArray._from_sequence_not_strict( + [ + "2000-01-01", + "2000-01-02", + "2000-01-03", + ], + freq="D", + ) + result = arr._accumulate("cummin") + expected = DatetimeArray._from_sequence_not_strict( + ["2000-01-01"] * 3, freq=None + ) + tm.assert_datetime_array_equal(result, expected) + + result = arr._accumulate("cummax") + expected = DatetimeArray._from_sequence_not_strict( + [ + "2000-01-01", + "2000-01-02", + "2000-01-03", + ], + freq=None, + ) + tm.assert_datetime_array_equal(result, expected) From d1fa417eb2b7b165f370406803ca2f4299064490 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 4 Jan 2023 22:01:23 +0100 Subject: [PATCH 09/10] Add tests --- .../array_algos/datetimelike_accumulations.py | 2 +- .../arrays/datetimes/test_accumulator.py | 15 +++++++++++++++ .../arrays/timedeltas/test_accumulator.py | 19 +++++++++++++++++++ 3 files changed, 35 insertions(+), 1 deletion(-) create mode 100644 pandas/tests/arrays/timedeltas/test_accumulator.py diff --git a/pandas/core/array_algos/datetimelike_accumulations.py b/pandas/core/array_algos/datetimelike_accumulations.py index b28fd9a0a3345..d0c622742126b 100644 --- a/pandas/core/array_algos/datetimelike_accumulations.py +++ b/pandas/core/array_algos/datetimelike_accumulations.py @@ -24,7 +24,7 @@ def _cum_func( Parameters ---------- - func : np.cumsum, np.cumprod, np.maximum.accumulate, np.minimum.accumulate + func : np.cumsum, np.maximum.accumulate, np.minimum.accumulate values : np.ndarray Numpy array with the values (can be of any dtype that support the operation). Values is changed is modified inplace. diff --git a/pandas/tests/arrays/datetimes/test_accumulator.py b/pandas/tests/arrays/datetimes/test_accumulator.py index ecb37ea71e079..ca9760d58770a 100644 --- a/pandas/tests/arrays/datetimes/test_accumulator.py +++ b/pandas/tests/arrays/datetimes/test_accumulator.py @@ -1,3 +1,5 @@ +import pytest + import pandas._testing as tm from pandas.core.arrays import DatetimeArray @@ -29,3 +31,16 @@ def test_accumulators_freq(self): freq=None, ) tm.assert_datetime_array_equal(result, expected) + + @pytest.mark.parametrize("func", ["cumsum", "cumprod"]) + def test_accumulators_disallowed(self, func): + # GH#50297 + arr = DatetimeArray._from_sequence_not_strict( + [ + "2000-01-01", + "2000-01-02", + ], + freq="D", + ) + with pytest.raises(TypeError, match=f"Accumulation {func}"): + arr._accumulate(func) diff --git a/pandas/tests/arrays/timedeltas/test_accumulator.py b/pandas/tests/arrays/timedeltas/test_accumulator.py new file mode 100644 index 0000000000000..b321dc05bef27 --- /dev/null +++ b/pandas/tests/arrays/timedeltas/test_accumulator.py @@ -0,0 +1,19 @@ +import pytest + +import pandas._testing as tm +from pandas.core.arrays import TimedeltaArray + + +class TestAccumulator: + def test_accumulators_disallowed(self): + # GH#50297 + arr = TimedeltaArray._from_sequence_not_strict(["1D", "2D"]) + with pytest.raises(TypeError, match="cumprod not supported"): + arr._accumulate("cumprod") + + def test_cumsum(self): + # GH#50297 + arr = TimedeltaArray._from_sequence_not_strict(["1D", "2D"]) + result = arr._accumulate("cumsum") + expected = TimedeltaArray._from_sequence_not_strict(["1D", "3D"]) + tm.assert_timedelta_array_equal(result, expected) From 177284a34c2393b965526b3224af8023092c7ff7 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 7 Jan 2023 19:11:01 +0100 Subject: [PATCH 10/10] Address review --- pandas/core/nanops.py | 3 +++ .../datetimes/{test_accumulator.py => test_cumulative.py} | 0 .../timedeltas/{test_accumulator.py => test_cumulative.py} | 0 3 files changed, 3 insertions(+) rename pandas/tests/arrays/datetimes/{test_accumulator.py => test_cumulative.py} (100%) rename pandas/tests/arrays/timedeltas/{test_accumulator.py => test_cumulative.py} (100%) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 32604b3b5b3b5..37df18cf86ae0 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1711,6 +1711,9 @@ def na_accum_func(values: ArrayLike, accum_func, *, skipna: bool) -> ArrayLike: np.minimum.accumulate: (np.inf, np.nan), }[accum_func] + # This should go through ea interface + assert values.dtype.kind not in ["m", "M"] + # We will be applying this function to block values if skipna and not issubclass(values.dtype.type, (np.integer, np.bool_)): vals = values.copy() diff --git a/pandas/tests/arrays/datetimes/test_accumulator.py b/pandas/tests/arrays/datetimes/test_cumulative.py similarity index 100% rename from pandas/tests/arrays/datetimes/test_accumulator.py rename to pandas/tests/arrays/datetimes/test_cumulative.py diff --git a/pandas/tests/arrays/timedeltas/test_accumulator.py b/pandas/tests/arrays/timedeltas/test_cumulative.py similarity index 100% rename from pandas/tests/arrays/timedeltas/test_accumulator.py rename to pandas/tests/arrays/timedeltas/test_cumulative.py