From 6f7c961356a3aff856c1285f933f0616065d1870 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 13 Jul 2021 14:15:22 -0700 Subject: [PATCH 1/7] BUG: Groupby.min/max with Int64 --- pandas/_libs/groupby.pyx | 36 ++++++++++++++++++++---- pandas/core/arrays/masked.py | 2 ++ pandas/core/groupby/ops.py | 22 +++++++++++++-- pandas/tests/groupby/test_min_max.py | 42 ++++++++++++++++++++++++++++ 4 files changed, 94 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 354b87e03e6c4..47d51375b3d1d 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1176,7 +1176,9 @@ cdef group_min_max(groupby_t[:, ::1] out, const intp_t[:] labels, Py_ssize_t min_count=-1, bint is_datetimelike=False, - bint compute_max=True): + bint compute_max=True, + uint8_t[:, ::1] mask=None, + uint8_t[:, ::1] mask_out=None): """ Compute minimum/maximum of columns of `values`, in row groups `labels`. @@ -1197,6 +1199,12 @@ cdef group_min_max(groupby_t[:, ::1] out, True if `values` contains datetime-like entries. compute_max : bint, default True True to compute group-wise max, False to compute min + mask : ndarray[bool, ndim=2], optional + If not None, indices represent missing values, + otherwise the mask will not be used + mask_out : ndarray[bool, ndim=2], optional + If not None, these specify locations in the output that are NA. + Modified in-place. Notes ----- @@ -1209,6 +1217,8 @@ cdef group_min_max(groupby_t[:, ::1] out, ndarray[groupby_t, ndim=2] group_min_or_max bint runtime_error = False int64_t[:, ::1] nobs + bint uses_mask = mask is not None + bint isna_entry # TODO(cython 3.0): # Instead of `labels.shape[0]` use `len(labels)` @@ -1243,7 +1253,12 @@ cdef group_min_max(groupby_t[:, ::1] out, for j in range(K): val = values[i, j] - if not _treat_as_na(val, is_datetimelike): + if uses_mask: + isna_entry = mask[i, j] + else: + isna_entry = _treat_as_na(val, is_datetimelike) + + if not isna_entry: nobs[lab, j] += 1 if compute_max: if val > group_min_or_max[lab, j]: @@ -1259,7 +1274,10 @@ cdef group_min_max(groupby_t[:, ::1] out, runtime_error = True break else: - out[i, j] = nan_val + if uses_mask: + mask_out[i, j] = True + else: + out[i, j] = nan_val else: out[i, j] = group_min_or_max[i, j] @@ -1276,7 +1294,9 @@ def group_max(groupby_t[:, ::1] out, ndarray[groupby_t, ndim=2] values, const intp_t[:] labels, Py_ssize_t min_count=-1, - bint is_datetimelike=False) -> None: + bint is_datetimelike=False, + uint8_t[:, ::1] mask=None, + uint8_t[:, ::1] mask_out=None) -> None: """See group_min_max.__doc__""" group_min_max( out, @@ -1286,6 +1306,8 @@ def group_max(groupby_t[:, ::1] out, min_count=min_count, is_datetimelike=is_datetimelike, compute_max=True, + mask=mask, + mask_out=mask_out, ) @@ -1296,7 +1318,9 @@ def group_min(groupby_t[:, ::1] out, ndarray[groupby_t, ndim=2] values, const intp_t[:] labels, Py_ssize_t min_count=-1, - bint is_datetimelike=False) -> None: + bint is_datetimelike=False, + uint8_t[:, ::1] mask=None, + uint8_t[:, ::1] mask_out=None) -> None: """See group_min_max.__doc__""" group_min_max( out, @@ -1306,6 +1330,8 @@ def group_min(groupby_t[:, ::1] out, min_count=min_count, is_datetimelike=is_datetimelike, compute_max=False, + mask=mask, + mask_out=mask_out, ) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 3a152bd5889b7..b9afdb8091496 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -123,6 +123,8 @@ def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): raise 
ValueError("values must be a 1D array") if mask.ndim != 1: raise ValueError("mask must be a 1D array") + if values.shape != mask.shape: + raise ValueError("values and mask must have same shape") if copy: values = values.copy() diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 36fbda5974ea0..32bea30fd3dc7 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -142,7 +142,7 @@ def __init__(self, kind: str, how: str): }, } - _MASKED_CYTHON_FUNCTIONS = {"cummin", "cummax"} + _MASKED_CYTHON_FUNCTIONS = {"cummin", "cummax", "min", "max"} _cython_arity = {"ohlc": 4} # OHLC @@ -408,6 +408,7 @@ def _masked_ea_wrap_cython_operation( # Copy to ensure input and result masks don't end up shared mask = values._mask.copy() + mask_out = np.zeros(ngroups, dtype=bool) arr = values._data res_values = self._cython_op_ndim_compat( @@ -416,13 +417,18 @@ def _masked_ea_wrap_cython_operation( ngroups=ngroups, comp_ids=comp_ids, mask=mask, + mask_out=mask_out, **kwargs, ) + dtype = self._get_result_dtype(orig_values.dtype) assert isinstance(dtype, BaseMaskedDtype) cls = dtype.construct_array_type() - return cls(res_values.astype(dtype.type, copy=False), mask) + if self.kind != "aggregate": + return cls(res_values.astype(dtype.type, copy=False), mask) + else: + return cls(res_values.astype(dtype.type, copy=False), mask_out) @final def _cython_op_ndim_compat( @@ -432,7 +438,8 @@ def _cython_op_ndim_compat( min_count: int, ngroups: int, comp_ids: np.ndarray, - mask: np.ndarray | None, + mask: np.ndarray | None = None, + mask_out: np.ndarray | None = None, **kwargs, ) -> np.ndarray: if values.ndim == 1: @@ -440,12 +447,15 @@ def _cython_op_ndim_compat( values2d = values[None, :] if mask is not None: mask = mask[None, :] + if mask_out is not None: + mask_out = mask_out[None, :] res = self._call_cython_op( values2d, min_count=min_count, ngroups=ngroups, comp_ids=comp_ids, mask=mask, + mask_out=mask_out, **kwargs, ) if res.shape[0] == 1: @@ -460,6 +470,7 @@ def _cython_op_ndim_compat( ngroups=ngroups, comp_ids=comp_ids, mask=mask, + mask_out=mask_out, **kwargs, ) @@ -472,6 +483,7 @@ def _call_cython_op( ngroups: int, comp_ids: np.ndarray, mask: np.ndarray | None, + mask_out: np.ndarray | None, **kwargs, ) -> np.ndarray: # np.ndarray[ndim=2] orig_values = values @@ -497,6 +509,8 @@ def _call_cython_op( values = values.T if mask is not None: mask = mask.T + if mask_out is not None: + mask_out = mask_out.T out_shape = self._get_output_shape(ngroups, values) func, values = self.get_cython_func_and_vals(values, is_numeric) @@ -512,6 +526,8 @@ def _call_cython_op( values, comp_ids, min_count, + mask=mask, + mask_out=mask_out, is_datetimelike=is_datetimelike, ) else: diff --git a/pandas/tests/groupby/test_min_max.py b/pandas/tests/groupby/test_min_max.py index 25a57d24e04ef..af2f1e1f49d7d 100644 --- a/pandas/tests/groupby/test_min_max.py +++ b/pandas/tests/groupby/test_min_max.py @@ -176,3 +176,45 @@ def test_aggregate_categorical_lost_index(func: str): expected["B"] = expected["B"].astype(ds.dtype) tm.assert_frame_equal(result, expected) + + +def test_groupby_min_nullable(): + # GH#41743 + ts = 1618556707013635762 + + df = DataFrame({"id": [2, 2], "ts": [ts, ts + 1]}) + df["ts"] = df["ts"].astype("Int64") + + gb = df.groupby("id") + + result = gb.min() + expected = df.iloc[:1].set_index("id") + tm.assert_frame_equal(result, expected) + + res_max = gb.max() + expected_max = df.iloc[1:].set_index("id") + tm.assert_frame_equal(res_max, expected_max) + + result2 = gb.min(min_count=3) + 
expected2 = DataFrame({"ts": [pd.NA]}, index=expected.index, dtype="Int64") + tm.assert_frame_equal(result2, expected2) + + res_max2 = gb.max(min_count=3) + tm.assert_frame_equal(res_max2, expected2) + + # Case with NA values + df2 = DataFrame({"id": [2, 2, 2], "ts": [ts, pd.NA, ts + 1]}) + df2["ts"] = df2["ts"].astype("Int64") + gb2 = df2.groupby("id") + + result3 = gb2.min() + tm.assert_frame_equal(result3, expected) + + res_max3 = gb2.max() + tm.assert_frame_equal(res_max3, expected_max) + + result4 = gb2.min(min_count=100) + tm.assert_frame_equal(result4, expected2) + + res_max4 = gb2.max(min_count=100) + tm.assert_frame_equal(res_max4, expected2) From 8f985015bb2a80c29b6c4e769421601e96beabe4 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 15 Jul 2021 12:56:11 -0700 Subject: [PATCH 2/7] flesh out test cases --- pandas/tests/groupby/test_min_max.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/pandas/tests/groupby/test_min_max.py b/pandas/tests/groupby/test_min_max.py index af2f1e1f49d7d..8aee71b59860a 100644 --- a/pandas/tests/groupby/test_min_max.py +++ b/pandas/tests/groupby/test_min_max.py @@ -178,12 +178,18 @@ def test_aggregate_categorical_lost_index(func: str): tm.assert_frame_equal(result, expected) -def test_groupby_min_nullable(): - # GH#41743 - ts = 1618556707013635762 +@pytest.mark.parametrize("dtype", ["Int64", "Int32", "Float64", "Float32", "boolean"]) +def test_groupby_min_nullable(dtype): + if dtype == "Int64": + # GH#41743 avoid precision loss + ts = 1618556707013635762 + elif dtype == "boolean": + ts = 0 + else: + ts = 4.0 df = DataFrame({"id": [2, 2], "ts": [ts, ts + 1]}) - df["ts"] = df["ts"].astype("Int64") + df["ts"] = df["ts"].astype(dtype) gb = df.groupby("id") @@ -196,7 +202,7 @@ def test_groupby_min_nullable(): tm.assert_frame_equal(res_max, expected_max) result2 = gb.min(min_count=3) - expected2 = DataFrame({"ts": [pd.NA]}, index=expected.index, dtype="Int64") + expected2 = DataFrame({"ts": [pd.NA]}, index=expected.index, dtype=dtype) tm.assert_frame_equal(result2, expected2) res_max2 = gb.max(min_count=3) @@ -204,7 +210,7 @@ def test_groupby_min_nullable(): # Case with NA values df2 = DataFrame({"id": [2, 2, 2], "ts": [ts, pd.NA, ts + 1]}) - df2["ts"] = df2["ts"].astype("Int64") + df2["ts"] = df2["ts"].astype(dtype) gb2 = df2.groupby("id") result3 = gb2.min() From b325cc04efd7748b3171cab262c8c95ee4ee6f76 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 16 Jul 2021 14:17:15 -0700 Subject: [PATCH 3/7] const, rename test --- pandas/_libs/groupby.pyi | 4 ++++ pandas/_libs/groupby.pyx | 4 ++-- pandas/tests/groupby/test_min_max.py | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index 7b1dcbe562123..d92b499eaaa49 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -123,6 +123,8 @@ def group_max( values: np.ndarray, # ndarray[groupby_t, ndim=2] labels: np.ndarray, # const int64_t[:] min_count: int = ..., + mask: np.ndarray | None = ..., + mask_out: np.ndarray | None = ..., ) -> None: ... def group_min( out: np.ndarray, # groupby_t[:, ::1] @@ -130,6 +132,8 @@ def group_min( values: np.ndarray, # ndarray[groupby_t, ndim=2] labels: np.ndarray, # const int64_t[:] min_count: int = ..., + mask: np.ndarray | None = ..., + mask_out: np.ndarray | None = ..., ) -> None: ... 
def group_cummin( out: np.ndarray, # groupby_t[:, ::1] diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 47d51375b3d1d..b71caad0305df 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1295,7 +1295,7 @@ def group_max(groupby_t[:, ::1] out, const intp_t[:] labels, Py_ssize_t min_count=-1, bint is_datetimelike=False, - uint8_t[:, ::1] mask=None, + const uint8_t[:, ::1] mask=None, uint8_t[:, ::1] mask_out=None) -> None: """See group_min_max.__doc__""" group_min_max( @@ -1319,7 +1319,7 @@ def group_min(groupby_t[:, ::1] out, const intp_t[:] labels, Py_ssize_t min_count=-1, bint is_datetimelike=False, - uint8_t[:, ::1] mask=None, + const uint8_t[:, ::1] mask=None, uint8_t[:, ::1] mask_out=None) -> None: """See group_min_max.__doc__""" group_min_max( diff --git a/pandas/tests/groupby/test_min_max.py b/pandas/tests/groupby/test_min_max.py index 8aee71b59860a..68ea577a91850 100644 --- a/pandas/tests/groupby/test_min_max.py +++ b/pandas/tests/groupby/test_min_max.py @@ -179,7 +179,7 @@ def test_aggregate_categorical_lost_index(func: str): @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Float64", "Float32", "boolean"]) -def test_groupby_min_nullable(dtype): +def test_groupby_min_max_nullable(dtype): if dtype == "Int64": # GH#41743 avoid precision loss ts = 1618556707013635762 From 921ad337fdfa8f8b9fdf757f52668e0c8d2a4937 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 26 Jul 2021 16:00:31 -0700 Subject: [PATCH 4/7] whatsnew --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index d8d00db47e03d..12b95d0acd5a3 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -260,7 +260,7 @@ Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Fixed bug in :meth:`SeriesGroupBy.apply` where passing an unrecognized string argument failed to raise ``TypeError`` when the underlying ``Series`` is empty (:issue:`42021`) - Bug in :meth:`Series.rolling.apply`, :meth:`DataFrame.rolling.apply`, :meth:`Series.expanding.apply` and :meth:`DataFrame.expanding.apply` with ``engine="numba"`` where ``*args`` were being cached with the user passed function (:issue:`42287`) -- +- Bug in :meth:`GroupBy.max` and :meth:`GroupBy.min` with nullable integer dtypes losing precision (:issue:`41743`) Reshaping ^^^^^^^^^ From 5f4ea99e11ea3d265e3054eb28efb642353764ac Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 30 Jul 2021 19:40:50 -0700 Subject: [PATCH 5/7] mask -> mask_in --- pandas/_libs/groupby.pyx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 1f68f54a5f04d..e92bb4000cb02 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1177,7 +1177,7 @@ cdef group_min_max(groupby_t[:, ::1] out, Py_ssize_t min_count=-1, bint is_datetimelike=False, bint compute_max=True, - const uint8_t[:, ::1] mask=None, + const uint8_t[:, ::1] mask_in=None, uint8_t[:, ::1] mask_out=None): """ Compute minimum/maximum of columns of `values`, in row groups `labels`. @@ -1199,7 +1199,7 @@ cdef group_min_max(groupby_t[:, ::1] out, True if `values` contains datetime-like entries. 
compute_max : bint, default True True to compute group-wise max, False to compute min - mask : ndarray[bool, ndim=2], optional + mask_in : ndarray[bool, ndim=2], optional If not None, indices represent missing values, otherwise the mask will not be used mask_out : ndarray[bool, ndim=2], optional @@ -1217,7 +1217,7 @@ cdef group_min_max(groupby_t[:, ::1] out, ndarray[groupby_t, ndim=2] group_min_or_max bint runtime_error = False int64_t[:, ::1] nobs - bint uses_mask = mask is not None + bint uses_mask = mask_in is not None bint isna_entry # TODO(cython 3.0): @@ -1254,7 +1254,7 @@ cdef group_min_max(groupby_t[:, ::1] out, val = values[i, j] if uses_mask: - isna_entry = mask[i, j] + isna_entry = mask_in[i, j] else: isna_entry = _treat_as_na(val, is_datetimelike) @@ -1306,7 +1306,7 @@ def group_max(groupby_t[:, ::1] out, min_count=min_count, is_datetimelike=is_datetimelike, compute_max=True, - mask=mask, + mask_in=mask, mask_out=mask_out, ) @@ -1330,7 +1330,7 @@ def group_min(groupby_t[:, ::1] out, min_count=min_count, is_datetimelike=is_datetimelike, compute_max=False, - mask=mask, + mask_in=mask, mask_out=mask_out, ) From 35def869bc94f9f76f1fa4dd3429363e13f4637d Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 8 Aug 2021 20:44:59 -0700 Subject: [PATCH 6/7] mask_out -> result_mask --- pandas/_libs/groupby.pyi | 4 ++-- pandas/_libs/groupby.pyx | 14 +++++++------- pandas/core/groupby/ops.py | 24 ++++++++++++------------ 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index d92b499eaaa49..b363524e4e592 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -124,7 +124,7 @@ def group_max( labels: np.ndarray, # const int64_t[:] min_count: int = ..., mask: np.ndarray | None = ..., - mask_out: np.ndarray | None = ..., + result_mask: np.ndarray | None = ..., ) -> None: ... def group_min( out: np.ndarray, # groupby_t[:, ::1] @@ -133,7 +133,7 @@ def group_min( labels: np.ndarray, # const int64_t[:] min_count: int = ..., mask: np.ndarray | None = ..., - mask_out: np.ndarray | None = ..., + result_mask: np.ndarray | None = ..., ) -> None: ... def group_cummin( out: np.ndarray, # groupby_t[:, ::1] diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index ecfca447a378d..823b4d07feafc 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1178,7 +1178,7 @@ cdef group_min_max(groupby_t[:, ::1] out, bint is_datetimelike=False, bint compute_max=True, const uint8_t[:, ::1] mask_in=None, - uint8_t[:, ::1] mask_out=None): + uint8_t[:, ::1] result_mask=None): """ Compute minimum/maximum of columns of `values`, in row groups `labels`. @@ -1202,7 +1202,7 @@ cdef group_min_max(groupby_t[:, ::1] out, mask_in : ndarray[bool, ndim=2], optional If not None, indices represent missing values, otherwise the mask will not be used - mask_out : ndarray[bool, ndim=2], optional + result_mask : ndarray[bool, ndim=2], optional If not None, these specify locations in the output that are NA. Modified in-place. 
@@ -1275,7 +1275,7 @@ cdef group_min_max(groupby_t[:, ::1] out, break else: if uses_mask: - mask_out[i, j] = True + result_mask[i, j] = True else: out[i, j] = nan_val else: @@ -1296,7 +1296,7 @@ def group_max(groupby_t[:, ::1] out, Py_ssize_t min_count=-1, bint is_datetimelike=False, const uint8_t[:, ::1] mask=None, - uint8_t[:, ::1] mask_out=None) -> None: + uint8_t[:, ::1] result_mask=None) -> None: """See group_min_max.__doc__""" group_min_max( out, @@ -1307,7 +1307,7 @@ def group_max(groupby_t[:, ::1] out, is_datetimelike=is_datetimelike, compute_max=True, mask_in=mask, - mask_out=mask_out, + result_mask=result_mask, ) @@ -1320,7 +1320,7 @@ def group_min(groupby_t[:, ::1] out, Py_ssize_t min_count=-1, bint is_datetimelike=False, const uint8_t[:, ::1] mask=None, - uint8_t[:, ::1] mask_out=None) -> None: + uint8_t[:, ::1] result_mask=None) -> None: """See group_min_max.__doc__""" group_min_max( out, @@ -1331,7 +1331,7 @@ def group_min(groupby_t[:, ::1] out, is_datetimelike=is_datetimelike, compute_max=False, mask_in=mask, - mask_out=mask_out, + result_mask=result_mask, ) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 35a8e1fff38b0..78ce52a3b2444 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -408,7 +408,7 @@ def _masked_ea_wrap_cython_operation( # Copy to ensure input and result masks don't end up shared mask = values._mask.copy() - mask_out = np.zeros(ngroups, dtype=bool) + result_mask = np.zeros(ngroups, dtype=bool) arr = values._data res_values = self._cython_op_ndim_compat( @@ -417,7 +417,7 @@ def _masked_ea_wrap_cython_operation( ngroups=ngroups, comp_ids=comp_ids, mask=mask, - mask_out=mask_out, + result_mask=result_mask, **kwargs, ) @@ -428,7 +428,7 @@ def _masked_ea_wrap_cython_operation( if self.kind != "aggregate": return cls(res_values.astype(dtype.type, copy=False), mask) else: - return cls(res_values.astype(dtype.type, copy=False), mask_out) + return cls(res_values.astype(dtype.type, copy=False), result_mask) @final def _cython_op_ndim_compat( @@ -439,7 +439,7 @@ def _cython_op_ndim_compat( ngroups: int, comp_ids: np.ndarray, mask: np.ndarray | None = None, - mask_out: np.ndarray | None = None, + result_mask: np.ndarray | None = None, **kwargs, ) -> np.ndarray: if values.ndim == 1: @@ -447,15 +447,15 @@ def _cython_op_ndim_compat( values2d = values[None, :] if mask is not None: mask = mask[None, :] - if mask_out is not None: - mask_out = mask_out[None, :] + if result_mask is not None: + result_mask = result_mask[None, :] res = self._call_cython_op( values2d, min_count=min_count, ngroups=ngroups, comp_ids=comp_ids, mask=mask, - mask_out=mask_out, + result_mask=result_mask, **kwargs, ) if res.shape[0] == 1: @@ -470,7 +470,7 @@ def _cython_op_ndim_compat( ngroups=ngroups, comp_ids=comp_ids, mask=mask, - mask_out=mask_out, + result_mask=result_mask, **kwargs, ) @@ -483,7 +483,7 @@ def _call_cython_op( ngroups: int, comp_ids: np.ndarray, mask: np.ndarray | None, - mask_out: np.ndarray | None, + result_mask: np.ndarray | None, **kwargs, ) -> np.ndarray: # np.ndarray[ndim=2] orig_values = values @@ -509,8 +509,8 @@ def _call_cython_op( values = values.T if mask is not None: mask = mask.T - if mask_out is not None: - mask_out = mask_out.T + if result_mask is not None: + result_mask = result_mask.T out_shape = self._get_output_shape(ngroups, values) func, values = self.get_cython_func_and_vals(values, is_numeric) @@ -527,7 +527,7 @@ def _call_cython_op( comp_ids, min_count, mask=mask, - mask_out=mask_out, + 
result_mask=result_mask, is_datetimelike=is_datetimelike, ) else: From 60431052d860d836b250c0c2ab0e1a061810da00 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 31 Aug 2021 14:38:15 -0700 Subject: [PATCH 7/7] mask_in->mask --- pandas/_libs/groupby.pyx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 20d9d5bfa3df0..40e1049c39588 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1183,7 +1183,7 @@ cdef group_min_max(groupby_t[:, ::1] out, Py_ssize_t min_count=-1, bint is_datetimelike=False, bint compute_max=True, - const uint8_t[:, ::1] mask_in=None, + const uint8_t[:, ::1] mask=None, uint8_t[:, ::1] result_mask=None): """ Compute minimum/maximum of columns of `values`, in row groups `labels`. @@ -1205,7 +1205,7 @@ cdef group_min_max(groupby_t[:, ::1] out, True if `values` contains datetime-like entries. compute_max : bint, default True True to compute group-wise max, False to compute min - mask_in : ndarray[bool, ndim=2], optional + mask : ndarray[bool, ndim=2], optional If not None, indices represent missing values, otherwise the mask will not be used result_mask : ndarray[bool, ndim=2], optional @@ -1223,7 +1223,7 @@ cdef group_min_max(groupby_t[:, ::1] out, ndarray[groupby_t, ndim=2] group_min_or_max bint runtime_error = False int64_t[:, ::1] nobs - bint uses_mask = mask_in is not None + bint uses_mask = mask is not None bint isna_entry # TODO(cython 3.0): @@ -1260,7 +1260,7 @@ cdef group_min_max(groupby_t[:, ::1] out, val = values[i, j] if uses_mask: - isna_entry = mask_in[i, j] + isna_entry = mask[i, j] else: isna_entry = _treat_as_na(val, is_datetimelike) @@ -1312,7 +1312,7 @@ def group_max(groupby_t[:, ::1] out, min_count=min_count, is_datetimelike=is_datetimelike, compute_max=True, - mask_in=mask, + mask=mask, result_mask=result_mask, ) @@ -1336,7 +1336,7 @@ def group_min(groupby_t[:, ::1] out, min_count=min_count, is_datetimelike=is_datetimelike, compute_max=False, - mask_in=mask, + mask=mask, result_mask=result_mask, )
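
For reference, a minimal usage sketch (not itself part of any patch above, and assuming a pandas build that includes this series) mirroring the behavior exercised by test_groupby_min_max_nullable from patches 1-2: with the masked group_min/group_max path, nullable Int64 values are no longer routed through float64, so large integers keep full precision, and an unsatisfied min_count surfaces as pd.NA via the result mask instead of a float NaN.

    # Illustrative sketch only; expected results are taken from the tests added above.
    import pandas as pd

    ts = 1618556707013635762  # larger than 2**53, so a float64 round-trip would lose precision

    df = pd.DataFrame({"id": [2, 2], "ts": [ts, ts + 1]})
    df["ts"] = df["ts"].astype("Int64")
    gb = df.groupby("id")

    # Exact integers come back, since the Cython kernel now consumes the mask
    # directly instead of casting the masked array to float64.
    print(gb.min())  # expected: ts, dtype Int64
    print(gb.max())  # expected: ts + 1, dtype Int64

    # min_count larger than the group size marks the output position in
    # result_mask, so the aggregate is pd.NA with Int64 dtype.
    print(gb.min(min_count=3))  # expected: <NA>, dtype Int64

The same pattern applies to the other masked dtypes covered by the parametrized test (Int32, Float64, Float32, boolean).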