From 2c2e7832c004e33eddb6b02314b28358252574b8 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 7 Feb 2023 19:07:30 -0800 Subject: [PATCH 1/3] BUG: groupby.agg with numba and as_index=False --- pandas/core/groupby/generic.py | 18 ++++------------- pandas/core/groupby/groupby.py | 21 ++++++++++++++------ pandas/tests/groupby/aggregate/test_numba.py | 5 +++-- pandas/tests/groupby/transform/test_numba.py | 5 +++-- 4 files changed, 25 insertions(+), 24 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index fd07b472fc3da..c17c231c3b1bb 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -219,16 +219,9 @@ def apply(self, func, *args, **kwargs) -> Series: def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): if maybe_use_numba(engine): - data = self._obj_with_exclusions - result = self._aggregate_with_numba( - data.to_frame(), func, *args, engine_kwargs=engine_kwargs, **kwargs + return self._aggregate_with_numba( + func, *args, engine_kwargs=engine_kwargs, **kwargs ) - index = self.grouper.result_index - result = self.obj._constructor(result.ravel(), index=index, name=data.name) - if not self.as_index: - result = self._insert_inaxis_grouper(result) - result.index = default_index(len(result)) - return result relabeling = func is None columns = None @@ -1264,12 +1257,9 @@ class DataFrameGroupBy(GroupBy[DataFrame]): def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): if maybe_use_numba(engine): - data = self._obj_with_exclusions - result = self._aggregate_with_numba( - data, func, *args, engine_kwargs=engine_kwargs, **kwargs + return self._aggregate_with_numba( + func, *args, engine_kwargs=engine_kwargs, **kwargs ) - index = self.grouper.result_index - return self.obj._constructor(result, index=index, columns=data.columns) relabeling, func, columns, order = reconstruct_func(func, **kwargs) func = maybe_mangle_lambdas(func) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index a54c524094b23..67e37a86d9409 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1270,9 +1270,7 @@ def _transform_with_numba( return result.take(np.argsort(sorted_index), axis=0) @final - def _aggregate_with_numba( - self, data: DataFrame, func, *args, engine_kwargs=None, **kwargs - ): + def _aggregate_with_numba(self, func, *args, engine_kwargs=None, **kwargs): """ Perform groupby aggregation routine with the numba engine. @@ -1280,7 +1278,10 @@ def _aggregate_with_numba( to generate the indices of each group in the sorted data and then passes the data and indices into a Numba jitted function. """ - starts, ends, sorted_index, sorted_data = self._numba_prep(data) + data = self._obj_with_exclusions + df = data if data.ndim == 2 else data.to_frame() + + starts, ends, sorted_index, sorted_data = self._numba_prep(df) numba_.validate_udf(func) numba_agg_func = numba_.generate_numba_agg_func( func, **get_jit_arguments(engine_kwargs, kwargs) @@ -1290,10 +1291,18 @@ def _aggregate_with_numba( sorted_index, starts, ends, - len(data.columns), + len(df.columns), *args, ) - return result + + index = self.grouper.result_index + if data.ndim == 1: + result_kwargs = {"name": data.name} + result = result.ravel() + else: + result_kwargs = {"columns": data.columns} + result = data._constructor(result, index=index, **result_kwargs) + return self._wrap_aggregated_output(result) # ----------------------------------------------------------------- # apply/agg/transform diff --git a/pandas/tests/groupby/aggregate/test_numba.py b/pandas/tests/groupby/aggregate/test_numba.py index 0b2fb56a02006..9dd3d1d45abf0 100644 --- a/pandas/tests/groupby/aggregate/test_numba.py +++ b/pandas/tests/groupby/aggregate/test_numba.py @@ -51,7 +51,8 @@ def incorrect_function(values, index): # Filter warnings when parallel=True and the function can't be parallelized by Numba @pytest.mark.parametrize("jit", [True, False]) @pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"]) -def test_numba_vs_cython(jit, pandas_obj, nogil, parallel, nopython): +@pytest.mark.parametrize("as_index", [True, False]) +def test_numba_vs_cython(jit, pandas_obj, nogil, parallel, nopython, as_index): def func_numba(values, index): return np.mean(values) * 2.7 @@ -65,7 +66,7 @@ def func_numba(values, index): {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1] ) engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} - grouped = data.groupby(0) + grouped = data.groupby(0, as_index=as_index) if pandas_obj == "Series": grouped = grouped[1] diff --git a/pandas/tests/groupby/transform/test_numba.py b/pandas/tests/groupby/transform/test_numba.py index 2b70d7325a209..0264d2a09778f 100644 --- a/pandas/tests/groupby/transform/test_numba.py +++ b/pandas/tests/groupby/transform/test_numba.py @@ -48,7 +48,8 @@ def incorrect_function(values, index): # Filter warnings when parallel=True and the function can't be parallelized by Numba @pytest.mark.parametrize("jit", [True, False]) @pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"]) -def test_numba_vs_cython(jit, pandas_obj, nogil, parallel, nopython): +@pytest.mark.parametrize("as_index", [True, False]) +def test_numba_vs_cython(jit, pandas_obj, nogil, parallel, nopython, as_index): def func(values, index): return values + 1 @@ -62,7 +63,7 @@ def func(values, index): {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1] ) engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} - grouped = data.groupby(0) + grouped = data.groupby(0, as_index=as_index) if pandas_obj == "Series": grouped = grouped[1] From 8d403582455e4251df953d3af9a5acc45a867420 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 8 Feb 2023 07:27:10 -0800 Subject: [PATCH 2/3] smaller-diff implementation --- pandas/core/groupby/generic.py | 22 ++++++++++++++++++---- pandas/core/groupby/groupby.py | 21 ++++++--------------- 2 files changed, 24 insertions(+), 19 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index c17c231c3b1bb..be5e58f3cd4ef 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -219,9 +219,16 @@ def apply(self, func, *args, **kwargs) -> Series: def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): if maybe_use_numba(engine): - return self._aggregate_with_numba( - func, *args, engine_kwargs=engine_kwargs, **kwargs + data = self._obj_with_exclusions + result = self._aggregate_with_numba( + data.to_frame(), func, *args, engine_kwargs=engine_kwargs, **kwargs ) + index = self.grouper.result_index + result = self.obj._constructor(result.ravel(), index=index, name=data.name) + if not self.as_index: + result = self._insert_inaxis_grouper(result) + result.index = default_index(len(result)) + return result relabeling = func is None columns = None @@ -1257,9 +1264,16 @@ class DataFrameGroupBy(GroupBy[DataFrame]): def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): if maybe_use_numba(engine): - return self._aggregate_with_numba( - func, *args, engine_kwargs=engine_kwargs, **kwargs + data = self._obj_with_exclusions + result = self._aggregate_with_numba( + data, func, *args, engine_kwargs=engine_kwargs, **kwargs ) + index = self.grouper.result_index + result = self.obj._constructor(result, index=index, columns=data.columns) + if not self.as_index: + result = self._insert_inaxis_grouper(result) + result.index = default_index(len(result)) + return result relabeling, func, columns, order = reconstruct_func(func, **kwargs) func = maybe_mangle_lambdas(func) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 67e37a86d9409..a54c524094b23 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1270,7 +1270,9 @@ def _transform_with_numba( return result.take(np.argsort(sorted_index), axis=0) @final - def _aggregate_with_numba(self, func, *args, engine_kwargs=None, **kwargs): + def _aggregate_with_numba( + self, data: DataFrame, func, *args, engine_kwargs=None, **kwargs + ): """ Perform groupby aggregation routine with the numba engine. @@ -1278,10 +1280,7 @@ def _aggregate_with_numba(self, func, *args, engine_kwargs=None, **kwargs): to generate the indices of each group in the sorted data and then passes the data and indices into a Numba jitted function. """ - data = self._obj_with_exclusions - df = data if data.ndim == 2 else data.to_frame() - - starts, ends, sorted_index, sorted_data = self._numba_prep(df) + starts, ends, sorted_index, sorted_data = self._numba_prep(data) numba_.validate_udf(func) numba_agg_func = numba_.generate_numba_agg_func( func, **get_jit_arguments(engine_kwargs, kwargs) @@ -1291,18 +1290,10 @@ def _aggregate_with_numba(self, func, *args, engine_kwargs=None, **kwargs): sorted_index, starts, ends, - len(df.columns), + len(data.columns), *args, ) - - index = self.grouper.result_index - if data.ndim == 1: - result_kwargs = {"name": data.name} - result = result.ravel() - else: - result_kwargs = {"columns": data.columns} - result = data._constructor(result, index=index, **result_kwargs) - return self._wrap_aggregated_output(result) + return result # ----------------------------------------------------------------- # apply/agg/transform From f76ce9bdd66a3ca6564265167cc6c324d817e855 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 8 Feb 2023 09:16:19 -0800 Subject: [PATCH 3/3] Whatsnew --- doc/source/whatsnew/v2.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index f907e89880d25..3717e9b011f1c 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1297,6 +1297,7 @@ Groupby/resample/rolling - Bug in :meth:`.DataFrameGroupBy.resample` raises ``KeyError`` when getting the result from a key list when resampling on time index (:issue:`50840`) - Bug in :meth:`.DataFrameGroupBy.transform` and :meth:`.SeriesGroupBy.transform` would raise incorrectly when grouper had ``axis=1`` for ``"ngroup"`` argument (:issue:`45986`) - Bug in :meth:`.DataFrameGroupBy.describe` produced incorrect results when data had duplicate columns (:issue:`50806`) +- Bug in :meth:`.DataFrameGroupBy.agg` with ``engine="numba"`` failing to respect ``as_index=False`` (:issue:`51228`) - Reshaping