From 071c9fa4626b056135b35fbf898092e3e57ba4ca Mon Sep 17 00:00:00 2001 From: pilkibun Date: Thu, 18 Jul 2019 22:21:16 -0500 Subject: [PATCH 01/42] Rename _is_cython_func to _get_cython_func --- pandas/core/base.py | 4 ++-- pandas/core/groupby/generic.py | 7 ++++--- pandas/core/resample.py | 4 ++-- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 9480e2e425f79..de59126463970 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -565,7 +565,7 @@ def is_any_frame(): else: result = None - f = self._is_cython_func(arg) + f = self._get_cython_func(arg) if f and not args and not kwargs: return getattr(self, f)(), None @@ -652,7 +652,7 @@ def _shallow_copy(self, obj=None, obj_type=None, **kwargs): kwargs[attr] = getattr(self, attr) return obj_type(obj, **kwargs) - def _is_cython_func(self, arg): + def _get_cython_func(self, arg): """ if we define an internal function for this argument, return it """ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 7fd0ca94e7997..66f4794bff589 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -572,7 +572,8 @@ def _transform_general(self, func, *args, **kwargs): def transform(self, func, *args, **kwargs): # optimized transforms - func = self._is_cython_func(func) or func + func = self._get_cython_func(func) or func + if isinstance(func, str): if func in base.cython_transforms: # cythonized transform @@ -852,7 +853,7 @@ def aggregate(self, func_or_funcs=None, *args, **kwargs): if relabeling: ret.columns = columns else: - cyfunc = self._is_cython_func(func_or_funcs) + cyfunc = self._get_cython_func(func_or_funcs) if cyfunc and not args and not kwargs: return getattr(self, cyfunc)() @@ -1004,7 +1005,7 @@ def _aggregate_named(self, func, *args, **kwargs): @Substitution(klass="Series", selected="A.") @Appender(_transform_template) def transform(self, func, *args, **kwargs): - func = self._is_cython_func(func) or func + func = self._get_cython_func(func) or func # if string function if isinstance(func, str): diff --git a/pandas/core/resample.py b/pandas/core/resample.py index b4a3e6ed71bf4..0f15cc54e9c67 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1047,7 +1047,7 @@ def _downsample(self, how, **kwargs): **kwargs : kw args passed to how function """ self._set_binner() - how = self._is_cython_func(how) or how + how = self._get_cython_func(how) or how ax = self.ax obj = self._selected_obj @@ -1197,7 +1197,7 @@ def _downsample(self, how, **kwargs): if self.kind == "timestamp": return super()._downsample(how, **kwargs) - how = self._is_cython_func(how) or how + how = self._get_cython_func(how) or how ax = self.ax if is_subperiod(ax.freq, self.freq): From 0118cc89c4841b2ad7b6208e1f4f29fe6ad6d284 Mon Sep 17 00:00:00 2001 From: pilkibun Date: Thu, 18 Jul 2019 22:21:27 -0500 Subject: [PATCH 02/42] typo --- pandas/core/groupby/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 66f4794bff589..f462460034e38 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -590,7 +590,7 @@ def transform(self, func, *args, **kwargs): obj = self._obj_with_exclusions - # nuiscance columns + # nuisance columns if not result.columns.equals(obj.columns): return self._transform_general(func, *args, **kwargs) From 2f29e4e5d76cadcf86b4d2efea56bc3c2327be35 Mon Sep 17 00:00:00 2001 From: pilkibun Date: Thu, 18 Jul 2019 23:13:35 -0500 Subject: [PATCH 03/42] CLN: whitelist func in transform(func) --- pandas/core/groupby/base.py | 64 +++++++++++++++++++++++++++++++++- pandas/core/groupby/generic.py | 14 +++++--- 2 files changed, 73 insertions(+), 5 deletions(-) diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index 5c4f1fa3fbddf..97dbe890e9afe 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -98,6 +98,68 @@ def _gotitem(self, key, ndim, subset=None): dataframe_apply_whitelist = common_apply_whitelist | frozenset(["dtypes", "corrwith"]) -cython_transforms = frozenset(["cumprod", "cumsum", "shift", "cummin", "cummax"]) +# cythonized transformations or canned "agg+broadcast", which do not +# require postprocessing of the result by transform. +cythonized_kernels = frozenset(["cumprod", "cumsum", "shift", "cummin", "cummax"]) cython_cast_blacklist = frozenset(["rank", "count", "size", "idxmin", "idxmax"]) + +# List of aggregation/reduction functions. +# These map each series/column to a single value +reduction_functions = frozenset( + [ + "sum", + "first", + "mean", + "all", + "any", + "bfill", + "corr", + "count", + "cov", + "idxmax", + "idxmin", + "last", + "mad", + "max", + "median", + "min", + "ngroup", + "nth", + "nunique", + "prod", + "quantile", + "sem", + "size", + "skew", + "std", + "var", + ] +) + +# List of transformation functions. +# These map each object to a like-indexed result object +transformation_functions = frozenset( + [ + "backfill", + "corrwith", + "cumcount", + "cummax", + "cummin", + "cumprod", + "cumsum", + "diff", + "ffill", + "fillna", + "rank", + "pad", + "pct_change", + "shift", + "tshift", + ] +) + +# Valid values of `name` for `groupby.transform(name)` +transform_recognized_functions = reduction_functions | transformation_functions +transform_recognized_functions -= \ + {"corr"} # returns multindex, exclude from transform(name) for now. diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index f462460034e38..da9f2b79482a4 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -575,8 +575,11 @@ def transform(self, func, *args, **kwargs): func = self._get_cython_func(func) or func if isinstance(func, str): - if func in base.cython_transforms: - # cythonized transform + if not (func in base.transform_recognized_functions): + msg = "'%s' is not a valid function name for transform(name)" + raise ValueError(msg % func) + if func in base.cythonized_kernels: + # cythonized transformation or canned "reduction+broadcast" return getattr(self, func)(*args, **kwargs) else: # cythonized aggregation and merge @@ -1009,8 +1012,11 @@ def transform(self, func, *args, **kwargs): # if string function if isinstance(func, str): - if func in base.cython_transforms: - # cythonized transform + if not (func in base.transform_recognized_functions): + msg = "'%s' is not a valid function name for transform(name)" + raise ValueError(msg % func) + if func in base.cythonized_kernels: + # cythonized transform or canned "agg+broadcast" return getattr(self, func)(*args, **kwargs) else: # cythonized aggregation and merge From 45e9f707260698c8282695cb1351b4060fef32ba Mon Sep 17 00:00:00 2001 From: pilkibun Date: Thu, 18 Jul 2019 23:15:24 -0500 Subject: [PATCH 04/42] comments --- pandas/core/groupby/generic.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index da9f2b79482a4..f7690af7f2971 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -582,7 +582,9 @@ def transform(self, func, *args, **kwargs): # cythonized transformation or canned "reduction+broadcast" return getattr(self, func)(*args, **kwargs) else: - # cythonized aggregation and merge + # If func is a reduction, we need to broadcast the + # result to the whole group. Compute func result + # and deal with possible broadcasting below. result = getattr(self, func)(*args, **kwargs) else: return self._transform_general(func, *args, **kwargs) @@ -1010,7 +1012,6 @@ def _aggregate_named(self, func, *args, **kwargs): def transform(self, func, *args, **kwargs): func = self._get_cython_func(func) or func - # if string function if isinstance(func, str): if not (func in base.transform_recognized_functions): msg = "'%s' is not a valid function name for transform(name)" @@ -1019,7 +1020,9 @@ def transform(self, func, *args, **kwargs): # cythonized transform or canned "agg+broadcast" return getattr(self, func)(*args, **kwargs) else: - # cythonized aggregation and merge + # If func is a reduction, we need to broadcast the + # result to the whole group. Compute func result + # and deal with possible broadcasting below. return self._transform_fast( lambda: getattr(self, func)(*args, **kwargs), func ) From 0a1a0fb47b18410ebe9058ac8a54ca5acb9a1621 Mon Sep 17 00:00:00 2001 From: pilkibun Date: Thu, 18 Jul 2019 23:24:22 -0500 Subject: [PATCH 05/42] tests --- pandas/tests/groupby/test_transform.py | 46 ++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 1eab3ba253f4d..f3561e4d60af1 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -1001,3 +1001,49 @@ def test_ffill_not_in_axis(func, key, val): expected = df assert_frame_equal(result, expected) + + +def test_transform_invalid_name_raises(): + df = DataFrame(dict(a=[0, 1, 1, 2])) + g = df.groupby(["a", "b", "b", "c"]) + with pytest.raises(ValueError, match="not a valid function name"): + g.transform("some_arbitrary_name") + + # method exists on the object, but is not a valid transformation/agg + with pytest.raises(ValueError, match="not a valid function name"): + g.transform("aggregate") + + # Test SeriesGroupBy + g = df["a"].groupby(["a", "b", "b", "c"]) + with pytest.raises(ValueError, match="not a valid function name"): + g.transform("some_arbitrary_name") + + # method exists on the object, but is not a valid transformation/agg + with pytest.raises(ValueError, match="not a valid function name"): + g.transform("aggregate") + + +from pandas.core.groupby.base import reduction_functions + + +@pytest.mark.parametrize("func", reduction_functions) +def test_transform_agg_by_name(func): + + df = DataFrame(dict(a=[0, 0, 0, 1, 1, 1], b=range(6))) + g = df.groupby(np.repeat([0, 1], 3)) + + if func == "ngroup": # GH#27468 + pytest.xfail("TODO: g.transform('ngroup') doesn't work") + if func == "size": # GH#27469 + pytest.xfail("TODO: g.transform('size') doesn't work") + if func == "corr": + pytest.xfail("corr returns multiindex, excluded from transform for now") + + args = {"nth": [0], "quantile": [0.5]}.get(func, []) + + print(func) + result = g.transform(func, *args) + tm.assert_index_equal(result.index, df.index) + + # check values replicated broadcasted across group + assert len(set(result.iloc[-3:, 1])) == 1 From b845034992d8985cfc33cc50822ba10a55f5f5c3 Mon Sep 17 00:00:00 2001 From: pilkibun Date: Fri, 19 Jul 2019 01:16:14 -0500 Subject: [PATCH 06/42] black --- pandas/core/groupby/base.py | 5 +++-- pandas/tests/groupby/test_transform.py | 5 ++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index 97dbe890e9afe..8bb102540d3e3 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -161,5 +161,6 @@ def _gotitem(self, key, ndim, subset=None): # Valid values of `name` for `groupby.transform(name)` transform_recognized_functions = reduction_functions | transformation_functions -transform_recognized_functions -= \ - {"corr"} # returns multindex, exclude from transform(name) for now. +transform_recognized_functions -= { + "corr" +} # returns multindex, exclude from transform(name) for now. diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index f3561e4d60af1..17e66e860a5b7 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -18,6 +18,7 @@ concat, date_range, ) +from pandas.core.groupby.base import reduction_functions from pandas.core.groupby.groupby import DataError from pandas.util import testing as tm from pandas.util.testing import assert_frame_equal, assert_series_equal @@ -1010,6 +1011,7 @@ def test_transform_invalid_name_raises(): g.transform("some_arbitrary_name") # method exists on the object, but is not a valid transformation/agg + assert hasattr(g, "aggregate") # make sure the method exists with pytest.raises(ValueError, match="not a valid function name"): g.transform("aggregate") @@ -1023,9 +1025,6 @@ def test_transform_invalid_name_raises(): g.transform("aggregate") -from pandas.core.groupby.base import reduction_functions - - @pytest.mark.parametrize("func", reduction_functions) def test_transform_agg_by_name(func): From f45acd54499a2c8ca00a7c3d44ad5c0647a25529 Mon Sep 17 00:00:00 2001 From: pilkibun Date: Fri, 19 Jul 2019 09:25:57 -0500 Subject: [PATCH 07/42] Deterministic test order for pytest collection --- pandas/tests/groupby/test_transform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 17e66e860a5b7..e8ddf32a4132a 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -1025,7 +1025,7 @@ def test_transform_invalid_name_raises(): g.transform("aggregate") -@pytest.mark.parametrize("func", reduction_functions) +@pytest.mark.parametrize("func", sorted(reduction_functions)) def test_transform_agg_by_name(func): df = DataFrame(dict(a=[0, 0, 0, 1, 1, 1], b=range(6))) From 514528b0949238b4a5126350c146cc90afdf4380 Mon Sep 17 00:00:00 2001 From: pilkibun Date: Fri, 19 Jul 2019 18:08:30 -0500 Subject: [PATCH 08/42] use format() --- pandas/core/groupby/generic.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index f7690af7f2971..a34b89205b4ef 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -576,8 +576,8 @@ def transform(self, func, *args, **kwargs): if isinstance(func, str): if not (func in base.transform_recognized_functions): - msg = "'%s' is not a valid function name for transform(name)" - raise ValueError(msg % func) + msg = "'{func}' is not a valid function name for transform(name)" + raise ValueError(msg.format(func=func)) if func in base.cythonized_kernels: # cythonized transformation or canned "reduction+broadcast" return getattr(self, func)(*args, **kwargs) @@ -1014,8 +1014,8 @@ def transform(self, func, *args, **kwargs): if isinstance(func, str): if not (func in base.transform_recognized_functions): - msg = "'%s' is not a valid function name for transform(name)" - raise ValueError(msg % func) + msg = "'{func}' is not a valid function name for transform(name)" + raise ValueError(msg.format(func=func)) if func in base.cythonized_kernels: # cythonized transform or canned "agg+broadcast" return getattr(self, func)(*args, **kwargs) From d662c9bae9f4aec7441066310b78d1897f81c0e4 Mon Sep 17 00:00:00 2001 From: pilkibun Date: Fri, 19 Jul 2019 18:16:27 -0500 Subject: [PATCH 09/42] delete print func, now parameterized --- pandas/tests/groupby/test_transform.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index e8ddf32a4132a..e48bc245f5674 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -1040,7 +1040,6 @@ def test_transform_agg_by_name(func): args = {"nth": [0], "quantile": [0.5]}.get(func, []) - print(func) result = g.transform(func, *args) tm.assert_index_equal(result.index, df.index) From 3c56a62dce809a4f3d845c6a4b58161a5d3badcc Mon Sep 17 00:00:00 2001 From: pilkibun Date: Fri, 19 Jul 2019 18:25:55 -0500 Subject: [PATCH 10/42] Make ita fixture --- pandas/tests/groupby/conftest.py | 7 +++++++ pandas/tests/groupby/test_transform.py | 6 ++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py index bdf93756b7559..2ff9388b2f1e6 100644 --- a/pandas/tests/groupby/conftest.py +++ b/pandas/tests/groupby/conftest.py @@ -102,3 +102,10 @@ def three_group(): "F": np.random.randn(11), } ) + + +from pandas.core.groupby.base import reduction_functions + +@pytest.fixture(params=sorted(reduction_functions)) +def reduction_func(request): + return request.param diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index e48bc245f5674..d2bf477428dea 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -18,7 +18,6 @@ concat, date_range, ) -from pandas.core.groupby.base import reduction_functions from pandas.core.groupby.groupby import DataError from pandas.util import testing as tm from pandas.util.testing import assert_frame_equal, assert_series_equal @@ -1025,9 +1024,8 @@ def test_transform_invalid_name_raises(): g.transform("aggregate") -@pytest.mark.parametrize("func", sorted(reduction_functions)) -def test_transform_agg_by_name(func): - +def test_transform_agg_by_name(reduction_func): + func = reduction_func df = DataFrame(dict(a=[0, 0, 0, 1, 1, 1], b=range(6))) g = df.groupby(np.repeat([0, 1], 3)) From 52d4a618d2e712570e603e13faa60d6353271ea8 Mon Sep 17 00:00:00 2001 From: pilkibun Date: Fri, 19 Jul 2019 18:30:53 -0500 Subject: [PATCH 11/42] typing --- pandas/core/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index de59126463970..4e8b04f0eea6a 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -5,6 +5,7 @@ from collections import OrderedDict import textwrap import warnings +from typing import Optional import numpy as np @@ -652,7 +653,7 @@ def _shallow_copy(self, obj=None, obj_type=None, **kwargs): kwargs[attr] = getattr(self, attr) return obj_type(obj, **kwargs) - def _get_cython_func(self, arg): + def _get_cython_func(self, arg: str) -> Optional[str]: """ if we define an internal function for this argument, return it """ From a6a681a08e0e050119c4fa6f33f7f80784111407 Mon Sep 17 00:00:00 2001 From: pilkibun Date: Fri, 19 Jul 2019 18:31:12 -0500 Subject: [PATCH 12/42] black --- pandas/core/base.py | 2 +- pandas/tests/groupby/conftest.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 4e8b04f0eea6a..4b20bafcdf941 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -4,8 +4,8 @@ import builtins from collections import OrderedDict import textwrap -import warnings from typing import Optional +import warnings import numpy as np diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py index 2ff9388b2f1e6..a17d0b5b5793c 100644 --- a/pandas/tests/groupby/conftest.py +++ b/pandas/tests/groupby/conftest.py @@ -2,6 +2,7 @@ import pytest from pandas import DataFrame, MultiIndex +from pandas.core.groupby.base import reduction_functions from pandas.util import testing as tm @@ -104,8 +105,6 @@ def three_group(): ) -from pandas.core.groupby.base import reduction_functions - @pytest.fixture(params=sorted(reduction_functions)) def reduction_func(request): return request.param From 4e3ba6353da5f38ecfa1b1b4af871488fd5eb5a5 Mon Sep 17 00:00:00 2001 From: pilkibun Date: Fri, 19 Jul 2019 18:57:12 -0500 Subject: [PATCH 13/42] Link issue --- pandas/tests/groupby/test_transform.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index d2bf477428dea..ba9aba53a4ab5 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -1004,6 +1004,7 @@ def test_ffill_not_in_axis(func, key, val): def test_transform_invalid_name_raises(): + # GH#27486 df = DataFrame(dict(a=[0, 1, 1, 2])) g = df.groupby(["a", "b", "b", "c"]) with pytest.raises(ValueError, match="not a valid function name"): From 0ad156b2498f53392bb9467c9de65a09f6e07613 Mon Sep 17 00:00:00 2001 From: pilkibun Date: Sun, 21 Jul 2019 09:46:31 -0500 Subject: [PATCH 14/42] sort names --- pandas/core/groupby/base.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index 8bb102540d3e3..2cd91f627f7d1 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -108,20 +108,19 @@ def _gotitem(self, key, ndim, subset=None): # These map each series/column to a single value reduction_functions = frozenset( [ - "sum", - "first", - "mean", "all", "any", "bfill", "corr", "count", "cov", + "first", "idxmax", "idxmin", "last", "mad", "max", + "mean", "median", "min", "ngroup", @@ -133,6 +132,7 @@ def _gotitem(self, key, ndim, subset=None): "size", "skew", "std", + "sum", "var", ] ) @@ -151,9 +151,9 @@ def _gotitem(self, key, ndim, subset=None): "diff", "ffill", "fillna", - "rank", "pad", "pct_change", + "rank", "shift", "tshift", ] From b1cb4b0d7e8fc9f3f82d0c44c3ac9b91a722be61 Mon Sep 17 00:00:00 2001 From: pilkibun Date: Sun, 21 Jul 2019 09:47:37 -0500 Subject: [PATCH 15/42] docstring --- pandas/tests/groupby/conftest.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py index a17d0b5b5793c..4121cb56b27e8 100644 --- a/pandas/tests/groupby/conftest.py +++ b/pandas/tests/groupby/conftest.py @@ -107,4 +107,6 @@ def three_group(): @pytest.fixture(params=sorted(reduction_functions)) def reduction_func(request): + """yields the string names of all groupby reduction functions, one at a time. + """ return request.param From d445de4791043ad56eb60392ce069ca3de1c5fa3 Mon Sep 17 00:00:00 2001 From: pilkibun Date: Sun, 21 Jul 2019 09:50:22 -0500 Subject: [PATCH 16/42] rename to use 'kernel' --- pandas/core/groupby/base.py | 8 ++++---- pandas/core/groupby/generic.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index 2cd91f627f7d1..adc612d49b501 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -106,7 +106,7 @@ def _gotitem(self, key, ndim, subset=None): # List of aggregation/reduction functions. # These map each series/column to a single value -reduction_functions = frozenset( +reduction_kernels = frozenset( [ "all", "any", @@ -139,7 +139,7 @@ def _gotitem(self, key, ndim, subset=None): # List of transformation functions. # These map each object to a like-indexed result object -transformation_functions = frozenset( +transformation_kernels = frozenset( [ "backfill", "corrwith", @@ -160,7 +160,7 @@ def _gotitem(self, key, ndim, subset=None): ) # Valid values of `name` for `groupby.transform(name)` -transform_recognized_functions = reduction_functions | transformation_functions -transform_recognized_functions -= { +transform_kernel_whitelist = reduction_kernels | transformation_kernels +transform_kernel_whitelist -= { "corr" } # returns multindex, exclude from transform(name) for now. diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index a34b89205b4ef..1aa15d0aa54ac 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -575,7 +575,7 @@ def transform(self, func, *args, **kwargs): func = self._get_cython_func(func) or func if isinstance(func, str): - if not (func in base.transform_recognized_functions): + if not (func in base.transform_kernel_whitelist): msg = "'{func}' is not a valid function name for transform(name)" raise ValueError(msg.format(func=func)) if func in base.cythonized_kernels: @@ -1013,7 +1013,7 @@ def transform(self, func, *args, **kwargs): func = self._get_cython_func(func) or func if isinstance(func, str): - if not (func in base.transform_recognized_functions): + if not (func in base.transform_kernel_whitelist): msg = "'{func}' is not a valid function name for transform(name)" raise ValueError(msg.format(func=func)) if func in base.cythonized_kernels: From de2c59cf44edf70bbc9e368d39edab05066fdc40 Mon Sep 17 00:00:00 2001 From: pilkibun Date: Sun, 21 Jul 2019 09:54:52 -0500 Subject: [PATCH 17/42] corr doesn't belong on the list --- pandas/core/groupby/base.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index adc612d49b501..f0482a31d959b 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -111,7 +111,6 @@ def _gotitem(self, key, ndim, subset=None): "all", "any", "bfill", - "corr", "count", "cov", "first", @@ -161,6 +160,3 @@ def _gotitem(self, key, ndim, subset=None): # Valid values of `name` for `groupby.transform(name)` transform_kernel_whitelist = reduction_kernels | transformation_kernels -transform_kernel_whitelist -= { - "corr" -} # returns multindex, exclude from transform(name) for now. From bd73cff65df525fb49848348be3548e6b0342a5e Mon Sep 17 00:00:00 2001 From: pilkibun Date: Sun, 21 Jul 2019 09:57:10 -0500 Subject: [PATCH 18/42] comment --- pandas/core/groupby/base.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index f0482a31d959b..b052dcf956779 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -159,4 +159,6 @@ def _gotitem(self, key, ndim, subset=None): ) # Valid values of `name` for `groupby.transform(name)` +# NOTE: do NOT edit this directly. New additions should be inserted +# into the appropriate list above. transform_kernel_whitelist = reduction_kernels | transformation_kernels From 7d7eecc74c8fd318595f2de95e351b37d4fd2f4d Mon Sep 17 00:00:00 2001 From: pilkibun Date: Sun, 21 Jul 2019 10:01:07 -0500 Subject: [PATCH 19/42] whatsnew --- doc/source/whatsnew/v1.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 9caf127553e05..f7bb8f0bd1131 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -39,7 +39,7 @@ Backwards incompatible API changes .. _whatsnew_1000.api.other: -- +- :meth:`GroupBy.transform(name)` now raises on invalid operation names (:issue:`27489`). - Other API changes From d343686e243dac94f0d8dbe4c449c290383660b3 Mon Sep 17 00:00:00 2001 From: pilkibun Date: Sun, 21 Jul 2019 10:49:29 -0500 Subject: [PATCH 20/42] Fix name in conftest.py --- pandas/tests/groupby/conftest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py index 4121cb56b27e8..72e60c5099304 100644 --- a/pandas/tests/groupby/conftest.py +++ b/pandas/tests/groupby/conftest.py @@ -2,7 +2,7 @@ import pytest from pandas import DataFrame, MultiIndex -from pandas.core.groupby.base import reduction_functions +from pandas.core.groupby.base import reduction_kernels from pandas.util import testing as tm @@ -105,7 +105,7 @@ def three_group(): ) -@pytest.fixture(params=sorted(reduction_functions)) +@pytest.fixture(params=sorted(reduction_kernels)) def reduction_func(request): """yields the string names of all groupby reduction functions, one at a time. """ From 6d0301a1a45bf984bff6ed5fcfec9f38656a9655 Mon Sep 17 00:00:00 2001 From: pilkibun Date: Sun, 21 Jul 2019 11:17:42 -0500 Subject: [PATCH 21/42] test SeriesGroupby --- pandas/tests/groupby/test_transform.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index ba9aba53a4ab5..b2a756a226aad 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -1025,10 +1025,16 @@ def test_transform_invalid_name_raises(): g.transform("aggregate") -def test_transform_agg_by_name(reduction_func): +@pytest.mark.parametrize( + "obj", + [ + DataFrame(dict(a=[0, 0, 0, 1, 1, 1], b=range(6))), + Series([0, 0, 0, 1, 1, 1]) + ], +) +def test_transform_agg_by_name(reduction_func, obj): func = reduction_func - df = DataFrame(dict(a=[0, 0, 0, 1, 1, 1], b=range(6))) - g = df.groupby(np.repeat([0, 1], 3)) + g = obj.groupby(np.repeat([0, 1], 3)) if func == "ngroup": # GH#27468 pytest.xfail("TODO: g.transform('ngroup') doesn't work") @@ -1036,11 +1042,13 @@ def test_transform_agg_by_name(reduction_func): pytest.xfail("TODO: g.transform('size') doesn't work") if func == "corr": pytest.xfail("corr returns multiindex, excluded from transform for now") + if func == "cov" and isinstance(obj, Series): + pytest.xfail("skip cov for series") args = {"nth": [0], "quantile": [0.5]}.get(func, []) result = g.transform(func, *args) - tm.assert_index_equal(result.index, df.index) + tm.assert_index_equal(result.index, obj.index) # check values replicated broadcasted across group - assert len(set(result.iloc[-3:, 1])) == 1 + assert len(set(DataFrame(result).iloc[-3:, -1])) == 1 From b6e924ed1c9e51214d278b264a9b52d26a48d980 Mon Sep 17 00:00:00 2001 From: pilkibun Date: Sun, 21 Jul 2019 12:01:33 -0500 Subject: [PATCH 22/42] black --- pandas/tests/groupby/test_transform.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index b2a756a226aad..9d8eea6673d11 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -1027,10 +1027,7 @@ def test_transform_invalid_name_raises(): @pytest.mark.parametrize( "obj", - [ - DataFrame(dict(a=[0, 0, 0, 1, 1, 1], b=range(6))), - Series([0, 0, 0, 1, 1, 1]) - ], + [DataFrame(dict(a=[0, 0, 0, 1, 1, 1], b=range(6))), Series([0, 0, 0, 1, 1, 1])], ) def test_transform_agg_by_name(reduction_func, obj): func = reduction_func From 399804c301a996d994ae7c5d5cf7ff1d97fdfc49 Mon Sep 17 00:00:00 2001 From: pilkibun Date: Sun, 21 Jul 2019 22:56:02 -0500 Subject: [PATCH 23/42] remove cov --- pandas/core/groupby/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index b052dcf956779..421aac411586b 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -112,7 +112,6 @@ def _gotitem(self, key, ndim, subset=None): "any", "bfill", "count", - "cov", "first", "idxmax", "idxmin", From 1d158deb50de05bcd1c8d4415836ec84985f322e Mon Sep 17 00:00:00 2001 From: pilkibun Date: Sun, 21 Jul 2019 22:57:12 -0500 Subject: [PATCH 24/42] bfill is a reduction --- pandas/core/groupby/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index 421aac411586b..64295fe359c36 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -110,7 +110,6 @@ def _gotitem(self, key, ndim, subset=None): [ "all", "any", - "bfill", "count", "first", "idxmax", @@ -147,6 +146,7 @@ def _gotitem(self, key, ndim, subset=None): "cumprod", "cumsum", "diff", + "bfill", "ffill", "fillna", "pad", From 04dadd9573f12e4f3a33b3a3144cc4fcc4f89513 Mon Sep 17 00:00:00 2001 From: pilkibun Date: Sun, 21 Jul 2019 22:57:43 -0500 Subject: [PATCH 25/42] bfill is not a reduction I mean From 146e402253a571a629a15b0a147a461c12b54a03 Mon Sep 17 00:00:00 2001 From: pilkibun Date: Sun, 21 Jul 2019 22:59:27 -0500 Subject: [PATCH 26/42] sort --- pandas/core/groupby/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index 64295fe359c36..93145d680db5d 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -139,6 +139,7 @@ def _gotitem(self, key, ndim, subset=None): transformation_kernels = frozenset( [ "backfill", + "bfill", "corrwith", "cumcount", "cummax", @@ -146,7 +147,6 @@ def _gotitem(self, key, ndim, subset=None): "cumprod", "cumsum", "diff", - "bfill", "ffill", "fillna", "pad", From 932d224706cee19106eb3a8247c76961866fd255 Mon Sep 17 00:00:00 2001 From: pilkibun Date: Sun, 21 Jul 2019 23:01:21 -0500 Subject: [PATCH 27/42] comment --- pandas/core/groupby/base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index 93145d680db5d..8d6ab8c78a808 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -136,6 +136,7 @@ def _gotitem(self, key, ndim, subset=None): # List of transformation functions. # These map each object to a like-indexed result object +# like-indexed means *same* index and *same* columns. transformation_kernels = frozenset( [ "backfill", From 897d2fb2f21f603279e15699a18f81a6d303a874 Mon Sep 17 00:00:00 2001 From: pilkibun Date: Sun, 21 Jul 2019 23:02:43 -0500 Subject: [PATCH 28/42] comment --- pandas/core/groupby/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index 8d6ab8c78a808..db8256e448a7a 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -135,7 +135,7 @@ def _gotitem(self, key, ndim, subset=None): ) # List of transformation functions. -# These map each object to a like-indexed result object +# These map each Series/DataFrame to a like-indexed result object # like-indexed means *same* index and *same* columns. transformation_kernels = frozenset( [ From 6a808281c46a1e347821bc7aef8131587836cbe6 Mon Sep 17 00:00:00 2001 From: pilkibun Date: Sun, 21 Jul 2019 23:04:47 -0500 Subject: [PATCH 29/42] comment --- pandas/core/groupby/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index db8256e448a7a..2c9734b3b6ed6 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -135,7 +135,7 @@ def _gotitem(self, key, ndim, subset=None): ) # List of transformation functions. -# These map each Series/DataFrame to a like-indexed result object +# These map each group to a like-indexed result object # like-indexed means *same* index and *same* columns. transformation_kernels = frozenset( [ From 255519331e88325f89b76787a23ccf7c76f22e78 Mon Sep 17 00:00:00 2001 From: pilkibun Date: Sun, 21 Jul 2019 23:25:18 -0500 Subject: [PATCH 30/42] remove stale test cases --- pandas/tests/groupby/test_transform.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 9d8eea6673d11..56b0bd1df52d6 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -1037,10 +1037,6 @@ def test_transform_agg_by_name(reduction_func, obj): pytest.xfail("TODO: g.transform('ngroup') doesn't work") if func == "size": # GH#27469 pytest.xfail("TODO: g.transform('size') doesn't work") - if func == "corr": - pytest.xfail("corr returns multiindex, excluded from transform for now") - if func == "cov" and isinstance(obj, Series): - pytest.xfail("skip cov for series") args = {"nth": [0], "quantile": [0.5]}.get(func, []) From 3d7bc647ac4acc878421c6edb24225bac9d1f323 Mon Sep 17 00:00:00 2001 From: pilkibun Date: Sun, 21 Jul 2019 23:26:11 -0500 Subject: [PATCH 31/42] remove duplicate test --- pandas/tests/groupby/test_transform.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 56b0bd1df52d6..11919e3125bb6 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -1020,10 +1020,6 @@ def test_transform_invalid_name_raises(): with pytest.raises(ValueError, match="not a valid function name"): g.transform("some_arbitrary_name") - # method exists on the object, but is not a valid transformation/agg - with pytest.raises(ValueError, match="not a valid function name"): - g.transform("aggregate") - @pytest.mark.parametrize( "obj", From 053320b114a0e427128430b01d95ba7d98fa0da6 Mon Sep 17 00:00:00 2001 From: pilkibun Date: Mon, 22 Jul 2019 19:14:09 -0500 Subject: [PATCH 32/42] CI From 6b9041847e2cbeae44d5e2d150904ab8f052d8c9 Mon Sep 17 00:00:00 2001 From: pilkibun Date: Tue, 23 Jul 2019 12:02:02 -0500 Subject: [PATCH 33/42] CI From c932afd039b3ff6bcfbdbe0e0f788566c836b3f5 Mon Sep 17 00:00:00 2001 From: pilkibun Date: Tue, 23 Jul 2019 19:36:04 -0500 Subject: [PATCH 34/42] TST: make sure every public method on Grouper is accounted for --- pandas/core/groupby/base.py | 33 ++++++++++++++++++++++++++ pandas/tests/groupby/test_whitelist.py | 26 ++++++++++++++++++++ 2 files changed, 59 insertions(+) diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index 2c9734b3b6ed6..26392d7029703 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -158,6 +158,39 @@ def _gotitem(self, key, ndim, subset=None): ] ) +# these are methods which don't belong +# in either of the above groups +groupby_other_methods = frozenset( + [ + "agg", + "aggregate", + "apply", + "boxplot", + # corr and cove return a MultiIndex so the result is not like-index + # and therefore not a transformation + "corr", + "cov", + "describe", + "dtypes", + "expanding", + "filter", + "get_group", + "groups", + "head", + "hist", + "indices", + "ndim", + "ngroups", + "ohlc", + "pipe", + "plot", + "resample", + "rolling", + "tail", + "take", + "transform", + ] +) # Valid values of `name` for `groupby.transform(name)` # NOTE: do NOT edit this directly. New additions should be inserted # into the appropriate list above. diff --git a/pandas/tests/groupby/test_whitelist.py b/pandas/tests/groupby/test_whitelist.py index ee380c6108c38..d96b3efddec2a 100644 --- a/pandas/tests/groupby/test_whitelist.py +++ b/pandas/tests/groupby/test_whitelist.py @@ -9,6 +9,11 @@ import pytest from pandas import DataFrame, Index, MultiIndex, Series, date_range +from pandas.core.groupby.base import ( + groupby_other_methods, + reduction_kernels, + transformation_kernels, +) from pandas.util import testing as tm AGG_FUNCTIONS = [ @@ -376,3 +381,24 @@ def test_groupby_selection_with_methods(df): tm.assert_frame_equal( g.filter(lambda x: len(x) == 3), g_exp.filter(lambda x: len(x) == 3) ) + + +def test_all_methods_categorized(mframe): + grp = mframe.groupby(mframe.iloc[:, 0]) + names = {_ for _ in dir(grp) if not _.startswith("_")} - set(mframe.columns) + names -= reduction_kernels + names -= transformation_kernels + names -= groupby_other_methods + + # each name can appear in only one list + assert not (reduction_kernels & transformation_kernels) + assert not (reduction_kernels & groupby_other_methods) + assert not (transformation_kernels & groupby_other_methods) + + if names: + # each name must appear in one of the lists under + # pandas.core.groupby.base: + # - reduction_kernels + # - transformation_kernels + # - groupby_other_methods + raise RuntimeError("Uncatgeorized Groupby Methods: {names}".format(names=names)) From 5f107200fe160afc6b5c7e79d4524e830c0b86a1 Mon Sep 17 00:00:00 2001 From: pilkibun Date: Tue, 23 Jul 2019 19:37:22 -0500 Subject: [PATCH 35/42] comment --- pandas/core/groupby/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index 26392d7029703..bc00cd45e2333 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -105,7 +105,7 @@ def _gotitem(self, key, ndim, subset=None): cython_cast_blacklist = frozenset(["rank", "count", "size", "idxmin", "idxmax"]) # List of aggregation/reduction functions. -# These map each series/column to a single value +# These map each group to a single value reduction_kernels = frozenset( [ "all", From 4b186a4e69ea84019271327691e16347c2643f9b Mon Sep 17 00:00:00 2001 From: pilkibun Date: Tue, 23 Jul 2019 19:38:09 -0500 Subject: [PATCH 36/42] comment --- pandas/core/groupby/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index bc00cd45e2333..b564537c86575 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -158,8 +158,8 @@ def _gotitem(self, key, ndim, subset=None): ] ) -# these are methods which don't belong -# in either of the above groups +# these are all the public methods on Grouper which don't belong +# in either of the above lists groupby_other_methods = frozenset( [ "agg", From de0c11ce7d02e7383e10d37e62becae785f53126 Mon Sep 17 00:00:00 2001 From: pilkibun Date: Wed, 24 Jul 2019 01:24:05 -0500 Subject: [PATCH 37/42] comment --- pandas/core/groupby/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index b564537c86575..dcf8e1aa22516 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -166,8 +166,8 @@ def _gotitem(self, key, ndim, subset=None): "aggregate", "apply", "boxplot", - # corr and cove return a MultiIndex so the result is not like-index - # and therefore not a transformation + # corr and cov return ngroups*ncolumns rows, so they + # are neither a transformation nor a reduction "corr", "cov", "describe", From cba02d6c78e905667241f8315f14370f3ee741c2 Mon Sep 17 00:00:00 2001 From: pilkibun Date: Wed, 24 Jul 2019 11:10:01 -0500 Subject: [PATCH 38/42] fix review --- doc/source/whatsnew/v1.0.0.rst | 2 +- pandas/core/groupby/base.py | 2 +- pandas/tests/groupby/test_transform.py | 2 +- pandas/tests/groupby/test_whitelist.py | 47 ++++++++++++++++++++------ 4 files changed, 39 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index d57b86a619ac5..f6f81a059ce70 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -39,7 +39,7 @@ Backwards incompatible API changes .. _whatsnew_1000.api.other: -- :meth:`GroupBy.transform(name)` now raises on invalid operation names (:issue:`27489`). +- :class:`pandas.core.groupby.GroupBy.transform` now raises on invalid operation names (:issue:`27489`). - Other API changes diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index dcf8e1aa22516..c0d165f263cef 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -105,7 +105,7 @@ def _gotitem(self, key, ndim, subset=None): cython_cast_blacklist = frozenset(["rank", "count", "size", "idxmin", "idxmax"]) # List of aggregation/reduction functions. -# These map each group to a single value +# These map each group to a single numeric value reduction_kernels = frozenset( [ "all", diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index df63e0438eb3b..9c9730da6c279 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -1039,7 +1039,7 @@ def test_transform_agg_by_name(reduction_func, obj): result = g.transform(func, *args) tm.assert_index_equal(result.index, obj.index) - # check values replicated broadcasted across group + # verify that values were broadcasted across each group assert len(set(DataFrame(result).iloc[-3:, -1])) == 1 diff --git a/pandas/tests/groupby/test_whitelist.py b/pandas/tests/groupby/test_whitelist.py index d96b3efddec2a..c2304ae70fe21 100644 --- a/pandas/tests/groupby/test_whitelist.py +++ b/pandas/tests/groupby/test_whitelist.py @@ -386,19 +386,44 @@ def test_groupby_selection_with_methods(df): def test_all_methods_categorized(mframe): grp = mframe.groupby(mframe.iloc[:, 0]) names = {_ for _ in dir(grp) if not _.startswith("_")} - set(mframe.columns) - names -= reduction_kernels - names -= transformation_kernels - names -= groupby_other_methods + new_names = set(names) + new_names -= reduction_kernels + new_names -= transformation_kernels + new_names -= groupby_other_methods - # each name can appear in only one list assert not (reduction_kernels & transformation_kernels) assert not (reduction_kernels & groupby_other_methods) assert not (transformation_kernels & groupby_other_methods) - if names: - # each name must appear in one of the lists under - # pandas.core.groupby.base: - # - reduction_kernels - # - transformation_kernels - # - groupby_other_methods - raise RuntimeError("Uncatgeorized Groupby Methods: {names}".format(names=names)) + # new public method? + if new_names: + msg = """ +There are uncatgeorized methods defined on the Grouper class: +{names}. + +IWas a new method just added? + +Every public method On Grouper must appear in exactly one the +following three lists defined in pandas.core.groupby.base: +- `reduction_kernels` +- `transformation_kernels` +- `groupby_other_methods` +see the comments in pandas/core/groupby/base.py for guidance on +how to fix this test. + """ + raise AssertionError(msg.format(names=names)) + + # removed a public method? + all_categorized = reduction_kernels | transformation_kernels | groupby_other_methods + print(names) + print(all_categorized) + if not (names == all_categorized): + msg = """ +Some methods which are supposed to be on the Grouper class +are missing: +{names}. + +They're still defined in one of the lists that live in pandas/core/groupby/base.py. +If you removed a method, you should update them +""" + raise AssertionError(msg.format(names=all_categorized - names)) From b19a46f65bc9260d6be2b6ac569e7de861f81f10 Mon Sep 17 00:00:00 2001 From: pilkibun Date: Wed, 24 Jul 2019 11:24:24 -0500 Subject: [PATCH 39/42] message --- pandas/tests/groupby/test_whitelist.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_whitelist.py b/pandas/tests/groupby/test_whitelist.py index c2304ae70fe21..05d745ccc0e8e 100644 --- a/pandas/tests/groupby/test_whitelist.py +++ b/pandas/tests/groupby/test_whitelist.py @@ -401,7 +401,7 @@ def test_all_methods_categorized(mframe): There are uncatgeorized methods defined on the Grouper class: {names}. -IWas a new method just added? +Was a new method recently added? Every public method On Grouper must appear in exactly one the following three lists defined in pandas.core.groupby.base: From 3511480605208852badcb65f7a25b409e0ede643 Mon Sep 17 00:00:00 2001 From: pilkibun Date: Wed, 24 Jul 2019 11:28:13 -0500 Subject: [PATCH 40/42] Update test --- pandas/tests/groupby/test_transform.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 9c9730da6c279..a28d17a79a436 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -1037,7 +1037,11 @@ def test_transform_agg_by_name(reduction_func, obj): args = {"nth": [0], "quantile": [0.5]}.get(func, []) result = g.transform(func, *args) + + # this is the *definition* of a transformation tm.assert_index_equal(result.index, obj.index) + if hasattr(obj, "columns"): + tm.assert_index_equal(result.columns, obj.columns) # verify that values were broadcasted across each group assert len(set(DataFrame(result).iloc[-3:, -1])) == 1 From b5d393102ef7decafb94564484c01d25e07863ac Mon Sep 17 00:00:00 2001 From: pilkibun Date: Thu, 25 Jul 2019 09:05:21 -0500 Subject: [PATCH 41/42] changes --- pandas/core/groupby/base.py | 6 ++++-- pandas/core/groupby/groupby.py | 2 +- pandas/tests/groupby/test_transform.py | 7 ++++++- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index c0d165f263cef..679faa054cc6b 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -124,6 +124,8 @@ def _gotitem(self, key, ndim, subset=None): "nth", "nunique", "prod", + # as long as `quantile`'s signature accepts only + # a single quantile value, it's a reduction. "quantile", "sem", "size", @@ -135,8 +137,8 @@ def _gotitem(self, key, ndim, subset=None): ) # List of transformation functions. -# These map each group to a like-indexed result object -# like-indexed means *same* index and *same* columns. +# a transformation is a function that, for each group, +# produces a result that has the same shape as the group. transformation_kernels = frozenset( [ "backfill", diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 9aba9723e0546..3d4dbd3f8d887 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -261,7 +261,7 @@ class providing the base-class of operations. * f must return a value that either has the same shape as the input subframe or can be broadcast to the shape of the input subframe. - For example, f returns a scalar it will be broadcast to have the + For example, if `f` returns a scalar it will be broadcast to have the same shape as the input subframe. * if this is a DataFrame, f must support application column-by-column in the subframe. If f also supports application to the entire subframe, diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index a28d17a79a436..d3972e6ba9008 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -1023,7 +1023,12 @@ def test_transform_invalid_name_raises(): @pytest.mark.parametrize( "obj", - [DataFrame(dict(a=[0, 0, 0, 1, 1, 1], b=range(6))), Series([0, 0, 0, 1, 1, 1])], + [ + DataFrame( + dict(a=[0, 0, 0, 1, 1, 1], b=range(6)), index=["A", "B", "C", "D", "E", "F"] + ), + Series([0, 0, 0, 1, 1, 1], index=["A", "B", "C", "D", "E", "F"]), + ], ) def test_transform_agg_by_name(reduction_func, obj): func = reduction_func From e9343e1c1506a57a55a7561bc61850b36daea1d9 Mon Sep 17 00:00:00 2001 From: pilkibun Date: Thu, 25 Jul 2019 13:17:58 -0500 Subject: [PATCH 42/42] reference issue --- pandas/core/groupby/base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index 679faa054cc6b..fc3bb69afd0cb 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -126,6 +126,7 @@ def _gotitem(self, key, ndim, subset=None): "prod", # as long as `quantile`'s signature accepts only # a single quantile value, it's a reduction. + # GH#27526 might change that. "quantile", "sem", "size",