From fd53827377591377ab78cfe849f9c5a4ee729d56 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 17 Oct 2019 09:14:05 -0700 Subject: [PATCH 01/45] Added any_all test --- pandas/tests/groupby/test_groupby.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 6212a37472000..3a6b0f74c105a 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1944,3 +1944,12 @@ def test_shift_bfill_ffill_tz(tz_naive_fixture, op, expected): result = getattr(grouped, op)() expected = DataFrame(expected).assign(time=lambda x: x.time.dt.tz_localize(tz)) assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) +def test_bool_aggs_dup_column_labels(bool_agg_func): + # 21668 + df = pd.DataFrame([[True, True]], columns=['a', 'a']) + result = getattr(df.groupby([0]), bool_agg_func)() + expected = df + tm.assert_frame_equal(result, expected) From 6f60cd06ea3421e0b754c7e70e8c6ce998200b40 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 17 Oct 2019 09:23:17 -0700 Subject: [PATCH 02/45] Added complete test for output shape of elements --- pandas/tests/groupby/test_groupby.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 3a6b0f74c105a..1bd8a5fac41f4 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1950,6 +1950,21 @@ def test_shift_bfill_ffill_tz(tz_naive_fixture, op, expected): def test_bool_aggs_dup_column_labels(bool_agg_func): # 21668 df = pd.DataFrame([[True, True]], columns=['a', 'a']) - result = getattr(df.groupby([0]), bool_agg_func)() + grp_by = df.groupby([0]) + result = getattr(grp_by, bool_agg_func)() + expected = df tm.assert_frame_equal(result, expected) + + +def test_dup_labels_output_shape(reduction_func): + # TODO: see if these can be fixed as well + if reduction_func in {"size", "quantile", "nunique", "nth", "ngroup"}: + pytest.xfail("Inconsistent output shape") + + df = pd.DataFrame([[1, 1]], columns=["a", "a"]) + grp_by = df.groupby([0]) + result = getattr(grp_by, reduction_func)() + + assert result.shape == (1, 2) + tm.assert_index_equal(result.columns, pd.Index(["a", "a"])) From 0aa1813fce01775dd36dd8870e3e92663f1c87f2 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 17 Oct 2019 09:56:06 -0700 Subject: [PATCH 03/45] Fixed shape issues with bool aggs --- pandas/core/groupby/generic.py | 34 +++++++++------------------------- pandas/core/groupby/groupby.py | 16 ++++++++-------- 2 files changed, 17 insertions(+), 33 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 8e53972c95275..12a289fb6d06b 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -17,6 +17,7 @@ FrozenSet, Hashable, Iterable, + List, Sequence, Tuple, Type, @@ -340,15 +341,12 @@ def _aggregate_multiple_funcs(self, arg, _level): def _wrap_series_output(self, output, index, names=None): """ common agg/transform wrapping logic """ - output = output[self._selection_name] - if names is not None: return DataFrame(output, index=index, columns=names) else: - name = self._selection_name - if name is None: - name = self._selected_obj.name - return Series(output, index=index, name=name) + result = output[0] + result.index = index + return result def _wrap_aggregated_output(self, output, names=None): result = self._wrap_series_output( @@ -1116,21 +1114,6 @@ def _aggregate_item_by_item(self, func, *args, **kwargs): return DataFrame(result, columns=result_columns) - def _decide_output_index(self, output, labels): - if len(output) == len(labels): - output_keys = labels - else: - output_keys = sorted(output) - try: - output_keys.sort() - except TypeError: - pass - - if isinstance(labels, MultiIndex): - output_keys = MultiIndex.from_tuples(output_keys, names=labels.names) - - return output_keys - def _wrap_applied_output(self, keys, values, not_indexed_same=False): if len(keys) == 0: return DataFrame(index=keys) @@ -1595,19 +1578,20 @@ def _insert_inaxis_grouper_inplace(self, result): if in_axis: result.insert(0, name, lev) - def _wrap_aggregated_output(self, output, names=None): + def _wrap_aggregated_output(self, output: List[Series], names=None): agg_axis = 0 if self.axis == 1 else 1 agg_labels = self._obj_with_exclusions._get_axis(agg_axis) - output_keys = self._decide_output_index(output, agg_labels) + + from pandas.core.reshape.concat import concat + result = concat(output, axis=1) if not self.as_index: - result = DataFrame(output, columns=output_keys) self._insert_inaxis_grouper_inplace(result) result = result._consolidate() else: index = self.grouper.result_index - result = DataFrame(output, index=index, columns=output_keys) + result.index = index if self.axis == 1: result = result.T diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index fa65179469840..d0628c78d7e35 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -876,14 +876,14 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): raise AbstractMethodError(self) def _cython_agg_general(self, how, alt=None, numeric_only=True, min_count=-1): - output = {} + output = [] # type: List[Series] for name, obj in self._iterate_slices(): is_numeric = is_numeric_dtype(obj.dtype) if numeric_only and not is_numeric: continue result, names = self.grouper.aggregate(obj.values, how, min_count=min_count) - output[name] = self._try_cast(result, obj) + output.append(Series(self._try_cast(result, obj), name=name)) if len(output) == 0: raise DataError("No numeric types to aggregate") @@ -895,14 +895,14 @@ def _python_agg_general(self, func, *args, **kwargs): f = lambda x: func(x, *args, **kwargs) # iterate through "columns" ex exclusions to populate output dict - output = {} + output = [] # type: List[Series] for name, obj in self._iterate_slices(): try: result, counts = self.grouper.agg_series(obj, f) except TypeError: continue else: - output[name] = self._try_cast(result, obj, numeric_only=True) + output.append(Series(self._try_cast(result, obj, numeric_only=True), name=name)) if len(output) == 0: return self._python_apply_general(f) @@ -910,14 +910,14 @@ def _python_agg_general(self, func, *args, **kwargs): if self.grouper._filter_empty_groups: mask = counts.ravel() > 0 - for name, result in output.items(): + for index, result in enumerate(output): # since we are masking, make sure that we have a float object values = result if is_numeric_dtype(values.dtype): values = ensure_float(values) - output[name] = self._try_cast(values[mask], result) + output[index] = self._try_cast(values[mask], result) return self._wrap_aggregated_output(output) @@ -2239,7 +2239,7 @@ def _get_cythonized_result( ) labels, _, ngroups = grouper.group_info - output = collections.OrderedDict() + output = [] # type: List[Series] base_func = getattr(libgroupby, how) for name, obj in self._iterate_slices(): @@ -2278,7 +2278,7 @@ def _get_cythonized_result( if post_processing: result = post_processing(result, inferences) - output[name] = result + output.append(Series(result, name=name)) if aggregate: return self._wrap_aggregated_output(output) From 9756e746eca2225c800f95d505ebd8db3b96b54a Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 20 Oct 2019 21:08:21 -0700 Subject: [PATCH 04/45] Added test for transformation shape --- pandas/tests/groupby/conftest.py | 14 +++++++++++++- pandas/tests/groupby/test_groupby.py | 6 +++--- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py index 72e60c5099304..bac967b488b09 100644 --- a/pandas/tests/groupby/conftest.py +++ b/pandas/tests/groupby/conftest.py @@ -2,7 +2,7 @@ import pytest from pandas import DataFrame, MultiIndex -from pandas.core.groupby.base import reduction_kernels +from pandas.core.groupby.base import reduction_kernels, transformation_kernels from pandas.util import testing as tm @@ -110,3 +110,15 @@ def reduction_func(request): """yields the string names of all groupby reduction functions, one at a time. """ return request.param + + +@pytest.fixture(params=transformation_kernels) +def transformation_func(request): + """yields the string names of all groupby transformation functions.""" + return request.param + + +@pytest.fixture(params=set(reduction_kernels) | set(transformation_kernels)) +def groupby_func(request): + """yields both aggregation and transformation functions.""" + return request.param diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 8334a12cc4f83..2bfb5c9523707 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1967,14 +1967,14 @@ def test_bool_aggs_dup_column_labels(bool_agg_func): tm.assert_frame_equal(result, expected) -def test_dup_labels_output_shape(reduction_func): +def test_dup_labels_output_shape(groupby_func): # TODO: see if these can be fixed as well - if reduction_func in {"size", "quantile", "nunique", "nth", "ngroup"}: + if groupby_func in {"size", "quantile", "nunique", "nth", "ngroup"}: pytest.xfail("Inconsistent output shape") df = pd.DataFrame([[1, 1]], columns=["a", "a"]) grp_by = df.groupby([0]) - result = getattr(grp_by, reduction_func)() + result = getattr(grp_by, groupby_func)() assert result.shape == (1, 2) tm.assert_index_equal(result.columns, pd.Index(["a", "a"])) From b67596353c26f855cf01fb92cbe8ea5b3d7f2043 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 20 Oct 2019 21:18:54 -0700 Subject: [PATCH 05/45] Changed transform output --- pandas/core/groupby/generic.py | 8 ++++++-- pandas/core/groupby/groupby.py | 8 +++++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 0ab79eddbeb66..b8e319bf66d4a 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1608,8 +1608,12 @@ def _wrap_aggregated_output(self, output: List[Series], names=None): return self._reindex_output(result)._convert(datetime=True) - def _wrap_transformed_output(self, output, names=None): - return DataFrame(output, index=self.obj.index) + def _wrap_transformed_output(self, output: List[Series], names=None) -> DataFrame: + from pandas.core.reshape.concat import concat + df = concat(output, axis=1) + df.index = self.obj.index + + return df def _wrap_agged_blocks(self, items, blocks): if not self.as_index: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 243b5258f7ee0..3d791e0eec54a 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -846,7 +846,7 @@ def _transform_should_cast(self, func_nm): ) def _cython_transform(self, how, numeric_only=True, **kwargs): - output = collections.OrderedDict() + output = [] # type: List[Series] for name, obj in self._iterate_slices(): is_numeric = is_numeric_dtype(obj.dtype) if numeric_only and not is_numeric: @@ -856,10 +856,12 @@ def _cython_transform(self, how, numeric_only=True, **kwargs): result, names = self.grouper.transform(obj.values, how, **kwargs) except NotImplementedError: continue + + result = Series(result, name=name) if self._transform_should_cast(how): - output[name] = self._try_cast(result, obj) + output.append(self._try_cast(result, obj)) else: - output[name] = result + output.append(result) if len(output) == 0: raise DataError("No numeric types to aggregate") From 444d542c7fc05869a9142c9099a19b5b025ad41a Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 20 Oct 2019 21:27:51 -0700 Subject: [PATCH 06/45] Updated test with required args --- pandas/tests/groupby/test_groupby.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 2bfb5c9523707..57abf7f4f6e41 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1969,12 +1969,20 @@ def test_bool_aggs_dup_column_labels(bool_agg_func): def test_dup_labels_output_shape(groupby_func): # TODO: see if these can be fixed as well - if groupby_func in {"size", "quantile", "nunique", "nth", "ngroup"}: - pytest.xfail("Inconsistent output shape") df = pd.DataFrame([[1, 1]], columns=["a", "a"]) grp_by = df.groupby([0]) - result = getattr(grp_by, groupby_func)() + + # TODO: move this to a helper function + args = [] + if groupby_func in {"fillna", "nth"}: + args.append(0) + elif groupby_func == "corrwith": + args.append(pd.Series([1, 1])) + elif groupby_func == "tshift": + pytest.skip("Need to add DTI") + + result = getattr(grp_by, groupby_func)(*args) assert result.shape == (1, 2) tm.assert_index_equal(result.columns, pd.Index(["a", "a"])) From 98a99014a5a603841ee7889faf4e9ab7ca1c7e64 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 20 Oct 2019 21:43:12 -0700 Subject: [PATCH 07/45] Fixed cummin issue --- pandas/core/groupby/groupby.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 3d791e0eec54a..2956849d1372e 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -857,11 +857,10 @@ def _cython_transform(self, how, numeric_only=True, **kwargs): except NotImplementedError: continue - result = Series(result, name=name) if self._transform_should_cast(how): - output.append(self._try_cast(result, obj)) - else: - output.append(result) + result = self._try_cast(result, obj) + + output.append(Series(result, name=name)) if len(output) == 0: raise DataError("No numeric types to aggregate") From c8648b1c8b586db96060b88cfacca34afbee0c6b Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 20 Oct 2019 22:02:51 -0700 Subject: [PATCH 08/45] Fixed ohlc one-off handling --- pandas/core/groupby/groupby.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 2956849d1372e..b893a40a37ab2 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -884,6 +884,20 @@ def _cython_agg_general(self, how, alt=None, numeric_only=True, min_count=-1): continue result, names = self.grouper.aggregate(obj.values, how, min_count=min_count) + if how == "ohlc": + assert result.shape[1] == 4 + for index in range(4): + values = result[:, index] + output.append(Series(self._try_cast(values, obj))) + + # TODO: de-dup with DataFrame._wrap_aggregated_output + from pandas.core.reshape.concat import concat + df = concat(output, axis=1) + df.columns = ["open", "high", "low", "close"] + df.index = self.grouper.result_index + + return df + output.append(Series(self._try_cast(result, obj), name=name)) if len(output) == 0: From 1626de10a3ab495587c1f18eb381445dd776e81b Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 20 Oct 2019 22:12:16 -0700 Subject: [PATCH 09/45] Fixed output shape of nunique --- pandas/core/groupby/generic.py | 4 +++- pandas/tests/groupby/test_groupby.py | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index b8e319bf66d4a..e4d35704c08a0 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1733,9 +1733,11 @@ def groupby_series(obj, col=None): if isinstance(obj, Series): results = groupby_series(obj) else: + # TODO: this is duplicative of how GroupBy naturally works + # Try to consolidate with normal wrapping functions from pandas.core.reshape.concat import concat - results = [groupby_series(obj[col], col) for col in obj.columns] + results = [groupby_series(content, label) for label, content in obj.items()] results = concat(results, axis=1) results.columns.names = obj.columns.names diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 57abf7f4f6e41..62f66c0984655 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1968,7 +1968,8 @@ def test_bool_aggs_dup_column_labels(bool_agg_func): def test_dup_labels_output_shape(groupby_func): - # TODO: see if these can be fixed as well + if groupby_func in {"size", "ngroup", "cumcount"}: + pytest.skip("Not applicable") df = pd.DataFrame([[1, 1]], columns=["a", "a"]) grp_by = df.groupby([0]) From 2a6b8d712f5a2b83e900f6e73fed8d353b546f88 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 20 Oct 2019 22:14:12 -0700 Subject: [PATCH 10/45] Fixed tshift --- pandas/tests/groupby/test_groupby.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 62f66c0984655..eea4c9f9c061c 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1981,7 +1981,8 @@ def test_dup_labels_output_shape(groupby_func): elif groupby_func == "corrwith": args.append(pd.Series([1, 1])) elif groupby_func == "tshift": - pytest.skip("Need to add DTI") + df.index = [pd.Timestamp("today")] + args.extend([1, "D"]) result = getattr(grp_by, groupby_func)(*args) From 12d1ca0ebbc18df28599aa2d098a7cabfce8e5ef Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 20 Oct 2019 22:17:41 -0700 Subject: [PATCH 11/45] lint and black --- pandas/core/groupby/generic.py | 6 ++---- pandas/core/groupby/groupby.py | 6 ++++-- pandas/tests/groupby/test_groupby.py | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index e4d35704c08a0..698655f716233 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1589,11 +1589,8 @@ def _insert_inaxis_grouper_inplace(self, result): result.insert(0, name, lev) def _wrap_aggregated_output(self, output: List[Series], names=None): - agg_axis = 0 if self.axis == 1 else 1 - agg_labels = self._obj_with_exclusions._get_axis(agg_axis) - - from pandas.core.reshape.concat import concat + result = concat(output, axis=1) if not self.as_index: @@ -1610,6 +1607,7 @@ def _wrap_aggregated_output(self, output: List[Series], names=None): def _wrap_transformed_output(self, output: List[Series], names=None) -> DataFrame: from pandas.core.reshape.concat import concat + df = concat(output, axis=1) df.index = self.obj.index diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index b893a40a37ab2..99ba6bdd5e6f2 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -7,7 +7,6 @@ class providing the base-class of operations. expose these user-facing objects to provide specific functionailty. """ -import collections from contextlib import contextmanager import datetime from functools import partial, wraps @@ -892,6 +891,7 @@ def _cython_agg_general(self, how, alt=None, numeric_only=True, min_count=-1): # TODO: de-dup with DataFrame._wrap_aggregated_output from pandas.core.reshape.concat import concat + df = concat(output, axis=1) df.columns = ["open", "high", "low", "close"] df.index = self.grouper.result_index @@ -917,7 +917,9 @@ def _python_agg_general(self, func, *args, **kwargs): except TypeError: continue else: - output.append(Series(self._try_cast(result, obj, numeric_only=True), name=name)) + output.append( + Series(self._try_cast(result, obj, numeric_only=True), name=name) + ) if len(output) == 0: return self._python_apply_general(f) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index eea4c9f9c061c..f297c49a70322 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1959,7 +1959,7 @@ def test_groupby_only_none_group(): @pytest.mark.parametrize("bool_agg_func", ["any", "all"]) def test_bool_aggs_dup_column_labels(bool_agg_func): # 21668 - df = pd.DataFrame([[True, True]], columns=['a', 'a']) + df = pd.DataFrame([[True, True]], columns=["a", "a"]) grp_by = df.groupby([0]) result = getattr(grp_by, bool_agg_func)() @@ -1983,7 +1983,7 @@ def test_dup_labels_output_shape(groupby_func): elif groupby_func == "tshift": df.index = [pd.Timestamp("today")] args.extend([1, "D"]) - + result = getattr(grp_by, groupby_func)(*args) assert result.shape == (1, 2) From 5a3fcd7f7d10a449981c6bc86e47b49344faebda Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 20 Oct 2019 22:20:01 -0700 Subject: [PATCH 12/45] Added whatsnew --- doc/source/whatsnew/v1.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 48c1173a372a7..9a1eca3c6f332 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -377,6 +377,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.groupby` not offering selection by column name when ``axis=1`` (:issue:`27614`) - Bug in :meth:`DataFrameGroupby.agg` not able to use lambda function with named aggregation (:issue:`27519`) - Bug in :meth:`DataFrame.groupby` losing column name information when grouping by a categorical column (:issue:`28787`) +- Bug in :meth:`DataFrame.groupby` where ``any``, ``all``, ``nunique`` and transform functions would incorrectly handle duplicate column labels (:issue:`21668`) Reshaping ^^^^^^^^^ From dee597ac0244eca6d0b9d79002538199a61c6c1a Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 21 Oct 2019 07:56:12 -0700 Subject: [PATCH 13/45] Quantile special case hack --- pandas/core/groupby/groupby.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 99ba6bdd5e6f2..d56a39ac896a3 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -701,7 +701,8 @@ def apply(self, func, *args, **kwargs): # resolve functions to their callable functions prior, this # wouldn't be needed if args or kwargs: - if callable(func): + # TODO: Why do we need to special case quantile now? + if callable(func) or func == "quantile": @wraps(func) def f(g): From fdb36f6f38ba220bfae8080c13f980eb010d32cf Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 21 Oct 2019 21:02:25 -0700 Subject: [PATCH 14/45] Test fix for np dev --- pandas/tests/groupby/test_groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index f297c49a70322..5baebaa2a9018 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1979,7 +1979,7 @@ def test_dup_labels_output_shape(groupby_func): if groupby_func in {"fillna", "nth"}: args.append(0) elif groupby_func == "corrwith": - args.append(pd.Series([1, 1])) + args.append(pd.Series([1])) elif groupby_func == "tshift": df.index = [pd.Timestamp("today")] args.extend([1, "D"]) From 8975009bf32b10d8ef993b475cb2619371d762a5 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 21 Oct 2019 21:46:06 -0700 Subject: [PATCH 15/45] Used position as labels --- pandas/core/groupby/generic.py | 34 +++++++++-------- pandas/core/groupby/groupby.py | 70 +++++++++++++++++----------------- 2 files changed, 54 insertions(+), 50 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 698655f716233..19fc58e7f558d 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -14,6 +14,7 @@ from typing import ( Any, Callable, + Dict, FrozenSet, Hashable, Iterable, @@ -339,22 +340,25 @@ def _aggregate_multiple_funcs(self, arg, _level): return DataFrame(results, columns=columns) - def _wrap_series_output(self, output, index, names=None): + def _wrap_series_output(self, output: Dict[int, np.ndarray], index, names: List[Hashable]): """ common agg/transform wrapping logic """ - if names is not None: - return DataFrame(output, index=index, columns=names) + if len(names) > 1: + result = DataFrame(output, index=index) + result.columns = names + return result else: - result = output[0] + result = Series(output[0]) result.index = index + result.name = names[0] return result - def _wrap_aggregated_output(self, output, names=None): + def _wrap_aggregated_output(self, output: Dict[int, np.ndarray], names: List[Hashable]): result = self._wrap_series_output( output=output, index=self.grouper.result_index, names=names ) return self._reindex_output(result)._convert(datetime=True) - def _wrap_transformed_output(self, output, names=None): + def _wrap_transformed_output(self, output: Dict[int, np.ndarray], names: List[Hashable]): return self._wrap_series_output( output=output, index=self.obj.index, names=names ) @@ -1588,10 +1592,9 @@ def _insert_inaxis_grouper_inplace(self, result): if in_axis: result.insert(0, name, lev) - def _wrap_aggregated_output(self, output: List[Series], names=None): - from pandas.core.reshape.concat import concat - - result = concat(output, axis=1) + def _wrap_aggregated_output(self, output: Dict[int, np.ndarray], names: List[Hashable]) -> DataFrame: + result = DataFrame(output) + result.columns = names if not self.as_index: self._insert_inaxis_grouper_inplace(result) @@ -1605,13 +1608,12 @@ def _wrap_aggregated_output(self, output: List[Series], names=None): return self._reindex_output(result)._convert(datetime=True) - def _wrap_transformed_output(self, output: List[Series], names=None) -> DataFrame: - from pandas.core.reshape.concat import concat - - df = concat(output, axis=1) - df.index = self.obj.index + def _wrap_transformed_output(self, output: Dict[int, np.ndarray], names: List[Hashable]) -> DataFrame: + result = DataFrame(output) + result.columns = names + result.index = self.obj.index - return df + return result def _wrap_agged_blocks(self, items, blocks): if not self.as_index: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d56a39ac896a3..a77a907fa83e3 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -7,13 +7,14 @@ class providing the base-class of operations. expose these user-facing objects to provide specific functionailty. """ +import collections from contextlib import contextmanager import datetime from functools import partial, wraps import inspect import re import types -from typing import FrozenSet, Hashable, Iterable, List, Optional, Tuple, Type, Union +from typing import Dict, FrozenSet, Hashable, Iterable, List, Optional, Tuple, Type, Union import numpy as np @@ -846,60 +847,58 @@ def _transform_should_cast(self, func_nm): ) def _cython_transform(self, how, numeric_only=True, **kwargs): - output = [] # type: List[Series] - for name, obj in self._iterate_slices(): + output = collections.OrderedDict() + names = [] # type: List[Hashable] + for index, (name, obj) in enumerate(self._iterate_slices()): is_numeric = is_numeric_dtype(obj.dtype) if numeric_only and not is_numeric: continue try: - result, names = self.grouper.transform(obj.values, how, **kwargs) + result, _ = self.grouper.transform(obj.values, how, **kwargs) except NotImplementedError: continue if self._transform_should_cast(how): result = self._try_cast(result, obj) - output.append(Series(result, name=name)) + output[index] = result + names.append(name) if len(output) == 0: raise DataError("No numeric types to aggregate") return self._wrap_transformed_output(output, names) - def _wrap_aggregated_output(self, output, names=None): + def _wrap_aggregated_output(self, output: Dict[int, np.ndarray], names: List[Hashable]): raise AbstractMethodError(self) - def _wrap_transformed_output(self, output, names=None): + def _wrap_transformed_output(self, output: Dict[int, np.ndarray], names: List[Hashable]): raise AbstractMethodError(self) def _wrap_applied_output(self, keys, values, not_indexed_same=False): raise AbstractMethodError(self) def _cython_agg_general(self, how, alt=None, numeric_only=True, min_count=-1): - output = [] # type: List[Series] - for name, obj in self._iterate_slices(): + output = collections.OrderedDict() + names = [] # type: List[Hashable] + + for index, (name, obj) in enumerate(self._iterate_slices()): is_numeric = is_numeric_dtype(obj.dtype) if numeric_only and not is_numeric: continue - result, names = self.grouper.aggregate(obj.values, how, min_count=min_count) + result, _ = self.grouper.aggregate(obj.values, how, min_count=min_count) + + # TODO: ohlc needs a new home if how == "ohlc": assert result.shape[1] == 4 - for index in range(4): - values = result[:, index] - output.append(Series(self._try_cast(values, obj))) - - # TODO: de-dup with DataFrame._wrap_aggregated_output - from pandas.core.reshape.concat import concat - - df = concat(output, axis=1) - df.columns = ["open", "high", "low", "close"] - df.index = self.grouper.result_index - + result = self._try_cast(result, obj) + df = DataFrame(result, index=self.grouper.result_index, columns=["open", "high", "low", "close"]) return df - output.append(Series(self._try_cast(result, obj), name=name)) + output[index] = self._try_cast(result, obj) + names.append(name) if len(output) == 0: raise DataError("No numeric types to aggregate") @@ -911,16 +910,17 @@ def _python_agg_general(self, func, *args, **kwargs): f = lambda x: func(x, *args, **kwargs) # iterate through "columns" ex exclusions to populate output dict - output = [] # type: List[Series] - for name, obj in self._iterate_slices(): + output = collections.OrderedDict() + names = [] # type: List[Hashable] + + for index, (name, obj) in enumerate(self._iterate_slices()): try: result, counts = self.grouper.agg_series(obj, f) except TypeError: continue else: - output.append( - Series(self._try_cast(result, obj, numeric_only=True), name=name) - ) + output[index] = self._try_cast(result, obj, numeric_only=True) + names.append(name) if len(output) == 0: return self._python_apply_general(f) @@ -928,7 +928,7 @@ def _python_agg_general(self, func, *args, **kwargs): if self.grouper._filter_empty_groups: mask = counts.ravel() > 0 - for index, result in enumerate(output): + for index, result in output.items(): # since we are masking, make sure that we have a float object values = result @@ -937,7 +937,7 @@ def _python_agg_general(self, func, *args, **kwargs): output[index] = self._try_cast(values[mask], result) - return self._wrap_aggregated_output(output) + return self._wrap_aggregated_output(output, names) def _concat_objects(self, keys, values, not_indexed_same=False): from pandas.core.reshape.concat import concat @@ -2256,10 +2256,11 @@ def _get_cythonized_result( ) labels, _, ngroups = grouper.group_info - output = [] # type: List[Series] + output = collections.OrderedDict() + names = [] # List[Hashable] base_func = getattr(libgroupby, how) - for name, obj in self._iterate_slices(): + for index, (name, obj) in enumerate(self._iterate_slices()): values = obj._data._values if aggregate: @@ -2295,12 +2296,13 @@ def _get_cythonized_result( if post_processing: result = post_processing(result, inferences) - output.append(Series(result, name=name)) + output[index] = result + names.append(name) if aggregate: - return self._wrap_aggregated_output(output) + return self._wrap_aggregated_output(output, names) else: - return self._wrap_transformed_output(output) + return self._wrap_transformed_output(output, names) @Substitution(name="groupby") @Appender(_common_see_also) From 9adde1ba7185dff87a63ef689d557ff53b9cc930 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 21 Oct 2019 21:52:31 -0700 Subject: [PATCH 16/45] Code cleanup --- pandas/core/groupby/generic.py | 15 ++++++--------- pandas/core/groupby/groupby.py | 3 +-- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 19fc58e7f558d..f7d77c739f7b7 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -342,15 +342,12 @@ def _aggregate_multiple_funcs(self, arg, _level): def _wrap_series_output(self, output: Dict[int, np.ndarray], index, names: List[Hashable]): """ common agg/transform wrapping logic """ - if len(names) > 1: - result = DataFrame(output, index=index) - result.columns = names - return result - else: - result = Series(output[0]) - result.index = index - result.name = names[0] - return result + assert len(names) == 1 + result = Series(output[0]) + result.index = index + result.name = names[0] + + return result def _wrap_aggregated_output(self, output: Dict[int, np.ndarray], names: List[Hashable]): result = self._wrap_series_output( diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index a77a907fa83e3..b1b9c0eb0dd90 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -702,8 +702,7 @@ def apply(self, func, *args, **kwargs): # resolve functions to their callable functions prior, this # wouldn't be needed if args or kwargs: - # TODO: Why do we need to special case quantile now? - if callable(func) or func == "quantile": + if callable(func): @wraps(func) def f(g): From 63b35f9038a4b007d3872d44caa0aa81d7dcebef Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 21 Oct 2019 22:00:16 -0700 Subject: [PATCH 17/45] docstrings --- pandas/core/groupby/generic.py | 97 +++++++++++++++++++++++++++++++++- 1 file changed, 96 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index f7d77c739f7b7..ffe2ee81676e4 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -341,7 +341,27 @@ def _aggregate_multiple_funcs(self, arg, _level): return DataFrame(results, columns=columns) def _wrap_series_output(self, output: Dict[int, np.ndarray], index, names: List[Hashable]): - """ common agg/transform wrapping logic """ + """ + Wraps the output of a SeriesGroupBy operation into the expected result. + + Parameters + ---------- + output : dict[int, np.ndarray] + Dict with a sole key of 0 and a value of the result values. + index : pd.Index + Index to apply to the output. + names : List[Hashable] + List containing one label (the Series name). + + Returns + ------- + Series + + Notes + ----- + output and names should only contain one element. These are containers for generic + compatability with the DataFrameGroupBy class. + """ assert len(names) == 1 result = Series(output[0]) result.index = index @@ -350,12 +370,54 @@ def _wrap_series_output(self, output: Dict[int, np.ndarray], index, names: List[ return result def _wrap_aggregated_output(self, output: Dict[int, np.ndarray], names: List[Hashable]): + """ + Wraps the output of a SeriesGroupBy aggregation into the expected result. + + Parameters + ---------- + output : dict[int, np.ndarray] + Dict with a sole key of 0 and a value of the result values. + index : pd.Index + Index to apply to the output. + names : List[Hashable] + List containing one label (the Series name). + + Returns + ------- + Series + + Notes + ----- + output and names should only contain one element. These are containers for generic + compatability with the DataFrameGroupBy class. + """ result = self._wrap_series_output( output=output, index=self.grouper.result_index, names=names ) return self._reindex_output(result)._convert(datetime=True) def _wrap_transformed_output(self, output: Dict[int, np.ndarray], names: List[Hashable]): + """ + Wraps the output of a SeriesGroupBy aggregation into the expected result. + + Parameters + ---------- + output : dict[int, np.ndarray] + Dict with a sole key of 0 and a value of the result values. + index : pd.Index + Index to apply to the output. + names : List[Hashable] + List containing one label (the Series name). + + Returns + ------- + Series + + Notes + ----- + output and names should only contain one element. These are containers for generic + compatability with the DataFrameGroupBy class. + """ return self._wrap_series_output( output=output, index=self.obj.index, names=names ) @@ -1590,6 +1652,22 @@ def _insert_inaxis_grouper_inplace(self, result): result.insert(0, name, lev) def _wrap_aggregated_output(self, output: Dict[int, np.ndarray], names: List[Hashable]) -> DataFrame: + """ + Wraps the output of DataFrameGroupBy aggregations into the expected result. + + Parameters + ---------- + output : dict[int, np.ndarray] + Dict where the key represents the columnar-index and the values are + the actual results. + names : List[Hashable] + List containing the column names to apply. The position of each + item in the list corresponds with the key in output. + + Returns + ------- + DataFrame + """ result = DataFrame(output) result.columns = names @@ -1606,6 +1684,23 @@ def _wrap_aggregated_output(self, output: Dict[int, np.ndarray], names: List[Has return self._reindex_output(result)._convert(datetime=True) def _wrap_transformed_output(self, output: Dict[int, np.ndarray], names: List[Hashable]) -> DataFrame: + """ + Wraps the output of DataFrameGroupBy transformations into the expected result. + + Parameters + ---------- + output : dict[int, np.ndarray] + Dict where the key represents the columnar-index and the values are + the actual results. + names : List[Hashable] + List containing the column names to apply. The position of each + item in the list corresponds with the key in output. + + Returns + ------- + DataFrame + """ + result = DataFrame(output) result.columns = names result.index = self.obj.index From 9eb7c7397ead15272882fbfceee43583f2f01c9f Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 21 Oct 2019 22:06:50 -0700 Subject: [PATCH 18/45] style and more typing --- pandas/core/groupby/generic.py | 42 +++++++++++++++++++++------------- pandas/core/groupby/groupby.py | 26 +++++++++++++++++---- 2 files changed, 48 insertions(+), 20 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index ffe2ee81676e4..4fc6164198e1c 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -340,8 +340,10 @@ def _aggregate_multiple_funcs(self, arg, _level): return DataFrame(results, columns=columns) - def _wrap_series_output(self, output: Dict[int, np.ndarray], index, names: List[Hashable]): - """ + def _wrap_series_output( + self, output: Dict[int, np.ndarray], index: Index, names: List[Hashable] + ) -> Series: + """ Wraps the output of a SeriesGroupBy operation into the expected result. Parameters @@ -359,8 +361,8 @@ def _wrap_series_output(self, output: Dict[int, np.ndarray], index, names: List[ Notes ----- - output and names should only contain one element. These are containers for generic - compatability with the DataFrameGroupBy class. + output and names should only contain one element. These are containers + for generic compatability with the DataFrameGroupBy class. """ assert len(names) == 1 result = Series(output[0]) @@ -369,8 +371,10 @@ def _wrap_series_output(self, output: Dict[int, np.ndarray], index, names: List[ return result - def _wrap_aggregated_output(self, output: Dict[int, np.ndarray], names: List[Hashable]): - """ + def _wrap_aggregated_output( + self, output: Dict[int, np.ndarray], names: List[Hashable] + ) -> Series: + """ Wraps the output of a SeriesGroupBy aggregation into the expected result. Parameters @@ -388,16 +392,18 @@ def _wrap_aggregated_output(self, output: Dict[int, np.ndarray], names: List[Has Notes ----- - output and names should only contain one element. These are containers for generic - compatability with the DataFrameGroupBy class. + output and names should only contain one element. These are containers + for generic compatability with the DataFrameGroupBy class. """ result = self._wrap_series_output( output=output, index=self.grouper.result_index, names=names ) return self._reindex_output(result)._convert(datetime=True) - def _wrap_transformed_output(self, output: Dict[int, np.ndarray], names: List[Hashable]): - """ + def _wrap_transformed_output( + self, output: Dict[int, np.ndarray], names: List[Hashable] + ) -> Series: + """ Wraps the output of a SeriesGroupBy aggregation into the expected result. Parameters @@ -415,9 +421,9 @@ def _wrap_transformed_output(self, output: Dict[int, np.ndarray], names: List[Ha Notes ----- - output and names should only contain one element. These are containers for generic - compatability with the DataFrameGroupBy class. - """ + output and names should only contain one element. These are containers + for generic compatability with the DataFrameGroupBy class. + """ return self._wrap_series_output( output=output, index=self.obj.index, names=names ) @@ -1651,7 +1657,9 @@ def _insert_inaxis_grouper_inplace(self, result): if in_axis: result.insert(0, name, lev) - def _wrap_aggregated_output(self, output: Dict[int, np.ndarray], names: List[Hashable]) -> DataFrame: + def _wrap_aggregated_output( + self, output: Dict[int, np.ndarray], names: List[Hashable] + ) -> DataFrame: """ Wraps the output of DataFrameGroupBy aggregations into the expected result. @@ -1683,7 +1691,9 @@ def _wrap_aggregated_output(self, output: Dict[int, np.ndarray], names: List[Has return self._reindex_output(result)._convert(datetime=True) - def _wrap_transformed_output(self, output: Dict[int, np.ndarray], names: List[Hashable]) -> DataFrame: + def _wrap_transformed_output( + self, output: Dict[int, np.ndarray], names: List[Hashable] + ) -> DataFrame: """ Wraps the output of DataFrameGroupBy transformations into the expected result. @@ -1700,7 +1710,7 @@ def _wrap_transformed_output(self, output: Dict[int, np.ndarray], names: List[Ha ------- DataFrame """ - + result = DataFrame(output) result.columns = names result.index = self.obj.index diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index b1b9c0eb0dd90..a1270f856b573 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -14,7 +14,17 @@ class providing the base-class of operations. import inspect import re import types -from typing import Dict, FrozenSet, Hashable, Iterable, List, Optional, Tuple, Type, Union +from typing import ( + Dict, + FrozenSet, + Hashable, + Iterable, + List, + Optional, + Tuple, + Type, + Union, +) import numpy as np @@ -869,10 +879,14 @@ def _cython_transform(self, how, numeric_only=True, **kwargs): return self._wrap_transformed_output(output, names) - def _wrap_aggregated_output(self, output: Dict[int, np.ndarray], names: List[Hashable]): + def _wrap_aggregated_output( + self, output: Dict[int, np.ndarray], names: List[Hashable] + ): raise AbstractMethodError(self) - def _wrap_transformed_output(self, output: Dict[int, np.ndarray], names: List[Hashable]): + def _wrap_transformed_output( + self, output: Dict[int, np.ndarray], names: List[Hashable] + ): raise AbstractMethodError(self) def _wrap_applied_output(self, keys, values, not_indexed_same=False): @@ -893,7 +907,11 @@ def _cython_agg_general(self, how, alt=None, numeric_only=True, min_count=-1): if how == "ohlc": assert result.shape[1] == 4 result = self._try_cast(result, obj) - df = DataFrame(result, index=self.grouper.result_index, columns=["open", "high", "low", "close"]) + df = DataFrame( + result, + index=self.grouper.result_index, + columns=["open", "high", "low", "close"], + ) return df output[index] = self._try_cast(result, obj) From 0e49bdbbc90a1619f6326817587b18f757cb0405 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 21 Oct 2019 23:13:58 -0700 Subject: [PATCH 19/45] Fixed parallel test collection --- pandas/tests/groupby/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py index bac967b488b09..5ecbe3108b147 100644 --- a/pandas/tests/groupby/conftest.py +++ b/pandas/tests/groupby/conftest.py @@ -118,7 +118,7 @@ def transformation_func(request): return request.param -@pytest.fixture(params=set(reduction_kernels) | set(transformation_kernels)) +@pytest.fixture(params=sorted(reduction_kernels) + sorted(transformation_kernels)) def groupby_func(request): """yields both aggregation and transformation functions.""" return request.param From 2ad76322fe439fbe996cda47cbb35bdfea461a71 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 22 Oct 2019 08:53:30 -0700 Subject: [PATCH 20/45] numpy dev warning fix --- pandas/tests/groupby/test_groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 5baebaa2a9018..9bd6a67df01c2 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1979,7 +1979,7 @@ def test_dup_labels_output_shape(groupby_func): if groupby_func in {"fillna", "nth"}: args.append(0) elif groupby_func == "corrwith": - args.append(pd.Series([1])) + args.append(pd.Series([1, 0])) elif groupby_func == "tshift": df.index = [pd.Timestamp("today")] args.extend([1, "D"]) From 11fda39df785a741821e84e78c41730b4ecc695b Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 22 Oct 2019 09:45:35 -0700 Subject: [PATCH 21/45] More generic ohlc handling --- pandas/core/groupby/generic.py | 32 ++++++++++++++++++++----------- pandas/core/groupby/groupby.py | 35 +++++++++++++++++++--------------- pandas/core/groupby/ops.py | 13 +++++++++++-- 3 files changed, 52 insertions(+), 28 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 4fc6164198e1c..297e64a4ea2bc 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -23,6 +23,7 @@ Tuple, Type, Union, + cast, ) import warnings @@ -342,18 +343,20 @@ def _aggregate_multiple_funcs(self, arg, _level): def _wrap_series_output( self, output: Dict[int, np.ndarray], index: Index, names: List[Hashable] - ) -> Series: + ) -> Union[Series, DataFrame]: """ Wraps the output of a SeriesGroupBy operation into the expected result. Parameters ---------- output : dict[int, np.ndarray] - Dict with a sole key of 0 and a value of the result values. + Dict where the key represents the columnar-index and the values are + the actual results. index : pd.Index Index to apply to the output. names : List[Hashable] - List containing one label (the Series name). + List containing the column names to apply. The position of each + item in the list corresponds with the key in output. Returns ------- @@ -361,19 +364,21 @@ def _wrap_series_output( Notes ----- - output and names should only contain one element. These are containers - for generic compatability with the DataFrameGroupBy class. + In the vast majority of cases output and names will only contain one + element. The exception is operations that expand dimensions, like ohlc. """ - assert len(names) == 1 - result = Series(output[0]) - result.index = index - result.name = names[0] + assert len(output) == len(names) + if len(output) > 1: + result = DataFrame(output, index=index) # type: Union[Series, DataFrame] + result.columns = names + else: + result = Series(output[0], index=index, name=names[0]) return result def _wrap_aggregated_output( self, output: Dict[int, np.ndarray], names: List[Hashable] - ) -> Series: + ) -> Union[Series, DataFrame]: """ Wraps the output of a SeriesGroupBy aggregation into the expected result. @@ -424,10 +429,15 @@ def _wrap_transformed_output( output and names should only contain one element. These are containers for generic compatability with the DataFrameGroupBy class. """ - return self._wrap_series_output( + result = self._wrap_series_output( output=output, index=self.obj.index, names=names ) + # No transformations increase the ndim of the result + # Unfortunately need to cast for mypy to know this + result = cast(Series, result) + return result + def _wrap_applied_output(self, keys, values, not_indexed_same=False): if len(keys) == 0: # GH #6265 diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index a1270f856b573..bad82af0d228a 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -896,26 +896,31 @@ def _cython_agg_general(self, how, alt=None, numeric_only=True, min_count=-1): output = collections.OrderedDict() names = [] # type: List[Hashable] - for index, (name, obj) in enumerate(self._iterate_slices()): + # Ideally we would be able to enumerate self._iterate_slices and use + # the index from enumeration as the key of output, but ohlc in particular + # returns a (n x 4) array. Output requires 1D ndarrays as values, so we + # need to slice that up into 1D arrays + index = 0 + for name, obj in self._iterate_slices(): is_numeric = is_numeric_dtype(obj.dtype) if numeric_only and not is_numeric: continue - result, _ = self.grouper.aggregate(obj.values, how, min_count=min_count) - - # TODO: ohlc needs a new home - if how == "ohlc": - assert result.shape[1] == 4 - result = self._try_cast(result, obj) - df = DataFrame( - result, - index=self.grouper.result_index, - columns=["open", "high", "low", "close"], - ) - return df + result, agg_names = self.grouper.aggregate( + obj.values, how, min_count=min_count + ) - output[index] = self._try_cast(result, obj) - names.append(name) + if agg_names: + assert len(agg_names) == result.shape[1] + else: + assert result.ndim == 1 + result = result.reshape(-1, 1) + agg_names = [name] + + for result_column, result_name in zip(result.T, agg_names): + output[index] = self._try_cast(result_column, obj) + names.append(result_name) + index += 1 if len(output) == 0: raise DataError("No numeric types to aggregate") diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index fcc646dec89d9..30f36cc4eef74 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -7,6 +7,7 @@ """ import collections +from typing import List, Optional, Tuple import numpy as np @@ -453,7 +454,15 @@ def wrapper(*args, **kwargs): return func - def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs): + def _cython_operation( + self, kind, values, how, axis, min_count=-1, **kwargs + ) -> Tuple[np.ndarray, Optional[List[str]]]: + """ + Returns the values of a cython operation as a Tuple of [data, names]. + + Names is only useful when dealing with 2D results, like ohlc + (see self._name_functions). + """ assert kind in ["transform", "aggregate"] orig_values = values @@ -589,7 +598,7 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs): if how in self._name_functions: # TODO - names = self._name_functions[how]() + names = self._name_functions[how]() # type: Optional[List[str]] else: names = None From 7c4bad90f5ea58703b78f2a674cd961010d7153d Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 22 Oct 2019 09:50:21 -0700 Subject: [PATCH 22/45] Converted Dict -> Mapping --- pandas/core/groupby/generic.py | 16 ++++++++-------- pandas/core/groupby/groupby.py | 6 +++--- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index d5ddd000f581b..ff6333688356a 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -14,11 +14,11 @@ from typing import ( Any, Callable, - Dict, FrozenSet, Hashable, Iterable, List, + Mapping, Sequence, Tuple, Type, @@ -342,7 +342,7 @@ def _aggregate_multiple_funcs(self, arg, _level): return DataFrame(results, columns=columns) def _wrap_series_output( - self, output: Dict[int, np.ndarray], index: Index, names: List[Hashable] + self, output: Mapping[int, np.ndarray], index: Index, names: List[Hashable] ) -> Union[Series, DataFrame]: """ Wraps the output of a SeriesGroupBy operation into the expected result. @@ -377,7 +377,7 @@ def _wrap_series_output( return result def _wrap_aggregated_output( - self, output: Dict[int, np.ndarray], names: List[Hashable] + self, output: Mapping[int, np.ndarray], names: List[Hashable] ) -> Union[Series, DataFrame]: """ Wraps the output of a SeriesGroupBy aggregation into the expected result. @@ -406,7 +406,7 @@ def _wrap_aggregated_output( return self._reindex_output(result)._convert(datetime=True) def _wrap_transformed_output( - self, output: Dict[int, np.ndarray], names: List[Hashable] + self, output: Mapping[int, np.ndarray], names: List[Hashable] ) -> Series: """ Wraps the output of a SeriesGroupBy aggregation into the expected result. @@ -1668,7 +1668,7 @@ def _insert_inaxis_grouper_inplace(self, result): result.insert(0, name, lev) def _wrap_aggregated_output( - self, output: Dict[int, np.ndarray], names: List[Hashable] + self, output: Mapping[int, np.ndarray], names: List[Hashable] ) -> DataFrame: """ Wraps the output of DataFrameGroupBy aggregations into the expected result. @@ -1702,7 +1702,7 @@ def _wrap_aggregated_output( return self._reindex_output(result)._convert(datetime=True) def _wrap_transformed_output( - self, output: Dict[int, np.ndarray], names: List[Hashable] + self, output: Mapping[int, np.ndarray], names: List[Hashable] ) -> DataFrame: """ Wraps the output of DataFrameGroupBy transformations into the expected result. @@ -1887,7 +1887,7 @@ def _normalize_keyword_aggregation(kwargs): """ Normalize user-provided "named aggregation" kwargs. - Transforms from the new ``Dict[str, NamedAgg]`` style kwargs + Transforms from the new ``Mapping[str, NamedAgg]`` style kwargs to the old OrderedDict[str, List[scalar]]]. Parameters @@ -1911,7 +1911,7 @@ def _normalize_keyword_aggregation(kwargs): if not PY36: kwargs = OrderedDict(sorted(kwargs.items())) - # Normalize the aggregation functions as Dict[column, List[func]], + # Normalize the aggregation functions as Mapping[column, List[func]], # process normally, then fixup the names. # TODO(Py35): When we drop python 3.5, change this to # defaultdict(list) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index da78db239f686..99cf6d1785dc8 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -15,11 +15,11 @@ class providing the base-class of operations. import re import types from typing import ( - Dict, FrozenSet, Hashable, Iterable, List, + Mapping, Optional, Tuple, Type, @@ -880,12 +880,12 @@ def _cython_transform(self, how, numeric_only=True, **kwargs): return self._wrap_transformed_output(output, names) def _wrap_aggregated_output( - self, output: Dict[int, np.ndarray], names: List[Hashable] + self, output: Mapping[int, np.ndarray], names: List[Hashable] ): raise AbstractMethodError(self) def _wrap_transformed_output( - self, output: Dict[int, np.ndarray], names: List[Hashable] + self, output: Mapping[int, np.ndarray], names: List[Hashable] ): raise AbstractMethodError(self) From b9dca9620a573bb3e8f3f83eabdc60888d0e4473 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 22 Oct 2019 11:22:39 -0700 Subject: [PATCH 23/45] doc fixups --- pandas/core/groupby/generic.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index ff6333688356a..9ce42f413206c 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -360,7 +360,7 @@ def _wrap_series_output( Returns ------- - Series + Series or DataFrame Notes ----- @@ -385,20 +385,22 @@ def _wrap_aggregated_output( Parameters ---------- output : dict[int, np.ndarray] - Dict with a sole key of 0 and a value of the result values. + Dict where the key represents the columnar-index and the values are + the actual results. index : pd.Index Index to apply to the output. names : List[Hashable] - List containing one label (the Series name). + List containing the column names to apply. The position of each + item in the list corresponds with the key in output. Returns ------- - Series + Series or DataFrame Notes ----- - output and names should only contain one element. These are containers - for generic compatability with the DataFrameGroupBy class. + In the vast majority of cases output and names will only contain one + element. The exception is operations that expand dimensions, like ohlc. """ result = self._wrap_series_output( output=output, index=self.grouper.result_index, names=names From a878e6752b25a0f3edc107696278b5071ed4a7ef Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 22 Oct 2019 12:47:56 -0700 Subject: [PATCH 24/45] numpy dev compat --- pandas/tests/groupby/test_groupby.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 9bd6a67df01c2..7930c3d52b203 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1974,12 +1974,11 @@ def test_dup_labels_output_shape(groupby_func): df = pd.DataFrame([[1, 1]], columns=["a", "a"]) grp_by = df.groupby([0]) - # TODO: move this to a helper function args = [] if groupby_func in {"fillna", "nth"}: args.append(0) elif groupby_func == "corrwith": - args.append(pd.Series([1, 0])) + args.append(df) elif groupby_func == "tshift": df.index = [pd.Timestamp("today")] args.extend([1, "D"]) From a66d37f4f00a432567a181b77c5ce6d89318c878 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 23 Oct 2019 11:39:05 -0700 Subject: [PATCH 25/45] Renamed index -> idx --- pandas/core/groupby/groupby.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index b79b9b80509c8..8377079c9c0c5 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -858,7 +858,7 @@ def _transform_should_cast(self, func_nm): def _cython_transform(self, how, numeric_only=True, **kwargs): output = collections.OrderedDict() names = [] # type: List[Hashable] - for index, (name, obj) in enumerate(self._iterate_slices()): + for idx, (name, obj) in enumerate(self._iterate_slices()): is_numeric = is_numeric_dtype(obj.dtype) if numeric_only and not is_numeric: continue @@ -871,7 +871,7 @@ def _cython_transform(self, how, numeric_only=True, **kwargs): if self._transform_should_cast(how): result = self._try_cast(result, obj) - output[index] = result + output[idx] = result names.append(name) if len(output) == 0: @@ -900,7 +900,7 @@ def _cython_agg_general(self, how, alt=None, numeric_only=True, min_count=-1): # the index from enumeration as the key of output, but ohlc in particular # returns a (n x 4) array. Output requires 1D ndarrays as values, so we # need to slice that up into 1D arrays - index = 0 + idx = 0 for name, obj in self._iterate_slices(): is_numeric = is_numeric_dtype(obj.dtype) if numeric_only and not is_numeric: @@ -918,9 +918,9 @@ def _cython_agg_general(self, how, alt=None, numeric_only=True, min_count=-1): agg_names = [name] for result_column, result_name in zip(result.T, agg_names): - output[index] = self._try_cast(result_column, obj) + output[idx] = self._try_cast(result_column, obj) names.append(result_name) - index += 1 + idx += 1 if len(output) == 0: raise DataError("No numeric types to aggregate") @@ -935,13 +935,13 @@ def _python_agg_general(self, func, *args, **kwargs): output = collections.OrderedDict() names = [] # type: List[Hashable] - for index, (name, obj) in enumerate(self._iterate_slices()): + for idx, (name, obj) in enumerate(self._iterate_slices()): try: result, counts = self.grouper.agg_series(obj, f) except TypeError: continue else: - output[index] = self._try_cast(result, obj, numeric_only=True) + output[idx] = self._try_cast(result, obj, numeric_only=True) names.append(name) if len(output) == 0: @@ -950,14 +950,14 @@ def _python_agg_general(self, func, *args, **kwargs): if self.grouper._filter_empty_groups: mask = counts.ravel() > 0 - for index, result in output.items(): + for idx, result in output.items(): # since we are masking, make sure that we have a float object values = result if is_numeric_dtype(values.dtype): values = ensure_float(values) - output[index] = self._try_cast(values[mask], result) + output[idx] = self._try_cast(values[mask], result) return self._wrap_aggregated_output(output, names) @@ -2280,7 +2280,7 @@ def _get_cythonized_result( names = [] # List[Hashable] base_func = getattr(libgroupby, how) - for index, (name, obj) in enumerate(self._iterate_slices()): + for idx, (name, obj) in enumerate(self._iterate_slices()): values = obj._data._values if aggregate: @@ -2316,7 +2316,7 @@ def _get_cythonized_result( if post_processing: result = post_processing(result, inferences) - output[index] = result + output[idx] = result names.append(name) if aggregate: From 6d50448187b681a20ad09be177c4624b8350e44d Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 23 Oct 2019 11:40:41 -0700 Subject: [PATCH 26/45] Removed rogue TODO --- pandas/core/groupby/ops.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index ad9821d779dd0..9cafd2aea1b07 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -605,7 +605,6 @@ def _cython_operation( result = result[:, 0] if how in self._name_functions: - # TODO names = self._name_functions[how]() # type: Optional[List[str]] else: names = None From 4dd8f5b1844bd71326ac720d8c70ef111e0fef4f Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 23 Oct 2019 15:51:18 -0700 Subject: [PATCH 27/45] Added failing test for multiindex --- pandas/tests/groupby/test_groupby.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 7930c3d52b203..bd98a9331e4fd 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1967,11 +1967,14 @@ def test_bool_aggs_dup_column_labels(bool_agg_func): tm.assert_frame_equal(result, expected) -def test_dup_labels_output_shape(groupby_func): +@pytest.mark.parametrize( + "idx", [pd.Index(["a", "a"]), pd.MultiIndex.from_tuples((("a", "a"), ("a", "a")))] +) +def test_dup_labels_output_shape(groupby_func, idx): if groupby_func in {"size", "ngroup", "cumcount"}: pytest.skip("Not applicable") - df = pd.DataFrame([[1, 1]], columns=["a", "a"]) + df = pd.DataFrame([[1, 1]], columns=idx) grp_by = df.groupby([0]) args = [] @@ -1986,4 +1989,4 @@ def test_dup_labels_output_shape(groupby_func): result = getattr(grp_by, groupby_func)(*args) assert result.shape == (1, 2) - tm.assert_index_equal(result.columns, pd.Index(["a", "a"])) + tm.assert_index_equal(result.columns, idx) From 9d39862e2b31ee99832e6f3c9651a40c99ca06e9 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 23 Oct 2019 16:09:51 -0700 Subject: [PATCH 28/45] MultiIndex support --- pandas/core/groupby/generic.py | 61 +++++++++++++++------------------- pandas/core/groupby/groupby.py | 17 +++++----- 2 files changed, 35 insertions(+), 43 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 7353a930d7e83..5a02573f26b79 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -17,7 +17,6 @@ FrozenSet, Hashable, Iterable, - List, Mapping, Optional, Sequence, @@ -343,7 +342,7 @@ def _aggregate_multiple_funcs(self, arg, _level): return DataFrame(results, columns=columns) def _wrap_series_output( - self, output: Mapping[int, np.ndarray], index: Index, names: List[Hashable] + self, output: Mapping[int, np.ndarray], index: Index, columns: Index ) -> Union[Series, DataFrame]: """ Wraps the output of a SeriesGroupBy operation into the expected result. @@ -352,12 +351,11 @@ def _wrap_series_output( ---------- output : dict[int, np.ndarray] Dict where the key represents the columnar-index and the values are - the actual results. + the actual results. Must be ordered from 0..n index : pd.Index Index to apply to the output. - names : List[Hashable] - List containing the column names to apply. The position of each - item in the list corresponds with the key in output. + columns : pd.Index + Columns to apply to the output. Returns ------- @@ -365,20 +363,20 @@ def _wrap_series_output( Notes ----- - In the vast majority of cases output and names will only contain one + In the vast majority of cases output and columns will only contain one element. The exception is operations that expand dimensions, like ohlc. """ - assert len(output) == len(names) + assert len(output) == len(columns) if len(output) > 1: result = DataFrame(output, index=index) # type: Union[Series, DataFrame] - result.columns = names + result.columns = columns else: - result = Series(output[0], index=index, name=names[0]) + result = Series(output[0], index=index, name=columns[0]) return result def _wrap_aggregated_output( - self, output: Mapping[int, np.ndarray], names: List[Hashable] + self, output: Mapping[int, np.ndarray], columns: Index ) -> Union[Series, DataFrame]: """ Wraps the output of a SeriesGroupBy aggregation into the expected result. @@ -388,11 +386,8 @@ def _wrap_aggregated_output( output : dict[int, np.ndarray] Dict where the key represents the columnar-index and the values are the actual results. - index : pd.Index - Index to apply to the output. - names : List[Hashable] - List containing the column names to apply. The position of each - item in the list corresponds with the key in output. + columns : pd.Index + Columns to apply to the output. Returns ------- @@ -400,16 +395,16 @@ def _wrap_aggregated_output( Notes ----- - In the vast majority of cases output and names will only contain one + In the vast majority of cases output and columns will only contain one element. The exception is operations that expand dimensions, like ohlc. """ result = self._wrap_series_output( - output=output, index=self.grouper.result_index, names=names + output=output, index=self.grouper.result_index, columns=columns ) return self._reindex_output(result)._convert(datetime=True) def _wrap_transformed_output( - self, output: Mapping[int, np.ndarray], names: List[Hashable] + self, output: Mapping[int, np.ndarray], columns: Index ) -> Series: """ Wraps the output of a SeriesGroupBy aggregation into the expected result. @@ -418,10 +413,8 @@ def _wrap_transformed_output( ---------- output : dict[int, np.ndarray] Dict with a sole key of 0 and a value of the result values. - index : pd.Index - Index to apply to the output. - names : List[Hashable] - List containing one label (the Series name). + columns : pd.Index + Columns to apply to the output. Returns ------- @@ -429,11 +422,11 @@ def _wrap_transformed_output( Notes ----- - output and names should only contain one element. These are containers + output and columns should only contain one element. These are containers for generic compatability with the DataFrameGroupBy class. """ result = self._wrap_series_output( - output=output, index=self.obj.index, names=names + output=output, index=self.obj.index, columns=columns ) # No transformations increase the ndim of the result @@ -1668,7 +1661,7 @@ def _insert_inaxis_grouper_inplace(self, result): result.insert(0, name, lev) def _wrap_aggregated_output( - self, output: Mapping[int, np.ndarray], names: List[Hashable] + self, output: Mapping[int, np.ndarray], columns: Index ) -> DataFrame: """ Wraps the output of DataFrameGroupBy aggregations into the expected result. @@ -1678,16 +1671,15 @@ def _wrap_aggregated_output( output : dict[int, np.ndarray] Dict where the key represents the columnar-index and the values are the actual results. - names : List[Hashable] - List containing the column names to apply. The position of each - item in the list corresponds with the key in output. + columns : pd.Index + Column names to apply Returns ------- DataFrame """ result = DataFrame(output) - result.columns = names + result.columns = columns if not self.as_index: self._insert_inaxis_grouper_inplace(result) @@ -1702,7 +1694,7 @@ def _wrap_aggregated_output( return self._reindex_output(result)._convert(datetime=True) def _wrap_transformed_output( - self, output: Mapping[int, np.ndarray], names: List[Hashable] + self, output: Mapping[int, np.ndarray], columns: Index ) -> DataFrame: """ Wraps the output of DataFrameGroupBy transformations into the expected result. @@ -1712,9 +1704,8 @@ def _wrap_transformed_output( output : dict[int, np.ndarray] Dict where the key represents the columnar-index and the values are the actual results. - names : List[Hashable] - List containing the column names to apply. The position of each - item in the list corresponds with the key in output. + columns : pd.Index + Column names to apply. Returns ------- @@ -1722,7 +1713,7 @@ def _wrap_transformed_output( """ result = DataFrame(output) - result.columns = names + result.columns = columns result.index = self.obj.index return result diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 8377079c9c0c5..19a07ac5f05d4 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -877,15 +877,14 @@ def _cython_transform(self, how, numeric_only=True, **kwargs): if len(output) == 0: raise DataError("No numeric types to aggregate") - return self._wrap_transformed_output(output, names) + columns = Index(names) + return self._wrap_transformed_output(output, columns) - def _wrap_aggregated_output( - self, output: Mapping[int, np.ndarray], names: List[Hashable] - ): + def _wrap_aggregated_output(self, output: Mapping[int, np.ndarray], columns: Index): raise AbstractMethodError(self) def _wrap_transformed_output( - self, output: Mapping[int, np.ndarray], names: List[Hashable] + self, output: Mapping[int, np.ndarray], columns: Index ): raise AbstractMethodError(self) @@ -959,7 +958,8 @@ def _python_agg_general(self, func, *args, **kwargs): output[idx] = self._try_cast(values[mask], result) - return self._wrap_aggregated_output(output, names) + columns = Index(names) + return self._wrap_aggregated_output(output, columns) def _concat_objects(self, keys, values, not_indexed_same=False): from pandas.core.reshape.concat import concat @@ -2319,10 +2319,11 @@ def _get_cythonized_result( output[idx] = result names.append(name) + columns = Index(names) if aggregate: - return self._wrap_aggregated_output(output, names) + return self._wrap_aggregated_output(output, columns) else: - return self._wrap_transformed_output(output, names) + return self._wrap_transformed_output(output, columns) @Substitution(name="groupby") @Appender(_common_see_also) From d6b197b2740b7a546556bee5015b6760c5d9fc6f Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 23 Oct 2019 16:18:41 -0700 Subject: [PATCH 29/45] More exact typing --- pandas/core/groupby/generic.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 5a02573f26b79..3efe86fa8e0a7 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -53,7 +53,7 @@ ) from pandas.core.dtypes.missing import _isna_ndarraylike, isna, notna -from pandas._typing import FrameOrSeries +from pandas._typing import AnyArrayLike, FrameOrSeries import pandas.core.algorithms as algorithms from pandas.core.base import DataError, SpecificationError import pandas.core.common as com @@ -342,7 +342,7 @@ def _aggregate_multiple_funcs(self, arg, _level): return DataFrame(results, columns=columns) def _wrap_series_output( - self, output: Mapping[int, np.ndarray], index: Index, columns: Index + self, output: Mapping[int, AnyArrayLike], index: Index, columns: Index ) -> Union[Series, DataFrame]: """ Wraps the output of a SeriesGroupBy operation into the expected result. @@ -376,7 +376,7 @@ def _wrap_series_output( return result def _wrap_aggregated_output( - self, output: Mapping[int, np.ndarray], columns: Index + self, output: Mapping[int, AnyArrayLike], columns: Index ) -> Union[Series, DataFrame]: """ Wraps the output of a SeriesGroupBy aggregation into the expected result. @@ -404,7 +404,7 @@ def _wrap_aggregated_output( return self._reindex_output(result)._convert(datetime=True) def _wrap_transformed_output( - self, output: Mapping[int, np.ndarray], columns: Index + self, output: Mapping[int, AnyArrayLike], columns: Index ) -> Series: """ Wraps the output of a SeriesGroupBy aggregation into the expected result. @@ -1661,7 +1661,7 @@ def _insert_inaxis_grouper_inplace(self, result): result.insert(0, name, lev) def _wrap_aggregated_output( - self, output: Mapping[int, np.ndarray], columns: Index + self, output: Mapping[int, AnyArrayLike], columns: Index ) -> DataFrame: """ Wraps the output of DataFrameGroupBy aggregations into the expected result. @@ -1694,7 +1694,7 @@ def _wrap_aggregated_output( return self._reindex_output(result)._convert(datetime=True) def _wrap_transformed_output( - self, output: Mapping[int, np.ndarray], columns: Index + self, output: Mapping[int, AnyArrayLike], columns: Index ) -> DataFrame: """ Wraps the output of DataFrameGroupBy transformations into the expected result. From 16e9512160998586ece561078f055dee1739ac93 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 29 Oct 2019 09:29:51 -0700 Subject: [PATCH 30/45] jbrockmendel feedback --- pandas/core/groupby/generic.py | 4 ++-- pandas/core/groupby/groupby.py | 1 + pandas/core/groupby/ops.py | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 46f11107c2245..db66630f0028d 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -354,7 +354,7 @@ def _wrap_series_output( Parameters ---------- - output : dict[int, np.ndarray] + output : Mapping[int, AnyArrayLike] Dict where the key represents the columnar-index and the values are the actual results. Must be ordered from 0..n index : pd.Index @@ -388,7 +388,7 @@ def _wrap_aggregated_output( Parameters ---------- - output : dict[int, np.ndarray] + output : Mapping[int, AnyArrayLike] Dict where the key represents the columnar-index and the values are the actual results. columns : pd.Index diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 8b95977aa6039..65d3bcc2e75cc 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -913,6 +913,7 @@ def _cython_agg_general(self, how, alt=None, numeric_only=True, min_count=-1): ) if agg_names: + # e.g. ohlc assert len(agg_names) == result.shape[1] else: assert result.ndim == 1 diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 30a6fe223687b..6f48e28651817 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -36,6 +36,7 @@ ) from pandas.core.dtypes.missing import _maybe_fill, isna +from pandas._typing import Axis import pandas.core.algorithms as algorithms from pandas.core.base import SelectionMixin import pandas.core.common as com @@ -470,7 +471,7 @@ def wrapper(*args, **kwargs): return func def _cython_operation( - self, kind, values, how, axis, min_count=-1, **kwargs + self, kind: str, values, how: str, axis: Axis, min_count: int = -1, **kwargs ) -> Tuple[np.ndarray, Optional[List[str]]]: """ Returns the values of a cython operation as a Tuple of [data, names]. From d297684fe35119f4d37e5eb0abc1751ee67a63e6 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 29 Oct 2019 09:32:53 -0700 Subject: [PATCH 31/45] Removed failing type --- pandas/core/groupby/ops.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 6f48e28651817..984bb45f6543a 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -36,7 +36,6 @@ ) from pandas.core.dtypes.missing import _maybe_fill, isna -from pandas._typing import Axis import pandas.core.algorithms as algorithms from pandas.core.base import SelectionMixin import pandas.core.common as com @@ -471,7 +470,7 @@ def wrapper(*args, **kwargs): return func def _cython_operation( - self, kind: str, values, how: str, axis: Axis, min_count: int = -1, **kwargs + self, kind: str, values, how: str, axis, min_count: int = -1, **kwargs ) -> Tuple[np.ndarray, Optional[List[str]]]: """ Returns the values of a cython operation as a Tuple of [data, names]. From 391d106c6d6e742f18103f6b449fd26b277fa97f Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sat, 2 Nov 2019 15:03:36 -0700 Subject: [PATCH 32/45] Aligned annotations and comments --- pandas/core/groupby/generic.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index d95154529c74c..74fb66f57dddf 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -52,7 +52,7 @@ ) from pandas.core.dtypes.missing import _isna_ndarraylike, isna, notna -from pandas._typing import AnyArrayLike, FrameOrSeries +from pandas._typing import FrameOrSeries import pandas.core.algorithms as algorithms from pandas.core.base import DataError, SpecificationError import pandas.core.common as com @@ -346,14 +346,17 @@ def _aggregate_multiple_funcs(self, arg, _level): return DataFrame(results, columns=columns) def _wrap_series_output( - self, output: Mapping[int, AnyArrayLike], index: Index, columns: Index + self, + output: Mapping[int, Union[Series, np.ndarray]], + index: Index, + columns: Index, ) -> Union[Series, DataFrame]: """ Wraps the output of a SeriesGroupBy operation into the expected result. Parameters ---------- - output : Mapping[int, AnyArrayLike] + output : Mapping[int, Union[Series, np.ndarray]] Dict where the key represents the columnar-index and the values are the actual results. Must be ordered from 0..n index : pd.Index @@ -380,14 +383,14 @@ def _wrap_series_output( return result def _wrap_aggregated_output( - self, output: Mapping[int, AnyArrayLike], columns: Index + self, output: Mapping[int, Union[Series, np.ndarray]], columns: Index ) -> Union[Series, DataFrame]: """ Wraps the output of a SeriesGroupBy aggregation into the expected result. Parameters ---------- - output : Mapping[int, AnyArrayLike] + output : Mapping[int, Union[Series, np.ndarray]] Dict where the key represents the columnar-index and the values are the actual results. columns : pd.Index @@ -408,14 +411,14 @@ def _wrap_aggregated_output( return self._reindex_output(result)._convert(datetime=True) def _wrap_transformed_output( - self, output: Mapping[int, AnyArrayLike], columns: Index + self, output: Mapping[int, Union[Series, np.ndarray]], columns: Index ) -> Series: """ Wraps the output of a SeriesGroupBy aggregation into the expected result. Parameters ---------- - output : dict[int, np.ndarray] + output : dict[int, Union[Series, np.ndarray]] Dict with a sole key of 0 and a value of the result values. columns : pd.Index Columns to apply to the output. @@ -1669,14 +1672,14 @@ def _insert_inaxis_grouper_inplace(self, result): result.insert(0, name, lev) def _wrap_aggregated_output( - self, output: Mapping[int, AnyArrayLike], columns: Index + self, output: Mapping[int, Union[Series, np.ndarray]], columns: Index ) -> DataFrame: """ Wraps the output of DataFrameGroupBy aggregations into the expected result. Parameters ---------- - output : dict[int, np.ndarray] + output : dict[int, Union[Series, np.ndarray]] Dict where the key represents the columnar-index and the values are the actual results. columns : pd.Index @@ -1702,14 +1705,14 @@ def _wrap_aggregated_output( return self._reindex_output(result)._convert(datetime=True) def _wrap_transformed_output( - self, output: Mapping[int, AnyArrayLike], columns: Index + self, output: Mapping[int, Union[Series, np.ndarray]], columns: Index ) -> DataFrame: """ Wraps the output of DataFrameGroupBy transformations into the expected result. Parameters ---------- - output : dict[int, np.ndarray] + output : dict[int, Union[Series, np.ndarray]] Dict where the key represents the columnar-index and the values are the actual results. columns : pd.Index From 3a78051a3c343844b1baaa217d4ec281934ec543 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 6 Nov 2019 08:21:06 -0800 Subject: [PATCH 33/45] mypy fix --- pandas/core/groupby/groupby.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index a60a1a94ee7fd..3f3c57ed0673f 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -15,6 +15,7 @@ class providing the base-class of operations. import re import types from typing import ( + Dict, FrozenSet, Hashable, Iterable, @@ -2271,7 +2272,7 @@ def _get_cythonized_result( grouper = self.grouper labels, _, ngroups = grouper.group_info - output = collections.OrderedDict() + output = collections.OrderedDict() # type: Dict[int, np.ndarray] names = [] # List[Hashable] base_func = getattr(libgroupby, how) From 936591e2e7975d498d38e47b4e4d6fed1578209b Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 7 Nov 2019 08:51:09 -0800 Subject: [PATCH 34/45] asserts and mypy fixes --- pandas/core/groupby/generic.py | 9 +++++++++ pandas/core/groupby/groupby.py | 11 ++++++----- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 1cbf04885d7ba..4e2bd5b4f954e 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -372,6 +372,8 @@ def _wrap_series_output( element. The exception is operations that expand dimensions, like ohlc. """ assert len(output) == len(columns) + assert list(output.keys()) == sorted(output.keys()) + if len(output) > 1: result = DataFrame(output, index=index) # type: Union[Series, DataFrame] result.columns = columns @@ -403,6 +405,8 @@ def _wrap_aggregated_output( In the vast majority of cases output and columns will only contain one element. The exception is operations that expand dimensions, like ohlc. """ + assert list(output.keys()) == sorted(output.keys()) + result = self._wrap_series_output( output=output, index=self.grouper.result_index, columns=columns ) @@ -430,6 +434,8 @@ def _wrap_transformed_output( output and columns should only contain one element. These are containers for generic compatability with the DataFrameGroupBy class. """ + assert list(output.keys()) == sorted(output.keys()) + result = self._wrap_series_output( output=output, index=self.obj.index, columns=columns ) @@ -1690,6 +1696,8 @@ def _wrap_aggregated_output( ------- DataFrame """ + assert list(output.keys()) == sorted(output.keys()) + result = DataFrame(output) result.columns = columns @@ -1723,6 +1731,7 @@ def _wrap_transformed_output( ------- DataFrame """ + assert list(output.keys()) == sorted(output.keys()) result = DataFrame(output) result.columns = columns diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 20be6d784fbb1..53858d13aa405 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -856,8 +856,8 @@ def _transform_should_cast(self, func_nm: str) -> bool: ) def _cython_transform(self, how: str, numeric_only: bool = True, **kwargs): - output = collections.OrderedDict() - names = [] # type: List[Hashable] + output = collections.OrderedDict() # type: Dict[int, np.ndarray] + names = [] # type: List[Optional[Hashable]] for idx, (name, obj) in enumerate(self._iterate_slices()): is_numeric = is_numeric_dtype(obj.dtype) if numeric_only and not is_numeric: @@ -894,7 +894,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same: bool = False): def _cython_agg_general( self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 ): - output = collections.OrderedDict() + output = collections.OrderedDict() # type: Dict[int, np.ndarray] names = [] # type: List[Hashable] # Ideally we would be able to enumerate self._iterate_slices and use @@ -927,14 +927,15 @@ def _cython_agg_general( if len(output) == 0: raise DataError("No numeric types to aggregate") - return self._wrap_aggregated_output(output, names) + columns = Index(names) + return self._wrap_aggregated_output(output, columns) def _python_agg_general(self, func, *args, **kwargs): func = self._is_builtin_func(func) f = lambda x: func(x, *args, **kwargs) # iterate through "columns" ex exclusions to populate output dict - output = collections.OrderedDict() + output = collections.OrderedDict() # type: Dict[int, np.ndarray] names = [] # type: List[Hashable] for idx, (name, obj) in enumerate(self._iterate_slices()): From 6cc160747c2c11f73a58db68404358485cbd41b5 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 12 Nov 2019 22:48:11 -0800 Subject: [PATCH 35/45] Fix issue in merge resolution --- pandas/core/groupby/groupby.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 97ada9ad2ff6a..e406de8be3b31 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -944,6 +944,7 @@ def _python_agg_general(self, func, *args, **kwargs): result, counts = self.grouper.agg_series(obj, f) assert result is not None output[name] = self._try_cast(result, obj, numeric_only=True) + names.append(name) if len(output) == 0: return self._python_apply_general(f) From ce97eff03bcbfa4a9c53807d3486e66ca1895095 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sat, 16 Nov 2019 17:56:11 -0800 Subject: [PATCH 36/45] Correct merge with 29629 --- pandas/core/groupby/groupby.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 574e78c4da4d4..86ea24cf92b97 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -845,7 +845,8 @@ def _transform_should_cast(self, func_nm: str) -> bool: def _cython_transform(self, how: str, numeric_only: bool = True, **kwargs): output = collections.OrderedDict() # type: Dict[int, np.ndarray] names = [] # type: List[Optional[Hashable]] - for idx, (name, obj) in enumerate(self._iterate_slices()): + for idx, obj in enumerate(self._iterate_slices()): + name = obj.name is_numeric = is_numeric_dtype(obj.dtype) if numeric_only and not is_numeric: continue @@ -889,7 +890,8 @@ def _cython_agg_general( # returns a (n x 4) array. Output requires 1D ndarrays as values, so we # need to slice that up into 1D arrays idx = 0 - for name, obj in self._iterate_slices(): + for obj in self._iterate_slices(): + name = obj.name is_numeric = is_numeric_dtype(obj.dtype) if numeric_only and not is_numeric: continue @@ -925,7 +927,8 @@ def _python_agg_general(self, func, *args, **kwargs): output = collections.OrderedDict() # type: Dict[int, np.ndarray] names = [] # type: List[Hashable] - for idx, (name, obj) in enumerate(self._iterate_slices()): + for idx, obj in enumerate(self._iterate_slices()): + name = obj.name if self.grouper.ngroups == 0: # agg_series below assumes ngroups > 0 continue @@ -2279,7 +2282,8 @@ def _get_cythonized_result( names = [] # List[Hashable] base_func = getattr(libgroupby, how) - for idx, (name, obj) in enumerate(self._iterate_slices()): + for idx, obj in enumerate(self._iterate_slices()): + name = obj.name values = obj._data._values if aggregate: From d1a92b4d27a2ef0cde83ee7b3632ab13a3e1b30e Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sat, 16 Nov 2019 19:25:54 -0800 Subject: [PATCH 37/45] Fixed issue with dict assignment --- pandas/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 86ea24cf92b97..a122a54137e5d 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -947,7 +947,7 @@ def _python_agg_general(self, func, *args, **kwargs): result, counts = self.grouper.agg_series(obj, f) assert result is not None - output[name] = self._try_cast(result, obj, numeric_only=True) + output[idx] = self._try_cast(result, obj, numeric_only=True) names.append(name) if len(output) == 0: From 23eb8038c97d24876dc56feac5c7c6a260adfef2 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sat, 16 Nov 2019 19:33:10 -0800 Subject: [PATCH 38/45] modernized annotations --- pandas/core/groupby/generic.py | 3 ++- pandas/core/groupby/groupby.py | 16 ++++++++-------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 677936648da8a..92378ca91e961 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -363,8 +363,9 @@ def _wrap_series_output( assert len(output) == len(columns) assert list(output.keys()) == sorted(output.keys()) + result: Union[Series, DataFrame] if len(output) > 1: - result = DataFrame(output, index=index) # type: Union[Series, DataFrame] + result = DataFrame(output, index=index) result.columns = columns else: result = Series(output[0], index=index, name=columns[0]) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index a122a54137e5d..8b2cf48792484 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -843,8 +843,8 @@ def _transform_should_cast(self, func_nm: str) -> bool: ) def _cython_transform(self, how: str, numeric_only: bool = True, **kwargs): - output = collections.OrderedDict() # type: Dict[int, np.ndarray] - names = [] # type: List[Optional[Hashable]] + output: Dict[int, np.ndarray] = collections.OrderedDict() + names: List[Optional[Hashable]] = [] for idx, obj in enumerate(self._iterate_slices()): name = obj.name is_numeric = is_numeric_dtype(obj.dtype) @@ -882,8 +882,8 @@ def _wrap_applied_output(self, keys, values, not_indexed_same: bool = False): def _cython_agg_general( self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 ): - output = collections.OrderedDict() # type: Dict[int, np.ndarray] - names = [] # type: List[Hashable] + output: Dict[int, np.ndarray] = collections.OrderedDict() + names: List[Hashable] = [] # Ideally we would be able to enumerate self._iterate_slices and use # the index from enumeration as the key of output, but ohlc in particular @@ -924,8 +924,8 @@ def _python_agg_general(self, func, *args, **kwargs): f = lambda x: func(x, *args, **kwargs) # iterate through "columns" ex exclusions to populate output dict - output = collections.OrderedDict() # type: Dict[int, np.ndarray] - names = [] # type: List[Hashable] + output: Dict[int, np.ndarray] = collections.OrderedDict() + names: List[Hashable] = [] for idx, obj in enumerate(self._iterate_slices()): name = obj.name @@ -2278,8 +2278,8 @@ def _get_cythonized_result( grouper = self.grouper labels, _, ngroups = grouper.group_info - output = collections.OrderedDict() # type: Dict[int, np.ndarray] - names = [] # List[Hashable] + output: Dict[int, np.ndarray] = collections.OrderedDict() + names: List[Optional[Hashable]] = [] base_func = getattr(libgroupby, how) for idx, obj in enumerate(self._iterate_slices()): From b07335b28360a9096b4846b1fd02c75e2a893b46 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 17 Nov 2019 16:46:04 -0800 Subject: [PATCH 39/45] Changed to assert, removed collections --- pandas/core/groupby/generic.py | 3 +-- pandas/core/groupby/groupby.py | 14 ++++++++------ 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 92378ca91e961..4901f0daf0ef5 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -431,8 +431,7 @@ def _wrap_transformed_output( ) # No transformations increase the ndim of the result - # Unfortunately need to cast for mypy to know this - result = cast(Series, result) + assert isinstance(result, Series) return result def _wrap_applied_output(self, keys, values, not_indexed_same=False): diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 09b4ed366f257..bd3cb2fc3ee66 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -7,7 +7,6 @@ class providing the base-class of operations. expose these user-facing objects to provide specific functionailty. """ -import collections from contextlib import contextmanager import datetime from functools import partial, wraps @@ -830,7 +829,7 @@ def _transform_should_cast(self, func_nm: str) -> bool: ) def _cython_transform(self, how: str, numeric_only: bool = True, **kwargs): - output: Dict[int, np.ndarray] = collections.OrderedDict() + output: Dict[int, np.ndarray] = {} names: List[Optional[Hashable]] = [] for idx, obj in enumerate(self._iterate_slices()): name = obj.name @@ -869,7 +868,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same: bool = False): def _cython_agg_general( self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 ): - output: Dict[int, np.ndarray] = collections.OrderedDict() + output: Dict[int, np.ndarray] = {} names: List[Hashable] = [] # Ideally we would be able to enumerate self._iterate_slices and use @@ -892,7 +891,10 @@ def _cython_agg_general( assert len(agg_names) == result.shape[1] else: assert result.ndim == 1 - result = result.reshape(-1, 1) + + breakpoint() + if isinstance(result, np.ndarray): + result = result.reshape(-1, 1) agg_names = [name] for result_column, result_name in zip(result.T, agg_names): @@ -911,7 +913,7 @@ def _python_agg_general(self, func, *args, **kwargs): f = lambda x: func(x, *args, **kwargs) # iterate through "columns" ex exclusions to populate output dict - output: Dict[int, np.ndarray] = collections.OrderedDict() + output: Dict[int, np.ndarray] = {} names: List[Hashable] = [] for idx, obj in enumerate(self._iterate_slices()): @@ -2265,7 +2267,7 @@ def _get_cythonized_result( grouper = self.grouper labels, _, ngroups = grouper.group_info - output: Dict[int, np.ndarray] = collections.OrderedDict() + output: Dict[int, np.ndarray] = {} names: List[Optional[Hashable]] = [] base_func = getattr(libgroupby, how) From fb711853eeb879c6038bd88fd009ca0596094d14 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 17 Nov 2019 16:58:21 -0800 Subject: [PATCH 40/45] Removed breakpoint and whatsnew space --- doc/source/whatsnew/v1.0.0.rst | 2 +- pandas/core/groupby/groupby.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index ac08f360652ab..4118aa1705fbb 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -438,7 +438,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupby.agg` not able to use lambda function with named aggregation (:issue:`27519`) - Bug in :meth:`DataFrame.groupby` losing column name information when grouping by a categorical column (:issue:`28787`) - Bug in :meth:`DataFrameGroupBy.rolling().quantile()` ignoring ``interpolation`` keyword argument (:issue:`28779`) -- Bug in :meth:`DataFrame.groupby` where ``any``, ``all``, ``nunique`` and transform functions would incorrectly handle duplicate column labels (:issue:`21668`) +- Bug in :meth:`DataFrame.groupby` where ``any``, ``all``, ``nunique`` and transform functions would incorrectly handle duplicate column labels (:issue:`21668`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index bd3cb2fc3ee66..51916c7c5deb9 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -892,7 +892,6 @@ def _cython_agg_general( else: assert result.ndim == 1 - breakpoint() if isinstance(result, np.ndarray): result = result.reshape(-1, 1) agg_names = [name] From d7b84a2f594a0f7865968f2d09aa9030b7b41e4c Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 17 Nov 2019 17:11:01 -0800 Subject: [PATCH 41/45] Fixed issue with DTA --- pandas/core/groupby/groupby.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 51916c7c5deb9..22af5444c80b8 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -889,16 +889,14 @@ def _cython_agg_general( if agg_names: # e.g. ohlc assert len(agg_names) == result.shape[1] + for result_column, result_name in zip(result.T, agg_names): + output[idx] = self._try_cast(result_column, obj) + names.append(result_name) + idx += 1 else: assert result.ndim == 1 - - if isinstance(result, np.ndarray): - result = result.reshape(-1, 1) - agg_names = [name] - - for result_column, result_name in zip(result.T, agg_names): - output[idx] = self._try_cast(result_column, obj) - names.append(result_name) + output[idx] = self._try_cast(result, obj) + names.append(name) idx += 1 if len(output) == 0: From 79344222527a16f049f3ad7e88930bea3581b80e Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 17 Nov 2019 17:26:14 -0800 Subject: [PATCH 42/45] typing fixup --- pandas/core/groupby/groupby.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 22af5444c80b8..4df396f65f67f 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -51,7 +51,7 @@ class providing the base-class of operations. from pandas.core import nanops import pandas.core.algorithms as algorithms -from pandas.core.arrays import Categorical, try_cast_to_ea +from pandas.core.arrays import Categorical, DatetimeArray, try_cast_to_ea from pandas.core.base import DataError, PandasObject, SelectionMixin import pandas.core.common as com from pandas.core.frame import DataFrame @@ -868,8 +868,8 @@ def _wrap_applied_output(self, keys, values, not_indexed_same: bool = False): def _cython_agg_general( self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 ): - output: Dict[int, np.ndarray] = {} - names: List[Hashable] = [] + output: Dict[int, Union[np.ndarray, DatetimeArray]] = {} + names: List[Optional[Hashable]] = [] # Ideally we would be able to enumerate self._iterate_slices and use # the index from enumeration as the key of output, but ohlc in particular From acf22d3f79063d6701fa656c24012b3b0872a23f Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 18 Nov 2019 18:39:48 -0800 Subject: [PATCH 43/45] packed key into namedtuple --- pandas/core/groupby/base.py | 4 ++ pandas/core/groupby/generic.py | 76 +++++++++++----------------------- pandas/core/groupby/groupby.py | 52 ++++++++++------------- 3 files changed, 50 insertions(+), 82 deletions(-) diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index fed387cbeade4..407cd8342d486 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -3,8 +3,12 @@ hold the whitelist of methods that are exposed on the SeriesGroupBy and the DataFrameGroupBy objects. """ +import collections + from pandas.core.dtypes.common import is_list_like, is_scalar +OutputKey = collections.namedtuple("OutputKey", ["label", "position"]) + class GroupByMixin: """ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index d58541f90ff8f..e23adcbfc56ef 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -333,10 +333,7 @@ def _aggregate_multiple_funcs(self, arg, _level): return DataFrame(results, columns=columns) def _wrap_series_output( - self, - output: Mapping[int, Union[Series, np.ndarray]], - index: Index, - columns: Index, + self, output: Mapping[int, Union[Series, np.ndarray]], index: Index, ) -> Union[Series, DataFrame]: """ Wraps the output of a SeriesGroupBy operation into the expected result. @@ -344,12 +341,9 @@ def _wrap_series_output( Parameters ---------- output : Mapping[int, Union[Series, np.ndarray]] - Dict where the key represents the columnar-index and the values are - the actual results. Must be ordered from 0..n + Data to wrap. index : pd.Index Index to apply to the output. - columns : pd.Index - Columns to apply to the output. Returns ------- @@ -360,31 +354,28 @@ def _wrap_series_output( In the vast majority of cases output and columns will only contain one element. The exception is operations that expand dimensions, like ohlc. """ - assert len(output) == len(columns) - assert list(output.keys()) == sorted(output.keys()) + indexed_output = {key.position: val for key, val in output.items()} + columns = Index(key.label for key in output) result: Union[Series, DataFrame] if len(output) > 1: - result = DataFrame(output, index=index) + result = DataFrame(indexed_output, index=index) result.columns = columns else: - result = Series(output[0], index=index, name=columns[0]) + result = Series(indexed_output[0], index=index, name=columns[0]) return result def _wrap_aggregated_output( - self, output: Mapping[int, Union[Series, np.ndarray]], columns: Index + self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] ) -> Union[Series, DataFrame]: """ Wraps the output of a SeriesGroupBy aggregation into the expected result. Parameters ---------- - output : Mapping[int, Union[Series, np.ndarray]] - Dict where the key represents the columnar-index and the values are - the actual results. - columns : pd.Index - Columns to apply to the output. + output : Mapping[base.OutputKey, Union[Series, np.ndarray]] + Data to wrap. Returns ------- @@ -392,43 +383,30 @@ def _wrap_aggregated_output( Notes ----- - In the vast majority of cases output and columns will only contain one - element. The exception is operations that expand dimensions, like ohlc. + In the vast majority of cases output will only contain one element. + The exception is operations that expand dimensions, like ohlc. """ - assert list(output.keys()) == sorted(output.keys()) - result = self._wrap_series_output( - output=output, index=self.grouper.result_index, columns=columns + output=output, index=self.grouper.result_index ) return self._reindex_output(result)._convert(datetime=True) def _wrap_transformed_output( - self, output: Mapping[int, Union[Series, np.ndarray]], columns: Index + self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] ) -> Series: """ Wraps the output of a SeriesGroupBy aggregation into the expected result. Parameters ---------- - output : dict[int, Union[Series, np.ndarray]] + output : dict[base.OutputKey, Union[Series, np.ndarray]] Dict with a sole key of 0 and a value of the result values. - columns : pd.Index - Columns to apply to the output. Returns ------- Series - - Notes - ----- - output and columns should only contain one element. These are containers - for generic compatability with the DataFrameGroupBy class. """ - assert list(output.keys()) == sorted(output.keys()) - - result = self._wrap_series_output( - output=output, index=self.obj.index, columns=columns - ) + result = self._wrap_series_output(output=output, index=self.obj.index) # No transformations increase the ndim of the result assert isinstance(result, Series) @@ -1651,7 +1629,7 @@ def _insert_inaxis_grouper_inplace(self, result): result.insert(0, name, lev) def _wrap_aggregated_output( - self, output: Mapping[int, Union[Series, np.ndarray]], columns: Index + self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] ) -> DataFrame: """ Wraps the output of DataFrameGroupBy aggregations into the expected result. @@ -1659,18 +1637,16 @@ def _wrap_aggregated_output( Parameters ---------- output : dict[int, Union[Series, np.ndarray]] - Dict where the key represents the columnar-index and the values are - the actual results. - columns : pd.Index - Column names to apply + Data to wrap. Returns ------- DataFrame """ - assert list(output.keys()) == sorted(output.keys()) + indexed_output = {key.position: val for key, val in output.items()} + columns = Index(key.label for key in output) - result = DataFrame(output) + result = DataFrame(indexed_output) result.columns = columns if not self.as_index: @@ -1686,7 +1662,7 @@ def _wrap_aggregated_output( return self._reindex_output(result)._convert(datetime=True) def _wrap_transformed_output( - self, output: Mapping[int, Union[Series, np.ndarray]], columns: Index + self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] ) -> DataFrame: """ Wraps the output of DataFrameGroupBy transformations into the expected result. @@ -1694,18 +1670,16 @@ def _wrap_transformed_output( Parameters ---------- output : dict[int, Union[Series, np.ndarray]] - Dict where the key represents the columnar-index and the values are - the actual results. - columns : pd.Index - Column names to apply. + Data to wrap. Returns ------- DataFrame """ - assert list(output.keys()) == sorted(output.keys()) + indexed_output = {key.position: val for key, val in output.items()} + columns = Index(key.label for key in output) - result = DataFrame(output) + result = DataFrame(indexed_output) result.columns = columns result.index = self.obj.index diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 655728f430043..f1cd21418409c 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -16,7 +16,6 @@ class providing the base-class of operations. from typing import ( Dict, FrozenSet, - Hashable, Iterable, List, Mapping, @@ -829,8 +828,7 @@ def _transform_should_cast(self, func_nm: str) -> bool: ) def _cython_transform(self, how: str, numeric_only: bool = True, **kwargs): - output: Dict[int, np.ndarray] = {} - names: List[Optional[Hashable]] = [] + output: Dict[base.OutputKey, np.ndarray] = {} for idx, obj in enumerate(self._iterate_slices()): name = obj.name is_numeric = is_numeric_dtype(obj.dtype) @@ -845,14 +843,13 @@ def _cython_transform(self, how: str, numeric_only: bool = True, **kwargs): if self._transform_should_cast(how): result = self._try_cast(result, obj) - output[idx] = result - names.append(name) + key = base.OutputKey(label=name, position=idx) + output[key] = result if len(output) == 0: raise DataError("No numeric types to aggregate") - columns = Index(names) - return self._wrap_transformed_output(output, columns) + return self._wrap_transformed_output(output) def _wrap_aggregated_output(self, output: Mapping[int, np.ndarray], columns: Index): raise AbstractMethodError(self) @@ -868,9 +865,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same: bool = False): def _cython_agg_general( self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 ): - output: Dict[int, Union[np.ndarray, DatetimeArray]] = {} - names: List[Optional[Hashable]] = [] - + output: Dict[base.OutputKey, Union[np.ndarray, DatetimeArray]] = {} # Ideally we would be able to enumerate self._iterate_slices and use # the index from enumeration as the key of output, but ohlc in particular # returns a (n x 4) array. Output requires 1D ndarrays as values, so we @@ -890,28 +885,26 @@ def _cython_agg_general( # e.g. ohlc assert len(agg_names) == result.shape[1] for result_column, result_name in zip(result.T, agg_names): - output[idx] = self._try_cast(result_column, obj) - names.append(result_name) + key = base.OutputKey(label=result_name, position=idx) + output[key] = self._try_cast(result_column, obj) idx += 1 else: assert result.ndim == 1 - output[idx] = self._try_cast(result, obj) - names.append(name) + key = base.OutputKey(label=name, position=idx) + output[key] = self._try_cast(result, obj) idx += 1 if len(output) == 0: raise DataError("No numeric types to aggregate") - columns = Index(names) - return self._wrap_aggregated_output(output, columns) + return self._wrap_aggregated_output(output) def _python_agg_general(self, func, *args, **kwargs): func = self._is_builtin_func(func) f = lambda x: func(x, *args, **kwargs) # iterate through "columns" ex exclusions to populate output dict - output: Dict[int, np.ndarray] = {} - names: List[Hashable] = [] + output: Dict[base.OutputKey, np.ndarray] = {} for idx, obj in enumerate(self._iterate_slices()): name = obj.name @@ -933,8 +926,8 @@ def _python_agg_general(self, func, *args, **kwargs): result, counts = self.grouper.agg_series(obj, f) assert result is not None - output[idx] = self._try_cast(result, obj, numeric_only=True) - names.append(name) + key = base.OutputKey(label=name, position=idx) + output[key] = self._try_cast(result, obj, numeric_only=True) if len(output) == 0: return self._python_apply_general(f) @@ -942,17 +935,16 @@ def _python_agg_general(self, func, *args, **kwargs): if self.grouper._filter_empty_groups: mask = counts.ravel() > 0 - for idx, result in output.items(): + for key, result in output.items(): # since we are masking, make sure that we have a float object values = result if is_numeric_dtype(values.dtype): values = ensure_float(values) - output[idx] = self._try_cast(values[mask], result) + output[key] = self._try_cast(values[mask], result) - columns = Index(names) - return self._wrap_aggregated_output(output, columns) + return self._wrap_aggregated_output(output) def _concat_objects(self, keys, values, not_indexed_same: bool = False): from pandas.core.reshape.concat import concat @@ -2261,8 +2253,7 @@ def _get_cythonized_result( grouper = self.grouper labels, _, ngroups = grouper.group_info - output: Dict[int, np.ndarray] = {} - names: List[Optional[Hashable]] = [] + output: Dict[base.OutputKey, np.ndarray] = {} base_func = getattr(libgroupby, how) for idx, obj in enumerate(self._iterate_slices()): @@ -2299,14 +2290,13 @@ def _get_cythonized_result( if post_processing: result = post_processing(result, inferences) - output[idx] = result - names.append(name) + key = base.OutputKey(label=name, position=idx) + output[key] = result - columns = Index(names) if aggregate: - return self._wrap_aggregated_output(output, columns) + return self._wrap_aggregated_output(output) else: - return self._wrap_transformed_output(output, columns) + return self._wrap_transformed_output(output) @Substitution(name="groupby") @Appender(_common_see_also) From a0aae64b58e6085bfb3ffc78eb93c7b54f44aa05 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 18 Nov 2019 18:43:42 -0800 Subject: [PATCH 44/45] mypy fixes --- pandas/core/groupby/generic.py | 8 +++++++- pandas/core/groupby/groupby.py | 6 ++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index e23adcbfc56ef..b3579303b4ffc 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -333,7 +333,7 @@ def _aggregate_multiple_funcs(self, arg, _level): return DataFrame(results, columns=columns) def _wrap_series_output( - self, output: Mapping[int, Union[Series, np.ndarray]], index: Index, + self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]], index: Index, ) -> Union[Series, DataFrame]: """ Wraps the output of a SeriesGroupBy operation into the expected result. @@ -405,7 +405,13 @@ def _wrap_transformed_output( Returns ------- Series + + Notes + ----- + output should always contain one element. It is specified as a dict + for consistency with DataFrame methods and _wrap_aggregated_output. """ + assert len(output) == 1 result = self._wrap_series_output(output=output, index=self.obj.index) # No transformations increase the ndim of the result diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index f1cd21418409c..fe2e3441f2adc 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -851,12 +851,10 @@ def _cython_transform(self, how: str, numeric_only: bool = True, **kwargs): return self._wrap_transformed_output(output) - def _wrap_aggregated_output(self, output: Mapping[int, np.ndarray], columns: Index): + def _wrap_aggregated_output(self, output: Mapping[base.OutputKey, np.ndarray]): raise AbstractMethodError(self) - def _wrap_transformed_output( - self, output: Mapping[int, np.ndarray], columns: Index - ): + def _wrap_transformed_output(self, output: Mapping[base.OutputKey, np.ndarray]): raise AbstractMethodError(self) def _wrap_applied_output(self, keys, values, not_indexed_same: bool = False): From a9b411ab7248679c09c0ea7f139ed7518f4418c4 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 18 Nov 2019 18:47:42 -0800 Subject: [PATCH 45/45] docstring cleanup --- pandas/core/groupby/generic.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index b3579303b4ffc..fc005399c6e43 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -340,7 +340,7 @@ def _wrap_series_output( Parameters ---------- - output : Mapping[int, Union[Series, np.ndarray]] + output : Mapping[base.OutputKey, Union[Series, np.ndarray]] Data to wrap. index : pd.Index Index to apply to the output. @@ -1642,7 +1642,7 @@ def _wrap_aggregated_output( Parameters ---------- - output : dict[int, Union[Series, np.ndarray]] + output : Mapping[base.OutputKey, Union[Series, np.ndarray]] Data to wrap. Returns @@ -1675,7 +1675,7 @@ def _wrap_transformed_output( Parameters ---------- - output : dict[int, Union[Series, np.ndarray]] + output : Mapping[base.OutputKey, Union[Series, np.ndarray]] Data to wrap. Returns