From afd84912186425797d5d3d69159847c610d0462c Mon Sep 17 00:00:00 2001 From: janus Date: Tue, 20 Oct 2020 11:06:37 +0200 Subject: [PATCH 1/3] Call __finalize__ in `groupby` iterator This solution does not call `__finalize__` immediately on object creation in the `DataSplitter` instance. This is to allow `.apply` to avoid the overhead of calling `__finalize__` for intermediate objects. --- pandas/core/groupby/ops.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index bca71b5c9646b..7d772b2d07a4e 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -140,9 +140,16 @@ def get_iterator( splitter = self._get_splitter(data, axis=axis) keys = self._get_group_keys() for key, (i, group) in zip(keys, splitter): - yield key, group + yield key, group.__finalize__(data, method="groupby") def _get_splitter(self, data: FrameOrSeries, axis: int = 0) -> "DataSplitter": + """ + Returns + ------- + Generator yielding subsetted objects + + __finalize__ has not been called for the the subsetted objects returned. + """ comp_ids, _, ngroups = self.group_info return get_splitter(data, comp_ids, ngroups, axis=axis) @@ -921,7 +928,8 @@ class SeriesSplitter(DataSplitter): def _chop(self, sdata: Series, slice_obj: slice) -> Series: # fastpath equivalent to `sdata.iloc[slice_obj]` mgr = sdata._mgr.get_slice(slice_obj) - return type(sdata)(mgr, name=sdata.name, fastpath=True) + # __finalize__ not called here, must be applied by caller if applicable + return sdata._constructor(mgr, name=sdata.name, fastpath=True) class FrameSplitter(DataSplitter): @@ -937,7 +945,8 @@ def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame: # else: # return sdata.iloc[:, slice_obj] mgr = sdata._mgr.get_slice(slice_obj, axis=1 - self.axis) - return type(sdata)(mgr) + # __finalize__ not called here, must be applied by caller if applicable + return sdata._constructor(mgr) def get_splitter( From e733ba4ada6d7e42023d6a4b435565c8dc877340 Mon Sep 17 00:00:00 2001 From: janus Date: Mon, 2 Nov 2020 12:58:25 +0100 Subject: [PATCH 2/3] Add test for GH-37343 See https://pandas.pydata.org/pandas-docs/stable/development/extending.html#override-constructor-properties --- pandas/tests/groupby/test_groupby_subclass.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/tests/groupby/test_groupby_subclass.py b/pandas/tests/groupby/test_groupby_subclass.py index cc7a79e976513..d268d87708552 100644 --- a/pandas/tests/groupby/test_groupby_subclass.py +++ b/pandas/tests/groupby/test_groupby_subclass.py @@ -51,6 +51,15 @@ def test_groupby_preserves_subclass(obj, groupby_func): tm.assert_series_equal(result1, result2) +def test_groupby_preserves_metadata(): + # GH-37343 + custom_df = tm.SubclassedDataFrame({"a": [1, 2, 3], "b": [1, 1, 2], "c": [7, 8, 9]}) + assert "testattr" in custom_df._metadata + custom_df.testattr = "hello" + for _, group_df in custom_df.groupby("c"): + assert group_df.testattr == "hello" + + @pytest.mark.parametrize("obj", [DataFrame, tm.SubclassedDataFrame]) def test_groupby_resample_preserves_subclass(obj): # GH28330 -- preserve subclass through groupby.resample() From f7d0a4bf2f14e521f71669f9f6cc3d190b9f780e Mon Sep 17 00:00:00 2001 From: janus Date: Wed, 4 Nov 2020 11:27:23 +0100 Subject: [PATCH 3/3] Add 1.1.5 whatsnew entry --- doc/source/whatsnew/v1.1.5.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.5.rst b/doc/source/whatsnew/v1.1.5.rst index cf728d94b2a55..a122154904996 100644 --- a/doc/source/whatsnew/v1.1.5.rst +++ b/doc/source/whatsnew/v1.1.5.rst @@ -23,7 +23,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- +- Bug in metadata propagation for ``groupby`` iterator (:issue:`37343`) - .. ---------------------------------------------------------------------------