BUG: DataFrameGroupBy.__getitem__ with non-unique columns (#41427)

jbrockmendel · web-flow · commit 913cfab85fcd · 2021-05-11T21:10:28.000-04:00
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
@@ -895,6 +895,8 @@ Groupby/resample/rolling
 - Bug in :meth:`SeriesGroupBy.agg` failing to retain ordered :class:`CategoricalDtype` on order-preserving aggregations (:issue:`41147`)
 - Bug in :meth:`DataFrameGroupBy.min` and :meth:`DataFrameGroupBy.max` with multiple object-dtype columns and ``numeric_only=False`` incorrectly raising ``ValueError`` (:issue:`41111`)
 - Bug in :meth:`DataFrameGroupBy.rank` with the GroupBy object's ``axis=0`` and the ``rank`` method's keyword ``axis=1`` (:issue:`41320`)
+- Bug in :meth:`DataFrameGroupBy.__getitem__` with non-unique columns incorrectly returning a malformed :class:`SeriesGroupBy` instead of :class:`DataFrameGroupBy` (:issue:`41427`)
+- Bug in :meth:`DataFrameGroupBy.transform` with non-unique columns incorrectly raising ``AttributeError`` (:issue:`41427`)
 
 Reshaping
 ^^^^^^^^^
diff --git a/pandas/core/base.py b/pandas/core/base.py
@@ -214,7 +214,7 @@ def ndim(self) -> int:
     @cache_readonly
     def _obj_with_exclusions(self):
         if self._selection is not None and isinstance(self.obj, ABCDataFrame):
-            return self.obj.reindex(columns=self._selection_list)
+            return self.obj[self._selection_list]
 
         if len(self.exclusions) > 0:
             return self.obj.drop(self.exclusions, axis=1)
@@ -239,7 +239,9 @@ def __getitem__(self, key):
         else:
             if key not in self.obj:
                 raise KeyError(f"Column not found: {key}")
-            return self._gotitem(key, ndim=1)
+            subset = self.obj[key]
+            ndim = subset.ndim
+            return self._gotitem(key, ndim=ndim, subset=subset)
 
     def _gotitem(self, key, ndim: int, subset=None):
         """
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -1417,12 +1417,19 @@ def _choose_path(self, fast_path: Callable, slow_path: Callable, group: DataFram
         return path, res
 
     def _transform_item_by_item(self, obj: DataFrame, wrapper) -> DataFrame:
-        # iterate through columns
+        # iterate through columns, see test_transform_exclude_nuisance
         output = {}
         inds = []
         for i, col in enumerate(obj):
+            subset = obj.iloc[:, i]
+            sgb = SeriesGroupBy(
+                subset,
+                selection=col,
+                grouper=self.grouper,
+                exclusions=self.exclusions,
+            )
             try:
-                output[col] = self[col].transform(wrapper)
+                output[i] = sgb.transform(wrapper)
             except TypeError:
                 # e.g. trying to call nanmean with string values
                 pass
@@ -1434,7 +1441,9 @@ def _transform_item_by_item(self, obj: DataFrame, wrapper) -> DataFrame:
 
         columns = obj.columns.take(inds)
 
-        return self.obj._constructor(output, index=obj.index, columns=columns)
+        result = self.obj._constructor(output, index=obj.index)
+        result.columns = columns
+        return result
 
     def filter(self, func, dropna=True, *args, **kwargs):
         """
@@ -1504,7 +1513,7 @@ def filter(self, func, dropna=True, *args, **kwargs):
 
         return self._apply_filter(indices, dropna)
 
-    def __getitem__(self, key):
+    def __getitem__(self, key) -> DataFrameGroupBy | SeriesGroupBy:
         if self.axis == 1:
             # GH 37725
             raise ValueError("Cannot subset columns when using axis=1")
diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py
@@ -20,6 +20,10 @@
     date_range,
 )
 import pandas._testing as tm
+from pandas.core.groupby.generic import (
+    DataFrameGroupBy,
+    SeriesGroupBy,
+)
 from pandas.core.groupby.groupby import DataError
 
 
@@ -391,13 +395,31 @@ def test_transform_select_columns(df):
     tm.assert_frame_equal(result, expected)
 
 
-def test_transform_exclude_nuisance(df):
+@pytest.mark.parametrize("duplicates", [True, False])
+def test_transform_exclude_nuisance(df, duplicates):
+    # case that goes through _transform_item_by_item
+
+    if duplicates:
+        # make sure we work with duplicate columns GH#41427
+        df.columns = ["A", "C", "C", "D"]
 
     # this also tests orderings in transform between
     # series/frame to make sure it's consistent
     expected = {}
     grouped = df.groupby("A")
-    expected["C"] = grouped["C"].transform(np.mean)
+
+    gbc = grouped["C"]
+    expected["C"] = gbc.transform(np.mean)
+    if duplicates:
+        # squeeze 1-column DataFrame down to Series
+        expected["C"] = expected["C"]["C"]
+
+        assert isinstance(gbc.obj, DataFrame)
+        assert isinstance(gbc, DataFrameGroupBy)
+    else:
+        assert isinstance(gbc, SeriesGroupBy)
+        assert isinstance(gbc.obj, Series)
+
     expected["D"] = grouped["D"].transform(np.mean)
     expected = DataFrame(expected)
     result = df.groupby("A").transform(np.mean)