diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index ce64ed754180d..9047f1504aa1a 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -1917,8 +1917,15 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True):
     any_callable = any(callable(g) or isinstance(g, dict) for g in keys)
     any_arraylike = any(isinstance(g, (list, tuple, Series, np.ndarray))
                         for g in keys)
+    # sugar for df.reset_index().groupby(['a', 'b']) where b was in index_names
+    from_col, from_idx, from_both = _from_index_and_columns(obj, keys)
 
     try:
+        if from_idx and from_col:
+            # check the drop part...
+            obj = obj.reset_index(level=list(from_idx)).reset_index(drop=True)
+            group_axis = obj._get_axis(axis)
+
         if isinstance(obj, DataFrame):
             all_in_columns = all(g in obj.columns for g in keys)
         else:
@@ -1940,6 +1947,7 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True):
 
     groupings = []
     exclusions = []
+
     for i, (gpr, level) in enumerate(zip(keys, levels)):
         name = None
         try:
@@ -1969,6 +1977,43 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True):
     return grouper, exclusions, obj
 
 
+def _from_index_and_columns(obj, keys):
+    """
+    Split string group keys into column labels and index level names.
+
+    Parameters
+    ----------
+    obj : object to be grouped
+    keys : list-like of group keys (already list-like)
+
+    Returns
+    -------
+    from_col, from_idx, from_both : sets, or all None
+        Keys found in the columns, keys found only in the index names,
+        and keys found in both. All three are None when obj is not a
+        DataFrame or when any key is not a string.
+    """
+    not_all_string = not all(isinstance(g, compat.string_types) for g in keys)
+    not_df = not isinstance(obj, DataFrame)
+    if not_all_string or not_df:
+        # TODO: Handle mix of callables and strings.
+        return None, None, None
+    ks = set(keys)
+    from_idx = ks & set(obj.index.names)
+    from_col = ks & set(obj.columns)
+
+    # check for ambiguity:
+    from_both = from_idx & from_col
+    if from_both:
+        from warnings import warn
+        msg = ("Found {0} in both the columns and index labels. "
+               "Grouping by the columns".format(from_both))
+        warn(msg, FutureWarning)
+
+    # don't need to do anything if the only ones from either are in both
+    return from_col, from_idx - from_both, from_both
+
+
 def _is_label_like(val):
     return isinstance(val, compat.string_types) or np.isscalar(val)
 
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
index 1b70ae0309b10..ae06e60724264 100644
--- a/pandas/tests/test_groupby.py
+++ b/pandas/tests/test_groupby.py
@@ -11,7 +11,8 @@
 from pandas.core.common import rands
 from pandas.core.api import Categorical, DataFrame
 from pandas.core.groupby import (SpecificationError, DataError,
-                                 _nargsort, _lexsort_indexer)
+                                 _nargsort, _lexsort_indexer,
+                                 _from_index_and_columns)
 from pandas.core.series import Series
 from pandas.util.testing import (assert_panel_equal, assert_frame_equal,
                                  assert_series_equal, assert_almost_equal,
@@ -4168,6 +4169,56 @@ def test_nargsort(self):
         expected = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1))
         assert_equal(result, expected)
 
+    def test_by_index_cols(self):
+        df = DataFrame([[1, 2, 'x', 'a', 'a'],
+                        [2, 3, 'x', 'a', 'b'],
+                        [3, 4, 'x', 'b', 'a'],
+                        [4, 5, 'y', 'b', 'b']],
+                       columns=['c1', 'c2', 'g1', 'i1', 'i2'])
+        df = df.set_index(['i1', 'i2'])
+        df.index.set_names(['i1', 'g1'], inplace=True)
+        result = df.groupby(by=['g1', 'i1']).mean()
+        idx = MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'b')],
+                                     names=['g1', 'i1'])
+        expected = DataFrame([[1.5, 2.5], [3., 4.], [4., 5.]],
+                             index=idx, columns=['c1', 'c2'])
+        assert_frame_equal(result, expected)
+
+        with tm.assert_produces_warning(FutureWarning):
+            result = df.groupby('g1').mean()
+        expected = DataFrame([[2., 3.], [4., 5.]],
+                             index=['x', 'y'], columns=['c1', 'c2'])
+        expected.index.set_names(['g1'], inplace=True)
+        assert_frame_equal(result, expected)
+
+    def test_from_index_and_columns(self):
+        # allowing by to spread across index and col names GH #5677
+        df = DataFrame([[1, 2, 3, 4]], columns=['c1', 'c2', 'i1', 'i2'])
+        df = df.set_index(['i1', 'i2'])
+
+        keys = ['c1']
+        from_col, from_idx, from_both = _from_index_and_columns(df, keys)
+        self.assertEqual(from_col, set(['c1']))
+        self.assertEqual(from_idx, set([]))
+        self.assertEqual(from_both, set([]))
+
+        keys = ['c1', 'i1']
+        from_col, from_idx, from_both = _from_index_and_columns(df, keys)
+        self.assertEqual(from_col, set(['c1']))
+        self.assertEqual(from_idx, set(['i1']))
+        self.assertEqual(from_both, set([]))
+
+        df.index.names = ['i1', 'c1']
+        keys = ['c1', 'i1']
+        with tm.assert_produces_warning(FutureWarning):
+            from_col, from_idx, from_both = _from_index_and_columns(df, keys)
+        self.assertEqual(from_col, set(['c1']))
+        self.assertEqual(from_idx, set(['i1']))
+        self.assertEqual(from_both, set(['c1']))
+
+        res = _from_index_and_columns(df['c1'], 'i1')
+        self.assertEqual(res, (None, None, None))
+
 def assert_fp_equal(a, b):
     assert (np.abs(a - b) < 1e-12).all()
 