From 98a4b37711d38f98100799e3cc3b197e7f236a7c Mon Sep 17 00:00:00 2001 From: jreback Date: Mon, 17 Mar 2014 09:03:11 -0400 Subject: [PATCH 1/2] DOC: update groupby pd.Grouper docs --- doc/source/api.rst | 1 + doc/source/release.rst | 2 +- doc/source/v0.14.0.txt | 2 +- pandas/core/groupby.py | 30 +++++++++++++++++++++--------- 4 files changed, 24 insertions(+), 11 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index 811301a6bbbca..1c80712e82d49 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -1193,6 +1193,7 @@ Indexing, iteration GroupBy.groups GroupBy.indices GroupBy.get_group + Grouper Function application ~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/release.rst b/doc/source/release.rst index c0415a350515f..bc4807a293d12 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -132,7 +132,7 @@ API Changes ``FutureWarning`` is raised to alert that the old ``rows`` and ``cols`` arguments will not be supported in a future release (:issue:`5505`) -- Allow specification of a more complex groupby, via ``pd.Groupby`` (:issue:`3794`) +- Allow specification of a more complex groupby, via ``pd.Grouper`` (:issue:`3794`) - A tuple passed to ``DataFame.sort_index`` will be interpreted as the levels of the index, rather than requiring a list of tuple (:issue:`4370`) diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index 54995fc8daeb5..ea321cbab545a 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -94,7 +94,7 @@ These are out-of-bounds selections g.nth(0, dropna='any') # similar to old behaviour -- Allow specification of a more complex groupby via ``pd.Groupby``, such as grouping +- Allow specification of a more complex groupby via ``pd.Grouper``, such as grouping by a Time and a string field simultaneously. See :ref:`the docs <groupby.specify>`. 
(:issue:`3794`) - Local variable usage has changed in diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 683c07b70d0f2..9deee2d593476 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -141,14 +141,26 @@ def _last(x): class Grouper(object): """ - A Grouper allows the user to specify a groupby instruction + A Grouper allows the user to specify a groupby instruction for a target object + + This specification will select a column via the key parameter, or if the level and/or + axis parameters are given, a level of the index of the target object. + + These are local specifications and will override 'global' settings, that is, the parameters + axis and level which are passed to the groupby itself. Parameters ---------- - key : groupby key, default None - level : name, int level number, default None - freq : string / freqency object, default None - sort : boolean, whether to sort the resulting labels, default True + key : string, defaults to None + groupby key, which selects the grouping column of the target + level : name/number, defaults to None + the level for the target index + freq : string / frequency object, defaults to None + This will group by the specified frequency if the target selection (via key or level) is + a datetime-like object + axis : number/name of the axis, defaults to None + sort : boolean, defaults to False + whether to sort the resulting labels Returns ------- @@ -156,10 +168,10 @@ class Grouper(object): Examples -------- - df.groupby(Group(key='A')) : syntatic sugar for df.groupby('A') - df.groupby(Group(key='date',freq='60s')) : specify a resample on the column 'date' - df.groupby(Group(level='date',freq='60s',axis=1)) : - specify a resample on the level 'date' on the columns axis with a frequency of 60s + >>> df.groupby(Grouper(key='A')) : syntactic sugar for df.groupby('A') + >>> df.groupby(Grouper(key='date',freq='60s')) : specify a resample on the column 'date' + >>> 
df.groupby(Grouper(level='date',freq='60s',axis=1)) : + specify a resample on the level 'date' on the columns axis with a frequency of 60s """ From 0719f218d944ab72f18936dbf0faa643959a6717 Mon Sep 17 00:00:00 2001 From: jreback Date: Mon, 17 Mar 2014 09:21:21 -0400 Subject: [PATCH 2/2] CLN: move set_grouper/get_grouper type pd.Grouper methods to internal --- pandas/core/groupby.py | 12 ++++++------ pandas/tseries/resample.py | 20 ++++++++++---------- pandas/tseries/tests/test_resample.py | 4 ++-- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 9deee2d593476..15e6381cbe2fa 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -198,7 +198,7 @@ def __init__(self, key=None, level=None, freq=None, axis=None, sort=False): def ax(self): return self.grouper - def get_grouper(self, obj): + def _get_grouper(self, obj): """ Parameters @@ -210,10 +210,10 @@ def get_grouper(self, obj): a tuple of binner, grouper, obj (possibly sorted) """ - self.set_grouper(obj) + self._set_grouper(obj) return self.binner, self.grouper, self.obj - def set_grouper(self, obj, sort=False): + def _set_grouper(self, obj, sort=False): """ given an object and the specifcations, setup the internal grouper for this particular specification @@ -264,7 +264,7 @@ def set_grouper(self, obj, sort=False): self.grouper = ax return self.grouper - def get_binner_for_grouping(self, obj): + def _get_binner_for_grouping(self, obj): raise NotImplementedError @property @@ -1697,7 +1697,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, elif isinstance(self.grouper, Grouper): # get the new grouper - grouper = self.grouper.get_binner_for_grouping(self.obj) + grouper = self.grouper._get_binner_for_grouping(self.obj) self.obj = self.grouper.obj self.grouper = grouper if self.name is None: @@ -1807,7 +1807,7 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True): # a passed in Grouper, directly convert 
if isinstance(key, Grouper): - binner, grouper, obj = key.get_grouper(obj) + binner, grouper, obj = key._get_grouper(obj) return grouper, [], obj # already have a BaseGrouper, just return it diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py index b29f67b40894b..51144cb3bba2c 100644 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -76,7 +76,7 @@ def __init__(self, freq='Min', closed=None, label=None, how='mean', super(TimeGrouper, self).__init__(freq=freq, axis=axis, **kwargs) def resample(self, obj): - self.set_grouper(obj, sort=True) + self._set_grouper(obj, sort=True) ax = self.grouper if isinstance(ax, DatetimeIndex): @@ -93,7 +93,7 @@ def resample(self, obj): rs = self._resample_periods() else: obj = self.obj.to_timestamp(how=self.convention) - self.set_grouper(obj) + self._set_grouper(obj) rs = self._resample_timestamps() elif len(ax) == 0: return self.obj @@ -104,11 +104,11 @@ def resample(self, obj): rs_axis.name = ax.name return rs - def get_grouper(self, obj): - self.set_grouper(obj) - return self.get_binner_for_resample() + def _get_grouper(self, obj): + self._set_grouper(obj) + return self._get_binner_for_resample() - def get_binner_for_resample(self): + def _get_binner_for_resample(self): # create the BinGrouper # assume that self.set_grouper(obj) has already been called @@ -121,12 +121,12 @@ def get_binner_for_resample(self): self.grouper = BinGrouper(bins, binlabels) return self.binner, self.grouper, self.obj - def get_binner_for_grouping(self, obj): + def _get_binner_for_grouping(self, obj): # return an ordering of the transformed group labels, # suitable for multi-grouping, e.g the labels for # the resampled intervals - ax = self.set_grouper(obj) - self.get_binner_for_resample() + ax = self._set_grouper(obj) + self._get_binner_for_resample() # create the grouper binner = self.binner @@ -233,7 +233,7 @@ def _resample_timestamps(self): # assumes set_grouper(obj) already called axlabels = self.ax - 
self.get_binner_for_resample() + self._get_binner_for_resample() grouper = self.grouper binner = self.binner obj = self.obj diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py index 20c6724726955..242d656b8794f 100644 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -1134,7 +1134,7 @@ def test_apply_iteration(self): df = DataFrame({'open': 1, 'close': 2}, index=ind) tg = TimeGrouper('M') - _, grouper, _ = tg.get_grouper(df) + _, grouper, _ = tg._get_grouper(df) # Errors grouped = df.groupby(grouper, group_keys=False) @@ -1151,7 +1151,7 @@ def test_panel_aggregation(self): minor_axis=['A', 'B', 'C', 'D']) tg = TimeGrouper('M', axis=1) - _, grouper, _ = tg.get_grouper(wp) + _, grouper, _ = tg._get_grouper(wp) bingrouped = wp.groupby(grouper) binagg = bingrouped.mean()