From 79f41cca1741ee0d3128ed8641deaa6b1597f45a Mon Sep 17 00:00:00 2001 From: Todd Jennings Date: Mon, 19 May 2014 20:04:31 +0200 Subject: [PATCH 1/3] implement additional tests for groupby apply methods --- pandas/tests/test_groupby.py | 55 +++++++++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 475f5085d55f5..a13436697a24d 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -1633,8 +1633,12 @@ def _testit(op): result = op(grouped)['C'] assert_series_equal(result, exp) + _testit(lambda x: x.count()) _testit(lambda x: x.sum()) + _testit(lambda x: x.std()) + _testit(lambda x: x.var()) _testit(lambda x: x.mean()) + _testit(lambda x: x.median()) _testit(lambda x: x.prod()) _testit(lambda x: x.min()) _testit(lambda x: x.max()) @@ -4166,7 +4170,7 @@ def test_tab_completion(self): 'agg','aggregate','apply','boxplot','filter','first','get_group', 'groups','hist','indices','last','max','mean','median', 'min','name','ngroups','nth','ohlc','plot', 'prod', - 'size','std','sum','transform','var', 'count', 'head', 'describe', + 'size', 'std', 'sum', 'transform', 'var', 'count', 'head', 'describe', 'cummax', 'quantile', 'rank', 'cumprod', 'tail', 'resample', 'cummin', 'fillna', 'cumsum', 'cumcount', 'all', 'shift', 'skew', 'bfill', 'irow', 'ffill', @@ -4306,6 +4310,55 @@ def __eq__(self, other): name='grp')) tm.assert_frame_equal(result, expected) + def test__cython_agg_general(self): + ops = [('mean', np.mean), + ('median', np.median), + ('var', np.var), + ('add', np.sum), + ('prod', np.prod), + ('min', np.min), + ('max', np.max), + ('first', lambda x: x.iloc[0]), + ('last', lambda x: x.iloc[-1]), + ('count', np.size), + ] + df = DataFrame(np.random.randn(1000)) + labels = np.random.randint(0, 50, size=1000).astype(float) + + for op, targop in ops: + result = df.groupby(labels)._cython_agg_general(op) + expected = df.groupby(labels).agg(targop) + try: + tm.assert_frame_equal(result, expected) + except BaseException as exc: + exc.args += ('operation: %s' % op,) + raise + + def test_ops_general(self): + ops = [('mean', np.mean), + ('median', np.median), + ('std', np.std), + ('var', np.var), + ('sum', np.sum), + ('prod', np.prod), + ('min', np.min), + ('max', np.max), + ('first', lambda x: x.iloc[0]), + ('last', lambda x: x.iloc[-1]), + ('count', np.size), + ] + df = DataFrame(np.random.randn(1000)) + labels = np.random.randint(0, 50, size=1000).astype(float) + + for op, targop in ops: + result = getattr(df.groupby(labels), op)().astype(float) + expected = df.groupby(labels).agg(targop) + try: + tm.assert_frame_equal(result, expected) + except BaseException as exc: + exc.args += ('operation: %s' % op,) + raise + def assert_fp_equal(a, b): assert (np.abs(a - b) < 1e-12).all() From ec9a09c87a13f2068e1e4e4ec2368f2b34d112a3 Mon Sep 17 00:00:00 2001 From: Todd Jennings Date: Fri, 16 May 2014 17:57:57 +0200 Subject: [PATCH 2/3] simplify groupby's std method --- pandas/core/groupby.py | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 1b07e2fb0aeab..31bdeaaddde32 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -695,12 +695,7 @@ def std(self, ddof=1): For multiple groupings, the result index will be a MultiIndex """ # todo, implement at cython level? - if ddof == 1: - return self._cython_agg_general('std') - else: - self._set_selection_from_grouper() - f = lambda x: x.std(ddof=ddof) - return self._python_agg_general(f) + return np.sqrt(self.var(ddof=ddof)) def var(self, ddof=1): """ @@ -1332,7 +1327,6 @@ def get_group_levels(self): 'name': 'group_median' }, 'var': 'group_var', - 'std': 'group_var', 'first': { 'name': 'group_nth', 'f': lambda func, a, b, c, d: func(a, b, c, d, 1) @@ -1341,10 +1335,6 @@ def get_group_levels(self): 'count': 'group_count', } - _cython_transforms = { - 'std': np.sqrt, - } - _cython_arity = { 'ohlc': 4, # OHLC } @@ -1455,7 +1445,6 @@ def aggregate(self, values, how, axis=0): def _aggregate(self, result, counts, values, how, is_numeric): agg_func, dtype = self._get_aggregate_function(how, values) - trans_func = self._cython_transforms.get(how, lambda x: x) comp_ids, _, ngroups = self.group_info if values.ndim > 3: @@ -1469,7 +1458,7 @@ def _aggregate(self, result, counts, values, how, is_numeric): else: agg_func(result, counts, values, comp_ids) - return trans_func(result) + return result def agg_series(self, obj, func): try: @@ -1669,7 +1658,6 @@ def names(self): 'min': 'group_min_bin', 'max': 'group_max_bin', 'var': 'group_var_bin', - 'std': 'group_var_bin', 'ohlc': 'group_ohlc', 'first': { 'name': 'group_nth_bin', @@ -1688,7 +1676,6 @@ def names(self): def _aggregate(self, result, counts, values, how, is_numeric=True): agg_func, dtype = self._get_aggregate_function(how, values) - trans_func = self._cython_transforms.get(how, lambda x: x) if values.ndim > 3: # punting for now @@ -1699,7 +1686,7 @@ def _aggregate(self, result, counts, values, how, is_numeric=True): else: agg_func(result, counts, values, self.bins) - return trans_func(result) + return result def agg_series(self, obj, func): dummy = obj[:0] From 2121b22ebfb539db62e248be0f99815e2931ac90 Mon Sep 17 00:00:00 2001 From: Todd Jennings Date: Mon, 12 May 2014 18:47:34 +0200 Subject: [PATCH 3/3] add sem to nanops and pandas object apply methods --- doc/source/api.rst | 4 +++ doc/source/basics.rst | 1 + doc/source/groupby.rst | 4 +-- doc/source/timeseries.rst | 4 +-- doc/source/v0.14.1.txt | 3 ++ pandas/core/generic.py | 24 ++++++++++++-- pandas/core/groupby.py | 10 +++++- pandas/core/nanops.py | 47 ++++++++++++++++++--------- pandas/tests/test_frame.py | 17 ++++++++++ pandas/tests/test_groupby.py | 20 +++++++++--- pandas/tests/test_multilevel.py | 2 +- pandas/tests/test_panel.py | 7 ++++ pandas/tests/test_panel4d.py | 7 ++++ pandas/tests/test_series.py | 13 ++++++++ pandas/tseries/tests/test_resample.py | 45 ++++++++++++++++++++++++- 15 files changed, 178 insertions(+), 30 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index c037dfa8d7acf..bc257ffa0ad6c 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -350,6 +350,7 @@ Computations / Descriptive Stats Series.prod Series.quantile Series.rank + Series.sem Series.skew Series.std Series.sum @@ -642,6 +643,7 @@ Computations / Descriptive Stats DataFrame.prod DataFrame.quantile DataFrame.rank + DataFrame.sem DataFrame.skew DataFrame.sum DataFrame.std @@ -895,6 +897,7 @@ Computations / Descriptive Stats Panel.min Panel.pct_change Panel.prod + Panel.sem Panel.skew Panel.sum Panel.std @@ -1222,6 +1225,7 @@ Computations / Descriptive Stats GroupBy.mean GroupBy.median + GroupBy.sem GroupBy.std GroupBy.var GroupBy.ohlc diff --git a/doc/source/basics.rst b/doc/source/basics.rst index dd1ea5678698d..f614e1b7edcf4 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -428,6 +428,7 @@ optional ``level`` parameter which applies only if the object has a ``prod``, Product of values ``std``, Unbiased standard deviation ``var``, Unbiased variance + ``sem``, Unbiased standard error of the mean ``skew``, Unbiased skewness (3rd moment) ``kurt``, Unbiased kurtosis (4th moment) ``quantile``, Sample quantile (value at %) diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index cad557756f897..c0db87d58ef08 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -397,7 +397,7 @@ index are the group names and whose values are the sizes of each group. named *columns*. Aggregating functions are ones that reduce the dimension of the returned objects, - for example: ``mean, sum, size, count, std, var, describe, first, last, nth, min, max``. This is + for example: ``mean, sum, size, count, std, var, sem, describe, first, last, nth, min, max``. This is what happens when you do for example ``DataFrame.sum()`` and get back a ``Series``. ``nth`` can act as a reducer *or* a filter, see :ref:`here ` @@ -457,7 +457,7 @@ must be either implemented on GroupBy or available via :ref:`dispatching Cython-optimized aggregation functions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Some common aggregations, currently only ``sum``, ``mean``, and ``std``, have +Some common aggregations, currently only ``sum``, ``mean``, ``std``, and ``sem``, have optimized Cython implementations: .. ipython:: python diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index 48acacd7ced08..53efe061e218f 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -974,8 +974,8 @@ an array and produces aggregated values: ts.resample('5Min', how=np.max) Any function available via :ref:`dispatching ` can be given to -the ``how`` parameter by name, including ``sum``, ``mean``, ``std``, ``max``, -``min``, ``median``, ``first``, ``last``, ``ohlc``. +the ``how`` parameter by name, including ``sum``, ``mean``, ``std``, ``sem``, +``max``, ``min``, ``median``, ``first``, ``last``, ``ohlc``. For downsampling, ``closed`` can be set to 'left' or 'right' to specify which end of the interval is closed: diff --git a/doc/source/v0.14.1.txt b/doc/source/v0.14.1.txt index ff35ce9ca3069..f83cd50bbd8c5 100644 --- a/doc/source/v0.14.1.txt +++ b/doc/source/v0.14.1.txt @@ -73,6 +73,9 @@ Enhancements See :ref:`the docs `. +- Implemented ``sem`` (standard error of the mean) operation for ``Series``, + ``DataFrame``, ``Panel``, and ``Groupby`` (:issue:`6897`) + .. _whatsnew_0141.performance: Performance diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ed0d92683ad54..4500a9181f5d9 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3794,7 +3794,8 @@ def mad(self, axis=None, skipna=None, level=None, **kwargs): @Substitution(outname='variance', desc="Return unbiased variance over requested " - "axis\nNormalized by N-1") + "axis.\n\nNormalized by N-1 by default. " + "This can be changed using the ddof argument") @Appender(_num_doc) def var(self, axis=None, skipna=None, level=None, ddof=1, **kwargs): if skipna is None: @@ -3811,7 +3812,8 @@ def var(self, axis=None, skipna=None, level=None, ddof=1, **kwargs): @Substitution(outname='stdev', desc="Return unbiased standard deviation over requested " - "axis\nNormalized by N-1") + "axis.\n\nNormalized by N-1 by default. " + "This can be changed using the ddof argument") @Appender(_num_doc) def std(self, axis=None, skipna=None, level=None, ddof=1, **kwargs): if skipna is None: @@ -3827,6 +3829,24 @@ def std(self, axis=None, skipna=None, level=None, ddof=1, **kwargs): return np.sqrt(result) cls.std = std + @Substitution(outname='standarderror', + desc="Return unbiased standard error of the mean over " + "requested axis.\n\nNormalized by N-1 by default. " + "This can be changed using the ddof argument") + @Appender(_num_doc) + def sem(self, axis=None, skipna=None, level=None, ddof=1, **kwargs): + if skipna is None: + skipna = True + if axis is None: + axis = self._stat_axis_number + if level is not None: + return self._agg_by_level('sem', axis=axis, level=level, + skipna=skipna, ddof=ddof) + + return self._reduce(nanops.nansem, axis=axis, skipna=skipna, + ddof=ddof) + cls.sem = sem + @Substitution(outname='compounded', desc="Return the compound percentage of the values for " "the requested axis") diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 31bdeaaddde32..2714e9f22cd95 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -486,7 +486,7 @@ def __getattr__(self, attr): (type(self).__name__, attr)) def __getitem__(self, key): - raise NotImplementedError + raise NotImplementedError('Not implemented: %s' % key) def _make_wrapper(self, name): if name not in self._apply_whitelist: @@ -710,6 +710,14 @@ def var(self, ddof=1): f = lambda x: x.var(ddof=ddof) return self._python_agg_general(f) + def sem(self, ddof=1): + """ + Compute standard error of the mean of groups, excluding missing values + + For multiple groupings, the result index will be a MultiIndex + """ + return self.std(ddof=ddof)/np.sqrt(self.count()) + def size(self): """ Compute group sizes diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 46d2358768384..b40334c1857ac 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -308,6 +308,24 @@ def get_median(x): return _wrap_results(get_median(values), dtype) if notempty else np.nan +def _get_counts_nanvar(mask, axis, ddof): + count = _get_counts(mask, axis) + + d = count-ddof + + # always return NaN, never inf + if np.isscalar(count): + if count <= ddof: + count = np.nan + d = np.nan + else: + mask2 = count <= ddof + if mask2.any(): + np.putmask(d, mask2, np.nan) + np.putmask(count, mask2, np.nan) + return count, d + + @disallow('M8') @bottleneck_switch(ddof=1) def nanvar(values, axis=None, skipna=True, ddof=1): @@ -316,31 +334,28 @@ def nanvar(values, axis=None, skipna=True, ddof=1): mask = isnull(values) - if axis is not None: - count = (values.shape[axis] - mask.sum(axis)).astype(float) - else: - count = float(values.size - mask.sum()) + count, d = _get_counts_nanvar(mask, axis, ddof) - d = count-ddof if skipna: values = values.copy() np.putmask(values, mask, 0) - # always return NaN, never inf - if np.isscalar(count): - if count <= ddof: - count = np.nan - d = np.nan - else: - mask = count <= ddof - if mask.any(): - np.putmask(d, mask, np.nan) - np.putmask(count, mask, np.nan) - X = _ensure_numeric(values.sum(axis)) XX = _ensure_numeric((values ** 2).sum(axis)) return np.fabs((XX - X ** 2 / count) / d) + +def nansem(values, axis=None, skipna=True, ddof=1): + var = nanvar(values, axis, skipna, ddof=ddof) + + if not isinstance(values.dtype.type, np.floating): + values = values.astype('f8') + mask = isnull(values) + count, _ = _get_counts_nanvar(mask, axis, ddof) + + return np.sqrt(var)/np.sqrt(count) + + @bottleneck_switch() def nanmin(values, axis=None, skipna=True): values, mask, dtype, dtype_max = _get_values(values, skipna, fill_value_typ='+inf') diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 358d9d82403f6..c4475715386b9 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -10864,6 +10864,23 @@ def test_var_std(self): self.assertFalse((result < 0).any()) nanops._USE_BOTTLENECK = True + def test_sem(self): + alt = lambda x: np.std(x, ddof=1)/np.sqrt(len(x)) + self._check_stat_op('sem', alt) + + result = self.tsframe.sem(ddof=4) + expected = self.tsframe.apply(lambda x: x.std(ddof=4)/np.sqrt(len(x))) + assert_almost_equal(result, expected) + + arr = np.repeat(np.random.random((1, 1000)), 1000, 0) + result = nanops.nansem(arr, axis=0) + self.assertFalse((result < 0).any()) + if nanops._USE_BOTTLENECK: + nanops._USE_BOTTLENECK = False + result = nanops.nansem(arr, axis=0) + self.assertFalse((result < 0).any()) + nanops._USE_BOTTLENECK = True + def test_skew(self): _skip_if_no_scipy() from scipy.stats import skew diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index a13436697a24d..4aae5dfea3982 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -681,11 +681,14 @@ def _check_results(grouped): assert_frame_equal(result, expected) # group frame by function dict - result = grouped.agg( - OrderedDict([['A', 'var'], ['B', 'std'], ['C', 'mean']])) + result = grouped.agg(OrderedDict([['A', 'var'], + ['B', 'std'], + ['C', 'mean'], + ['D', 'sem']])) expected = DataFrame(OrderedDict([['A', grouped['A'].var()], ['B', grouped['B'].std()], - ['C', grouped['C'].mean()]])) + ['C', grouped['C'].mean()], + ['D', grouped['D'].sem()]])) assert_frame_equal(result, expected) by_weekday = self.tsframe.groupby(lambda x: x.weekday()) @@ -1637,6 +1640,7 @@ def _testit(op): _testit(lambda x: x.sum()) _testit(lambda x: x.std()) _testit(lambda x: x.var()) + _testit(lambda x: x.sem()) _testit(lambda x: x.mean()) _testit(lambda x: x.median()) _testit(lambda x: x.prod()) @@ -4170,8 +4174,8 @@ def test_tab_completion(self): 'agg','aggregate','apply','boxplot','filter','first','get_group', 'groups','hist','indices','last','max','mean','median', 'min','name','ngroups','nth','ohlc','plot', 'prod', - 'size', 'std', 'sum', 'transform', 'var', 'count', 'head', 'describe', - 'cummax', 'quantile', 'rank', 'cumprod', 'tail', + 'size', 'std', 'sum', 'transform', 'var', 'sem', 'count', 'head', + 'describe', 'cummax', 'quantile', 'rank', 'cumprod', 'tail', 'resample', 'cummin', 'fillna', 'cumsum', 'cumcount', 'all', 'shift', 'skew', 'bfill', 'irow', 'ffill', 'take', 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith', @@ -4347,6 +4351,12 @@ def test_ops_general(self): ('last', lambda x: x.iloc[-1]), ('count', np.size), ] + try: + from scipy.stats import sem + except ImportError: + pass + else: + ops.append(('sem', sem)) df = DataFrame(np.random.randn(1000)) labels = np.random.randint(0, 50, size=1000).astype(float) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 07b753b6724d8..b8ccfb3eb151b 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1378,7 +1378,7 @@ def test_count(self): self.assertRaises(KeyError, frame.count, level='x') AGG_FUNCTIONS = ['sum', 'prod', 'min', 'max', 'median', 'mean', 'skew', - 'mad', 'std', 'var'] + 'mad', 'std', 'var', 'sem'] def test_series_group_min_max(self): for op, level, skipna in cart_product(self.AGG_FUNCTIONS, diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 2ed24832c3270..34ab401eac283 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -127,6 +127,13 @@ def alt(x): return np.std(x, ddof=1) self._check_stat_op('std', alt) + def test_sem(self): + def alt(x): + if len(x) < 2: + return np.nan + return np.std(x, ddof=1)/np.sqrt(len(x)) + self._check_stat_op('sem', alt) + # def test_skew(self): # from scipy.stats import skew diff --git a/pandas/tests/test_panel4d.py b/pandas/tests/test_panel4d.py index 55b6535be9078..7dc5d9bd411fb 100644 --- a/pandas/tests/test_panel4d.py +++ b/pandas/tests/test_panel4d.py @@ -98,6 +98,13 @@ def alt(x): return np.std(x, ddof=1) self._check_stat_op('std', alt) + def test_sem(self): + def alt(x): + if len(x) < 2: + return np.nan + return np.std(x, ddof=1)/np.sqrt(len(x)) + self._check_stat_op('sem', alt) + # def test_skew(self): # from scipy.stats import skew diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 44587248e6d51..a822b0891edc4 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -1980,6 +1980,19 @@ def test_var_std(self): result = s.std(ddof=1) self.assertTrue(isnull(result)) + def test_sem(self): + alt = lambda x: np.std(x, ddof=1)/np.sqrt(len(x)) + self._check_stat_op('sem', alt) + + result = self.ts.sem(ddof=4) + expected = np.std(self.ts.values, ddof=4)/np.sqrt(len(self.ts.values)) + assert_almost_equal(result, expected) + + # 1 - element series with ddof=1 + s = self.ts.iloc[[0]] + result = s.sem(ddof=1) + self.assert_(isnull(result)) + def test_skew(self): _skip_if_no_scipy() diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py index 2345738002029..cdf62af1fd90b 100644 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -5,7 +5,8 @@ from pandas.compat import range, lrange, zip, product import numpy as np -from pandas import Series, TimeSeries, DataFrame, Panel, isnull, notnull, Timestamp +from pandas import (Series, TimeSeries, DataFrame, Panel, Index, + isnull, notnull, Timestamp) from pandas.tseries.index import date_range from pandas.tseries.offsets import Minute, BDay @@ -104,6 +105,48 @@ def test_resample_basic(self): expect = s.groupby(grouper).agg(lambda x: x[-1]) assert_series_equal(result, expect) + def test_resample_how(self): + rng = date_range('1/1/2000 00:00:00', '1/1/2000 00:13:00', + freq='min', name='index') + s = Series(np.random.randn(14), index=rng) + grouplist = np.ones_like(s) + grouplist[0] = 0 + grouplist[1:6] = 1 + grouplist[6:11] = 2 + grouplist[11:] = 3 + args = ['sum', 'mean', 'std', 'sem', 'max', 'min', + 'median', 'first', 'last', 'ohlc'] + + def _ohlc(group): + if isnull(group).all(): + return np.repeat(np.nan, 4) + return [group[0], group.max(), group.min(), group[-1]] + inds = date_range('1/1/2000', periods=4, freq='5min') + + for arg in args: + if arg == 'ohlc': + func = _ohlc + else: + func = arg + try: + result = s.resample('5min', how=arg, + closed='right', label='right') + + expected = s.groupby(grouplist).agg(func) + self.assertEqual(result.index.name, 'index') + if arg == 'ohlc': + expected = DataFrame(expected.values.tolist()) + expected.columns = ['open', 'high', 'low', 'close'] + expected.index = Index(inds, name='index') + assert_frame_equal(result, expected) + else: + expected.index = inds + assert_series_equal(result, expected) + except BaseException as exc: + + exc.args += ('how=%s' % arg,) + raise + def test_resample_basic_from_daily(self): # from daily dti = DatetimeIndex(