From 0e954f84d5e5b2eae643472031c7890da3d98814 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 20 Dec 2017 11:30:15 -0600 Subject: [PATCH] ENH: Added a min_count keyword to stat funcs The current default is 1, reproducing the behavior of pandas 0.21. The current test suite should pass. Currently, only nansum and nanprod actually do anything with `min_count`. It will not be hard to adjust the other nan* methods to use it if we want; this was just the simplest approach for now. Additional tests for the new behavior have been added. --- pandas/_libs/groupby_helper.pxi.in | 51 ++++++++--- pandas/core/generic.py | 104 +++++++++++++++++++++-- pandas/core/groupby.py | 44 ++++++---- pandas/core/nanops.py | 48 ++++++----- pandas/core/resample.py | 15 +++- pandas/tests/frame/test_analytics.py | 31 +++++++ pandas/tests/groupby/test_aggregate.py | 41 +++++---- pandas/tests/groupby/test_categorical.py | 45 ++++++++++ pandas/tests/series/test_analytics.py | 94 ++++++++++++++++++-- pandas/tests/test_resample.py | 55 ++++++++++++ 10 files changed, 445 insertions(+), 83 deletions(-) diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index d38b677df321c..16b7cbff44e03 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -36,7 +36,8 @@ def get_dispatch(dtypes): def group_add_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, ndarray[int64_t] counts, ndarray[{{c_type}}, ndim=2] values, - ndarray[int64_t] labels): + ndarray[int64_t] labels, + Py_ssize_t min_count=1): """ Only aggregates on axis=0 """ @@ -88,7 +89,7 @@ def group_add_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] == 0: + if nobs[i, j] < min_count: out[i, j] = NAN else: out[i, j] = sumx[i, j] @@ -99,7 +100,8 @@ def group_add_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, def group_prod_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, ndarray[int64_t] counts, ndarray[{{c_type}}, ndim=2] values, - ndarray[int64_t] labels): + ndarray[int64_t] labels, + Py_ssize_t min_count=1): """ Only aggregates on axis=0 """ @@ -147,7 +149,7 @@ def group_prod_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] == 0: + if nobs[i, j] < min_count: out[i, j] = NAN else: out[i, j] = prodx[i, j] @@ -159,12 +161,15 @@ def group_var_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, ndarray[int64_t] counts, ndarray[{{dest_type2}}, ndim=2] values, - ndarray[int64_t] labels): + ndarray[int64_t] labels, + Py_ssize_t min_count=-1): cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) {{dest_type2}} val, ct, oldmean ndarray[{{dest_type2}}, ndim=2] nobs, mean + assert min_count == -1, "'min_count' only used in add and prod" + if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") @@ -208,12 +213,15 @@ def group_mean_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, ndarray[int64_t] counts, ndarray[{{dest_type2}}, ndim=2] values, - ndarray[int64_t] labels): + ndarray[int64_t] labels, + Py_ssize_t min_count=-1): cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) {{dest_type2}} val, count ndarray[{{dest_type2}}, ndim=2] sumx, nobs + assert min_count == -1, "'min_count' only used in add and prod" + if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") @@ -263,7 +271,8 @@ def group_mean_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, def
group_ohlc_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, ndarray[int64_t] counts, ndarray[{{dest_type2}}, ndim=2] values, - ndarray[int64_t] labels): + ndarray[int64_t] labels, + Py_ssize_t min_count=-1): """ Only aggregates on axis=0 """ @@ -272,6 +281,8 @@ def group_ohlc_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, {{dest_type2}} val, count Py_ssize_t ngroups = len(counts) + assert min_count == -1, "'min_count' only used in add and prod" + if len(labels) == 0: return @@ -332,7 +343,8 @@ def get_dispatch(dtypes): def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, ndarray[int64_t] counts, ndarray[{{c_type}}, ndim=2] values, - ndarray[int64_t] labels): + ndarray[int64_t] labels, + Py_ssize_t min_count=-1): """ Only aggregates on axis=0 """ @@ -342,6 +354,8 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, ndarray[{{dest_type2}}, ndim=2] resx ndarray[int64_t, ndim=2] nobs + assert min_count == -1, "'min_count' only used in add and prod" + if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") @@ -382,7 +396,8 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, ndarray[int64_t] counts, ndarray[{{c_type}}, ndim=2] values, - ndarray[int64_t] labels, int64_t rank): + ndarray[int64_t] labels, int64_t rank, + Py_ssize_t min_count=-1): """ Only aggregates on axis=0 """ @@ -392,6 +407,8 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, ndarray[{{dest_type2}}, ndim=2] resx ndarray[int64_t, ndim=2] nobs + assert min_count == -1, "'min_count' only used in add and prod" + if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") @@ -455,7 +472,8 @@ def get_dispatch(dtypes): def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, ndarray[int64_t] counts, ndarray[{{dest_type2}}, ndim=2] values, - ndarray[int64_t] labels): + ndarray[int64_t] labels, + Py_ssize_t min_count=-1): """ Only aggregates on axis=0 """ @@ -464,6 +482,8 @@ def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, {{dest_type2}} val, count ndarray[{{dest_type2}}, ndim=2] maxx, nobs + assert min_count == -1, "'min_count' only used in add and prod" + if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") @@ -526,7 +546,8 @@ def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, ndarray[int64_t] counts, ndarray[{{dest_type2}}, ndim=2] values, - ndarray[int64_t] labels): + ndarray[int64_t] labels, + Py_ssize_t min_count=-1): """ Only aggregates on axis=0 """ @@ -535,6 +556,8 @@ def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, {{dest_type2}} val, count ndarray[{{dest_type2}}, ndim=2] minx, nobs + assert min_count == -1, "'min_count' only used in add and prod" + if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") @@ -686,7 +709,8 @@ def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, def group_median_float64(ndarray[float64_t, ndim=2] out, ndarray[int64_t] counts, ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): + ndarray[int64_t] labels, + Py_ssize_t min_count=-1): """ Only aggregates on axis=0 """ @@ -695,6 +719,9 @@ def group_median_float64(ndarray[float64_t, ndim=2] out, ndarray[int64_t] _counts ndarray data float64_t* ptr + + assert min_count == -1, "'min_count' only used in add and prod" + ngroups = len(counts) N, K = (<object> values).shape diff --git a/pandas/core/generic.py b/pandas/core/generic.py index
f2dbb3ef4d32a..2acf64f1d9f74 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7322,7 +7322,8 @@ def _add_numeric_operations(cls): @Substitution(outname='mad', desc="Return the mean absolute deviation of the values " "for the requested axis", - name1=name, name2=name2, axis_descr=axis_descr) + name1=name, name2=name2, axis_descr=axis_descr, + min_count='', examples='') @Appender(_num_doc) def mad(self, axis=None, skipna=None, level=None): if skipna is None: @@ -7363,7 +7364,8 @@ def mad(self, axis=None, skipna=None, level=None): @Substitution(outname='compounded', desc="Return the compound percentage of the values for " "the requested axis", name1=name, name2=name2, - axis_descr=axis_descr) + axis_descr=axis_descr, + min_count='', examples='') @Appender(_num_doc) def compound(self, axis=None, skipna=None, level=None): if skipna is None: @@ -7387,10 +7389,10 @@ def compound(self, axis=None, skipna=None, level=None): lambda y, axis: np.maximum.accumulate(y, axis), "max", -np.inf, np.nan) - cls.sum = _make_stat_function( + cls.sum = _make_min_count_stat_function( cls, 'sum', name, name2, axis_descr, 'Return the sum of the values for the requested axis', - nanops.nansum) + nanops.nansum, _sum_examples) cls.mean = _make_stat_function( cls, 'mean', name, name2, axis_descr, 'Return the mean of the values for the requested axis', @@ -7406,10 +7408,10 @@ def compound(self, axis=None, skipna=None, level=None): "by N-1\n", nanops.nankurt) cls.kurtosis = cls.kurt - cls.prod = _make_stat_function( + cls.prod = _make_min_count_stat_function( cls, 'prod', name, name2, axis_descr, 'Return the product of the values for the requested axis', - nanops.nanprod) + nanops.nanprod, _prod_examples) cls.product = cls.prod cls.median = _make_stat_function( cls, 'median', name, name2, axis_descr, @@ -7540,10 +7542,13 @@ def _doc_parms(cls): numeric_only : boolean, default None Include only float, int, boolean columns. If None, will attempt to use everything, then use only numeric data. Not implemented for Series. +%(min_count)s\ Returns ------- -%(outname)s : %(name1)s or %(name2)s (if level specified)\n""" +%(outname)s : %(name1)s or %(name2)s (if level specified) + +%(examples)s""" _num_ddof_doc = """ @@ -7611,9 +7616,92 @@ def _doc_parms(cls): """ +_sum_examples = """\ +Examples +-------- +By default, the sum of an empty series is ``NaN``. + +>>> pd.Series([]).sum() # min_count=1 is the default +nan + +This can be controlled with the ``min_count`` parameter. For example, if +you'd like the sum of an empty series to be 0, pass ``min_count=0``. + +>>> pd.Series([]).sum(min_count=0) +0.0 + +Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and +empty series identically. + +>>> pd.Series([np.nan]).sum() +nan + +>>> pd.Series([np.nan]).sum(min_count=0) +0.0 +""" + +_prod_examples = """\ +Examples +-------- +By default, the product of an empty series is ``NaN``. + +>>> pd.Series([]).prod() +nan + +This can be controlled with the ``min_count`` parameter. + +>>> pd.Series([]).prod(min_count=0) +1.0 + +Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and +empty series identically. + +>>> pd.Series([np.nan]).prod() +nan + +>>> pd.Series([np.nan]).prod(min_count=0) +1.0 +""" + + +_min_count_stub = """\ +min_count : int, default 1 + The required number of valid values to perform the operation. If fewer than + ``min_count`` non-NA values are present, the result will be NA. + + .. versionadded:: 0.21.2 + + Added with the default being 1.
This means the sum or product + of an all-NA or empty series is ``NaN``. +""" + + +def _make_min_count_stat_function(cls, name, name1, name2, axis_descr, desc, + f, examples): + @Substitution(outname=name, desc=desc, name1=name1, name2=name2, + axis_descr=axis_descr, min_count=_min_count_stub, + examples=examples) + @Appender(_num_doc) + def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None, + min_count=1, + **kwargs): + nv.validate_stat_func(tuple(), kwargs, fname=name) + if skipna is None: + skipna = True + if axis is None: + axis = self._stat_axis_number + if level is not None: + return self._agg_by_level(name, axis=axis, level=level, + skipna=skipna, min_count=min_count) + return self._reduce(f, name, axis=axis, skipna=skipna, + numeric_only=numeric_only, min_count=min_count) + + return set_function_name(stat_func, name, cls) + + def _make_stat_function(cls, name, name1, name2, axis_descr, desc, f): @Substitution(outname=name, desc=desc, name1=name1, name2=name2, - axis_descr=axis_descr) + axis_descr=axis_descr, min_count='', examples='') @Appender(_num_doc) def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 47b80c00da4d4..041239ed06d88 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -986,7 +986,8 @@ def _cython_transform(self, how, numeric_only=True): return self._wrap_transformed_output(output, names) - def _cython_agg_general(self, how, alt=None, numeric_only=True): + def _cython_agg_general(self, how, alt=None, numeric_only=True, + min_count=-1): output = {} for name, obj in self._iterate_slices(): is_numeric = is_numeric_dtype(obj.dtype) @@ -994,7 +995,8 @@ def _cython_agg_general(self, how, alt=None, numeric_only=True): continue try: - result, names = self.grouper.aggregate(obj.values, how) + result, names = self.grouper.aggregate(obj.values, how, + min_count=min_count) except AssertionError as e: raise GroupByError(str(e)) output[name] = self._try_cast(result, obj) @@ -1301,7 +1303,8 @@ def _add_numeric_operations(cls): """ add numeric operations to the GroupBy generically """ def groupby_function(name, alias, npfunc, - numeric_only=True, _convert=False): + numeric_only=True, _convert=False, + min_count=-1): _local_template = "Compute %(f)s of group values" @@ -1311,6 +1314,8 @@ def groupby_function(name, alias, npfunc, def f(self, **kwargs): if 'numeric_only' not in kwargs: kwargs['numeric_only'] = numeric_only + if 'min_count' not in kwargs: + kwargs['min_count'] = min_count self._set_group_selection() try: return self._cython_agg_general( @@ -1358,8 +1363,8 @@ def last(x): else: return last(x) - cls.sum = groupby_function('sum', 'add', np.sum) - cls.prod = groupby_function('prod', 'prod', np.prod) + cls.sum = groupby_function('sum', 'add', np.sum, min_count=1) + cls.prod = groupby_function('prod', 'prod', np.prod, min_count=1) cls.min = groupby_function('min', 'min', np.min, numeric_only=False) cls.max = groupby_function('max', 'max', np.max, numeric_only=False) cls.first = groupby_function('first', 'first', first_compat, @@ -2139,7 +2144,7 @@ def get_group_levels(self): 'var': 'group_var', 'first': { 'name': 'group_nth', - 'f': lambda func, a, b, c, d: func(a, b, c, d, 1) + 'f': lambda func, a, b, c, d, e: func(a, b, c, d, 1, -1) }, 'last': 'group_last', 'ohlc': 'group_ohlc', @@ -2209,7 +2214,7 @@ def wrapper(*args, **kwargs): (how, dtype_str)) return func, dtype_str - def _cython_operation(self, kind, values, how, axis): + def 
_cython_operation(self, kind, values, how, axis, min_count=-1): assert kind in ['transform', 'aggregate'] # can we do this operation with our cython functions @@ -2294,11 +2299,12 @@ def _cython_operation(self, kind, values, how, axis): counts = np.zeros(self.ngroups, dtype=np.int64) result = self._aggregate( result, counts, values, labels, func, is_numeric, - is_datetimelike) + is_datetimelike, min_count) elif kind == 'transform': result = _maybe_fill(np.empty_like(values, dtype=out_dtype), fill_value=np.nan) + # TODO: min_count result = self._transform( result, values, labels, func, is_numeric, is_datetimelike) @@ -2335,14 +2341,15 @@ def _cython_operation(self, kind, values, how, axis): return result, names - def aggregate(self, values, how, axis=0): - return self._cython_operation('aggregate', values, how, axis) + def aggregate(self, values, how, axis=0, min_count=-1): + return self._cython_operation('aggregate', values, how, axis, + min_count=min_count) def transform(self, values, how, axis=0): return self._cython_operation('transform', values, how, axis) def _aggregate(self, result, counts, values, comp_ids, agg_func, - is_numeric, is_datetimelike): + is_numeric, is_datetimelike, min_count=-1): if values.ndim > 3: # punting for now raise NotImplementedError("number of dimensions is currently " @@ -2351,9 +2358,10 @@ def _aggregate(self, result, counts, values, comp_ids, agg_func, for i, chunk in enumerate(values.transpose(2, 0, 1)): chunk = chunk.squeeze() - agg_func(result[:, :, i], counts, chunk, comp_ids) + agg_func(result[:, :, i], counts, chunk, comp_ids, + min_count) else: - agg_func(result, counts, values, comp_ids) + agg_func(result, counts, values, comp_ids, min_count) return result @@ -3643,9 +3651,10 @@ def _iterate_slices(self): continue yield val, slicer(val) - def _cython_agg_general(self, how, alt=None, numeric_only=True): + def _cython_agg_general(self, how, alt=None, numeric_only=True, + min_count=-1): new_items, new_blocks = self._cython_agg_blocks( - how, alt=alt, numeric_only=numeric_only) + how, alt=alt, numeric_only=numeric_only, min_count=min_count) return self._wrap_agged_blocks(new_items, new_blocks) def _wrap_agged_blocks(self, items, blocks): @@ -3671,7 +3680,8 @@ def _wrap_agged_blocks(self, items, blocks): _block_agg_axis = 0 - def _cython_agg_blocks(self, how, alt=None, numeric_only=True): + def _cython_agg_blocks(self, how, alt=None, numeric_only=True, + min_count=-1): # TODO: the actual managing of mgr_locs is a PITA # here, it should happen via BlockManager.combine @@ -3688,7 +3698,7 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True): locs = block.mgr_locs.as_array try: result, _ = self.grouper.aggregate( - block.values, how, axis=agg_axis) + block.values, how, axis=agg_axis, min_count=min_count) except NotImplementedError: # generally if we have numeric_only=False # and non-applicable functions diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index e1c09947ac0b4..88f69f6ff2e14 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -107,21 +107,9 @@ def f(values, axis=None, skipna=True, **kwds): if k not in kwds: kwds[k] = v try: - if values.size == 0: - - # we either return np.nan or pd.NaT - if is_numeric_dtype(values): - values = values.astype('float64') - fill_value = na_value_for_dtype(values.dtype) - - if values.ndim == 1: - return fill_value - else: - result_shape = (values.shape[:axis] + - values.shape[axis + 1:]) - result = np.empty(result_shape, dtype=values.dtype) - result.fill(fill_value) - return result 
+ if values.size == 0 and kwds.get('min_count') is None: + # We are empty, returning NA for our type + return _na_for_min_count(values, axis) if (_USE_BOTTLENECK and skipna and _bn_ok_dtype(values.dtype, bn_name)): @@ -292,6 +280,22 @@ def _wrap_results(result, dtype): return result +def _na_for_min_count(values, axis): + # we either return np.nan or pd.NaT + if is_numeric_dtype(values): + values = values.astype('float64') + fill_value = na_value_for_dtype(values.dtype) + + if values.ndim == 1: + return fill_value + else: + result_shape = (values.shape[:axis] + + values.shape[axis + 1:]) + result = np.empty(result_shape, dtype=values.dtype) + result.fill(fill_value) + return result + + def nanany(values, axis=None, skipna=True): values, mask, dtype, _ = _get_values(values, skipna, False, copy=skipna) return values.any(axis) @@ -304,7 +308,7 @@ def nanall(values, axis=None, skipna=True): @disallow('M8') @bottleneck_switch() -def nansum(values, axis=None, skipna=True): +def nansum(values, axis=None, skipna=True, min_count=1): values, mask, dtype, dtype_max = _get_values(values, skipna, 0) dtype_sum = dtype_max if is_float_dtype(dtype): @@ -312,7 +316,7 @@ def nansum(values, axis=None, skipna=True): elif is_timedelta64_dtype(dtype): dtype_sum = np.float64 the_sum = values.sum(axis, dtype=dtype_sum) - the_sum = _maybe_null_out(the_sum, axis, mask) + the_sum = _maybe_null_out(the_sum, axis, mask, min_count=min_count) return _wrap_results(the_sum, dtype) @@ -641,13 +645,13 @@ def nankurt(values, axis=None, skipna=True): @disallow('M8', 'm8') -def nanprod(values, axis=None, skipna=True): +def nanprod(values, axis=None, skipna=True, min_count=1): mask = isna(values) if skipna and not is_any_int_dtype(values): values = values.copy() values[mask] = 1 result = values.prod(axis) - return _maybe_null_out(result, axis, mask) + return _maybe_null_out(result, axis, mask, min_count=min_count) def _maybe_arg_null_out(result, axis, mask, skipna): @@ -683,9 +687,9 @@ def _get_counts(mask, axis, dtype=float): return np.array(count, dtype=dtype) -def _maybe_null_out(result, axis, mask): +def _maybe_null_out(result, axis, mask, min_count=1): if axis is not None and getattr(result, 'ndim', False): - null_mask = (mask.shape[axis] - mask.sum(axis)) == 0 + null_mask = (mask.shape[axis] - mask.sum(axis) - min_count) < 0 if np.any(null_mask): if is_numeric_dtype(result): if np.iscomplexobj(result): @@ -698,7 +702,7 @@ def _maybe_null_out(result, axis, mask): result[null_mask] = None elif result is not tslib.NaT: null_mask = mask.size - mask.sum() - if null_mask == 0: + if null_mask < min_count: result = np.nan return result diff --git a/pandas/core/resample.py b/pandas/core/resample.py index c2bf7cff746eb..a30c727ecb87c 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -625,9 +625,20 @@ def size(self): Resampler._deprecated_valids += dir(Resampler) + +# downsample methods +for method in ['sum', 'prod']: + + def f(self, _method=method, min_count=1, *args, **kwargs): + nv.validate_resampler_func(_method, args, kwargs) + return self._downsample(_method, min_count=min_count) + f.__doc__ = getattr(GroupBy, method).__doc__ + setattr(Resampler, method, f) + + # downsample methods -for method in ['min', 'max', 'first', 'last', 'sum', 'mean', 'sem', - 'median', 'prod', 'ohlc']: +for method in ['min', 'max', 'first', 'last', 'mean', 'sem', + 'median', 'ohlc']: def f(self, _method=method, *args, **kwargs): nv.validate_resampler_func(_method, args, kwargs) diff --git a/pandas/tests/frame/test_analytics.py 
b/pandas/tests/frame/test_analytics.py index 17d711f937bf7..80e9acd0d2281 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -973,6 +973,37 @@ def test_sum_corner(self): assert len(axis0) == 0 assert len(axis1) == 0 + @pytest.mark.parametrize('method, unit', [ + ('sum', 0), + ('prod', 1), + ]) + def test_sum_prod_nanops(self, method, unit): + idx = ['a', 'b', 'c'] + df = pd.DataFrame({"a": [unit, unit], + "b": [unit, np.nan], + "c": [np.nan, np.nan]}) + + result = getattr(df, method)(min_count=1) + expected = pd.Series([unit, unit, np.nan], index=idx) + tm.assert_series_equal(result, expected) + + result = getattr(df, method)(min_count=0) + expected = pd.Series([unit, unit, unit], index=idx, dtype='float64') + tm.assert_series_equal(result, expected) + + result = getattr(df.iloc[1:], method)(min_count=1) + expected = pd.Series([unit, np.nan, np.nan], index=idx) + tm.assert_series_equal(result, expected) + + df = pd.DataFrame({"A": [unit] * 10, "B": [unit] * 5 + [np.nan] * 5}) + result = getattr(df, method)(min_count=5) + expected = pd.Series([unit, unit], index=['A', 'B'], dtype='float64') + tm.assert_series_equal(result, expected) + + result = getattr(df, method)(min_count=6) + expected = pd.Series([unit, np.nan], index=['A', 'B']) + tm.assert_series_equal(result, expected) + def test_sum_object(self): values = self.frame.values.astype(int) frame = DataFrame(values, index=self.frame.index, diff --git a/pandas/tests/groupby/test_aggregate.py b/pandas/tests/groupby/test_aggregate.py index 3d27df31cee6e..07ecc085098bf 100644 --- a/pandas/tests/groupby/test_aggregate.py +++ b/pandas/tests/groupby/test_aggregate.py @@ -809,26 +809,33 @@ def test__cython_agg_general(self): exc.args += ('operation: %s' % op, ) raise - def test_cython_agg_empty_buckets(self): - ops = [('mean', np.mean), - ('median', lambda x: np.median(x) if len(x) > 0 else np.nan), - ('var', lambda x: np.var(x, ddof=1)), - ('add', lambda x: np.sum(x) if len(x) > 0 else np.nan), - ('prod', np.prod), - ('min', np.min), - ('max', np.max), ] - + @pytest.mark.parametrize('op, targop', [ + ('mean', np.mean), + ('median', lambda x: np.median(x) if len(x) > 0 else np.nan), + ('var', lambda x: np.var(x, ddof=1)), + ('add', lambda x: np.sum(x) if len(x) > 0 else np.nan), + ('prod', np.prod), + ('min', np.min), + ('max', np.max), ] + ) + def test_cython_agg_empty_buckets(self, op, targop): df = pd.DataFrame([11, 12, 13]) grps = range(0, 55, 5) - for op, targop in ops: - result = df.groupby(pd.cut(df[0], grps))._cython_agg_general(op) - expected = df.groupby(pd.cut(df[0], grps)).agg(lambda x: targop(x)) - try: - tm.assert_frame_equal(result, expected) - except BaseException as exc: - exc.args += ('operation: %s' % op,) - raise + # calling _cython_agg_general directly, instead of via the user API, + # which sets different values for min_count, so set min_count explicitly here.
+ if op in ('add', 'prod'): + min_count = 1 + else: + min_count = -1 + result = df.groupby(pd.cut(df[0], grps))._cython_agg_general( + op, min_count=min_count) + expected = df.groupby(pd.cut(df[0], grps)).agg(lambda x: targop(x)) + try: + tm.assert_frame_equal(result, expected) + except BaseException as exc: + exc.args += ('operation: %s' % op,) + raise def test_agg_over_numpy_arrays(self): # GH 3788 diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index c73423921898d..5e3d2bb9cf091 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -662,3 +662,48 @@ def test_groupby_categorical_two_columns(self): "C3": [nan, nan, nan, nan, 10, 100, nan, nan, nan, nan, 200, 34]}, index=idx) tm.assert_frame_equal(res, exp) + + def test_empty_sum(self): + # https://github.com/pandas-dev/pandas/issues/18678 + df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'], + categories=['a', 'b', 'c']), + 'B': [1, 2, 1]}) + expected_idx = pd.CategoricalIndex(['a', 'b', 'c'], name='A') + + # NA by default + result = df.groupby("A").B.sum() + expected = pd.Series([3, 1, np.nan], expected_idx, name='B') + tm.assert_series_equal(result, expected) + + # min_count=0 + result = df.groupby("A").B.sum(min_count=0) + expected = pd.Series([3, 1, 0], expected_idx, name='B') + tm.assert_series_equal(result, expected) + + # min_count=1 + result = df.groupby("A").B.sum(min_count=1) + expected = pd.Series([3, 1, np.nan], expected_idx, name='B') + tm.assert_series_equal(result, expected) + + def test_empty_prod(self): + # https://github.com/pandas-dev/pandas/issues/18678 + df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'], + categories=['a', 'b', 'c']), + 'B': [1, 2, 1]}) + + expected_idx = pd.CategoricalIndex(['a', 'b', 'c'], name='A') + + # NA by default + result = df.groupby("A").B.prod() + expected = pd.Series([2, 1, np.nan], expected_idx, name='B') + tm.assert_series_equal(result, expected) + + # min_count=0 + result = df.groupby("A").B.prod(min_count=0) + expected = pd.Series([2, 1, 1], expected_idx, name='B') + tm.assert_series_equal(result, expected) + + # min_count=1 + result = df.groupby("A").B.prod(min_count=1) + expected = pd.Series([2, 1, np.nan], expected_idx, name='B') + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 0dae6aa96ced1..cd92edc927173 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -30,38 +30,122 @@ class TestSeriesAnalytics(TestData): @pytest.mark.parametrize("use_bottleneck", [True, False]) - @pytest.mark.parametrize("method", ["sum", "prod"]) - def test_empty(self, method, use_bottleneck): - + @pytest.mark.parametrize("method, unit", [ + ("sum", 0.0), + ("prod", 1.0) + ]) + def test_empty(self, method, unit, use_bottleneck): with pd.option_context("use_bottleneck", use_bottleneck): # GH 9422 - # treat all missing as NaN + # Entirely empty s = Series([]) + # NA by default result = getattr(s, method)() assert isna(result) + # Explicit + result = getattr(s, method)(min_count=0) + assert result == unit + + result = getattr(s, method)(min_count=1) + assert isna(result) + + # Skipna, default result = getattr(s, method)(skipna=True) assert isna(result) + # Skipna, explicit + result = getattr(s, method)(skipna=True, min_count=0) + assert result == unit + + result = getattr(s, method)(skipna=True, min_count=1) + assert isna(result) + + # All-NA + s = Series([np.nan]) + # NA
by default result = getattr(s, method)() assert isna(result) + # Explicit + result = getattr(s, method)(min_count=0) + assert result == unit + + result = getattr(s, method)(min_count=1) + assert isna(result) + + # Skipna, default result = getattr(s, method)(skipna=True) assert isna(result) + # Skipna, explicit + result = getattr(s, method)(skipna=True, min_count=0) + assert result == unit + + result = getattr(s, method)(skipna=True, min_count=1) + assert isna(result) + + # Mix of valid and NA s = Series([np.nan, 1]) + # Default result = getattr(s, method)() assert result == 1.0 - s = Series([np.nan, 1]) + # Explicit + result = getattr(s, method)(min_count=0) + assert result == 1.0 + + result = getattr(s, method)(min_count=1) + assert result == 1.0 + + # Skipna result = getattr(s, method)(skipna=True) assert result == 1.0 + result = getattr(s, method)(skipna=True, min_count=0) + assert result == 1.0 + + result = getattr(s, method)(skipna=True, min_count=1) + assert result == 1.0 + # GH #844 (changed in 9422) df = DataFrame(np.empty((10, 0))) assert (df.sum(1).isnull()).all() + s = pd.Series([1]) + result = getattr(s, method)(min_count=2) + assert isna(result) + + s = pd.Series([np.nan]) + result = getattr(s, method)(min_count=2) + assert isna(result) + + s = pd.Series([np.nan, 1]) + result = getattr(s, method)(min_count=2) + assert isna(result) + + @pytest.mark.parametrize('method, unit', [ + ('sum', 0.0), + ('prod', 1.0), + ]) + def test_empty_multi(self, method, unit): + s = pd.Series([1, np.nan, np.nan, np.nan], + index=pd.MultiIndex.from_product([('a', 'b'), (0, 1)])) + # NaN by default + result = getattr(s, method)(level=0) + expected = pd.Series([1, np.nan], index=['a', 'b']) + tm.assert_series_equal(result, expected) + + # min_count=0 + result = getattr(s, method)(level=0, min_count=0) + expected = pd.Series([1, unit], index=['a', 'b']) + tm.assert_series_equal(result, expected) + + # min_count=1 + result = getattr(s, method)(level=0, min_count=1) + expected = pd.Series([1, np.nan], index=['a', 'b']) + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( "method", ['sum', 'mean', 'median', 'std', 'var']) def test_ops_consistency_on_empty(self, method): diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index 38f4b8be469a5..4a3c4eff9f8c3 100644 --- a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -4,6 +4,7 @@ from datetime import datetime, timedelta from functools import partial from textwrap import dedent +from operator import methodcaller import pytz import pytest @@ -3382,6 +3383,34 @@ def test_aggregate_normal(self): assert_frame_equal(expected, dt_result) """ + @pytest.mark.parametrize('method, unit', [ + ('sum', 0), + ('prod', 1), + ]) + def test_resample_entirely_nat_window(self, method, unit): + s = pd.Series([0] * 2 + [np.nan] * 2, + index=pd.date_range('2017', periods=4)) + # NaN by default + result = methodcaller(method)(s.resample("2d")) + expected = pd.Series([0.0, np.nan], + index=pd.to_datetime(['2017-01-01', + '2017-01-03'])) + tm.assert_series_equal(result, expected) + + # min_count=0 + result = methodcaller(method, min_count=0)(s.resample("2d")) + expected = pd.Series([0.0, unit], + index=pd.to_datetime(['2017-01-01', + '2017-01-03'])) + tm.assert_series_equal(result, expected) + + # min_count=1 + result = methodcaller(method, min_count=1)(s.resample("2d")) + expected = pd.Series([0.0, np.nan], + index=pd.to_datetime(['2017-01-01', + '2017-01-03'])) + tm.assert_series_equal(result, expected) + def
test_aggregate_with_nat(self): # check TimeGrouper's aggregation is identical as normal groupby @@ -3441,3 +3470,29 @@ def test_repr(self): "closed='left', label='left', how='mean', " "convention='e', base=0)") assert result == expected + + @pytest.mark.parametrize('method, unit', [ + ('sum', 0), + ('prod', 1), + ]) + def test_upsample_sum(self, method, unit): + s = pd.Series(1, index=pd.date_range("2017", periods=2, freq="H")) + resampled = s.resample("30T") + index = pd.to_datetime(['2017-01-01T00:00:00', + '2017-01-01T00:30:00', + '2017-01-01T01:00:00']) + + # NaN by default + result = methodcaller(method)(resampled) + expected = pd.Series([1, np.nan, 1], index=index) + tm.assert_series_equal(result, expected) + + # min_count=0 + result = methodcaller(method, min_count=0)(resampled) + expected = pd.Series([1, unit, 1], index=index) + tm.assert_series_equal(result, expected) + + # min_count=1 + result = methodcaller(method, min_count=1)(resampled) + expected = pd.Series([1, np.nan, 1], index=index) + tm.assert_series_equal(result, expected)
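For reference, the user-facing Series behavior added above boils down to the following doctest-style sketch, written in the same style as the _sum_examples/_prod_examples docstrings in the diff (outputs mirror the new tests in pandas/tests/series/test_analytics.py and are illustrative):

>>> import numpy as np
>>> import pandas as pd

>>> # min_count=1 is the new default, so empty and all-NA reductions are NA
>>> pd.Series([]).sum()
nan
>>> pd.Series([np.nan]).prod()
nan

>>> # min_count=0 restores the identity element: 0 for sum, 1 for prod
>>> pd.Series([]).sum(min_count=0)
0.0
>>> pd.Series([np.nan]).prod(min_count=0)
1.0

>>> # a min_count larger than the number of valid values is also NA
>>> pd.Series([np.nan, 1.0]).sum(min_count=2)
nan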
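The same keyword threads through the groupby and resample paths patched above. A minimal sketch reusing the data from test_empty_sum and test_resample_entirely_nat_window; the values come from the expected results in those tests, while the exact reprs and result dtypes shown are illustrative:

>>> df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'],
...                                        categories=['a', 'b', 'c']),
...                    'B': [1, 2, 1]})
>>> df.groupby("A").B.sum()  # the empty group 'c' is NA under the default
a    3.0
b    1.0
c    NaN
Name: B, dtype: float64
>>> df.groupby("A").B.sum(min_count=0)
a    3
b    1
c    0
Name: B, dtype: int64

>>> s = pd.Series([0.0, 0.0, np.nan, np.nan],
...               index=pd.date_range('2017', periods=4))
>>> s.resample('2d').sum()  # the all-NaN bin is NA under the default
2017-01-01    0.0
2017-01-03    NaN
Freq: 2D, dtype: float64
>>> s.resample('2d').sum(min_count=0)
2017-01-01    0.0
2017-01-03    0.0
Freq: 2D, dtype: float64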