Skip to content

ENH: Added a min_count keyword to stat funcs #18876

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Dec 28, 2017
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 39 additions & 12 deletions pandas/_libs/groupby_helper.pxi.in
Original file line number Diff line number Diff line change
@@ -36,7 +36,8 @@ def get_dispatch(dtypes):
def group_add_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
ndarray[int64_t] counts,
ndarray[{{c_type}}, ndim=2] values,
ndarray[int64_t] labels):
ndarray[int64_t] labels,
Py_ssize_t min_count=1):
"""
Only aggregates on axis=0
"""
@@ -88,7 +89,7 @@ def group_add_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,

for i in range(ncounts):
for j in range(K):
if nobs[i, j] == 0:
if nobs[i, j] < min_count:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

does this work for min_count==0?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, sumx starts out as zeros, so we just have to avoid setting it to NaN. Same for prod, but with ones.

out[i, j] = NAN
else:
out[i, j] = sumx[i, j]
@@ -99,7 +100,8 @@ def group_add_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
def group_prod_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
ndarray[int64_t] counts,
ndarray[{{c_type}}, ndim=2] values,
ndarray[int64_t] labels):
ndarray[int64_t] labels,
Py_ssize_t min_count=1):
"""
Only aggregates on axis=0
"""
@@ -147,7 +149,7 @@ def group_prod_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,

for i in range(ncounts):
for j in range(K):
if nobs[i, j] == 0:
if nobs[i, j] < min_count:
out[i, j] = NAN
else:
out[i, j] = prodx[i, j]
@@ -159,12 +161,15 @@ def group_prod_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
def group_var_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
ndarray[int64_t] counts,
ndarray[{{dest_type2}}, ndim=2] values,
ndarray[int64_t] labels):
ndarray[int64_t] labels,
Py_ssize_t min_count=-1):
cdef:
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
{{dest_type2}} val, ct, oldmean
ndarray[{{dest_type2}}, ndim=2] nobs, mean

assert min_count == -1, "'min_count' only used in add and prod"

if not len(values) == len(labels):
raise AssertionError("len(index) != len(labels)")

@@ -208,12 +213,15 @@ def group_var_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
def group_mean_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
ndarray[int64_t] counts,
ndarray[{{dest_type2}}, ndim=2] values,
ndarray[int64_t] labels):
ndarray[int64_t] labels,
Py_ssize_t min_count=-1):
cdef:
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
{{dest_type2}} val, count
ndarray[{{dest_type2}}, ndim=2] sumx, nobs

assert min_count == -1, "'min_count' only used in add and prod"

if not len(values) == len(labels):
raise AssertionError("len(index) != len(labels)")

@@ -263,7 +271,8 @@ def group_mean_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
def group_ohlc_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
ndarray[int64_t] counts,
ndarray[{{dest_type2}}, ndim=2] values,
ndarray[int64_t] labels):
ndarray[int64_t] labels,
Py_ssize_t min_count=-1):
"""
Only aggregates on axis=0
"""
@@ -272,6 +281,8 @@ def group_ohlc_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
{{dest_type2}} val, count
Py_ssize_t ngroups = len(counts)

assert min_count == -1, "'min_count' only used in add and prod"

if len(labels) == 0:
return

@@ -332,7 +343,8 @@ def get_dispatch(dtypes):
def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
ndarray[int64_t] counts,
ndarray[{{c_type}}, ndim=2] values,
ndarray[int64_t] labels):
ndarray[int64_t] labels,
Py_ssize_t min_count=-1):
"""
Only aggregates on axis=0
"""
@@ -342,6 +354,8 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
ndarray[{{dest_type2}}, ndim=2] resx
ndarray[int64_t, ndim=2] nobs

assert min_count == -1, "'min_count' only used in add and prod"

if not len(values) == len(labels):
raise AssertionError("len(index) != len(labels)")

@@ -382,7 +396,8 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
ndarray[int64_t] counts,
ndarray[{{c_type}}, ndim=2] values,
ndarray[int64_t] labels, int64_t rank):
ndarray[int64_t] labels, int64_t rank,
Py_ssize_t min_count=-1):
"""
Only aggregates on axis=0
"""
@@ -392,6 +407,8 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
ndarray[{{dest_type2}}, ndim=2] resx
ndarray[int64_t, ndim=2] nobs

assert min_count == -1, "'min_count' only used in add and prod"

if not len(values) == len(labels):
raise AssertionError("len(index) != len(labels)")

@@ -455,7 +472,8 @@ def get_dispatch(dtypes):
def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
ndarray[int64_t] counts,
ndarray[{{dest_type2}}, ndim=2] values,
ndarray[int64_t] labels):
ndarray[int64_t] labels,
Py_ssize_t min_count=-1):
"""
Only aggregates on axis=0
"""
@@ -464,6 +482,8 @@ def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
{{dest_type2}} val, count
ndarray[{{dest_type2}}, ndim=2] maxx, nobs

assert min_count == -1, "'min_count' only used in add and prod"

if not len(values) == len(labels):
raise AssertionError("len(index) != len(labels)")

@@ -526,7 +546,8 @@ def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
ndarray[int64_t] counts,
ndarray[{{dest_type2}}, ndim=2] values,
ndarray[int64_t] labels):
ndarray[int64_t] labels,
Py_ssize_t min_count=-1):
"""
Only aggregates on axis=0
"""
@@ -535,6 +556,8 @@ def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
{{dest_type2}} val, count
ndarray[{{dest_type2}}, ndim=2] minx, nobs

assert min_count == -1, "'min_count' only used in add and prod"

if not len(values) == len(labels):
raise AssertionError("len(index) != len(labels)")

@@ -686,7 +709,8 @@ def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
def group_median_float64(ndarray[float64_t, ndim=2] out,
ndarray[int64_t] counts,
ndarray[float64_t, ndim=2] values,
ndarray[int64_t] labels):
ndarray[int64_t] labels,
Py_ssize_t min_count=-1):
"""
Only aggregates on axis=0
"""
@@ -695,6 +719,9 @@ def group_median_float64(ndarray[float64_t, ndim=2] out,
ndarray[int64_t] _counts
ndarray data
float64_t* ptr

assert min_count == -1, "'min_count' only used in add and prod"

ngroups = len(counts)
N, K = (<object> values).shape

104 changes: 96 additions & 8 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
@@ -7322,7 +7322,8 @@ def _add_numeric_operations(cls):
@Substitution(outname='mad',
desc="Return the mean absolute deviation of the values "
"for the requested axis",
name1=name, name2=name2, axis_descr=axis_descr)
name1=name, name2=name2, axis_descr=axis_descr,
min_count='', examples='')
@Appender(_num_doc)
def mad(self, axis=None, skipna=None, level=None):
if skipna is None:
@@ -7363,7 +7364,8 @@ def mad(self, axis=None, skipna=None, level=None):
@Substitution(outname='compounded',
desc="Return the compound percentage of the values for "
"the requested axis", name1=name, name2=name2,
axis_descr=axis_descr)
axis_descr=axis_descr,
min_count='', examples='')
@Appender(_num_doc)
def compound(self, axis=None, skipna=None, level=None):
if skipna is None:
@@ -7387,10 +7389,10 @@ def compound(self, axis=None, skipna=None, level=None):
lambda y, axis: np.maximum.accumulate(y, axis), "max",
-np.inf, np.nan)

cls.sum = _make_stat_function(
cls.sum = _make_min_count_stat_function(
cls, 'sum', name, name2, axis_descr,
'Return the sum of the values for the requested axis',
nanops.nansum)
nanops.nansum, _sum_examples)
cls.mean = _make_stat_function(
cls, 'mean', name, name2, axis_descr,
'Return the mean of the values for the requested axis',
@@ -7406,10 +7408,10 @@ def compound(self, axis=None, skipna=None, level=None):
"by N-1\n",
nanops.nankurt)
cls.kurtosis = cls.kurt
cls.prod = _make_stat_function(
cls.prod = _make_min_count_stat_function(
cls, 'prod', name, name2, axis_descr,
'Return the product of the values for the requested axis',
nanops.nanprod)
nanops.nanprod, _prod_examples)
cls.product = cls.prod
cls.median = _make_stat_function(
cls, 'median', name, name2, axis_descr,
@@ -7540,10 +7542,13 @@ def _doc_parms(cls):
numeric_only : boolean, default None
Include only float, int, boolean columns. If None, will attempt to use
everything, then use only numeric data. Not implemented for Series.
%(min_count)s\

Returns
-------
%(outname)s : %(name1)s or %(name2)s (if level specified)\n"""
%(outname)s : %(name1)s or %(name2)s (if level specified)

%(examples)s"""

_num_ddof_doc = """

@@ -7611,9 +7616,92 @@ def _doc_parms(cls):
"""


_sum_examples = """\
Examples
--------
By default, the sum of an empty series is ``NaN``.

>>> pd.Series([]).sum() # min_count=1 is the default
nan

This can be controlled with the ``min_count`` parameter. For example, if
you'd like the sum of an empty series to be 0, pass ``min_count=0``.

>>> pd.Series([]).sum(min_count=0)
0.0

Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
empty series identically.

>>> pd.Series([np.nan]).sum()
nan

>>> pd.Series([np.nan]).sum(min_count=0)
0.0
"""

_prod_examples = """\
Examples
--------
By default, the product of an empty series is ``NaN``.

>>> pd.Series([]).prod()
nan

This can be controlled with the ``min_count`` parameter. For example, if
you'd like the product of an empty series to be 1, pass ``min_count=0``.

>>> pd.Series([]).prod(min_count=0)
1.0

Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
empty series identically.

>>> pd.Series([np.nan]).prod()
nan

>>> pd.Series([np.nan]).prod(min_count=0)
1.0
"""


_min_count_stub = """\
min_count : int, default 1
The required number of valid values to perform the operation. If fewer than
``min_count`` non-NA values are present the result will be NA.

.. versionadded :: 0.21.2
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

probably it will become 0.22 ? (but can change later)


Added with the default being 1. This means the sum or product
of an all-NA or empty series is ``NaN``.
"""


def _make_min_count_stat_function(cls, name, name1, name2, axis_descr, desc,
                                  f, examples):
    """
    Build a reduction method that accepts a ``min_count`` keyword.

    The generated method validates any extra keyword arguments, fills in
    the default ``skipna`` and ``axis``, and then delegates to
    ``self._agg_by_level`` when ``level`` is given, or to ``self._reduce``
    with ``f`` otherwise. ``min_count`` is forwarded in both cases.

    Parameters
    ----------
    cls
        Class the method is generated for; handed to ``set_function_name``.
    name
        Name of the method. Also used as ``outname`` in the docstring
        template and as ``fname`` when validating keyword arguments.
    name1, name2, axis_descr, desc
        Values substituted into the ``_num_doc`` template.
    f
        Reduction function passed to ``self._reduce``.
    examples
        Text substituted for ``%(examples)s`` in the template.

    Returns
    -------
    The result of ``set_function_name(stat_func, name, cls)``.
    """
    # Everything the shared ``_num_doc`` template needs, gathered in one
    # place. ``min_count`` is filled with the parameter stub.
    doc_params = dict(outname=name, desc=desc, name1=name1, name2=name2,
                      axis_descr=axis_descr, min_count=_min_count_stub,
                      examples=examples)

    # NOTE: stat_func deliberately has no docstring of its own; its
    # documentation is produced by the Appender/Substitution decorators.
    @Substitution(**doc_params)
    @Appender(_num_doc)
    def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None,
                  min_count=1,
                  **kwargs):
        nv.validate_stat_func((), kwargs, fname=name)

        # Resolve defaults: NA values are skipped unless told otherwise,
        # and the reduction runs along the object's stat axis.
        skipna = True if skipna is None else skipna
        axis = self._stat_axis_number if axis is None else axis

        if level is None:
            return self._reduce(f, name, axis=axis, skipna=skipna,
                                numeric_only=numeric_only,
                                min_count=min_count)

        # A level was requested: aggregate per level instead.
        return self._agg_by_level(name, axis=axis, level=level,
                                  skipna=skipna, min_count=min_count)

    return set_function_name(stat_func, name, cls)


def _make_stat_function(cls, name, name1, name2, axis_descr, desc, f):
@Substitution(outname=name, desc=desc, name1=name1, name2=name2,
axis_descr=axis_descr)
axis_descr=axis_descr, min_count='', examples='')
@Appender(_num_doc)
def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None,
**kwargs):
Loading