-
-
Notifications
You must be signed in to change notification settings - Fork 18.6k
ENH: Added a min_count keyword to stat funcs #18876
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7322,7 +7322,8 @@ def _add_numeric_operations(cls): | |
@Substitution(outname='mad', | ||
desc="Return the mean absolute deviation of the values " | ||
"for the requested axis", | ||
name1=name, name2=name2, axis_descr=axis_descr) | ||
name1=name, name2=name2, axis_descr=axis_descr, | ||
min_count='', examples='') | ||
@Appender(_num_doc) | ||
def mad(self, axis=None, skipna=None, level=None): | ||
if skipna is None: | ||
|
@@ -7363,7 +7364,8 @@ def mad(self, axis=None, skipna=None, level=None): | |
@Substitution(outname='compounded', | ||
desc="Return the compound percentage of the values for " | ||
"the requested axis", name1=name, name2=name2, | ||
axis_descr=axis_descr) | ||
axis_descr=axis_descr, | ||
min_count='', examples='') | ||
@Appender(_num_doc) | ||
def compound(self, axis=None, skipna=None, level=None): | ||
if skipna is None: | ||
|
@@ -7387,10 +7389,10 @@ def compound(self, axis=None, skipna=None, level=None): | |
lambda y, axis: np.maximum.accumulate(y, axis), "max", | ||
-np.inf, np.nan) | ||
|
||
cls.sum = _make_stat_function( | ||
cls.sum = _make_min_count_stat_function( | ||
cls, 'sum', name, name2, axis_descr, | ||
'Return the sum of the values for the requested axis', | ||
nanops.nansum) | ||
nanops.nansum, _sum_examples) | ||
cls.mean = _make_stat_function( | ||
cls, 'mean', name, name2, axis_descr, | ||
'Return the mean of the values for the requested axis', | ||
|
@@ -7406,10 +7408,10 @@ def compound(self, axis=None, skipna=None, level=None): | |
"by N-1\n", | ||
nanops.nankurt) | ||
cls.kurtosis = cls.kurt | ||
cls.prod = _make_stat_function( | ||
cls.prod = _make_min_count_stat_function( | ||
cls, 'prod', name, name2, axis_descr, | ||
'Return the product of the values for the requested axis', | ||
nanops.nanprod) | ||
nanops.nanprod, _prod_examples) | ||
cls.product = cls.prod | ||
cls.median = _make_stat_function( | ||
cls, 'median', name, name2, axis_descr, | ||
|
@@ -7540,10 +7542,13 @@ def _doc_parms(cls): | |
numeric_only : boolean, default None | ||
Include only float, int, boolean columns. If None, will attempt to use | ||
everything, then use only numeric data. Not implemented for Series. | ||
%(min_count)s\ | ||
Returns | ||
------- | ||
%(outname)s : %(name1)s or %(name2)s (if level specified)\n""" | ||
%(outname)s : %(name1)s or %(name2)s (if level specified) | ||
%(examples)s""" | ||
|
||
_num_ddof_doc = """ | ||
|
@@ -7611,9 +7616,92 @@ def _doc_parms(cls): | |
""" | ||
|
||
|
||
_sum_examples = """\ | ||
Examples | ||
-------- | ||
By default, the sum of an empty series is ``NaN``. | ||
>>> pd.Series([]).sum() # min_count=1 is the default | ||
nan | ||
This can be controlled with the ``min_count`` parameter. For example, if | ||
you'd like the sum of an empty series to be 0, pass ``min_count=0``. | ||
>>> pd.Series([]).sum(min_count=0) | ||
0.0 | ||
Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and | ||
empty series identically. | ||
>>> pd.Series([np.nan]).sum() | ||
nan | ||
>>> pd.Series([np.nan]).sum(min_count=0) | ||
0.0 | ||
""" | ||
|
||
_prod_examples = """\ | ||
Examples | ||
-------- | ||
By default, the product of an empty series is ``NaN``. | ||
>>> pd.Series([]).prod() | ||
nan | ||
This can be controlled with the ``min_count`` parameter. | ||
>>> pd.Series([]).prod(min_count=0) | ||
1.0 | ||
Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and | ||
empty series identically. | ||
>>> pd.Series([np.nan]).prod() | ||
nan | ||
>>> pd.Series([np.nan]).prod(min_count=0) | ||
1.0 | ||
""" | ||
|
||
|
||
_min_count_stub = """\ | ||
min_count : int, default 1 | ||
The required number of valid values to perform the operation. If fewer than | ||
``min_count`` non-NA values are present the result will be NA. | ||
.. versionadded:: 0.21.2 | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Probably it will become 0.22? (But this can change later.) |
||
Added with the default being 1. This means the sum or product | ||
of an all-NA or empty series is ``NaN``. | ||
""" | ||
|
||
|
||
def _make_min_count_stat_function(cls, name, name1, name2, axis_descr, desc, | ||
f, examples): | ||
@Substitution(outname=name, desc=desc, name1=name1, name2=name2, | ||
axis_descr=axis_descr, min_count=_min_count_stub, | ||
examples=examples) | ||
@Appender(_num_doc) | ||
def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None, | ||
min_count=1, | ||
**kwargs): | ||
nv.validate_stat_func(tuple(), kwargs, fname=name) | ||
if skipna is None: | ||
skipna = True | ||
if axis is None: | ||
axis = self._stat_axis_number | ||
if level is not None: | ||
return self._agg_by_level(name, axis=axis, level=level, | ||
skipna=skipna, min_count=min_count) | ||
return self._reduce(f, name, axis=axis, skipna=skipna, | ||
numeric_only=numeric_only, min_count=min_count) | ||
|
||
return set_function_name(stat_func, name, cls) | ||
|
||
|
||
def _make_stat_function(cls, name, name1, name2, axis_descr, desc, f): | ||
@Substitution(outname=name, desc=desc, name1=name1, name2=name2, | ||
axis_descr=axis_descr) | ||
axis_descr=axis_descr, min_count='', examples='') | ||
@Appender(_num_doc) | ||
def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None, | ||
**kwargs): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -107,21 +107,9 @@ def f(values, axis=None, skipna=True, **kwds): | |
if k not in kwds: | ||
kwds[k] = v | ||
try: | ||
if values.size == 0: | ||
|
||
# we either return np.nan or pd.NaT | ||
if is_numeric_dtype(values): | ||
values = values.astype('float64') | ||
fill_value = na_value_for_dtype(values.dtype) | ||
|
||
if values.ndim == 1: | ||
return fill_value | ||
else: | ||
result_shape = (values.shape[:axis] + | ||
values.shape[axis + 1:]) | ||
result = np.empty(result_shape, dtype=values.dtype) | ||
result.fill(fill_value) | ||
return result | ||
if values.size == 0 and kwds.get('min_count') is None: | ||
# We are empty, returning NA for our type | ||
return _na_for_min_count(values, axis) | ||
|
||
if (_USE_BOTTLENECK and skipna and | ||
_bn_ok_dtype(values.dtype, bn_name)): | ||
|
@@ -292,6 +280,22 @@ def _wrap_results(result, dtype): | |
return result | ||
|
||
|
||
def _na_for_min_count(values, axis): | ||
# we either return np.nan or pd.NaT | ||
if is_numeric_dtype(values): | ||
values = values.astype('float64') | ||
fill_value = na_value_for_dtype(values.dtype) | ||
|
||
if values.ndim == 1: | ||
return fill_value | ||
else: | ||
result_shape = (values.shape[:axis] + | ||
values.shape[axis + 1:]) | ||
result = np.empty(result_shape, dtype=values.dtype) | ||
result.fill(fill_value) | ||
return result | ||
|
||
|
||
def nanany(values, axis=None, skipna=True): | ||
values, mask, dtype, _ = _get_values(values, skipna, False, copy=skipna) | ||
return values.any(axis) | ||
|
@@ -304,15 +308,15 @@ def nanall(values, axis=None, skipna=True): | |
|
||
@disallow('M8') | ||
@bottleneck_switch() | ||
def nansum(values, axis=None, skipna=True): | ||
def nansum(values, axis=None, skipna=True, min_count=1): | ||
values, mask, dtype, dtype_max = _get_values(values, skipna, 0) | ||
dtype_sum = dtype_max | ||
if is_float_dtype(dtype): | ||
dtype_sum = dtype | ||
elif is_timedelta64_dtype(dtype): | ||
dtype_sum = np.float64 | ||
the_sum = values.sum(axis, dtype=dtype_sum) | ||
the_sum = _maybe_null_out(the_sum, axis, mask) | ||
the_sum = _maybe_null_out(the_sum, axis, mask, min_count=min_count) | ||
|
||
return _wrap_results(the_sum, dtype) | ||
|
||
|
@@ -641,13 +645,13 @@ def nankurt(values, axis=None, skipna=True): | |
|
||
|
||
@disallow('M8', 'm8') | ||
def nanprod(values, axis=None, skipna=True): | ||
def nanprod(values, axis=None, skipna=True, min_count=1): | ||
mask = isna(values) | ||
if skipna and not is_any_int_dtype(values): | ||
values = values.copy() | ||
values[mask] = 1 | ||
result = values.prod(axis) | ||
return _maybe_null_out(result, axis, mask) | ||
return _maybe_null_out(result, axis, mask, min_count=min_count) | ||
|
||
|
||
def _maybe_arg_null_out(result, axis, mask, skipna): | ||
|
@@ -683,9 +687,9 @@ def _get_counts(mask, axis, dtype=float): | |
return np.array(count, dtype=dtype) | ||
|
||
|
||
def _maybe_null_out(result, axis, mask): | ||
def _maybe_null_out(result, axis, mask, min_count=1): | ||
if axis is not None and getattr(result, 'ndim', False): | ||
null_mask = (mask.shape[axis] - mask.sum(axis)) == 0 | ||
null_mask = (mask.shape[axis] - mask.sum(axis) - min_count) < 0 | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you just subtract something here if … There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'm not sure what you mean here. |
||
if np.any(null_mask): | ||
if is_numeric_dtype(result): | ||
if np.iscomplexobj(result): | ||
|
@@ -698,7 +702,7 @@ def _maybe_null_out(result, axis, mask): | |
result[null_mask] = None | ||
elif result is not tslib.NaT: | ||
null_mask = mask.size - mask.sum() | ||
if null_mask == 0: | ||
if null_mask < min_count: | ||
result = np.nan | ||
|
||
return result | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does this work for `min_count==0`?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, `sumx` starts out as zeros, so we just have to avoid setting it to NaN. Same for `prod`, but with ones.