Skip to content
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1085,6 +1085,8 @@ Groupby/resample/rolling
- Bug in :meth:`DataFrame.groupby` lost index, when one of the ``agg`` keys referenced an empty list (:issue:`32580`)
- Bug in :meth:`Rolling.apply` where ``center=True`` was ignored when ``engine='numba'`` was specified (:issue:`34784`)
- Bug in :meth:`DataFrame.ewm.cov` was throwing ``AssertionError`` for :class:`MultiIndex` inputs (:issue:`34440`)
- Bug in :meth:`DataFrameGroupBy.count` was returning ``NaN`` for missing categories when grouped on multiple ``Categoricals``. Now returning ``0`` (:issue:`35028`)
- Bug in :meth:`DataFrameGroupBy.sum` and :meth:`SeriesGroupBy.sum` was returning ``NaN`` for missing categories when grouped on multiple ``Categoricals``. Now returning ``0`` (:issue:`31422`)

Reshaping
^^^^^^^^^
Expand Down
41 changes: 28 additions & 13 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
import numpy as np

from pandas._libs import lib
from pandas._typing import FrameOrSeries, FrameOrSeriesUnion
from pandas._typing import FrameOrSeries, FrameOrSeriesUnion, Scalar
from pandas.util._decorators import Appender, Substitution, doc

from pandas.core.dtypes.cast import (
Expand Down Expand Up @@ -363,7 +363,9 @@ def _wrap_series_output(
return result

def _wrap_aggregated_output(
self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]]
self,
output: Mapping[base.OutputKey, Union[Series, np.ndarray]],
fill_value: Scalar = np.NaN,
) -> Union[Series, DataFrame]:
"""
Wraps the output of a SeriesGroupBy aggregation into the expected result.
Expand All @@ -385,7 +387,7 @@ def _wrap_aggregated_output(
result = self._wrap_series_output(
output=output, index=self.grouper.result_index
)
return self._reindex_output(result)
return self._reindex_output(result, fill_value)

def _wrap_transformed_output(
self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]]
Expand Down Expand Up @@ -415,7 +417,11 @@ def _wrap_transformed_output(
return result

def _wrap_applied_output(
self, keys: Index, values: Optional[List[Any]], not_indexed_same: bool = False
self,
keys: Index,
values: Optional[List[Any]],
not_indexed_same: bool = False,
fill_value: Scalar = np.NaN,
) -> FrameOrSeriesUnion:
"""
Wrap the output of SeriesGroupBy.apply into the expected result.
Expand Down Expand Up @@ -465,7 +471,7 @@ def _get_index() -> Index:
result = self.obj._constructor(
data=values, index=_get_index(), name=self._selection_name
)
return self._reindex_output(result)
return self._reindex_output(result, fill_value)

def _aggregate_named(self, func, *args, **kwargs):
result = {}
Expand Down Expand Up @@ -1029,7 +1035,10 @@ def _cython_agg_general(
agg_blocks, agg_items = self._cython_agg_blocks(
how, alt=alt, numeric_only=numeric_only, min_count=min_count
)
return self._wrap_agged_blocks(agg_blocks, items=agg_items)
fill_value = self._cython_func_fill_values.get(alt, np.NaN)
return self._wrap_agged_blocks(
agg_blocks, items=agg_items, fill_value=fill_value
)

def _cython_agg_blocks(
self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1
Expand Down Expand Up @@ -1219,7 +1228,9 @@ def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame:

return self.obj._constructor(result, columns=result_columns)

def _wrap_applied_output(self, keys, values, not_indexed_same=False):
def _wrap_applied_output(
self, keys, values, not_indexed_same=False, fill_value=np.NaN
):
if len(keys) == 0:
return self.obj._constructor(index=keys)

Expand Down Expand Up @@ -1380,7 +1391,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
if not self.as_index:
self._insert_inaxis_grouper_inplace(result)

return self._reindex_output(result)
return self._reindex_output(result, fill_value)

# values are not series or array-like but scalars
else:
Expand Down Expand Up @@ -1731,7 +1742,9 @@ def _insert_inaxis_grouper_inplace(self, result):
result.insert(0, name, lev)

def _wrap_aggregated_output(
self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]]
self,
output: Mapping[base.OutputKey, Union[Series, np.ndarray]],
fill_value: Scalar = np.NaN,
) -> DataFrame:
"""
Wraps the output of DataFrameGroupBy aggregations into the expected result.
Expand Down Expand Up @@ -1762,7 +1775,7 @@ def _wrap_aggregated_output(
if self.axis == 1:
result = result.T

return self._reindex_output(result)
return self._reindex_output(result, fill_value)

def _wrap_transformed_output(
self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]]
Expand All @@ -1788,7 +1801,9 @@ def _wrap_transformed_output(

return result

def _wrap_agged_blocks(self, blocks: "Sequence[Block]", items: Index) -> DataFrame:
def _wrap_agged_blocks(
self, blocks: "Sequence[Block]", items: Index, fill_value: Scalar = np.NaN
) -> DataFrame:
if not self.as_index:
index = np.arange(blocks[0].values.shape[-1])
mgr = BlockManager(blocks, axes=[items, index])
Expand All @@ -1804,7 +1819,7 @@ def _wrap_agged_blocks(self, blocks: "Sequence[Block]", items: Index) -> DataFra
if self.axis == 1:
result = result.T

return self._reindex_output(result)._convert(datetime=True)
return self._reindex_output(result, fill_value)._convert(datetime=True)

def _iterate_column_groupbys(self):
for i, colname in enumerate(self._selected_obj.columns):
Expand Down Expand Up @@ -1846,7 +1861,7 @@ def count(self):
)
blocks = [make_block(val, placement=loc) for val, loc in zip(counted, locs)]

return self._wrap_agged_blocks(blocks, items=data.items)
return self._wrap_agged_blocks(blocks, items=data.items, fill_value=0)

def nunique(self, dropna: bool = True):
"""
Expand Down
20 changes: 16 additions & 4 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -888,8 +888,12 @@ def _python_apply_general(
"""
keys, values, mutated = self.grouper.apply(f, data, self.axis)

fill_value = self._cython_func_fill_values.get(f, np.NaN)
return self._wrap_applied_output(
keys, values, not_indexed_same=mutated or self.mutated
keys,
values,
not_indexed_same=mutated or self.mutated,
fill_value=fill_value,
)

def _iterate_slices(self) -> Iterable[Series]:
Expand Down Expand Up @@ -970,13 +974,17 @@ def _cython_transform(self, how: str, numeric_only: bool = True, **kwargs):

return self._wrap_transformed_output(output)

def _wrap_aggregated_output(self, output: Mapping[base.OutputKey, np.ndarray]):
def _wrap_aggregated_output(
self, output: Mapping[base.OutputKey, np.ndarray], fill_value: Scalar = np.NaN
):
raise AbstractMethodError(self)

def _wrap_transformed_output(self, output: Mapping[base.OutputKey, np.ndarray]):
raise AbstractMethodError(self)

def _wrap_applied_output(self, keys, values, not_indexed_same: bool = False):
def _wrap_applied_output(
self, keys, values, not_indexed_same: bool = False, fill_value: Scalar = np.NaN
):
raise AbstractMethodError(self)

def _agg_general(
Expand Down Expand Up @@ -1010,6 +1018,8 @@ def _agg_general(
result = self.aggregate(lambda x: npfunc(x, axis=self.axis))
return result

_cython_func_fill_values = {np.sum: 0}

def _cython_agg_general(
self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1
):
Expand Down Expand Up @@ -1045,7 +1055,9 @@ def _cython_agg_general(
if len(output) == 0:
raise DataError("No numeric types to aggregate")

return self._wrap_aggregated_output(output)
fill_value = self._cython_func_fill_values.get(alt, np.NaN)

return self._wrap_aggregated_output(output, fill_value)

def _python_agg_general(
self, func, *args, engine="cython", engine_kwargs=None, **kwargs
Expand Down
47 changes: 13 additions & 34 deletions pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
import pandas._testing as tm


def cartesian_product_for_groupers(result, args, names):
def cartesian_product_for_groupers(result, args, names, fill_value=np.NaN):
""" Reindex to a cartesian production for the groupers,
preserving the nature (Categorical) of each grouper
"""
Expand All @@ -33,7 +33,7 @@ def f(a):
return a

index = MultiIndex.from_product(map(f, args), names=names)
return result.reindex(index).sort_index()
return result.reindex(index, fill_value=fill_value).sort_index()


_results_for_groupbys_with_missing_categories = dict(
Expand Down Expand Up @@ -309,7 +309,7 @@ def test_observed(observed):
result = gb.sum()
if not observed:
expected = cartesian_product_for_groupers(
expected, [cat1, cat2, ["foo", "bar"]], list("ABC")
expected, [cat1, cat2, ["foo", "bar"]], list("ABC"), fill_value=0
)

tm.assert_frame_equal(result, expected)
Expand All @@ -319,7 +319,9 @@ def test_observed(observed):
expected = DataFrame({"values": [1, 2, 3, 4]}, index=exp_index)
result = gb.sum()
if not observed:
expected = cartesian_product_for_groupers(expected, [cat1, cat2], list("AB"))
expected = cartesian_product_for_groupers(
expected, [cat1, cat2], list("AB"), fill_value=0
)

tm.assert_frame_equal(result, expected)

Expand Down Expand Up @@ -1188,9 +1190,10 @@ def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation):
names=["A", "B"],
).sortlevel()

expected = Series(data=[2, 4, np.nan, 1, np.nan, 3], index=index, name="C")
expected = Series(data=[2, 4, 0, 1, 0, 3], index=index, name="C")
grouped = df_cat.groupby(["A", "B"], observed=observed)["C"]
result = getattr(grouped, operation)(sum)

tm.assert_series_equal(result, expected)


Expand Down Expand Up @@ -1340,15 +1343,6 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(
)
request.node.add_marker(mark)

if reduction_func == "sum": # GH 31422
mark = pytest.mark.xfail(
reason=(
"sum should return 0 but currently returns NaN. "
"This is a known bug. See GH 31422."
)
)
request.node.add_marker(mark)

df = pd.DataFrame(
{
"cat_1": pd.Categorical(list("AABB"), categories=list("ABC")),
Expand All @@ -1369,8 +1363,11 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(
val = result.loc[idx]
assert (pd.isna(zero_or_nan) and pd.isna(val)) or (val == zero_or_nan)

# If we expect unobserved values to be zero, we also expect the dtype to be int
if zero_or_nan == 0:
# If we expect unobserved values to be zero, we also expect the dtype to be int.
# Except for .sum(). If the observed categories sum to dtype=float (i.e. their
# sums have decimals), then the zeros for the missing categories should also be
# floats.
if zero_or_nan == 0 and reduction_func != "sum":
assert np.issubdtype(result.dtype, np.integer)


Expand Down Expand Up @@ -1412,24 +1409,6 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_false(
if reduction_func == "ngroup":
pytest.skip("ngroup does not return the Categories on the index")

if reduction_func == "count": # GH 35028
mark = pytest.mark.xfail(
reason=(
"DataFrameGroupBy.count returns np.NaN for missing "
"categories, when it should return 0. See GH 35028"
)
)
request.node.add_marker(mark)

if reduction_func == "sum": # GH 31422
mark = pytest.mark.xfail(
reason=(
"sum should return 0 but currently returns NaN. "
"This is a known bug. See GH 31422."
)
)
request.node.add_marker(mark)

df = pd.DataFrame(
{
"cat_1": pd.Categorical(list("AABB"), categories=list("ABC")),
Expand Down
13 changes: 7 additions & 6 deletions pandas/tests/reshape/test_pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -1817,7 +1817,7 @@ def test_categorical_aggfunc(self, observed):
["A", "B", "C"], categories=["A", "B", "C"], ordered=False, name="C1"
)
expected_columns = pd.Index(["a", "b"], name="C2")
expected_data = np.array([[1.0, np.nan], [1.0, np.nan], [np.nan, 2.0]])
expected_data = np.array([[1, 0], [1, 0], [0, 2]], dtype=np.int64)
expected = pd.DataFrame(
expected_data, index=expected_index, columns=expected_columns
)
Expand Down Expand Up @@ -1851,18 +1851,19 @@ def test_categorical_pivot_index_ordering(self, observed):
values="Sales",
index="Month",
columns="Year",
dropna=observed,
observed=observed,
aggfunc="sum",
)
expected_columns = pd.Int64Index([2013, 2014], name="Year")
expected_index = pd.CategoricalIndex(
["January"], categories=months, ordered=False, name="Month"
months, categories=months, ordered=False, name="Month"
)
expected_data = [[320, 120]] + [[0, 0]] * 11
expected = pd.DataFrame(
[[320, 120]], index=expected_index, columns=expected_columns
expected_data, index=expected_index, columns=expected_columns
)
if not observed:
result = result.dropna().astype(np.int64)
if observed:
expected = expected.loc[["January"]]

tm.assert_frame_equal(result, expected)

Expand Down