Skip to content

Commit 60ed993

Browse files
authored
BUG: groupby with dropna=False drops nulls from categorical groupers (#49652)
* BUG: groupby(..., dropna=False) drops null values with categorical grouper * Use intp * Fixups * Use intp * int64 * dtype fix * Breakup op to debug on CI * Trying with intp * Restore cache decorator * Add bincount comment * Rework recoding logic
1 parent 16b5f0c commit 60ed993

File tree

7 files changed

+285
-20
lines changed

7 files changed

+285
-20
lines changed

doc/source/whatsnew/v2.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -768,6 +768,7 @@ Groupby/resample/rolling
768768
- Bug in :meth:`.DataFrameGroupBy.apply` and :class:`SeriesGroupBy.apply` with ``as_index=False`` would not attempt the computation without using the grouping keys when using them failed with a ``TypeError`` (:issue:`49256`)
769769
- Bug in :meth:`.DataFrameGroupBy.describe` would describe the group keys (:issue:`49256`)
770770
- Bug in :meth:`.SeriesGroupBy.describe` with ``as_index=False`` would have the incorrect shape (:issue:`49256`)
771+
- Bug in :class:`.DataFrameGroupBy` and :class:`.SeriesGroupBy` with ``dropna=False`` would drop NA values when the grouper was categorical (:issue:`36327`)
771772

772773
Reshaping
773774
^^^^^^^^^

pandas/core/algorithms.py

+23
Original file line numberDiff line numberDiff line change
@@ -406,6 +406,29 @@ def unique(values):
406406
return unique_with_mask(values)
407407

408408

409+
def nunique_ints(values: ArrayLike) -> int:
410+
"""
411+
Return the number of unique values for integer array-likes.
412+
413+
Significantly faster than pandas.unique for long enough sequences.
414+
No checks are done to ensure input is integral.
415+
416+
Parameters
417+
----------
418+
values : 1d array-like
419+
420+
Returns
421+
-------
422+
int : The number of unique values in ``values``
423+
"""
424+
if len(values) == 0:
425+
return 0
426+
values = _ensure_data(values)
427+
# bincount requires intp
428+
result = (np.bincount(values.ravel().astype("intp")) != 0).sum()
429+
return result
430+
431+
409432
def unique_with_mask(values, mask: npt.NDArray[np.bool_] | None = None):
410433
"""See algorithms.unique for docs. Takes a mask for masked arrays."""
411434
values = _ensure_arraylike(values)

pandas/core/groupby/groupby.py

+17-8
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ class providing the base-class of operations.
4242
Timestamp,
4343
lib,
4444
)
45+
from pandas._libs.algos import rank_1d
4546
import pandas._libs.groupby as libgroupby
4647
from pandas._typing import (
4748
AnyArrayLike,
@@ -2268,12 +2269,15 @@ def size(self) -> DataFrame | Series:
22682269
else:
22692270
result = self._obj_1d_constructor(result)
22702271

2272+
with com.temp_setattr(self, "as_index", True):
2273+
# size already has the desired behavior in GH#49519, but this makes the
2274+
# as_index=False path of _reindex_output fail on categorical groupers.
2275+
result = self._reindex_output(result, fill_value=0)
22712276
if not self.as_index:
22722277
# error: Incompatible types in assignment (expression has
22732278
# type "DataFrame", variable has type "Series")
22742279
result = result.rename("size").reset_index() # type: ignore[assignment]
2275-
2276-
return self._reindex_output(result, fill_value=0)
2280+
return result
22772281

22782282
@final
22792283
@doc(_groupby_agg_method_template, fname="sum", no=False, mc=0)
@@ -3269,6 +3273,10 @@ def ngroup(self, ascending: bool = True):
32693273
else:
32703274
dtype = np.int64
32713275

3276+
if any(ping._passed_categorical for ping in self.grouper.groupings):
3277+
# comp_ids reflect non-observed groups; we need only observed
3278+
comp_ids = rank_1d(comp_ids, ties_method="dense") - 1
3279+
32723280
result = self._obj_1d_constructor(comp_ids, index, dtype=dtype)
32733281
if not ascending:
32743282
result = self.ngroups - 1 - result
@@ -3950,7 +3958,7 @@ def _reindex_output(
39503958
names = names + [None]
39513959
index = MultiIndex.from_product(levels_list, names=names)
39523960
if self.sort:
3953-
index = index.sortlevel()[0]
3961+
index = index.sort_values()
39543962

39553963
if self.as_index:
39563964
# Always holds for SeriesGroupBy unless GH#36507 is implemented
@@ -3972,12 +3980,12 @@ def _reindex_output(
39723980
# reindex `output`, and then reset the in-axis grouper columns.
39733981

39743982
# Select in-axis groupers
3975-
in_axis_grps = (
3983+
in_axis_grps = list(
39763984
(i, ping.name) for (i, ping) in enumerate(groupings) if ping.in_axis
39773985
)
3978-
g_nums, g_names = zip(*in_axis_grps)
3979-
3980-
output = output.drop(labels=list(g_names), axis=1)
3986+
if len(in_axis_grps) > 0:
3987+
g_nums, g_names = zip(*in_axis_grps)
3988+
output = output.drop(labels=list(g_names), axis=1)
39813989

39823990
# Set a temp index and reindex (possibly expanding)
39833991
output = output.set_index(self.grouper.result_index).reindex(
@@ -3986,7 +3994,8 @@ def _reindex_output(
39863994

39873995
# Reset in-axis grouper columns
39883996
# (using level numbers `g_nums` because level names may not be unique)
3989-
output = output.reset_index(level=g_nums)
3997+
if len(in_axis_grps) > 0:
3998+
output = output.reset_index(level=g_nums)
39903999

39914000
return output.reset_index(drop=True)
39924001

pandas/core/groupby/grouper.py

+43-4
Original file line numberDiff line numberDiff line change
@@ -612,6 +612,9 @@ def group_arraylike(self) -> ArrayLike:
612612
# retain dtype for categories, including unobserved ones
613613
return self.result_index._values
614614

615+
elif self._passed_categorical:
616+
return self.group_index._values
617+
615618
return self._codes_and_uniques[1]
616619

617620
@cache_readonly
@@ -621,14 +624,31 @@ def result_index(self) -> Index:
621624
if self._all_grouper is not None:
622625
group_idx = self.group_index
623626
assert isinstance(group_idx, CategoricalIndex)
624-
categories = self._all_grouper.categories
627+
cats = self._orig_cats
625628
# set_categories is dynamically added
626-
return group_idx.set_categories(categories) # type: ignore[attr-defined]
629+
return group_idx.set_categories(cats) # type: ignore[attr-defined]
627630
return self.group_index
628631

629632
@cache_readonly
630633
def group_index(self) -> Index:
631-
uniques = self._codes_and_uniques[1]
634+
codes, uniques = self._codes_and_uniques
635+
if not self._dropna and self._passed_categorical:
636+
assert isinstance(uniques, Categorical)
637+
if self._sort and (codes == len(uniques)).any():
638+
# Add NA value on the end when sorting
639+
uniques = Categorical.from_codes(
640+
np.append(uniques.codes, [-1]), uniques.categories
641+
)
642+
else:
643+
# Need to determine proper placement of NA value when not sorting
644+
cat = self.grouping_vector
645+
na_idx = (cat.codes < 0).argmax()
646+
if cat.codes[na_idx] < 0:
647+
# count number of unique codes that come before the nan value
648+
na_unique_idx = algorithms.nunique_ints(cat.codes[:na_idx])
649+
uniques = Categorical.from_codes(
650+
np.insert(uniques.codes, na_unique_idx, -1), uniques.categories
651+
)
632652
return Index._with_infer(uniques, name=self.name)
633653

634654
@cache_readonly
@@ -651,9 +671,28 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
651671
uniques = Categorical.from_codes(
652672
codes=ucodes, categories=categories, ordered=cat.ordered
653673
)
674+
675+
codes = cat.codes
676+
if not self._dropna:
677+
na_mask = codes < 0
678+
if np.any(na_mask):
679+
if self._sort:
680+
# Replace NA codes with `largest code + 1`
681+
na_code = len(categories)
682+
codes = np.where(na_mask, na_code, codes)
683+
else:
684+
# Insert NA code into the codes based on first appearance
685+
# A negative code must exist; no need to check codes[na_idx] < 0
686+
na_idx = na_mask.argmax()
687+
# count number of unique codes that come before the nan value
688+
na_code = algorithms.nunique_ints(codes[:na_idx])
689+
codes = np.where(codes >= na_code, codes + 1, codes)
690+
codes = np.where(na_mask, na_code, codes)
691+
654692
if not self._observed:
655693
uniques = uniques.reorder_categories(self._orig_cats)
656-
return cat.codes, uniques
694+
695+
return codes, uniques
657696

658697
elif isinstance(self.grouping_vector, ops.BaseGrouper):
659698
# we have a list of groupers

pandas/tests/groupby/test_categorical.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -831,6 +831,7 @@ def test_preserve_categories():
831831
df = DataFrame({"A": Categorical(list("ba"), categories=categories, ordered=False)})
832832
sort_index = CategoricalIndex(categories, categories, ordered=False, name="A")
833833
# GH#48749 - don't change order of categories
834+
# GH#42482 - don't sort result when sort=False, even when ordered=True
834835
nosort_index = CategoricalIndex(list("bac"), list("abc"), ordered=False, name="A")
835836
tm.assert_index_equal(
836837
df.groupby("A", sort=True, observed=False).first().index, sort_index
@@ -1218,7 +1219,7 @@ def test_seriesgroupby_observed_true(df_cat, operation):
12181219
lev_a = Index(["bar", "bar", "foo", "foo"], dtype=df_cat["A"].dtype, name="A")
12191220
lev_b = Index(["one", "three", "one", "two"], dtype=df_cat["B"].dtype, name="B")
12201221
index = MultiIndex.from_arrays([lev_a, lev_b])
1221-
expected = Series(data=[2, 4, 1, 3], index=index, name="C")
1222+
expected = Series(data=[2, 4, 1, 3], index=index, name="C").sort_index()
12221223

12231224
grouped = df_cat.groupby(["A", "B"], observed=True)["C"]
12241225
result = getattr(grouped, operation)(sum)

0 commit comments

Comments
 (0)