Skip to content

Commit 60ed993

Browse files
authored
BUG: groupby with dropna=False drops nulls from categorical groupers (#49652)
* BUG: groupby(..., dropna=False) drops null values with categorical grouper * Use intp * Fixups * Use intp * int64 * dtype fix * Breakup op to debug on CI * Trying with intp * Restore cache decorator * Add bincount comment * Rework recoding logic
1 parent 16b5f0c commit 60ed993

File tree

7 files changed

+285
-20
lines changed

7 files changed

+285
-20
lines changed

doc/source/whatsnew/v2.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -768,6 +768,7 @@ Groupby/resample/rolling
768768
- Bug in :meth:`.DataFrameGroupBy.apply` and :class:`SeriesGroupBy.apply` with ``as_index=False`` would not attempt the computation without using the grouping keys when using them failed with a ``TypeError`` (:issue:`49256`)
769769
- Bug in :meth:`.DataFrameGroupBy.describe` would describe the group keys (:issue:`49256`)
770770
- Bug in :meth:`.SeriesGroupBy.describe` with ``as_index=False`` would have the incorrect shape (:issue:`49256`)
771+
- Bug in :class:`.DataFrameGroupBy` and :class:`.SeriesGroupBy` with ``dropna=False`` would drop NA values when the grouper was categorical (:issue:`36327`)
771772

772773
Reshaping
773774
^^^^^^^^^

pandas/core/algorithms.py

+23
Original file line numberDiff line numberDiff line change
@@ -406,6 +406,29 @@ def unique(values):
406406
return unique_with_mask(values)
407407

408408

409+
def nunique_ints(values: ArrayLike) -> int:
410+
"""
411+
Return the number of unique values for integer array-likes.
412+
413+
Significantly faster than pandas.unique for long enough sequences.
414+
No checks are done to ensure input is integral.
415+
416+
Parameters
417+
----------
418+
values : 1d array-like
419+
420+
Returns
421+
-------
422+
int : The number of unique values in ``values``
423+
"""
424+
if len(values) == 0:
425+
return 0
426+
values = _ensure_data(values)
427+
# bincount requires intp
428+
result = (np.bincount(values.ravel().astype("intp")) != 0).sum()
429+
return result
430+
431+
409432
def unique_with_mask(values, mask: npt.NDArray[np.bool_] | None = None):
410433
"""See algorithms.unique for docs. Takes a mask for masked arrays."""
411434
values = _ensure_arraylike(values)

pandas/core/groupby/groupby.py

+17-8
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ class providing the base-class of operations.
4242
Timestamp,
4343
lib,
4444
)
45+
from pandas._libs.algos import rank_1d
4546
import pandas._libs.groupby as libgroupby
4647
from pandas._typing import (
4748
AnyArrayLike,
@@ -2268,12 +2269,15 @@ def size(self) -> DataFrame | Series:
22682269
else:
22692270
result = self._obj_1d_constructor(result)
22702271

2272+
with com.temp_setattr(self, "as_index", True):
2273+
# size already has the desired behavior in GH#49519, but this makes the
2274+
# as_index=False path of _reindex_output fail on categorical groupers.
2275+
result = self._reindex_output(result, fill_value=0)
22712276
if not self.as_index:
22722277
# error: Incompatible types in assignment (expression has
22732278
# type "DataFrame", variable has type "Series")
22742279
result = result.rename("size").reset_index() # type: ignore[assignment]
2275-
2276-
return self._reindex_output(result, fill_value=0)
2280+
return result
22772281

22782282
@final
22792283
@doc(_groupby_agg_method_template, fname="sum", no=False, mc=0)
@@ -3269,6 +3273,10 @@ def ngroup(self, ascending: bool = True):
32693273
else:
32703274
dtype = np.int64
32713275

3276+
if any(ping._passed_categorical for ping in self.grouper.groupings):
3277+
# comp_ids reflect non-observed groups; we need only observed
3278+
comp_ids = rank_1d(comp_ids, ties_method="dense") - 1
3279+
32723280
result = self._obj_1d_constructor(comp_ids, index, dtype=dtype)
32733281
if not ascending:
32743282
result = self.ngroups - 1 - result
@@ -3950,7 +3958,7 @@ def _reindex_output(
39503958
names = names + [None]
39513959
index = MultiIndex.from_product(levels_list, names=names)
39523960
if self.sort:
3953-
index = index.sortlevel()[0]
3961+
index = index.sort_values()
39543962

39553963
if self.as_index:
39563964
# Always holds for SeriesGroupBy unless GH#36507 is implemented
@@ -3972,12 +3980,12 @@ def _reindex_output(
39723980
# reindex `output`, and then reset the in-axis grouper columns.
39733981

39743982
# Select in-axis groupers
3975-
in_axis_grps = (
3983+
in_axis_grps = list(
39763984
(i, ping.name) for (i, ping) in enumerate(groupings) if ping.in_axis
39773985
)
3978-
g_nums, g_names = zip(*in_axis_grps)
3979-
3980-
output = output.drop(labels=list(g_names), axis=1)
3986+
if len(in_axis_grps) > 0:
3987+
g_nums, g_names = zip(*in_axis_grps)
3988+
output = output.drop(labels=list(g_names), axis=1)
39813989

39823990
# Set a temp index and reindex (possibly expanding)
39833991
output = output.set_index(self.grouper.result_index).reindex(
@@ -3986,7 +3994,8 @@ def _reindex_output(
39863994

39873995
# Reset in-axis grouper columns
39883996
# (using level numbers `g_nums` because level names may not be unique)
3989-
output = output.reset_index(level=g_nums)
3997+
if len(in_axis_grps) > 0:
3998+
output = output.reset_index(level=g_nums)
39903999

39914000
return output.reset_index(drop=True)
39924001

pandas/core/groupby/grouper.py

+43-4
Original file line numberDiff line numberDiff line change
@@ -612,6 +612,9 @@ def group_arraylike(self) -> ArrayLike:
612612
# retain dtype for categories, including unobserved ones
613613
return self.result_index._values
614614

615+
elif self._passed_categorical:
616+
return self.group_index._values
617+
615618
return self._codes_and_uniques[1]
616619

617620
@cache_readonly
@@ -621,14 +624,31 @@ def result_index(self) -> Index:
621624
if self._all_grouper is not None:
622625
group_idx = self.group_index
623626
assert isinstance(group_idx, CategoricalIndex)
624-
categories = self._all_grouper.categories
627+
cats = self._orig_cats
625628
# set_categories is dynamically added
626-
return group_idx.set_categories(categories) # type: ignore[attr-defined]
629+
return group_idx.set_categories(cats) # type: ignore[attr-defined]
627630
return self.group_index
628631

629632
@cache_readonly
630633
def group_index(self) -> Index:
631-
uniques = self._codes_and_uniques[1]
634+
codes, uniques = self._codes_and_uniques
635+
if not self._dropna and self._passed_categorical:
636+
assert isinstance(uniques, Categorical)
637+
if self._sort and (codes == len(uniques)).any():
638+
# Add NA value on the end when sorting
639+
uniques = Categorical.from_codes(
640+
np.append(uniques.codes, [-1]), uniques.categories
641+
)
642+
else:
643+
# Need to determine proper placement of NA value when not sorting
644+
cat = self.grouping_vector
645+
na_idx = (cat.codes < 0).argmax()
646+
if cat.codes[na_idx] < 0:
647+
# count number of unique codes that come before the nan value
648+
na_unique_idx = algorithms.nunique_ints(cat.codes[:na_idx])
649+
uniques = Categorical.from_codes(
650+
np.insert(uniques.codes, na_unique_idx, -1), uniques.categories
651+
)
632652
return Index._with_infer(uniques, name=self.name)
633653

634654
@cache_readonly
@@ -651,9 +671,28 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
651671
uniques = Categorical.from_codes(
652672
codes=ucodes, categories=categories, ordered=cat.ordered
653673
)
674+
675+
codes = cat.codes
676+
if not self._dropna:
677+
na_mask = codes < 0
678+
if np.any(na_mask):
679+
if self._sort:
680+
# Replace NA codes with `largest code + 1`
681+
na_code = len(categories)
682+
codes = np.where(na_mask, na_code, codes)
683+
else:
684+
# Insert NA code into the codes based on first appearance
685+
# A negative code must exist; no need to check codes[na_idx] < 0
686+
na_idx = na_mask.argmax()
687+
# count number of unique codes that come before the nan value
688+
na_code = algorithms.nunique_ints(codes[:na_idx])
689+
codes = np.where(codes >= na_code, codes + 1, codes)
690+
codes = np.where(na_mask, na_code, codes)
691+
654692
if not self._observed:
655693
uniques = uniques.reorder_categories(self._orig_cats)
656-
return cat.codes, uniques
694+
695+
return codes, uniques
657696

658697
elif isinstance(self.grouping_vector, ops.BaseGrouper):
659698
# we have a list of groupers

pandas/tests/groupby/test_categorical.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -831,6 +831,7 @@ def test_preserve_categories():
831831
df = DataFrame({"A": Categorical(list("ba"), categories=categories, ordered=False)})
832832
sort_index = CategoricalIndex(categories, categories, ordered=False, name="A")
833833
# GH#48749 - don't change order of categories
834+
# GH#42482 - don't sort result when sort=False, even when ordered=True
834835
nosort_index = CategoricalIndex(list("bac"), list("abc"), ordered=False, name="A")
835836
tm.assert_index_equal(
836837
df.groupby("A", sort=True, observed=False).first().index, sort_index
@@ -1218,7 +1219,7 @@ def test_seriesgroupby_observed_true(df_cat, operation):
12181219
lev_a = Index(["bar", "bar", "foo", "foo"], dtype=df_cat["A"].dtype, name="A")
12191220
lev_b = Index(["one", "three", "one", "two"], dtype=df_cat["B"].dtype, name="B")
12201221
index = MultiIndex.from_arrays([lev_a, lev_b])
1221-
expected = Series(data=[2, 4, 1, 3], index=index, name="C")
1222+
expected = Series(data=[2, 4, 1, 3], index=index, name="C").sort_index()
12221223

12231224
grouped = df_cat.groupby(["A", "B"], observed=True)["C"]
12241225
result = getattr(grouped, operation)(sum)

0 commit comments

Comments
 (0)