From b24b0ab317d0574b982f2501685aafdfeb12a1ed Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 21 Sep 2020 12:31:46 -0700 Subject: [PATCH 1/2] REF: Categorical.fillna match patterns in other methods --- pandas/core/arrays/categorical.py | 43 ++++++------------- pandas/tests/frame/test_missing.py | 3 +- .../tests/indexes/categorical/test_fillna.py | 28 +++++++++++- pandas/tests/series/methods/test_fillna.py | 3 +- 4 files changed, 44 insertions(+), 33 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index ef69d6565cfeb..7b1a66560e73b 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -37,7 +37,6 @@ ) from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries -from pandas.core.dtypes.inference import is_hashable from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna, notna from pandas.core import ops @@ -1213,7 +1212,7 @@ def _validate_fill_value(self, fill_value): ValueError """ - if isna(fill_value): + if is_valid_nat_for_dtype(fill_value, self.categories.dtype): fill_value = -1 elif fill_value in self.categories: fill_value = self._unbox_scalar(fill_value) @@ -1636,6 +1635,7 @@ def fillna(self, value=None, method=None, limit=None): value, method = validate_fillna_kwargs( value, method, validate_scalar_dict_value=False ) + value = extract_array(value, extract_numpy=True) if value is None: value = np.nan @@ -1644,10 +1644,8 @@ def fillna(self, value=None, method=None, limit=None): "specifying a limit for fillna has not been implemented yet" ) - codes = self._codes - - # pad / bfill if method is not None: + # pad / bfill # TODO: dispatch when self.categories is EA-dtype values = np.asarray(self).reshape(-1, len(self)) @@ -1657,40 +1655,25 @@ def fillna(self, value=None, method=None, limit=None): codes = _get_codes_for_values(values, self.categories) else: + # We copy even if there is nothing to fill + codes = self._ndarray.copy() + mask = self.isna() - # If value is a dict or a Series (a dict value has already - # been converted to a Series) - if isinstance(value, (np.ndarray, Categorical, ABCSeries)): + if isinstance(value, (np.ndarray, Categorical)): # We get ndarray or Categorical if called via Series.fillna, # where it will unwrap another aligned Series before getting here - mask = ~algorithms.isin(value, self.categories) - if not isna(value[mask]).all(): + not_categories = ~algorithms.isin(value, self.categories) + if not isna(value[not_categories]).all(): + # All entries in `value` must either be a category or NA raise ValueError("fill value must be in categories") values_codes = _get_codes_for_values(value, self.categories) - indexer = np.where(codes == -1) - codes = codes.copy() - codes[indexer] = values_codes[indexer] - - # If value is not a dict or Series it should be a scalar - elif is_hashable(value): - if not isna(value) and value not in self.categories: - raise ValueError("fill value must be in categories") - - mask = codes == -1 - if mask.any(): - codes = codes.copy() - if isna(value): - codes[mask] = -1 - else: - codes[mask] = self._unbox_scalar(value) + codes[mask] = values_codes[mask] else: - raise TypeError( - f"'value' parameter must be a scalar, dict " - f"or Series, but you passed a {type(value).__name__}" - ) + new_code = self._validate_fill_value(value) + codes[mask] = new_code return self._from_backing_data(codes) diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index b4f91590e09d1..5d3f8e3a2f7c1 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -362,7 +362,8 @@ def test_na_actions_categorical(self): res = df.fillna(value={"cats": 3, "vals": "b"}) tm.assert_frame_equal(res, df_exp_fill) - with pytest.raises(ValueError, match=("fill value must be in categories")): + msg = "'fill_value=4' is not present in this Categorical's categories" + with pytest.raises(ValueError, match=msg): df.fillna(value={"cats": 4, "vals": "c"}) res = df.fillna(method="pad") diff --git a/pandas/tests/indexes/categorical/test_fillna.py b/pandas/tests/indexes/categorical/test_fillna.py index 0d878249d3800..f6a6747166011 100644 --- a/pandas/tests/indexes/categorical/test_fillna.py +++ b/pandas/tests/indexes/categorical/test_fillna.py @@ -14,6 +14,32 @@ def test_fillna_categorical(self): tm.assert_index_equal(idx.fillna(1.0), exp) # fill by value not in categories raises ValueError - msg = "fill value must be in categories" + msg = "'fill_value=2.0' is not present in this Categorical's categories" with pytest.raises(ValueError, match=msg): idx.fillna(2.0) + + def test_fillna_copies_with_no_nas(self): + # Nothing to fill, should still get a copy + ci = CategoricalIndex([0, 1, 1]) + cat = ci._data + result = ci.fillna(0) + assert result._values._ndarray is not cat._ndarray + assert result._values._ndarray.base is None + + # Same check directly on the Categorical object + result = cat.fillna(0) + assert result._ndarray is not cat._ndarray + assert result._ndarray.base is None + + def test_fillna_validates_with_no_nas(self): + # We validate the fill value even if fillna is a no-op + ci = CategoricalIndex([2, 3, 3]) + cat = ci._data + + msg = "'fill_value=False' is not present in this Categorical's categories" + with pytest.raises(ValueError, match=msg): + ci.fillna(False) + + # Same check directly on the Categorical + with pytest.raises(ValueError, match=msg): + cat.fillna(False) diff --git a/pandas/tests/series/methods/test_fillna.py b/pandas/tests/series/methods/test_fillna.py index 80b8271e16e7a..b6a6f4e8200d4 100644 --- a/pandas/tests/series/methods/test_fillna.py +++ b/pandas/tests/series/methods/test_fillna.py @@ -125,7 +125,8 @@ def test_fillna_categorical_raises(self): data = ["a", np.nan, "b", np.nan, np.nan] ser = Series(Categorical(data, categories=["a", "b"])) - with pytest.raises(ValueError, match="fill value must be in categories"): + msg = "'fill_value=d' is not present in this Categorical's categories" + with pytest.raises(ValueError, match=msg): ser.fillna("d") with pytest.raises(ValueError, match="fill value must be in categories"): From e6e00ee193435b35c7c208e6964c6500bc05934d Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 21 Sep 2020 17:54:18 -0700 Subject: [PATCH 2/2] whatsnew --- doc/source/whatsnew/v1.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 19a563be0a568..ae3587b7bfd18 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -239,7 +239,7 @@ Bug fixes Categorical ^^^^^^^^^^^ - +- :meth:`Categorical.fillna` will always return a copy, will validate a passed fill value regardless of whether there are any NAs to fill, and will disallow a ``NaT`` as a fill value for numeric categories (:issue:`36530`) - -