Skip to content

REF: Categorical.fillna match patterns in other methods #36530

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Sep 22, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,7 @@ Bug fixes

Categorical
^^^^^^^^^^^

- :meth:`Categorical.fillna` will always return a copy, will validate a passed fill value regardless of whether there are any NAs to fill, and will disallow a ``NaT`` as a fill value for numeric categories (:issue:`36530`)
-
-

Expand Down
43 changes: 13 additions & 30 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@
)
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
from pandas.core.dtypes.inference import is_hashable
from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna, notna

from pandas.core import ops
Expand Down Expand Up @@ -1213,7 +1212,7 @@ def _validate_fill_value(self, fill_value):
ValueError
"""

if isna(fill_value):
if is_valid_nat_for_dtype(fill_value, self.categories.dtype):
fill_value = -1
elif fill_value in self.categories:
fill_value = self._unbox_scalar(fill_value)
Expand Down Expand Up @@ -1636,6 +1635,7 @@ def fillna(self, value=None, method=None, limit=None):
value, method = validate_fillna_kwargs(
value, method, validate_scalar_dict_value=False
)
value = extract_array(value, extract_numpy=True)

if value is None:
value = np.nan
Expand All @@ -1644,10 +1644,8 @@ def fillna(self, value=None, method=None, limit=None):
"specifying a limit for fillna has not been implemented yet"
)

codes = self._codes

# pad / bfill
if method is not None:
# pad / bfill

# TODO: dispatch when self.categories is EA-dtype
values = np.asarray(self).reshape(-1, len(self))
Expand All @@ -1657,40 +1655,25 @@ def fillna(self, value=None, method=None, limit=None):
codes = _get_codes_for_values(values, self.categories)

else:
# We copy even if there is nothing to fill
codes = self._ndarray.copy()
mask = self.isna()

# If value is a dict or a Series (a dict value has already
# been converted to a Series)
if isinstance(value, (np.ndarray, Categorical, ABCSeries)):
if isinstance(value, (np.ndarray, Categorical)):
# We get ndarray or Categorical if called via Series.fillna,
# where it will unwrap another aligned Series before getting here

mask = ~algorithms.isin(value, self.categories)
if not isna(value[mask]).all():
not_categories = ~algorithms.isin(value, self.categories)
if not isna(value[not_categories]).all():
# All entries in `value` must either be a category or NA
raise ValueError("fill value must be in categories")

values_codes = _get_codes_for_values(value, self.categories)
indexer = np.where(codes == -1)
codes = codes.copy()
codes[indexer] = values_codes[indexer]

# If value is not a dict or Series it should be a scalar
elif is_hashable(value):
if not isna(value) and value not in self.categories:
raise ValueError("fill value must be in categories")

mask = codes == -1
if mask.any():
codes = codes.copy()
if isna(value):
codes[mask] = -1
else:
codes[mask] = self._unbox_scalar(value)
codes[mask] = values_codes[mask]

else:
raise TypeError(
f"'value' parameter must be a scalar, dict "
f"or Series, but you passed a {type(value).__name__}"
)
new_code = self._validate_fill_value(value)
codes[mask] = new_code

return self._from_backing_data(codes)

Expand Down
3 changes: 2 additions & 1 deletion pandas/tests/frame/test_missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,7 +362,8 @@ def test_na_actions_categorical(self):
res = df.fillna(value={"cats": 3, "vals": "b"})
tm.assert_frame_equal(res, df_exp_fill)

with pytest.raises(ValueError, match=("fill value must be in categories")):
msg = "'fill_value=4' is not present in this Categorical's categories"
with pytest.raises(ValueError, match=msg):
df.fillna(value={"cats": 4, "vals": "c"})

res = df.fillna(method="pad")
Expand Down
28 changes: 27 additions & 1 deletion pandas/tests/indexes/categorical/test_fillna.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,32 @@ def test_fillna_categorical(self):
tm.assert_index_equal(idx.fillna(1.0), exp)

# fill by value not in categories raises ValueError
msg = "fill value must be in categories"
msg = "'fill_value=2.0' is not present in this Categorical's categories"
with pytest.raises(ValueError, match=msg):
idx.fillna(2.0)

def test_fillna_copies_with_no_nas(self):
# Nothing to fill, should still get a copy
ci = CategoricalIndex([0, 1, 1])
cat = ci._data
result = ci.fillna(0)
assert result._values._ndarray is not cat._ndarray
assert result._values._ndarray.base is None

# Same check directly on the Categorical object
result = cat.fillna(0)
assert result._ndarray is not cat._ndarray
assert result._ndarray.base is None

def test_fillna_validates_with_no_nas(self):
# We validate the fill value even if fillna is a no-op
ci = CategoricalIndex([2, 3, 3])
cat = ci._data

msg = "'fill_value=False' is not present in this Categorical's categories"
with pytest.raises(ValueError, match=msg):
ci.fillna(False)

# Same check directly on the Categorical
with pytest.raises(ValueError, match=msg):
cat.fillna(False)
3 changes: 2 additions & 1 deletion pandas/tests/series/methods/test_fillna.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,8 @@ def test_fillna_categorical_raises(self):
data = ["a", np.nan, "b", np.nan, np.nan]
ser = Series(Categorical(data, categories=["a", "b"]))

with pytest.raises(ValueError, match="fill value must be in categories"):
msg = "'fill_value=d' is not present in this Categorical's categories"
with pytest.raises(ValueError, match=msg):
ser.fillna("d")

with pytest.raises(ValueError, match="fill value must be in categories"):
Expand Down