Skip to content

[backport 2.3.x] TST (string dtype): resolve xfails for frame fillna and replace tests + fix bug in replace for string (#60295) #60331

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
2 changes: 2 additions & 0 deletions pandas/core/array_algos/replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,4 +149,6 @@ def re_replacer(s):
if mask is None:
values[:] = f(values)
else:
if values.ndim != mask.ndim:
mask = np.broadcast_to(mask, values.shape)
values[mask] = f(values[mask])
25 changes: 20 additions & 5 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -929,7 +929,7 @@ def replace(
blocks = blk.convert(
copy=False,
using_cow=using_cow,
convert_string=convert_string or self.dtype != _dtype_obj,
convert_string=convert_string or self.dtype == "string",
)
if len(blocks) > 1 or blocks[0].dtype != blk.dtype:
warnings.warn(
Expand Down Expand Up @@ -987,7 +987,7 @@ def _replace_regex(
inplace: bool = False,
mask=None,
using_cow: bool = False,
convert_string: bool = True,
convert_string=None,
already_warned=None,
) -> list[Block]:
"""
Expand Down Expand Up @@ -1048,10 +1048,18 @@ def _replace_regex(
already_warned.warned_already = True

nbs = block.convert(
copy=False, using_cow=using_cow, convert_string=convert_string
copy=False,
using_cow=using_cow,
convert_string=convert_string or self.dtype == "string",
)
opt = get_option("future.no_silent_downcasting")
if (len(nbs) > 1 or nbs[0].dtype != block.dtype) and not opt:
if (
len(nbs) > 1
or (
nbs[0].dtype != block.dtype
and not (self.dtype == "string" and nbs[0].dtype == "string")
)
) and not opt:
warnings.warn(
# GH#54710
"Downcasting behavior in `replace` is deprecated and "
Expand Down Expand Up @@ -1088,7 +1096,7 @@ def replace_list(
values._replace(to_replace=src_list, value=dest_list, inplace=True)
return [blk]

convert_string = self.dtype != _dtype_obj
convert_string = self.dtype == "string"

# Exclude anything that we know we won't contain
pairs = [
Expand Down Expand Up @@ -2167,6 +2175,13 @@ def where(
if isinstance(self.dtype, (IntervalDtype, StringDtype)):
# TestSetitemFloatIntervalWithIntIntervalValues
blk = self.coerce_to_target_dtype(orig_other)
if (
self.ndim == 2
and isinstance(orig_cond, np.ndarray)
and orig_cond.ndim == 1
and not is_1d_only_ea_dtype(blk.dtype)
):
orig_cond = orig_cond[:, None]
nbs = blk.where(orig_other, orig_cond, using_cow=using_cow)
return self._maybe_downcast(
nbs, downcast=_downcast, using_cow=using_cow, caller="where"
Expand Down
23 changes: 6 additions & 17 deletions pandas/tests/frame/methods/test_fillna.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

import pandas.util._test_decorators as td

from pandas import (
Expand Down Expand Up @@ -91,8 +89,6 @@ def test_fillna_datetime(self, datetime_frame):
with pytest.raises(ValueError, match=msg):
datetime_frame.fillna(5, method="ffill")

# TODO(infer_string) test as actual error instead of xfail
@pytest.mark.xfail(using_string_dtype(), reason="can't fill 0 in string")
def test_fillna_mixed_type(self, float_string_frame):
mf = float_string_frame
mf.loc[mf.index[5:20], "foo"] = np.nan
Expand Down Expand Up @@ -126,7 +122,7 @@ def test_fillna_empty(self, using_copy_on_write):
df.x.fillna(method=m, inplace=True)
df.x.fillna(method=m)

def test_fillna_different_dtype(self, using_infer_string):
def test_fillna_different_dtype(self):
# with different dtype (GH#3386)
df = DataFrame(
[["a", "a", np.nan, "a"], ["b", "b", np.nan, "b"], ["c", "c", np.nan, "c"]]
Expand All @@ -136,6 +132,7 @@ def test_fillna_different_dtype(self, using_infer_string):
expected = DataFrame(
[["a", "a", "foo", "a"], ["b", "b", "foo", "b"], ["c", "c", "foo", "c"]]
)
# column is originally float (all-NaN) -> filling with string gives object dtype
expected[2] = expected[2].astype("object")
tm.assert_frame_equal(result, expected)

Expand Down Expand Up @@ -654,18 +651,10 @@ def test_fillna_col_reordering(self):
filled = df.fillna(method="ffill")
assert df.columns.tolist() == filled.columns.tolist()

# TODO(infer_string) test as actual error instead of xfail
@pytest.mark.xfail(using_string_dtype(), reason="can't fill 0 in string")
def test_fill_corner(self, float_frame, float_string_frame):
mf = float_string_frame
mf.loc[mf.index[5:20], "foo"] = np.nan
mf.loc[mf.index[-10:], "A"] = np.nan

filled = float_string_frame.fillna(value=0)
assert (filled.loc[filled.index[5:20], "foo"] == 0).all()
del float_string_frame["foo"]

float_frame.reindex(columns=[]).fillna(value=0)
def test_fill_empty(self, float_frame):
df = float_frame.reindex(columns=[])
result = df.fillna(value=0)
tm.assert_frame_equal(result, df)

def test_fillna_downcast_dict(self):
# GH#40809
Expand Down
81 changes: 43 additions & 38 deletions pandas/tests/frame/methods/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

import pandas as pd
from pandas import (
DataFrame,
Expand All @@ -30,7 +28,6 @@ def mix_abc() -> dict[str, list[float | str]]:


class TestDataFrameReplace:
@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
def test_replace_inplace(self, datetime_frame, float_string_frame):
datetime_frame.loc[datetime_frame.index[:5], "A"] = np.nan
datetime_frame.loc[datetime_frame.index[-5:], "A"] = np.nan
Expand All @@ -46,7 +43,9 @@ def test_replace_inplace(self, datetime_frame, float_string_frame):
mf.iloc[-10:, mf.columns.get_loc("A")] = np.nan

result = float_string_frame.replace(np.nan, 0)
expected = float_string_frame.fillna(value=0)
expected = float_string_frame.copy()
expected["foo"] = expected["foo"].astype(object)
expected = expected.fillna(value=0)
tm.assert_frame_equal(result, expected)

tsframe = datetime_frame.copy()
Expand Down Expand Up @@ -290,34 +289,39 @@ def test_regex_replace_dict_nested_non_first_character(self, any_string_dtype):
tm.assert_frame_equal(result, expected)

def test_regex_replace_dict_nested_gh4115(self):
df = DataFrame({"Type": ["Q", "T", "Q", "Q", "T"], "tmp": 2})
df = DataFrame(
{"Type": Series(["Q", "T", "Q", "Q", "T"], dtype=object), "tmp": 2}
)
expected = DataFrame({"Type": [0, 1, 0, 0, 1], "tmp": 2})
msg = "Downcasting behavior in `replace`"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df.replace({"Type": {"Q": 0, "T": 1}})

tm.assert_frame_equal(result, expected)

@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
def test_regex_replace_list_to_scalar(self, mix_abc):
def test_regex_replace_list_to_scalar(self, mix_abc, using_infer_string):
df = DataFrame(mix_abc)
expec = DataFrame(
{
"a": mix_abc["a"],
"b": np.array([np.nan] * 4),
"b": [np.nan] * 4,
"c": [np.nan, np.nan, np.nan, "d"],
}
)
if using_infer_string:
expec["b"] = expec["b"].astype("str")
msg = "Downcasting behavior in `replace`"
with tm.assert_produces_warning(FutureWarning, match=msg):
warn = None if using_infer_string else FutureWarning
with tm.assert_produces_warning(warn, match=msg):
res = df.replace([r"\s*\.\s*", "a|b"], np.nan, regex=True)
res2 = df.copy()
res3 = df.copy()
with tm.assert_produces_warning(FutureWarning, match=msg):
with tm.assert_produces_warning(warn, match=msg):
return_value = res2.replace(
[r"\s*\.\s*", "a|b"], np.nan, regex=True, inplace=True
)
assert return_value is None
with tm.assert_produces_warning(FutureWarning, match=msg):
with tm.assert_produces_warning(warn, match=msg):
return_value = res3.replace(
regex=[r"\s*\.\s*", "a|b"], value=np.nan, inplace=True
)
Expand All @@ -326,7 +330,6 @@ def test_regex_replace_list_to_scalar(self, mix_abc):
tm.assert_frame_equal(res2, expec)
tm.assert_frame_equal(res3, expec)

@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
def test_regex_replace_str_to_numeric(self, mix_abc):
# what happens when you try to replace a numeric value with a regex?
df = DataFrame(mix_abc)
Expand All @@ -342,7 +345,6 @@ def test_regex_replace_str_to_numeric(self, mix_abc):
tm.assert_frame_equal(res2, expec)
tm.assert_frame_equal(res3, expec)

@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
def test_regex_replace_regex_list_to_numeric(self, mix_abc):
df = DataFrame(mix_abc)
res = df.replace([r"\s*\.\s*", "b"], 0, regex=True)
Expand Down Expand Up @@ -539,21 +541,28 @@ def test_replace_convert(self):
res = rep.dtypes
tm.assert_series_equal(expec, res)

@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
def test_replace_mixed(self, float_string_frame):
mf = float_string_frame
mf.iloc[5:20, mf.columns.get_loc("foo")] = np.nan
mf.iloc[-10:, mf.columns.get_loc("A")] = np.nan

result = float_string_frame.replace(np.nan, -18)
expected = float_string_frame.fillna(value=-18)
expected = float_string_frame.copy()
expected["foo"] = expected["foo"].astype(object)
expected = expected.fillna(value=-18)
tm.assert_frame_equal(result, expected)
tm.assert_frame_equal(result.replace(-18, np.nan), float_string_frame)
expected2 = float_string_frame.copy()
expected2["foo"] = expected2["foo"].astype(object)
tm.assert_frame_equal(result.replace(-18, np.nan), expected2)

result = float_string_frame.replace(np.nan, -1e8)
expected = float_string_frame.fillna(value=-1e8)
expected = float_string_frame.copy()
expected["foo"] = expected["foo"].astype(object)
expected = expected.fillna(value=-1e8)
tm.assert_frame_equal(result, expected)
tm.assert_frame_equal(result.replace(-1e8, np.nan), float_string_frame)
expected2 = float_string_frame.copy()
expected2["foo"] = expected2["foo"].astype(object)
tm.assert_frame_equal(result.replace(-1e8, np.nan), expected2)

def test_replace_mixed_int_block_upcasting(self):
# int block upcasting
Expand Down Expand Up @@ -614,15 +623,11 @@ def test_replace_mixed2(self, using_infer_string):

expected = DataFrame(
{
"A": Series(["foo", "bar"]),
"A": Series(["foo", "bar"], dtype="object"),
"B": Series([0, "foo"], dtype="object"),
}
)
if using_infer_string:
with tm.assert_produces_warning(FutureWarning, match="Downcasting"):
result = df.replace([1, 2], ["foo", "bar"])
else:
result = df.replace([1, 2], ["foo", "bar"])
result = df.replace([1, 2], ["foo", "bar"])
tm.assert_frame_equal(result, expected)

def test_replace_mixed3(self):
Expand Down Expand Up @@ -931,15 +936,16 @@ def test_replace_limit(self):
# TODO
pass

def test_replace_dict_no_regex(self):
def test_replace_dict_no_regex(self, any_string_dtype):
answer = Series(
{
0: "Strongly Agree",
1: "Agree",
2: "Neutral",
3: "Disagree",
4: "Strongly Disagree",
}
},
dtype=any_string_dtype,
)
weights = {
"Agree": 4,
Expand All @@ -954,15 +960,16 @@ def test_replace_dict_no_regex(self):
result = answer.replace(weights)
tm.assert_series_equal(result, expected)

def test_replace_series_no_regex(self):
def test_replace_series_no_regex(self, any_string_dtype):
answer = Series(
{
0: "Strongly Agree",
1: "Agree",
2: "Neutral",
3: "Disagree",
4: "Strongly Disagree",
}
},
dtype=any_string_dtype,
)
weights = Series(
{
Expand Down Expand Up @@ -1060,16 +1067,15 @@ def test_nested_dict_overlapping_keys_replace_str(self):
expected = df.replace({"a": dict(zip(astr, bstr))})
tm.assert_frame_equal(result, expected)

@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
def test_replace_swapping_bug(self, using_infer_string):
def test_replace_swapping_bug(self):
df = DataFrame({"a": [True, False, True]})
res = df.replace({"a": {True: "Y", False: "N"}})
expect = DataFrame({"a": ["Y", "N", "Y"]})
expect = DataFrame({"a": ["Y", "N", "Y"]}, dtype=object)
tm.assert_frame_equal(res, expect)

df = DataFrame({"a": [0, 1, 0]})
res = df.replace({"a": {0: "Y", 1: "N"}})
expect = DataFrame({"a": ["Y", "N", "Y"]})
expect = DataFrame({"a": ["Y", "N", "Y"]}, dtype=object)
tm.assert_frame_equal(res, expect)

def test_replace_period(self):
Expand Down Expand Up @@ -1345,7 +1351,7 @@ def test_replace_commutative(self, df, to_replace, exp):
)
def test_replace_replacer_dtype(self, replacer):
# GH26632
df = DataFrame(["a"])
df = DataFrame(["a"], dtype=object)
msg = "Downcasting behavior in `replace` "
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df.replace({"a": replacer, "b": replacer})
Expand Down Expand Up @@ -1462,6 +1468,7 @@ def test_replace_value_category_type(self):
input_df = input_df.replace("obj1", "obj9")
result = input_df.replace("cat2", "catX")

result = result.astype({"col1": "int64", "col3": "float64", "col5": "str"})
tm.assert_frame_equal(result, expected)

def test_replace_dict_category_type(self):
Expand Down Expand Up @@ -1503,13 +1510,11 @@ def test_replace_with_compiled_regex(self):
expected = DataFrame(["z", "b", "c"])
tm.assert_frame_equal(result, expected)

def test_replace_intervals(self, using_infer_string):
def test_replace_intervals(self):
# https://github.com/pandas-dev/pandas/issues/35931
df = DataFrame({"a": [pd.Interval(0, 1), pd.Interval(0, 1)]})
warning = FutureWarning if using_infer_string else None
with tm.assert_produces_warning(warning, match="Downcasting"):
result = df.replace({"a": {pd.Interval(0, 1): "x"}})
expected = DataFrame({"a": ["x", "x"]})
result = df.replace({"a": {pd.Interval(0, 1): "x"}})
expected = DataFrame({"a": ["x", "x"]}, dtype=object)
tm.assert_frame_equal(result, expected)

def test_replace_unicode(self):
Expand Down
7 changes: 4 additions & 3 deletions pandas/tests/indexing/test_coercion.py
Original file line number Diff line number Diff line change
Expand Up @@ -856,7 +856,7 @@ def test_replace_series(self, how, to_key, from_key, replacer, using_infer_strin
else:
exp = pd.Series(self.rep[to_key], index=index, name="yyy")

if using_infer_string and exp.dtype == "string" and obj.dtype == object:
if using_infer_string and exp.dtype == "string":
# with infer_string, we disable the deprecated downcasting behavior
exp = exp.astype(object)

Expand Down Expand Up @@ -889,8 +889,9 @@ def test_replace_series_datetime_tz(
assert obj.dtype == from_key

exp = pd.Series(self.rep[to_key], index=index, name="yyy")
if using_infer_string and to_key == "object":
assert exp.dtype == "string"
if using_infer_string and exp.dtype == "string":
# with infer_string, we disable the deprecated downcasting behavior
exp = exp.astype(object)
else:
assert exp.dtype == to_key

Expand Down
Loading