Skip to content

Commit 082023e

Browse files
authored
BUG: incorrect casting in DataFrame.append (#39454)
1 parent a9c803c commit 082023e

File tree

5 files changed

+40
-21
lines changed

5 files changed

+40
-21
lines changed

doc/source/whatsnew/v1.3.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -411,7 +411,7 @@ Reshaping
411411
- Bug in :meth:`DataFrame.join` not assigning values correctly when having :class:`MultiIndex` where at least one dimension is from dtype ``Categorical`` with non-alphabetically sorted categories (:issue:`38502`)
412412
- :meth:`Series.value_counts` and :meth:`Series.mode` return consistent keys in original order (:issue:`12679`, :issue:`11227` and :issue:`39007`)
413413
- Bug in :meth:`DataFrame.apply` giving incorrect results when used with a string argument and ``axis=1`` in cases where the axis argument is not supported; it now raises a ``ValueError`` instead (:issue:`39211`)
414-
-
414+
- Bug in :meth:`DataFrame.append` returning incorrect dtypes with combinations of ``ExtensionDtype`` dtypes (:issue:`39454`)
415415

416416
Sparse
417417
^^^^^^

pandas/core/internals/concat.py

+20-12
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from pandas._typing import ArrayLike, DtypeObj, Manager, Shape
1212
from pandas.util._decorators import cache_readonly
1313

14-
from pandas.core.dtypes.cast import maybe_promote
14+
from pandas.core.dtypes.cast import find_common_type, maybe_promote
1515
from pandas.core.dtypes.common import (
1616
get_dtype,
1717
is_categorical_dtype,
@@ -394,7 +394,11 @@ def _get_empty_dtype_and_na(join_units: Sequence[JoinUnit]) -> Tuple[DtypeObj, A
394394
if _is_uniform_reindex(join_units):
395395
# FIXME: integrate property
396396
empty_dtype = join_units[0].block.dtype
397-
upcasted_na = join_units[0].block.fill_value
397+
if is_extension_array_dtype(empty_dtype):
398+
# for dt64tz we need this to get NaT instead of np.datetime64("NaT")
399+
upcasted_na = empty_dtype.na_value
400+
else:
401+
upcasted_na = join_units[0].block.fill_value
398402
return empty_dtype, upcasted_na
399403

400404
has_none_blocks = False
@@ -405,25 +409,29 @@ def _get_empty_dtype_and_na(join_units: Sequence[JoinUnit]) -> Tuple[DtypeObj, A
405409
else:
406410
dtypes[i] = unit.dtype
407411

412+
filtered_dtypes = [
413+
unit.dtype for unit in join_units if unit.block is not None and not unit.is_na
414+
]
415+
if not len(filtered_dtypes):
416+
filtered_dtypes = [unit.dtype for unit in join_units if unit.block is not None]
417+
dtype_alt = find_common_type(filtered_dtypes)
418+
408419
upcast_classes = _get_upcast_classes(join_units, dtypes)
409420

421+
if is_extension_array_dtype(dtype_alt):
422+
return dtype_alt, dtype_alt.na_value
423+
elif dtype_alt == object:
424+
return dtype_alt, np.nan
425+
410426
# TODO: de-duplicate with maybe_promote?
411427
# create the result
412428
if "extension" in upcast_classes:
413-
if len(upcast_classes) == 1:
414-
cls = upcast_classes["extension"][0]
415-
return cls, cls.na_value
416-
else:
417-
return np.dtype("object"), np.nan
418-
elif "object" in upcast_classes:
419-
return np.dtype(np.object_), np.nan
429+
return np.dtype("object"), np.nan
420430
elif "bool" in upcast_classes:
421431
if has_none_blocks:
422432
return np.dtype(np.object_), np.nan
423433
else:
424434
return np.dtype(np.bool_), None
425-
elif "category" in upcast_classes:
426-
return np.dtype(np.object_), np.nan
427435
elif "datetimetz" in upcast_classes:
428436
# GH-25014. We use NaT instead of iNaT, since this eventually
429437
# ends up in DatetimeArray.take, which does not allow iNaT.
@@ -481,7 +489,7 @@ def _get_upcast_classes(
481489
def _select_upcast_cls_from_dtype(dtype: DtypeObj) -> str:
482490
"""Select upcast class name based on dtype."""
483491
if is_categorical_dtype(dtype):
484-
return "category"
492+
return "extension"
485493
elif is_datetime64tz_dtype(dtype):
486494
return "datetimetz"
487495
elif is_extension_array_dtype(dtype):

pandas/tests/extension/test_categorical.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -117,9 +117,7 @@ class TestConstructors(base.BaseConstructorsTests):
117117

118118

119119
class TestReshaping(base.BaseReshapingTests):
120-
@pytest.mark.xfail(reason="Deliberately upcast to object?")
121-
def test_concat_with_reindex(self, data):
122-
super().test_concat_with_reindex(data)
120+
pass
123121

124122

125123
class TestGetitem(base.BaseGetitemTests):

pandas/tests/reshape/concat/test_append.py

+17-5
Original file line numberDiff line numberDiff line change
@@ -365,13 +365,25 @@ def test_append_empty_tz_frame_with_datetime64ns(self):
365365

366366
# pd.NaT gets inferred as tz-naive, so the append result is cast to object
367367
result = df.append({"a": pd.NaT}, ignore_index=True)
368-
expected = DataFrame({"a": [pd.NaT]}).astype("datetime64[ns]")
368+
expected = DataFrame({"a": [pd.NaT]}).astype(object)
369369
tm.assert_frame_equal(result, expected)
370370

371371
# also test with typed value to append
372372
df = DataFrame(columns=["a"]).astype("datetime64[ns, UTC]")
373-
result = df.append(
374-
Series({"a": pd.NaT}, dtype="datetime64[ns]"), ignore_index=True
375-
)
376-
expected = DataFrame({"a": [pd.NaT]}).astype("datetime64[ns]")
373+
other = Series({"a": pd.NaT}, dtype="datetime64[ns]")
374+
result = df.append(other, ignore_index=True)
375+
expected = DataFrame({"a": [pd.NaT]}).astype(object)
376+
tm.assert_frame_equal(result, expected)
377+
378+
@pytest.mark.parametrize(
379+
"dtype_str", ["datetime64[ns, UTC]", "datetime64[ns]", "Int64", "int64"]
380+
)
381+
def test_append_empty_frame_with_timedelta64ns_nat(self, dtype_str):
382+
# https://github.com/pandas-dev/pandas/issues/35460
383+
df = DataFrame(columns=["a"]).astype(dtype_str)
384+
385+
other = DataFrame({"a": [np.timedelta64("NaT", "ns")]})
386+
result = df.append(other, ignore_index=True)
387+
388+
expected = other.astype(object)
377389
tm.assert_frame_equal(result, expected)

pandas/tests/reshape/concat/test_categorical.py

+1
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ def test_categorical_concat(self, sort):
4242
"h": [None] * 6 + cat_values,
4343
}
4444
)
45+
exp["h"] = exp["h"].astype(df2["h"].dtype)
4546
tm.assert_frame_equal(res, exp)
4647

4748
def test_categorical_concat_dtypes(self):

0 commit comments

Comments
 (0)