From 6534e6ea92ea225f6c7a6de1cdca99169cb91173 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 6 Sep 2023 07:08:44 -0400 Subject: [PATCH] BUG: merge with left and/or right empty returning mis-ordered columns --- doc/source/whatsnew/v2.2.0.rst | 2 +- pandas/core/reshape/merge.py | 7 +--- pandas/tests/reshape/merge/test_merge.py | 47 +++++++++++++++++------- 3 files changed, 35 insertions(+), 21 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index bd15d5fa085e9..4f38d420a53b4 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -246,7 +246,7 @@ Groupby/resample/rolling Reshaping ^^^^^^^^^ -- +- Bug in :func:`merge` returning columns in incorrect order when left and/or right is empty (:issue:`51929`) - Sparse diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 5b07a0010acdd..6d1ff07e07c76 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1272,12 +1272,7 @@ def _get_merge_keys( # work-around for merge_asof(right_index=True) right_keys.append(right.index._values) if lk is not None and lk == rk: # FIXME: what about other NAs? - # avoid key upcast in corner case (length-0) - lk = cast(Hashable, lk) - if len(left) > 0: - right_drop.append(rk) - else: - left_drop.append(lk) + right_drop.append(rk) else: rk = cast(ArrayLike, rk) right_keys.append(rk) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 4659c16909ed7..37ccfddfc82cd 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -582,11 +582,11 @@ def test_merge_empty_frame(self, series_of_dtype, series_of_dtype2): df_empty = df[:0] expected = DataFrame( { - "value_x": Series(dtype=df.dtypes["value"]), "key": Series(dtype=df.dtypes["key"]), + "value_x": Series(dtype=df.dtypes["value"]), "value_y": Series(dtype=df.dtypes["value"]), }, - columns=["value_x", "key", "value_y"], + columns=["key", "value_x", "value_y"], ) actual = df_empty.merge(df, on="key") tm.assert_frame_equal(actual, expected) @@ -889,13 +889,13 @@ def test_merge_on_datetime64tz_empty(self): result = left.merge(right, on="date") expected = DataFrame( { + "date": Series(dtype=dtz), "value_x": Series(dtype=float), "date2_x": Series(dtype=dtz), - "date": Series(dtype=dtz), "value_y": Series(dtype=float), "date2_y": Series(dtype=dtz), }, - columns=["value_x", "date2_x", "date", "value_y", "date2_y"], + columns=["date", "value_x", "date2_x", "value_y", "date2_y"], ) tm.assert_frame_equal(result, expected) @@ -1827,11 +1827,9 @@ def test_merge_empty(self, left_empty, how, exp): if exp == "left": expected = DataFrame({"A": [2, 1], "B": [3, 4], "C": [np.nan, np.nan]}) elif exp == "right": - expected = DataFrame({"B": [np.nan], "A": [1], "C": [5]}) + expected = DataFrame({"A": [1], "B": [np.nan], "C": [5]}) elif exp == "empty": expected = DataFrame(columns=["A", "B", "C"], dtype="int64") - if left_empty: - expected = expected[["B", "A", "C"]] elif exp == "empty_cross": expected = DataFrame(columns=["A_x", "B", "A_y", "C"], dtype="int64") @@ -2481,14 +2479,12 @@ def test_merge_multiindex_columns(): result = frame_x.merge(frame_y, on="id", suffixes=((l_suf, r_suf))) # Constructing the expected results - expected_labels = [letter + l_suf for letter in letters] + [ - letter + r_suf for letter in letters - ] - expected_index = MultiIndex.from_product( - [expected_labels, numbers], names=["outer", "inner"] - ) + tuples = [(letter + l_suf, num) for letter in letters for num in numbers] + tuples += [("id", "")] + tuples += [(letter + r_suf, num) for letter in letters for num in numbers] + + expected_index = MultiIndex.from_tuples(tuples, names=["outer", "inner"]) expected = DataFrame(columns=expected_index) - expected["id"] = "" tm.assert_frame_equal(result, expected) @@ -2959,3 +2955,26 @@ def test_merge_arrow_string_index(any_string_dtype): {"a": Series(["a", "b"], dtype=any_string_dtype), "b": [1, np.nan]} ) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("left_empty", [True, False]) +@pytest.mark.parametrize("right_empty", [True, False]) +def test_merge_empty_frames_column_order(left_empty, right_empty): + # GH 51929 + df1 = DataFrame(1, index=[0], columns=["A", "B"]) + df2 = DataFrame(1, index=[0], columns=["A", "C", "D"]) + + if left_empty: + df1 = df1.iloc[:0] + if right_empty: + df2 = df2.iloc[:0] + + result = merge(df1, df2, on=["A"], how="outer") + expected = DataFrame(1, index=[0], columns=["A", "B", "C", "D"]) + if left_empty and right_empty: + expected = expected.iloc[:0] + elif left_empty: + expected.loc[:, "B"] = np.nan + elif right_empty: + expected.loc[:, ["C", "D"]] = np.nan + tm.assert_frame_equal(result, expected)