diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index ec6ad38bbc7cf..a4b32e453d9d2 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -300,6 +300,30 @@ New repr for :class:`~pandas.arrays.IntervalArray` pd.arrays.IntervalArray.from_tuples([(0, 1), (2, 3)]) +:meth:`DataFrame.merge` preserves right frame's row order +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +:meth:`DataFrame.merge` now preserves the right frame's row order when executing a right merge (:issue:`27453`) + +.. code-block:: python + + left_df = pd.DataFrame({"colors": ["blue", "red"]}, index=pd.Index([0, 1])) + right_df = pd.DataFrame({"hats": ["small", "big"]}, index=pd.Index([1, 0])) + left_df + right_df + +*pandas 0.25.x* + +.. code-block:: python + + left_df.merge(right_df, left_index=True, right_index=True, how="right") + +*pandas 1.0.0* + +.. code-block:: python + + left_df.merge(right_df, left_index=True, right_index=True, how="right") + + ``DataFrame.rename`` now only accepts one positional argument ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index ceee2f66dba42..3ed86c7dd51f4 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -568,10 +568,10 @@ def __init__( indicator: bool = False, validate=None, ): - _left = _validate_operand(left) - _right = _validate_operand(right) - self.left = self.orig_left = _left - self.right = self.orig_right = _right + left = validate_operand(left) + right = validate_operand(right) + self.left = self.orig_left = left + self.right = self.orig_right = right self.how = how self.axis = axis @@ -1295,6 +1295,9 @@ def _get_join_indexers( right_keys ), "left_key and right_keys must be the same length" + # bind `sort` arg. of _factorize_keys + fkeys = partial(_factorize_keys, sort=sort) + # get left & right join labels and num. 
of levels at each location mapped = ( _factorize_keys(left_keys[n], right_keys[n], sort=sort) @@ -1309,15 +1312,20 @@ def _get_join_indexers( # factorize keys to a dense i8 space # `count` is the num. of unique keys # set(lkey) | set(rkey) == range(count) - lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort) + # flip left and right keys if performing a right merge + # to preserve right merge row order (GH 27453) + if how == "right": + factorized_rkey, factorized_lkey, count = fkeys(rkey, lkey) + else: + factorized_lkey, factorized_rkey, count = fkeys(lkey, rkey) # preserve left frame order if how == 'left' and sort == False kwargs = copy.copy(kwargs) if how == "left": kwargs["sort"] = sort join_func = _join_functions[how] - return join_func(lkey, rkey, count, **kwargs) + return join_func(factorized_lkey, factorized_rkey, count, **kwargs) def _restore_dropped_levels_multijoin( diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 30c440035d48e..7254ca14947b3 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1288,17 +1288,17 @@ def test_merge_on_index_with_more_values(self, how, index, expected_index): # GH 24212 # pd.merge gets [0, 1, 2, -1, -1, -1] as left_indexer, ensure that # -1 is interpreted as a missing value instead of the last element - df1 = pd.DataFrame({"a": [1, 2, 3], "key": [0, 2, 2]}, index=index) - df2 = pd.DataFrame({"b": [1, 2, 3, 4, 5]}) + df1 = pd.DataFrame({"a": [0, 1, 2], "key": [0, 1, 2]}, index=index) + df2 = pd.DataFrame({"b": [0, 1, 2, 3, 4, 5]}) result = df1.merge(df2, left_on="key", right_index=True, how=how) expected = pd.DataFrame( [ - [1.0, 0, 1], - [2.0, 2, 3], - [3.0, 2, 3], - [np.nan, 1, 2], - [np.nan, 3, 4], - [np.nan, 4, 5], + [0, 0, 0], + [1, 1, 1], + [2, 2, 2], + [np.nan, 3, 3], + [np.nan, 4, 4], + [np.nan, 5, 5], ], columns=["a", "key", "b"], ) @@ -2152,3 +2152,35 @@ def test_merge_multiindex_columns(): expected["id"] 
= "" tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("how", ["left", "right"]) +def test_merge_preserves_row_order(how): + # GH 27453: the frame named by `how` must keep its row order in the result + population = [ + ("Jenn", "Jamaica", 3), + ("Beth", "Bulgaria", 7), + ("Carl", "Canada", 30), + ] + columns = ["name", "country", "population"] + population_df = pd.DataFrame(population, columns=columns) + + people = [("Abe", "America"), ("Beth", "Bulgaria"), ("Carl", "Canada")] + columns = ["name", "country"] + people_df = pd.DataFrame(people, columns=columns) + + expected_data = [ + ("Abe", "America", np.nan), + ("Beth", "Bulgaria", 7), + ("Carl", "Canada", 30), + ] + expected_cols = ["name", "country", "population"] + expected = pd.DataFrame(expected_data, columns=expected_cols) + + if how == "right": + left_df, right_df = population_df, people_df + elif how == "left": + left_df, right_df = people_df, population_df + + result = left_df.merge(right_df, on=["name", "country"], how=how) + tm.assert_frame_equal(result, expected)