BUG: 27453 right merge order (#31278)

MarcoGorelli · web-flow · commit 8a5f2917e163 · 2020-03-26T08:45:37.000-04:00
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -168,6 +168,32 @@ key and type of :class:`Index`.  These now consistently raise ``KeyError`` (:iss
     ...
     KeyError: Timestamp('1970-01-01 00:00:00')
 
+:meth:`DataFrame.merge` preserves right frame's row order
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+:meth:`DataFrame.merge` now preserves right frame's row order when executing a right merge (:issue:`27453`)
+
+.. ipython:: python
+
+    left_df = pd.DataFrame({'animal': ['dog', 'pig'], 'max_speed': [40, 11]})
+    right_df = pd.DataFrame({'animal': ['quetzal', 'pig'], 'max_speed': [80, 11]})
+    left_df
+    right_df
+
+*Previous behavior*:
+
+.. code-block:: python
+
+    >>> left_df.merge(right_df, on=['animal', 'max_speed'], how="right")
+        animal  max_speed
+    0      pig         11
+    1  quetzal         80
+
+*New behavior*:
+
+.. ipython:: python
+
+    left_df.merge(right_df, on=['animal', 'max_speed'], how="right")
+
 .. ---------------------------------------------------------------------------
 
 .. _whatsnew_110.api_breaking.assignment_to_multiple_columns:
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
@@ -6,14 +6,14 @@
 import datetime
 from functools import partial
 import string
-from typing import TYPE_CHECKING, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Optional, Tuple, Union, cast
 import warnings
 
 import numpy as np
 
 from pandas._libs import Timedelta, hashtable as libhashtable, lib
 import pandas._libs.join as libjoin
-from pandas._typing import FrameOrSeries
+from pandas._typing import ArrayLike, FrameOrSeries
 from pandas.errors import MergeError
 from pandas.util._decorators import Appender, Substitution
 
@@ -24,6 +24,7 @@
     is_array_like,
     is_bool,
     is_bool_dtype,
+    is_categorical,
     is_categorical_dtype,
     is_datetime64tz_dtype,
     is_dtype_equal,
@@ -1271,7 +1272,7 @@ def _get_join_indexers(
 
     # get left & right join labels and num. of levels at each location
     mapped = (
-        _factorize_keys(left_keys[n], right_keys[n], sort=sort)
+        _factorize_keys(left_keys[n], right_keys[n], sort=sort, how=how)
         for n in range(len(left_keys))
     )
     zipped = zip(*mapped)
@@ -1283,8 +1284,8 @@ def _get_join_indexers(
     # factorize keys to a dense i8 space
     # `count` is the num. of unique keys
     # set(lkey) | set(rkey) == range(count)
-    lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort)
 
+    lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort, how=how)
     # preserve left frame order if how == 'left' and sort == False
     kwargs = copy.copy(kwargs)
     if how == "left":
@@ -1822,7 +1823,59 @@ def _right_outer_join(x, y, max_groups):
     return left_indexer, right_indexer
 
 
-def _factorize_keys(lk, rk, sort=True):
+def _factorize_keys(
+    lk: ArrayLike, rk: ArrayLike, sort: bool = True, how: str = "inner"
+) -> Tuple[np.array, np.array, int]:
+    """
+    Encode left and right keys as enumerated types.
+
+    This is used to get the join indexers to be used when merging DataFrames.
+
+    Parameters
+    ----------
+    lk : array-like
+        Left key.
+    rk : array-like
+        Right key.
+    sort : bool, defaults to True
+        If True, the encoding is done such that the unique elements in the
+        keys are sorted.
+    how : {‘left’, ‘right’, ‘outer’, ‘inner’}, default ‘inner’
+        Type of merge.
+
+    Returns
+    -------
+    array
+        Left (resp. right if called with `key='right'`) labels, as enumerated type.
+    array
+        Right (resp. left if called with `key='right'`) labels, as enumerated type.
+    int
+        Number of unique elements in union of left and right labels.
+
+    See Also
+    --------
+    merge : Merge DataFrame or named Series objects
+        with a database-style join.
+    algorithms.factorize : Encode the object as an enumerated type
+        or categorical variable.
+
+    Examples
+    --------
+    >>> lk = np.array(["a", "c", "b"])
+    >>> rk = np.array(["a", "c"])
+
+    Here, the unique values are `'a', 'b', 'c'`. With the default
+    `sort=True`, the encoding will be `{0: 'a', 1: 'b', 2: 'c'}`:
+
+    >>> pd.core.reshape.merge._factorize_keys(lk, rk)
+    (array([0, 2, 1]), array([0, 2]), 3)
+
+    With the `sort=False`, the encoding will correspond to the order
+    in which the unique elements first appear: `{0: 'a', 1: 'c', 2: 'b'}`:
+
+    >>> pd.core.reshape.merge._factorize_keys(lk, rk, sort=False)
+    (array([0, 1, 2]), array([0, 1]), 3)
+    """
     # Some pre-processing for non-ndarray lk / rk
     lk = extract_array(lk, extract_numpy=True)
     rk = extract_array(rk, extract_numpy=True)
@@ -1834,8 +1887,11 @@ def _factorize_keys(lk, rk, sort=True):
         rk, _ = rk._values_for_factorize()
 
     elif (
-        is_categorical_dtype(lk) and is_categorical_dtype(rk) and lk.is_dtype_equal(rk)
+        is_categorical_dtype(lk) and is_categorical_dtype(rk) and is_dtype_equal(lk, rk)
     ):
+        assert is_categorical(lk) and is_categorical(rk)
+        lk = cast(Categorical, lk)
+        rk = cast(Categorical, rk)
         if lk.categories.equals(rk.categories):
             # if we exactly match in categories, allow us to factorize on codes
             rk = rk.codes
@@ -1892,6 +1948,8 @@ def _factorize_keys(lk, rk, sort=True):
             np.putmask(rlab, rmask, count)
         count += 1
 
+    if how == "right":
+        return rlab, llab, count
     return llab, rlab, count
 
 
diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
@@ -1286,17 +1286,17 @@ def test_merge_on_index_with_more_values(self, how, index, expected_index):
         # GH 24212
         # pd.merge gets [0, 1, 2, -1, -1, -1] as left_indexer, ensure that
         # -1 is interpreted as a missing value instead of the last element
-        df1 = pd.DataFrame({"a": [1, 2, 3], "key": [0, 2, 2]}, index=index)
-        df2 = pd.DataFrame({"b": [1, 2, 3, 4, 5]})
+        df1 = pd.DataFrame({"a": [0, 1, 2], "key": [0, 1, 2]}, index=index)
+        df2 = pd.DataFrame({"b": [0, 1, 2, 3, 4, 5]})
         result = df1.merge(df2, left_on="key", right_index=True, how=how)
         expected = pd.DataFrame(
             [
-                [1.0, 0, 1],
-                [2.0, 2, 3],
-                [3.0, 2, 3],
-                [np.nan, 1, 2],
-                [np.nan, 3, 4],
-                [np.nan, 4, 5],
+                [0, 0, 0],
+                [1, 1, 1],
+                [2, 2, 2],
+                [np.nan, 3, 3],
+                [np.nan, 4, 4],
+                [np.nan, 5, 5],
             ],
             columns=["a", "key", "b"],
         )
@@ -1318,6 +1318,20 @@ def test_merge_right_index_right(self):
         result = left.merge(right, left_on="key", right_index=True, how="right")
         tm.assert_frame_equal(result, expected)
 
+    @pytest.mark.parametrize("how", ["left", "right"])
+    def test_merge_preserves_row_order(self, how):
+        # GH 27453
+        left_df = pd.DataFrame({"animal": ["dog", "pig"], "max_speed": [40, 11]})
+        right_df = pd.DataFrame({"animal": ["quetzal", "pig"], "max_speed": [80, 11]})
+        result = left_df.merge(right_df, on=["animal", "max_speed"], how=how)
+        if how == "right":
+            expected = pd.DataFrame(
+                {"animal": ["quetzal", "pig"], "max_speed": [80, 11]}
+            )
+        else:
+            expected = pd.DataFrame({"animal": ["dog", "pig"], "max_speed": [40, 11]})
+        tm.assert_frame_equal(result, expected)
+
     def test_merge_take_missing_values_from_index_of_other_dtype(self):
         # GH 24212
         left = pd.DataFrame(