Skip to content

Commit 8a5f291

Browse files
authored
BUG: 27453 right merge order (#31278)
1 parent 218cc30 commit 8a5f291

File tree

3 files changed

+112
-14
lines changed

3 files changed

+112
-14
lines changed

doc/source/whatsnew/v1.1.0.rst

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,32 @@ key and type of :class:`Index`. These now consistently raise ``KeyError`` (:iss
168168
...
169169
KeyError: Timestamp('1970-01-01 00:00:00')
170170
171+
:meth:`DataFrame.merge` preserves right frame's row order
172+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
173+
:meth:`DataFrame.merge` now preserves the right frame's row order when executing a right merge (:issue:`27453`)
174+
175+
.. ipython:: python
176+
177+
left_df = pd.DataFrame({'animal': ['dog', 'pig'], 'max_speed': [40, 11]})
178+
right_df = pd.DataFrame({'animal': ['quetzal', 'pig'], 'max_speed': [80, 11]})
179+
left_df
180+
right_df
181+
182+
*Previous behavior*:
183+
184+
.. code-block:: python
185+
186+
>>> left_df.merge(right_df, on=['animal', 'max_speed'], how="right")
187+
animal max_speed
188+
0 pig 11
189+
1 quetzal 80
190+
191+
*New behavior*:
192+
193+
.. ipython:: python
194+
195+
left_df.merge(right_df, on=['animal', 'max_speed'], how="right")
196+
171197
.. ---------------------------------------------------------------------------
172198
173199
.. _whatsnew_110.api_breaking.assignment_to_multiple_columns:

pandas/core/reshape/merge.py

Lines changed: 64 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,14 @@
66
import datetime
77
from functools import partial
88
import string
9-
from typing import TYPE_CHECKING, Optional, Tuple, Union
9+
from typing import TYPE_CHECKING, Optional, Tuple, Union, cast
1010
import warnings
1111

1212
import numpy as np
1313

1414
from pandas._libs import Timedelta, hashtable as libhashtable, lib
1515
import pandas._libs.join as libjoin
16-
from pandas._typing import FrameOrSeries
16+
from pandas._typing import ArrayLike, FrameOrSeries
1717
from pandas.errors import MergeError
1818
from pandas.util._decorators import Appender, Substitution
1919

@@ -24,6 +24,7 @@
2424
is_array_like,
2525
is_bool,
2626
is_bool_dtype,
27+
is_categorical,
2728
is_categorical_dtype,
2829
is_datetime64tz_dtype,
2930
is_dtype_equal,
@@ -1271,7 +1272,7 @@ def _get_join_indexers(
12711272

12721273
# get left & right join labels and num. of levels at each location
12731274
mapped = (
1274-
_factorize_keys(left_keys[n], right_keys[n], sort=sort)
1275+
_factorize_keys(left_keys[n], right_keys[n], sort=sort, how=how)
12751276
for n in range(len(left_keys))
12761277
)
12771278
zipped = zip(*mapped)
@@ -1283,8 +1284,8 @@ def _get_join_indexers(
12831284
# factorize keys to a dense i8 space
12841285
# `count` is the num. of unique keys
12851286
# set(lkey) | set(rkey) == range(count)
1286-
lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort)
12871287

1288+
lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort, how=how)
12881289
# preserve left frame order if how == 'left' and sort == False
12891290
kwargs = copy.copy(kwargs)
12901291
if how == "left":
@@ -1822,7 +1823,59 @@ def _right_outer_join(x, y, max_groups):
18221823
return left_indexer, right_indexer
18231824

18241825

1825-
def _factorize_keys(lk, rk, sort=True):
1826+
def _factorize_keys(
1827+
lk: ArrayLike, rk: ArrayLike, sort: bool = True, how: str = "inner"
1828+
) -> Tuple[np.array, np.array, int]:
1829+
"""
1830+
Encode left and right keys as enumerated types.
1831+
1832+
This is used to get the join indexers to be used when merging DataFrames.
1833+
1834+
Parameters
1835+
----------
1836+
lk : array-like
1837+
Left key.
1838+
rk : array-like
1839+
Right key.
1840+
sort : bool, default True
1841+
If True, the encoding is done such that the unique elements in the
1842+
keys are sorted.
1843+
how : {'left', 'right', 'outer', 'inner'}, default 'inner'
1844+
Type of merge.
1845+
1846+
Returns
1847+
-------
1848+
array
1849+
Left (resp. right if called with `how='right'`) labels, as enumerated type.
1850+
array
1851+
Right (resp. left if called with `how='right'`) labels, as enumerated type.
1852+
int
1853+
Number of unique elements in union of left and right labels.
1854+
1855+
See Also
1856+
--------
1857+
merge : Merge DataFrame or named Series objects
1858+
with a database-style join.
1859+
algorithms.factorize : Encode the object as an enumerated type
1860+
or categorical variable.
1861+
1862+
Examples
1863+
--------
1864+
>>> lk = np.array(["a", "c", "b"])
1865+
>>> rk = np.array(["a", "c"])
1866+
1867+
Here, the unique values are `'a', 'b', 'c'`. With the default
1868+
`sort=True`, the encoding will be `{0: 'a', 1: 'b', 2: 'c'}`:
1869+
1870+
>>> pd.core.reshape.merge._factorize_keys(lk, rk)
1871+
(array([0, 2, 1]), array([0, 2]), 3)
1872+
1873+
With `sort=False`, the encoding will correspond to the order
1874+
in which the unique elements first appear: `{0: 'a', 1: 'c', 2: 'b'}`:
1875+
1876+
>>> pd.core.reshape.merge._factorize_keys(lk, rk, sort=False)
1877+
(array([0, 1, 2]), array([0, 1]), 3)
1878+
"""
18261879
# Some pre-processing for non-ndarray lk / rk
18271880
lk = extract_array(lk, extract_numpy=True)
18281881
rk = extract_array(rk, extract_numpy=True)
@@ -1834,8 +1887,11 @@ def _factorize_keys(lk, rk, sort=True):
18341887
rk, _ = rk._values_for_factorize()
18351888

18361889
elif (
1837-
is_categorical_dtype(lk) and is_categorical_dtype(rk) and lk.is_dtype_equal(rk)
1890+
is_categorical_dtype(lk) and is_categorical_dtype(rk) and is_dtype_equal(lk, rk)
18381891
):
1892+
assert is_categorical(lk) and is_categorical(rk)
1893+
lk = cast(Categorical, lk)
1894+
rk = cast(Categorical, rk)
18391895
if lk.categories.equals(rk.categories):
18401896
# if we exactly match in categories, allow us to factorize on codes
18411897
rk = rk.codes
@@ -1892,6 +1948,8 @@ def _factorize_keys(lk, rk, sort=True):
18921948
np.putmask(rlab, rmask, count)
18931949
count += 1
18941950

1951+
if how == "right":
1952+
return rlab, llab, count
18951953
return llab, rlab, count
18961954

18971955

pandas/tests/reshape/merge/test_merge.py

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1286,17 +1286,17 @@ def test_merge_on_index_with_more_values(self, how, index, expected_index):
12861286
# GH 24212
12871287
# pd.merge gets [0, 1, 2, -1, -1, -1] as left_indexer, ensure that
12881288
# -1 is interpreted as a missing value instead of the last element
1289-
df1 = pd.DataFrame({"a": [1, 2, 3], "key": [0, 2, 2]}, index=index)
1290-
df2 = pd.DataFrame({"b": [1, 2, 3, 4, 5]})
1289+
df1 = pd.DataFrame({"a": [0, 1, 2], "key": [0, 1, 2]}, index=index)
1290+
df2 = pd.DataFrame({"b": [0, 1, 2, 3, 4, 5]})
12911291
result = df1.merge(df2, left_on="key", right_index=True, how=how)
12921292
expected = pd.DataFrame(
12931293
[
1294-
[1.0, 0, 1],
1295-
[2.0, 2, 3],
1296-
[3.0, 2, 3],
1297-
[np.nan, 1, 2],
1298-
[np.nan, 3, 4],
1299-
[np.nan, 4, 5],
1294+
[0, 0, 0],
1295+
[1, 1, 1],
1296+
[2, 2, 2],
1297+
[np.nan, 3, 3],
1298+
[np.nan, 4, 4],
1299+
[np.nan, 5, 5],
13001300
],
13011301
columns=["a", "key", "b"],
13021302
)
@@ -1318,6 +1318,20 @@ def test_merge_right_index_right(self):
13181318
result = left.merge(right, left_on="key", right_index=True, how="right")
13191319
tm.assert_frame_equal(result, expected)
13201320

1321+
@pytest.mark.parametrize("how", ["left", "right"])
1322+
def test_merge_preserves_row_order(self, how):
1323+
# GH 27453
1324+
left_df = pd.DataFrame({"animal": ["dog", "pig"], "max_speed": [40, 11]})
1325+
right_df = pd.DataFrame({"animal": ["quetzal", "pig"], "max_speed": [80, 11]})
1326+
result = left_df.merge(right_df, on=["animal", "max_speed"], how=how)
1327+
if how == "right":
1328+
expected = pd.DataFrame(
1329+
{"animal": ["quetzal", "pig"], "max_speed": [80, 11]}
1330+
)
1331+
else:
1332+
expected = pd.DataFrame({"animal": ["dog", "pig"], "max_speed": [40, 11]})
1333+
tm.assert_frame_equal(result, expected)
1334+
13211335
def test_merge_take_missing_values_from_index_of_other_dtype(self):
13221336
# GH 24212
13231337
left = pd.DataFrame(

0 commit comments

Comments
 (0)