Skip to content

Commit 6abb38b

Browse files
lukemanley authored and cbpygit committed
PERF: merge on monotonic keys (pandas-dev#56523)
1 parent 0ef3469 commit 6abb38b

File tree

6 files changed

+146
-65
lines changed

6 files changed

+146
-65
lines changed

asv_bench/benchmarks/join_merge.py

Lines changed: 24 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -275,18 +275,21 @@ def time_merge_dataframes_cross(self, sort):
275275

276276
class MergeEA:
277277
params = [
278-
"Int64",
279-
"Int32",
280-
"Int16",
281-
"UInt64",
282-
"UInt32",
283-
"UInt16",
284-
"Float64",
285-
"Float32",
278+
[
279+
"Int64",
280+
"Int32",
281+
"Int16",
282+
"UInt64",
283+
"UInt32",
284+
"UInt16",
285+
"Float64",
286+
"Float32",
287+
],
288+
[True, False],
286289
]
287-
param_names = ["dtype"]
290+
param_names = ["dtype", "monotonic"]
288291

289-
def setup(self, dtype):
292+
def setup(self, dtype, monotonic):
290293
N = 10_000
291294
indices = np.arange(1, N)
292295
key = np.tile(indices[:8000], 10)
@@ -299,8 +302,11 @@ def setup(self, dtype):
299302
"value2": np.random.randn(7999),
300303
}
301304
)
305+
if monotonic:
306+
self.left = self.left.sort_values("key")
307+
self.right = self.right.sort_values("key")
302308

303-
def time_merge(self, dtype):
309+
def time_merge(self, dtype, monotonic):
304310
merge(self.left, self.right)
305311

306312

@@ -330,10 +336,11 @@ class MergeDatetime:
330336
("ns", "ms"),
331337
],
332338
[None, "Europe/Brussels"],
339+
[True, False],
333340
]
334-
param_names = ["units", "tz"]
341+
param_names = ["units", "tz", "monotonic"]
335342

336-
def setup(self, units, tz):
343+
def setup(self, units, tz, monotonic):
337344
unit_left, unit_right = units
338345
N = 10_000
339346
keys = Series(date_range("2012-01-01", freq="min", periods=N, tz=tz))
@@ -349,8 +356,11 @@ def setup(self, units, tz):
349356
"value2": np.random.randn(8000),
350357
}
351358
)
359+
if monotonic:
360+
self.left = self.left.sort_values("key")
361+
self.right = self.right.sort_values("key")
352362

353-
def time_merge(self, units, tz):
363+
def time_merge(self, units, tz, monotonic):
354364
merge(self.left, self.right)
355365

356366

doc/source/whatsnew/v2.2.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -498,6 +498,7 @@ Performance improvements
498498
- Performance improvement in :func:`.testing.assert_frame_equal` and :func:`.testing.assert_series_equal` (:issue:`55949`, :issue:`55971`)
499499
- Performance improvement in :func:`concat` with ``axis=1`` and objects with unaligned indexes (:issue:`55084`)
500500
- Performance improvement in :func:`get_dummies` (:issue:`56089`)
501+
- Performance improvement in :func:`merge` and :func:`merge_ordered` when joining on sorted ascending keys (:issue:`56115`)
501502
- Performance improvement in :func:`merge_asof` when ``by`` is not ``None`` (:issue:`55580`, :issue:`55678`)
502503
- Performance improvement in :func:`read_stata` for files with many variables (:issue:`55515`)
503504
- Performance improvement in :meth:`DataFrame.groupby` when aggregating pyarrow timestamp and duration dtypes (:issue:`55031`)

pandas/core/groupby/generic.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -850,7 +850,8 @@ def value_counts(
850850
_, idx = get_join_indexers(
851851
left, right, sort=False, how="left" # type: ignore[arg-type]
852852
)
853-
out = np.where(idx != -1, out[idx], 0)
853+
if idx is not None:
854+
out = np.where(idx != -1, out[idx], 0)
854855

855856
if sort:
856857
sorter = np.lexsort((out if ascending else -out, left[0]))

pandas/core/indexes/base.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4783,13 +4783,13 @@ def _join_multi(self, other: Index, how: JoinHow):
47834783
def _join_non_unique(
47844784
self, other: Index, how: JoinHow = "left", sort: bool = False
47854785
) -> tuple[Index, npt.NDArray[np.intp], npt.NDArray[np.intp]]:
4786-
from pandas.core.reshape.merge import get_join_indexers
4786+
from pandas.core.reshape.merge import get_join_indexers_non_unique
47874787

47884788
# We only get here if dtypes match
47894789
assert self.dtype == other.dtype
47904790

4791-
left_idx, right_idx = get_join_indexers(
4792-
[self._values], [other._values], how=how, sort=sort
4791+
left_idx, right_idx = get_join_indexers_non_unique(
4792+
self._values, other._values, how=how, sort=sort
47934793
)
47944794
mask = left_idx == -1
47954795

pandas/core/reshape/merge.py

Lines changed: 114 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1019,10 +1019,14 @@ def _maybe_add_join_keys(
10191019
take_left, take_right = None, None
10201020

10211021
if name in result:
1022-
if left_indexer is not None and right_indexer is not None:
1022+
if left_indexer is not None or right_indexer is not None:
10231023
if name in self.left:
10241024
if left_has_missing is None:
1025-
left_has_missing = (left_indexer == -1).any()
1025+
left_has_missing = (
1026+
False
1027+
if left_indexer is None
1028+
else (left_indexer == -1).any()
1029+
)
10261030

10271031
if left_has_missing:
10281032
take_right = self.right_join_keys[i]
@@ -1032,21 +1036,27 @@ def _maybe_add_join_keys(
10321036

10331037
elif name in self.right:
10341038
if right_has_missing is None:
1035-
right_has_missing = (right_indexer == -1).any()
1039+
right_has_missing = (
1040+
False
1041+
if right_indexer is None
1042+
else (right_indexer == -1).any()
1043+
)
10361044

10371045
if right_has_missing:
10381046
take_left = self.left_join_keys[i]
10391047

10401048
if result[name].dtype != self.right[name].dtype:
10411049
take_right = self.right[name]._values
10421050

1043-
elif left_indexer is not None:
1051+
else:
10441052
take_left = self.left_join_keys[i]
10451053
take_right = self.right_join_keys[i]
10461054

10471055
if take_left is not None or take_right is not None:
10481056
if take_left is None:
10491057
lvals = result[name]._values
1058+
elif left_indexer is None:
1059+
lvals = take_left
10501060
else:
10511061
# TODO: can we pin down take_left's type earlier?
10521062
take_left = extract_array(take_left, extract_numpy=True)
@@ -1055,6 +1065,8 @@ def _maybe_add_join_keys(
10551065

10561066
if take_right is None:
10571067
rvals = result[name]._values
1068+
elif right_indexer is None:
1069+
rvals = take_right
10581070
else:
10591071
# TODO: can we pin down take_right's type earlier?
10601072
taker = extract_array(take_right, extract_numpy=True)
@@ -1063,16 +1075,17 @@ def _maybe_add_join_keys(
10631075

10641076
# if we have an all missing left_indexer
10651077
# make sure to just use the right values or vice-versa
1066-
mask_left = left_indexer == -1
1067-
# error: Item "bool" of "Union[Any, bool]" has no attribute "all"
1068-
if mask_left.all(): # type: ignore[union-attr]
1078+
if left_indexer is not None and (left_indexer == -1).all():
10691079
key_col = Index(rvals)
10701080
result_dtype = rvals.dtype
10711081
elif right_indexer is not None and (right_indexer == -1).all():
10721082
key_col = Index(lvals)
10731083
result_dtype = lvals.dtype
10741084
else:
1075-
key_col = Index(lvals).where(~mask_left, rvals)
1085+
key_col = Index(lvals)
1086+
if left_indexer is not None:
1087+
mask_left = left_indexer == -1
1088+
key_col = key_col.where(~mask_left, rvals)
10761089
result_dtype = find_common_type([lvals.dtype, rvals.dtype])
10771090
if (
10781091
lvals.dtype.kind == "M"
@@ -1103,7 +1116,9 @@ def _maybe_add_join_keys(
11031116
else:
11041117
result.insert(i, name or f"key_{i}", key_col)
11051118

1106-
def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
1119+
def _get_join_indexers(
1120+
self,
1121+
) -> tuple[npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
11071122
"""return the join indexers"""
11081123
# make mypy happy
11091124
assert self.how != "asof"
@@ -1143,6 +1158,8 @@ def _get_join_info(
11431158
left_indexer,
11441159
how="right",
11451160
)
1161+
elif right_indexer is None:
1162+
join_index = right_ax.copy()
11461163
else:
11471164
join_index = right_ax.take(right_indexer)
11481165
elif self.left_index:
@@ -1162,10 +1179,13 @@ def _get_join_info(
11621179
right_indexer,
11631180
how="left",
11641181
)
1182+
elif left_indexer is None:
1183+
join_index = left_ax.copy()
11651184
else:
11661185
join_index = left_ax.take(left_indexer)
11671186
else:
1168-
join_index = default_index(len(left_indexer))
1187+
n = len(left_ax) if left_indexer is None else len(left_indexer)
1188+
join_index = default_index(n)
11691189

11701190
return join_index, left_indexer, right_indexer
11711191

@@ -1174,17 +1194,20 @@ def _create_join_index(
11741194
self,
11751195
index: Index,
11761196
other_index: Index,
1177-
indexer: npt.NDArray[np.intp],
1197+
indexer: npt.NDArray[np.intp] | None,
11781198
how: JoinHow = "left",
11791199
) -> Index:
11801200
"""
11811201
Create a join index by rearranging one index to match another
11821202
11831203
Parameters
11841204
----------
1185-
index : Index being rearranged
1186-
other_index : Index used to supply values not found in index
1187-
indexer : np.ndarray[np.intp] how to rearrange index
1205+
index : Index
1206+
index being rearranged
1207+
other_index : Index
1208+
used to supply values not found in index
1209+
indexer : np.ndarray[np.intp] or None
1210+
how to rearrange index
11881211
how : str
11891212
Replacement is only necessary if indexer based on other_index.
11901213
@@ -1202,6 +1225,8 @@ def _create_join_index(
12021225
if np.any(mask):
12031226
fill_value = na_value_for_dtype(index.dtype, compat=False)
12041227
index = index.append(Index([fill_value]))
1228+
if indexer is None:
1229+
return index.copy()
12051230
return index.take(indexer)
12061231

12071232
@final
@@ -1660,7 +1685,7 @@ def get_join_indexers(
16601685
right_keys: list[ArrayLike],
16611686
sort: bool = False,
16621687
how: JoinHow = "inner",
1663-
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
1688+
) -> tuple[npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
16641689
"""
16651690
16661691
Parameters
@@ -1672,9 +1697,9 @@ def get_join_indexers(
16721697
16731698
Returns
16741699
-------
1675-
np.ndarray[np.intp]
1700+
np.ndarray[np.intp] or None
16761701
Indexer into the left_keys.
1677-
np.ndarray[np.intp]
1702+
np.ndarray[np.intp] or None
16781703
Indexer into the right_keys.
16791704
"""
16801705
assert len(left_keys) == len(
@@ -1695,37 +1720,77 @@ def get_join_indexers(
16951720
elif not sort and how in ["left", "outer"]:
16961721
return _get_no_sort_one_missing_indexer(left_n, False)
16971722

1698-
# get left & right join labels and num. of levels at each location
1699-
mapped = (
1700-
_factorize_keys(left_keys[n], right_keys[n], sort=sort)
1701-
for n in range(len(left_keys))
1702-
)
1703-
zipped = zip(*mapped)
1704-
llab, rlab, shape = (list(x) for x in zipped)
1723+
lkey: ArrayLike
1724+
rkey: ArrayLike
1725+
if len(left_keys) > 1:
1726+
# get left & right join labels and num. of levels at each location
1727+
mapped = (
1728+
_factorize_keys(left_keys[n], right_keys[n], sort=sort)
1729+
for n in range(len(left_keys))
1730+
)
1731+
zipped = zip(*mapped)
1732+
llab, rlab, shape = (list(x) for x in zipped)
17051733

1706-
# get flat i8 keys from label lists
1707-
lkey, rkey = _get_join_keys(llab, rlab, tuple(shape), sort)
1734+
# get flat i8 keys from label lists
1735+
lkey, rkey = _get_join_keys(llab, rlab, tuple(shape), sort)
1736+
else:
1737+
lkey = left_keys[0]
1738+
rkey = right_keys[0]
17081739

1709-
# factorize keys to a dense i8 space
1710-
# `count` is the num. of unique keys
1711-
# set(lkey) | set(rkey) == range(count)
1740+
left = Index(lkey)
1741+
right = Index(rkey)
17121742

1713-
lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort)
1714-
# preserve left frame order if how == 'left' and sort == False
1715-
kwargs = {}
1716-
if how in ("inner", "left", "right"):
1717-
kwargs["sort"] = sort
1718-
join_func = {
1719-
"inner": libjoin.inner_join,
1720-
"left": libjoin.left_outer_join,
1721-
"right": lambda x, y, count, **kwargs: libjoin.left_outer_join(
1722-
y, x, count, **kwargs
1723-
)[::-1],
1724-
"outer": libjoin.full_outer_join,
1725-
}[how]
1726-
1727-
# error: Cannot call function of unknown type
1728-
return join_func(lkey, rkey, count, **kwargs) # type: ignore[operator]
1743+
if (
1744+
left.is_monotonic_increasing
1745+
and right.is_monotonic_increasing
1746+
and (left.is_unique or right.is_unique)
1747+
):
1748+
_, lidx, ridx = left.join(right, how=how, return_indexers=True, sort=sort)
1749+
else:
1750+
lidx, ridx = get_join_indexers_non_unique(
1751+
left._values, right._values, sort, how
1752+
)
1753+
1754+
if lidx is not None and is_range_indexer(lidx, len(left)):
1755+
lidx = None
1756+
if ridx is not None and is_range_indexer(ridx, len(right)):
1757+
ridx = None
1758+
return lidx, ridx
1759+
1760+
1761+
def get_join_indexers_non_unique(
1762+
left: ArrayLike,
1763+
right: ArrayLike,
1764+
sort: bool = False,
1765+
how: JoinHow = "inner",
1766+
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
1767+
"""
1768+
Get join indexers for left and right.
1769+
1770+
Parameters
1771+
----------
1772+
left : ArrayLike
1773+
right : ArrayLike
1774+
sort : bool, default False
1775+
how : {'inner', 'outer', 'left', 'right'}, default 'inner'
1776+
1777+
Returns
1778+
-------
1779+
np.ndarray[np.intp]
1780+
Indexer into left.
1781+
np.ndarray[np.intp]
1782+
Indexer into right.
1783+
"""
1784+
lkey, rkey, count = _factorize_keys(left, right, sort=sort)
1785+
if how == "left":
1786+
lidx, ridx = libjoin.left_outer_join(lkey, rkey, count, sort=sort)
1787+
elif how == "right":
1788+
ridx, lidx = libjoin.left_outer_join(rkey, lkey, count, sort=sort)
1789+
elif how == "inner":
1790+
lidx, ridx = libjoin.inner_join(lkey, rkey, count, sort=sort)
1791+
elif how == "outer":
1792+
lidx, ridx = libjoin.full_outer_join(lkey, rkey, count)
1793+
return lidx, ridx
17291794

17301795

17311796
def restore_dropped_levels_multijoin(
@@ -1860,7 +1925,10 @@ def get_result(self, copy: bool | None = True) -> DataFrame:
18601925
left_indexer = cast("npt.NDArray[np.intp]", left_indexer)
18611926
right_indexer = cast("npt.NDArray[np.intp]", right_indexer)
18621927
left_join_indexer = libjoin.ffill_indexer(left_indexer)
1863-
right_join_indexer = libjoin.ffill_indexer(right_indexer)
1928+
if right_indexer is None:
1929+
right_join_indexer = None
1930+
else:
1931+
right_join_indexer = libjoin.ffill_indexer(right_indexer)
18641932
elif self.fill_method is None:
18651933
left_join_indexer = left_indexer
18661934
right_join_indexer = right_indexer

0 commit comments

Comments
 (0)