6
6
import datetime
7
7
from functools import partial
8
8
import string
9
- from typing import TYPE_CHECKING , Optional , Tuple , Union
9
+ from typing import TYPE_CHECKING , Optional , Tuple , Union , cast
10
10
import warnings
11
11
12
12
import numpy as np
13
13
14
14
from pandas ._libs import Timedelta , hashtable as libhashtable , lib
15
15
import pandas ._libs .join as libjoin
16
- from pandas ._typing import FrameOrSeries
16
+ from pandas ._typing import ArrayLike , FrameOrSeries
17
17
from pandas .errors import MergeError
18
18
from pandas .util ._decorators import Appender , Substitution
19
19
24
24
is_array_like ,
25
25
is_bool ,
26
26
is_bool_dtype ,
27
+ is_categorical ,
27
28
is_categorical_dtype ,
28
29
is_datetime64tz_dtype ,
29
30
is_dtype_equal ,
@@ -1271,7 +1272,7 @@ def _get_join_indexers(
1271
1272
1272
1273
# get left & right join labels and num. of levels at each location
1273
1274
mapped = (
1274
- _factorize_keys (left_keys [n ], right_keys [n ], sort = sort )
1275
+ _factorize_keys (left_keys [n ], right_keys [n ], sort = sort , how = how )
1275
1276
for n in range (len (left_keys ))
1276
1277
)
1277
1278
zipped = zip (* mapped )
@@ -1283,8 +1284,8 @@ def _get_join_indexers(
1283
1284
# factorize keys to a dense i8 space
1284
1285
# `count` is the num. of unique keys
1285
1286
# set(lkey) | set(rkey) == range(count)
1286
- lkey , rkey , count = _factorize_keys (lkey , rkey , sort = sort )
1287
1287
1288
+ lkey , rkey , count = _factorize_keys (lkey , rkey , sort = sort , how = how )
1288
1289
# preserve left frame order if how == 'left' and sort == False
1289
1290
kwargs = copy .copy (kwargs )
1290
1291
if how == "left" :
@@ -1822,7 +1823,59 @@ def _right_outer_join(x, y, max_groups):
1822
1823
return left_indexer , right_indexer
1823
1824
1824
1825
1825
- def _factorize_keys (lk , rk , sort = True ):
1826
+ def _factorize_keys (
1827
+ lk : ArrayLike , rk : ArrayLike , sort : bool = True , how : str = "inner"
1828
+ ) -> Tuple [np .array , np .array , int ]:
1829
+ """
1830
+ Encode left and right keys as enumerated types.
1831
+
1832
+ This is used to get the join indexers to be used when merging DataFrames.
1833
+
1834
+ Parameters
1835
+ ----------
1836
+ lk : array-like
1837
+ Left key.
1838
+ rk : array-like
1839
+ Right key.
1840
+ sort : bool, default True
1841
+ If True, the encoding is done such that the unique elements in the
1842
+ keys are sorted.
1843
+ how : {'left', 'right', 'outer', 'inner'}, default 'inner'
1844
+ Type of merge.
1845
+
1846
+ Returns
1847
+ -------
1848
+ array
1849
+ Left (resp. right if called with `how='right'`) labels, as enumerated type.
1850
+ array
1851
+ Right (resp. left if called with `how='right'`) labels, as enumerated type.
1852
+ int
1853
+ Number of unique elements in union of left and right labels.
1854
+
1855
+ See Also
1856
+ --------
1857
+ merge : Merge DataFrame or named Series objects
1858
+ with a database-style join.
1859
+ algorithms.factorize : Encode the object as an enumerated type
1860
+ or categorical variable.
1861
+
1862
+ Examples
1863
+ --------
1864
+ >>> lk = np.array(["a", "c", "b"])
1865
+ >>> rk = np.array(["a", "c"])
1866
+
1867
+ Here, the unique values are `'a', 'b', 'c'`. With the default
1868
+ `sort=True`, the encoding will be `{0: 'a', 1: 'b', 2: 'c'}`:
1869
+
1870
+ >>> pd.core.reshape.merge._factorize_keys(lk, rk)
1871
+ (array([0, 2, 1]), array([0, 2]), 3)
1872
+
1873
+ With `sort=False`, the encoding will correspond to the order
1874
+ in which the unique elements first appear: `{0: 'a', 1: 'c', 2: 'b'}`:
1875
+
1876
+ >>> pd.core.reshape.merge._factorize_keys(lk, rk, sort=False)
1877
+ (array([0, 1, 2]), array([0, 1]), 3)
1878
+ """
1826
1879
# Some pre-processing for non-ndarray lk / rk
1827
1880
lk = extract_array (lk , extract_numpy = True )
1828
1881
rk = extract_array (rk , extract_numpy = True )
@@ -1834,8 +1887,11 @@ def _factorize_keys(lk, rk, sort=True):
1834
1887
rk , _ = rk ._values_for_factorize ()
1835
1888
1836
1889
elif (
1837
- is_categorical_dtype (lk ) and is_categorical_dtype (rk ) and lk . is_dtype_equal (rk )
1890
+ is_categorical_dtype (lk ) and is_categorical_dtype (rk ) and is_dtype_equal (lk , rk )
1838
1891
):
1892
+ assert is_categorical (lk ) and is_categorical (rk )
1893
+ lk = cast (Categorical , lk )
1894
+ rk = cast (Categorical , rk )
1839
1895
if lk .categories .equals (rk .categories ):
1840
1896
# if we exactly match in categories, allow us to factorize on codes
1841
1897
rk = rk .codes
@@ -1892,6 +1948,8 @@ def _factorize_keys(lk, rk, sort=True):
1892
1948
np .putmask (rlab , rmask , count )
1893
1949
count += 1
1894
1950
1951
+ if how == "right" :
1952
+ return rlab , llab , count
1895
1953
return llab , rlab , count
1896
1954
1897
1955
0 commit comments