Skip to content

Commit 93bd97f

Browse files
committed
Add kwarg to Series/(Multi)Index, incl. tests/asv; review; refactor
1 parent f82e3f8 commit 93bd97f

17 files changed

+954
-269
lines changed

asv_bench/benchmarks/frame_methods.py

+20-6
Original file line numberDiff line numberDiff line change
@@ -412,21 +412,35 @@ def time_frame_nunique(self):
412412
class Duplicated(object):
413413

414414
goal_time = 0.2
415+
params = (['first', 'last', False], [True, False])
416+
param_names = ['keep', 'return_inverse']
417+
418+
def setup(self, keep, return_inverse):
419+
if keep is False and return_inverse:
420+
raise NotImplementedError
415421

416-
def setup(self):
417422
n = (1 << 20)
418423
t = date_range('2015-01-01', freq='S', periods=(n // 64))
419424
xs = np.random.randn(n // 64).round(2)
420425
self.df = DataFrame({'a': np.random.randint(-1 << 8, 1 << 8, n),
421426
'b': np.random.choice(t, n),
422427
'c': np.random.choice(xs, n)})
423-
self.df2 = DataFrame(np.random.randn(1000, 100).astype(str)).T
428+
# df2 will not have any duplicates
429+
self.df2 = DataFrame(np.random.randn(100, 1000).astype(str))
430+
431+
df3 = DataFrame(np.random.randint(0, 10, (2 ** 18, 5)),
432+
columns=list('ABCDE'))
433+
df3.loc[:, 'F'] = Series('', index=df3.index).str.cat(df3.astype(str))
434+
self.df3 = df3
435+
436+
def time_frame_duplicated(self, keep, return_inverse):
437+
self.df.duplicated(keep=keep, return_inverse=return_inverse)
424438

425-
def time_frame_duplicated(self):
426-
self.df.duplicated()
439+
def time_frame_duplicated_wide(self, keep, return_inverse):
440+
self.df2.duplicated(keep=keep, return_inverse=return_inverse)
427441

428-
def time_frame_duplicated_wide(self):
429-
self.df2.duplicated()
442+
def time_frame_duplicated_mixed(self, keep, return_inverse):
443+
self.df3.duplicated(keep=keep, return_inverse=return_inverse)
430444

431445

432446
class XS(object):

asv_bench/benchmarks/index_object.py

+18
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,24 @@ def time_modulo(self, dtype):
8484
self.index % 2
8585

8686

87+
class Duplicated(object):
88+
89+
goal_time = 0.2
90+
params = (['first', 'last', False], [True, False])
91+
param_names = ['keep', 'return_inverse']
92+
93+
def setup(self, keep, return_inverse):
94+
if keep is False and return_inverse:
95+
raise NotImplementedError
96+
97+
n, k = 200, 1000
98+
base = tm.makeStringIndex(n)
99+
self.idx = Index(base[np.random.choice(n, k * n)])
100+
101+
def time_duplicated(self, keep, return_inverse):
102+
self.idx.duplicated(keep=keep, return_inverse=return_inverse)
103+
104+
87105
class Range(object):
88106

89107
goal_time = 0.2

asv_bench/benchmarks/multiindex_object.py

+9-4
Original file line numberDiff line numberDiff line change
@@ -83,17 +83,22 @@ def time_is_monotonic(self):
8383
class Duplicated(object):
8484

8585
goal_time = 0.2
86+
params = (['first', 'last', False], [True, False])
87+
param_names = ['keep', 'return_inverse']
8688

87-
def setup(self):
88-
n, k = 200, 5000
89+
def setup(self, keep, return_inverse):
90+
if keep is False and return_inverse:
91+
raise NotImplementedError
92+
93+
n, k = 200, 1000
8994
levels = [np.arange(n),
9095
tm.makeStringIndex(n).values,
9196
1000 + np.arange(n)]
9297
labels = [np.random.choice(n, (k * n)) for lev in levels]
9398
self.mi = MultiIndex(levels=levels, labels=labels)
9499

95-
def time_duplicated(self):
96-
self.mi.duplicated()
100+
def time_duplicated(self, keep, return_inverse):
101+
self.mi.duplicated(keep=keep, return_inverse=return_inverse)
97102

98103

99104
class Sortlevel(object):

asv_bench/benchmarks/series_methods.py

+18
Original file line numberDiff line numberDiff line change
@@ -134,3 +134,21 @@ def setup(self):
134134

135135
def time_series_datetimeindex_repr(self):
136136
getattr(self.s, 'a', None)
137+
138+
139+
class Duplicated(object):
140+
141+
goal_time = 0.2
142+
params = (['first', 'last', False], [True, False])
143+
param_names = ['keep', 'return_inverse']
144+
145+
def setup(self, keep, return_inverse):
146+
if keep is False and return_inverse:
147+
raise NotImplementedError
148+
149+
n, k = 200, 1000
150+
base = tm.makeStringIndex(n)
151+
self.s = Series(base[np.random.choice(n, k * n)])
152+
153+
def time_series_duplicated(self, keep, return_inverse):
154+
self.s.duplicated(keep=keep, return_inverse=return_inverse)

doc/source/whatsnew/v0.24.0.txt

+50-2
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,6 @@ v0.24.0
88
New features
99
~~~~~~~~~~~~
1010

11-
- ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling append to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`)
12-
1311
.. _whatsnew_0240.enhancements.extension_array_operators:
1412

1513
``ExtensionArray`` operator support
@@ -66,10 +64,60 @@ Current Behavior:
6664

6765
result
6866

67+
.. _whatsnew_0240.enhancements.duplicated_inverse:
68+
69+
The ``duplicated``-method has gained the ``return_inverse`` kwarg
70+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
71+
72+
The ``duplicated`` method for ``Series``, ``DataFrame`` and all flavours of ``Index`` has gained a ``return_inverse`` keyword,
73+
which is False by default. Specifying ``return_inverse=True`` will add an object to the output (which therefore becomes a tuple)
74+
that allows reconstructing the original object from the deduplicated, unique subset.
75+
76+
For ``Index`` objects, the inverse is an ``np.ndarray``:
77+
78+
.. ipython:: python
79+
80+
idx = pd.Index(['a', 'b', 'b', 'c', 'a'])
81+
idx.has_duplicates
82+
isduplicate, inverse = idx.duplicated(return_inverse=True) # default: keep='first'
83+
isduplicate
84+
inverse
85+
86+
This allows reconstructing the original ``Index`` as follows:
87+
88+
.. ipython:: python
89+
90+
unique = idx[~isduplicate] # same as idx.drop_duplicates()
91+
unique
92+
93+
reconstruct = unique[inverse]
94+
reconstruct.equals(idx)
95+
96+
For ``DataFrame`` and ``Series`` the inverse needs to take into account the original index as well, and is therefore a ``Series``,
97+
which contains the mapping from the index of the deduplicated, unique subset back to the original index.
98+
99+
.. ipython:: python
100+
101+
df = pd.DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']},
102+
index=[1, 4, 9, 16, 25])
103+
df
104+
isduplicate, inverse = df.duplicated(keep='last', return_inverse=True)
105+
isduplicate
106+
inverse
107+
108+
unique = df.loc[~isduplicate] # same as df.drop_duplicates(keep='last')
109+
unique
110+
reconstruct = unique.reindex(inverse.values).set_index(inverse.index)
111+
reconstruct.equals(df)
112+
113+
The keyword works as expected for ``keep='first'|'last'``, but cannot be used together with ``keep=False`` (since discarding all duplicates makes it impossible
114+
to construct an inverse).
115+
69116
.. _whatsnew_0240.enhancements.other:
70117

71118
Other Enhancements
72119
^^^^^^^^^^^^^^^^^^
120+
- ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling append to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`)
73121
- :func:`to_datetime` now supports the ``%Z`` and ``%z`` directive when passed into ``format`` (:issue:`13486`)
74122
- :func:`Series.mode` and :func:`DataFrame.mode` now support the ``dropna`` parameter which can be used to specify whether NaN/NaT values should be considered (:issue:`17534`)
75123
- :func:`to_csv` now supports ``compression`` keyword when a file handle is passed. (:issue:`21227`)

pandas/core/algorithms.py

+73-4
Original file line numberDiff line numberDiff line change
@@ -771,7 +771,8 @@ def _value_counts_arraylike(values, dropna):
771771
return keys, counts
772772

773773

774-
def duplicated(values, keep='first'):
774+
def duplicated(values, keep='first', return_index=False, return_inverse=False,
775+
stabilize=True):
775776
"""
776777
Return boolean ndarray denoting duplicate values.
777778
@@ -786,16 +787,84 @@ def duplicated(values, keep='first'):
786787
occurrence.
787788
- ``last`` : Mark duplicates as ``True`` except for the last
788789
occurrence.
789-
- False : Mark all duplicates as ``True``.
790+
- False : Mark all duplicates as ``True``. This option is not
791+
compatible with ``return_index`` or ``return_inverse``.
792+
return_index : boolean, default False
793+
If True, also return the (array of) integer indices for the unique
794+
elements within values.
795+
796+
.. versionadded:: 0.24.0
797+
return_inverse : boolean, default False
798+
If True, also return the indices of the unique array that can be used
799+
to reconstruct values.
800+
801+
.. versionadded:: 0.24.0
802+
stabilize : boolean, default True
803+
This keyword is only relevant if index and/or inverse are returned. If
804+
True (the default), index and inverse are guaranteed to match
805+
the order of `values`. In case that index and inverse are not needed
806+
separately, but combined right away, this sorting process is
807+
unnecessary and can be disabled for improved performance by setting
808+
`stabilize=False`.
809+
810+
.. versionadded:: 0.24.0
790811
791812
Returns
792813
-------
793-
duplicated : ndarray
814+
duplicated : ndarray or tuple of ndarray
815+
np.ndarray if both `return_index` and `return_inverse` are False.
816+
Otherwise, tuple of ndarray.
794817
"""
795818

819+
if (return_index or return_inverse) and keep is False:
820+
raise ValueError("The parameters return_inverse=True and "
821+
"keep=False cannot be used together (impossible "
822+
"to calculate an inverse when discarding all "
823+
"instances of a duplicate).")
824+
796825
values, dtype, ndtype = _ensure_data(values)
797826
f = getattr(htable, "duplicated_{dtype}".format(dtype=ndtype))
798-
return f(values, keep=keep)
827+
isdup = f(values, keep=keep)
828+
if not (return_index or return_inverse):
829+
return isdup
830+
elif not isdup.any():
831+
# no need to calculate inverse if no duplicates
832+
inv = np.array(range(len(values)))
833+
return (isdup,) + (inv,) * return_index + (inv,) * return_inverse
834+
835+
if keep == 'first':
836+
# ind: original indices to indices of ARRAY of unique values
837+
# inv: reduplication from array of unique values to original array
838+
# this fits together in the way that values[ind] are the unique values
839+
# and values[ind][inv] == values
840+
_, ind, inv = np.unique(values, return_index=True,
841+
return_inverse=True)
842+
elif keep == 'last':
843+
# np.unique takes first occurrence as unique value,
844+
# so we flip values that first becomes last
845+
values = values[::-1]
846+
_, ind, inv = np.unique(values, return_index=True,
847+
return_inverse=True)
848+
# the values in "values" correspond(ed) to the index of "values",
849+
# which is simply np.array(range(len(values))).
850+
# By flipping "values" around, we need to do the same for the index,
851+
# ___because ind and inv are relative to that order___.
852+
# Finally, to fit with the original order again, we need to flip the
853+
# result around one last time.
854+
ind, inv = np.array(range(len(values)))[::-1][ind], inv[::-1]
855+
856+
if stabilize:
857+
# np.unique yields a __sorted__ list of uniques, and the index/inverse
858+
# are relative to this order. To restore the original order, we argsort
859+
# the returned index (corresponding to the mapping from values to
860+
# sorted, which is the wrong way around for us), and invert this
861+
# mapping once more (corresponding to the mapping from sorted back to
862+
# values), which is again done by argsorting.
863+
undo_sort = np.argsort(np.argsort(ind))
864+
ind, inv = ind[undo_sort], undo_sort[inv]
865+
866+
res = (isdup,) + (ind,) * return_index + (inv,) * return_inverse
867+
return res
799868

800869

801870
def mode(values, dropna=True):

pandas/core/base.py

+29-4
Original file line numberDiff line numberDiff line change
@@ -1242,16 +1242,41 @@ def drop_duplicates(self, keep='first', inplace=False):
12421242
else:
12431243
return result
12441244

1245-
def duplicated(self, keep='first'):
1245+
def duplicated(self, keep='first', return_inverse=False):
12461246
from pandas.core.algorithms import duplicated
1247+
1248+
if return_inverse and keep is False:
1249+
raise ValueError("The parameters return_inverse=True and "
1250+
"keep=False cannot be used together (impossible "
1251+
"to calculate an inverse when discarding all "
1252+
"instances of a duplicate).")
1253+
12471254
if isinstance(self, ABCIndexClass):
12481255
if self.is_unique:
1249-
return np.zeros(len(self), dtype=np.bool)
1250-
return duplicated(self, keep=keep)
1251-
else:
1256+
isdup = np.zeros(len(self), dtype=np.bool)
1257+
if not return_inverse:
1258+
return isdup
1259+
return isdup, np.array(range(len(self)))
1260+
# algorithms.duplicated has the same output signature as
1261+
# Index.duplicated -> no need to distinguish cases here
1262+
return duplicated(self, keep=keep, return_inverse=return_inverse)
1263+
1264+
# Series case
1265+
if not return_inverse:
12521266
return self._constructor(duplicated(self, keep=keep),
12531267
index=self.index).__finalize__(self)
12541268

1269+
# return_inverse = True
1270+
isdup_array, ind_array, inv_array = duplicated(self, keep=keep,
1271+
return_index=True,
1272+
return_inverse=True,
1273+
stabilize=False)
1274+
isdup = self._constructor(isdup_array,
1275+
index=self.index).__finalize__(self)
1276+
inv = self._constructor(self.index[ind_array][inv_array],
1277+
index=self.index)
1278+
return isdup, inv
1279+
12551280
# ----------------------------------------------------------------------
12561281
# abstracts
12571282

0 commit comments

Comments
 (0)