From 09e2798e251865faa3b37db43d971bc462ff120c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 28 Aug 2022 19:38:35 +0200 Subject: [PATCH 1/4] PERF: Improve performance for midx unique --- asv_bench/benchmarks/multiindex_object.py | 29 +++++++++++++++++++++++ pandas/core/indexes/multi.py | 2 +- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py index a498c6b2e4944..ca1d71f50bd5f 100644 --- a/asv_bench/benchmarks/multiindex_object.py +++ b/asv_bench/benchmarks/multiindex_object.py @@ -3,9 +3,11 @@ import numpy as np from pandas import ( + NA, DataFrame, MultiIndex, RangeIndex, + Series, date_range, ) @@ -255,4 +257,31 @@ def time_operation(self, index_structure, dtype, method): getattr(self.left, method)(self.right) +class Unique: + params = [ + (("Int64", NA), ("int64", 0)), + ] + param_names = ["dtype_val"] + + def setup(self, dtype_val): + level = Series( + [1, 2, dtype_val[1], dtype_val[1]] + list(range(1_000_000)), + dtype=dtype_val[0], + ) + self.midx = MultiIndex.from_arrays([level, level]) + + level_dups = Series( + [1, 2, dtype_val[1], dtype_val[1]] + list(range(500_000)) * 2, + dtype=dtype_val[0], + ) + + self.midx_dups = MultiIndex.from_arrays([level_dups, level_dups]) + + def time_unique(self, dtype_val): + self.midx.unique() + + def time_unique_dups(self, dtype_val): + self.midx_dups.unique() + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 63c78b7002786..a7640bfdaee7c 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1727,7 +1727,7 @@ def get_level_values(self, level): def unique(self, level=None): if level is None: - return super().unique() + return self.drop_duplicates() else: level = self._get_level_number(level) return self._get_level_values(level=level, unique=True) From a88471f19a682d409b394f17d027fe7d60ee2ce0 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 31 Aug 2022 19:53:08 +0200 Subject: [PATCH 2/4] Add whatsnew and tests --- doc/source/whatsnew/v1.6.0.rst | 4 ++-- pandas/tests/indexes/multi/test_duplicates.py | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst index eac5e5d3a0f52..e4a54c9943cda 100644 --- a/doc/source/whatsnew/v1.6.0.rst +++ b/doc/source/whatsnew/v1.6.0.rst @@ -100,7 +100,7 @@ Deprecations Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ -- +- Performance improvement for :meth:`MultiIndex.unique` (:issue:`40000`) - .. --------------------------------------------------------------------------- @@ -161,7 +161,7 @@ Missing MultiIndex ^^^^^^^^^^ -- +- Bug in :meth:`MultiIndex.unique` losing extension array dtype (:issue:`40000`) - I/O diff --git a/pandas/tests/indexes/multi/test_duplicates.py b/pandas/tests/indexes/multi/test_duplicates.py index 6ec4d1fac8c8a..9d60db9ef4b46 100644 --- a/pandas/tests/indexes/multi/test_duplicates.py +++ b/pandas/tests/indexes/multi/test_duplicates.py @@ -6,6 +6,7 @@ from pandas._libs import hashtable from pandas import ( + NA, DatetimeIndex, MultiIndex, Series, @@ -337,3 +338,16 @@ def test_multi_drop_duplicates_pos_args_deprecation(): result = idx.drop_duplicates("last") expected = MultiIndex.from_arrays([[2, 3, 1], [2, 3, 1]]) tm.assert_index_equal(expected, result) + + +def test_midx_unique_ea_dtype(): + # GH# + a = Series([1, 2, NA, NA], dtype="Int64") + b = np.array([1, 2, 3, 3]) + midx = MultiIndex.from_arrays([a, b], names=["a", "b"]) + result = midx.unique() + + a = Series([1, 2, NA], dtype="Int64") + b = np.array([1, 2, 3]) + expected = MultiIndex.from_arrays([a, b], names=["a", "b"]) + tm.assert_index_equal(result, expected) From d17eb1418c87cdee8c307d79bc7d681e87ea8472 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 31 Aug 2022 19:55:55 +0200 Subject: [PATCH 3/4] Add gh ref --- doc/source/whatsnew/v1.6.0.rst | 4 ++-- pandas/tests/indexes/multi/test_duplicates.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst index aa8d8b84ec013..8c4eaa32b3d66 100644 --- a/doc/source/whatsnew/v1.6.0.rst +++ b/doc/source/whatsnew/v1.6.0.rst @@ -100,7 +100,7 @@ Deprecations Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ -- Performance improvement for :meth:`MultiIndex.unique` (:issue:`40000`) +- Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`) - .. --------------------------------------------------------------------------- @@ -161,7 +161,7 @@ Missing MultiIndex ^^^^^^^^^^ -- Bug in :meth:`MultiIndex.unique` losing extension array dtype (:issue:`40000`) +- Bug in :meth:`MultiIndex.unique` losing extension array dtype (:issue:`48335`) - Bug in :meth:`MultiIndex.append` not checking names for equality (:issue:`48288`) - diff --git a/pandas/tests/indexes/multi/test_duplicates.py b/pandas/tests/indexes/multi/test_duplicates.py index 9d60db9ef4b46..05f6c91ade225 100644 --- a/pandas/tests/indexes/multi/test_duplicates.py +++ b/pandas/tests/indexes/multi/test_duplicates.py @@ -341,7 +341,7 @@ def test_multi_drop_duplicates_pos_args_deprecation(): def test_midx_unique_ea_dtype(): - # GH# + # GH#48335 a = Series([1, 2, NA, NA], dtype="Int64") b = np.array([1, 2, 3, 3]) midx = MultiIndex.from_arrays([a, b], names=["a", "b"]) From 2668639d1bc7d241142e09e144860e497e6101c5 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 31 Aug 2022 22:12:59 +0200 Subject: [PATCH 4/4] Rename --- pandas/tests/indexes/multi/test_duplicates.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/tests/indexes/multi/test_duplicates.py b/pandas/tests/indexes/multi/test_duplicates.py index 05f6c91ade225..6832fb77ed30d 100644 --- a/pandas/tests/indexes/multi/test_duplicates.py +++ b/pandas/tests/indexes/multi/test_duplicates.py @@ -342,12 +342,12 @@ def test_multi_drop_duplicates_pos_args_deprecation(): def test_midx_unique_ea_dtype(): # GH#48335 - a = Series([1, 2, NA, NA], dtype="Int64") - b = np.array([1, 2, 3, 3]) - midx = MultiIndex.from_arrays([a, b], names=["a", "b"]) + vals_a = Series([1, 2, NA, NA], dtype="Int64") + vals_b = np.array([1, 2, 3, 3]) + midx = MultiIndex.from_arrays([vals_a, vals_b], names=["a", "b"]) result = midx.unique() - a = Series([1, 2, NA], dtype="Int64") - b = np.array([1, 2, 3]) - expected = MultiIndex.from_arrays([a, b], names=["a", "b"]) + exp_vals_a = Series([1, 2, NA], dtype="Int64") + exp_vals_b = np.array([1, 2, 3]) + expected = MultiIndex.from_arrays([exp_vals_a, exp_vals_b], names=["a", "b"]) tm.assert_index_equal(result, expected)