From bc7f46ff757e18b45843da78be09aaffd6d5703e Mon Sep 17 00:00:00 2001 From: tp Date: Sat, 3 Oct 2020 23:05:19 +0100 Subject: [PATCH 1/5] PERF: Index.equals when comparing to copies of self --- doc/source/whatsnew/v1.2.0.rst | 2 ++ pandas/core/indexes/base.py | 11 ++++++----- pandas/core/indexes/datetimelike.py | 1 + pandas/core/indexes/interval.py | 10 +++++----- pandas/core/indexes/multi.py | 2 +- pandas/core/indexes/period.py | 11 ++++++----- pandas/core/indexes/range.py | 11 ++++++----- pandas/tests/indexes/common.py | 6 ++++++ pandas/tests/indexes/datetimes/test_constructors.py | 2 +- pandas/tests/indexes/multi/test_equivalence.py | 2 +- pandas/tests/indexes/test_base.py | 4 ++-- 11 files changed, 37 insertions(+), 25 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index cb0858fd678f8..23d6b212e6e17 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -284,6 +284,7 @@ Performance improvements - ``Styler`` uuid method altered to compress data transmission over web whilst maintaining reasonably low table collision probability (:issue:`36345`) - Performance improvement in :meth:`pd.to_datetime` with non-ns time unit for ``float`` ``dtype`` columns (:issue:`20445`) - Performance improvement in setting values on a :class:`IntervalArray` (:issue:`36310`) +- Performance improvement for the :meth:`~Index.equals` method on all index classes when compared to copies of the same index (:issue:`xxxxx`) .. --------------------------------------------------------------------------- @@ -358,6 +359,7 @@ Indexing - Bug in :meth:`Index.sort_values` where, when empty values were passed, the method would break by trying to compare missing values instead of pushing them to the end of the sort order. (:issue:`35584`) - Bug in :meth:`Index.get_indexer` and :meth:`Index.get_indexer_non_unique` where int64 arrays are returned instead of intp. (:issue:`36359`) - Bug in :meth:`DataFrame.sort_index` where parameter ascending passed as a list on a single level index gives wrong result. (:issue:`32334`) +- Bug in :meth:`Index.equals`, where it required the name to the be equal. This method should not compare names for equality (:issue:`xxxxx`) Missing ^^^^^^^ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index ff3d8bf05f9a5..6358d0adeeecc 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -561,12 +561,13 @@ def _shallow_copy(self, values=None, name: Label = no_default): name : Label, defaults to self.name """ name = self.name if name is no_default else name - cache = self._cache.copy() if values is None else {} - if values is None: - values = self._values - result = self._simple_new(values, name=name) - result._cache = cache + if values is not None: + return self._simple_new(values, name=name) + + result = self._simple_new(self._values, name=name) + result._cache = self._cache.copy() + result._id = self._id return result def is_(self, other) -> bool: diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index d2162d987ccd6..9b7e8683743ad 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -682,6 +682,7 @@ def _shallow_copy(self, values=None, name: Label = lib.no_default): result = type(self)._simple_new(values, name=name) result._cache = cache + result._id = self._id return result # -------------------------------------------------------------------- diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index a56f6a5bb0340..288b16a06707a 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -335,12 +335,12 @@ def _shallow_copy( self, values: Optional[IntervalArray] = None, name: Label = lib.no_default ): name = self.name if name is lib.no_default else name - cache = self._cache.copy() if values is None else {} - if values is None: - values = self._data + if values is not None: + return self._simple_new(values, name=name) - result = self._simple_new(values, name=name) - result._cache = cache + result = self._simple_new(self._data, name=name) + result._cache = self._cache.copy() + result._id = self._id return result @cache_readonly diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index a157fdfdde447..224497882354c 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1089,6 +1089,7 @@ def _shallow_copy( ) result._cache = self._cache.copy() result._cache.pop("levels", None) # GH32669 + result._id = self._id return result def symmetric_difference(self, other, result_name=None, sort=None): @@ -1192,7 +1193,6 @@ def __array__(self, dtype=None) -> np.ndarray: def view(self, cls=None): """ this is defined as a copy with the same identity """ result = self.copy() - result._id = self._id return result @doc(Index.__contains__) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 27b60747015de..b98f757c9e49b 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -260,12 +260,13 @@ def _has_complex_internals(self) -> bool: def _shallow_copy(self, values=None, name: Label = no_default): name = name if name is not no_default else self.name - cache = self._cache.copy() if values is None else {} - if values is None: - values = self._data - result = self._simple_new(values, name=name) - result._cache = cache + if values is not None: + return self._simple_new(values, name=name) + + result = self._simple_new(self._data, name=name) + result._cache = self._cache.copy() + result._id = self._id return result def _maybe_convert_timedelta(self, other): diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 4dffda2605ef7..bdd7d778f2909 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -397,13 +397,14 @@ def __iter__(self): def _shallow_copy(self, values=None, name: Label = no_default): name = self.name if name is no_default else name - if values is None: - result = self._simple_new(self._range, name=name) - result._cache = self._cache.copy() - return result - else: + if values is not None: return Int64Index._simple_new(values, name=name) + result = self._simple_new(self._range, name=name) + result._cache = self._cache.copy() + result._id = self._id + return result + @doc(Int64Index.copy) def copy(self, name=None, deep=False, dtype=None, names=None): name = self._validate_names(name=name, names=names, deep=deep)[0] diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index c40f7b1bc2120..8b2f19dc8a8e3 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -960,3 +960,9 @@ def test_shallow_copy_copies_cache(self): # cache values should reference the same object for key, val in idx._cache.items(): assert shallow_copy._cache[key] is val, key + + def test_shallow_copy_copies_id(self): + # GHxxxxx + idx = self.create_index() + shallow_copy = idx._shallow_copy() + assert idx._id is shallow_copy._id diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py index d3c79f231449a..940642278db9f 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -931,7 +931,7 @@ def test_is_(self): dti = date_range(start="1/1/2005", end="12/1/2005", freq="M") assert dti.is_(dti) assert dti.is_(dti.view()) - assert not dti.is_(dti.copy()) + assert dti.is_(dti.copy()) def test_index_cast_datetime64_other_units(self): arr = np.arange(0, 100, 10, dtype=np.int64).view("M8[D]") diff --git a/pandas/tests/indexes/multi/test_equivalence.py b/pandas/tests/indexes/multi/test_equivalence.py index 184cedea7dc5c..15c1b20ae7d87 100644 --- a/pandas/tests/indexes/multi/test_equivalence.py +++ b/pandas/tests/indexes/multi/test_equivalence.py @@ -180,7 +180,7 @@ def test_is_(): assert mi2.is_(mi) assert mi.is_(mi2) - assert not mi.is_(mi.set_names(["C", "D"])) + assert mi.is_(mi.set_names(["C", "D"])) mi2 = mi.view() mi2.set_names(["E", "F"], inplace=True) assert mi.is_(mi2) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 8db1bcc84bfa6..33cf24b1540c5 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -523,8 +523,8 @@ def test_is_(self): assert ind.is_(ind) assert ind.is_(ind.view().view().view().view()) assert not ind.is_(Index(range(10))) - assert not ind.is_(ind.copy()) - assert not ind.is_(ind.copy(deep=False)) + assert ind.is_(ind.copy()) + assert ind.is_(ind.copy(deep=False)) assert not ind.is_(ind[:]) assert not ind.is_(np.array(range(10))) From 353cae2ac062184fd534d84ad4e605644ccea566 Mon Sep 17 00:00:00 2001 From: tp Date: Sat, 3 Oct 2020 23:29:07 +0100 Subject: [PATCH 2/5] refactor _shallow_copy, add GH number --- doc/source/whatsnew/v1.2.0.rst | 4 ++-- pandas/core/indexes/datetimelike.py | 14 ++++++-------- pandas/tests/indexes/common.py | 2 +- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 23d6b212e6e17..13fbe7f8c8363 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -284,7 +284,7 @@ Performance improvements - ``Styler`` uuid method altered to compress data transmission over web whilst maintaining reasonably low table collision probability (:issue:`36345`) - Performance improvement in :meth:`pd.to_datetime` with non-ns time unit for ``float`` ``dtype`` columns (:issue:`20445`) - Performance improvement in setting values on a :class:`IntervalArray` (:issue:`36310`) -- Performance improvement for the :meth:`~Index.equals` method on all index classes when compared to copies of the same index (:issue:`xxxxx`) +- Performance improvement for the :meth:`~Index.equals` method on all index classes, when compared to copies of the same index (:issue:`36840`) .. --------------------------------------------------------------------------- @@ -359,7 +359,7 @@ Indexing - Bug in :meth:`Index.sort_values` where, when empty values were passed, the method would break by trying to compare missing values instead of pushing them to the end of the sort order. (:issue:`35584`) - Bug in :meth:`Index.get_indexer` and :meth:`Index.get_indexer_non_unique` where int64 arrays are returned instead of intp. (:issue:`36359`) - Bug in :meth:`DataFrame.sort_index` where parameter ascending passed as a list on a single level index gives wrong result. (:issue:`32334`) -- Bug in :meth:`Index.equals`, where it required the name to the be equal. This method should not compare names for equality (:issue:`xxxxx`) +- Bug in :meth:`Index.equals`, where it required the name to the be equal. This method should not compare names for equality (:issue:`36840`) Missing ^^^^^^^ diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 9b7e8683743ad..752a9ce667d93 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -671,17 +671,15 @@ def _with_freq(self, freq): def _shallow_copy(self, values=None, name: Label = lib.no_default): name = self.name if name is lib.no_default else name - cache = self._cache.copy() if values is None else {} - if values is None: - values = self._data - - if isinstance(values, np.ndarray): + if values is not None: # TODO: We would rather not get here - values = type(self._data)(values, dtype=self.dtype) + if isinstance(values, np.ndarray): + values = type(self._data)(values, dtype=self.dtype) + return self._simple_new(values, name=name) - result = type(self)._simple_new(values, name=name) - result._cache = cache + result = self._simple_new(self._data, name=name) + result._cache = self._cache.copy() result._id = self._id return result diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 8b2f19dc8a8e3..9bce9fb581737 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -962,7 +962,7 @@ def test_shallow_copy_copies_cache(self): assert shallow_copy._cache[key] is val, key def test_shallow_copy_copies_id(self): - # GHxxxxx + # GH36840 idx = self.create_index() shallow_copy = idx._shallow_copy() assert idx._id is shallow_copy._id From ffa9eeeb03fbfaead1263fea98f8047964fca570 Mon Sep 17 00:00:00 2001 From: tp Date: Sun, 4 Oct 2020 08:24:50 +0100 Subject: [PATCH 3/5] PERF: share _cache, don't share _id --- doc/source/whatsnew/v1.2.0.rst | 5 ++-- pandas/core/indexes/base.py | 3 +-- pandas/core/indexes/datetimelike.py | 3 +-- pandas/core/indexes/interval.py | 4 +-- pandas/core/indexes/multi.py | 2 +- pandas/core/indexes/period.py | 3 +-- pandas/core/indexes/range.py | 3 +-- pandas/tests/indexes/common.py | 26 +++++-------------- .../indexes/datetimes/test_constructors.py | 2 +- .../tests/indexes/multi/test_equivalence.py | 2 +- pandas/tests/indexes/test_base.py | 4 +-- 11 files changed, 21 insertions(+), 36 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 13fbe7f8c8363..7fb29e4b96e8c 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -284,7 +284,9 @@ Performance improvements - ``Styler`` uuid method altered to compress data transmission over web whilst maintaining reasonably low table collision probability (:issue:`36345`) - Performance improvement in :meth:`pd.to_datetime` with non-ns time unit for ``float`` ``dtype`` columns (:issue:`20445`) - Performance improvement in setting values on a :class:`IntervalArray` (:issue:`36310`) -- Performance improvement for the :meth:`~Index.equals` method on all index classes, when compared to copies of the same index (:issue:`36840`) +- The internal index method :meth:`~Index._shallow_copy` now makes the new index and original index share cached attributes, + avoiding creating these again, if created on either. This can speed up operations that depend on creating copies of existing indexes (:issue:`36840`) + .. --------------------------------------------------------------------------- @@ -359,7 +361,6 @@ Indexing - Bug in :meth:`Index.sort_values` where, when empty values were passed, the method would break by trying to compare missing values instead of pushing them to the end of the sort order. (:issue:`35584`) - Bug in :meth:`Index.get_indexer` and :meth:`Index.get_indexer_non_unique` where int64 arrays are returned instead of intp. (:issue:`36359`) - Bug in :meth:`DataFrame.sort_index` where parameter ascending passed as a list on a single level index gives wrong result. (:issue:`32334`) -- Bug in :meth:`Index.equals`, where it required the name to the be equal. This method should not compare names for equality (:issue:`36840`) Missing ^^^^^^^ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 6358d0adeeecc..d019c18bb4a66 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -566,8 +566,7 @@ def _shallow_copy(self, values=None, name: Label = no_default): return self._simple_new(values, name=name) result = self._simple_new(self._values, name=name) - result._cache = self._cache.copy() - result._id = self._id + result._cache = self._cache return result def is_(self, other) -> bool: diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 752a9ce667d93..dae6bb79b2cb8 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -679,8 +679,7 @@ def _shallow_copy(self, values=None, name: Label = lib.no_default): return self._simple_new(values, name=name) result = self._simple_new(self._data, name=name) - result._cache = self._cache.copy() - result._id = self._id + result._cache = self._cache return result # -------------------------------------------------------------------- diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 288b16a06707a..4a877621a94c2 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -335,12 +335,12 @@ def _shallow_copy( self, values: Optional[IntervalArray] = None, name: Label = lib.no_default ): name = self.name if name is lib.no_default else name + if values is not None: return self._simple_new(values, name=name) result = self._simple_new(self._data, name=name) - result._cache = self._cache.copy() - result._id = self._id + result._cache = self._cache return result @cache_readonly diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 224497882354c..a157fdfdde447 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1089,7 +1089,6 @@ def _shallow_copy( ) result._cache = self._cache.copy() result._cache.pop("levels", None) # GH32669 - result._id = self._id return result def symmetric_difference(self, other, result_name=None, sort=None): @@ -1193,6 +1192,7 @@ def __array__(self, dtype=None) -> np.ndarray: def view(self, cls=None): """ this is defined as a copy with the same identity """ result = self.copy() + result._id = self._id return result @doc(Index.__contains__) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index b98f757c9e49b..adf7a75b33b38 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -265,8 +265,7 @@ def _shallow_copy(self, values=None, name: Label = no_default): return self._simple_new(values, name=name) result = self._simple_new(self._data, name=name) - result._cache = self._cache.copy() - result._id = self._id + result._cache = self._cache return result def _maybe_convert_timedelta(self, other): diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index bdd7d778f2909..d5d9a9b5bc0a3 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -401,8 +401,7 @@ def _shallow_copy(self, values=None, name: Label = no_default): return Int64Index._simple_new(values, name=name) result = self._simple_new(self._range, name=name) - result._cache = self._cache.copy() - result._id = self._id + result._cache = self._cache return result @doc(Int64Index.copy) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 9bce9fb581737..1a3fb343c2c36 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -936,33 +936,21 @@ def test_contains_requires_hashable_raises(self): {} in idx._engine def test_copy_copies_cache(self): - # GH32898 + # GH32898, GH36840 idx = self.create_index() idx.get_loc(idx[0]) # populates the _cache. copy = idx.copy() - # check that the copied cache is a copy of the original - assert idx._cache == copy._cache - assert idx._cache is not copy._cache - # cache values should reference the same object - for key, val in idx._cache.items(): - assert copy._cache[key] is val, key + assert copy._cache is idx._cache def test_shallow_copy_copies_cache(self): - # GH32669 + # GH32669, GH36840 idx = self.create_index() idx.get_loc(idx[0]) # populates the _cache. shallow_copy = idx._shallow_copy() - # check that the shallow_copied cache is a copy of the original - assert idx._cache == shallow_copy._cache - assert idx._cache is not shallow_copy._cache - # cache values should reference the same object - for key, val in idx._cache.items(): - assert shallow_copy._cache[key] is val, key + assert shallow_copy._cache is idx._cache - def test_shallow_copy_copies_id(self): - # GH36840 - idx = self.create_index() - shallow_copy = idx._shallow_copy() - assert idx._id is shallow_copy._id + shallow_copy = idx._shallow_copy(idx._data) + assert shallow_copy._cache is not idx._cache + assert shallow_copy._cache == {} diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py index 940642278db9f..d3c79f231449a 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -931,7 +931,7 @@ def test_is_(self): dti = date_range(start="1/1/2005", end="12/1/2005", freq="M") assert dti.is_(dti) assert dti.is_(dti.view()) - assert dti.is_(dti.copy()) + assert not dti.is_(dti.copy()) def test_index_cast_datetime64_other_units(self): arr = np.arange(0, 100, 10, dtype=np.int64).view("M8[D]") diff --git a/pandas/tests/indexes/multi/test_equivalence.py b/pandas/tests/indexes/multi/test_equivalence.py index 15c1b20ae7d87..184cedea7dc5c 100644 --- a/pandas/tests/indexes/multi/test_equivalence.py +++ b/pandas/tests/indexes/multi/test_equivalence.py @@ -180,7 +180,7 @@ def test_is_(): assert mi2.is_(mi) assert mi.is_(mi2) - assert mi.is_(mi.set_names(["C", "D"])) + assert not mi.is_(mi.set_names(["C", "D"])) mi2 = mi.view() mi2.set_names(["E", "F"], inplace=True) assert mi.is_(mi2) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 33cf24b1540c5..8db1bcc84bfa6 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -523,8 +523,8 @@ def test_is_(self): assert ind.is_(ind) assert ind.is_(ind.view().view().view().view()) assert not ind.is_(Index(range(10))) - assert ind.is_(ind.copy()) - assert ind.is_(ind.copy(deep=False)) + assert not ind.is_(ind.copy()) + assert not ind.is_(ind.copy(deep=False)) assert not ind.is_(ind[:]) assert not ind.is_(np.array(range(10))) From cff45392df21e1add1a420756226af4f92c8b2ed Mon Sep 17 00:00:00 2001 From: tp Date: Sun, 4 Oct 2020 08:38:09 +0100 Subject: [PATCH 4/5] rename tests --- pandas/tests/indexes/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 1a3fb343c2c36..73d2e99d3ff5e 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -935,7 +935,7 @@ def test_contains_requires_hashable_raises(self): with pytest.raises(TypeError, match=msg): {} in idx._engine - def test_copy_copies_cache(self): + def test_copy_shares_cache(self): # GH32898, GH36840 idx = self.create_index() idx.get_loc(idx[0]) # populates the _cache. @@ -943,7 +943,7 @@ def test_copy_copies_cache(self): assert copy._cache is idx._cache - def test_shallow_copy_copies_cache(self): + def test_shallow_copy_shares_cache(self): # GH32669, GH36840 idx = self.create_index() idx.get_loc(idx[0]) # populates the _cache. From c0c95622031c56340dd88188cf81b32db4f8dfce Mon Sep 17 00:00:00 2001 From: tp Date: Sun, 4 Oct 2020 09:10:54 +0100 Subject: [PATCH 5/5] fix memory usage test --- pandas/tests/base/test_misc.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index b8468a5acf277..2dc2fe6d2ad07 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -128,7 +128,8 @@ def test_memory_usage(index_or_series_obj): ) if len(obj) == 0: - assert res_deep == res == 0 + expected = 0 if isinstance(obj, Index) else 80 + assert res_deep == res == expected elif is_object or is_categorical: # only deep will pick them up assert res_deep > res