From 109c0e7e46f68d13182e95007b02b5a86ebb52e2 Mon Sep 17 00:00:00 2001 From: tushushu Date: Wed, 9 Dec 2020 08:58:22 +0800 Subject: [PATCH 01/32] fix series.isin slow issue with Dtype IntegerArray --- pandas/core/series.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 0e9476285c258..dd7f2fdfacb64 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -70,7 +70,7 @@ from pandas.core import algorithms, base, generic, missing, nanops, ops from pandas.core.accessor import CachedAccessor from pandas.core.aggregation import aggregate, transform -from pandas.core.arrays import ExtensionArray +from pandas.core.arrays import ExtensionArray, IntegerArray from pandas.core.arrays.categorical import CategoricalAccessor from pandas.core.arrays.sparse import SparseAccessor import pandas.core.common as com @@ -4630,7 +4630,10 @@ def isin(self, values) -> "Series": 5 False Name: animal, dtype: bool """ - result = algorithms.isin(self._values, values) + if isinstance(self._values, IntegerArray): + result = algorithms.isin(self._values._data, values) + else: + result = algorithms.isin(self._values, values) return self._constructor(result, index=self.index).__finalize__( self, method="isin" ) From e9f96ea2252bf2f46b5605a9d657a3561a6a5b2c Mon Sep 17 00:00:00 2001 From: tushushu Date: Wed, 9 Dec 2020 15:09:03 +0800 Subject: [PATCH 02/32] Move isinstance(comps, IntegerArray) to algo.isin --- pandas/core/algorithms.py | 3 +++ pandas/core/series.py | 7 ++----- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 67a0e02fc2d4d..b2ed35fa4cab3 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -15,6 +15,7 @@ from pandas._typing import AnyArrayLike, ArrayLike, DtypeObj, FrameOrSeriesUnion from pandas.util._decorators import doc +from pandas.core.arrays import IntegerArray from pandas.core.dtypes.cast import ( construct_1d_object_array_from_listlike, infer_dtype_from_array, @@ -447,6 +448,8 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: else: values = extract_array(values, extract_numpy=True) + if isinstance(comps, IntegerArray): + comps = comps._data comps = _ensure_arraylike(comps) comps = extract_array(comps, extract_numpy=True) if is_categorical_dtype(comps.dtype): diff --git a/pandas/core/series.py b/pandas/core/series.py index dd7f2fdfacb64..0e9476285c258 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -70,7 +70,7 @@ from pandas.core import algorithms, base, generic, missing, nanops, ops from pandas.core.accessor import CachedAccessor from pandas.core.aggregation import aggregate, transform -from pandas.core.arrays import ExtensionArray, IntegerArray +from pandas.core.arrays import ExtensionArray from pandas.core.arrays.categorical import CategoricalAccessor from pandas.core.arrays.sparse import SparseAccessor import pandas.core.common as com @@ -4630,10 +4630,7 @@ def isin(self, values) -> "Series": 5 False Name: animal, dtype: bool """ - if isinstance(self._values, IntegerArray): - result = algorithms.isin(self._values._data, values) - else: - result = algorithms.isin(self._values, values) + result = algorithms.isin(self._values, values) return self._constructor(result, index=self.index).__finalize__( self, method="isin" ) From a6be9c842195bf22723c81ba9bd5539adc7bf1ac Mon Sep 17 00:00:00 2001 From: tushushu Date: Wed, 9 Dec 2020 15:58:23 +0800 Subject: [PATCH 03/32] cannot import IntegerArray due to circular import --- pandas/core/algorithms.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index b2ed35fa4cab3..3e74d11097b9e 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -15,7 +15,6 @@ from pandas._typing import AnyArrayLike, ArrayLike, DtypeObj, FrameOrSeriesUnion from pandas.util._decorators import doc -from pandas.core.arrays import IntegerArray from pandas.core.dtypes.cast import ( construct_1d_object_array_from_listlike, infer_dtype_from_array, @@ -448,7 +447,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: else: values = extract_array(values, extract_numpy=True) - if isinstance(comps, IntegerArray): + if comps.__class__.__name__ is 'IntegerArray': comps = comps._data comps = _ensure_arraylike(comps) comps = extract_array(comps, extract_numpy=True) From 415b59008f90cec188adbbe93ba820db9872b4a9 Mon Sep 17 00:00:00 2001 From: tushushu Date: Wed, 9 Dec 2020 18:08:57 +0800 Subject: [PATCH 04/32] fix bug in pandas (Linux py38_np_dev) --- pandas/core/algorithms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 3e74d11097b9e..dcef30a64f25b 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -447,7 +447,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: else: values = extract_array(values, extract_numpy=True) - if comps.__class__.__name__ is 'IntegerArray': + if comps.__class__.__name__ == 'IntegerArray': comps = comps._data comps = _ensure_arraylike(comps) comps = extract_array(comps, extract_numpy=True) From f3e5afb19a4cd03cad9caa50c8f599b20a1cc396 Mon Sep 17 00:00:00 2001 From: tushushu Date: Wed, 9 Dec 2020 18:18:44 +0800 Subject: [PATCH 05/32] fix pre commit issue. --- pandas/core/algorithms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index dcef30a64f25b..09c63dd891db6 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -447,7 +447,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: else: values = extract_array(values, extract_numpy=True) - if comps.__class__.__name__ == 'IntegerArray': + if comps.__class__.__name__ == "IntegerArray": comps = comps._data comps = _ensure_arraylike(comps) comps = extract_array(comps, extract_numpy=True) From 14579fc83d257069c61c71ed41afb2f6e2d47f9f Mon Sep 17 00:00:00 2001 From: tushushu Date: Wed, 9 Dec 2020 19:55:16 +0800 Subject: [PATCH 06/32] fix the code style issue. --- pandas/core/algorithms.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 09c63dd891db6..5e5c6153f5735 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -447,8 +447,8 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: else: values = extract_array(values, extract_numpy=True) - if comps.__class__.__name__ == "IntegerArray": - comps = comps._data + if type(comps).__name__ == "IntegerArray": + comps = comps._data # type: ignore[attr-defined, assignment] comps = _ensure_arraylike(comps) comps = extract_array(comps, extract_numpy=True) if is_categorical_dtype(comps.dtype): From 562c91874cd7451a61f802b24d9140172f12c68c Mon Sep 17 00:00:00 2001 From: tushushu Date: Fri, 11 Dec 2020 14:14:52 +0800 Subject: [PATCH 07/32] move the logic to elif block. --- pandas/core/algorithms.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 5e5c6153f5735..74511b57ce2b9 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -446,9 +446,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: values = np.array(values) else: values = extract_array(values, extract_numpy=True) - - if type(comps).__name__ == "IntegerArray": - comps = comps._data # type: ignore[attr-defined, assignment] + comps = _ensure_arraylike(comps) comps = extract_array(comps, extract_numpy=True) if is_categorical_dtype(comps.dtype): @@ -469,6 +467,8 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: elif is_extension_array_dtype(comps.dtype) or is_extension_array_dtype( values.dtype ): + if type(comps).__name__ == "IntegerArray": + comps = comps._data # type: ignore[attr-defined, assignment] return isin(np.asarray(comps), np.asarray(values)) # GH16012 From 1449d3c06185d019894f130eb53a1f6e22bd6b7a Mon Sep 17 00:00:00 2001 From: tushushu Date: Fri, 11 Dec 2020 14:17:22 +0800 Subject: [PATCH 08/32] remove blank line. --- pandas/core/algorithms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 74511b57ce2b9..9e7362da0931a 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -446,7 +446,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: values = np.array(values) else: values = extract_array(values, extract_numpy=True) - + comps = _ensure_arraylike(comps) comps = extract_array(comps, extract_numpy=True) if is_categorical_dtype(comps.dtype): From 3ccc917b22d47a10df6ab6dabd678d92f2eada6a Mon Sep 17 00:00:00 2001 From: tushushu Date: Sun, 27 Dec 2020 16:17:48 +0800 Subject: [PATCH 09/32] copy codes from #38422 --- doc/source/reference/extensions.rst | 1 + pandas/core/algorithms.py | 12 +++--------- pandas/core/arrays/base.py | 19 ++++++++++++++++++- pandas/core/arrays/masked.py | 10 +++++++++- 4 files changed, 31 insertions(+), 11 deletions(-) diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst index fe4113d100abf..7b451ed3bf296 100644 --- a/doc/source/reference/extensions.rst +++ b/doc/source/reference/extensions.rst @@ -48,6 +48,7 @@ objects. api.extensions.ExtensionArray.equals api.extensions.ExtensionArray.factorize api.extensions.ExtensionArray.fillna + api.extensions.ExtensionArray.isin api.extensions.ExtensionArray.isna api.extensions.ExtensionArray.ravel api.extensions.ExtensionArray.repeat diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 9e7362da0931a..fb299497df9fc 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -449,10 +449,8 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: comps = _ensure_arraylike(comps) comps = extract_array(comps, extract_numpy=True) - if is_categorical_dtype(comps.dtype): - # TODO(extension) - # handle categoricals - return cast("Categorical", comps).isin(values) + if is_extension_array_dtype(comps.dtype): + return comps.isin(values) if needs_i8_conversion(comps.dtype): # Dispatch to DatetimeLikeArrayMixin.isin @@ -464,11 +462,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: elif needs_i8_conversion(values.dtype): return isin(comps, values.astype(object)) - elif is_extension_array_dtype(comps.dtype) or is_extension_array_dtype( - values.dtype - ): - if type(comps).__name__ == "IntegerArray": - comps = comps._data # type: ignore[attr-defined, assignment] + elif is_extension_array_dtype(values.dtype): return isin(np.asarray(comps), np.asarray(values)) # GH16012 diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 95470422f2ccd..089bb14772835 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -45,7 +45,7 @@ from pandas.core.dtypes.missing import isna from pandas.core import ops -from pandas.core.algorithms import factorize_array, unique +from pandas.core.algorithms import factorize_array, isin, unique from pandas.core.missing import get_fill_func from pandas.core.sorting import nargminmax, nargsort @@ -78,6 +78,7 @@ class ExtensionArray: factorize fillna equals + isin isna ravel repeat @@ -833,6 +834,22 @@ def equals(self, other: object) -> bool: equal_na = self.isna() & other.isna() return bool((equal_values | equal_na).all()) + def isin(self, values) -> np.ndarray: + """ + Pointwise comparison for set containment in the given values. + + Roughly equivalent to `np.array([x in values for x in self])` + + Parameters + ---------- + values : Sequence + + Returns + ------- + np.ndarray[bool] + """ + return isin(np.asarray(self), values) + def _values_for_factorize(self) -> Tuple[np.ndarray, Any]: """ Return an array and missing value suitable for factorization. diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index caed932cd7857..9b519152c7b75 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -19,7 +19,7 @@ from pandas.core.dtypes.missing import isna, notna from pandas.core import nanops -from pandas.core.algorithms import factorize_array, take +from pandas.core.algorithms import factorize_array, isin, take from pandas.core.array_algos import masked_reductions from pandas.core.arraylike import OpsMixin from pandas.core.arrays import ExtensionArray @@ -27,6 +27,7 @@ if TYPE_CHECKING: from pandas import Series + from pandas.core.arrays import BooleanArray BaseMaskedArrayT = TypeVar("BaseMaskedArrayT", bound="BaseMaskedArray") @@ -299,6 +300,13 @@ def take( return type(self)(result, mask, copy=False) + def isin(self, values) -> "BooleanArray": + + from pandas.core.arrays import BooleanArray + + result = isin(self._data, values) + return BooleanArray(result, self._mask.copy(), copy=False) + def copy(self: BaseMaskedArrayT) -> BaseMaskedArrayT: data, mask = self._data, self._mask data = data.copy() From 98a068304efed7abe802a64e0e65862e12c70f05 Mon Sep 17 00:00:00 2001 From: tushushu Date: Sun, 27 Dec 2020 19:50:17 +0800 Subject: [PATCH 10/32] make `isin` correct for pd.NA --- pandas/core/arrays/masked.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 9b519152c7b75..06183becb6382 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -3,6 +3,7 @@ from typing import TYPE_CHECKING, Any, Optional, Sequence, Tuple, Type, TypeVar, Union import numpy as np +import pandas as pd from pandas._libs import lib, missing as libmissing from pandas._typing import Scalar @@ -304,8 +305,11 @@ def isin(self, values) -> "BooleanArray": from pandas.core.arrays import BooleanArray - result = isin(self._data, values) - return BooleanArray(result, self._mask.copy(), copy=False) + result = isin(self._data, values) * np.invert(self._mask) + if any(x is pd.NA for x in values): + result += self._mask + mask = np.zeros_like(self, dtype=bool) + return BooleanArray(result, mask, copy=False) def copy(self: BaseMaskedArrayT) -> BaseMaskedArrayT: data, mask = self._data, self._mask From 6e2917e4015767d52a02ad817876eab9c131ba7f Mon Sep 17 00:00:00 2001 From: tushushu Date: Sun, 27 Dec 2020 20:21:43 +0800 Subject: [PATCH 11/32] sort imports --- pandas/core/arrays/masked.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 06183becb6382..36df0af38915e 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -3,7 +3,6 @@ from typing import TYPE_CHECKING, Any, Optional, Sequence, Tuple, Type, TypeVar, Union import numpy as np -import pandas as pd from pandas._libs import lib, missing as libmissing from pandas._typing import Scalar @@ -19,6 +18,7 @@ ) from pandas.core.dtypes.missing import isna, notna +import pandas as pd from pandas.core import nanops from pandas.core.algorithms import factorize_array, isin, take from pandas.core.array_algos import masked_reductions From a4b6503741ab6cc0d23b7e1af61f61794fd35adc Mon Sep 17 00:00:00 2001 From: tushushu Date: Thu, 31 Dec 2020 20:37:06 +0800 Subject: [PATCH 12/32] Avoiding import pandas as pd. --- pandas/core/arrays/masked.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 36df0af38915e..389cdc5b2219f 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -18,7 +18,7 @@ ) from pandas.core.dtypes.missing import isna, notna -import pandas as pd +from pandas import NA from pandas.core import nanops from pandas.core.algorithms import factorize_array, isin, take from pandas.core.array_algos import masked_reductions @@ -306,7 +306,7 @@ def isin(self, values) -> "BooleanArray": from pandas.core.arrays import BooleanArray result = isin(self._data, values) * np.invert(self._mask) - if any(x is pd.NA for x in values): + if any(x is NA for x in values): result += self._mask mask = np.zeros_like(self, dtype=bool) return BooleanArray(result, mask, copy=False) From f95fde94da7e45c401afe2ceccfbe0a0a4869a71 Mon Sep 17 00:00:00 2001 From: tushushu Date: Thu, 31 Dec 2020 20:50:12 +0800 Subject: [PATCH 13/32] fix cannot import NA issue. --- pandas/core/arrays/masked.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 389cdc5b2219f..df1f5f253848e 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -18,9 +18,9 @@ ) from pandas.core.dtypes.missing import isna, notna -from pandas import NA from pandas.core import nanops from pandas.core.algorithms import factorize_array, isin, take +from pandas.core.api import NA from pandas.core.array_algos import masked_reductions from pandas.core.arraylike import OpsMixin from pandas.core.arrays import ExtensionArray From 13dc64f5aa0be208dc83b19d5c3221f7a6b284da Mon Sep 17 00:00:00 2001 From: tushushu Date: Sat, 9 Jan 2021 15:03:30 +0800 Subject: [PATCH 14/32] Adding Int64 and Float64 for benchmarks. --- asv_bench/benchmarks/series_methods.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 3f4da8acf4db0..585102d8a476f 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -25,7 +25,7 @@ def time_constructor(self, data): class IsIn: - params = ["int64", "uint64", "object"] + params = ["int64", "uint64", "object", "Int64"] param_names = ["dtype"] def setup(self, dtype): @@ -59,8 +59,12 @@ def time_isin_empty(self): class IsInFloat64: - def setup(self): - self.small = Series([1, 2], dtype=np.float64) + + params = [np.float64, "Float64"] + param_names = ["dtype"] + + def setup(self, dtype): + self.small = Series([1, 2], dtype=dtype) self.many_different_values = np.arange(10 ** 6, dtype=np.float64) self.few_different_values = np.zeros(10 ** 7, dtype=np.float64) self.only_nans_values = np.full(10 ** 7, np.nan, dtype=np.float64) @@ -114,7 +118,7 @@ def time_isin_long_series_long_values_floats(self): class IsInLongSeriesLookUpDominates: params = [ - ["int64", "int32", "float64", "float32", "object"], + ["int64", "int32", "float64", "float32", "object", "Int64", "Float64"], [5, 1000], ["random_hits", "random_misses", "monotone_hits", "monotone_misses"], ] @@ -141,7 +145,7 @@ def time_isin(self, dtypes, MaxNumber, series_type): class IsInLongSeriesValuesDominate: params = [ - ["int64", "int32", "float64", "float32", "object"], + ["int64", "int32", "float64", "float32", "object", "Int64", "Float64"], ["random", "monotone"], ] param_names = ["dtype", "series_type"] From cc38088089644a5d4500239117581a974ffbeb2c Mon Sep 17 00:00:00 2001 From: tushushu Date: Sat, 9 Jan 2021 15:14:48 +0800 Subject: [PATCH 15/32] Adding isin benchmarks for Boolean array --- asv_bench/benchmarks/series_methods.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 585102d8a476f..4d56042564192 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -36,6 +36,19 @@ def time_isin(self, dtypes): self.s.isin(self.values) +class IsInBoolean: + + params = ["boolean", "bool"] + param_names = ["dtype"] + + def setup(self, dtype): + self.s = Series(np.random.randint(0, 2, 100000)).astype(dtype) + self.values = [True, False] + + def time_isin(self, dtypes): + self.s.isin(self.values) + + class IsInDatetime64: def setup(self): dti = date_range( From 94846cc16296480c17e8f5af0aed962623b91e3b Mon Sep 17 00:00:00 2001 From: tushushu Date: Sat, 9 Jan 2021 15:20:21 +0800 Subject: [PATCH 16/32] Adding what's new note. --- doc/source/whatsnew/v1.3.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 1d76c9d296255..7c02aa77f2feb 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -174,6 +174,7 @@ Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Performance improvement in :meth:`IntervalIndex.isin` (:issue:`38353`) - Performance improvement in :meth:`Series.mean` for nullable data types (:issue:`34814`) +- Performance improvement in :meth:`Series.isin` for nullable data types (:issue:`38340`) - .. --------------------------------------------------------------------------- From f4cb5ce2fb2f021fce0eae1b54fc2d550f051046 Mon Sep 17 00:00:00 2001 From: tushushu Date: Sat, 9 Jan 2021 16:01:44 +0800 Subject: [PATCH 17/32] fix IsInFloat64 benchmarks --- asv_bench/benchmarks/series_methods.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 4d56042564192..d1d9a0c15d396 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -82,15 +82,15 @@ def setup(self, dtype): self.few_different_values = np.zeros(10 ** 7, dtype=np.float64) self.only_nans_values = np.full(10 ** 7, np.nan, dtype=np.float64) - def time_isin_many_different(self): + def time_isin_many_different(self, dtypes): # runtime is dominated by creation of the lookup-table self.small.isin(self.many_different_values) - def time_isin_few_different(self): + def time_isin_few_different(self, dtypes): # runtime is dominated by creation of the lookup-table self.small.isin(self.few_different_values) - def time_isin_nan_values(self): + def time_isin_nan_values(self, dtypes): # runtime is dominated by creation of the lookup-table self.small.isin(self.few_different_values) From 2ee8b05e024f00e3e73bb3b076f081ac727e46dd Mon Sep 17 00:00:00 2001 From: tushushu Date: Sat, 9 Jan 2021 16:46:46 +0800 Subject: [PATCH 18/32] always return false for null values. --- pandas/core/arrays/masked.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 6ed85a978af62..08218e001fbdb 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -335,8 +335,6 @@ def isin(self, values) -> "BooleanArray": from pandas.core.arrays import BooleanArray result = isin(self._data, values) * np.invert(self._mask) - if any(x is NA for x in values): - result += self._mask mask = np.zeros_like(self, dtype=bool) return BooleanArray(result, mask, copy=False) From 7763b1828cfb18f7ef4170c81f713a45de16df7e Mon Sep 17 00:00:00 2001 From: tushushu Date: Sat, 9 Jan 2021 21:39:51 +0800 Subject: [PATCH 19/32] fix flake8 error. --- pandas/core/arrays/masked.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 08218e001fbdb..dee618933a459 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -22,7 +22,6 @@ from pandas.core import nanops from pandas.core.algorithms import factorize_array, isin, take -from pandas.core.api import NA from pandas.core.array_algos import masked_reductions from pandas.core.arraylike import OpsMixin from pandas.core.arrays import ExtensionArray From 87dfff903651269f385434eaf732bbcd20e237f0 Mon Sep 17 00:00:00 2001 From: tushushu Date: Sun, 10 Jan 2021 17:37:23 +0800 Subject: [PATCH 20/32] remove unused lines. --- pandas/core/algorithms.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 3e554e607174e..9ff49c3560d04 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -456,9 +456,6 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: if is_extension_array_dtype(comps.dtype): return comps.isin(values) - elif is_interval_dtype(comps.dtype): - return cast("IntervalArray", comps).isin(values) - elif needs_i8_conversion(comps.dtype): # Dispatch to DatetimeLikeArrayMixin.isin return array(comps).isin(values) From d2d32d185eb56baae8c9f71668c591daecaef5d3 Mon Sep 17 00:00:00 2001 From: tushushu Date: Sun, 10 Jan 2021 17:43:49 +0800 Subject: [PATCH 21/32] refactors for series benchmarks. --- asv_bench/benchmarks/series_methods.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index d1d9a0c15d396..e3ab8b243e25a 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -29,7 +29,8 @@ class IsIn: param_names = ["dtype"] def setup(self, dtype): - self.s = Series(np.random.randint(1, 10, 100000)).astype(dtype) + N = 10000 + self.s = Series(np.random.randint(1, 10, N)).astype(dtype) self.values = [1, 2] def time_isin(self, dtypes): @@ -42,7 +43,8 @@ class IsInBoolean: param_names = ["dtype"] def setup(self, dtype): - self.s = Series(np.random.randint(0, 2, 100000)).astype(dtype) + N = 10000 + self.s = Series(np.random.randint(0, 2, N)).astype(dtype) self.values = [True, False] def time_isin(self, dtypes): @@ -77,10 +79,12 @@ class IsInFloat64: param_names = ["dtype"] def setup(self, dtype): + N_many = 10 ** 5 + N_few = 10 ** 6 self.small = Series([1, 2], dtype=dtype) - self.many_different_values = np.arange(10 ** 6, dtype=np.float64) - self.few_different_values = np.zeros(10 ** 7, dtype=np.float64) - self.only_nans_values = np.full(10 ** 7, np.nan, dtype=np.float64) + self.many_different_values = np.arange(N_many, dtype=np.float64) + self.few_different_values = np.zeros(N_few, dtype=np.float64) + self.only_nans_values = np.full(N_few, np.nan, dtype=np.float64) def time_isin_many_different(self, dtypes): # runtime is dominated by creation of the lookup-table From 90ef57a21c6ec72b8a4201e2dac74866d5665be2 Mon Sep 17 00:00:00 2001 From: tushushu Date: Sun, 10 Jan 2021 18:35:26 +0800 Subject: [PATCH 22/32] fix flake8 errors. --- pandas/core/algorithms.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 9ff49c3560d04..834b43d55a83e 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -36,7 +36,6 @@ is_float_dtype, is_integer, is_integer_dtype, - is_interval_dtype, is_list_like, is_numeric_dtype, is_object_dtype, @@ -68,7 +67,7 @@ if TYPE_CHECKING: from pandas import Categorical, DataFrame, Index, Series - from pandas.core.arrays import DatetimeArray, IntervalArray, TimedeltaArray + from pandas.core.arrays import DatetimeArray, TimedeltaArray _shared_docs: Dict[str, str] = {} From a48e00b8df7a063e843cb3eaa480005bf60085c0 Mon Sep 17 00:00:00 2001 From: tushushu Date: Sun, 10 Jan 2021 21:18:07 +0800 Subject: [PATCH 23/32] Change back to see if can pass the tests. --- pandas/core/algorithms.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 834b43d55a83e..3e554e607174e 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -36,6 +36,7 @@ is_float_dtype, is_integer, is_integer_dtype, + is_interval_dtype, is_list_like, is_numeric_dtype, is_object_dtype, @@ -67,7 +68,7 @@ if TYPE_CHECKING: from pandas import Categorical, DataFrame, Index, Series - from pandas.core.arrays import DatetimeArray, TimedeltaArray + from pandas.core.arrays import DatetimeArray, IntervalArray, TimedeltaArray _shared_docs: Dict[str, str] = {} @@ -455,6 +456,9 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: if is_extension_array_dtype(comps.dtype): return comps.isin(values) + elif is_interval_dtype(comps.dtype): + return cast("IntervalArray", comps).isin(values) + elif needs_i8_conversion(comps.dtype): # Dispatch to DatetimeLikeArrayMixin.isin return array(comps).isin(values) From c726d4a9e323f6693c5993022e9d10768af2e0e8 Mon Sep 17 00:00:00 2001 From: tushushu Date: Thu, 14 Jan 2021 23:20:52 +0800 Subject: [PATCH 24/32] makes NA isin [NA] return True. --- pandas/core/arrays/masked.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index dee618933a459..3227288de0757 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -22,6 +22,7 @@ from pandas.core import nanops from pandas.core.algorithms import factorize_array, isin, take +from pandas.core.api import NA from pandas.core.array_algos import masked_reductions from pandas.core.arraylike import OpsMixin from pandas.core.arrays import ExtensionArray @@ -334,6 +335,8 @@ def isin(self, values) -> "BooleanArray": from pandas.core.arrays import BooleanArray result = isin(self._data, values) * np.invert(self._mask) + if NA in values: + result += self._mask mask = np.zeros_like(self, dtype=bool) return BooleanArray(result, mask, copy=False) From 1134ad6cc302f7d973199e67c23cac479b5a3b83 Mon Sep 17 00:00:00 2001 From: tushushu Date: Thu, 14 Jan 2021 23:26:10 +0800 Subject: [PATCH 25/32] remove redundant codes. --- pandas/core/algorithms.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 7e3fafee90292..1c0bc0fc52b6c 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -456,9 +456,6 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: if is_extension_array_dtype(comps.dtype): return comps.isin(values) - elif is_interval_dtype(comps.dtype): - return cast("IntervalArray", comps).isin(values) - elif needs_i8_conversion(comps.dtype): # Dispatch to DatetimeLikeArrayMixin.isin return array(comps).isin(values) From bce0e3e4c723650ef9119132cbc4ad83a7786a65 Mon Sep 17 00:00:00 2001 From: tushushu Date: Thu, 14 Jan 2021 23:34:22 +0800 Subject: [PATCH 26/32] makes performance better. --- pandas/core/arrays/masked.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 3227288de0757..1154480b25c22 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -12,6 +12,7 @@ from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.common import ( is_dtype_equal, + is_extension_array_dtype, is_integer, is_object_dtype, is_scalar, @@ -335,7 +336,8 @@ def isin(self, values) -> "BooleanArray": from pandas.core.arrays import BooleanArray result = isin(self._data, values) * np.invert(self._mask) - if NA in values: + if (is_extension_array_dtype(values.dtype) or is_object_dtype(values.dtype))\ + and NA in values: result += self._mask mask = np.zeros_like(self, dtype=bool) return BooleanArray(result, mask, copy=False) From bf788e55b2167214b1914d1d83fbbaf50039bc06 Mon Sep 17 00:00:00 2001 From: tushushu Date: Thu, 14 Jan 2021 23:43:21 +0800 Subject: [PATCH 27/32] fix flake8 errors. --- pandas/core/algorithms.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 1c0bc0fc52b6c..be65adeb34139 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -36,7 +36,6 @@ is_float_dtype, is_integer, is_integer_dtype, - is_interval_dtype, is_list_like, is_numeric_dtype, is_object_dtype, @@ -68,7 +67,7 @@ if TYPE_CHECKING: from pandas import Categorical, DataFrame, Index, Series - from pandas.core.arrays import DatetimeArray, IntervalArray, TimedeltaArray + from pandas.core.arrays import DatetimeArray, TimedeltaArray _shared_docs: Dict[str, str] = {} @@ -477,7 +476,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: # If the values include nan we need to check for nan explicitly # since np.nan it not equal to np.nan if isna(values).any(): - f = lambda c, v: np.logical_or(np.in1d(c, v), np.isnan(c)) + def f(c, v): return np.logical_or(np.in1d(c, v), np.isnan(c)) else: f = np.in1d From 40950bef647807a509ff8e7f9cddccd46f336b10 Mon Sep 17 00:00:00 2001 From: tushushu Date: Sat, 16 Jan 2021 16:19:58 +0800 Subject: [PATCH 28/32] polish codes --- pandas/core/arrays/masked.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 1154480b25c22..0b0786b8d2ff7 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -12,7 +12,6 @@ from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.common import ( is_dtype_equal, - is_extension_array_dtype, is_integer, is_object_dtype, is_scalar, @@ -335,10 +334,12 @@ def isin(self, values) -> "BooleanArray": from pandas.core.arrays import BooleanArray - result = isin(self._data, values) * np.invert(self._mask) - if (is_extension_array_dtype(values.dtype) or is_object_dtype(values.dtype))\ - and NA in values: - result += self._mask + result = isin(self._data, values) + if self._hasna: + if NA in values: + result += self._mask + else: + result *= np.invert(self._mask) mask = np.zeros_like(self, dtype=bool) return BooleanArray(result, mask, copy=False) From 570d640c167ee52e98408516f3362d322d983cd8 Mon Sep 17 00:00:00 2001 From: tushushu Date: Sat, 16 Jan 2021 16:21:11 +0800 Subject: [PATCH 29/32] not import NA --- pandas/core/arrays/masked.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 0b0786b8d2ff7..6b5d3e1230a23 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -22,7 +22,6 @@ from pandas.core import nanops from pandas.core.algorithms import factorize_array, isin, take -from pandas.core.api import NA from pandas.core.array_algos import masked_reductions from pandas.core.arraylike import OpsMixin from pandas.core.arrays import ExtensionArray @@ -336,7 +335,7 @@ def isin(self, values) -> "BooleanArray": result = isin(self._data, values) if self._hasna: - if NA in values: + if libmissing.NA in values: result += self._mask else: result *= np.invert(self._mask) From 0f89578f02a69b5a6c9d817f0ca7c1f254c1343c Mon Sep 17 00:00:00 2001 From: tushushu Date: Sat, 16 Jan 2021 16:21:37 +0800 Subject: [PATCH 30/32] fix code style --- pandas/core/algorithms.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index be65adeb34139..8105500071a70 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -476,7 +476,8 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: # If the values include nan we need to check for nan explicitly # since np.nan it not equal to np.nan if isna(values).any(): - def f(c, v): return np.logical_or(np.in1d(c, v), np.isnan(c)) + def f(c, v): + return np.logical_or(np.in1d(c, v), np.isnan(c)) else: f = np.in1d From 199c11c478f36292e7bd84afa7964e9dc5d03671 Mon Sep 17 00:00:00 2001 From: tushushu Date: Sat, 16 Jan 2021 22:08:28 +0800 Subject: [PATCH 31/32] fix black error. --- pandas/core/algorithms.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 8105500071a70..9f938a95d070e 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -476,8 +476,10 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: # If the values include nan we need to check for nan explicitly # since np.nan it not equal to np.nan if isna(values).any(): + def f(c, v): return np.logical_or(np.in1d(c, v), np.isnan(c)) + else: f = np.in1d From 9f35b5b674c2b656b99973c0901e235bfabbdb81 Mon Sep 17 00:00:00 2001 From: tushushu Date: Sun, 17 Jan 2021 11:58:23 +0800 Subject: [PATCH 32/32] fix CI --- pandas/core/arrays/masked.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 6b5d3e1230a23..d062d114abb6b 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -329,7 +329,7 @@ def take( return type(self)(result, mask, copy=False) - def isin(self, values) -> "BooleanArray": + def isin(self, values) -> BooleanArray: from pandas.core.arrays import BooleanArray