Skip to content

BENCH: collect isin asvs #39922

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Feb 21, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions asv_bench/benchmarks/algos/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
"""
The algos/ directory is intended for benchmarks of individual functions from core.algorithms

In many cases these algorithms are reachable in multiple ways:
algos.foo(x, y)
Series(x).foo(y)
Index(x).foo(y)
pd.array(x).foo(y)

In most cases we profile the Series variant directly, trusting the performance
of the others to be highly correlated.
"""
317 changes: 317 additions & 0 deletions asv_bench/benchmarks/algos/isin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,317 @@
import numpy as np

from pandas.compat.numpy import np_version_under1p20

from pandas import (
Categorical,
NaT,
Series,
date_range,
)


class IsIn:
    """Benchmark ``Series.isin`` across a range of series dtypes.

    ``setup`` prepares four attributes for the timed methods: the series
    under test, a collection of lookup values, a ``Categorical`` built from
    those values, and a short list used by the mismatched-dtype timing.
    """

    params = [
        "int64",
        "uint64",
        "object",
        "Int64",
        "boolean",
        "bool",
        "datetime64[ns]",
        "category[object]",
        "category[int]",
    ]
    param_names = ["dtype"]

    def setup(self, dtype):
        """Build ``series``, ``values``, ``mismatched`` and ``cat_values``."""
        n_rows = 10000

        # Two NaT datetime64 scalars by default; the datetime64 series
        # replaces them with plain integers further down.
        mismatched = [NaT.to_datetime64()] * 2

        if dtype in ("boolean", "bool"):
            series = Series(np.random.randint(0, 2, n_rows)).astype(dtype)
            values = [True, False]

        elif dtype == "datetime64[ns]":
            # Note: values here is much larger than in the non-dt64ns cases.
            # The index below has length 115777.
            stamps = date_range(start="2015-10-26", end="2016-01-01", freq="50s")
            series = Series(stamps)
            values = series._values[::3]
            mismatched = [1, 2]

        elif dtype in ("category[object]", "category[int]"):
            # Note: sizes are different in this case than in the others.
            np.random.seed(1234)

            population = 500_000
            sample_size = 100

            raw = list(np.random.randint(0, population // 10, size=population))
            if dtype == "category[object]":
                raw = [f"s{i:04d}" for i in raw]

            values = np.random.choice(raw, sample_size)
            series = Series(raw).astype("category")

        else:
            series = Series(np.random.randint(1, 10, n_rows)).astype(dtype)
            values = [1, 2]

        self.mismatched = mismatched
        self.series = series
        self.values = values
        self.cat_values = Categorical(values)

    def time_isin(self, dtype):
        """Time a lookup against the plain ``values`` collection."""
        self.series.isin(self.values)

    def time_isin_categorical(self, dtype):
        """Time a lookup against the ``Categorical`` wrapping ``values``."""
        self.series.isin(self.cat_values)

    def time_isin_empty(self, dtype):
        """Time a lookup against an empty list."""
        self.series.isin([])

    def time_isin_mismatched_dtype(self, dtype):
        """Time a lookup against the ``mismatched`` list."""
        self.series.isin(self.mismatched)


class IsinAlmostFullWithRandomInt:
    """Benchmark ``Series.isin`` with random integers for sizes 3 * 2**(e - 2).

    The series and the lookup values each hold ``M`` integers drawn from
    ``[0, M)``.  With ``title == "outside"`` the lookup values are shifted
    up by ``M`` so that none of them lies in the series' value range.
    """

    params = [
        [np.float64, np.int64, np.uint64, np.object_],
        range(10, 21),
        ["inside", "outside"],
    ]
    param_names = ["dtype", "exponent", "title"]

    def setup(self, dtype, exponent, title):
        """Seed the RNG and create ``series`` and ``values``."""
        size = 3 * 2 ** (exponent - 2)
        # Original author's note: 0.77 - the maximal share of occupied buckets
        np.random.seed(42)
        self.series = Series(np.random.randint(0, size, size)).astype(dtype)

        candidates = np.random.randint(0, size, size).astype(dtype)
        if title == "outside":
            candidates = candidates + size
        elif title != "inside":
            raise ValueError(title)
        self.values = candidates

    def time_isin(self, dtype, exponent, title):
        """Time the lookup of ``values`` in ``series``."""
        self.series.isin(self.values)


class IsinWithRandomFloat:
    """Benchmark ``Series.isin`` on random floats for several sizes.

    The series is built from ``size`` random floats; the lookup values are
    the same floats in shuffled order.  With ``title == "outside"`` every
    lookup value is shifted by 0.1.
    """

    params = [
        # ``np.object_`` rather than the ``np.object`` alias, which newer
        # NumPy releases no longer provide.
        [np.float64, np.object_],
        [
            1_300,
            2_000,
            7_000,
            8_000,
            70_000,
            80_000,
            750_000,
            900_000,
        ],
        ["inside", "outside"],
    ]
    param_names = ["dtype", "size", "title"]

    def setup(self, dtype, size, title):
        """Seed the RNG and create ``series`` and the shuffled ``values``."""
        np.random.seed(42)
        self.values = np.random.rand(size)
        self.series = Series(self.values).astype(dtype)
        # Shuffle in place so the lookup order differs from the series order.
        np.random.shuffle(self.values)

        if title == "outside":
            self.values = self.values + 0.1

    def time_isin(self, dtype, size, title):
        """Time the lookup of ``values`` in ``series``."""
        self.series.isin(self.values)


class IsinWithArangeSorted:
    """Benchmark ``Series.isin`` where series and values are both ``arange(size)``."""

    params = [
        # ``np.object_`` rather than the ``np.object`` alias, which newer
        # NumPy releases no longer provide.
        [np.float64, np.int64, np.uint64, np.object_],
        [
            1_000,
            2_000,
            8_000,
            100_000,
            1_000_000,
        ],
    ]
    param_names = ["dtype", "size"]

    def setup(self, dtype, size):
        """Create ``series`` and ``values`` holding 0..size-1 cast to ``dtype``."""
        self.series = Series(np.arange(size)).astype(dtype)
        self.values = np.arange(size).astype(dtype)

    def time_isin(self, dtype, size):
        """Time the lookup of ``values`` in ``series``."""
        self.series.isin(self.values)


class IsinWithArange:
    """Benchmark ``Series.isin`` of a long random series against ``arange(M)``.

    The series holds 10**6 random integers drawn from
    ``[offset, M + offset)`` with ``offset = M * offset_factor``.  For the
    factors used here the series range either coincides with the lookup
    range (factor 0) or does not overlap it at all (factors -2 and 2).
    """

    params = [
        # ``np.object_`` rather than the ``np.object`` alias, which newer
        # NumPy releases no longer provide.
        [np.float64, np.int64, np.uint64, np.object_],
        [
            1_000,
            2_000,
            8_000,
        ],
        [-2, 0, 2],
    ]
    param_names = ["dtype", "M", "offset_factor"]

    def setup(self, dtype, M, offset_factor):
        """Seed the RNG and create ``series`` and ``values``."""
        offset = int(M * offset_factor)
        np.random.seed(42)
        tmp = Series(np.random.randint(offset, M + offset, 10 ** 6))
        self.series = tmp.astype(dtype)
        self.values = np.arange(M).astype(dtype)

    def time_isin(self, dtype, M, offset_factor):
        """Time the lookup of ``values`` in ``series``."""
        self.series.isin(self.values)


class IsInFloat64:
    """Benchmark ``Series.isin`` for a two-element series and large float64 values.

    The ``title`` parameter selects the content of the lookup values:
    many distinct numbers, all zeros, or all NaN.
    """

    params = [
        [np.float64, "Float64"],
        ["many_different_values", "few_different_values", "only_nans_values"],
    ]
    param_names = ["dtype", "title"]

    def setup(self, dtype, title):
        """Create the short ``series`` and the ``values`` array chosen by ``title``."""
        many = 10 ** 5
        few = 10 ** 6
        self.series = Series([1, 2], dtype=dtype)

        # In each scenario the runtime is dominated by creation of the
        # lookup-table.  The builders are lazy so that only the requested
        # array is allocated.
        builders = {
            "many_different_values": lambda: np.arange(many, dtype=np.float64),
            "few_different_values": lambda: np.zeros(few, dtype=np.float64),
            "only_nans_values": lambda: np.full(few, np.nan, dtype=np.float64),
        }
        if title not in builders:
            raise ValueError(title)
        self.values = builders[title]()

    def time_isin(self, dtype, title):
        """Time the lookup of ``values`` in ``series``."""
        self.series.isin(self.values)


class IsInForObjects:
    """
    Benchmark ``Series.isin`` on object-dtype data.

    A subset of the cartesian product of cases have special motivations:

    "nans" x "nans"
        if nan-objects are different objects,
        this has the potential to trigger O(n^2) running time

    "short" x "long"
        running time dominated by the preprocessing

    "long" x "short"
        running time dominated by look-up

    "long" x "long"
        no dominating part

    "long_floats" x "long_floats"
        because of nans floats are special
        no dominating part

    """

    variants = ["nans", "short", "long", "long_floats"]

    params = [variants, variants]
    param_names = ["series_type", "vals_type"]

    @staticmethod
    def _make_array(kind):
        """Return the raw array (before the cast to object) for one variant.

        Raises
        ------
        ValueError
            If ``kind`` is not one of the names in ``variants``.
        """
        n_many = 10 ** 5

        if kind == "nans":
            return np.full(10 ** 4, np.nan)
        if kind == "short":
            return np.arange(2)
        if kind == "long":
            return np.arange(n_many)
        if kind == "long_floats":
            # ``np.float64`` rather than the ``np.float_`` alias, which
            # newer NumPy releases no longer provide.
            return np.arange(n_many, dtype=np.float64)
        raise ValueError(kind)

    def setup(self, series_type, vals_type):
        """Create the object-dtype ``series`` and ``values`` for the two variants."""
        self.series = Series(self._make_array(series_type)).astype(object)
        self.values = self._make_array(vals_type).astype(object)

    def time_isin(self, series_type, vals_type):
        """Time the lookup of ``values`` in ``series``."""
        self.series.isin(self.values)


class IsInLongSeriesLookUpDominates:
    """Benchmark ``Series.isin`` with a 10**7-element series and few values.

    The lookup values are ``arange(MaxNumber)``.  ``series_type`` selects
    whether the series elements fall inside that range ("hits") or at or
    above ``MaxNumber`` ("misses"), and whether they are random or monotone.
    """

    params = [
        ["int64", "int32", "float64", "float32", "object", "Int64", "Float64"],
        [5, 1000],
        ["random_hits", "random_misses", "monotone_hits", "monotone_misses"],
    ]
    param_names = ["dtype", "MaxNumber", "series_type"]

    def setup(self, dtype, MaxNumber, series_type):
        """Create the long ``series`` and the short ``values``.

        Raises
        ------
        NotImplementedError
            For the "Int64"/"Float64" dtypes unless ``np_version_under1p20``
            is true.
        ValueError
            If ``series_type`` is not one of the supported names.
        """
        N = 10 ** 7

        if dtype in ("Int64", "Float64") and not np_version_under1p20:
            raise NotImplementedError

        # The branches are mutually exclusive; an unknown name is reported
        # explicitly instead of leaving ``array`` unbound.
        if series_type == "random_hits":
            np.random.seed(42)
            array = np.random.randint(0, MaxNumber, N)
        elif series_type == "random_misses":
            np.random.seed(42)
            array = np.random.randint(0, MaxNumber, N) + MaxNumber
        elif series_type == "monotone_hits":
            array = np.repeat(np.arange(MaxNumber), N // MaxNumber)
        elif series_type == "monotone_misses":
            array = np.arange(N) + MaxNumber
        else:
            raise ValueError(series_type)

        self.series = Series(array).astype(dtype)
        self.values = np.arange(MaxNumber).astype(dtype)

    def time_isin(self, dtypes, MaxNumber, series_type):
        """Time the lookup of ``values`` in ``series``."""
        self.series.isin(self.values)


class IsInLongSeriesValuesDominate:
    """Benchmark ``Series.isin`` where the lookup values outnumber the series.

    The lookup values hold 10**7 elements (random or monotone); the series
    is ``arange(10**6 + 1)``.
    """

    params = [
        ["int64", "int32", "float64", "float32", "object", "Int64", "Float64"],
        ["random", "monotone"],
    ]
    param_names = ["dtype", "series_type"]

    def setup(self, dtype, series_type):
        """Create the large ``values`` and the shorter ``series``.

        Raises
        ------
        ValueError
            If ``series_type`` is neither "random" nor "monotone".
        """
        N = 10 ** 7

        # The branches are mutually exclusive; an unknown name is reported
        # explicitly instead of leaving ``vals`` unbound.
        if series_type == "random":
            np.random.seed(42)
            vals = np.random.randint(0, 10 * N, N)
        elif series_type == "monotone":
            vals = np.arange(N)
        else:
            raise ValueError(series_type)

        self.values = vals.astype(dtype)
        M = 10 ** 6 + 1
        self.series = Series(np.arange(M)).astype(dtype)

    def time_isin(self, dtypes, series_type):
        """Time the lookup of ``values`` in ``series``."""
        self.series.isin(self.values)
19 changes: 0 additions & 19 deletions asv_bench/benchmarks/categoricals.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,25 +220,6 @@ def time_rank_int_cat_ordered(self):
self.s_int_cat_ordered.rank()


class Isin:
    """Benchmark ``Series.isin`` on a categorical series of strings or integers."""

    params = ["object", "int64"]
    param_names = ["dtype"]

    def setup(self, dtype):
        """Seed the RNG, then create the categorical ``series`` and the ``sample``."""
        np.random.seed(1234)
        total = 5 * 10 ** 5
        n_picked = 100
        codes = list(np.random.randint(0, total // 10, size=total))
        # For the object case the integers are rendered as zero-padded strings.
        items = [f"s{i:04d}" for i in codes] if dtype == "object" else codes
        self.sample = np.random.choice(items, n_picked)
        self.series = pd.Series(items).astype("category")

    def time_isin_categorical(self, dtype):
        """Time the lookup of ``sample`` in the categorical ``series``."""
        self.series.isin(self.sample)


class IsMonotonic:
def setup(self):
N = 1000
Expand Down
Loading