|
| 1 | +import numpy as np |
| 2 | + |
| 3 | +from pandas.compat.numpy import np_version_under1p20 |
| 4 | + |
| 5 | +from pandas import ( |
| 6 | + Categorical, |
| 7 | + NaT, |
| 8 | + Series, |
| 9 | + date_range, |
| 10 | +) |
| 11 | + |
| 12 | + |
class IsIn:
    """Benchmark ``Series.isin`` across several series dtypes.

    ``setup`` prepares the series under test together with three lookup
    inputs: ``values``, the same data wrapped in a ``Categorical``
    (``cat_values``), and ``mismatched``, used by the mismatched-dtype timing.
    """

    params = [
        "int64",
        "uint64",
        "object",
        "Int64",
        "boolean",
        "bool",
        "datetime64[ns]",
        "category[object]",
        "category[int]",
    ]
    param_names = ["dtype"]

    def setup(self, dtype):
        """Build ``series``, ``values``, ``mismatched`` and ``cat_values``."""
        size = 10000
        bool_dtypes = ("boolean", "bool")
        categorical_dtypes = ("category[object]", "category[int]")

        if dtype == "datetime64[ns]":
            # Note: values here is much larger than in the non-dt64ns cases;
            # the range below has length=115777
            stamps = date_range(start="2015-10-26", end="2016-01-01", freq="50s")
            self.series = Series(stamps)
            self.values = self.series._values[::3]
            # datetime data gets plain integers as the mismatched lookup
            self.mismatched = [1, 2]
        else:
            # every non-datetime case gets NaT datetimes as the mismatched lookup
            self.mismatched = [NaT.to_datetime64()] * 2

            if dtype in bool_dtypes:
                self.series = Series(np.random.randint(0, 2, size)).astype(dtype)
                self.values = [True, False]
            elif dtype in categorical_dtypes:
                # Note: sizes are different in this case than in the others
                np.random.seed(1234)

                population = 5 * 10 ** 5
                n_lookup = 100

                codes = list(np.random.randint(0, population // 10, size=population))
                if dtype == "category[object]":
                    data = [f"s{code:04d}" for code in codes]
                else:
                    data = codes

                self.values = np.random.choice(data, n_lookup)
                self.series = Series(data).astype("category")
            else:
                self.series = Series(np.random.randint(1, 10, size)).astype(dtype)
                self.values = [1, 2]

        self.cat_values = Categorical(self.values)

    def time_isin(self, dtype):
        """Time ``isin`` against ``values``."""
        self.series.isin(self.values)

    def time_isin_categorical(self, dtype):
        """Time ``isin`` against the ``Categorical`` form of ``values``."""
        self.series.isin(self.cat_values)

    def time_isin_empty(self, dtype):
        """Time ``isin`` against an empty list."""
        self.series.isin([])

    def time_isin_mismatched_dtype(self, dtype):
        """Time ``isin`` against the ``mismatched`` lookup."""
        self.series.isin(self.mismatched)
| 77 | + |
| 78 | + |
class IsinAlmostFullWithRandomInt:
    """Benchmark ``Series.isin`` with random integers drawn from ``[0, M)``.

    ``M`` is ``3 * 2 ** (exponent - 2)``. The series and the lookup values
    both hold ``M`` draws; ``title`` selects whether the lookup values stay in
    the series' range ("inside") or are shifted past it ("outside").
    """

    params = [
        [np.float64, np.int64, np.uint64, np.object_],
        range(10, 21),
        ["inside", "outside"],
    ]
    param_names = ["dtype", "exponent", "title"]

    def setup(self, dtype, exponent, title):
        """Create ``series`` and ``values``; raise ``ValueError`` for an unknown title."""
        # 0.77 is the maximal share of occupied buckets (note kept from the
        # original author); the element count is chosen relative to 2**exponent
        count = 3 * 2 ** (exponent - 2)
        np.random.seed(42)
        self.series = Series(np.random.randint(0, count, count)).astype(dtype)

        candidates = np.random.randint(0, count, count).astype(dtype)
        if title not in ("inside", "outside"):
            raise ValueError(title)
        # shifting by the range width moves every candidate out of [0, count)
        self.values = candidates if title == "inside" else candidates + count

    def time_isin(self, dtype, exponent, title):
        """Time ``isin`` against the prepared values."""
        self.series.isin(self.values)
| 103 | + |
| 104 | + |
class IsinWithRandomFloat:
    """Benchmark ``Series.isin`` on random floats in ``[0, 1)``.

    The lookup values are a shuffled copy of the series data ("inside") or
    that data shifted by 0.1 ("outside").
    """

    params = [
        # np.object_ rather than the ``np.object`` alias, which newer NumPy
        # releases no longer provide
        [np.float64, np.object_],
        [
            1_300,
            2_000,
            7_000,
            8_000,
            70_000,
            80_000,
            750_000,
            900_000,
        ],
        ["inside", "outside"],
    ]
    param_names = ["dtype", "size", "title"]

    def setup(self, dtype, size, title):
        """Create ``series`` (cast to ``dtype``) and the float ``values`` array."""
        np.random.seed(42)
        self.values = np.random.rand(size)
        self.series = Series(self.values).astype(dtype)
        # shuffle so the lookup order differs from the series order
        np.random.shuffle(self.values)

        if title == "outside":
            self.values = self.values + 0.1

    def time_isin(self, dtype, size, title):
        """Time ``isin`` against the prepared values."""
        self.series.isin(self.values)
| 133 | + |
| 134 | + |
class IsinWithArangeSorted:
    """Benchmark ``Series.isin`` where series and values are the same ``arange``.

    Both sides hold ``0 .. size - 1`` in increasing order, cast to ``dtype``.
    """

    params = [
        # np.object_ rather than the ``np.object`` alias, which newer NumPy
        # releases no longer provide
        [np.float64, np.int64, np.uint64, np.object_],
        [
            1_000,
            2_000,
            8_000,
            100_000,
            1_000_000,
        ],
    ]
    param_names = ["dtype", "size"]

    def setup(self, dtype, size):
        """Create ``series`` and ``values`` from ``np.arange(size)``."""
        self.series = Series(np.arange(size)).astype(dtype)
        self.values = np.arange(size).astype(dtype)

    def time_isin(self, dtype, size):
        """Time ``isin`` against the prepared values."""
        self.series.isin(self.values)
| 154 | + |
| 155 | + |
class IsinWithArange:
    """Benchmark ``Series.isin`` for a large random series against ``arange(M)``.

    The series holds 10**6 random integers from ``[offset, M + offset)`` with
    ``offset = int(M * offset_factor)``; the lookup values are ``0 .. M - 1``.
    """

    params = [
        # np.object_ rather than the ``np.object`` alias, which newer NumPy
        # releases no longer provide
        [np.float64, np.int64, np.uint64, np.object_],
        [
            1_000,
            2_000,
            8_000,
        ],
        [-2, 0, 2],
    ]
    param_names = ["dtype", "M", "offset_factor"]

    def setup(self, dtype, M, offset_factor):
        """Create the random ``series`` and the ``arange`` lookup ``values``."""
        offset = int(M * offset_factor)
        np.random.seed(42)
        tmp = Series(np.random.randint(offset, M + offset, 10 ** 6))
        self.series = tmp.astype(dtype)
        self.values = np.arange(M).astype(dtype)

    def time_isin(self, dtype, M, offset_factor):
        """Time ``isin`` against the prepared values."""
        self.series.isin(self.values)
| 177 | + |
| 178 | + |
class IsInFloat64:
    """Benchmark ``Series.isin`` for a two-element series and large float lookups.

    ``title`` selects the content of the float64 lookup array: many distinct
    values, a single repeated value, or only NaN.
    """

    params = [
        [np.float64, "Float64"],
        ["many_different_values", "few_different_values", "only_nans_values"],
    ]
    param_names = ["dtype", "title"]

    def setup(self, dtype, title):
        """Create ``series`` and ``values``; raise ``ValueError`` for an unknown title."""
        many = 10 ** 5
        few = 10 ** 6
        self.series = Series([1, 2], dtype=dtype)

        # for each variant the runtime is dominated by creation of the
        # lookup-table (original author's note)
        if title == "many_different_values":
            lookup = np.arange(many, dtype=np.float64)
        elif title == "few_different_values":
            lookup = np.zeros(few, dtype=np.float64)
        elif title == "only_nans_values":
            lookup = np.full(few, np.nan, dtype=np.float64)
        else:
            raise ValueError(title)

        self.values = lookup

    def time_isin(self, dtype, title):
        """Time ``isin`` against the prepared values."""
        self.series.isin(self.values)
| 206 | + |
| 207 | + |
class IsInForObjects:
    """
    A subset of the cartesian product of cases have special motivations:

    "nans" x "nans"
        if nan-objects are different objects,
        this has the potential to trigger O(n^2) running time

    "short" x "long"
        running time dominated by the preprocessing

    "long" x "short"
        running time dominated by look-up

    "long" x "long"
        no dominating part

    "long_floats" x "long_floats"
        because of nans floats are special
        no dominating part

    """

    variants = ["nans", "short", "long", "long_floats"]

    params = [variants, variants]
    param_names = ["series_type", "vals_type"]

    @staticmethod
    def _make_array(variant):
        """Return the NumPy array for one entry of ``variants``.

        Raises ``ValueError`` when ``variant`` is not a known name.
        """
        N_many = 10 ** 5

        if variant == "nans":
            return np.full(10 ** 4, np.nan)
        if variant == "short":
            return np.arange(2)
        if variant == "long":
            return np.arange(N_many)
        if variant == "long_floats":
            # np.float64 instead of the ``np.float_`` alias, which NumPy 2.0
            # no longer provides
            return np.arange(N_many, dtype=np.float64)
        raise ValueError(variant)

    def setup(self, series_type, vals_type):
        """Create object-dtype ``series`` and ``values`` for the given variants."""
        self.series = Series(self._make_array(series_type)).astype(object)
        self.values = self._make_array(vals_type).astype(object)

    def time_isin(self, series_type, vals_type):
        """Time ``isin`` against the prepared values."""
        self.series.isin(self.values)
| 263 | + |
| 264 | + |
class IsInLongSeriesLookUpDominates:
    """Benchmark ``Series.isin`` for a long series against few lookup values.

    The series is built from 10**7-scale integer data (random or monotone,
    hitting or missing the lookup range); the lookup values are
    ``0 .. MaxNumber - 1``.
    """

    params = [
        ["int64", "int32", "float64", "float32", "object", "Int64", "Float64"],
        [5, 1000],
        ["random_hits", "random_misses", "monotone_hits", "monotone_misses"],
    ]
    param_names = ["dtype", "MaxNumber", "series_type"]

    def setup(self, dtype, MaxNumber, series_type):
        """Create ``series`` and ``values``.

        Raises ``NotImplementedError`` for the "Int64"/"Float64" dtypes when
        ``np_version_under1p20`` is false.
        """
        masked_dtypes = ("Int64", "Float64")
        if dtype in masked_dtypes and not np_version_under1p20:
            raise NotImplementedError

        length = 10 ** 7

        if series_type in ("random_hits", "random_misses"):
            np.random.seed(42)
            array = np.random.randint(0, MaxNumber, length)
            if series_type == "random_misses":
                # move every draw past the lookup range [0, MaxNumber)
                array = array + MaxNumber
        elif series_type == "monotone_hits":
            array = np.repeat(np.arange(MaxNumber), length // MaxNumber)
        elif series_type == "monotone_misses":
            array = np.arange(length) + MaxNumber

        self.series = Series(array).astype(dtype)
        self.values = np.arange(MaxNumber).astype(dtype)

    def time_isin(self, dtypes, MaxNumber, series_type):
        """Time ``isin`` against the prepared values."""
        self.series.isin(self.values)
| 295 | + |
| 296 | + |
class IsInLongSeriesValuesDominate:
    """Benchmark ``Series.isin`` where the lookup values outnumber the series.

    The lookup array has 10**7 entries (random or monotone); the series holds
    ``0 .. 10**6``.
    """

    params = [
        ["int64", "int32", "float64", "float32", "object", "Int64", "Float64"],
        ["random", "monotone"],
    ]
    param_names = ["dtype", "series_type"]

    def setup(self, dtype, series_type):
        """Create the large lookup ``values`` and the shorter ``series``."""
        n_values = 10 ** 7
        n_series = 10 ** 6 + 1

        if series_type == "random":
            np.random.seed(42)
            vals = np.random.randint(0, 10 * n_values, n_values)
        elif series_type == "monotone":
            vals = np.arange(n_values)

        self.values = vals.astype(dtype)
        self.series = Series(np.arange(n_series)).astype(dtype)

    def time_isin(self, dtypes, series_type):
        """Time ``isin`` against the prepared values."""
        self.series.isin(self.values)
0 commit comments