Skip to content

Commit 2212ff6

Browse files
committed
Merge remote-tracking branch 'upstream/master' into css_multiple_selectors
2 parents 0db44b8 + 8837b36 commit 2212ff6

File tree

192 files changed

+8226
-7086
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

192 files changed

+8226
-7086
lines changed

.github/workflows/ci.yml

+5
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,12 @@ jobs:
153153
run: |
154154
source activate pandas-dev
155155
pytest pandas/tests/frame/methods --array-manager
156+
pytest pandas/tests/frame/test_reductions.py --array-manager
157+
pytest pandas/tests/reductions/ --array-manager
158+
pytest pandas/tests/generic/test_generic.py --array-manager
156159
pytest pandas/tests/arithmetic/ --array-manager
160+
pytest pandas/tests/groupby/aggregate/ --array-manager
161+
pytest pandas/tests/reshape/merge --array-manager
157162
158163
# indexing subset (temporary since other tests don't pass yet)
159164
pytest pandas/tests/frame/indexing/test_indexing.py::TestDataFrameIndexing::test_setitem_boolean --array-manager

.pre-commit-config.yaml

+5
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,11 @@ repos:
163163
entry: np\.bool[^_8]
164164
language: pygrep
165165
types_or: [python, cython, rst]
166+
- id: np-object
167+
name: Check for use of np.object instead of np.object_
168+
entry: np\.object[^_8]
169+
language: pygrep
170+
types_or: [python, cython, rst]
166171
- id: no-os-remove
167172
name: Check code for instances of os.remove
168173
entry: os\.remove

Makefile

-13
Original file line numberDiff line numberDiff line change
@@ -25,16 +25,3 @@ doc:
2525
cd doc; \
2626
python make.py clean; \
2727
python make.py html
28-
29-
check:
30-
python3 scripts/validate_unwanted_patterns.py \
31-
--validation-type="private_function_across_module" \
32-
--included-file-extensions="py" \
33-
--excluded-file-paths=pandas/tests,asv_bench/ \
34-
pandas/
35-
36-
python3 scripts/validate_unwanted_patterns.py \
37-
--validation-type="private_import_across_module" \
38-
--included-file-extensions="py" \
39-
--excluded-file-paths=pandas/tests,asv_bench/,doc/
40-
pandas/
+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
"""
2+
The algos/ directory is intended for benchmarks of individual functions from core.algorithms
3+
4+
In many cases these algorithms are reachable in multiple ways:
5+
algos.foo(x, y)
6+
Series(x).foo(y)
7+
Index(x).foo(y)
8+
pd.array(x).foo(y)
9+
10+
In most cases we profile the Series variant directly, trusting the performance
11+
of the others to be highly correlated.
12+
"""

asv_bench/benchmarks/algos/isin.py

+317
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,317 @@
1+
import numpy as np
2+
3+
from pandas.compat.numpy import np_version_under1p20
4+
5+
from pandas import (
6+
Categorical,
7+
NaT,
8+
Series,
9+
date_range,
10+
)
11+
12+
13+
class IsIn:
    """Time ``Series.isin`` for one Series dtype per parametrization."""

    params = [
        "int64",
        "uint64",
        "object",
        "Int64",
        "boolean",
        "bool",
        "datetime64[ns]",
        "category[object]",
        "category[int]",
    ]
    param_names = ["dtype"]

    def setup(self, dtype):
        """Create ``series``, ``values``, ``cat_values`` and ``mismatched``."""
        n_rows = 10000

        # Lookup values whose dtype differs from the Series; the datetime
        # branch replaces them with plain integers.
        self.mismatched = [NaT.to_datetime64()] * 2

        if dtype == "datetime64[ns]":
            # Note: the lookup values are much larger here than in the
            # non-dt64ns cases; the range below has length=115777.
            stamps = date_range(start="2015-10-26", end="2016-01-01", freq="50s")
            self.series = Series(stamps)
            self.values = self.series._values[::3]
            self.mismatched = [1, 2]

        elif dtype in ("category[object]", "category[int]"):
            # Note: sizes are different in this case than in the others.
            np.random.seed(1234)

            population = 5 * 10 ** 5
            n_lookup = 100

            raw = list(np.random.randint(0, population // 10, size=population))
            if dtype == "category[object]":
                raw = [f"s{i:04d}" for i in raw]

            self.values = np.random.choice(raw, n_lookup)
            self.series = Series(raw).astype("category")

        elif dtype in ("boolean", "bool"):
            flags = np.random.randint(0, 2, n_rows)
            self.series = Series(flags).astype(dtype)
            self.values = [True, False]

        else:
            digits = np.random.randint(1, 10, n_rows)
            self.series = Series(digits).astype(dtype)
            self.values = [1, 2]

        # Same lookup values, wrapped as a Categorical.
        self.cat_values = Categorical(self.values)

    def time_isin(self, dtype):
        self.series.isin(self.values)

    def time_isin_categorical(self, dtype):
        self.series.isin(self.cat_values)

    def time_isin_empty(self, dtype):
        self.series.isin([])

    def time_isin_mismatched_dtype(self, dtype):
        self.series.isin(self.mismatched)
78+
79+
class IsinAlmostFullWithRandomInt:
    """Time ``isin`` with M random integers in both the Series and the lookup."""

    params = [
        [np.float64, np.int64, np.uint64, np.object_],
        range(10, 21),
        ["inside", "outside"],
    ]
    param_names = ["dtype", "exponent", "title"]

    def setup(self, dtype, exponent, title):
        """
        Draw the Series and the lookup values from ``[0, M)``.

        With ``title == "outside"`` the lookup values are shifted by ``M`` so
        none of them fall in the range the Series was drawn from.
        """
        # M is three quarters of 2**exponent
        # (0.77-the maximal share of occupied buckets)
        M = 3 * 2 ** (exponent - 2)
        np.random.seed(42)
        self.series = Series(np.random.randint(0, M, M)).astype(dtype)

        candidates = np.random.randint(0, M, M).astype(dtype)
        if title not in ("inside", "outside"):
            raise ValueError(title)
        self.values = candidates + M if title == "outside" else candidates

    def time_isin(self, dtype, exponent, title):
        self.series.isin(self.values)
103+
104+
105+
class IsinWithRandomFloat:
    """Time ``isin`` where Series and lookup hold the same random floats."""

    params = [
        [np.float64, np.object_],
        [
            1_300,
            2_000,
            7_000,
            8_000,
            70_000,
            80_000,
            750_000,
            900_000,
        ],
        ["inside", "outside"],
    ]
    param_names = ["dtype", "size", "title"]

    def setup(self, dtype, size, title):
        """
        Build the Series from ``size`` random floats, then shuffle those floats
        to use as lookup values; "outside" adds 0.1 to every lookup value.
        """
        np.random.seed(42)
        sample = np.random.rand(size)
        self.series = Series(sample).astype(dtype)
        # Shuffle only after the Series has been built from the sample.
        np.random.shuffle(sample)

        self.values = sample + 0.1 if title == "outside" else sample

    def time_isin(self, dtype, size, title):
        self.series.isin(self.values)
133+
134+
135+
class IsinWithArangeSorted:
    """Time ``isin`` where Series and lookup are the same ascending range."""

    params = [
        [np.float64, np.int64, np.uint64, np.object_],
        [
            1_000,
            2_000,
            8_000,
            100_000,
            1_000_000,
        ],
    ]
    param_names = ["dtype", "size"]

    def setup(self, dtype, size):
        """Store ``0..size-1`` cast to ``dtype`` as both lookup and Series."""
        lookup = np.arange(size)
        self.values = lookup.astype(dtype)
        haystack = Series(np.arange(size))
        self.series = haystack.astype(dtype)

    def time_isin(self, dtype, size):
        self.series.isin(self.values)
154+
155+
156+
class IsinWithArange:
    """Time ``isin`` of a long random Series against the range ``0..M-1``."""

    params = [
        [np.float64, np.int64, np.uint64, np.object_],
        [
            1_000,
            2_000,
            8_000,
        ],
        [-2, 0, 2],
    ]
    param_names = ["dtype", "M", "offset_factor"]

    def setup(self, dtype, M, offset_factor):
        """
        Draw 10**6 integers from ``[offset, offset + M)`` for the Series, where
        ``offset = int(M * offset_factor)``; the lookup is always ``arange(M)``.
        """
        low = int(M * offset_factor)
        np.random.seed(42)
        draws = np.random.randint(low, low + M, 10 ** 6)
        self.series = Series(draws).astype(dtype)
        self.values = np.arange(M).astype(dtype)

    def time_isin(self, dtype, M, offset_factor):
        self.series.isin(self.values)
177+
178+
179+
class IsInFloat64:
    """Time ``isin`` of a two-element float Series against large lookups."""

    params = [
        [np.float64, "Float64"],
        ["many_different_values", "few_different_values", "only_nans_values"],
    ]
    param_names = ["dtype", "title"]

    def setup(self, dtype, title):
        """Build a tiny Series and the float64 lookup array named by ``title``."""
        n_many = 10 ** 5
        n_few = 10 ** 6
        self.series = Series([1, 2], dtype=dtype)

        # In every case the runtime is dominated by creation of the
        # lookup-table. Factories are used so only the requested array is built.
        builders = {
            "many_different_values": lambda: np.arange(n_many, dtype=np.float64),
            "few_different_values": lambda: np.zeros(n_few, dtype=np.float64),
            "only_nans_values": lambda: np.full(n_few, np.nan, dtype=np.float64),
        }
        if title not in builders:
            raise ValueError(title)
        self.values = builders[title]()

    def time_isin(self, dtype, title):
        self.series.isin(self.values)
206+
207+
208+
class IsInForObjects:
    """
    Time ``Series.isin`` on object-dtype data.

    A subset of the cartesian product of cases have special motivations:

    "nans" x "nans"
        if nan-objects are different objects,
        this has the potential to trigger O(n^2) running time

    "short" x "long"
        running time dominated by the preprocessing

    "long" x "short"
        running time dominated by look-up

    "long" x "long"
        no dominating part

    "long_floats" x "long_floats"
        because of nans floats are special
        no dominating part

    """

    variants = ["nans", "short", "long", "long_floats"]

    params = [variants, variants]
    param_names = ["series_type", "vals_type"]

    @staticmethod
    def _make_array(kind):
        """
        Return the raw array (before the cast to object) for one variant.

        Raises
        ------
        ValueError
            If ``kind`` is not one of ``variants``.
        """
        n_many = 10 ** 5

        if kind == "nans":
            return np.full(10 ** 4, np.nan)
        if kind == "short":
            return np.arange(2)
        if kind == "long":
            return np.arange(n_many)
        if kind == "long_floats":
            # np.float64, not the np.float_ alias: NumPy 2.0 removed the alias.
            return np.arange(n_many, dtype=np.float64)
        raise ValueError(kind)

    def setup(self, series_type, vals_type):
        """Build the object-dtype Series and the object-dtype lookup array."""
        self.series = Series(self._make_array(series_type)).astype(object)
        self.values = self._make_array(vals_type).astype(object)

    def time_isin(self, series_type, vals_type):
        self.series.isin(self.values)
263+
264+
265+
class IsInLongSeriesLookUpDominates:
    """Time ``isin`` of a 10**7-row Series against ``MaxNumber`` lookup values."""

    params = [
        ["int64", "int32", "float64", "float32", "object", "Int64", "Float64"],
        [5, 1000],
        ["random_hits", "random_misses", "monotone_hits", "monotone_misses"],
    ]
    param_names = ["dtype", "MaxNumber", "series_type"]

    def setup(self, dtype, MaxNumber, series_type):
        """
        Build the long Series selected by ``series_type`` and the lookup
        ``arange(MaxNumber)``, both cast to ``dtype``.

        Raises NotImplementedError for the masked dtypes unless
        ``np_version_under1p20`` is true.
        """
        n_rows = 10 ** 7

        if not np_version_under1p20 and dtype in ("Int64", "Float64"):
            raise NotImplementedError

        if series_type == "random_hits":
            np.random.seed(42)
            array = np.random.randint(0, MaxNumber, n_rows)
        elif series_type == "random_misses":
            np.random.seed(42)
            # shifted past the lookup range
            array = np.random.randint(0, MaxNumber, n_rows) + MaxNumber
        elif series_type == "monotone_hits":
            array = np.repeat(np.arange(MaxNumber), n_rows // MaxNumber)
        elif series_type == "monotone_misses":
            array = np.arange(n_rows) + MaxNumber

        self.series = Series(array).astype(dtype)
        self.values = np.arange(MaxNumber).astype(dtype)

    def time_isin(self, dtypes, MaxNumber, series_type):
        self.series.isin(self.values)
295+
296+
297+
class IsInLongSeriesValuesDominate:
    """Time ``isin`` where the lookup (10**7 values) outsizes the Series."""

    params = [
        ["int64", "int32", "float64", "float32", "object", "Int64", "Float64"],
        ["random", "monotone"],
    ]
    param_names = ["dtype", "series_type"]

    def setup(self, dtype, series_type):
        """
        Build 10**7 lookup values (random or ascending) and a Series holding
        ``0..10**6``, both cast to ``dtype``.
        """
        n_lookup = 10 ** 7
        if series_type == "random":
            np.random.seed(42)
            vals = np.random.randint(0, 10 * n_lookup, n_lookup)
        elif series_type == "monotone":
            vals = np.arange(n_lookup)

        self.values = vals.astype(dtype)
        n_series = 10 ** 6 + 1
        self.series = Series(np.arange(n_series)).astype(dtype)

    def time_isin(self, dtypes, series_type):
        self.series.isin(self.values)

asv_bench/benchmarks/categoricals.py

-19
Original file line numberDiff line numberDiff line change
@@ -220,25 +220,6 @@ def time_rank_int_cat_ordered(self):
220220
self.s_int_cat_ordered.rank()
221221

222222

223-
class Isin:
    """Time ``isin`` on a categorical Series of strings or integers."""

    params = ["object", "int64"]
    param_names = ["dtype"]

    def setup(self, dtype):
        """Build a 5 * 10**5-row categorical Series and a 100-element sample."""
        np.random.seed(1234)
        n_rows = 5 * 10 ** 5
        n_lookup = 100
        codes = list(np.random.randint(0, n_rows // 10, size=n_rows))
        # Object variant: zero-padded string labels instead of integers.
        arr = [f"s{i:04d}" for i in codes] if dtype == "object" else codes
        self.sample = np.random.choice(arr, n_lookup)
        self.series = pd.Series(arr).astype("category")

    def time_isin_categorical(self, dtype):
        self.series.isin(self.sample)
240-
241-
242223
class IsMonotonic:
243224
def setup(self):
244225
N = 1000

0 commit comments

Comments
 (0)