Skip to content

Commit 448b36a

Browse files
committed
PERF: fix easily fixed issues in asv benchmarks
- eval.eval_frame_*: fix up local variables
- groupby: range() -> list(range()), and use np.random.choice instead of tm.choice
- plotting: ensure matplotlib Agg backend is used
- packers: fix data file paths
1 parent e083c01 commit 448b36a

File tree

4 files changed, with 37 additions and 181 deletions.

asv_bench/benchmarks/eval.py

Lines changed: 22 additions & 175 deletions
Original file line number | Diff line number | Diff line change
@@ -3,192 +3,36 @@
33
import pandas.computation.expressions as expr
44

55

6-
class eval_frame_add_all_threads(object):
6+
class eval_frame(object):
77
goal_time = 0.2
88

9-
def setup(self):
10-
self.df = DataFrame(np.random.randn(20000, 100))
11-
self.df2 = DataFrame(np.random.randn(20000, 100))
12-
self.df3 = DataFrame(np.random.randn(20000, 100))
13-
self.df4 = DataFrame(np.random.randn(20000, 100))
14-
15-
def time_eval_frame_add_all_threads(self):
16-
pd.eval('df + df2 + df3 + df4')
17-
18-
19-
class eval_frame_add_one_thread(object):
20-
goal_time = 0.2
21-
22-
def setup(self):
23-
self.df = DataFrame(np.random.randn(20000, 100))
24-
self.df2 = DataFrame(np.random.randn(20000, 100))
25-
self.df3 = DataFrame(np.random.randn(20000, 100))
26-
self.df4 = DataFrame(np.random.randn(20000, 100))
27-
expr.set_numexpr_threads(1)
28-
29-
def time_eval_frame_add_one_thread(self):
30-
pd.eval('df + df2 + df3 + df4')
31-
32-
33-
class eval_frame_add_python(object):
34-
goal_time = 0.2
35-
36-
def setup(self):
37-
self.df = DataFrame(np.random.randn(20000, 100))
38-
self.df2 = DataFrame(np.random.randn(20000, 100))
39-
self.df3 = DataFrame(np.random.randn(20000, 100))
40-
self.df4 = DataFrame(np.random.randn(20000, 100))
41-
42-
def time_eval_frame_add_python(self):
43-
pd.eval('df + df2 + df3 + df4', engine='python')
44-
45-
46-
class eval_frame_add_python_one_thread(object):
47-
goal_time = 0.2
48-
49-
def setup(self):
50-
self.df = DataFrame(np.random.randn(20000, 100))
51-
self.df2 = DataFrame(np.random.randn(20000, 100))
52-
self.df3 = DataFrame(np.random.randn(20000, 100))
53-
self.df4 = DataFrame(np.random.randn(20000, 100))
54-
expr.set_numexpr_threads(1)
55-
56-
def time_eval_frame_add_python_one_thread(self):
57-
pd.eval('df + df2 + df3 + df4', engine='python')
58-
59-
60-
class eval_frame_and_all_threads(object):
61-
goal_time = 0.2
62-
63-
def setup(self):
64-
self.df = DataFrame(np.random.randn(20000, 100))
65-
self.df2 = DataFrame(np.random.randn(20000, 100))
66-
self.df3 = DataFrame(np.random.randn(20000, 100))
67-
self.df4 = DataFrame(np.random.randn(20000, 100))
68-
69-
def time_eval_frame_and_all_threads(self):
70-
pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)')
71-
72-
73-
class eval_frame_and_python_one_thread(object):
74-
goal_time = 0.2
75-
76-
def setup(self):
77-
self.df = DataFrame(np.random.randn(20000, 100))
78-
self.df2 = DataFrame(np.random.randn(20000, 100))
79-
self.df3 = DataFrame(np.random.randn(20000, 100))
80-
self.df4 = DataFrame(np.random.randn(20000, 100))
81-
expr.set_numexpr_threads(1)
82-
83-
def time_eval_frame_and_python_one_thread(self):
84-
pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)', engine='python')
85-
86-
87-
class eval_frame_and_python(object):
88-
goal_time = 0.2
89-
90-
def setup(self):
91-
self.df = DataFrame(np.random.randn(20000, 100))
92-
self.df2 = DataFrame(np.random.randn(20000, 100))
93-
self.df3 = DataFrame(np.random.randn(20000, 100))
94-
self.df4 = DataFrame(np.random.randn(20000, 100))
95-
96-
def time_eval_frame_and_python(self):
97-
pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)', engine='python')
98-
99-
100-
class eval_frame_chained_cmp_all_threads(object):
101-
goal_time = 0.2
102-
103-
def setup(self):
104-
self.df = DataFrame(np.random.randn(20000, 100))
105-
self.df2 = DataFrame(np.random.randn(20000, 100))
106-
self.df3 = DataFrame(np.random.randn(20000, 100))
107-
self.df4 = DataFrame(np.random.randn(20000, 100))
108-
109-
def time_eval_frame_chained_cmp_all_threads(self):
110-
pd.eval('df < df2 < df3 < df4')
111-
112-
113-
class eval_frame_chained_cmp_python_one_thread(object):
114-
goal_time = 0.2
9+
params = [['numexpr', 'python'], [1, 'all']]
10+
param_names = ['engine', 'threads']
11511

116-
def setup(self):
117-
self.df = DataFrame(np.random.randn(20000, 100))
118-
self.df2 = DataFrame(np.random.randn(20000, 100))
119-
self.df3 = DataFrame(np.random.randn(20000, 100))
120-
self.df4 = DataFrame(np.random.randn(20000, 100))
121-
expr.set_numexpr_threads(1)
122-
123-
def time_eval_frame_chained_cmp_python_one_thread(self):
124-
pd.eval('df < df2 < df3 < df4', engine='python')
125-
126-
127-
class eval_frame_chained_cmp_python(object):
128-
goal_time = 0.2
129-
130-
def setup(self):
12+
def setup(self, engine, threads):
13113
self.df = DataFrame(np.random.randn(20000, 100))
13214
self.df2 = DataFrame(np.random.randn(20000, 100))
13315
self.df3 = DataFrame(np.random.randn(20000, 100))
13416
self.df4 = DataFrame(np.random.randn(20000, 100))
13517

136-
def time_eval_frame_chained_cmp_python(self):
137-
pd.eval('df < df2 < df3 < df4', engine='python')
18+
if threads == 1:
19+
expr.set_numexpr_threads(1)
13820

21+
def time_add(self, engine, threads):
22+
df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4
23+
pd.eval('df + df2 + df3 + df4', engine=engine)
13924

140-
class eval_frame_mult_all_threads(object):
141-
goal_time = 0.2
25+
def time_and(self, engine, threads):
26+
df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4
27+
pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)', engine=engine)
14228

143-
def setup(self):
144-
self.df = DataFrame(np.random.randn(20000, 100))
145-
self.df2 = DataFrame(np.random.randn(20000, 100))
146-
self.df3 = DataFrame(np.random.randn(20000, 100))
147-
self.df4 = DataFrame(np.random.randn(20000, 100))
148-
149-
def time_eval_frame_mult_all_threads(self):
150-
pd.eval('df * df2 * df3 * df4')
151-
152-
153-
class eval_frame_mult_one_thread(object):
154-
goal_time = 0.2
155-
156-
def setup(self):
157-
self.df = DataFrame(np.random.randn(20000, 100))
158-
self.df2 = DataFrame(np.random.randn(20000, 100))
159-
self.df3 = DataFrame(np.random.randn(20000, 100))
160-
self.df4 = DataFrame(np.random.randn(20000, 100))
161-
expr.set_numexpr_threads(1)
162-
163-
def time_eval_frame_mult_one_thread(self):
164-
pd.eval('df * df2 * df3 * df4')
165-
166-
167-
class eval_frame_mult_python(object):
168-
goal_time = 0.2
169-
170-
def setup(self):
171-
self.df = DataFrame(np.random.randn(20000, 100))
172-
self.df2 = DataFrame(np.random.randn(20000, 100))
173-
self.df3 = DataFrame(np.random.randn(20000, 100))
174-
self.df4 = DataFrame(np.random.randn(20000, 100))
175-
176-
def time_eval_frame_mult_python(self):
177-
pd.eval('df * df2 * df3 * df4', engine='python')
178-
179-
180-
class eval_frame_mult_python_one_thread(object):
181-
goal_time = 0.2
182-
183-
def setup(self):
184-
self.df = DataFrame(np.random.randn(20000, 100))
185-
self.df2 = DataFrame(np.random.randn(20000, 100))
186-
self.df3 = DataFrame(np.random.randn(20000, 100))
187-
self.df4 = DataFrame(np.random.randn(20000, 100))
188-
expr.set_numexpr_threads(1)
29+
def time_chained_cmp(self, engine, threads):
30+
df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4
31+
pd.eval('df < df2 < df3 < df4', engine=engine)
18932

190-
def time_eval_frame_mult_python_one_thread(self):
191-
pd.eval('df * df2 * df3 * df4', engine='python')
33+
def time_mult(self, engine, threads):
34+
df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4
35+
pd.eval('df * df2 * df3 * df4', engine=engine)
19236

19337

19438
class query_datetime_index(object):
@@ -203,6 +47,7 @@ def setup(self):
20347
self.df = DataFrame({'a': np.random.randn(self.N), }, index=self.index)
20448

20549
def time_query_datetime_index(self):
50+
ts = self.ts
20651
self.df.query('index < @ts')
20752

20853

@@ -218,6 +63,7 @@ def setup(self):
21863
self.df = DataFrame({'dates': self.s.values, })
21964

22065
def time_query_datetime_series(self):
66+
ts = self.ts
22167
self.df.query('dates < @ts')
22268

22369

@@ -236,4 +82,5 @@ def setup(self):
23682
self.max_val = self.df['a'].max()
23783

23884
def time_query_with_boolean_selection(self):
239-
self.df.query('(a >= @min_val) & (a <= @max_val)')
85+
min_val, max_val = self.min_val, self.max_val
86+
self.df.query('(a >= @min_val) & (a <= @max_val)')

asv_bench/benchmarks/groupby.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -254,7 +254,7 @@ def setup(self):
254254
self.offsets[(np.random.rand(self.n) > 0.5)] = np.timedelta64('nat')
255255
self.value2 = np.random.randn(self.n)
256256
self.value2[(np.random.rand(self.n) > 0.5)] = np.nan
257-
self.obj = tm.choice(list('ab'), size=self.n).astype(object)
257+
self.obj = np.random.choice(list('ab'), size=self.n).astype(object)
258258
self.obj[(np.random.randn(self.n) > 0.5)] = np.nan
259259
self.df = DataFrame({'key1': np.random.randint(0, 500, size=self.n),
260260
'key2': np.random.randint(0, 100, size=self.n),
@@ -651,7 +651,7 @@ class groupby_sum_multiindex(object):
651651

652652
def setup(self):
653653
self.N = 50
654-
self.df = DataFrame({'A': (range(self.N) * 2), 'B': range((self.N * 2)), 'C': 1, }).set_index(['A', 'B'])
654+
self.df = DataFrame({'A': (list(range(self.N)) * 2), 'B': list(range((self.N * 2))), 'C': 1, }).set_index(['A', 'B'])
655655

656656
def time_groupby_sum_multiindex(self):
657657
self.df.groupby(level=[0, 1]).sum()
@@ -673,9 +673,9 @@ def setup(self):
673673
self.secid_min = int('10000000', 16)
674674
self.secid_max = int('F0000000', 16)
675675
self.step = ((self.secid_max - self.secid_min) // (self.n_securities - 1))
676-
self.security_ids = map((lambda x: hex(x)[2:10].upper()), range(self.secid_min, (self.secid_max + 1), self.step))
676+
self.security_ids = map((lambda x: hex(x)[2:10].upper()), list(range(self.secid_min, (self.secid_max + 1), self.step)))
677677
self.data_index = MultiIndex(levels=[self.dates.values, self.security_ids],
678-
labels=[[i for i in range(self.n_dates) for _ in range(self.n_securities)], (range(self.n_securities) * self.n_dates)],
678+
labels=[[i for i in range(self.n_dates) for _ in range(self.n_securities)], (list(range(self.n_securities)) * self.n_dates)],
679679
names=['date', 'security_id'])
680680
self.n_data = len(self.data_index)
681681
self.columns = Index(['factor{}'.format(i) for i in range(1, (self.n_columns + 1))])

asv_bench/benchmarks/packers.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -321,7 +321,9 @@ def remove(self, f):
321321
class packers_read_sas7bdat(object):
322322

323323
def setup(self):
324-
self.f = 'data/test1.sas7bdat'
324+
self.f = os.path.join(os.path.dirname(__file__), '..', '..',
325+
'pandas', 'io', 'tests', 'sas', 'data',
326+
'test1.sas7bdat')
325327

326328
def time_packers_read_sas7bdat(self):
327329
pd.read_sas(self.f, format='sas7bdat')
@@ -330,7 +332,9 @@ def time_packers_read_sas7bdat(self):
330332
class packers_read_xport(object):
331333

332334
def setup(self):
333-
self.f = 'data/paxraw_d_short.xpt'
335+
self.f = os.path.join(os.path.dirname(__file__), '..', '..',
336+
'pandas', 'io', 'tests', 'sas', 'data',
337+
'paxraw_d_short.xpt')
334338

335339
def time_packers_read_xport(self):
336340
pd.read_sas(self.f, format='xport')

asv_bench/benchmarks/plotting.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -11,17 +11,22 @@ class plot_timeseries_period(object):
1111
goal_time = 0.2
1212

1313
def setup(self):
14+
import matplotlib
15+
matplotlib.use('Agg')
1416
self.N = 2000
1517
self.M = 5
1618
self.df = DataFrame(np.random.randn(self.N, self.M), index=date_range('1/1/1975', periods=self.N))
1719

1820
def time_plot_timeseries_period(self):
1921
self.df.plot()
2022

23+
2124
class plot_andrews_curves(object):
2225
goal_time = 0.6
2326

2427
def setup(self):
28+
import matplotlib
29+
matplotlib.use('Agg')
2530
self.N = 500
2631
self.M = 10
2732
data_dict = {x: np.random.randn(self.N) for x in range(self.M)}

0 commit comments

Comments
 (0)