
Commit fb7af6e

Committed Mar 22, 2017
CLN: move groupby algos separate cython lib
- separate out groupby algorithms to separate lib
- release GIL on median
- release GIL on is_lexsorted / fix memory leak
- release GIL on nancorr

Author: Jeff Reback <[email protected]>

Closes #15775 from jreback/groupby and squashes the following commits:

4e2bfec [Jeff Reback] release GIL on median; release GIL on is_lexsorted / fix memory leak; release GIL on nancorr
ce28bb5 [Jeff Reback] CLN: separate out groupby algorithms to separate lib
1 parent 2a3b05a commit fb7af6e
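In practical terms, the grouped-aggregation kernels now live in pandas._libs.groupby rather than pandas._libs.algos, as the per-file diffs below show. A minimal, hedged check of the new layout (kernel names taken from the diffs in this commit):

    from pandas._libs import groupby as libgroupby

    # after this commit the groupby kernels are attributes of the new module
    assert hasattr(libgroupby, "group_median_float64")
    assert hasattr(libgroupby, "group_shift_indexer")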

File tree

9 files changed: +474 −422 lines

 

pandas/_libs/algos.pxd

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+from util cimport numeric
+from numpy cimport float64_t, double_t
+
+cpdef numeric kth_smallest(numeric[:] a, Py_ssize_t k) nogil
+
+cdef inline Py_ssize_t swap(numeric *a, numeric *b) nogil:
+    cdef numeric t
+
+    # cython doesn't allow pointer dereference so use array syntax
+    t = a[0]
+    a[0] = b[0]
+    b[0] = t
+    return 0
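Because kth_smallest is declared cpdef, the same kernel that groupby.pyx cimports without the GIL also remains callable from Python. A minimal sketch with hypothetical values, assuming the float64 specialization of the fused type:

    import numpy as np
    from pandas._libs import algos as libalgos

    arr = np.array([9.0, 1.0, 5.0, 3.0], dtype=np.float64)
    # partially sorts the buffer in place and returns the k-th smallest
    # element, 0-indexed, so k=2 yields the third-smallest value
    assert libalgos.kth_smallest(arr, 2) == 5.0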

pandas/_libs/algos.pyx

Lines changed: 135 additions & 395 deletions
Large diffs are not rendered by default.

pandas/_libs/groupby.pyx

Lines changed: 291 additions & 0 deletions
@@ -0,0 +1,291 @@
+# cython: profile=False
+
+from numpy cimport *
+cimport numpy as np
+import numpy as np
+
+cimport cython
+
+import_array()
+
+cimport util
+
+from numpy cimport (int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t,
+                    uint32_t, uint64_t, float16_t, float32_t, float64_t)
+
+from libc.stdlib cimport malloc, free
+
+from util cimport numeric, get_nat
+from algos cimport swap
+from algos import take_2d_axis1_float64_float64, groupsort_indexer
+
+cdef int64_t iNaT = get_nat()
+
+cdef double NaN = <double> np.NaN
+cdef double nan = NaN
+
+
+# TODO: aggregate multiple columns in single pass
+#----------------------------------------------------------------------
+# first, nth, last
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_nth_object(ndarray[object, ndim=2] out,
+                     ndarray[int64_t] counts,
+                     ndarray[object, ndim=2] values,
+                     ndarray[int64_t] labels,
+                     int64_t rank):
+    """
+    Only aggregates on axis=0
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, lab
+        object val
+        float64_t count
+        ndarray[int64_t, ndim=2] nobs
+        ndarray[object, ndim=2] resx
+
+    nobs = np.zeros((<object> out).shape, dtype=np.int64)
+    resx = np.empty((<object> out).shape, dtype=object)
+
+    N, K = (<object> values).shape
+
+    for i in range(N):
+        lab = labels[i]
+        if lab < 0:
+            continue
+
+        counts[lab] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            if val == val:
+                nobs[lab, j] += 1
+                if nobs[lab, j] == rank:
+                    resx[lab, j] = val
+
+    for i in range(len(counts)):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = <object> nan
+            else:
+                out[i, j] = resx[i, j]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_nth_bin_object(ndarray[object, ndim=2] out,
+                         ndarray[int64_t] counts,
+                         ndarray[object, ndim=2] values,
+                         ndarray[int64_t] bins, int64_t rank):
+    """
+    Only aggregates on axis=0
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, ngroups, b
+        object val
+        float64_t count
+        ndarray[object, ndim=2] resx
+        ndarray[float64_t, ndim=2] nobs
+
+    nobs = np.zeros((<object> out).shape, dtype=np.float64)
+    resx = np.empty((<object> out).shape, dtype=object)
+
+    if len(bins) == 0:
+        return
+    if bins[len(bins) - 1] == len(values):
+        ngroups = len(bins)
+    else:
+        ngroups = len(bins) + 1
+
+    N, K = (<object> values).shape
+
+    b = 0
+    for i in range(N):
+        while b < ngroups - 1 and i >= bins[b]:
+            b += 1
+
+        counts[b] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            if val == val:
+                nobs[b, j] += 1
+                if nobs[b, j] == rank:
+                    resx[b, j] = val
+
+    for i in range(ngroups):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = resx[i, j]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_last_object(ndarray[object, ndim=2] out,
+                      ndarray[int64_t] counts,
+                      ndarray[object, ndim=2] values,
+                      ndarray[int64_t] labels):
+    """
+    Only aggregates on axis=0
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, lab
+        object val
+        float64_t count
+        ndarray[object, ndim=2] resx
+        ndarray[int64_t, ndim=2] nobs
+
+    nobs = np.zeros((<object> out).shape, dtype=np.int64)
+    resx = np.empty((<object> out).shape, dtype=object)
+
+    N, K = (<object> values).shape
+
+    for i in range(N):
+        lab = labels[i]
+        if lab < 0:
+            continue
+
+        counts[lab] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            if val == val:
+                nobs[lab, j] += 1
+                resx[lab, j] = val
+
+    for i in range(len(counts)):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = resx[i, j]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_last_bin_object(ndarray[object, ndim=2] out,
+                          ndarray[int64_t] counts,
+                          ndarray[object, ndim=2] values,
+                          ndarray[int64_t] bins):
+    """
+    Only aggregates on axis=0
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, ngroups, b
+        object val
+        float64_t count
+        ndarray[object, ndim=2] resx
+        ndarray[float64_t, ndim=2] nobs
+
+    nobs = np.zeros((<object> out).shape, dtype=np.float64)
+    resx = np.empty((<object> out).shape, dtype=object)
+
+    if len(bins) == 0:
+        return
+    if bins[len(bins) - 1] == len(values):
+        ngroups = len(bins)
+    else:
+        ngroups = len(bins) + 1
+
+    N, K = (<object> values).shape
+
+    b = 0
+    for i in range(N):
+        while b < ngroups - 1 and i >= bins[b]:
+            b += 1
+
+        counts[b] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            if val == val:
+                nobs[b, j] += 1
+                resx[b, j] = val
+
+    for i in range(ngroups):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = resx[i, j]
+
+
+cdef inline float64_t _median_linear(float64_t* a, int n) nogil:
+    cdef int i, j, na_count = 0
+    cdef float64_t result
+    cdef float64_t* tmp
+
+    if n == 0:
+        return NaN
+
+    # count NAs
+    for i in range(n):
+        if a[i] != a[i]:
+            na_count += 1
+
+    if na_count:
+        if na_count == n:
+            return NaN
+
+        tmp = <float64_t*> malloc((n - na_count) * sizeof(float64_t))
+
+        j = 0
+        for i in range(n):
+            if a[i] == a[i]:
+                tmp[j] = a[i]
+                j += 1
+
+        a = tmp
+        n -= na_count
+
+    if n % 2:
+        result = kth_smallest_c( a, n / 2, n)
+    else:
+        result = (kth_smallest_c(a, n / 2, n) +
+                  kth_smallest_c(a, n / 2 - 1, n)) / 2
+
+    if na_count:
+        free(a)
+
+    return result
+
+
+cdef inline float64_t kth_smallest_c(float64_t* a,
+                                     Py_ssize_t k,
+                                     Py_ssize_t n) nogil:
+    cdef:
+        Py_ssize_t i, j, l, m
+        double_t x, t
+
+    l = 0
+    m = n - 1
+    while (l < m):
+        x = a[k]
+        i = l
+        j = m
+
+        while 1:
+            while a[i] < x: i += 1
+            while x < a[j]: j -= 1
+            if i <= j:
+                swap(&a[i], &a[j])
+                i += 1; j -= 1
+
+            if i > j: break
+
+        if j < k: l = i
+        if k < i: m = j
+    return a[k]
+
+
+# generated from template
+include "groupby_helper.pxi"

pandas/_libs/algos_groupby_helper.pxi.in renamed to pandas/_libs/groupby_helper.pxi.in

Lines changed: 11 additions & 7 deletions
@@ -681,6 +681,8 @@ def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
 #----------------------------------------------------------------------
 
 
+@cython.boundscheck(False)
+@cython.wraparound(False)
 def group_median_float64(ndarray[float64_t, ndim=2] out,
                          ndarray[int64_t] counts,
                          ndarray[float64_t, ndim=2] values,
@@ -704,13 +706,15 @@ def group_median_float64(ndarray[float64_t, ndim=2] out,
 
     take_2d_axis1_float64_float64(values.T, indexer, out=data)
 
-    for i in range(K):
-        # exclude NA group
-        ptr += _counts[0]
-        for j in range(ngroups):
-            size = _counts[j + 1]
-            out[j, i] = _median_linear(ptr, size)
-            ptr += size
+    with nogil:
+
+        for i in range(K):
+            # exclude NA group
+            ptr += _counts[0]
+            for j in range(ngroups):
+                size = _counts[j + 1]
+                out[j, i] = _median_linear(ptr, size)
+                ptr += size
 
 
 @cython.boundscheck(False)
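The practical payoff of wrapping the loop in nogil is that the median kernel no longer holds the GIL while it works, so concurrent grouped medians can overlap when driven from threads. A rough, hedged illustration using only public pandas API (a sketch, not a benchmark):

    from concurrent.futures import ThreadPoolExecutor
    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"key": np.random.randint(0, 100, 1_000_000),
                       "val": np.random.randn(1_000_000)})

    def grouped_median(frame):
        return frame.groupby("key")["val"].median()

    # group_median_float64 releases the GIL in its inner loop after this
    # commit, so these calls can make progress in parallel
    with ThreadPoolExecutor(max_workers=4) as pool:
        results = list(pool.map(grouped_median, [df] * 4))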

pandas/core/groupby.py

Lines changed: 5 additions & 5 deletions
@@ -60,7 +60,7 @@
 import pandas.core.common as com
 from pandas.core.config import option_context
 
-from pandas._libs import lib, algos as libalgos, Timestamp, NaT, iNaT
+from pandas._libs import lib, groupby as libgroupby, Timestamp, NaT, iNaT
 from pandas._libs.lib import count_level_2d
 
 _doc_template = """
@@ -1474,7 +1474,7 @@ def shift(self, periods=1, freq=None, axis=0):
 
         # filled in by Cython
         indexer = np.zeros_like(labels)
-        libalgos.group_shift_indexer(indexer, labels, ngroups, periods)
+        libgroupby.group_shift_indexer(indexer, labels, ngroups, periods)
 
         output = {}
         for name, obj in self._iterate_slices():
@@ -1815,13 +1815,13 @@ def _get_cython_function(self, kind, how, values, is_numeric):
         def get_func(fname):
             # see if there is a fused-type version of function
             # only valid for numeric
-            f = getattr(libalgos, fname, None)
+            f = getattr(libgroupby, fname, None)
             if f is not None and is_numeric:
                 return f
 
             # otherwise find dtype-specific version, falling back to object
             for dt in [dtype_str, 'object']:
-                f = getattr(libalgos, "%s_%s" % (fname, dtype_str), None)
+                f = getattr(libgroupby, "%s_%s" % (fname, dtype_str), None)
                 if f is not None:
                     return f
 
@@ -3118,7 +3118,7 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
             out = _ensure_int64(out)
             return Series(out, index=mi, name=self.name)
 
-        # for compat. with libalgos.value_counts need to ensure every
+        # for compat. with libgroupby.value_counts need to ensure every
         # bin is present at every index level, null filled with zeros
         diff = np.zeros(len(out), dtype='bool')
         for lab in labels[:-1]:
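For reference, the lookup order in get_func above tries the bare (fused-type) kernel name first and then a dtype-suffixed variant. A hypothetical restating of that order outside pandas internals, using the relocated module (lookup_group_func is an illustrative helper, not pandas code):

    from pandas._libs import groupby as libgroupby

    def lookup_group_func(fname, dtype_str):
        # prefer a fused-type kernel; fall back to a dtype-specific one
        func = getattr(libgroupby, fname, None)
        if func is not None:
            return func
        return getattr(libgroupby, "%s_%s" % (fname, dtype_str), None)

    # there is no bare 'group_median', so this resolves to group_median_float64
    func = lookup_group_func("group_median", "float64")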

pandas/tests/groupby/test_bin_groupby.py

Lines changed: 2 additions & 3 deletions
@@ -7,8 +7,7 @@
 from pandas import Index, isnull
 from pandas.util.testing import assert_almost_equal
 import pandas.util.testing as tm
-import pandas._libs.lib as lib
-import pandas._libs.algos as algos
+from pandas._libs import lib, groupby
 
 
 def test_series_grouper():
@@ -92,7 +91,7 @@ def _check(dtype):
         labels = _ensure_int64(np.repeat(np.arange(3),
                                          np.diff(np.r_[0, bins])))
 
-        func = getattr(algos, 'group_ohlc_%s' % dtype)
+        func = getattr(groupby, 'group_ohlc_%s' % dtype)
         func(out, counts, obj[:, None], labels)
 
         def _ohlc(group):

pandas/tests/groupby/test_transform.py

Lines changed: 7 additions & 7 deletions
@@ -6,7 +6,7 @@
 from pandas import Series, DataFrame, Timestamp, MultiIndex, concat, date_range
 from pandas.types.common import _ensure_platform_int, is_timedelta64_dtype
 from pandas.compat import StringIO
-from pandas._libs import algos
+from pandas._libs import groupby
 from .common import MixIn, assert_fp_equal
 
 from pandas.util.testing import assert_frame_equal, assert_series_equal
@@ -418,8 +418,8 @@ def test_cython_group_transform_algos(self):
         dtypes = [np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint32,
                   np.uint64, np.float32, np.float64]
 
-        ops = [(algos.group_cumprod_float64, np.cumproduct, [np.float64]),
-               (algos.group_cumsum, np.cumsum, dtypes)]
+        ops = [(groupby.group_cumprod_float64, np.cumproduct, [np.float64]),
+               (groupby.group_cumsum, np.cumsum, dtypes)]
 
         is_datetimelike = False
         for pd_op, np_op, dtypes in ops:
@@ -437,22 +437,22 @@ def test_cython_group_transform_algos(self):
         data = np.array([[1], [2], [3], [np.nan], [4]], dtype='float64')
         actual = np.zeros_like(data)
         actual.fill(np.nan)
-        algos.group_cumprod_float64(actual, data, labels, is_datetimelike)
+        groupby.group_cumprod_float64(actual, data, labels, is_datetimelike)
         expected = np.array([1, 2, 6, np.nan, 24], dtype='float64')
         self.assert_numpy_array_equal(actual[:, 0], expected)
 
         actual = np.zeros_like(data)
         actual.fill(np.nan)
-        algos.group_cumsum(actual, data, labels, is_datetimelike)
+        groupby.group_cumsum(actual, data, labels, is_datetimelike)
         expected = np.array([1, 3, 6, np.nan, 10], dtype='float64')
         self.assert_numpy_array_equal(actual[:, 0], expected)
 
         # timedelta
         is_datetimelike = True
         data = np.array([np.timedelta64(1, 'ns')] * 5, dtype='m8[ns]')[:, None]
         actual = np.zeros_like(data, dtype='int64')
-        algos.group_cumsum(actual, data.view('int64'), labels,
-                           is_datetimelike)
+        groupby.group_cumsum(actual, data.view('int64'), labels,
+                             is_datetimelike)
         expected = np.array([np.timedelta64(1, 'ns'), np.timedelta64(
             2, 'ns'), np.timedelta64(3, 'ns'), np.timedelta64(4, 'ns'),
             np.timedelta64(5, 'ns')])

pandas/tests/test_algos.py

Lines changed: 4 additions & 3 deletions
@@ -10,7 +10,8 @@
 import pandas as pd
 
 from pandas import compat
-from pandas._libs import algos as libalgos, hashtable
+from pandas._libs import (groupby as libgroupby, algos as libalgos,
+                          hashtable)
 from pandas._libs.hashtable import unique_label_indices
 from pandas.compat import lrange
 import pandas.core.algorithms as algos
@@ -891,7 +892,7 @@ def test_group_var_constant(self):
 class TestGroupVarFloat64(tm.TestCase, GroupVarTestMixin):
     __test__ = True
 
-    algo = algos.algos.group_var_float64
+    algo = libgroupby.group_var_float64
     dtype = np.float64
     rtol = 1e-5
 
@@ -914,7 +915,7 @@ def test_group_var_large_inputs(self):
 class TestGroupVarFloat32(tm.TestCase, GroupVarTestMixin):
     __test__ = True
 
-    algo = algos.algos.group_var_float32
+    algo = libgroupby.group_var_float32
     dtype = np.float32
     rtol = 1e-2

setup.py

Lines changed: 6 additions & 2 deletions
@@ -110,8 +110,9 @@ def is_platform_mac():
 
 
 _pxi_dep_template = {
-    'algos': ['_libs/algos_common_helper.pxi.in', '_libs/algos_groupby_helper.pxi.in',
+    'algos': ['_libs/algos_common_helper.pxi.in',
              '_libs/algos_take_helper.pxi.in', '_libs/algos_rank_helper.pxi.in'],
+    'groupby': ['_libs/groupby_helper.pxi.in'],
     'join': ['_libs/join_helper.pxi.in', '_libs/join_func_helper.pxi.in'],
     'reshape': ['_libs/reshape_helper.pxi.in'],
     'hashtable': ['_libs/hashtable_class_helper.pxi.in',
@@ -496,8 +497,11 @@ def pxd(name):
                     'pxdfiles': ['_libs/src/util', '_libs/hashtable'],
                     'depends': _pxi_dep['index']},
     '_libs.algos': {'pyxfile': '_libs/algos',
-                    'pxdfiles': ['_libs/src/util', '_libs/hashtable'],
+                    'pxdfiles': ['_libs/src/util', '_libs/algos', '_libs/hashtable'],
                     'depends': _pxi_dep['algos']},
+    '_libs.groupby': {'pyxfile': '_libs/groupby',
+                      'pxdfiles': ['_libs/src/util', '_libs/algos'],
+                      'depends': _pxi_dep['groupby']},
     '_libs.join': {'pyxfile': '_libs/join',
                    'pxdfiles': ['_libs/src/util', '_libs/hashtable'],
                    'depends': _pxi_dep['join']},
