
Commit fb7af6e

Committed Mar 22, 2017
CLN: move groupby algos separate cython lib
- separate out groupby algorithms to separate lib
- release GIL on median
- release GIL on is_lexsorted / fix memory leak
- release GIL on nancorr

Author: Jeff Reback <[email protected]>

Closes #15775 from jreback/groupby and squashes the following commits:

4e2bfec [Jeff Reback] release GIL on median; release GIL on is_lexsorted / fix memory leak; release GIL on nancorr
ce28bb5 [Jeff Reback] CLN: separate out groupby algorithms to separate lib
1 parent 2a3b05a commit fb7af6e
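In practical terms, the grouped-aggregation kernels now live in pandas._libs.groupby rather than pandas._libs.algos, as the per-file diffs below show. A minimal, hedged check of the new layout (kernel names taken from the diffs in this commit):

    from pandas._libs import groupby as libgroupby

    # after this commit the groupby kernels are attributes of the new module
    assert hasattr(libgroupby, "group_median_float64")
    assert hasattr(libgroupby, "group_shift_indexer")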

File tree

9 files changed: +474 −422 lines

 

pandas/_libs/algos.pxd

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+from util cimport numeric
+from numpy cimport float64_t, double_t
+
+cpdef numeric kth_smallest(numeric[:] a, Py_ssize_t k) nogil
+
+cdef inline Py_ssize_t swap(numeric *a, numeric *b) nogil:
+    cdef numeric t
+
+    # cython doesn't allow pointer dereference so use array syntax
+    t = a[0]
+    a[0] = b[0]
+    b[0] = t
+    return 0
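Because kth_smallest is declared cpdef, the same kernel that groupby.pyx cimports without the GIL also remains callable from Python. A minimal sketch with hypothetical values, assuming the float64 specialization of the fused type:

    import numpy as np
    from pandas._libs import algos as libalgos

    arr = np.array([9.0, 1.0, 5.0, 3.0], dtype=np.float64)
    # partially sorts the buffer in place and returns the k-th smallest
    # element, 0-indexed, so k=2 yields the third-smallest value
    assert libalgos.kth_smallest(arr, 2) == 5.0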

pandas/_libs/algos.pyx

Lines changed: 135 additions & 395 deletions
Large diffs are not rendered by default.

pandas/_libs/groupby.pyx

Lines changed: 291 additions & 0 deletions
@@ -0,0 +1,291 @@
+# cython: profile=False
+
+from numpy cimport *
+cimport numpy as np
+import numpy as np
+
+cimport cython
+
+import_array()
+
+cimport util
+
+from numpy cimport (int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t,
+                    uint32_t, uint64_t, float16_t, float32_t, float64_t)
+
+from libc.stdlib cimport malloc, free
+
+from util cimport numeric, get_nat
+from algos cimport swap
+from algos import take_2d_axis1_float64_float64, groupsort_indexer
+
+cdef int64_t iNaT = get_nat()
+
+cdef double NaN = <double> np.NaN
+cdef double nan = NaN
+
+
+# TODO: aggregate multiple columns in single pass
+#----------------------------------------------------------------------
+# first, nth, last
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_nth_object(ndarray[object, ndim=2] out,
+                     ndarray[int64_t] counts,
+                     ndarray[object, ndim=2] values,
+                     ndarray[int64_t] labels,
+                     int64_t rank):
+    """
+    Only aggregates on axis=0
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, lab
+        object val
+        float64_t count
+        ndarray[int64_t, ndim=2] nobs
+        ndarray[object, ndim=2] resx
+
+    nobs = np.zeros((<object> out).shape, dtype=np.int64)
+    resx = np.empty((<object> out).shape, dtype=object)
+
+    N, K = (<object> values).shape
+
+    for i in range(N):
+        lab = labels[i]
+        if lab < 0:
+            continue
+
+        counts[lab] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            if val == val:
+                nobs[lab, j] += 1
+                if nobs[lab, j] == rank:
+                    resx[lab, j] = val
+
+    for i in range(len(counts)):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = <object> nan
+            else:
+                out[i, j] = resx[i, j]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_nth_bin_object(ndarray[object, ndim=2] out,
+                         ndarray[int64_t] counts,
+                         ndarray[object, ndim=2] values,
+                         ndarray[int64_t] bins, int64_t rank):
+    """
+    Only aggregates on axis=0
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, ngroups, b
+        object val
+        float64_t count
+        ndarray[object, ndim=2] resx
+        ndarray[float64_t, ndim=2] nobs
+
+    nobs = np.zeros((<object> out).shape, dtype=np.float64)
+    resx = np.empty((<object> out).shape, dtype=object)
+
+    if len(bins) == 0:
+        return
+    if bins[len(bins) - 1] == len(values):
+        ngroups = len(bins)
+    else:
+        ngroups = len(bins) + 1
+
+    N, K = (<object> values).shape
+
+    b = 0
+    for i in range(N):
+        while b < ngroups - 1 and i >= bins[b]:
+            b += 1
+
+        counts[b] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            if val == val:
+                nobs[b, j] += 1
+                if nobs[b, j] == rank:
+                    resx[b, j] = val
+
+    for i in range(ngroups):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = resx[i, j]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_last_object(ndarray[object, ndim=2] out,
+                      ndarray[int64_t] counts,
+                      ndarray[object, ndim=2] values,
+                      ndarray[int64_t] labels):
+    """
+    Only aggregates on axis=0
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, lab
+        object val
+        float64_t count
+        ndarray[object, ndim=2] resx
+        ndarray[int64_t, ndim=2] nobs
+
+    nobs = np.zeros((<object> out).shape, dtype=np.int64)
+    resx = np.empty((<object> out).shape, dtype=object)
+
+    N, K = (<object> values).shape
+
+    for i in range(N):
+        lab = labels[i]
+        if lab < 0:
+            continue
+
+        counts[lab] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            if val == val:
+                nobs[lab, j] += 1
+                resx[lab, j] = val
+
+    for i in range(len(counts)):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = resx[i, j]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_last_bin_object(ndarray[object, ndim=2] out,
+                          ndarray[int64_t] counts,
+                          ndarray[object, ndim=2] values,
+                          ndarray[int64_t] bins):
+    """
+    Only aggregates on axis=0
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, ngroups, b
+        object val
+        float64_t count
+        ndarray[object, ndim=2] resx
+        ndarray[float64_t, ndim=2] nobs
+
+    nobs = np.zeros((<object> out).shape, dtype=np.float64)
+    resx = np.empty((<object> out).shape, dtype=object)
+
+    if len(bins) == 0:
+        return
+    if bins[len(bins) - 1] == len(values):
+        ngroups = len(bins)
+    else:
+        ngroups = len(bins) + 1
+
+    N, K = (<object> values).shape
+
+    b = 0
+    for i in range(N):
+        while b < ngroups - 1 and i >= bins[b]:
+            b += 1
+
+        counts[b] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            if val == val:
+                nobs[b, j] += 1
+                resx[b, j] = val
+
+    for i in range(ngroups):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = resx[i, j]
+
+
+cdef inline float64_t _median_linear(float64_t* a, int n) nogil:
+    cdef int i, j, na_count = 0
+    cdef float64_t result
+    cdef float64_t* tmp
+
+    if n == 0:
+        return NaN
+
+    # count NAs
+    for i in range(n):
+        if a[i] != a[i]:
+            na_count += 1
+
+    if na_count:
+        if na_count == n:
+            return NaN
+
+        tmp = <float64_t*> malloc((n - na_count) * sizeof(float64_t))
+
+        j = 0
+        for i in range(n):
+            if a[i] == a[i]:
+                tmp[j] = a[i]
+                j += 1
+
+        a = tmp
+        n -= na_count
+
+    if n % 2:
+        result = kth_smallest_c( a, n / 2, n)
+    else:
+        result = (kth_smallest_c(a, n / 2, n) +
+                  kth_smallest_c(a, n / 2 - 1, n)) / 2
+
+    if na_count:
+        free(a)
+
+    return result
+
+
+cdef inline float64_t kth_smallest_c(float64_t* a,
+                                     Py_ssize_t k,
+                                     Py_ssize_t n) nogil:
+    cdef:
+        Py_ssize_t i, j, l, m
+        double_t x, t
+
+    l = 0
+    m = n - 1
+    while (l < m):
+        x = a[k]
+        i = l
+        j = m
+
+        while 1:
+            while a[i] < x: i += 1
+            while x < a[j]: j -= 1
+            if i <= j:
+                swap(&a[i], &a[j])
+                i += 1; j -= 1
+
+            if i > j: break
+
+        if j < k: l = i
+        if k < i: m = j
+    return a[k]
+
+
+# generated from template
+include "groupby_helper.pxi"

pandas/_libs/algos_groupby_helper.pxi.in renamed to pandas/_libs/groupby_helper.pxi.in

Lines changed: 11 additions & 7 deletions
@@ -681,6 +681,8 @@ def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
 #----------------------------------------------------------------------
 
 
+@cython.boundscheck(False)
+@cython.wraparound(False)
 def group_median_float64(ndarray[float64_t, ndim=2] out,
                          ndarray[int64_t] counts,
                          ndarray[float64_t, ndim=2] values,
@@ -704,13 +706,15 @@ def group_median_float64(ndarray[float64_t, ndim=2] out,
 
     take_2d_axis1_float64_float64(values.T, indexer, out=data)
 
-    for i in range(K):
-        # exclude NA group
-        ptr += _counts[0]
-        for j in range(ngroups):
-            size = _counts[j + 1]
-            out[j, i] = _median_linear(ptr, size)
-            ptr += size
+    with nogil:
+
+        for i in range(K):
+            # exclude NA group
+            ptr += _counts[0]
+            for j in range(ngroups):
+                size = _counts[j + 1]
+                out[j, i] = _median_linear(ptr, size)
+                ptr += size
 
 
 @cython.boundscheck(False)
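The practical payoff of wrapping the loop in nogil is that the median kernel no longer holds the GIL while it works, so concurrent grouped medians can overlap when driven from threads. A rough, hedged illustration using only public pandas API (a sketch, not a benchmark):

    from concurrent.futures import ThreadPoolExecutor
    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"key": np.random.randint(0, 100, 1_000_000),
                       "val": np.random.randn(1_000_000)})

    def grouped_median(frame):
        return frame.groupby("key")["val"].median()

    # group_median_float64 releases the GIL in its inner loop after this
    # commit, so these calls can make progress in parallel
    with ThreadPoolExecutor(max_workers=4) as pool:
        results = list(pool.map(grouped_median, [df] * 4))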

pandas/core/groupby.py

Lines changed: 5 additions & 5 deletions
@@ -60,7 +60,7 @@
 import pandas.core.common as com
 from pandas.core.config import option_context
 
-from pandas._libs import lib, algos as libalgos, Timestamp, NaT, iNaT
+from pandas._libs import lib, groupby as libgroupby, Timestamp, NaT, iNaT
 from pandas._libs.lib import count_level_2d
 
 _doc_template = """
@@ -1474,7 +1474,7 @@ def shift(self, periods=1, freq=None, axis=0):
 
         # filled in by Cython
         indexer = np.zeros_like(labels)
-        libalgos.group_shift_indexer(indexer, labels, ngroups, periods)
+        libgroupby.group_shift_indexer(indexer, labels, ngroups, periods)
 
         output = {}
         for name, obj in self._iterate_slices():
@@ -1815,13 +1815,13 @@ def _get_cython_function(self, kind, how, values, is_numeric):
         def get_func(fname):
             # see if there is a fused-type version of function
             # only valid for numeric
-            f = getattr(libalgos, fname, None)
+            f = getattr(libgroupby, fname, None)
             if f is not None and is_numeric:
                 return f
 
             # otherwise find dtype-specific version, falling back to object
             for dt in [dtype_str, 'object']:
-                f = getattr(libalgos, "%s_%s" % (fname, dtype_str), None)
+                f = getattr(libgroupby, "%s_%s" % (fname, dtype_str), None)
                 if f is not None:
                     return f
 
@@ -3118,7 +3118,7 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
             out = _ensure_int64(out)
             return Series(out, index=mi, name=self.name)
 
-        # for compat. with libalgos.value_counts need to ensure every
+        # for compat. with libgroupby.value_counts need to ensure every
         # bin is present at every index level, null filled with zeros
         diff = np.zeros(len(out), dtype='bool')
         for lab in labels[:-1]:
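For reference, the lookup order in get_func above tries the bare (fused-type) kernel name first and then a dtype-suffixed variant. A hypothetical restating of that order outside pandas internals, using the relocated module (lookup_group_func is an illustrative helper, not pandas code):

    from pandas._libs import groupby as libgroupby

    def lookup_group_func(fname, dtype_str):
        # prefer a fused-type kernel; fall back to a dtype-specific one
        func = getattr(libgroupby, fname, None)
        if func is not None:
            return func
        return getattr(libgroupby, "%s_%s" % (fname, dtype_str), None)

    # there is no bare 'group_median', so this resolves to group_median_float64
    func = lookup_group_func("group_median", "float64")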

pandas/tests/groupby/test_bin_groupby.py

Lines changed: 2 additions & 3 deletions
@@ -7,8 +7,7 @@
 from pandas import Index, isnull
 from pandas.util.testing import assert_almost_equal
 import pandas.util.testing as tm
-import pandas._libs.lib as lib
-import pandas._libs.algos as algos
+from pandas._libs import lib, groupby
 
 
 def test_series_grouper():
@@ -92,7 +91,7 @@ def _check(dtype):
         labels = _ensure_int64(np.repeat(np.arange(3),
                                          np.diff(np.r_[0, bins])))
 
-        func = getattr(algos, 'group_ohlc_%s' % dtype)
+        func = getattr(groupby, 'group_ohlc_%s' % dtype)
         func(out, counts, obj[:, None], labels)
 
         def _ohlc(group):

pandas/tests/groupby/test_transform.py

Lines changed: 7 additions & 7 deletions
@@ -6,7 +6,7 @@
 from pandas import Series, DataFrame, Timestamp, MultiIndex, concat, date_range
 from pandas.types.common import _ensure_platform_int, is_timedelta64_dtype
 from pandas.compat import StringIO
-from pandas._libs import algos
+from pandas._libs import groupby
 from .common import MixIn, assert_fp_equal
 
 from pandas.util.testing import assert_frame_equal, assert_series_equal
@@ -418,8 +418,8 @@ def test_cython_group_transform_algos(self):
         dtypes = [np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint32,
                   np.uint64, np.float32, np.float64]
 
-        ops = [(algos.group_cumprod_float64, np.cumproduct, [np.float64]),
-               (algos.group_cumsum, np.cumsum, dtypes)]
+        ops = [(groupby.group_cumprod_float64, np.cumproduct, [np.float64]),
+               (groupby.group_cumsum, np.cumsum, dtypes)]
 
         is_datetimelike = False
         for pd_op, np_op, dtypes in ops:
@@ -437,22 +437,22 @@ def test_cython_group_transform_algos(self):
         data = np.array([[1], [2], [3], [np.nan], [4]], dtype='float64')
         actual = np.zeros_like(data)
         actual.fill(np.nan)
-        algos.group_cumprod_float64(actual, data, labels, is_datetimelike)
+        groupby.group_cumprod_float64(actual, data, labels, is_datetimelike)
         expected = np.array([1, 2, 6, np.nan, 24], dtype='float64')
         self.assert_numpy_array_equal(actual[:, 0], expected)
 
         actual = np.zeros_like(data)
         actual.fill(np.nan)
-        algos.group_cumsum(actual, data, labels, is_datetimelike)
+        groupby.group_cumsum(actual, data, labels, is_datetimelike)
         expected = np.array([1, 3, 6, np.nan, 10], dtype='float64')
         self.assert_numpy_array_equal(actual[:, 0], expected)
 
         # timedelta
         is_datetimelike = True
         data = np.array([np.timedelta64(1, 'ns')] * 5, dtype='m8[ns]')[:, None]
         actual = np.zeros_like(data, dtype='int64')
-        algos.group_cumsum(actual, data.view('int64'), labels,
-                           is_datetimelike)
+        groupby.group_cumsum(actual, data.view('int64'), labels,
+                             is_datetimelike)
         expected = np.array([np.timedelta64(1, 'ns'), np.timedelta64(
             2, 'ns'), np.timedelta64(3, 'ns'), np.timedelta64(4, 'ns'),
             np.timedelta64(5, 'ns')])

pandas/tests/test_algos.py

Lines changed: 4 additions & 3 deletions
@@ -10,7 +10,8 @@
 import pandas as pd
 
 from pandas import compat
-from pandas._libs import algos as libalgos, hashtable
+from pandas._libs import (groupby as libgroupby, algos as libalgos,
+                          hashtable)
 from pandas._libs.hashtable import unique_label_indices
 from pandas.compat import lrange
 import pandas.core.algorithms as algos
@@ -891,7 +892,7 @@ def test_group_var_constant(self):
 class TestGroupVarFloat64(tm.TestCase, GroupVarTestMixin):
     __test__ = True
 
-    algo = algos.algos.group_var_float64
+    algo = libgroupby.group_var_float64
     dtype = np.float64
     rtol = 1e-5
 
@@ -914,7 +915,7 @@ def test_group_var_large_inputs(self):
 class TestGroupVarFloat32(tm.TestCase, GroupVarTestMixin):
     __test__ = True
 
-    algo = algos.algos.group_var_float32
+    algo = libgroupby.group_var_float32
     dtype = np.float32
     rtol = 1e-2

setup.py

Lines changed: 6 additions & 2 deletions
@@ -110,8 +110,9 @@ def is_platform_mac():
 
 
 _pxi_dep_template = {
-    'algos': ['_libs/algos_common_helper.pxi.in', '_libs/algos_groupby_helper.pxi.in',
+    'algos': ['_libs/algos_common_helper.pxi.in',
              '_libs/algos_take_helper.pxi.in', '_libs/algos_rank_helper.pxi.in'],
+    'groupby': ['_libs/groupby_helper.pxi.in'],
     'join': ['_libs/join_helper.pxi.in', '_libs/join_func_helper.pxi.in'],
     'reshape': ['_libs/reshape_helper.pxi.in'],
     'hashtable': ['_libs/hashtable_class_helper.pxi.in',
@@ -496,8 +497,11 @@ def pxd(name):
                     'pxdfiles': ['_libs/src/util', '_libs/hashtable'],
                     'depends': _pxi_dep['index']},
     '_libs.algos': {'pyxfile': '_libs/algos',
-                    'pxdfiles': ['_libs/src/util', '_libs/hashtable'],
+                    'pxdfiles': ['_libs/src/util', '_libs/algos', '_libs/hashtable'],
                     'depends': _pxi_dep['algos']},
+    '_libs.groupby': {'pyxfile': '_libs/groupby',
+                      'pxdfiles': ['_libs/src/util', '_libs/algos'],
+                      'depends': _pxi_dep['groupby']},
     '_libs.join': {'pyxfile': '_libs/join',
                    'pxdfiles': ['_libs/src/util', '_libs/hashtable'],
                    'depends': _pxi_dep['join']},
