diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 0f91f612994c7..cab8bc8e799d4 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -771,7 +771,420 @@ def is_monotonic(ndarray[algos_t, ndim=1] arr, bint timelike): return is_monotonic_inc, is_monotonic_dec, is_strict_monotonic +# ---------------------------------------------------------------------- +# rank_1d, rank_2d +# ---------------------------------------------------------------------- + +ctypedef fused rank_t: + object + float64_t + uint64_t + int64_t + + +@cython.wraparound(False) +@cython.boundscheck(False) +def rank_1d(rank_t[:] in_arr, ties_method='average', + ascending=True, na_option='keep', pct=False): + """ + Fast NaN-friendly version of scipy.stats.rankdata + """ + + cdef: + Py_ssize_t i, j, n, dups = 0, total_tie_count = 0, non_na_idx = 0 + + ndarray[rank_t] sorted_data, values + + ndarray[float64_t] ranks + ndarray[int64_t] argsorted + ndarray[uint8_t, cast=True] sorted_mask + + rank_t val, nan_value + + float64_t sum_ranks = 0 + int tiebreak = 0 + bint keep_na = 0 + bint isnan, condition + float64_t count = 0.0 + + tiebreak = tiebreakers[ties_method] + + if rank_t is float64_t: + values = np.asarray(in_arr).copy() + elif rank_t is object: + values = np.array(in_arr, copy=True) + + if values.dtype != np.object_: + values = values.astype('O') + else: + values = np.asarray(in_arr) + + keep_na = na_option == 'keep' + + if rank_t is object: + mask = missing.isnaobj(values) + elif rank_t is float64_t: + mask = np.isnan(values) + elif rank_t is int64_t: + mask = values == NPY_NAT + + # create copy in case of NPY_NAT + # values are mutated inplace + if mask.any(): + values = values.copy() + + # double sort first by mask and then by values to ensure nan values are + # either at the beginning or the end. mask/(~mask) controls padding at + # tail or the head + if rank_t is not uint64_t: + if ascending ^ (na_option == 'top'): + if rank_t is object: + nan_value = Infinity() + elif rank_t is float64_t: + nan_value = np.inf + elif rank_t is int64_t: + nan_value = np.iinfo(np.int64).max + + order = (values, mask) + else: + if rank_t is object: + nan_value = NegInfinity() + elif rank_t is float64_t: + nan_value = -np.inf + elif rank_t is int64_t: + nan_value = np.iinfo(np.int64).min + + order = (values, ~mask) + np.putmask(values, mask, nan_value) + else: + mask = np.zeros(shape=len(values), dtype=bool) + order = (values, mask) + + n = len(values) + ranks = np.empty(n, dtype='f8') + + if rank_t is object: + _as = np.lexsort(keys=order) + else: + if tiebreak == TIEBREAK_FIRST: + # need to use a stable sort here + _as = np.lexsort(keys=order) + if not ascending: + tiebreak = TIEBREAK_FIRST_DESCENDING + else: + _as = np.lexsort(keys=order) + + if not ascending: + _as = _as[::-1] + + sorted_data = values.take(_as) + sorted_mask = mask.take(_as) + _indices = np.diff(sorted_mask.astype(int)).nonzero()[0] + non_na_idx = _indices[0] if len(_indices) > 0 else -1 + argsorted = _as.astype('i8') + + if rank_t is object: + # TODO: de-duplicate once cython supports conditional nogil + for i in range(n): + sum_ranks += i + 1 + dups += 1 + + val = sorted_data[i] + + if rank_t is not uint64_t: + isnan = sorted_mask[i] + if isnan and keep_na: + ranks[argsorted[i]] = NaN + continue + + count += 1.0 + + if rank_t is object: + condition = ( + i == n - 1 or + are_diff(sorted_data[i + 1], val) or + i == non_na_idx + ) + else: + condition = ( + i == n - 1 or + sorted_data[i + 1] != val or + i == non_na_idx + ) + + if condition: + + if tiebreak == TIEBREAK_AVERAGE: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = i - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = i + 1 + elif tiebreak == TIEBREAK_FIRST: + if rank_t is object: + raise ValueError('first not supported for ' + 'non-numeric data') + else: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = j + 1 + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = 2 * i - j - dups + 2 + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = total_tie_count + sum_ranks = dups = 0 + + else: + with nogil: + # TODO: why does the 2d version not have a nogil block? + for i in range(n): + sum_ranks += i + 1 + dups += 1 + + val = sorted_data[i] + + if rank_t is not uint64_t: + isnan = sorted_mask[i] + if isnan and keep_na: + ranks[argsorted[i]] = NaN + continue + + count += 1.0 + + if rank_t is object: + condition = ( + i == n - 1 or + are_diff(sorted_data[i + 1], val) or + i == non_na_idx + ) + else: + condition = ( + i == n - 1 or + sorted_data[i + 1] != val or + i == non_na_idx + ) + + if condition: + + if tiebreak == TIEBREAK_AVERAGE: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = i - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = i + 1 + elif tiebreak == TIEBREAK_FIRST: + if rank_t is object: + raise ValueError('first not supported for ' + 'non-numeric data') + else: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = j + 1 + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = 2 * i - j - dups + 2 + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = total_tie_count + sum_ranks = dups = 0 + + if pct: + if tiebreak == TIEBREAK_DENSE: + return ranks / total_tie_count + else: + return ranks / count + else: + return ranks + + +rank_1d_object = rank_1d["object"] +rank_1d_float64 = rank_1d["float64_t"] +rank_1d_uint64 = rank_1d["uint64_t"] +rank_1d_int64 = rank_1d["int64_t"] + + +def rank_2d(rank_t[:, :] in_arr, axis=0, ties_method='average', + ascending=True, na_option='keep', pct=False): + """ + Fast NaN-friendly version of scipy.stats.rankdata + """ + + cdef: + Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0 + + Py_ssize_t infs + + ndarray[float64_t, ndim=2] ranks + ndarray[rank_t, ndim=2] values + + ndarray[int64_t, ndim=2] argsorted + + rank_t val, nan_value + + float64_t sum_ranks = 0 + int tiebreak = 0 + bint keep_na = 0 + float64_t count = 0.0 + bint condition, skip_condition + + tiebreak = tiebreakers[ties_method] + + keep_na = na_option == 'keep' + + if axis == 0: + values = np.asarray(in_arr).T.copy() + else: + values = np.asarray(in_arr).copy() + + if rank_t is object: + if values.dtype != np.object_: + values = values.astype('O') + + if rank_t is not uint64_t: + if ascending ^ (na_option == 'top'): + if rank_t is object: + nan_value = Infinity() + elif rank_t is float64_t: + nan_value = np.inf + elif rank_t is int64_t: + nan_value = np.iinfo(np.int64).max + + else: + if rank_t is object: + nan_value = NegInfinity() + elif rank_t is float64_t: + nan_value = -np.inf + elif rank_t is int64_t: + nan_value = NPY_NAT + + if rank_t is object: + mask = missing.isnaobj2d(values) + elif rank_t is float64_t: + mask = np.isnan(values) + elif rank_t is int64_t: + mask = values == NPY_NAT + + np.putmask(values, mask, nan_value) + + n, k = (values).shape + ranks = np.empty((n, k), dtype='f8') + + if rank_t is object: + try: + _as = values.argsort(1) + except TypeError: + values = in_arr + for i in range(len(values)): + ranks[i] = rank_1d_object(in_arr[i], ties_method=ties_method, + ascending=ascending, pct=pct) + if axis == 0: + return ranks.T + else: + return ranks + else: + if tiebreak == TIEBREAK_FIRST: + # need to use a stable sort here + _as = values.argsort(axis=1, kind='mergesort') + if not ascending: + tiebreak = TIEBREAK_FIRST_DESCENDING + else: + _as = values.argsort(1) + + if not ascending: + _as = _as[:, ::-1] + + values = _take_2d(values, _as) + argsorted = _as.astype('i8') + + for i in range(n): + if rank_t is object: + dups = sum_ranks = infs = 0 + else: + dups = sum_ranks = 0 + + total_tie_count = 0 + count = 0.0 + for j in range(k): + if rank_t is not object: + sum_ranks += j + 1 + dups += 1 + + val = values[i, j] + + if rank_t is not uint64_t: + if rank_t is object: + skip_condition = (val is nan_value) and keep_na + else: + skip_condition = (val == nan_value) and keep_na + if skip_condition: + ranks[i, argsorted[i, j]] = NaN + + if rank_t is object: + infs += 1 + + continue + + count += 1.0 + + if rank_t is object: + sum_ranks += (j - infs) + 1 + dups += 1 + + if rank_t is object: + condition = j == k - 1 or are_diff(values[i, j + 1], val) + else: + condition = j == k - 1 or values[i, j + 1] != val + + if condition: + if tiebreak == TIEBREAK_AVERAGE: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = j - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = j + 1 + elif tiebreak == TIEBREAK_FIRST: + if rank_t is object: + raise ValueError('first not supported ' + 'for non-numeric data') + else: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = z + 1 + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2 + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = total_tie_count + sum_ranks = dups = 0 + if pct: + if tiebreak == TIEBREAK_DENSE: + ranks[i, :] /= total_tie_count + else: + ranks[i, :] /= count + if axis == 0: + return ranks.T + else: + return ranks + + +rank_2d_object = rank_2d["object"] +rank_2d_float64 = rank_2d["float64_t"] +rank_2d_uint64 = rank_2d["uint64_t"] +rank_2d_int64 = rank_2d["int64_t"] + + # generated from template include "algos_common_helper.pxi" -include "algos_rank_helper.pxi" include "algos_take_helper.pxi" diff --git a/pandas/_libs/algos_rank_helper.pxi.in b/pandas/_libs/algos_rank_helper.pxi.in deleted file mode 100644 index d5a31b6a13010..0000000000000 --- a/pandas/_libs/algos_rank_helper.pxi.in +++ /dev/null @@ -1,410 +0,0 @@ -""" -Template for each `dtype` helper function for rank - -WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in -""" - -# ---------------------------------------------------------------------- -# rank_1d, rank_2d -# ---------------------------------------------------------------------- - -ctypedef fused rank_t: - object - float64_t - uint64_t - int64_t - - -@cython.wraparound(False) -@cython.boundscheck(False) -def rank_1d(rank_t[:] in_arr, ties_method='average', - ascending=True, na_option='keep', pct=False): - """ - Fast NaN-friendly version of scipy.stats.rankdata - """ - - cdef: - Py_ssize_t i, j, n, dups = 0, total_tie_count = 0, non_na_idx = 0 - - ndarray[rank_t] sorted_data, values - - ndarray[float64_t] ranks - ndarray[int64_t] argsorted - ndarray[uint8_t, cast=True] sorted_mask - - rank_t val, nan_value - - float64_t sum_ranks = 0 - int tiebreak = 0 - bint keep_na = 0 - bint isnan, condition - float64_t count = 0.0 - - tiebreak = tiebreakers[ties_method] - - if rank_t is float64_t: - values = np.asarray(in_arr).copy() - elif rank_t is object: - values = np.array(in_arr, copy=True) - - if values.dtype != np.object_: - values = values.astype('O') - else: - values = np.asarray(in_arr) - - keep_na = na_option == 'keep' - - if rank_t is object: - mask = missing.isnaobj(values) - elif rank_t is float64_t: - mask = np.isnan(values) - elif rank_t is int64_t: - mask = values == NPY_NAT - - # create copy in case of NPY_NAT - # values are mutated inplace - if mask.any(): - values = values.copy() - - # double sort first by mask and then by values to ensure nan values are - # either at the beginning or the end. mask/(~mask) controls padding at - # tail or the head - if rank_t is not uint64_t: - if ascending ^ (na_option == 'top'): - if rank_t is object: - nan_value = Infinity() - elif rank_t is float64_t: - nan_value = np.inf - elif rank_t is int64_t: - nan_value = np.iinfo(np.int64).max - - order = (values, mask) - else: - if rank_t is object: - nan_value = NegInfinity() - elif rank_t is float64_t: - nan_value = -np.inf - elif rank_t is int64_t: - nan_value = np.iinfo(np.int64).min - - order = (values, ~mask) - np.putmask(values, mask, nan_value) - else: - mask = np.zeros(shape=len(values), dtype=bool) - order = (values, mask) - - n = len(values) - ranks = np.empty(n, dtype='f8') - - if rank_t is object: - _as = np.lexsort(keys=order) - else: - if tiebreak == TIEBREAK_FIRST: - # need to use a stable sort here - _as = np.lexsort(keys=order) - if not ascending: - tiebreak = TIEBREAK_FIRST_DESCENDING - else: - _as = np.lexsort(keys=order) - - if not ascending: - _as = _as[::-1] - - sorted_data = values.take(_as) - sorted_mask = mask.take(_as) - _indices = np.diff(sorted_mask.astype(int)).nonzero()[0] - non_na_idx = _indices[0] if len(_indices) > 0 else -1 - argsorted = _as.astype('i8') - - if rank_t is object: - # TODO: de-duplicate once cython supports conditional nogil - for i in range(n): - sum_ranks += i + 1 - dups += 1 - - val = sorted_data[i] - - if rank_t is not uint64_t: - isnan = sorted_mask[i] - if isnan and keep_na: - ranks[argsorted[i]] = NaN - continue - - count += 1.0 - - if rank_t is object: - condition = (i == n - 1 or - are_diff(sorted_data[i + 1], val) or - i == non_na_idx) - else: - condition = (i == n - 1 or - sorted_data[i + 1] != val or - i == non_na_idx) - - if condition: - - if tiebreak == TIEBREAK_AVERAGE: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = i - dups + 2 - elif tiebreak == TIEBREAK_MAX: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = i + 1 - elif tiebreak == TIEBREAK_FIRST: - if rank_t is object: - raise ValueError('first not supported for ' - 'non-numeric data') - else: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = j + 1 - elif tiebreak == TIEBREAK_FIRST_DESCENDING: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = 2 * i - j - dups + 2 - elif tiebreak == TIEBREAK_DENSE: - total_tie_count += 1 - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = total_tie_count - sum_ranks = dups = 0 - - else: - with nogil: - # TODO: why does the 2d version not have a nogil block? - for i in range(n): - sum_ranks += i + 1 - dups += 1 - - val = sorted_data[i] - - if rank_t is not uint64_t: - isnan = sorted_mask[i] - if isnan and keep_na: - ranks[argsorted[i]] = NaN - continue - - count += 1.0 - - if rank_t is object: - condition = (i == n - 1 or - are_diff(sorted_data[i + 1], val) or - i == non_na_idx) - else: - condition = (i == n - 1 or - sorted_data[i + 1] != val or - i == non_na_idx) - - if condition: - - if tiebreak == TIEBREAK_AVERAGE: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = i - dups + 2 - elif tiebreak == TIEBREAK_MAX: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = i + 1 - elif tiebreak == TIEBREAK_FIRST: - if rank_t is object: - raise ValueError('first not supported for ' - 'non-numeric data') - else: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = j + 1 - elif tiebreak == TIEBREAK_FIRST_DESCENDING: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = 2 * i - j - dups + 2 - elif tiebreak == TIEBREAK_DENSE: - total_tie_count += 1 - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = total_tie_count - sum_ranks = dups = 0 - - if pct: - if tiebreak == TIEBREAK_DENSE: - return ranks / total_tie_count - else: - return ranks / count - else: - return ranks - - -rank_1d_object = rank_1d["object"] -rank_1d_float64 = rank_1d["float64_t"] -rank_1d_uint64 = rank_1d["uint64_t"] -rank_1d_int64 = rank_1d["int64_t"] - - -def rank_2d(rank_t[:, :] in_arr, axis=0, ties_method='average', - ascending=True, na_option='keep', pct=False): - """ - Fast NaN-friendly version of scipy.stats.rankdata - """ - - cdef: - Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0 - - Py_ssize_t infs - - ndarray[float64_t, ndim=2] ranks - ndarray[rank_t, ndim=2] values - - ndarray[int64_t, ndim=2] argsorted - - rank_t val, nan_value - - float64_t sum_ranks = 0 - int tiebreak = 0 - bint keep_na = 0 - float64_t count = 0.0 - bint condition, skip_condition - - tiebreak = tiebreakers[ties_method] - - keep_na = na_option == 'keep' - - if axis == 0: - values = np.asarray(in_arr).T.copy() - else: - values = np.asarray(in_arr).copy() - - if rank_t is object: - if values.dtype != np.object_: - values = values.astype('O') - - if rank_t is not uint64_t: - if ascending ^ (na_option == 'top'): - if rank_t is object: - nan_value = Infinity() - elif rank_t is float64_t: - nan_value = np.inf - elif rank_t is int64_t: - nan_value = np.iinfo(np.int64).max - - else: - if rank_t is object: - nan_value = NegInfinity() - elif rank_t is float64_t: - nan_value = -np.inf - elif rank_t is int64_t: - nan_value = NPY_NAT - - if rank_t is object: - mask = missing.isnaobj2d(values) - elif rank_t is float64_t: - mask = np.isnan(values) - elif rank_t is int64_t: - mask = values == NPY_NAT - - np.putmask(values, mask, nan_value) - - n, k = (values).shape - ranks = np.empty((n, k), dtype='f8') - - if rank_t is object: - try: - _as = values.argsort(1) - except TypeError: - values = in_arr - for i in range(len(values)): - ranks[i] = rank_1d_object(in_arr[i], ties_method=ties_method, - ascending=ascending, pct=pct) - if axis == 0: - return ranks.T - else: - return ranks - else: - if tiebreak == TIEBREAK_FIRST: - # need to use a stable sort here - _as = values.argsort(axis=1, kind='mergesort') - if not ascending: - tiebreak = TIEBREAK_FIRST_DESCENDING - else: - _as = values.argsort(1) - - if not ascending: - _as = _as[:, ::-1] - - values = _take_2d(values, _as) - argsorted = _as.astype('i8') - - for i in range(n): - if rank_t is object: - dups = sum_ranks = infs = 0 - else: - dups = sum_ranks = 0 - - total_tie_count = 0 - count = 0.0 - for j in range(k): - if rank_t is not object: - sum_ranks += j + 1 - dups += 1 - - val = values[i, j] - - if rank_t is not uint64_t: - if rank_t is object: - skip_condition = (val is nan_value) and keep_na - else: - skip_condition = (val == nan_value) and keep_na - if skip_condition: - ranks[i, argsorted[i, j]] = NaN - - if rank_t is object: - infs += 1 - - continue - - count += 1.0 - - if rank_t is object: - sum_ranks += (j - infs) + 1 - dups += 1 - - if rank_t is object: - condition = j == k - 1 or are_diff(values[i, j + 1], val) - else: - condition = j == k - 1 or values[i, j + 1] != val - - if condition: - if tiebreak == TIEBREAK_AVERAGE: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = j - dups + 2 - elif tiebreak == TIEBREAK_MAX: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = j + 1 - elif tiebreak == TIEBREAK_FIRST: - if rank_t is object: - raise ValueError('first not supported ' - 'for non-numeric data') - else: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = z + 1 - elif tiebreak == TIEBREAK_FIRST_DESCENDING: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2 - elif tiebreak == TIEBREAK_DENSE: - total_tie_count += 1 - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = total_tie_count - sum_ranks = dups = 0 - if pct: - if tiebreak == TIEBREAK_DENSE: - ranks[i, :] /= total_tie_count - else: - ranks[i, :] /= count - if axis == 0: - return ranks.T - else: - return ranks - - -rank_2d_object = rank_2d["object"] -rank_2d_float64 = rank_2d["float64_t"] -rank_2d_uint64 = rank_2d["uint64_t"] -rank_2d_int64 = rank_2d["int64_t"] diff --git a/setup.py b/setup.py index 2892cd0b2e294..0dd1980088db8 100755 --- a/setup.py +++ b/setup.py @@ -86,7 +86,6 @@ def is_platform_mac(): "algos": [ "_libs/algos_common_helper.pxi.in", "_libs/algos_take_helper.pxi.in", - "_libs/algos_rank_helper.pxi.in", ], "hashtable": [ "_libs/hashtable_class_helper.pxi.in",