Skip to content

Commit 23844bb

Browse files
committed
BUG: Patch rank() uint64 behavior
Adds uint64 ranking functions to algos.pyx to allow for proper ranking with uint64. Also introduces partial patch for factorize() by adding uint64 hashtables and vectors for usage. However, this patch is only partial because the larger bug of non-support for uint64 in Index has not been fixed. This commit also removes uint8 from testing in test_merge.py because DataFrame.sort_values() incorrectly sorts when handling uint's.
1 parent 4c3d4d4 commit 23844bb

File tree

5 files changed

+262
-19
lines changed

5 files changed

+262
-19
lines changed

pandas/algos.pyx

+179
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,21 @@ cdef _take_2d_int64(ndarray[int64_t, ndim=2] values,
104104
result[i, j] = values[i, indexer[i, j]]
105105
return result
106106

107+
cdef _take_2d_uint64(ndarray[uint64_t, ndim=2] values,
108+
object idx):
109+
cdef:
110+
Py_ssize_t i, j, N, K
111+
ndarray[Py_ssize_t, ndim=2, cast=True] indexer = idx
112+
ndarray[uint64_t, ndim=2] result
113+
object val
114+
115+
N, K = (<object> values).shape
116+
result = np.empty_like(values)
117+
for i in range(N):
118+
for j in range(K):
119+
result[i, j] = values[i, indexer[i, j]]
120+
return result
121+
107122
cdef _take_2d_object(ndarray[object, ndim=2] values,
108123
object idx):
109124
cdef:
@@ -286,6 +301,83 @@ def rank_1d_int64(object in_arr, ties_method='average', ascending=True,
286301
return ranks
287302

288303

304+
def rank_1d_uint64(object in_arr, ties_method='average', ascending=True,
305+
na_option='keep', pct=False):
306+
"""
307+
Fast NaN-friendly version of scipy.stats.rankdata
308+
"""
309+
310+
cdef:
311+
Py_ssize_t i, j, n, dups = 0, total_tie_count = 0
312+
ndarray[uint64_t] sorted_data, values
313+
ndarray[float64_t] ranks
314+
ndarray[int64_t] argsorted
315+
int64_t val, nan_value
316+
float64_t sum_ranks = 0
317+
bint keep_na
318+
int tiebreak = 0
319+
float count = 0.0
320+
tiebreak = tiebreakers[ties_method]
321+
322+
keep_na = na_option == 'keep'
323+
324+
# uint64 has no NaN value, so we just
325+
# create the "values" array and proceed.
326+
values = np.asarray(in_arr)
327+
328+
n = len(values)
329+
ranks = np.empty(n, dtype='f8')
330+
331+
# py2.5/win32 hack, can't pass i8
332+
if tiebreak == TIEBREAK_FIRST:
333+
# need to use a stable sort here
334+
_as = values.argsort(kind='mergesort')
335+
if not ascending:
336+
tiebreak = TIEBREAK_FIRST_DESCENDING
337+
else:
338+
_as = values.argsort()
339+
340+
if not ascending:
341+
_as = _as[::-1]
342+
343+
sorted_data = values.take(_as)
344+
argsorted = _as.astype('i8')
345+
346+
for i in range(n):
347+
sum_ranks += i + 1
348+
dups += 1
349+
val = sorted_data[i]
350+
if (val == nan_value) and keep_na:
351+
ranks[argsorted[i]] = nan
352+
continue
353+
count += 1.0
354+
if i == n - 1 or fabs(sorted_data[i + 1] - val) > 0:
355+
if tiebreak == TIEBREAK_AVERAGE:
356+
for j in range(i - dups + 1, i + 1):
357+
ranks[argsorted[j]] = sum_ranks / dups
358+
elif tiebreak == TIEBREAK_MIN:
359+
for j in range(i - dups + 1, i + 1):
360+
ranks[argsorted[j]] = i - dups + 2
361+
elif tiebreak == TIEBREAK_MAX:
362+
for j in range(i - dups + 1, i + 1):
363+
ranks[argsorted[j]] = i + 1
364+
elif tiebreak == TIEBREAK_FIRST:
365+
for j in range(i - dups + 1, i + 1):
366+
ranks[argsorted[j]] = j + 1
367+
elif tiebreak == TIEBREAK_FIRST_DESCENDING:
368+
for j in range(i - dups + 1, i + 1):
369+
ranks[argsorted[j]] = 2 * i - j - dups + 2
370+
elif tiebreak == TIEBREAK_DENSE:
371+
total_tie_count += 1
372+
for j in range(i - dups + 1, i + 1):
373+
ranks[argsorted[j]] = total_tie_count
374+
sum_ranks = dups = 0
375+
if pct:
376+
return ranks / count
377+
else:
378+
return ranks
379+
380+
289381
def rank_2d_float64(object in_arr, axis=0, ties_method='average',
290382
ascending=True, na_option='keep', pct=False):
291383
"""
@@ -472,6 +564,93 @@ def rank_2d_int64(object in_arr, axis=0, ties_method='average',
472564
return ranks
473565

474566

567+
def rank_2d_uint64(object in_arr, axis=0, ties_method='average',
568+
ascending=True, na_option='keep', pct=False):
569+
"""
570+
Fast NaN-friendly version of scipy.stats.rankdata
571+
"""
572+
573+
cdef:
574+
Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0
575+
ndarray[float64_t, ndim=2] ranks
576+
ndarray[int64_t, ndim=2] argsorted
577+
ndarray[uint64_t, ndim=2, cast=True] values
578+
int64_t val, nan_value
579+
float64_t sum_ranks = 0
580+
bint keep_na = 0
581+
int tiebreak = 0
582+
float count = 0.0
583+
tiebreak = tiebreakers[ties_method]
584+
585+
keep_na = na_option == 'keep'
586+
587+
in_arr = np.asarray(in_arr)
588+
589+
# uint64 has no NaN value, so we just
590+
# create the "values" array and proceed.
591+
if axis == 0:
592+
values = in_arr.T.copy()
593+
else:
594+
values = in_arr.copy()
595+
596+
n, k = (<object> values).shape
597+
ranks = np.empty((n, k), dtype='f8')
598+
599+
if tiebreak == TIEBREAK_FIRST:
600+
# need to use a stable sort here
601+
_as = values.argsort(axis=1, kind='mergesort')
602+
if not ascending:
603+
tiebreak = TIEBREAK_FIRST_DESCENDING
604+
else:
605+
_as = values.argsort(1)
606+
607+
if not ascending:
608+
_as = _as[:, ::-1]
609+
610+
values = _take_2d_uint64(values, _as)
611+
argsorted = _as.astype('i8')
612+
613+
for i in range(n):
614+
dups = sum_ranks = 0
615+
total_tie_count = 0
616+
count = 0.0
617+
for j in range(k):
618+
sum_ranks += j + 1
619+
dups += 1
620+
val = values[i, j]
621+
if val == nan_value and keep_na:
622+
ranks[i, argsorted[i, j]] = nan
623+
continue
624+
count += 1.0
625+
if j == k - 1 or fabs(values[i, j + 1] - val) > FP_ERR:
626+
if tiebreak == TIEBREAK_AVERAGE:
627+
for z in range(j - dups + 1, j + 1):
628+
ranks[i, argsorted[i, z]] = sum_ranks / dups
629+
elif tiebreak == TIEBREAK_MIN:
630+
for z in range(j - dups + 1, j + 1):
631+
ranks[i, argsorted[i, z]] = j - dups + 2
632+
elif tiebreak == TIEBREAK_MAX:
633+
for z in range(j - dups + 1, j + 1):
634+
ranks[i, argsorted[i, z]] = j + 1
635+
elif tiebreak == TIEBREAK_FIRST:
636+
for z in range(j - dups + 1, j + 1):
637+
ranks[i, argsorted[i, z]] = z + 1
638+
elif tiebreak == TIEBREAK_FIRST_DESCENDING:
639+
for z in range(j - dups + 1, j + 1):
640+
ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2
641+
elif tiebreak == TIEBREAK_DENSE:
642+
total_tie_count += 1
643+
for z in range(j - dups + 1, j + 1):
644+
ranks[i, argsorted[i, z]] = total_tie_count
645+
sum_ranks = dups = 0
646+
if pct:
647+
ranks[i, :] /= count
648+
if axis == 0:
649+
return ranks.T
650+
else:
651+
return ranks
652+
653+
475654
def rank_1d_generic(object in_arr, bint retry=1, ties_method='average',
476655
ascending=True, na_option='keep', pct=False):
477656
"""

pandas/core/algorithms.py

+37-3
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@
99
from pandas import compat, lib, tslib, _np_version_under1p8
1010
from pandas.types.cast import _maybe_promote
1111
from pandas.types.generic import ABCSeries, ABCIndex
12-
from pandas.types.common import (is_integer_dtype,
12+
from pandas.types.common import (is_unsigned_integer_dtype,
13+
is_signed_integer_dtype,
14+
is_integer_dtype,
1315
is_int64_dtype,
1416
is_categorical_dtype,
1517
is_extension_type,
@@ -363,6 +365,9 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
363365
if isinstance(values, Index):
364366
uniques = values._shallow_copy(uniques, name=None)
365367
elif isinstance(values, Series):
368+
# TODO: This constructor is bugged for uint's, especially
369+
# np.uint64 due to overflow. Test this for uint behavior
370+
# once constructor has been fixed.
366371
uniques = Index(uniques)
367372
return labels, uniques
368373

@@ -574,7 +579,27 @@ def mode(values):
574579
def rank(values, axis=0, method='average', na_option='keep',
575580
ascending=True, pct=False):
576581
"""
582+
Rank the values along a given axis.
577583
584+
Parameters
585+
----------
586+
values : array-like
587+
Array whose values will be ranked. The number of dimensions in this
588+
array must not exceed 2.
589+
axis : int, default 0
590+
Axis over which to perform rankings.
591+
method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
592+
The method by which tiebreaks are broken during the ranking.
593+
na_option : {'keep', 'top'}, default 'keep'
594+
The method by which NaNs are placed in the ranking.
595+
- ``keep``: rank each NaN value with a NaN ranking
596+
- ``top``: replace each NaN with either +/- inf so that they
597+
there are ranked at the top
598+
ascending : boolean, default True
599+
Whether or not the elements should be ranked in ascending order.
600+
pct : boolean, default False
601+
Whether or not to the display the returned rankings in integer form
602+
(e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1).
578603
"""
579604
if values.ndim == 1:
580605
f, values = _get_data_algo(values, _rank1d_functions)
@@ -584,6 +609,8 @@ def rank(values, axis=0, method='average', na_option='keep',
584609
f, values = _get_data_algo(values, _rank2d_functions)
585610
ranks = f(values, axis=axis, ties_method=method,
586611
ascending=ascending, na_option=na_option, pct=pct)
612+
else:
613+
raise TypeError("Array with ndim > 2 are not supported.")
587614

588615
return ranks
589616

@@ -679,12 +706,14 @@ def _broadcast(arr_or_scalar, shape):
679706
_rank1d_functions = {
680707
'float64': algos.rank_1d_float64,
681708
'int64': algos.rank_1d_int64,
709+
'uint64': algos.rank_1d_uint64,
682710
'generic': algos.rank_1d_generic
683711
}
684712

685713
_rank2d_functions = {
686714
'float64': algos.rank_2d_float64,
687715
'int64': algos.rank_2d_int64,
716+
'uint64': algos.rank_2d_uint64,
688717
'generic': algos.rank_2d_generic
689718
}
690719

@@ -911,6 +940,7 @@ def _hashtable_algo(f, values, return_dtype=None):
911940

912941
_hashtables = {
913942
'float64': (htable.Float64HashTable, htable.Float64Vector),
943+
'uint64': (htable.UInt64HashTable, htable.UInt64Vector),
914944
'int64': (htable.Int64HashTable, htable.Int64Vector),
915945
'string': (htable.StringHashTable, htable.ObjectVector),
916946
'generic': (htable.PyObjectHashTable, htable.ObjectVector)
@@ -928,11 +958,15 @@ def _get_data_algo(values, func_map):
928958
f = func_map['int64']
929959
values = values.view('i8')
930960

931-
elif is_integer_dtype(values):
961+
elif is_signed_integer_dtype(values):
932962
f = func_map['int64']
933963
values = _ensure_int64(values)
934-
else:
935964

965+
elif is_unsigned_integer_dtype(values):
966+
f = func_map['uint64']
967+
values = _ensure_uint64(values)
968+
969+
else:
936970
values = _ensure_object(values)
937971

938972
# its cheaper to use a String Hash Table than Object

pandas/tests/test_algos.py

+31-15
Original file line numberDiff line numberDiff line change
@@ -962,21 +962,37 @@ def test_unique_label_indices():
962962
check_dtype=False)
963963

964964

965-
def test_rank():
966-
tm._skip_if_no_scipy()
967-
from scipy.stats import rankdata
968-
969-
def _check(arr):
970-
mask = ~np.isfinite(arr)
971-
arr = arr.copy()
972-
result = _algos.rank_1d_float64(arr)
973-
arr[mask] = np.inf
974-
exp = rankdata(arr)
975-
exp[mask] = nan
976-
assert_almost_equal(result, exp)
977-
978-
_check(np.array([nan, nan, 5., 5., 5., nan, 1, 2, 3, nan]))
979-
_check(np.array([4., nan, 5., 5., 5., nan, 1, 2, 4., nan]))
965+
class TestRank(tm.TestCase):
966+
967+
def test_scipy_compat(self):
968+
tm._skip_if_no_scipy()
969+
from scipy.stats import rankdata
970+
971+
def _check(arr):
972+
mask = ~np.isfinite(arr)
973+
arr = arr.copy()
974+
result = _algos.rank_1d_float64(arr)
975+
arr[mask] = np.inf
976+
exp = rankdata(arr)
977+
exp[mask] = nan
978+
assert_almost_equal(result, exp)
979+
980+
_check(np.array([nan, nan, 5., 5., 5., nan, 1, 2, 3, nan]))
981+
_check(np.array([4., nan, 5., 5., 5., nan, 1, 2, 4., nan]))
982+
983+
def test_basic(self):
984+
exp = np.array([1, 2], dtype=np.float64)
985+
986+
for dtype in np.typecodes['AllInteger']:
987+
s = Series([1, 100], dtype=dtype)
988+
tm.assert_numpy_array_equal(algos.rank(s), exp)
989+
990+
def test_uint64_overflow(self):
991+
exp = np.array([1, 2], dtype=np.float64)
992+
993+
for dtype in [np.float64, np.uint64]:
994+
s = Series([1, 2**63], dtype=dtype)
995+
tm.assert_numpy_array_equal(algos.rank(s), exp)
980996

981997

982998
def test_pad_backfill_object_segfault():

pandas/tools/tests/test_merge.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -1049,7 +1049,9 @@ def _test(dtype1, dtype2):
10491049
expected.sort_values(['k1', 'k2'], kind='mergesort', inplace=True)
10501050
tm.assert_frame_equal(result, expected)
10511051

1052-
for d1 in [np.int64, np.int32, np.int16, np.int8, np.uint8]:
1052+
# TODO: Reinstate np.uint8 (at least) for testing once
1053+
# DataFrame sorting has been fixed for uint's
1054+
for d1 in [np.int64, np.int32, np.int16, np.int8]:
10531055
for d2 in [np.int64, np.float64, np.float32, np.float16]:
10541056
_test(np.dtype(d1), np.dtype(d2))
10551057

pandas/types/common.py

+12
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,18 @@ def is_integer_dtype(arr_or_dtype):
155155
not issubclass(tipo, (np.datetime64, np.timedelta64)))
156156

157157

158+
def is_signed_integer_dtype(arr_or_dtype):
159+
tipo = _get_dtype_type(arr_or_dtype)
160+
return (issubclass(tipo, np.signedinteger) and
161+
not issubclass(tipo, (np.datetime64, np.timedelta64)))
162+
163+
164+
def is_unsigned_integer_dtype(arr_or_dtype):
165+
tipo = _get_dtype_type(arr_or_dtype)
166+
return (issubclass(tipo, np.unsignedinteger) and
167+
not issubclass(tipo, (np.datetime64, np.timedelta64)))
168+
169+
158170
def is_int64_dtype(arr_or_dtype):
159171
tipo = _get_dtype_type(arr_or_dtype)
160172
return issubclass(tipo, np.int64)

0 commit comments

Comments
 (0)