Skip to content

Commit 812b062

Browse files
committed
BUG: Patch rank() uint64 behavior
Adds uint64 ranking functions to algos.pyx so that uint64 values are ranked correctly. Also introduces a partial patch for factorize() by adding uint64 hashtables and vectors. The patch is only partial because the larger bug, the lack of uint64 support in Index, has not been fixed. Also fixes a bug in UInt64HashTable: an erroneous null condition, caught during testing, has been removed.
1 parent f79bc7a commit 812b062

8 files changed

+491
-630
lines changed

pandas/algos.pyx

+1-609
Large diffs are not rendered by default.

pandas/core/algorithms.py

+37-3
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@
99
from pandas import compat, lib, tslib, _np_version_under1p8
1010
from pandas.types.cast import _maybe_promote
1111
from pandas.types.generic import ABCSeries, ABCIndex
12-
from pandas.types.common import (is_integer_dtype,
12+
from pandas.types.common import (is_unsigned_integer_dtype,
13+
is_signed_integer_dtype,
14+
is_integer_dtype,
1315
is_int64_dtype,
1416
is_categorical_dtype,
1517
is_extension_type,
@@ -363,6 +365,9 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
363365
if isinstance(values, Index):
364366
uniques = values._shallow_copy(uniques, name=None)
365367
elif isinstance(values, Series):
368+
# TODO: This constructor is buggy for uints, especially
369+
# np.uint64 due to overflow. Test this for uint behavior
370+
# once constructor has been fixed.
366371
uniques = Index(uniques)
367372
return labels, uniques
368373

@@ -574,7 +579,27 @@ def mode(values):
574579
def rank(values, axis=0, method='average', na_option='keep',
575580
ascending=True, pct=False):
576581
"""
582+
Rank the values along a given axis.
577583
584+
Parameters
585+
----------
586+
values : array-like
587+
Array whose values will be ranked. The number of dimensions in this
588+
array must not exceed 2.
589+
axis : int, default 0
590+
Axis over which to perform rankings.
591+
method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
592+
The method by which ties are broken during the ranking.
593+
na_option : {'keep', 'top'}, default 'keep'
594+
The method by which NaNs are placed in the ranking.
595+
- ``keep``: rank each NaN value with a NaN ranking
596+
- ``top``: replace each NaN with either +/- inf so that they
597+
are ranked at the top
598+
ascending : boolean, default True
599+
Whether or not the elements should be ranked in ascending order.
600+
pct : boolean, default False
601+
Whether or not to display the returned rankings in integer form
602+
(e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1).
578603
"""
579604
if values.ndim == 1:
580605
f, values = _get_data_algo(values, _rank1d_functions)
@@ -584,6 +609,8 @@ def rank(values, axis=0, method='average', na_option='keep',
584609
f, values = _get_data_algo(values, _rank2d_functions)
585610
ranks = f(values, axis=axis, ties_method=method,
586611
ascending=ascending, na_option=na_option, pct=pct)
612+
else:
613+
raise TypeError("Array with ndim > 2 are not supported.")
587614

588615
return ranks
589616

@@ -679,12 +706,14 @@ def _broadcast(arr_or_scalar, shape):
679706
_rank1d_functions = {
680707
'float64': algos.rank_1d_float64,
681708
'int64': algos.rank_1d_int64,
709+
'uint64': algos.rank_1d_uint64,
682710
'generic': algos.rank_1d_generic
683711
}
684712

685713
_rank2d_functions = {
686714
'float64': algos.rank_2d_float64,
687715
'int64': algos.rank_2d_int64,
716+
'uint64': algos.rank_2d_uint64,
688717
'generic': algos.rank_2d_generic
689718
}
690719

@@ -911,6 +940,7 @@ def _hashtable_algo(f, values, return_dtype=None):
911940

912941
_hashtables = {
913942
'float64': (htable.Float64HashTable, htable.Float64Vector),
943+
'uint64': (htable.UInt64HashTable, htable.UInt64Vector),
914944
'int64': (htable.Int64HashTable, htable.Int64Vector),
915945
'string': (htable.StringHashTable, htable.ObjectVector),
916946
'generic': (htable.PyObjectHashTable, htable.ObjectVector)
@@ -928,11 +958,15 @@ def _get_data_algo(values, func_map):
928958
f = func_map['int64']
929959
values = values.view('i8')
930960

931-
elif is_integer_dtype(values):
961+
elif is_signed_integer_dtype(values):
932962
f = func_map['int64']
933963
values = _ensure_int64(values)
934-
else:
935964

965+
elif is_unsigned_integer_dtype(values):
966+
f = func_map['uint64']
967+
values = _ensure_uint64(values)
968+
969+
else:
936970
values = _ensure_object(values)
937971

938972
# it's cheaper to use a String Hash Table than Object

0 commit comments

Comments
 (0)