31
31
32
32
import pandas .core .common as com
33
33
import pandas .algos as algos
34
- import pandas .hashtable as htable
34
+ import pandas .hashtable as _htable
35
35
from pandas .compat import string_types
36
36
from pandas .tslib import iNaT
37
37
@@ -65,7 +65,7 @@ def match(to_match, values, na_sentinel=-1):
65
65
values = np .array (values , dtype = 'O' )
66
66
67
67
f = lambda htype , caster : _match_generic (to_match , values , htype , caster )
68
- result = _hashtable_algo (f , values . dtype , np .int64 )
68
+ result = _hashtable_algo (f , values , np .int64 )
69
69
70
70
if na_sentinel != - 1 :
71
71
@@ -102,7 +102,7 @@ def unique(values):
102
102
values = com ._asarray_tuplesafe (values )
103
103
104
104
f = lambda htype , caster : _unique_generic (values , htype , caster )
105
- return _hashtable_algo (f , values . dtype )
105
+ return _hashtable_algo (f , values )
106
106
107
107
108
108
def _unique_generic (values , table_type , type_caster ):
@@ -419,7 +419,7 @@ def _value_counts_arraylike(values, dropna=True):
419
419
freq = values .freq
420
420
421
421
values = values .view (np .int64 )
422
- keys , counts = htable .value_count_int64 (values , dropna )
422
+ keys , counts = _htable .value_count_int64 (values , dropna )
423
423
424
424
if dropna :
425
425
msk = keys != iNaT
@@ -436,14 +436,14 @@ def _value_counts_arraylike(values, dropna=True):
436
436
437
437
elif is_integer_dtype (dtype ):
438
438
values = _ensure_int64 (values )
439
- keys , counts = htable .value_count_int64 (values , dropna )
439
+ keys , counts = _htable .value_count_int64 (values , dropna )
440
440
elif is_float_dtype (dtype ):
441
441
values = _ensure_float64 (values )
442
- keys , counts = htable .value_count_float64 (values , dropna )
442
+ keys , counts = _htable .value_count_float64 (values , dropna )
443
443
else :
444
444
values = _ensure_object (values )
445
445
mask = isnull (values )
446
- keys , counts = htable .value_count_object (values , mask )
446
+ keys , counts = _htable .value_count_object (values , mask )
447
447
if not dropna and mask .any ():
448
448
keys = np .insert (keys , 0 , np .NaN )
449
449
counts = np .insert (counts , 0 , mask .sum ())
@@ -486,13 +486,13 @@ def duplicated(values, keep='first'):
486
486
487
487
if is_integer_dtype (dtype ):
488
488
values = _ensure_int64 (values )
489
- duplicated = htable .duplicated_int64 (values , keep = keep )
489
+ duplicated = _htable .duplicated_int64 (values , keep = keep )
490
490
elif is_float_dtype (dtype ):
491
491
values = _ensure_float64 (values )
492
- duplicated = htable .duplicated_float64 (values , keep = keep )
492
+ duplicated = _htable .duplicated_float64 (values , keep = keep )
493
493
else :
494
494
values = _ensure_object (values )
495
- duplicated = htable .duplicated_object (values , keep = keep )
495
+ duplicated = _htable .duplicated_object (values , keep = keep )
496
496
497
497
return duplicated
498
498
@@ -512,19 +512,19 @@ def mode(values):
512
512
dtype = values .dtype
513
513
if is_integer_dtype (values ):
514
514
values = _ensure_int64 (values )
515
- result = constructor (sorted (htable .mode_int64 (values )), dtype = dtype )
515
+ result = constructor (sorted (_htable .mode_int64 (values )), dtype = dtype )
516
516
517
517
elif issubclass (values .dtype .type , (np .datetime64 , np .timedelta64 )):
518
518
dtype = values .dtype
519
519
values = values .view (np .int64 )
520
- result = constructor (sorted (htable .mode_int64 (values )), dtype = dtype )
520
+ result = constructor (sorted (_htable .mode_int64 (values )), dtype = dtype )
521
521
522
522
elif is_categorical_dtype (values ):
523
523
result = constructor (values .mode ())
524
524
else :
525
525
mask = isnull (values )
526
526
values = _ensure_object (values )
527
- res = htable .mode_object (values , mask )
527
+ res = _htable .mode_object (values , mask )
528
528
try :
529
529
res = sorted (res )
530
530
except TypeError as e :
@@ -733,27 +733,35 @@ def _finalize_nsmallest(arr, kth_val, n, keep, narr):
733
733
# helpers #
734
734
# ------- #
735
735
736
def _hashtable_algo(f, values, return_dtype=None):
    """
    Call ``f`` with the hash table type and caster chosen for ``values``.

    f(HashTable, type_caster) -> result

    Parameters
    ----------
    f : callable
        Invoked exactly once as ``f(hash_table_type, type_caster)``.
    values : object exposing ``.dtype``
        Its dtype drives the dispatch; for non-numeric, non-datetimelike
        dtypes the values themselves are inspected via ``lib.infer_dtype``.
    return_dtype : optional
        Only consulted on the datetime64 / timedelta64 paths, where the
        result of ``f`` is re-viewed as this dtype. A falsy value falls
        back to ``'M8[ns]'`` (datetime64) or ``'m8[ns]'`` (timedelta64).

    Returns
    -------
    Whatever ``f`` returns; on the datetime64 / timedelta64 paths that
    result is passed through ``.view(...)`` first.
    """
    dtype = values.dtype

    # plain numeric dtypes: the result of f is returned untouched
    if is_float_dtype(dtype):
        return f(_htable.Float64HashTable, _ensure_float64)
    if is_integer_dtype(dtype):
        return f(_htable.Int64HashTable, _ensure_int64)

    # datetimelikes are hashed through the int64 table and then viewed
    # back, using the caller's return_dtype when one was supplied
    if is_datetime64_dtype(dtype):
        fallback_view = 'M8[ns]'
    elif is_timedelta64_dtype(dtype):
        fallback_view = 'm8[ns]'
    else:
        fallback_view = None

    if fallback_view is not None:
        hashed = f(_htable.Int64HashTable, _ensure_int64)
        return hashed.view(return_dtype or fallback_view)

    # it's cheaper to use a String Hash Table than Object
    if lib.infer_dtype(values) in ('string', 'unicode'):
        table_type = _htable.StringHashTable
    else:
        # use Object
        table_type = _htable.PyObjectHashTable
    return f(table_type, _ensure_object)
752
759
753
760
# Lookup from a dtype-family key to the pair of ``_htable`` hash table and
# vector types registered for it. The 'string' entry pairs the string hash
# table with the object vector.
_hashtables = dict(
    float64=(_htable.Float64HashTable, _htable.Float64Vector),
    int64=(_htable.Int64HashTable, _htable.Int64Vector),
    string=(_htable.StringHashTable, _htable.ObjectVector),
    generic=(_htable.PyObjectHashTable, _htable.ObjectVector),
)
758
766
759
767
@@ -770,8 +778,15 @@ def _get_data_algo(values, func_map):
770
778
f = func_map ['int64' ]
771
779
values = _ensure_int64 (values )
772
780
else :
773
- f = func_map [ 'generic' ]
781
+
774
782
values = _ensure_object (values )
783
+
784
+ # it's cheaper to use a String Hash Table than Object
785
+ if lib .infer_dtype (values ) in ['string' , 'unicode' ]:
786
+ f = func_map ['string' ]
787
+ else :
788
+ f = func_map ['generic' ]
789
+
775
790
return f , values
776
791
777
792
0 commit comments