Skip to content

Commit a295e83

Browse files
committed
PERF: use StringHashTable for strings
1 parent 725453d commit a295e83

File tree

5 files changed

+306
-83
lines changed

5 files changed

+306
-83
lines changed

pandas/core/algorithms.py

+39-24
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131

3232
import pandas.core.common as com
3333
import pandas.algos as algos
34-
import pandas.hashtable as htable
34+
import pandas.hashtable as _htable
3535
from pandas.compat import string_types
3636
from pandas.tslib import iNaT
3737

@@ -65,7 +65,7 @@ def match(to_match, values, na_sentinel=-1):
6565
values = np.array(values, dtype='O')
6666

6767
f = lambda htype, caster: _match_generic(to_match, values, htype, caster)
68-
result = _hashtable_algo(f, values.dtype, np.int64)
68+
result = _hashtable_algo(f, values, np.int64)
6969

7070
if na_sentinel != -1:
7171

@@ -102,7 +102,7 @@ def unique(values):
102102
values = com._asarray_tuplesafe(values)
103103

104104
f = lambda htype, caster: _unique_generic(values, htype, caster)
105-
return _hashtable_algo(f, values.dtype)
105+
return _hashtable_algo(f, values)
106106

107107

108108
def _unique_generic(values, table_type, type_caster):
@@ -419,7 +419,7 @@ def _value_counts_arraylike(values, dropna=True):
419419
freq = values.freq
420420

421421
values = values.view(np.int64)
422-
keys, counts = htable.value_count_int64(values, dropna)
422+
keys, counts = _htable.value_count_int64(values, dropna)
423423

424424
if dropna:
425425
msk = keys != iNaT
@@ -436,14 +436,14 @@ def _value_counts_arraylike(values, dropna=True):
436436

437437
elif is_integer_dtype(dtype):
438438
values = _ensure_int64(values)
439-
keys, counts = htable.value_count_int64(values, dropna)
439+
keys, counts = _htable.value_count_int64(values, dropna)
440440
elif is_float_dtype(dtype):
441441
values = _ensure_float64(values)
442-
keys, counts = htable.value_count_float64(values, dropna)
442+
keys, counts = _htable.value_count_float64(values, dropna)
443443
else:
444444
values = _ensure_object(values)
445445
mask = isnull(values)
446-
keys, counts = htable.value_count_object(values, mask)
446+
keys, counts = _htable.value_count_object(values, mask)
447447
if not dropna and mask.any():
448448
keys = np.insert(keys, 0, np.NaN)
449449
counts = np.insert(counts, 0, mask.sum())
@@ -486,13 +486,13 @@ def duplicated(values, keep='first'):
486486

487487
if is_integer_dtype(dtype):
488488
values = _ensure_int64(values)
489-
duplicated = htable.duplicated_int64(values, keep=keep)
489+
duplicated = _htable.duplicated_int64(values, keep=keep)
490490
elif is_float_dtype(dtype):
491491
values = _ensure_float64(values)
492-
duplicated = htable.duplicated_float64(values, keep=keep)
492+
duplicated = _htable.duplicated_float64(values, keep=keep)
493493
else:
494494
values = _ensure_object(values)
495-
duplicated = htable.duplicated_object(values, keep=keep)
495+
duplicated = _htable.duplicated_object(values, keep=keep)
496496

497497
return duplicated
498498

@@ -512,19 +512,19 @@ def mode(values):
512512
dtype = values.dtype
513513
if is_integer_dtype(values):
514514
values = _ensure_int64(values)
515-
result = constructor(sorted(htable.mode_int64(values)), dtype=dtype)
515+
result = constructor(sorted(_htable.mode_int64(values)), dtype=dtype)
516516

517517
elif issubclass(values.dtype.type, (np.datetime64, np.timedelta64)):
518518
dtype = values.dtype
519519
values = values.view(np.int64)
520-
result = constructor(sorted(htable.mode_int64(values)), dtype=dtype)
520+
result = constructor(sorted(_htable.mode_int64(values)), dtype=dtype)
521521

522522
elif is_categorical_dtype(values):
523523
result = constructor(values.mode())
524524
else:
525525
mask = isnull(values)
526526
values = _ensure_object(values)
527-
res = htable.mode_object(values, mask)
527+
res = _htable.mode_object(values, mask)
528528
try:
529529
res = sorted(res)
530530
except TypeError as e:
@@ -733,27 +733,35 @@ def _finalize_nsmallest(arr, kth_val, n, keep, narr):
733733
# helpers #
734734
# ------- #
735735

736-
def _hashtable_algo(f, dtype, return_dtype=None):
736+
def _hashtable_algo(f, values, return_dtype=None):
737737
"""
738738
f(HashTable, type_caster) -> result
739739
"""
740+
741+
dtype = values.dtype
740742
if is_float_dtype(dtype):
741-
return f(htable.Float64HashTable, _ensure_float64)
743+
return f(_htable.Float64HashTable, _ensure_float64)
742744
elif is_integer_dtype(dtype):
743-
return f(htable.Int64HashTable, _ensure_int64)
745+
return f(_htable.Int64HashTable, _ensure_int64)
744746
elif is_datetime64_dtype(dtype):
745747
return_dtype = return_dtype or 'M8[ns]'
746-
return f(htable.Int64HashTable, _ensure_int64).view(return_dtype)
748+
return f(_htable.Int64HashTable, _ensure_int64).view(return_dtype)
747749
elif is_timedelta64_dtype(dtype):
748750
return_dtype = return_dtype or 'm8[ns]'
749-
return f(htable.Int64HashTable, _ensure_int64).view(return_dtype)
750-
else:
751-
return f(htable.PyObjectHashTable, _ensure_object)
751+
return f(_htable.Int64HashTable, _ensure_int64).view(return_dtype)
752+
753+
# it's cheaper to use a String Hash Table than Object
754+
if lib.infer_dtype(values) in ['string', 'unicode']:
755+
return f(_htable.StringHashTable, _ensure_object)
756+
757+
# use Object
758+
return f(_htable.PyObjectHashTable, _ensure_object)
752759

753760
_hashtables = {
754-
'float64': (htable.Float64HashTable, htable.Float64Vector),
755-
'int64': (htable.Int64HashTable, htable.Int64Vector),
756-
'generic': (htable.PyObjectHashTable, htable.ObjectVector)
761+
'float64': (_htable.Float64HashTable, _htable.Float64Vector),
762+
'int64': (_htable.Int64HashTable, _htable.Int64Vector),
763+
'string': (_htable.StringHashTable, _htable.ObjectVector),
764+
'generic': (_htable.PyObjectHashTable, _htable.ObjectVector)
757765
}
758766

759767

@@ -770,8 +778,15 @@ def _get_data_algo(values, func_map):
770778
f = func_map['int64']
771779
values = _ensure_int64(values)
772780
else:
773-
f = func_map['generic']
781+
774782
values = _ensure_object(values)
783+
784+
# it's cheaper to use a String Hash Table than Object
785+
if lib.infer_dtype(values) in ['string', 'unicode']:
786+
f = func_map['string']
787+
else:
788+
f = func_map['generic']
789+
775790
return f, values
776791

777792

pandas/hashtable.pxd

+7-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from khash cimport kh_int64_t, kh_float64_t, kh_pymap_t, int64_t, float64_t
1+
from khash cimport kh_int64_t, kh_float64_t, kh_pymap_t, kh_str_t, int64_t, float64_t
22

33
# prototypes for sharing
44

@@ -22,3 +22,9 @@ cdef class PyObjectHashTable(HashTable):
2222

2323
cpdef get_item(self, object val)
2424
cpdef set_item(self, object key, Py_ssize_t val)
25+
26+
cdef class StringHashTable(HashTable):
27+
cdef kh_str_t *table
28+
29+
cpdef get_item(self, object val)
30+
cpdef set_item(self, object key, Py_ssize_t val)

pandas/hashtable.pyx

+6-2
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,11 @@ from cpython cimport PyObject, Py_INCREF, PyList_Check, PyTuple_Check
44

55
from khash cimport *
66
from numpy cimport *
7-
from cpython cimport PyMem_Malloc, PyMem_Realloc, PyMem_Free
7+
8+
from libc.stdlib cimport malloc, free
9+
from cpython cimport (PyMem_Malloc, PyMem_Realloc, PyMem_Free,
10+
PyString_Check, PyBytes_Check,
11+
PyUnicode_Check)
812

913
from util cimport _checknan
1014
cimport util
@@ -33,7 +37,7 @@ PyDateTime_IMPORT
3337
cdef extern from "Python.h":
3438
int PySlice_Check(object)
3539

36-
cdef size_t _INIT_VEC_CAP = 32
40+
cdef size_t _INIT_VEC_CAP = 128
3741

3842

3943
include "hashtable_class_helper.pxi"

0 commit comments

Comments
 (0)