Skip to content

Commit 0102c3d

Browse files
committed
PERF: use StringHashTable for strings in factorizing
allows releasing the GIL on these dtypes xref #13745
1 parent e833096 commit 0102c3d

File tree

7 files changed

+334
-67
lines changed

7 files changed

+334
-67
lines changed

asv_bench/benchmarks/algorithms.py

+4
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,15 @@ def setup(self):
2323
self.arrpos = np.arange(1000000)
2424
self.arrneg = np.arange(-1000000, 0)
2525
self.arrmixed = np.array([1, -1]).repeat(500000)
26+
self.strings = tm.makeStringIndex(100000)
2627

2728
# match
2829
self.uniques = tm.makeStringIndex(1000).values
2930
self.all = self.uniques.repeat(10)
3031

32+
def time_factorize_string(self):
    """Time ``factorize()`` on the string index built in ``setup``.

    This benchmark was originally added under the name
    ``time_factorize_int``, the same name as the integer benchmark that
    follows it in the class body. The later definition rebinds the name,
    so the string benchmark was never run. A distinct name lets both
    benchmarks exist side by side.
    """
    self.strings.factorize()
34+
3135
def time_factorize_int(self):
3236
self.int.factorize()
3337

asv_bench/benchmarks/gil.py

+41
Original file line numberDiff line numberDiff line change
@@ -379,3 +379,44 @@ def pg_read_csv_datetime(self):
379379

380380
def time_read_csv_datetime(self):
381381
self.pg_read_csv_datetime()
382+
383+
384+
class nogil_factorize(object):
    """Benchmarks timing ``pd.factorize`` on a string index, serially and
    through the ``test_parallel`` decorator with 2, 4 and 8 threads.

    The loop counts shrink as the thread count grows (8x serial, 4x with
    2 threads, 2x with 4 threads, 1x with 8 threads) -- presumably so every
    timing covers the same total number of factorize calls, assuming
    ``test_parallel`` runs the wrapped function once per thread (TODO confirm).
    """

    # benchmark-runner settings
    number = 1
    repeat = 5

    def setup(self):
        # Skip the whole benchmark when a real test_parallel is unavailable.
        if not have_real_test_parallel:
            raise NotImplementedError

        self.strings = tm.makeStringIndex(100000)

    def factorize_strings(self):
        """Unit of work shared by every timing method below."""
        pd.factorize(self.strings)

    # -- 8 threads ---------------------------------------------------------

    @test_parallel(num_threads=8)
    def _pg_factorize_strings_8(self):
        self.factorize_strings()

    def time_factorize_strings_8(self):
        self._pg_factorize_strings_8()

    # -- 4 threads ---------------------------------------------------------

    @test_parallel(num_threads=4)
    def _pg_factorize_strings_4(self):
        self.factorize_strings()

    def time_factorize_strings_4(self):
        for _ in range(2):
            self._pg_factorize_strings_4()

    # -- 2 threads ---------------------------------------------------------

    @test_parallel(num_threads=2)
    def _pg_factorize_strings_2(self):
        self.factorize_strings()

    def time_factorize_strings_2(self):
        for _ in range(4):
            self._pg_factorize_strings_2()

    # -- serial baseline ---------------------------------------------------

    def time_factorize_strings(self):
        for _ in range(8):
            self.factorize_strings()

doc/source/whatsnew/v0.20.0.txt

+2-2
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ Other enhancements
5454
- New ``UnsortedIndexError`` (subclass of ``KeyError``) raised when indexing/slicing into an
5555
unsorted MultiIndex (:issue:`11897`). This allows differentiation between errors due to lack
5656
of sorting or an incorrect key. See :ref:`here <advanced.unsorted>`
57-
57+
5858
- ``pd.cut`` and ``pd.qcut`` now support datetime64 and timedelta64 dtypes (:issue:`14714`)
5959
- ``Series`` provides a ``to_excel`` method to output Excel files (:issue:`8825`)
6060
- The ``usecols`` argument in ``pd.read_csv`` now accepts a callable function as a value (:issue:`14154`)
@@ -110,7 +110,7 @@ Removal of prior version deprecations/changes
110110
Performance Improvements
111111
~~~~~~~~~~~~~~~~~~~~~~~~
112112

113-
113+
- Increased performance of ``pd.factorize()`` by releasing the GIL with ``object`` dtype when inferred as strings (:issue:`13745`)
114114

115115

116116

pandas/core/algorithms.py

+27-6
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ def match(to_match, values, na_sentinel=-1):
6565
values = np.array(values, dtype='O')
6666

6767
f = lambda htype, caster: _match_generic(to_match, values, htype, caster)
68-
result = _hashtable_algo(f, values.dtype, np.int64)
68+
result = _hashtable_algo(f, values, np.int64)
6969

7070
if na_sentinel != -1:
7171

@@ -102,7 +102,7 @@ def unique(values):
102102
values = com._asarray_tuplesafe(values)
103103

104104
f = lambda htype, caster: _unique_generic(values, htype, caster)
105-
return _hashtable_algo(f, values.dtype)
105+
return _hashtable_algo(f, values)
106106

107107

108108
def _unique_generic(values, table_type, type_caster):
@@ -759,10 +759,12 @@ def _finalize_nsmallest(arr, kth_val, n, keep, narr):
759759
# helpers #
760760
# ------- #
761761

762-
def _hashtable_algo(f, dtype, return_dtype=None):
762+
def _hashtable_algo(f, values, return_dtype=None):
763763
"""
764764
f(HashTable, type_caster) -> result
765765
"""
766+
767+
dtype = values.dtype
766768
if is_float_dtype(dtype):
767769
return f(htable.Float64HashTable, _ensure_float64)
768770
elif is_integer_dtype(dtype):
@@ -773,17 +775,25 @@ def _hashtable_algo(f, dtype, return_dtype=None):
773775
elif is_timedelta64_dtype(dtype):
774776
return_dtype = return_dtype or 'm8[ns]'
775777
return f(htable.Int64HashTable, _ensure_int64).view(return_dtype)
776-
else:
777-
return f(htable.PyObjectHashTable, _ensure_object)
778+
779+
# it's cheaper to use a StringHashTable than a PyObjectHashTable
780+
if lib.infer_dtype(values) in ['string']:
781+
return f(htable.StringHashTable, _ensure_object)
782+
783+
# use Object
784+
return f(htable.PyObjectHashTable, _ensure_object)
778785

779786
_hashtables = {
780787
'float64': (htable.Float64HashTable, htable.Float64Vector),
781788
'int64': (htable.Int64HashTable, htable.Int64Vector),
789+
'string': (htable.StringHashTable, htable.ObjectVector),
782790
'generic': (htable.PyObjectHashTable, htable.ObjectVector)
783791
}
784792

785793

786794
def _get_data_algo(values, func_map):
795+
796+
f = None
787797
if is_float_dtype(values):
788798
f = func_map['float64']
789799
values = _ensure_float64(values)
@@ -796,8 +806,19 @@ def _get_data_algo(values, func_map):
796806
f = func_map['int64']
797807
values = _ensure_int64(values)
798808
else:
799-
f = func_map['generic']
809+
800810
values = _ensure_object(values)
811+
812+
# it's cheaper to use a StringHashTable than a PyObjectHashTable
813+
if lib.infer_dtype(values) in ['string']:
814+
try:
815+
f = func_map['string']
816+
except KeyError:
817+
pass
818+
819+
if f is None:
820+
f = func_map['generic']
821+
801822
return f, values
802823

803824

pandas/hashtable.pxd

+7-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from khash cimport kh_int64_t, kh_float64_t, kh_pymap_t, int64_t, float64_t
1+
from khash cimport kh_int64_t, kh_float64_t, kh_pymap_t, kh_str_t, int64_t, float64_t
22

33
# prototypes for sharing
44

@@ -22,3 +22,9 @@ cdef class PyObjectHashTable(HashTable):
2222

2323
cpdef get_item(self, object val)
2424
cpdef set_item(self, object key, Py_ssize_t val)
25+
26+
cdef class StringHashTable(HashTable):
27+
cdef kh_str_t *table
28+
29+
cpdef get_item(self, object val)
30+
cpdef set_item(self, object key, Py_ssize_t val)

pandas/hashtable.pyx

+6-2
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,11 @@ from cpython cimport PyObject, Py_INCREF, PyList_Check, PyTuple_Check
44

55
from khash cimport *
66
from numpy cimport *
7-
from cpython cimport PyMem_Malloc, PyMem_Realloc, PyMem_Free
7+
8+
from libc.stdlib cimport malloc, free
9+
from cpython cimport (PyMem_Malloc, PyMem_Realloc, PyMem_Free,
10+
PyString_Check, PyBytes_Check,
11+
PyUnicode_Check)
812

913
from util cimport _checknan
1014
cimport util
@@ -33,7 +37,7 @@ PyDateTime_IMPORT
3337
cdef extern from "Python.h":
3438
int PySlice_Check(object)
3539

36-
cdef size_t _INIT_VEC_CAP = 32
40+
cdef size_t _INIT_VEC_CAP = 128
3741

3842

3943
include "hashtable_class_helper.pxi"

0 commit comments

Comments
 (0)