jreback
diff --git a/‎pandas/core/algorithms.py
Lines changed: 39 additions & 24 deletions b/‎pandas/core/algorithms.py
Lines changed: 39 additions & 24 deletions
diff --git a/‎pandas/hashtable.pxd
Lines changed: 7 additions & 1 deletion b/‎pandas/hashtable.pxd
Lines changed: 7 additions & 1 deletion
diff --git a/‎pandas/hashtable.pyx
Lines changed: 6 additions & 2 deletions b/‎pandas/hashtable.pyx
Lines changed: 6 additions & 2 deletions
@@ -31,7 +31,7 @@
 
 import pandas.core.common as com
 import pandas.algos as algos
-import pandas.hashtable as htable
+import pandas.hashtable as _htable
 from pandas.compat import string_types
 from pandas.tslib import iNaT
 
@@ -65,7 +65,7 @@ def match(to_match, values, na_sentinel=-1):
         values = np.array(values, dtype='O')
 
     f = lambda htype, caster: _match_generic(to_match, values, htype, caster)
-    result = _hashtable_algo(f, values.dtype, np.int64)
+    result = _hashtable_algo(f, values, np.int64)
 
     if na_sentinel != -1:
 
@@ -102,7 +102,7 @@ def unique(values):
     values = com._asarray_tuplesafe(values)
 
     f = lambda htype, caster: _unique_generic(values, htype, caster)
-    return _hashtable_algo(f, values.dtype)
+    return _hashtable_algo(f, values)
 
 
 def _unique_generic(values, table_type, type_caster):
@@ -419,7 +419,7 @@ def _value_counts_arraylike(values, dropna=True):
             freq = values.freq
 
         values = values.view(np.int64)
-        keys, counts = htable.value_count_int64(values, dropna)
+        keys, counts = _htable.value_count_int64(values, dropna)
 
         if dropna:
             msk = keys != iNaT
@@ -436,14 +436,14 @@ def _value_counts_arraylike(values, dropna=True):
 
     elif is_integer_dtype(dtype):
         values = _ensure_int64(values)
-        keys, counts = htable.value_count_int64(values, dropna)
+        keys, counts = _htable.value_count_int64(values, dropna)
     elif is_float_dtype(dtype):
         values = _ensure_float64(values)
-        keys, counts = htable.value_count_float64(values, dropna)
+        keys, counts = _htable.value_count_float64(values, dropna)
     else:
         values = _ensure_object(values)
         mask = isnull(values)
-        keys, counts = htable.value_count_object(values, mask)
+        keys, counts = _htable.value_count_object(values, mask)
         if not dropna and mask.any():
             keys = np.insert(keys, 0, np.NaN)
             counts = np.insert(counts, 0, mask.sum())
@@ -486,13 +486,13 @@ def duplicated(values, keep='first'):
 
     if is_integer_dtype(dtype):
         values = _ensure_int64(values)
-        duplicated = htable.duplicated_int64(values, keep=keep)
+        duplicated = _htable.duplicated_int64(values, keep=keep)
     elif is_float_dtype(dtype):
         values = _ensure_float64(values)
-        duplicated = htable.duplicated_float64(values, keep=keep)
+        duplicated = _htable.duplicated_float64(values, keep=keep)
     else:
         values = _ensure_object(values)
-        duplicated = htable.duplicated_object(values, keep=keep)
+        duplicated = _htable.duplicated_object(values, keep=keep)
 
     return duplicated
 
@@ -512,19 +512,19 @@ def mode(values):
     dtype = values.dtype
     if is_integer_dtype(values):
         values = _ensure_int64(values)
-        result = constructor(sorted(htable.mode_int64(values)), dtype=dtype)
+        result = constructor(sorted(_htable.mode_int64(values)), dtype=dtype)
 
     elif issubclass(values.dtype.type, (np.datetime64, np.timedelta64)):
         dtype = values.dtype
         values = values.view(np.int64)
-        result = constructor(sorted(htable.mode_int64(values)), dtype=dtype)
+        result = constructor(sorted(_htable.mode_int64(values)), dtype=dtype)
 
     elif is_categorical_dtype(values):
         result = constructor(values.mode())
     else:
         mask = isnull(values)
         values = _ensure_object(values)
-        res = htable.mode_object(values, mask)
+        res = _htable.mode_object(values, mask)
         try:
             res = sorted(res)
         except TypeError as e:
@@ -733,27 +733,35 @@ def _finalize_nsmallest(arr, kth_val, n, keep, narr):
 # helpers #
 # ------- #
 
-def _hashtable_algo(f, dtype, return_dtype=None):
+def _hashtable_algo(f, values, return_dtype=None):
     """
     f(HashTable, type_caster) -> result
     """
+
+    dtype = values.dtype
     if is_float_dtype(dtype):
-        return f(htable.Float64HashTable, _ensure_float64)
+        return f(_htable.Float64HashTable, _ensure_float64)
     elif is_integer_dtype(dtype):
-        return f(htable.Int64HashTable, _ensure_int64)
+        return f(_htable.Int64HashTable, _ensure_int64)
     elif is_datetime64_dtype(dtype):
         return_dtype = return_dtype or 'M8[ns]'
-        return f(htable.Int64HashTable, _ensure_int64).view(return_dtype)
+        return f(_htable.Int64HashTable, _ensure_int64).view(return_dtype)
     elif is_timedelta64_dtype(dtype):
         return_dtype = return_dtype or 'm8[ns]'
-        return f(htable.Int64HashTable, _ensure_int64).view(return_dtype)
-    else:
-        return f(htable.PyObjectHashTable, _ensure_object)
+        return f(_htable.Int64HashTable, _ensure_int64).view(return_dtype)
+
+    # it's cheaper to use a String Hash Table than Object
+    if lib.infer_dtype(values) in ['string', 'unicode']:
+        return f(_htable.StringHashTable, _ensure_object)
+
+    # use Object
+    return f(_htable.PyObjectHashTable, _ensure_object)
 
 _hashtables = {
-    'float64': (htable.Float64HashTable, htable.Float64Vector),
-    'int64': (htable.Int64HashTable, htable.Int64Vector),
-    'generic': (htable.PyObjectHashTable, htable.ObjectVector)
+    'float64': (_htable.Float64HashTable, _htable.Float64Vector),
+    'int64': (_htable.Int64HashTable, _htable.Int64Vector),
+    'string': (_htable.StringHashTable, _htable.ObjectVector),
+    'generic': (_htable.PyObjectHashTable, _htable.ObjectVector)
 }
 
 
@@ -770,8 +778,15 @@ def _get_data_algo(values, func_map):
         f = func_map['int64']
         values = _ensure_int64(values)
     else:
-        f = func_map['generic']
+
         values = _ensure_object(values)
+
+        # it's cheaper to use a String Hash Table than Object
+        if lib.infer_dtype(values) in ['string', 'unicode']:
+            f = func_map['string']
+        else:
+            f = func_map['generic']
+
     return f, values
 
 
 
@@ -1,4 +1,4 @@
-from khash cimport kh_int64_t, kh_float64_t, kh_pymap_t, int64_t, float64_t
+from khash cimport kh_int64_t, kh_float64_t, kh_pymap_t, kh_str_t, int64_t, float64_t
 
 # prototypes for sharing
 
@@ -22,3 +22,9 @@ cdef class PyObjectHashTable(HashTable):
 
     cpdef get_item(self, object val)
     cpdef set_item(self, object key, Py_ssize_t val)
+
+cdef class StringHashTable(HashTable):
+    cdef kh_str_t *table
+
+    cpdef get_item(self, object val)
+    cpdef set_item(self, object key, Py_ssize_t val)
@@ -4,7 +4,11 @@ from cpython cimport PyObject, Py_INCREF, PyList_Check, PyTuple_Check
 
 from khash cimport *
 from numpy cimport *
-from cpython cimport PyMem_Malloc, PyMem_Realloc, PyMem_Free
+
+from libc.stdlib cimport malloc, free
+from cpython cimport (PyMem_Malloc, PyMem_Realloc, PyMem_Free,
+                      PyString_Check, PyBytes_Check,
+                      PyUnicode_Check)
 
 from util cimport _checknan
 cimport util
@@ -33,7 +37,7 @@ PyDateTime_IMPORT
 cdef extern from "Python.h":
     int PySlice_Check(object)
 
-cdef size_t _INIT_VEC_CAP = 32
+cdef size_t _INIT_VEC_CAP = 128
 
 
 include "hashtable_class_helper.pxi"