From f6e9fff74f13c3d3073ea51f98cb886115f01c10 Mon Sep 17 00:00:00 2001
From: jreback
Date: Wed, 4 Jun 2014 09:39:39 -0400
Subject: [PATCH] PERF: add dtype inference vbenches (GH7332)

PERF: recognize int64 to timedelta64[ns] conversions and perform them faster
PERF: use names/kinds for dtype inference on known types
DOC: performance docs
---
 doc/source/v0.14.1.txt       |  4 +++
 pandas/core/common.py        |  2 +-
 pandas/core/ops.py           |  2 +-
 pandas/src/inference.pyx     | 63 ++++++++++++++++++++++++------------
 pandas/tseries/timedeltas.py | 15 ++++++---
 pandas/tslib.pyx             | 44 ++++++++++++-------------
 vb_suite/inference.py        | 36 +++++++++++++++++++++
 vb_suite/suite.py            |  1 +
 8 files changed, 118 insertions(+), 49 deletions(-)
 create mode 100644 vb_suite/inference.py

diff --git a/doc/source/v0.14.1.txt b/doc/source/v0.14.1.txt
index bb4ecddd58f16..2cc08abf9c57c 100644
--- a/doc/source/v0.14.1.txt
+++ b/doc/source/v0.14.1.txt
@@ -75,6 +75,10 @@ Enhancements

 Performance
 ~~~~~~~~~~~
+- Improvements in dtype inference for numeric operations, yielding performance gains
+  for dtypes: ``int64``, ``timedelta64``, ``datetime64`` (:issue:`7223`)
+
+

 Experimental
 ~~~~~~~~~~~~
diff --git a/pandas/core/common.py b/pandas/core/common.py
index d993112933fa9..e9ae26d0c7c81 100644
--- a/pandas/core/common.py
+++ b/pandas/core/common.py
@@ -1753,7 +1753,7 @@ def _possibly_cast_to_datetime(value, dtype, coerce=False):
             elif is_timedelta64:
                 from pandas.tseries.timedeltas import \
                     _possibly_cast_to_timedelta
-                value = _possibly_cast_to_timedelta(value, coerce='compat')
+                value = _possibly_cast_to_timedelta(value, coerce='compat', dtype=dtype)
         except:
             pass
diff --git a/pandas/core/ops.py b/pandas/core/ops.py
index 72a31296ba456..0f19634cb5a38 100644
--- a/pandas/core/ops.py
+++ b/pandas/core/ops.py
@@ -333,7 +333,7 @@ def _convert_to_array(self, values, name=None, other=None):
             values = values.to_series()
         elif inferred_type in ('timedelta', 'timedelta64'):
             # have a timedelta, convert to ns here
-            values = _possibly_cast_to_timedelta(values, coerce=coerce)
+            values = _possibly_cast_to_timedelta(values, coerce=coerce, dtype='timedelta64[ns]')
         elif inferred_type == 'integer':
             # py3 compat where dtype is 'm' but is an integer
             if values.dtype.kind == 'm':
diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx
index 34060d0c57a4e..3aa71ad02ba6a 100644
--- a/pandas/src/inference.pyx
+++ b/pandas/src/inference.pyx
@@ -17,29 +17,47 @@ def is_complex(object obj):
     return util.is_complex_object(obj)

 _TYPE_MAP = {
-    np.int8: 'integer',
-    np.int16: 'integer',
-    np.int32: 'integer',
-    np.int64: 'integer',
-    np.uint8: 'integer',
-    np.uint16: 'integer',
-    np.uint32: 'integer',
-    np.uint64: 'integer',
-    np.float32: 'floating',
-    np.float64: 'floating',
-    np.complex128: 'complex',
-    np.complex128: 'complex',
-    np.string_: 'string',
-    np.unicode_: 'unicode',
-    np.bool_: 'boolean',
-    np.datetime64 : 'datetime64',
-    np.timedelta64 : 'timedelta64'
+    'int8': 'integer',
+    'int16': 'integer',
+    'int32': 'integer',
+    'int64': 'integer',
+    'i' : 'integer',
+    'uint8': 'integer',
+    'uint16': 'integer',
+    'uint32': 'integer',
+    'uint64': 'integer',
+    'u' : 'integer',
+    'float32': 'floating',
+    'float64': 'floating',
+    'f' : 'floating',
+    'complex128': 'complex',
+    'c' : 'complex',
+    'string': 'string',
+    'S' : 'string',
+    'unicode': 'unicode',
+    'U' : 'unicode',
+    'bool': 'boolean',
+    'b' : 'boolean',
+    'datetime64[ns]' : 'datetime64',
+    'M' : 'datetime64',
+    'timedelta64[ns]' : 'timedelta64',
+    'm' : 'timedelta64',
 }

+# these types only exist on certain platforms
 try:
-    _TYPE_MAP[np.float128] = 'floating'
-    _TYPE_MAP[np.complex256] = 'complex'
-    _TYPE_MAP[np.float16] = 'floating'
+    np.float128
+    _TYPE_MAP['float128'] = 'floating'
+except AttributeError:
+    pass
+try:
+    np.complex256
+    _TYPE_MAP['complex256'] = 'complex'
+except AttributeError:
+    pass
+try:
+    np.float16
+    _TYPE_MAP['float16'] = 'floating'
 except AttributeError:
     pass
@@ -60,7 +78,10 @@ def infer_dtype(object _values):

     values = getattr(values, 'values', values)

-    val_kind = values.dtype.type
+    val_name = values.dtype.name
+    if val_name in _TYPE_MAP:
+        return _TYPE_MAP[val_name]
+    val_kind = values.dtype.kind
     if val_kind in _TYPE_MAP:
         return _TYPE_MAP[val_kind]
diff --git a/pandas/tseries/timedeltas.py b/pandas/tseries/timedeltas.py
index 0a5693cc55466..b812c0637b0ad 100644
--- a/pandas/tseries/timedeltas.py
+++ b/pandas/tseries/timedeltas.py
@@ -156,12 +156,13 @@ def convert(r=None, unit=None, m=m):
     # no converter
     raise ValueError("cannot create timedelta string converter for [{0}]".format(r))

-def _possibly_cast_to_timedelta(value, coerce=True):
+def _possibly_cast_to_timedelta(value, coerce=True, dtype=None):
     """ try to cast to timedelta64; if already timedelta-like, then make sure that we are [ns]
     (as numpy 1.6.2 is very buggy in this regard); don't force the conversion unless coerce is True

     if coerce='compat' force a compatibility coercion (to timedeltas) if needed
+    if dtype is passed then this is the target dtype
     """

     # coercion compatibility
@@ -201,10 +202,16 @@ def convert(td, dtype):
         return np.array([ convert(v,dtype) for v in value ], dtype='m8[ns]')

     # deal with numpy not being able to handle certain timedelta operations
-    if isinstance(value, (ABCSeries, np.ndarray)) and value.dtype.kind == 'm':
-        if value.dtype != 'timedelta64[ns]':
+    if isinstance(value, (ABCSeries, np.ndarray)):
+
+        # i8 conversions
+        if value.dtype == 'int64' and np.dtype(dtype) == 'timedelta64[ns]':
             value = value.astype('timedelta64[ns]')
-        return value
+            return value
+        elif value.dtype.kind == 'm':
+            if value.dtype != 'timedelta64[ns]':
+                value = value.astype('timedelta64[ns]')
+            return value

     # we don't have a timedelta, but we want to try to convert to one (but
     # don't force it)
diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx
index e7385400e5962..491997d680ce7 100644
--- a/pandas/tslib.pyx
+++ b/pandas/tslib.pyx
@@ -148,7 +148,7 @@ cdef inline bint _is_fixed_offset(object tz):
         else:
             return 0
         return 1
-    
+
 _zero_time = datetime_time(0, 0)
@@ -340,7 +340,7 @@ class Timestamp(_Timestamp):
     @property
     def is_year_end(self):
         return self._get_start_end_field('is_year_end')
-    
+
     def tz_localize(self, tz):
         """
         Convert naive Timestamp to local time zone
@@ -994,7 +994,7 @@ cdef inline void _localize_tso(_TSObject obj, object tz):
             pandas_datetime_to_datetimestruct(obj.value + deltas[0],
                                               PANDAS_FR_ns, &obj.dts)
         else:
-            pandas_datetime_to_datetimestruct(obj.value, PANDAS_FR_ns, &obj.dts)    
+            pandas_datetime_to_datetimestruct(obj.value, PANDAS_FR_ns, &obj.dts)
         obj.tzinfo = tz
     elif _treat_tz_as_pytz(tz):
         inf = tz._transition_info[pos]
@@ -1044,7 +1044,7 @@ cdef inline object _get_zone(object tz):
 cpdef inline object maybe_get_tz(object tz):
     '''
     (Maybe) Construct a timezone object from a string. If tz is a string, use it to construct a timezone object.
-    Otherwise, just return tz.    
+    Otherwise, just return tz.
     '''
     if isinstance(tz, string_types):
         split_tz = tz.split('/', 1)
@@ -1338,7 +1338,7 @@ def array_to_timedelta64(ndarray[object] values, coerce=False):
 def convert_to_timedelta(object ts, object unit='ns', coerce=False):
     return convert_to_timedelta64(ts, unit, coerce)

-cdef convert_to_timedelta64(object ts, object unit, object coerce):
+cdef inline convert_to_timedelta64(object ts, object unit, object coerce):
     """
     Convert an incoming object to a timedelta64 if possible
@@ -1953,9 +1953,9 @@ cdef inline bint _treat_tz_as_dateutil(object tz):
 cdef inline object _tz_cache_key(object tz):
     """
     Return the key in the cache for the timezone info object or None if unknown.
-    
+
     The key is currently the tz string for pytz timezones, the filename for dateutil timezones.
-    
+
     Notes
     =====
     This cannot just be the hash of a timezone object. Unfortunately, the hashes of two dateutil tz objects
@@ -2137,7 +2137,7 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, bint infer_dst=False):
     # right side
     idx_shifted = _ensure_int64(
         np.maximum(0, trans.searchsorted(vals + DAY_NS, side='right') - 1))
-    
+
     for i in range(n):
         v = vals[i] - deltas[idx_shifted[i]]
         pos = bisect_right_i8(tdata, v, ntrans) - 1
@@ -2517,7 +2517,7 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N
             pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts)
             dom = dts.day
-            
+
             if dom == 1:
                 out[i] = 1
         return out.view(bool)
@@ -2535,7 +2535,7 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N
             doy = mo_off + dom
             ldom = _month_offset[isleap, dts.month]
             dow = ts_dayofweek(ts)
-            
+
             if (ldom == doy and dow < 5) or (dow == 4 and (ldom - doy <= 2)):
                 out[i] = 1
         return out.view(bool)
@@ -2549,9 +2549,9 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N
             dom = dts.day
             doy = mo_off + dom
             ldom = _month_offset[isleap, dts.month]
-            
+
             if ldom == doy:
-                out[i] = 1                
+                out[i] = 1
         return out.view(bool)

     elif field == 'is_quarter_start':
@@ -2565,7 +2565,7 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N
                 dow = ts_dayofweek(ts)

                 if ((dts.month - start_month) % 3 == 0) and ((dom == 1 and dow < 5) or (dom <= 3 and dow == 0)):
-                    out[i] = 1                
+                    out[i] = 1
         return out.view(bool)
     else:
         for i in range(count):
@@ -2573,9 +2573,9 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N
                 pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts)
                 dom = dts.day
-                
+
                 if ((dts.month - start_month) % 3 == 0) and dom == 1:
-                    out[i] = 1                
+                    out[i] = 1
         return out.view(bool)

     elif field == 'is_quarter_end':
@@ -2591,9 +2591,9 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N
                 doy = mo_off + dom
                 ldom = _month_offset[isleap, dts.month]
                 dow = ts_dayofweek(ts)
-                
+
                 if ((dts.month - end_month) % 3 == 0) and ((ldom == doy and dow < 5) or (dow == 4 and (ldom - doy <= 2))):
-                    out[i] = 1                
+                    out[i] = 1
         return out.view(bool)
     else:
         for i in range(count):
@@ -2605,9 +2605,9 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N
                 dom = dts.day
                 doy = mo_off + dom
                 ldom = _month_offset[isleap, dts.month]
-                
+
                 if ((dts.month - end_month) % 3 == 0) and (ldom == doy):
-                    out[i] = 1                
+                    out[i] = 1
         return out.view(bool)

     elif field == 'is_year_start':
@@ -2621,7 +2621,7 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N
                 dow = ts_dayofweek(ts)

                 if (dts.month == start_month) and ((dom == 1 and dow < 5) or (dom <= 3 and dow == 0)):
-                    out[i] = 1                
+                    out[i] = 1
         return out.view(bool)
     else:
         for i in range(count):
@@ -2649,7 +2649,7 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N
                 ldom = _month_offset[isleap, dts.month]

                 if (dts.month == end_month) and ((ldom == doy and dow < 5) or (dow == 4 and (ldom - doy <= 2))):
-                    out[i] = 1                
+                    out[i] = 1
         return out.view(bool)
     else:
         for i in range(count):
@@ -2666,7 +2666,7 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N
                 if (dts.month == end_month) and (ldom == doy):
                     out[i] = 1
         return out.view(bool)
-    
+
     raise ValueError("Field %s not supported" % field)
diff --git a/vb_suite/inference.py b/vb_suite/inference.py
new file mode 100644
index 0000000000000..8855f7e654bb1
--- /dev/null
+++ b/vb_suite/inference.py
@@ -0,0 +1,36 @@
+from vbench.api import Benchmark
+from datetime import datetime
+import sys
+
+# from GH 7332
+
+setup = """from pandas_vb_common import *
+import pandas as pd
+N = 500000
+df_int64 = DataFrame(dict(A = np.arange(N,dtype='int64'), B = np.arange(N,dtype='int64')))
+df_int32 = DataFrame(dict(A = np.arange(N,dtype='int32'), B = np.arange(N,dtype='int32')))
+df_uint32 = DataFrame(dict(A = np.arange(N,dtype='uint32'), B = np.arange(N,dtype='uint32')))
+df_float64 = DataFrame(dict(A = np.arange(N,dtype='float64'), B = np.arange(N,dtype='float64')))
+df_float32 = DataFrame(dict(A = np.arange(N,dtype='float32'), B = np.arange(N,dtype='float32')))
+df_datetime64 = DataFrame(dict(A = pd.to_datetime(np.arange(N,dtype='int64'),unit='ms'),
+                               B = pd.to_datetime(np.arange(N,dtype='int64'),unit='ms')))
+df_timedelta64 = DataFrame(dict(A = df_datetime64['A']-df_datetime64['B'],
+                                B = df_datetime64['B']))
+"""
+
+dtype_infer_int64 = Benchmark('df_int64["A"] + df_int64["B"]', setup,
+                              start_date=datetime(2014, 1, 1))
+dtype_infer_int32 = Benchmark('df_int32["A"] + df_int32["B"]', setup,
+                              start_date=datetime(2014, 1, 1))
+dtype_infer_uint32 = Benchmark('df_uint32["A"] + df_uint32["B"]', setup,
+                               start_date=datetime(2014, 1, 1))
+dtype_infer_float64 = Benchmark('df_float64["A"] + df_float64["B"]', setup,
+                                start_date=datetime(2014, 1, 1))
+dtype_infer_float32 = Benchmark('df_float32["A"] + df_float32["B"]', setup,
+                                start_date=datetime(2014, 1, 1))
+dtype_infer_datetime64 = Benchmark('df_datetime64["A"] - df_datetime64["B"]', setup,
+                                   start_date=datetime(2014, 1, 1))
+dtype_infer_timedelta64_1 = Benchmark('df_timedelta64["A"] + df_timedelta64["B"]', setup,
+                                      start_date=datetime(2014, 1, 1))
+dtype_infer_timedelta64_2 = Benchmark('df_timedelta64["A"] + df_timedelta64["A"]', setup,
+                                      start_date=datetime(2014, 1, 1))
diff --git a/vb_suite/suite.py b/vb_suite/suite.py
index a1b38e8509e4e..be9aa03801641 100644
--- a/vb_suite/suite.py
+++ b/vb_suite/suite.py
@@ -12,6 +12,7 @@
            'index_object',
            'indexing',
            'io_bench',
+           'inference',
            'hdfstore_bench',
            'join_merge',
            'miscellaneous',
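
Reviewer sketch (not part of the patch): the inference.pyx change above makes infer_dtype short-circuit on the dtype's name and then on its one-character kind before falling back to scanning the values. Below is a minimal pure-Python stand-in for that lookup order; the TYPE_MAP entries merely mirror the map added above and nothing here is an excerpt of the compiled module.

import numpy as np

# stand-in for the Cython-level _TYPE_MAP, keyed by dtype name and dtype kind
TYPE_MAP = {'int64': 'integer', 'i': 'integer',
            'float64': 'floating', 'f': 'floating',
            'datetime64[ns]': 'datetime64', 'M': 'datetime64',
            'timedelta64[ns]': 'timedelta64', 'm': 'timedelta64'}

def infer_dtype_sketch(values):
    # 1) exact dtype name, e.g. 'int64' or 'timedelta64[ns]'
    if values.dtype.name in TYPE_MAP:
        return TYPE_MAP[values.dtype.name]
    # 2) one-character dtype kind, e.g. 'i', 'f', 'M', 'm'
    if values.dtype.kind in TYPE_MAP:
        return TYPE_MAP[values.dtype.kind]
    # 3) otherwise fall back to the slower per-element scan (omitted here)
    return 'unknown'

print(infer_dtype_sketch(np.arange(3, dtype='int64')))       # integer
print(infer_dtype_sketch(np.array([1, 2], dtype='m8[ns]')))  # timedelta64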
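
Likewise a rough standalone way, outside of vbench, to exercise the int64 -> timedelta64[ns] fast path and the arithmetic timed by the new dtype_infer_* benchmarks; the frame size and column names simply mirror vb_suite/inference.py and are otherwise arbitrary.

import numpy as np
import pandas as pd

N = 500000

# the i8 -> m8[ns] conversion the patch recognises: a plain astype/view,
# with no per-element coercion
i8 = np.arange(N, dtype='int64')
td = i8.astype('timedelta64[ns]')
assert (td.view('i8') == i8).all()

# the operations exercised by the new benchmarks
df_datetime64 = pd.DataFrame(dict(A=pd.to_datetime(np.arange(N, dtype='int64'), unit='ms'),
                                  B=pd.to_datetime(np.arange(N, dtype='int64'), unit='ms')))
df_timedelta64 = pd.DataFrame(dict(A=df_datetime64['A'] - df_datetime64['B'],
                                   B=df_datetime64['B']))

_ = df_datetime64['A'] - df_datetime64['B']    # datetime64 - datetime64 -> timedelta64[ns]
_ = df_timedelta64['A'] + df_timedelta64['B']  # timedelta64 + datetime64 -> datetime64[ns]
_ = df_timedelta64['A'] + df_timedelta64['A']  # timedelta64 + timedelta64 -> timedelta64[ns]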