From f6e9fff74f13c3d3073ea51f98cb886115f01c10 Mon Sep 17 00:00:00 2001
From: jreback
Date: Wed, 4 Jun 2014 09:39:39 -0400
Subject: [PATCH] PERF: add dtype inference vbenches (GH7332)

PERF: recognize int64 to timedelta64[ns] conversions and perform them faster
PERF: use names/kinds for dtype inference on known types
DOC: performance docs
---
 doc/source/v0.14.1.txt       |  4 +++
 pandas/core/common.py        |  2 +-
 pandas/core/ops.py           |  2 +-
 pandas/src/inference.pyx     | 63 ++++++++++++++++++++++++------------
 pandas/tseries/timedeltas.py | 15 ++++++---
 pandas/tslib.pyx             | 44 ++++++++++++-------------
 vb_suite/inference.py        | 36 +++++++++++++++++++++
 vb_suite/suite.py            |  1 +
 8 files changed, 118 insertions(+), 49 deletions(-)
 create mode 100644 vb_suite/inference.py

diff --git a/doc/source/v0.14.1.txt b/doc/source/v0.14.1.txt
index bb4ecddd58f16..2cc08abf9c57c 100644
--- a/doc/source/v0.14.1.txt
+++ b/doc/source/v0.14.1.txt
@@ -75,6 +75,10 @@ Enhancements

 Performance
 ~~~~~~~~~~~
+- Improvements in dtype inference for numeric operations, yielding performance gains
+  for dtypes: ``int64``, ``timedelta64``, ``datetime64`` (:issue:`7223`)
+
+

 Experimental
 ~~~~~~~~~~~~
diff --git a/pandas/core/common.py b/pandas/core/common.py
index d993112933fa9..e9ae26d0c7c81 100644
--- a/pandas/core/common.py
+++ b/pandas/core/common.py
@@ -1753,7 +1753,7 @@ def _possibly_cast_to_datetime(value, dtype, coerce=False):
             elif is_timedelta64:
                 from pandas.tseries.timedeltas import \
                     _possibly_cast_to_timedelta
-                value = _possibly_cast_to_timedelta(value, coerce='compat')
+                value = _possibly_cast_to_timedelta(value, coerce='compat', dtype=dtype)
         except:
             pass
diff --git a/pandas/core/ops.py b/pandas/core/ops.py
index 72a31296ba456..0f19634cb5a38 100644
--- a/pandas/core/ops.py
+++ b/pandas/core/ops.py
@@ -333,7 +333,7 @@ def _convert_to_array(self, values, name=None, other=None):
             values = values.to_series()
         elif inferred_type in ('timedelta', 'timedelta64'):
             # have a timedelta, convert to ns here
-            values = _possibly_cast_to_timedelta(values, coerce=coerce)
+            values = _possibly_cast_to_timedelta(values, coerce=coerce, dtype='timedelta64[ns]')
         elif inferred_type == 'integer':
             # py3 compat where dtype is 'm' but is an integer
             if values.dtype.kind == 'm':
diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx
index 34060d0c57a4e..3aa71ad02ba6a 100644
--- a/pandas/src/inference.pyx
+++ b/pandas/src/inference.pyx
@@ -17,29 +17,47 @@ def is_complex(object obj):
     return util.is_complex_object(obj)

 _TYPE_MAP = {
-    np.int8: 'integer',
-    np.int16: 'integer',
-    np.int32: 'integer',
-    np.int64: 'integer',
-    np.uint8: 'integer',
-    np.uint16: 'integer',
-    np.uint32: 'integer',
-    np.uint64: 'integer',
-    np.float32: 'floating',
-    np.float64: 'floating',
-    np.complex128: 'complex',
-    np.complex128: 'complex',
-    np.string_: 'string',
-    np.unicode_: 'unicode',
-    np.bool_: 'boolean',
-    np.datetime64 : 'datetime64',
-    np.timedelta64 : 'timedelta64'
+    'int8': 'integer',
+    'int16': 'integer',
+    'int32': 'integer',
+    'int64': 'integer',
+    'i' : 'integer',
+    'uint8': 'integer',
+    'uint16': 'integer',
+    'uint32': 'integer',
+    'uint64': 'integer',
+    'u' : 'integer',
+    'float32': 'floating',
+    'float64': 'floating',
+    'f' : 'floating',
+    'complex128': 'complex',
+    'c' : 'complex',
+    'string': 'string',
+    'S' : 'string',
+    'unicode': 'unicode',
+    'U' : 'unicode',
+    'bool': 'boolean',
+    'b' : 'boolean',
+    'datetime64[ns]' : 'datetime64',
+    'M' : 'datetime64',
+    'timedelta64[ns]' : 'timedelta64',
+    'm' : 'timedelta64',
 }

+# these types only exist on certain platforms
 try:
-    _TYPE_MAP[np.float128] = 'floating'
-    _TYPE_MAP[np.complex256] = 'complex'
-    _TYPE_MAP[np.float16] = 'floating'
+    np.float128
+    _TYPE_MAP['float128'] = 'floating'
+except AttributeError:
+    pass
+try:
+    np.complex256
+    _TYPE_MAP['complex256'] = 'complex'
+except AttributeError:
+    pass
+try:
+    np.float16
+    _TYPE_MAP['float16'] = 'floating'
 except AttributeError:
     pass
@@ -60,7 +78,10 @@ def infer_dtype(object _values):

     values = getattr(values, 'values', values)

-    val_kind = values.dtype.type
+    val_name = values.dtype.name
+    if val_name in _TYPE_MAP:
+        return _TYPE_MAP[val_name]
+    val_kind = values.dtype.kind
     if val_kind in _TYPE_MAP:
         return _TYPE_MAP[val_kind]
diff --git a/pandas/tseries/timedeltas.py b/pandas/tseries/timedeltas.py
index 0a5693cc55466..b812c0637b0ad 100644
--- a/pandas/tseries/timedeltas.py
+++ b/pandas/tseries/timedeltas.py
@@ -156,12 +156,13 @@ def convert(r=None, unit=None, m=m):
     # no converter
     raise ValueError("cannot create timedelta string converter for [{0}]".format(r))

-def _possibly_cast_to_timedelta(value, coerce=True):
+def _possibly_cast_to_timedelta(value, coerce=True, dtype=None):
     """ try to cast to timedelta64; if already timedelta-like, then make sure that we are [ns]
     (as numpy 1.6.2 is very buggy in this regard); don't force the conversion unless coerce is True

     if coerce='compat' force a compatibility coercion (to timedeltas) if needed
+    if dtype is passed then this is the target dtype
     """

     # coercion compatibility
@@ -201,10 +202,16 @@ def convert(td, dtype):
         return np.array([ convert(v,dtype) for v in value ], dtype='m8[ns]')

     # deal with numpy not being able to handle certain timedelta operations
-    if isinstance(value, (ABCSeries, np.ndarray)) and value.dtype.kind == 'm':
-        if value.dtype != 'timedelta64[ns]':
+    if isinstance(value, (ABCSeries, np.ndarray)):
+
+        # i8 conversions
+        if value.dtype == 'int64' and np.dtype(dtype) == 'timedelta64[ns]':
             value = value.astype('timedelta64[ns]')
-        return value
+            return value
+        elif value.dtype.kind == 'm':
+            if value.dtype != 'timedelta64[ns]':
+                value = value.astype('timedelta64[ns]')
+            return value

     # we don't have a timedelta, but we want to try to convert to one (but
     # don't force it)
diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx
index e7385400e5962..491997d680ce7 100644
--- a/pandas/tslib.pyx
+++ b/pandas/tslib.pyx
@@ -148,7 +148,7 @@ cdef inline bint _is_fixed_offset(object tz):
         else:
             return 0
         return 1
-    
+
 _zero_time = datetime_time(0, 0)
@@ -340,7 +340,7 @@ class Timestamp(_Timestamp):
     @property
     def is_year_end(self):
         return self._get_start_end_field('is_year_end')
-    
+
     def tz_localize(self, tz):
         """
         Convert naive Timestamp to local time zone
@@ -994,7 +994,7 @@ cdef inline void _localize_tso(_TSObject obj, object tz):
             pandas_datetime_to_datetimestruct(obj.value + deltas[0],
                                               PANDAS_FR_ns, &obj.dts)
         else:
-            pandas_datetime_to_datetimestruct(obj.value, PANDAS_FR_ns, &obj.dts)    
+            pandas_datetime_to_datetimestruct(obj.value, PANDAS_FR_ns, &obj.dts)
         obj.tzinfo = tz
     elif _treat_tz_as_pytz(tz):
         inf = tz._transition_info[pos]
@@ -1044,7 +1044,7 @@ cdef inline object _get_zone(object tz):
 cpdef inline object maybe_get_tz(object tz):
     '''
     (Maybe) Construct a timezone object from a string. If tz is a string, use it to construct a timezone object.
-    Otherwise, just return tz.    
+    Otherwise, just return tz.
     '''
     if isinstance(tz, string_types):
         split_tz = tz.split('/', 1)
@@ -1338,7 +1338,7 @@ def array_to_timedelta64(ndarray[object] values, coerce=False):
 def convert_to_timedelta(object ts, object unit='ns', coerce=False):
     return convert_to_timedelta64(ts, unit, coerce)

-cdef convert_to_timedelta64(object ts, object unit, object coerce):
+cdef inline convert_to_timedelta64(object ts, object unit, object coerce):
     """
     Convert an incoming object to a timedelta64 if possible
@@ -1953,9 +1953,9 @@ cdef inline bint _treat_tz_as_dateutil(object tz):
 cdef inline object _tz_cache_key(object tz):
     """
     Return the key in the cache for the timezone info object or None if unknown.
-    
+
     The key is currently the tz string for pytz timezones, the filename for dateutil timezones.
-    
+
     Notes
     =====
     This cannot just be the hash of a timezone object. Unfortunately, the hashes of two dateutil tz objects
@@ -2137,7 +2137,7 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, bint infer_dst=False):
     # right side
     idx_shifted = _ensure_int64(
         np.maximum(0, trans.searchsorted(vals + DAY_NS, side='right') - 1))
-    
+
     for i in range(n):
         v = vals[i] - deltas[idx_shifted[i]]
         pos = bisect_right_i8(tdata, v, ntrans) - 1
@@ -2517,7 +2517,7 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N
             pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts)
             dom = dts.day
-            
+
             if dom == 1:
                 out[i] = 1
         return out.view(bool)
@@ -2535,7 +2535,7 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N
             doy = mo_off + dom
             ldom = _month_offset[isleap, dts.month]
             dow = ts_dayofweek(ts)
-            
+
             if (ldom == doy and dow < 5) or (dow == 4 and (ldom - doy <= 2)):
                 out[i] = 1
         return out.view(bool)
@@ -2549,9 +2549,9 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N
             dom = dts.day
             doy = mo_off + dom
             ldom = _month_offset[isleap, dts.month]
-            
+
             if ldom == doy:
-                out[i] = 1                
+                out[i] = 1
         return out.view(bool)

     elif field == 'is_quarter_start':
@@ -2565,7 +2565,7 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N
                 dow = ts_dayofweek(ts)

                 if ((dts.month - start_month) % 3 == 0) and ((dom == 1 and dow < 5) or (dom <= 3 and dow == 0)):
-                    out[i] = 1                
+                    out[i] = 1
         return out.view(bool)
     else:
         for i in range(count):
@@ -2573,9 +2573,9 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N
                 pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts)
                 dom = dts.day
-                
+
                 if ((dts.month - start_month) % 3 == 0) and dom == 1:
-                    out[i] = 1                
+                    out[i] = 1
         return out.view(bool)

     elif field == 'is_quarter_end':
@@ -2591,9 +2591,9 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N
                 doy = mo_off + dom
                 ldom = _month_offset[isleap, dts.month]
                 dow = ts_dayofweek(ts)
-                
+
                 if ((dts.month - end_month) % 3 == 0) and ((ldom == doy and dow < 5) or (dow == 4 and (ldom - doy <= 2))):
-                    out[i] = 1                
+                    out[i] = 1
         return out.view(bool)
     else:
         for i in range(count):
@@ -2605,9 +2605,9 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N
                 dom = dts.day
                 doy = mo_off + dom
                 ldom = _month_offset[isleap, dts.month]
-                
+
                 if ((dts.month - end_month) % 3 == 0) and (ldom == doy):
-                    out[i] = 1                
+                    out[i] = 1
         return out.view(bool)

     elif field == 'is_year_start':
@@ -2621,7 +2621,7 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N
                 dow = ts_dayofweek(ts)

                 if (dts.month == start_month) and ((dom == 1 and dow < 5) or (dom <= 3 and dow == 0)):
-                    out[i] = 1                
+                    out[i] = 1
         return out.view(bool)
     else:
         for i in range(count):
@@ -2649,7 +2649,7 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N
                 ldom = _month_offset[isleap, dts.month]

                 if (dts.month == end_month) and ((ldom == doy and dow < 5) or (dow == 4 and (ldom - doy <= 2))):
-                    out[i] = 1                
+                    out[i] = 1
         return out.view(bool)
     else:
         for i in range(count):
@@ -2666,7 +2666,7 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N
                 if (dts.month == end_month) and (ldom == doy):
                     out[i] = 1
         return out.view(bool)
-    
+
     raise ValueError("Field %s not supported" % field)
diff --git a/vb_suite/inference.py b/vb_suite/inference.py
new file mode 100644
index 0000000000000..8855f7e654bb1
--- /dev/null
+++ b/vb_suite/inference.py
@@ -0,0 +1,36 @@
+from vbench.api import Benchmark
+from datetime import datetime
+import sys
+
+# from GH 7332
+
+setup = """from pandas_vb_common import *
+import pandas as pd
+N = 500000
+df_int64 = DataFrame(dict(A = np.arange(N,dtype='int64'), B = np.arange(N,dtype='int64')))
+df_int32 = DataFrame(dict(A = np.arange(N,dtype='int32'), B = np.arange(N,dtype='int32')))
+df_uint32 = DataFrame(dict(A = np.arange(N,dtype='uint32'), B = np.arange(N,dtype='uint32')))
+df_float64 = DataFrame(dict(A = np.arange(N,dtype='float64'), B = np.arange(N,dtype='float64')))
+df_float32 = DataFrame(dict(A = np.arange(N,dtype='float32'), B = np.arange(N,dtype='float32')))
+df_datetime64 = DataFrame(dict(A = pd.to_datetime(np.arange(N,dtype='int64'),unit='ms'),
+                               B = pd.to_datetime(np.arange(N,dtype='int64'),unit='ms')))
+df_timedelta64 = DataFrame(dict(A = df_datetime64['A']-df_datetime64['B'],
+                                B = df_datetime64['B']))
+"""
+
+dtype_infer_int64 = Benchmark('df_int64["A"] + df_int64["B"]', setup,
+                              start_date=datetime(2014, 1, 1))
+dtype_infer_int32 = Benchmark('df_int32["A"] + df_int32["B"]', setup,
+                              start_date=datetime(2014, 1, 1))
+dtype_infer_uint32 = Benchmark('df_uint32["A"] + df_uint32["B"]', setup,
+                               start_date=datetime(2014, 1, 1))
+dtype_infer_float64 = Benchmark('df_float64["A"] + df_float64["B"]', setup,
+                                start_date=datetime(2014, 1, 1))
+dtype_infer_float32 = Benchmark('df_float32["A"] + df_float32["B"]', setup,
+                                start_date=datetime(2014, 1, 1))
+dtype_infer_datetime64 = Benchmark('df_datetime64["A"] - df_datetime64["B"]', setup,
+                                   start_date=datetime(2014, 1, 1))
+dtype_infer_timedelta64_1 = Benchmark('df_timedelta64["A"] + df_timedelta64["B"]', setup,
+                                      start_date=datetime(2014, 1, 1))
+dtype_infer_timedelta64_2 = Benchmark('df_timedelta64["A"] + df_timedelta64["A"]', setup,
+                                      start_date=datetime(2014, 1, 1))
diff --git a/vb_suite/suite.py b/vb_suite/suite.py
index a1b38e8509e4e..be9aa03801641 100644
--- a/vb_suite/suite.py
+++ b/vb_suite/suite.py
@@ -12,6 +12,7 @@
            'index_object',
            'indexing',
            'io_bench',
+           'inference',
            'hdfstore_bench',
            'join_merge',
            'miscellaneous',
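
Reviewer sketch (not part of the patch): the inference.pyx change above makes infer_dtype short-circuit on the dtype's name and then on its one-character kind before falling back to scanning the values. Below is a minimal pure-Python stand-in for that lookup order; the TYPE_MAP entries merely mirror the map added above and nothing here is an excerpt of the compiled module.

import numpy as np

# stand-in for the Cython-level _TYPE_MAP, keyed by dtype name and dtype kind
TYPE_MAP = {'int64': 'integer', 'i': 'integer',
            'float64': 'floating', 'f': 'floating',
            'datetime64[ns]': 'datetime64', 'M': 'datetime64',
            'timedelta64[ns]': 'timedelta64', 'm': 'timedelta64'}

def infer_dtype_sketch(values):
    # 1) exact dtype name, e.g. 'int64' or 'timedelta64[ns]'
    if values.dtype.name in TYPE_MAP:
        return TYPE_MAP[values.dtype.name]
    # 2) one-character dtype kind, e.g. 'i', 'f', 'M', 'm'
    if values.dtype.kind in TYPE_MAP:
        return TYPE_MAP[values.dtype.kind]
    # 3) otherwise fall back to the slower per-element scan (omitted here)
    return 'unknown'

print(infer_dtype_sketch(np.arange(3, dtype='int64')))       # integer
print(infer_dtype_sketch(np.array([1, 2], dtype='m8[ns]')))  # timedelta64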
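
Likewise a rough standalone way, outside of vbench, to exercise the int64 -> timedelta64[ns] fast path and the arithmetic timed by the new dtype_infer_* benchmarks; the frame size and column names simply mirror vb_suite/inference.py and are otherwise arbitrary.

import numpy as np
import pandas as pd

N = 500000

# the i8 -> m8[ns] conversion the patch recognises: a plain astype/view,
# with no per-element coercion
i8 = np.arange(N, dtype='int64')
td = i8.astype('timedelta64[ns]')
assert (td.view('i8') == i8).all()

# the operations exercised by the new benchmarks
df_datetime64 = pd.DataFrame(dict(A=pd.to_datetime(np.arange(N, dtype='int64'), unit='ms'),
                                  B=pd.to_datetime(np.arange(N, dtype='int64'), unit='ms')))
df_timedelta64 = pd.DataFrame(dict(A=df_datetime64['A'] - df_datetime64['B'],
                                   B=df_datetime64['B']))

_ = df_datetime64['A'] - df_datetime64['B']    # datetime64 - datetime64 -> timedelta64[ns]
_ = df_timedelta64['A'] + df_timedelta64['B']  # timedelta64 + datetime64 -> datetime64[ns]
_ = df_timedelta64['A'] + df_timedelta64['A']  # timedelta64 + timedelta64 -> timedelta64[ns]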