Skip to content

PERF: better dtype inference for perf gains (GH7332) #7342

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 4, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions doc/source/v0.14.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,10 @@ Enhancements
Performance
~~~~~~~~~~~

- Improvements in dtype inference for numeric operations, yielding performance gains
for dtypes: ``int64``, ``timedelta64``, ``datetime64`` (:issue:`7223`)


Experimental
~~~~~~~~~~~~

Expand Down
2 changes: 1 addition & 1 deletion pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1753,7 +1753,7 @@ def _possibly_cast_to_datetime(value, dtype, coerce=False):
elif is_timedelta64:
from pandas.tseries.timedeltas import \
_possibly_cast_to_timedelta
value = _possibly_cast_to_timedelta(value, coerce='compat')
value = _possibly_cast_to_timedelta(value, coerce='compat', dtype=dtype)
except:
pass

Expand Down
2 changes: 1 addition & 1 deletion pandas/core/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,7 +333,7 @@ def _convert_to_array(self, values, name=None, other=None):
values = values.to_series()
elif inferred_type in ('timedelta', 'timedelta64'):
# have a timedelta, convert to ns here
values = _possibly_cast_to_timedelta(values, coerce=coerce)
values = _possibly_cast_to_timedelta(values, coerce=coerce, dtype='timedelta64[ns]')
elif inferred_type == 'integer':
# py3 compat where dtype is 'm' but is an integer
if values.dtype.kind == 'm':
Expand Down
63 changes: 42 additions & 21 deletions pandas/src/inference.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -17,29 +17,47 @@ def is_complex(object obj):
return util.is_complex_object(obj)

_TYPE_MAP = {
np.int8: 'integer',
np.int16: 'integer',
np.int32: 'integer',
np.int64: 'integer',
np.uint8: 'integer',
np.uint16: 'integer',
np.uint32: 'integer',
np.uint64: 'integer',
np.float32: 'floating',
np.float64: 'floating',
np.complex128: 'complex',
np.complex128: 'complex',
np.string_: 'string',
np.unicode_: 'unicode',
np.bool_: 'boolean',
np.datetime64 : 'datetime64',
np.timedelta64 : 'timedelta64'
'int8': 'integer',
'int16': 'integer',
'int32': 'integer',
'int64': 'integer',
'i' : 'integer',
'uint8': 'integer',
'uint16': 'integer',
'uint32': 'integer',
'uint64': 'integer',
'u' : 'integer',
'float32': 'floating',
'float64': 'floating',
'f' : 'floating',
'complex128': 'complex',
'c' : 'complex',
'string': 'string',
'S' : 'string',
'unicode': 'unicode',
'U' : 'unicode',
'bool': 'boolean',
'b' : 'boolean',
'datetime64[ns]' : 'datetime64',
'M' : 'datetime64',
'timedelta64[ns]' : 'timedelta64',
'm' : 'timedelta64',
}

# these types only exist on certain platforms
try:
_TYPE_MAP[np.float128] = 'floating'
_TYPE_MAP[np.complex256] = 'complex'
_TYPE_MAP[np.float16] = 'floating'
np.float128
_TYPE_MAP['float128'] = 'floating'
except AttributeError:
pass
try:
np.complex256
_TYPE_MAP['complex256'] = 'complex'
except AttributeError:
pass
try:
np.float16
_TYPE_MAP['float16'] = 'floating'
except AttributeError:
pass

Expand All @@ -60,7 +78,10 @@ def infer_dtype(object _values):

values = getattr(values, 'values', values)

val_kind = values.dtype.type
val_name = values.dtype.name
if val_name in _TYPE_MAP:
return _TYPE_MAP[val_name]
val_kind = values.dtype.kind
if val_kind in _TYPE_MAP:
return _TYPE_MAP[val_kind]

Expand Down
15 changes: 11 additions & 4 deletions pandas/tseries/timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,12 +156,13 @@ def convert(r=None, unit=None, m=m):
# no converter
raise ValueError("cannot create timedelta string converter for [{0}]".format(r))

def _possibly_cast_to_timedelta(value, coerce=True):
def _possibly_cast_to_timedelta(value, coerce=True, dtype=None):
""" try to cast to timedelta64; if already timedelta-like, then make
sure that we are [ns] (as numpy 1.6.2 is very buggy in this regard);
don't force the conversion unless coerce is True

if coerce='compat', force a compatibility coercion (to timedeltas) if needed
if dtype is passed then this is the target dtype
"""

# coercion compatibility
Expand Down Expand Up @@ -201,10 +202,16 @@ def convert(td, dtype):
return np.array([ convert(v,dtype) for v in value ], dtype='m8[ns]')

# deal with numpy not being able to handle certain timedelta operations
if isinstance(value, (ABCSeries, np.ndarray)) and value.dtype.kind == 'm':
if value.dtype != 'timedelta64[ns]':
if isinstance(value, (ABCSeries, np.ndarray)):

# i8 conversions
if value.dtype == 'int64' and np.dtype(dtype) == 'timedelta64[ns]':
value = value.astype('timedelta64[ns]')
return value
return value
elif value.dtype.kind == 'm':
if value.dtype != 'timedelta64[ns]':
value = value.astype('timedelta64[ns]')
return value

# we don't have a timedelta, but we want to try to convert to one (but
# don't force it)
Expand Down
44 changes: 22 additions & 22 deletions pandas/tslib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ cdef inline bint _is_fixed_offset(object tz):
else:
return 0
return 1


_zero_time = datetime_time(0, 0)

Expand Down Expand Up @@ -340,7 +340,7 @@ class Timestamp(_Timestamp):
@property
def is_year_end(self):
return self._get_start_end_field('is_year_end')

def tz_localize(self, tz):
"""
Convert naive Timestamp to local time zone
Expand Down Expand Up @@ -994,7 +994,7 @@ cdef inline void _localize_tso(_TSObject obj, object tz):
pandas_datetime_to_datetimestruct(obj.value + deltas[0],
PANDAS_FR_ns, &obj.dts)
else:
pandas_datetime_to_datetimestruct(obj.value, PANDAS_FR_ns, &obj.dts)
pandas_datetime_to_datetimestruct(obj.value, PANDAS_FR_ns, &obj.dts)
obj.tzinfo = tz
elif _treat_tz_as_pytz(tz):
inf = tz._transition_info[pos]
Expand Down Expand Up @@ -1044,7 +1044,7 @@ cdef inline object _get_zone(object tz):
cpdef inline object maybe_get_tz(object tz):
'''
(Maybe) Construct a timezone object from a string. If tz is a string, use it to construct a timezone object.
Otherwise, just return tz.
Otherwise, just return tz.
'''
if isinstance(tz, string_types):
split_tz = tz.split('/', 1)
Expand Down Expand Up @@ -1338,7 +1338,7 @@ def array_to_timedelta64(ndarray[object] values, coerce=False):
def convert_to_timedelta(object ts, object unit='ns', coerce=False):
return convert_to_timedelta64(ts, unit, coerce)

cdef convert_to_timedelta64(object ts, object unit, object coerce):
cdef inline convert_to_timedelta64(object ts, object unit, object coerce):
"""
Convert an incoming object to a timedelta64 if possible

Expand Down Expand Up @@ -1953,9 +1953,9 @@ cdef inline bint _treat_tz_as_dateutil(object tz):
cdef inline object _tz_cache_key(object tz):
"""
Return the key in the cache for the timezone info object or None if unknown.

The key is currently the tz string for pytz timezones, the filename for dateutil timezones.

Notes
=====
This cannot just be the hash of a timezone object. Unfortunately, the hashes of two dateutil tz objects
Expand Down Expand Up @@ -2137,7 +2137,7 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, bint infer_dst=False):
# right side
idx_shifted = _ensure_int64(
np.maximum(0, trans.searchsorted(vals + DAY_NS, side='right') - 1))

for i in range(n):
v = vals[i] - deltas[idx_shifted[i]]
pos = bisect_right_i8(tdata, v, ntrans) - 1
Expand Down Expand Up @@ -2517,7 +2517,7 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N

pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts)
dom = dts.day

if dom == 1:
out[i] = 1
return out.view(bool)
Expand All @@ -2535,7 +2535,7 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N
doy = mo_off + dom
ldom = _month_offset[isleap, dts.month]
dow = ts_dayofweek(ts)

if (ldom == doy and dow < 5) or (dow == 4 and (ldom - doy <= 2)):
out[i] = 1
return out.view(bool)
Expand All @@ -2549,9 +2549,9 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N
dom = dts.day
doy = mo_off + dom
ldom = _month_offset[isleap, dts.month]

if ldom == doy:
out[i] = 1
out[i] = 1
return out.view(bool)

elif field == 'is_quarter_start':
Expand All @@ -2565,17 +2565,17 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N
dow = ts_dayofweek(ts)

if ((dts.month - start_month) % 3 == 0) and ((dom == 1 and dow < 5) or (dom <= 3 and dow == 0)):
out[i] = 1
out[i] = 1
return out.view(bool)
else:
for i in range(count):
if dtindex[i] == NPY_NAT: out[i] = -1; continue

pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts)
dom = dts.day

if ((dts.month - start_month) % 3 == 0) and dom == 1:
out[i] = 1
out[i] = 1
return out.view(bool)

elif field == 'is_quarter_end':
Expand All @@ -2591,9 +2591,9 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N
doy = mo_off + dom
ldom = _month_offset[isleap, dts.month]
dow = ts_dayofweek(ts)

if ((dts.month - end_month) % 3 == 0) and ((ldom == doy and dow < 5) or (dow == 4 and (ldom - doy <= 2))):
out[i] = 1
out[i] = 1
return out.view(bool)
else:
for i in range(count):
Expand All @@ -2605,9 +2605,9 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N
dom = dts.day
doy = mo_off + dom
ldom = _month_offset[isleap, dts.month]

if ((dts.month - end_month) % 3 == 0) and (ldom == doy):
out[i] = 1
out[i] = 1
return out.view(bool)

elif field == 'is_year_start':
Expand All @@ -2621,7 +2621,7 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N
dow = ts_dayofweek(ts)

if (dts.month == start_month) and ((dom == 1 and dow < 5) or (dom <= 3 and dow == 0)):
out[i] = 1
out[i] = 1
return out.view(bool)
else:
for i in range(count):
Expand Down Expand Up @@ -2649,7 +2649,7 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N
ldom = _month_offset[isleap, dts.month]

if (dts.month == end_month) and ((ldom == doy and dow < 5) or (dow == 4 and (ldom - doy <= 2))):
out[i] = 1
out[i] = 1
return out.view(bool)
else:
for i in range(count):
Expand All @@ -2666,7 +2666,7 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N
if (dts.month == end_month) and (ldom == doy):
out[i] = 1
return out.view(bool)

raise ValueError("Field %s not supported" % field)


Expand Down
36 changes: 36 additions & 0 deletions vb_suite/inference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# vbench benchmarks for binary arithmetic between two same-dtype DataFrame
# columns, one benchmark per dtype.
from vbench.api import Benchmark
from datetime import datetime
import sys

# from GH 7332

# Shared setup code handed to every Benchmark below as a source string.
# It builds one two-column DataFrame ("A" and "B") per dtype, each column
# holding N = 500000 values.  df_timedelta64 differs from the others:
# its "A" column is the difference of the two datetime columns, while its
# "B" column is the datetime column itself.
setup = """from pandas_vb_common import *
import pandas as pd
N = 500000
df_int64 = DataFrame(dict(A = np.arange(N,dtype='int64'), B = np.arange(N,dtype='int64')))
df_int32 = DataFrame(dict(A = np.arange(N,dtype='int32'), B = np.arange(N,dtype='int32')))
df_uint32 = DataFrame(dict(A = np.arange(N,dtype='uint32'), B = np.arange(N,dtype='uint32')))
df_float64 = DataFrame(dict(A = np.arange(N,dtype='float64'), B = np.arange(N,dtype='float64')))
df_float32 = DataFrame(dict(A = np.arange(N,dtype='float32'), B = np.arange(N,dtype='float32')))
df_datetime64 = DataFrame(dict(A = pd.to_datetime(np.arange(N,dtype='int64'),unit='ms'),
B = pd.to_datetime(np.arange(N,dtype='int64'),unit='ms')))
df_timedelta64 = DataFrame(dict(A = df_datetime64['A']-df_datetime64['B'],
B = df_datetime64['B']))
"""

# NOTE(review): each benchmark is bound to its own module-level name and no
# explicit name is passed to Benchmark; vbench presumably derives the
# benchmark name from this assignment -- confirm before folding these into a
# loop or helper.

# Integer and floating-point columns: add column "A" to column "B".
dtype_infer_int64 = Benchmark('df_int64["A"] + df_int64["B"]', setup,
start_date=datetime(2014, 1, 1))
dtype_infer_int32 = Benchmark('df_int32["A"] + df_int32["B"]', setup,
start_date=datetime(2014, 1, 1))
dtype_infer_uint32 = Benchmark('df_uint32["A"] + df_uint32["B"]', setup,
start_date=datetime(2014, 1, 1))
dtype_infer_float64 = Benchmark('df_float64["A"] + df_float64["B"]', setup,
start_date=datetime(2014, 1, 1))
dtype_infer_float32 = Benchmark('df_float32["A"] + df_float32["B"]', setup,
start_date=datetime(2014, 1, 1))
# Datetime columns: subtract "B" from "A".
dtype_infer_datetime64 = Benchmark('df_datetime64["A"] - df_datetime64["B"]', setup,
start_date=datetime(2014, 1, 1))
# Variant 1 adds the datetime-difference column "A" to the datetime column "B".
dtype_infer_timedelta64_1 = Benchmark('df_timedelta64["A"] + df_timedelta64["B"]', setup,
start_date=datetime(2014, 1, 1))
# Variant 2 adds the datetime-difference column "A" to itself.
dtype_infer_timedelta64_2 = Benchmark('df_timedelta64["A"] + df_timedelta64["A"]', setup,
start_date=datetime(2014, 1, 1))
1 change: 1 addition & 0 deletions vb_suite/suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
'index_object',
'indexing',
'io_bench',
'inference',
'hdfstore_bench',
'join_merge',
'miscellaneous',
Expand Down