From ed3cdf0d58a3d4d430c0a40a80e195ba33da3bd2 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 29 Apr 2016 18:39:38 -0400 Subject: [PATCH] BUG: to_datetime when called with a unit and coerce is buggy closes #11758 --- doc/source/whatsnew/v0.18.1.txt | 26 +++- pandas/io/json.py | 2 +- pandas/tseries/tests/test_timeseries.py | 39 ++++++ pandas/tseries/tools.py | 17 ++- pandas/tslib.pyx | 173 +++++++++++++++++++++--- 5 files changed, 231 insertions(+), 26 deletions(-) diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index 01ed9bea53c87..2474bf8377d6e 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -428,6 +428,30 @@ In addition to this error change, several others have been made as well: - ``pd.read_csv()`` no longer allows a combination of strings and integers for the ``usecols`` parameter (:issue:`12678`) +.. _whatsnew_0181.api.to_datetime: + +``to_datetime`` error changes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Bugs in ``pd.to_datetime()`` have been fixed when passing a ``unit`` with convertible entries and ``errors='coerce'``, or with non-convertible entries and ``errors='ignore'`` (:issue:`11758`) + +Previous behaviour: + +.. code-block:: python + + In [27]: pd.to_datetime(1420043460, unit='s', errors='coerce') + Out[27]: NaT + + In [28]: pd.to_datetime(11111111, unit='D', errors='ignore') + OverflowError: Python int too large to convert to C long + +New behaviour: + +.. ipython:: python + + pd.to_datetime(1420043460, unit='s', errors='coerce') + pd.to_datetime(11111111, unit='D', errors='ignore') + .. _whatsnew_0181.api.other: Other API changes @@ -444,7 +468,6 @@ Other API changes - ``pd.concat(ignore_index=True)`` now uses ``RangeIndex`` as default (:issue:`12695`) - ``pd.merge()`` and ``DataFrame.join()`` will show a ``UserWarning`` when merging/joining a single- with a multi-leveled dataframe (:issue:`9455`, :issue:`12219`) - .. 
_whatsnew_0181.deprecations: Deprecations @@ -514,7 +537,6 @@ Bug Fixes - Bug in aligning a ``Series`` with a ``DataFrame`` (:issue:`13037`) - - Bug in consistency of ``.name`` on ``.groupby(..).apply(..)`` cases (:issue:`12363`) diff --git a/pandas/io/json.py b/pandas/io/json.py index f06ec72062ffa..08bfd8d7796a0 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -400,7 +400,7 @@ def _try_convert_to_date(self, data): try: new_data = to_datetime(new_data, errors='raise', unit=date_unit) - except OverflowError: + except ValueError: continue except: break diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 794409888e4e2..d5accc2a65eb8 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -4151,6 +4151,7 @@ def test_basics_nanos(self): self.assertEqual(stamp.nanosecond, 500) def test_unit(self): + def check(val, unit=None, h=1, s=1, us=0): stamp = Timestamp(val, unit=unit) self.assertEqual(stamp.year, 2000) @@ -4217,6 +4218,44 @@ def check(val, unit=None, h=1, s=1, us=0): result = Timestamp('NaT') self.assertIs(result, NaT) + def test_unit_errors(self): + # GH 11758 + # test proper behavior with errors + + with self.assertRaises(ValueError): + to_datetime([1], unit='D', format='%Y%m%d') + + values = [11111111, 1, 1.0, tslib.iNaT, pd.NaT, np.nan, + 'NaT', ''] + result = to_datetime(values, unit='D', errors='ignore') + expected = Index([11111111, Timestamp('1970-01-02'), + Timestamp('1970-01-02'), pd.NaT, + pd.NaT, pd.NaT, pd.NaT, pd.NaT], + dtype=object) + tm.assert_index_equal(result, expected) + + result = to_datetime(values, unit='D', errors='coerce') + expected = DatetimeIndex(['NaT', '1970-01-02', '1970-01-02', + 'NaT', 'NaT', 'NaT', 'NaT', 'NaT']) + tm.assert_index_equal(result, expected) + + with self.assertRaises(ValueError): + to_datetime(values, unit='D', errors='raise') + + values = [1420043460000, tslib.iNaT, pd.NaT, np.nan, 'NaT'] + + result = 
to_datetime(values, errors='ignore', unit='s') + expected = Index([1420043460000, pd.NaT, pd.NaT, + pd.NaT, pd.NaT], dtype=object) + tm.assert_index_equal(result, expected) + + result = to_datetime(values, errors='coerce', unit='s') + expected = DatetimeIndex(['NaT', 'NaT', 'NaT', 'NaT', 'NaT']) + tm.assert_index_equal(result, expected) + + with self.assertRaises(ValueError): + to_datetime(values, errors='raise', unit='s') + def test_roundtrip(self): # test value to string and back conversions diff --git a/pandas/tseries/tools.py b/pandas/tseries/tools.py index 3bcc88827dee3..10ead73968f76 100644 --- a/pandas/tseries/tools.py +++ b/pandas/tseries/tools.py @@ -170,7 +170,7 @@ def _guess_datetime_format_for_array(arr, **kwargs): mapping={True: 'coerce', False: 'raise'}) def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, utc=None, box=True, format=None, exact=True, coerce=None, - unit='ns', infer_datetime_format=False): + unit=None, infer_datetime_format=False): """ Convert argument to datetime. 
@@ -293,7 +293,7 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, def _to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, utc=None, box=True, format=None, exact=True, - unit='ns', freq=None, infer_datetime_format=False): + unit=None, freq=None, infer_datetime_format=False): """ Same as to_datetime, but accept freq for DatetimeIndex internal construction @@ -323,9 +323,17 @@ def _convert_listlike(arg, box, format, name=None): arg = arg.tz_convert(None).tz_localize('UTC') return arg - elif format is None and com.is_integer_dtype(arg) and unit == 'ns': - result = arg.astype('datetime64[ns]') + elif unit is not None: + if format is not None: + raise ValueError("cannot specify both format and unit") + arg = getattr(arg, 'values', arg) + result = tslib.array_with_unit_to_datetime(arg, unit, + errors=errors) if box: + if errors == 'ignore': + from pandas import Index + return Index(result, dtype=object) + return DatetimeIndex(result, tz='utc' if utc else None, name=name) return result @@ -387,7 +395,6 @@ def _convert_listlike(arg, box, format, name=None): dayfirst=dayfirst, yearfirst=yearfirst, freq=freq, - unit=unit, require_iso8601=require_iso8601 ) diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index a325c140d36d9..757a73a43758f 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -802,6 +802,7 @@ cdef pandas_datetimestruct _NS_MIN_DTS, _NS_MAX_DTS pandas_datetime_to_datetimestruct(_NS_LOWER_BOUND, PANDAS_FR_ns, &_NS_MIN_DTS) pandas_datetime_to_datetimestruct(_NS_UPPER_BOUND, PANDAS_FR_ns, &_NS_MAX_DTS) +# Resolution is in nanoseconds Timestamp.min = Timestamp(_NS_LOWER_BOUND) Timestamp.max = Timestamp(_NS_UPPER_BOUND) @@ -1344,6 +1345,8 @@ cdef convert_to_tsobject(object ts, object tz, object unit, cpdef convert_str_to_tsobject(object ts, object tz, object unit, dayfirst=False, yearfirst=False): + """ ts must be a string """ + cdef: _TSObject obj int out_local = 0, out_tzoffset = 0 @@ -1353,7 +1356,9 @@ cpdef 
convert_str_to_tsobject(object ts, object tz, object unit, obj = _TSObject() - if ts in _nat_strings: + assert util.is_string_object(ts) + + if len(ts) == 0 or ts in _nat_strings: ts = NaT elif ts == 'now': # Issue 9000, we short-circuit rather than going @@ -1778,6 +1783,10 @@ cdef inline object _parse_dateabbr_string(object date_string, object default, int year, quarter, month, mnum, date_len # special handling for possibilities eg, 2Q2005, 2Q05, 2005Q1, 05Q1 + assert util.is_string_object(date_string) + + # len(date_string) == 0 + # should be NaT??? if date_string in _nat_strings: return NaT, NaT, '' @@ -1961,9 +1970,141 @@ cpdef object _get_rule_month(object source, object default='DEC'): return source.split('-')[1] +cpdef array_with_unit_to_datetime(ndarray values, unit, errors='coerce'): + """ + convert the ndarray according to the unit + if errors: + - raise: return converted values or raise + - ignore: return non-convertible values as the same unit + - coerce: NaT for non-convertibles + """ + cdef: + Py_ssize_t i, j, n=len(values) + int64_t m + ndarray[float64_t] fvalues + ndarray mask + bint is_ignore=errors=='ignore', is_coerce=errors=='coerce', is_raise=errors=='raise' + ndarray[int64_t] iresult + ndarray[object] oresult + + assert is_ignore or is_coerce or is_raise + + m = cast_from_unit(None, unit) + + if is_raise: + + # we can simply raise if there is a conversion + # issue; but we need to mask the nulls + # we need to guard against out-of-range conversions + # to i8 + try: + iresult = values.astype('i8') + mask = iresult == iNaT + iresult[mask] = 0 + except: + + # we might have a directly convertible M8[ns] + if unit == 'ns': + try: + return values.astype('M8[ns]') + except: + pass + + # we have nulls embedded + from pandas import isnull + + values = values.astype('object') + mask = isnull(values) + values[mask] = 0 + iresult = values.astype('i8') + + fvalues = iresult.astype('f8') * m + if (fvalues < _NS_LOWER_BOUND).any() or (fvalues > 
_NS_UPPER_BOUND).any(): + raise ValueError("cannot convert input with unit: {0}".format(unit)) + result = (values*m).astype('M8[ns]') + iresult = result.view('i8') + iresult[mask] = iNaT + return result + + # coerce or ignore + result = np.empty(n, dtype='M8[ns]') + iresult = result.view('i8') + + try: + for i in range(n): + val = values[i] + + if _checknull_with_nat(val): + iresult[i] = NPY_NAT + + elif is_integer_object(val) or is_float_object(val): + + if val != val or val == NPY_NAT: + iresult[i] = NPY_NAT + else: + try: + iresult[i] = cast_from_unit(val, unit) + except: + if is_ignore: + raise + iresult[i] = NPY_NAT + + elif util.is_string_object(val): + if len(val) == 0 or val in _nat_strings: + iresult[i] = NPY_NAT + + else: + try: + iresult[i] = cast_from_unit(float(val), unit) + except: + if is_ignore: + raise + iresult[i] = NPY_NAT + + else: + + if is_ignore: + raise Exception + iresult[i] = NPY_NAT + + return result + + except: + pass + + # we have hit an exception + # and are in ignore mode + # redo as object + + oresult = np.empty(n, dtype=object) + for i in range(n): + val = values[i] + + if _checknull_with_nat(val): + oresult[i] = NaT + elif is_integer_object(val) or is_float_object(val): + + if val != val or val == NPY_NAT: + oresult[i] = NaT + else: + try: + oresult[i] = Timestamp(cast_from_unit(val, unit)) + except: + oresult[i] = val + + elif util.is_string_object(val): + if len(val) == 0 or val in _nat_strings: + oresult[i] = NaT + + else: + oresult[i] = val + + return oresult + + cpdef array_to_datetime(ndarray[object] values, errors='raise', dayfirst=False, yearfirst=False, freq=None, - format=None, utc=None, unit=None, + format=None, utc=None, require_iso8601=False): cdef: Py_ssize_t i, n = len(values) @@ -1974,8 +2115,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', bint utc_convert = bool(utc), seen_integer=0, seen_datetime=0 bint is_raise=errors=='raise', is_ignore=errors=='ignore', is_coerce=errors=='coerce' _TSObject 
_ts - int64_t m = cast_from_unit(None,unit) - int out_local = 0, out_tzoffset = 0 + int out_local=0, out_tzoffset=0 # specify error conditions assert is_raise or is_ignore or is_coerce @@ -1991,7 +2131,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', seen_datetime=1 if val.tzinfo is not None: if utc_convert: - _ts = convert_to_tsobject(val, None, unit, 0, 0) + _ts = convert_to_tsobject(val, None, 'ns', 0, 0) iresult[i] = _ts.value try: _check_dts_bounds(&_ts.dts) @@ -2043,23 +2183,20 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', if val == NPY_NAT: iresult[i] = NPY_NAT else: - iresult[i] = val*m + iresult[i] = val seen_integer=1 elif is_float_object(val) and not is_coerce: if val != val or val == NPY_NAT: iresult[i] = NPY_NAT else: - iresult[i] = cast_from_unit(val,unit) + iresult[i] = val seen_integer=1 else: try: - if len(val) == 0: + if len(val) == 0 or val in _nat_strings: iresult[i] = NPY_NAT continue - elif val in _nat_strings: - iresult[i] = NPY_NAT - continue _string_to_dts(val, &dts, &out_local, &out_tzoffset) value = pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts) if out_local == 1: @@ -2139,10 +2276,11 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', if _checknull_with_nat(val): oresult[i] = val elif util.is_string_object(val): - if len(val) == 0: - # TODO: ?? 
+ + if len(val) == 0 or val in _nat_strings: oresult[i] = 'NaT' continue + try: oresult[i] = parse_datetime_string(val, dayfirst=dayfirst, yearfirst=yearfirst, freq=freq) @@ -2725,10 +2863,9 @@ class Timedelta(_Timedelta): __pos__ = _op_unary_method(lambda x: x, '__pos__') __abs__ = _op_unary_method(lambda x: abs(x), '__abs__') - -# Resolution is in nanoseconds -Timedelta.min = Timedelta(np.iinfo(np.int64).min+1, 'ns') -Timedelta.max = Timedelta(np.iinfo(np.int64).max, 'ns') +# resolution in ns +Timedelta.min = Timedelta(np.iinfo(np.int64).min+1) +Timedelta.max = Timedelta(np.iinfo(np.int64).max) cdef PyTypeObject* td_type = Timedelta @@ -2856,7 +2993,7 @@ cdef inline parse_timedelta_string(object ts, coerce=False): # have_value : track if we have at least 1 leading unit # have_hhmmss : tracks if we have a regular format hh:mm:ss - if ts in _nat_strings or not len(ts): + if len(ts) == 0 or ts in _nat_strings: return NPY_NAT # decode ts if necessary