From ed3cdf0d58a3d4d430c0a40a80e195ba33da3bd2 Mon Sep 17 00:00:00 2001
From: Jeff Reback <jeff@reback.net>
Date: Fri, 29 Apr 2016 18:39:38 -0400
Subject: [PATCH] BUG: to_datetime when called with a unit and coerce is buggy

closes #11758
---
 doc/source/whatsnew/v0.18.1.txt         |  26 +++-
 pandas/io/json.py                       |   2 +-
 pandas/tseries/tests/test_timeseries.py |  39 ++++++
 pandas/tseries/tools.py                 |  17 ++-
 pandas/tslib.pyx                        | 173 +++++++++++++++++++++---
 5 files changed, 231 insertions(+), 26 deletions(-)

diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt
index 01ed9bea53c87..2474bf8377d6e 100644
--- a/doc/source/whatsnew/v0.18.1.txt
+++ b/doc/source/whatsnew/v0.18.1.txt
@@ -428,6 +428,30 @@ In addition to this error change, several others have been made as well:
 - ``pd.read_csv()`` no longer allows a combination of strings and integers for the ``usecols`` parameter (:issue:`12678`)
 
 
+.. _whatsnew_0181.api.to_datetime:
+
+``to_datetime`` error changes
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Fixed bugs in ``pd.to_datetime()`` when passing a ``unit``: convertible entries were returned as ``NaT`` with ``errors='coerce'``, and non-convertible entries raised instead of being returned unchanged with ``errors='ignore'`` (:issue:`11758`)
+
+Previous behaviour:
+
+.. code-block:: python
+
+   In [27]: pd.to_datetime(1420043460, unit='s', errors='coerce')
+   Out[27]: NaT
+
+   In [28]: pd.to_datetime(11111111, unit='D', errors='ignore')
+   OverflowError: Python int too large to convert to C long
+
+New behaviour:
+
+.. ipython:: python
+
+   pd.to_datetime(1420043460, unit='s', errors='coerce')
+   pd.to_datetime(11111111, unit='D', errors='ignore')
+
 .. _whatsnew_0181.api.other:
 
 Other API changes
@@ -444,7 +468,6 @@ Other API changes
 - ``pd.concat(ignore_index=True)`` now uses ``RangeIndex`` as default (:issue:`12695`)
 - ``pd.merge()`` and ``DataFrame.join()`` will show a ``UserWarning`` when merging/joining a single- with a multi-leveled dataframe (:issue:`9455`, :issue:`12219`)
 
-
 .. _whatsnew_0181.deprecations:
 
 Deprecations
@@ -514,7 +537,6 @@ Bug Fixes
 - Bug in aligning a ``Series`` with a ``DataFrame`` (:issue:`13037`)
 
 
-
 - Bug in consistency of ``.name`` on ``.groupby(..).apply(..)`` cases (:issue:`12363`)
 
 
diff --git a/pandas/io/json.py b/pandas/io/json.py
index f06ec72062ffa..08bfd8d7796a0 100644
--- a/pandas/io/json.py
+++ b/pandas/io/json.py
@@ -400,7 +400,7 @@ def _try_convert_to_date(self, data):
             try:
                 new_data = to_datetime(new_data, errors='raise',
                                        unit=date_unit)
-            except OverflowError:
+            except ValueError:
                 continue
             except:
                 break
diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py
index 794409888e4e2..d5accc2a65eb8 100644
--- a/pandas/tseries/tests/test_timeseries.py
+++ b/pandas/tseries/tests/test_timeseries.py
@@ -4151,6 +4151,7 @@ def test_basics_nanos(self):
         self.assertEqual(stamp.nanosecond, 500)
 
     def test_unit(self):
+
         def check(val, unit=None, h=1, s=1, us=0):
             stamp = Timestamp(val, unit=unit)
             self.assertEqual(stamp.year, 2000)
@@ -4217,6 +4218,44 @@ def check(val, unit=None, h=1, s=1, us=0):
         result = Timestamp('NaT')
         self.assertIs(result, NaT)
 
+    def test_unit_errors(self):
+        # GH 11758
+        # test proper behavior with errors
+        # passing both a unit and a format raises
+        with self.assertRaises(ValueError):
+            to_datetime([1], unit='D', format='%Y%m%d')
+
+        values = [11111111, 1, 1.0, tslib.iNaT, pd.NaT, np.nan,
+                  'NaT', '']
+        result = to_datetime(values, unit='D', errors='ignore')
+        expected = Index([11111111, Timestamp('1970-01-02'),
+                          Timestamp('1970-01-02'), pd.NaT,
+                          pd.NaT, pd.NaT, pd.NaT, pd.NaT],
+                         dtype=object)
+        tm.assert_index_equal(result, expected)
+
+        result = to_datetime(values, unit='D', errors='coerce')
+        expected = DatetimeIndex(['NaT', '1970-01-02', '1970-01-02',
+                                  'NaT', 'NaT', 'NaT', 'NaT', 'NaT'])
+        tm.assert_index_equal(result, expected)
+
+        with self.assertRaises(ValueError):
+            to_datetime(values, unit='D', errors='raise')
+
+        values = [1420043460000, tslib.iNaT, pd.NaT, np.nan, 'NaT']
+        # with unit='s' the first entry is the non-convertible one
+        result = to_datetime(values, errors='ignore', unit='s')
+        expected = Index([1420043460000, pd.NaT, pd.NaT,
+                          pd.NaT, pd.NaT], dtype=object)
+        tm.assert_index_equal(result, expected)
+
+        result = to_datetime(values, errors='coerce', unit='s')
+        expected = DatetimeIndex(['NaT', 'NaT', 'NaT', 'NaT', 'NaT'])
+        tm.assert_index_equal(result, expected)
+
+        with self.assertRaises(ValueError):
+            to_datetime(values, errors='raise', unit='s')
+
     def test_roundtrip(self):
 
         # test value to string and back conversions
diff --git a/pandas/tseries/tools.py b/pandas/tseries/tools.py
index 3bcc88827dee3..10ead73968f76 100644
--- a/pandas/tseries/tools.py
+++ b/pandas/tseries/tools.py
@@ -170,7 +170,7 @@ def _guess_datetime_format_for_array(arr, **kwargs):
                  mapping={True: 'coerce', False: 'raise'})
 def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
                 utc=None, box=True, format=None, exact=True, coerce=None,
-                unit='ns', infer_datetime_format=False):
+                unit=None, infer_datetime_format=False):
     """
     Convert argument to datetime.
 
@@ -293,7 +293,7 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
 
 def _to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
                  utc=None, box=True, format=None, exact=True,
-                 unit='ns', freq=None, infer_datetime_format=False):
+                 unit=None, freq=None, infer_datetime_format=False):
     """
     Same as to_datetime, but accept freq for
     DatetimeIndex internal construction
@@ -323,9 +323,17 @@ def _convert_listlike(arg, box, format, name=None):
                 arg = arg.tz_convert(None).tz_localize('UTC')
             return arg
 
-        elif format is None and com.is_integer_dtype(arg) and unit == 'ns':
-            result = arg.astype('datetime64[ns]')
+        elif unit is not None:
+            if format is not None:
+                raise ValueError("cannot specify both format and unit")
+            arg = getattr(arg, 'values', arg)
+            result = tslib.array_with_unit_to_datetime(arg, unit,
+                                                       errors=errors)
             if box:
+                if errors == 'ignore':
+                    from pandas import Index
+                    return Index(result, dtype=object)
+
                 return DatetimeIndex(result, tz='utc' if utc else None,
                                      name=name)
             return result
@@ -387,7 +395,6 @@ def _convert_listlike(arg, box, format, name=None):
                     dayfirst=dayfirst,
                     yearfirst=yearfirst,
                     freq=freq,
-                    unit=unit,
                     require_iso8601=require_iso8601
                 )
 
diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx
index a325c140d36d9..757a73a43758f 100644
--- a/pandas/tslib.pyx
+++ b/pandas/tslib.pyx
@@ -802,6 +802,7 @@ cdef pandas_datetimestruct _NS_MIN_DTS, _NS_MAX_DTS
 pandas_datetime_to_datetimestruct(_NS_LOWER_BOUND, PANDAS_FR_ns, &_NS_MIN_DTS)
 pandas_datetime_to_datetimestruct(_NS_UPPER_BOUND, PANDAS_FR_ns, &_NS_MAX_DTS)
 
+# Resolution is in nanoseconds
 Timestamp.min = Timestamp(_NS_LOWER_BOUND)
 Timestamp.max = Timestamp(_NS_UPPER_BOUND)
 
@@ -1344,6 +1345,8 @@ cdef convert_to_tsobject(object ts, object tz, object unit,
 
 cpdef convert_str_to_tsobject(object ts, object tz, object unit,
                               dayfirst=False, yearfirst=False):
+    """ ts must be a string """
+
     cdef:
         _TSObject obj
         int out_local = 0, out_tzoffset = 0
@@ -1353,7 +1356,9 @@ cpdef convert_str_to_tsobject(object ts, object tz, object unit,
 
     obj = _TSObject()
 
-    if ts in _nat_strings:
+    assert util.is_string_object(ts)
+
+    if len(ts) == 0 or ts in _nat_strings:
         ts = NaT
     elif ts == 'now':
         # Issue 9000, we short-circuit rather than going
@@ -1778,6 +1783,10 @@ cdef inline object _parse_dateabbr_string(object date_string, object default,
         int year, quarter, month, mnum, date_len
 
     # special handling for possibilities eg, 2Q2005, 2Q05, 2005Q1, 05Q1
+    assert util.is_string_object(date_string)
+
+    # TODO: should an empty date_string (len(date_string) == 0)
+    # also be treated as NaT here?
 
     if date_string in _nat_strings:
         return NaT, NaT, ''
@@ -1961,9 +1970,141 @@ cpdef object _get_rule_month(object source, object default='DEC'):
         return source.split('-')[1]
 
 
+cpdef array_with_unit_to_datetime(ndarray values, unit, errors='coerce'):
+    """
+    Convert ``values``, expressed in ``unit``, to datetimes.
+
+    Parameters
+    ----------
+    values : ndarray of numbers, nulls or strings
+    unit : passed to ``cast_from_unit`` to scale the entries to nanoseconds
+    errors : {'coerce', 'raise', 'ignore'}
+      - raise: return converted values or raise
+      - ignore: return non-convertible values unchanged
+      - coerce: NaT for non-convertibles
+
+    Returns
+    -------
+    ndarray : M8[ns]; object dtype if errors='ignore' and an entry
+        could not be converted
+    """
+    cdef:
+        Py_ssize_t i, n=len(values)
+        int64_t m
+        ndarray[float64_t] fvalues
+        ndarray mask
+        bint is_ignore=errors=='ignore', is_coerce=errors=='coerce', is_raise=errors=='raise'
+        ndarray[int64_t] iresult
+        ndarray[object] oresult
+
+    assert is_ignore or is_coerce or is_raise
+
+    m = cast_from_unit(None, unit)
+
+    if is_raise:
+        # convert the whole array at once; nulls are zeroed so
+        # they cannot trip the bounds check below, and are
+        # restored as NaT at the end
+        try:
+            iresult = values.astype('i8')
+            mask = iresult == iNaT
+            iresult[mask] = 0
+        except Exception:
+            # we might have a directly convertible M8[ns]
+            if unit == 'ns':
+                try:
+                    return values.astype('M8[ns]')
+                except Exception:
+                    pass
+
+            # we have nulls embedded
+            from pandas import isnull
+            values = values.astype('object')
+            mask = isnull(values)
+            values[mask] = 0
+            iresult = values.astype('i8')
+
+        fvalues = iresult.astype('f8') * m
+        if (fvalues < _NS_LOWER_BOUND).any() or (fvalues > _NS_UPPER_BOUND).any():
+            raise ValueError("cannot convert input with unit: {0}".format(unit))
+        result = (values*m).astype('M8[ns]')
+        iresult = result.view('i8')
+        iresult[mask] = iNaT
+        return result
+
+    # coerce or ignore
+    result = np.empty(n, dtype='M8[ns]')
+    iresult = result.view('i8')
+
+    try:
+        for i in range(n):
+            val = values[i]
+            if _checknull_with_nat(val):
+                iresult[i] = NPY_NAT
+            elif is_integer_object(val) or is_float_object(val):
+                if val != val or val == NPY_NAT:
+                    iresult[i] = NPY_NAT
+                else:
+                    try:
+                        iresult[i] = cast_from_unit(val, unit)
+                    except Exception:
+                        if is_ignore:
+                            raise
+                        iresult[i] = NPY_NAT
+
+            elif util.is_string_object(val):
+                if len(val) == 0 or val in _nat_strings:
+                    iresult[i] = NPY_NAT
+                else:
+                    try:
+                        iresult[i] = cast_from_unit(float(val), unit)
+                    except Exception:
+                        if is_ignore:
+                            raise
+                        iresult[i] = NPY_NAT
+
+            else:
+                if is_ignore:
+                    raise Exception
+                iresult[i] = NPY_NAT
+
+        return result
+
+    except Exception:
+        # we have hit an exception and are in ignore mode;
+        # redo as object
+        pass
+
+    oresult = np.empty(n, dtype=object)
+    for i in range(n):
+        val = values[i]
+        if _checknull_with_nat(val):
+            oresult[i] = NaT
+        elif is_integer_object(val) or is_float_object(val):
+            if val != val or val == NPY_NAT:
+                oresult[i] = NaT
+            else:
+                try:
+                    oresult[i] = Timestamp(cast_from_unit(val, unit))
+                except Exception:
+                    oresult[i] = val
+
+        elif util.is_string_object(val):
+            if len(val) == 0 or val in _nat_strings:
+                oresult[i] = NaT
+            else:
+                oresult[i] = val
+
+        else:
+            # keep any other object as-is rather than leaving None
+            oresult[i] = val
+
+    return oresult
+
+
 cpdef array_to_datetime(ndarray[object] values, errors='raise',
                         dayfirst=False, yearfirst=False, freq=None,
-                        format=None, utc=None, unit=None,
+                        format=None, utc=None,
                         require_iso8601=False):
     cdef:
         Py_ssize_t i, n = len(values)
@@ -1974,8 +2115,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise',
         bint utc_convert = bool(utc), seen_integer=0, seen_datetime=0
         bint is_raise=errors=='raise', is_ignore=errors=='ignore', is_coerce=errors=='coerce'
         _TSObject _ts
-        int64_t m = cast_from_unit(None,unit)
-        int out_local = 0, out_tzoffset = 0
+        int out_local=0, out_tzoffset=0
 
     # specify error conditions
     assert is_raise or is_ignore or is_coerce
@@ -1991,7 +2131,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise',
                 seen_datetime=1
                 if val.tzinfo is not None:
                     if utc_convert:
-                        _ts = convert_to_tsobject(val, None, unit, 0, 0)
+                        _ts = convert_to_tsobject(val, None, 'ns', 0, 0)
                         iresult[i] = _ts.value
                         try:
                             _check_dts_bounds(&_ts.dts)
@@ -2043,23 +2183,20 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise',
                 if val == NPY_NAT:
                     iresult[i] = NPY_NAT
                 else:
-                    iresult[i] = val*m
+                    iresult[i] = val
                     seen_integer=1
             elif is_float_object(val) and not is_coerce:
                 if val != val or val == NPY_NAT:
                     iresult[i] = NPY_NAT
                 else:
-                    iresult[i] = cast_from_unit(val,unit)
+                    iresult[i] = <int64_t>val
                     seen_integer=1
             else:
                 try:
-                    if len(val) == 0:
+                    if len(val) == 0 or val in _nat_strings:
                         iresult[i] = NPY_NAT
                         continue
 
-                    elif val in _nat_strings:
-                        iresult[i] = NPY_NAT
-                        continue
                     _string_to_dts(val, &dts, &out_local, &out_tzoffset)
                     value = pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts)
                     if out_local == 1:
@@ -2139,10 +2276,11 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise',
             if _checknull_with_nat(val):
                 oresult[i] = val
             elif util.is_string_object(val):
-                if len(val) == 0:
-                    # TODO: ??
+
+                if len(val) == 0 or val in _nat_strings:
                     oresult[i] = 'NaT'
                     continue
+
                 try:
                     oresult[i] = parse_datetime_string(val, dayfirst=dayfirst,
                                                     yearfirst=yearfirst, freq=freq)
@@ -2725,10 +2863,9 @@ class Timedelta(_Timedelta):
     __pos__ = _op_unary_method(lambda x: x, '__pos__')
     __abs__ = _op_unary_method(lambda x: abs(x), '__abs__')
 
-
-# Resolution is in nanoseconds
-Timedelta.min = Timedelta(np.iinfo(np.int64).min+1, 'ns')
-Timedelta.max = Timedelta(np.iinfo(np.int64).max, 'ns')
+# resolution in ns
+Timedelta.min = Timedelta(np.iinfo(np.int64).min+1)
+Timedelta.max = Timedelta(np.iinfo(np.int64).max)
 
 cdef PyTypeObject* td_type = <PyTypeObject*> Timedelta
 
@@ -2856,7 +2993,7 @@ cdef inline parse_timedelta_string(object ts, coerce=False):
     # have_value : track if we have at least 1 leading unit
     # have_hhmmss : tracks if we have a regular format hh:mm:ss
 
-    if ts in _nat_strings or not len(ts):
+    if len(ts) == 0 or ts in _nat_strings:
         return NPY_NAT
 
     # decode ts if necessary