diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 355dceba1b953..2e1cc396287ce 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -997,6 +997,7 @@ Conversion - Bug in ``Series.ffill()`` with mixed dtypes containing tz-aware datetimes. (:issue:`14956`) - Bug in ``DataFrame.fillna()`` where the argument ``downcast`` was ignored when fillna value was of type ``dict`` (:issue:`15277`) - Bug in ``.asfreq()``, where frequency was not set for empty ``Series`` (:issue:`14320`) +- Bug in ``DataFrame`` construction with nulls and datetimes in a list-like (:issue:`15869`) Indexing ^^^^^^^^ diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx index b0fb7048f154c..905f5278bcfd8 100644 --- a/pandas/_libs/src/inference.pyx +++ b/pandas/_libs/src/inference.pyx @@ -439,31 +439,86 @@ def infer_dtype(object value): return 'mixed' -cpdef bint is_possible_datetimelike_array(object arr): - # determine if we have a possible datetimelike (or null-like) array +cpdef object infer_datetimelike_array(object arr): + """ + infer if we have a datetime or timedelta array + - date: we have *only* date and maybe strings, nulls + - datetime: we have *only* datetimes and maybe strings, nulls + - timedelta: we have *only* timedeltas and maybe strings, nulls + - nat: we do not have *any* date, datetimes or timedeltas, but do have + at least a NaT + - mixed: other objects (strings or actual objects) + + Parameters + ---------- + arr : object array + + Returns + ------- + string: {datetime, timedelta, date, nat, mixed} + + """ + cdef: Py_ssize_t i, n = len(arr) - bint seen_timedelta = 0, seen_datetime = 0 + bint seen_timedelta = 0, seen_date = 0, seen_datetime = 0 + bint seen_nat = 0 + list objs = [] object v for i in range(n): v = arr[i] if util.is_string_object(v): - continue + objs.append(v) + + if len(objs) == 3: + break + elif util._checknull(v): - continue - elif is_datetime(v): - seen_datetime=1 - elif is_timedelta(v): - seen_timedelta=1 + # nan or None + pass + elif v is NaT: + seen_nat = 1 + elif is_datetime(v) or util.is_datetime64_object(v): + # datetime, or np.datetime64 + seen_datetime = 1 + elif is_date(v): + seen_date = 1 + elif is_timedelta(v) or util.is_timedelta64_object(v): + # timedelta, or timedelta64 + seen_timedelta = 1 else: - return False - return seen_datetime or seen_timedelta + return 'mixed' + + if seen_date and not (seen_datetime or seen_timedelta): + return 'date' + elif seen_datetime and not seen_timedelta: + return 'datetime' + elif seen_timedelta and not seen_datetime: + return 'timedelta' + elif seen_nat: + return 'nat' + + # short-circuit by trying to + # actually convert these strings + # this is for performance as we don't need to try + # convert *every* string array + if len(objs) == 3: + try: + tslib.array_to_datetime(objs, errors='raise') + return 'datetime' + except: + pass + + # we are *not* going to infer from strings + # for timedelta as too much ambiguity + + return 'mixed' cdef inline bint is_null_datetimelike(v): # determine if we have a null for a timedelta/datetime (or integer - # versions)x + # versions) if util._checknull(v): return True elif v is NaT: diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 1ab292649a973..6d28d3b4dfcd5 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1366,6 +1366,15 @@ def test_constructor_with_datetimes(self): .reset_index(drop=True), 'b': i_no_tz}) tm.assert_frame_equal(df, expected) + def test_constructor_datetimes_with_nulls(self): + # gh-15869 + for arr in [np.array([None, None, None, None, + datetime.now(), None]), + np.array([None, None, datetime.now(), None])]: + result = DataFrame(arr).get_dtype_counts() + expected = Series({'datetime64[ns]': 1}) + tm.assert_series_equal(result, expected) + def test_constructor_for_list_with_dtypes(self): # TODO(wesm): unused intname = np.dtype(np.int_).name # noqa diff --git a/pandas/tests/frame/test_misc_api.py b/pandas/tests/frame/test_misc_api.py index 42427df90401d..50fa0dca6bf04 100644 --- a/pandas/tests/frame/test_misc_api.py +++ b/pandas/tests/frame/test_misc_api.py @@ -12,7 +12,7 @@ from numpy.random import randn import numpy as np -from pandas import DataFrame, Series +from pandas import DataFrame, Series, date_range, timedelta_range import pandas as pd from pandas.util.testing import (assert_almost_equal, @@ -328,6 +328,16 @@ def test_empty_nonzero(self): self.assertTrue(df.empty) self.assertTrue(df.T.empty) + def test_with_datetimelikes(self): + + df = DataFrame({'A': date_range('20130101', periods=10), + 'B': timedelta_range('1 day', periods=10)}) + t = df.T + + result = t.get_dtype_counts() + expected = Series({'object': 10}) + tm.assert_series_equal(result, expected) + def test_inplace_return_self(self): # re #1893 diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 24e4355fa9f9a..dbe2db67359f3 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -327,6 +327,14 @@ def test_constructor_datelike_coercion(self): result = df.loc['216'] self.assertTrue(result.dtype == object) + def test_constructor_datetimes_with_nulls(self): + # gh-15869 + for arr in [np.array([None, None, None, None, + datetime.now(), None]), + np.array([None, None, datetime.now(), None])]: + result = Series(arr) + assert result.dtype == 'M8[ns]' + def test_constructor_dtype_datetime64(self): s = Series(iNaT, dtype='M8[ns]', index=lrange(5)) diff --git a/pandas/types/cast.py b/pandas/types/cast.py index 985e5b9f95831..4180ad1919315 100644 --- a/pandas/types/cast.py +++ b/pandas/types/cast.py @@ -774,6 +774,10 @@ def maybe_infer_to_datetimelike(value, convert_dates=False): if not v.ndim == 1: v = v.ravel() + # we only care about object dtypes + if not is_object_dtype(v): + return value + if len(v): def _try_datetime(v): @@ -806,25 +810,25 @@ def _try_timedelta(v): except: return v - # do a quick inference for perf - sample = v[:min(3, len(v))] - inferred_type = lib.infer_dtype(sample) + inferred_type = lib.infer_datetimelike_array(_ensure_object(v)) - if (inferred_type in ['datetime', 'datetime64'] or - (convert_dates and inferred_type in ['date'])): + if inferred_type == 'date' and convert_dates: + value = _try_datetime(v) + elif inferred_type == 'datetime': value = _try_datetime(v) - elif inferred_type in ['timedelta', 'timedelta64']: + elif inferred_type == 'timedelta': value = _try_timedelta(v) + elif inferred_type == 'nat': - # It's possible to have nulls intermixed within the datetime or - # timedelta. These will in general have an inferred_type of 'mixed', - # so have to try both datetime and timedelta. - - # try timedelta first to avoid spurious datetime conversions - # e.g. '00:00:01' is a timedelta but technically is also a datetime - elif inferred_type in ['mixed']: + # if all NaT, return as datetime + if isnull(v).all(): + value = _try_datetime(v) + else: - if lib.is_possible_datetimelike_array(_ensure_object(v)): + # We have at least a NaT and a string + # try timedelta first to avoid spurious datetime conversions + # e.g. '00:00:01' is a timedelta but + # technically is also a datetime value = _try_timedelta(v) if lib.infer_dtype(value) in ['mixed']: value = _try_datetime(v)