From cf1d8210bfc988ab285ee1e8c152ff444f5cad68 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 26 Oct 2022 16:32:35 -0700 Subject: [PATCH 1/4] API: allow mixed-datetimes-and-ints in to_datetime, DatetimeIndex --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/_libs/tslib.pyi | 1 - pandas/_libs/tslib.pyx | 9 ---- pandas/core/arrays/datetimes.py | 10 +--- pandas/core/dtypes/cast.py | 64 ++++++++++--------------- pandas/tests/frame/test_constructors.py | 11 ++--- pandas/tests/tools/test_to_datetime.py | 11 +++-- 7 files changed, 39 insertions(+), 68 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 78ea78ec97a3a..a9ad21d8e9608 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -141,6 +141,7 @@ Other API changes - The ``other`` argument in :meth:`DataFrame.mask` and :meth:`Series.mask` now defaults to ``no_default`` instead of ``np.nan`` consistent with :meth:`DataFrame.where` and :meth:`Series.where`. Entries will be filled with the corresponding NULL value (``np.nan`` for numpy dtypes, ``pd.NA`` for extension dtypes). (:issue:`49111`) - When creating a :class:`Series` with a object-dtype :class:`Index` of datetime objects, pandas no longer silently converts the index to a :class:`DatetimeIndex` (:issue:`39307`, :issue:`23598`) - :meth:`Series.unique` with dtype "timedelta64[ns]" or "datetime64[ns]" now returns :class:`TimedeltaArray` or :class:`DatetimeArray` instead of ``numpy.ndarray`` (:issue:`49176`) +- :func:`to_datetime` and :class:`DatetimeIndex` now allow sequences containing both ``datetime`` objects and numeric entries, matching :class:`Series` behavior (:issue:`49037`) - .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/tslib.pyi b/pandas/_libs/tslib.pyi index 8fec9ecf27f30..e59c60f8c8355 100644 --- a/pandas/_libs/tslib.pyi +++ b/pandas/_libs/tslib.pyi @@ -24,7 +24,6 @@ def array_to_datetime( yearfirst: bool = ..., utc: bool = ..., require_iso8601: bool = ..., - allow_mixed: bool = ..., ) -> tuple[np.ndarray, tzinfo | None]: ... # returned ndarray may be object dtype or datetime64[ns] diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 03331f54db892..1aa112d3b29f6 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -448,7 +448,6 @@ cpdef array_to_datetime( bint yearfirst=False, bint utc=False, bint require_iso8601=False, - bint allow_mixed=False, ): """ Converts a 1D array of date-like values to a numpy array of either: @@ -477,8 +476,6 @@ cpdef array_to_datetime( indicator whether the dates should be UTC require_iso8601 : bool, default False indicator whether the datetime string should be iso8601 - allow_mixed : bool, default False - Whether to allow mixed datetimes and integers. Returns ------- @@ -708,12 +705,6 @@ cpdef array_to_datetime( val = values[i] if is_integer_object(val) or is_float_object(val): result[i] = NPY_NAT - elif allow_mixed: - pass - elif is_raise: - raise ValueError("mixed datetimes and integers in passed array") - else: - return _array_to_datetime_object(values, errors, dayfirst, yearfirst) if seen_datetime_offset and not utc_convert: # GH#17697 diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 8395d54224f1d..021d8e4989351 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -1991,7 +1991,6 @@ def sequence_to_datetimes(data, require_iso8601: bool = False) -> DatetimeArray: """ result, tz, freq = _sequence_to_dt64ns( data, - allow_mixed=True, require_iso8601=require_iso8601, ) @@ -2009,7 +2008,6 @@ def _sequence_to_dt64ns( dayfirst: bool = False, yearfirst: bool = False, ambiguous: TimeAmbiguous = "raise", - allow_mixed: bool = False, require_iso8601: bool = False, ): """ @@ -2022,8 +2020,6 @@ def _sequence_to_dt64ns( yearfirst : bool, default False ambiguous : str, bool, or arraylike, default 'raise' See pandas._libs.tslibs.tzconversion.tz_localize_to_utc. - allow_mixed : bool, default False - Interpret integers as timestamps when datetime objects are also present. require_iso8601 : bool, default False Only consider ISO-8601 formats when parsing strings. @@ -2071,7 +2067,6 @@ def _sequence_to_dt64ns( dayfirst=dayfirst, yearfirst=yearfirst, allow_object=False, - allow_mixed=allow_mixed, require_iso8601=require_iso8601, ) if tz and inferred_tz: @@ -2161,7 +2156,6 @@ def objects_to_datetime64ns( errors: DateTimeErrorChoices = "raise", require_iso8601: bool = False, allow_object: bool = False, - allow_mixed: bool = False, ): """ Convert data to array of timestamps. @@ -2178,8 +2172,6 @@ def objects_to_datetime64ns( allow_object : bool Whether to return an object-dtype ndarray instead of raising if the data contains more than one timezone. - allow_mixed : bool, default False - Interpret integers as timestamps when datetime objects are also present. Returns ------- @@ -2208,7 +2200,7 @@ def objects_to_datetime64ns( dayfirst=dayfirst, yearfirst=yearfirst, require_iso8601=require_iso8601, - allow_mixed=allow_mixed, + allow_mixed=True, ) result = result.reshape(data.shape, order=order) except OverflowError as err: diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index cabd85aed1bbe..b1b64de189ede 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -65,7 +65,6 @@ is_complex, is_complex_dtype, is_datetime64_dtype, - is_dtype_equal, is_extension_array_dtype, is_float, is_float_dtype, @@ -1316,7 +1315,7 @@ def maybe_cast_to_datetime( Caller is responsible for handling ExtensionDtype cases. """ - from pandas.core.arrays.datetimes import sequence_to_datetimes + from pandas.core.arrays.datetimes import DatetimeArray from pandas.core.arrays.timedeltas import TimedeltaArray if not is_list_like(value): @@ -1332,47 +1331,34 @@ def maybe_cast_to_datetime( res = TimedeltaArray._from_sequence(value, dtype=dtype) return res + elif is_datetime64_dtype(dtype): + # Incompatible types in assignment (expression has type + # "Union[dtype[Any], ExtensionDtype]", variable has type + # "Optional[dtype[Any]]") + dtype = _ensure_nanosecond_dtype(dtype) # type: ignore[assignment] + + try: + dta = DatetimeArray._from_sequence(value, dtype=dtype) + except ParserError: + # Note: this is dateutil's ParserError, not ours. + pass + except ValueError as err: + # We can give a Series-specific exception message. + if "cannot supply both a tz and a timezone-naive dtype" in str(err): + raise ValueError( + "Cannot convert timezone-aware data to " + "timezone-naive dtype. Use " + "pd.Series(values).dt.tz_localize(None) instead." + ) from err + raise + + return dta + if dtype is not None: - is_datetime64 = is_datetime64_dtype(dtype) vdtype = getattr(value, "dtype", None) - if is_datetime64: - # Incompatible types in assignment (expression has type - # "Union[dtype[Any], ExtensionDtype]", variable has type - # "Optional[dtype[Any]]") - dtype = _ensure_nanosecond_dtype(dtype) # type: ignore[assignment] - - value = np.array(value, copy=False) - - # we have an array of datetime or timedeltas & nulls - if value.size or not is_dtype_equal(value.dtype, dtype): - _disallow_mismatched_datetimelike(value, dtype) - - try: - dta = sequence_to_datetimes(value) - # GH 25843: Remove tz information since the dtype - # didn't specify one - - if dta.tz is not None: - raise ValueError( - "Cannot convert timezone-aware data to " - "timezone-naive dtype. Use " - "pd.Series(values).dt.tz_localize(None) instead." - ) - - # TODO(2.0): Do this astype in sequence_to_datetimes to - # avoid potential extra copy? - dta = dta.astype(dtype, copy=False) - value = dta - - except OutOfBoundsDatetime: - raise - except ParserError: - # Note: this is dateutil's ParserError, not ours. - pass - - elif getattr(vdtype, "kind", None) in ["m", "M"]: + if getattr(vdtype, "kind", None) in ["m", "M"]: # we are already datetimelike and want to coerce to non-datetimelike; # astype_nansafe will raise for anything other than object, then upcast. # see test_datetimelike_values_with_object_dtype diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 10b1f8406025a..7a21543dd0b65 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -3102,14 +3102,11 @@ def test_from_scalar_datetimelike_mismatched(self, constructor, cls): scalar = cls("NaT", "ns") dtype = {np.datetime64: "m8[ns]", np.timedelta64: "M8[ns]"}[cls] - msg = "Cannot cast" if cls is np.datetime64: - msg = "|".join( - [ - r"dtype datetime64\[ns\] cannot be converted to timedelta64\[ns\]", - "Cannot cast", - ] - ) + msg1 = r"dtype datetime64\[ns\] cannot be converted to timedelta64\[ns\]" + else: + msg1 = r"dtype timedelta64\[ns\] cannot be converted to datetime64\[ns\]" + msg = "|".join(["Cannot cast", msg1]) with pytest.raises(TypeError, match=msg): constructor(scalar, dtype=dtype) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index f524bc18793d8..72f8b74f76504 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1444,9 +1444,14 @@ def test_unit_mixed(self, cache, exp, arr): result = to_datetime(arr, errors="coerce", cache=cache) tm.assert_index_equal(result, expected) - msg = "mixed datetimes and integers in passed array" - with pytest.raises(ValueError, match=msg): - to_datetime(arr, errors="raise", cache=cache) + # GH#49037 pre-2.0 this raised, but it always worked with Series, + # was never clear why it was disallowed + result = to_datetime(arr, errors="raise", cache=cache) + expected = Index([Timestamp(x) for x in arr], dtype="M8[ns]") + tm.assert_index_equal(result, expected) + + result = DatetimeIndex(arr) + tm.assert_index_equal(result, expected) def test_unit_rounding(self, cache): # GH 14156 & GH 20445: argument will incur floating point errors From 8d50009fc7ddda2203f969c9cee6aece9c49120d Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 26 Oct 2022 18:07:50 -0700 Subject: [PATCH 2/4] typo fixup --- pandas/core/arrays/datetimes.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 021d8e4989351..4e1110c09c084 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -2200,7 +2200,6 @@ def objects_to_datetime64ns( dayfirst=dayfirst, yearfirst=yearfirst, require_iso8601=require_iso8601, - allow_mixed=True, ) result = result.reshape(data.shape, order=order) except OverflowError as err: From c9feaa0a54f377d71f9e60bca9403eac8e5ed37d Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 27 Oct 2022 08:15:03 -0700 Subject: [PATCH 3/4] typo fixup, update import --- pandas/core/dtypes/cast.py | 2 +- pandas/tests/frame/methods/test_combine_first.py | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index b1b64de189ede..46f25e8f0f694 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1341,7 +1341,7 @@ def maybe_cast_to_datetime( dta = DatetimeArray._from_sequence(value, dtype=dtype) except ParserError: # Note: this is dateutil's ParserError, not ours. - pass + return value except ValueError as err: # We can give a Series-specific exception message. if "cannot supply both a tz and a timezone-naive dtype" in str(err): diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index 30aef0bc0ec98..e838c8fabf456 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -6,10 +6,8 @@ from pandas.compat import pa_version_under7p0 from pandas.errors import PerformanceWarning -from pandas.core.dtypes.cast import ( - find_common_type, - is_dtype_equal, -) +from pandas.core.dtypes.cast import find_common_type +from pandas.core.dtypes.common import is_dtype_equal import pandas as pd from pandas import ( From 718863156468651d11f142dffb3433c9d677d564 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 27 Oct 2022 15:06:42 -0700 Subject: [PATCH 4/4] mypy fixup --- pandas/core/dtypes/cast.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 78b80fe695cef..ba3a70fd59287 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1322,16 +1322,17 @@ def maybe_cast_to_datetime( return res elif is_datetime64_dtype(dtype): - # Incompatible types in assignment (expression has type - # "Union[dtype[Any], ExtensionDtype]", variable has type - # "Optional[dtype[Any]]") - dtype = _ensure_nanosecond_dtype(dtype) # type: ignore[assignment] + # Argument 1 to "_ensure_nanosecond_dtype" has incompatible type + # "Optional[dtype[Any]]"; expected "Union[dtype[Any], ExtensionDtype]" + # error: Incompatible types in assignment (expression has type + # "Union[dtype[Any], ExtensionDtype]", variable has type "Optional[dtype[Any]]") + dtype = _ensure_nanosecond_dtype(dtype) # type: ignore[arg-type,assignment] try: dta = DatetimeArray._from_sequence(value, dtype=dtype) except ParserError: # Note: this is dateutil's ParserError, not ours. - return value + return np.asarray(value) except ValueError as err: # We can give a Series-specific exception message. if "cannot supply both a tz and a timezone-naive dtype" in str(err):