Skip to content

ENH: implement astype_overflowsafe for dt64 #46478

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Mar 24, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 4 additions & 37 deletions pandas/_libs/tslibs/conversion.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ from pandas._libs.tslibs.np_datetime cimport (
NPY_DATETIMEUNIT,
NPY_FR_ns,
_string_to_dts,
astype_overflowsafe,
check_dts_bounds,
dt64_to_dtstruct,
dtstruct_to_dt64,
Expand Down Expand Up @@ -215,54 +216,20 @@ def ensure_datetime64ns(arr: ndarray, copy: bool = True):
-------
ndarray with dtype datetime64[ns]
"""
cdef:
Py_ssize_t i, n = arr.size
const int64_t[:] ivalues
int64_t[:] iresult
NPY_DATETIMEUNIT unit
npy_datetimestruct dts

shape = (<object>arr).shape

if (<object>arr).dtype.byteorder == ">":
# GH#29684 we incorrectly get OutOfBoundsDatetime if we don't swap
dtype = arr.dtype
arr = arr.astype(dtype.newbyteorder("<"))

if arr.size == 0:
# Fastpath; doesn't matter but we have old tests for result.base
# being arr.
result = arr.view(DT64NS_DTYPE)
if copy:
result = result.copy()
return result

if arr.dtype.kind != "M":
raise TypeError("ensure_datetime64ns arr must have datetime64 dtype")
unit = get_unit_from_dtype(arr.dtype)
if unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC:
# without raising explicitly here, we end up with a SystemError
# built-in function ensure_datetime64ns returned a result with an error
raise ValueError("datetime64/timedelta64 must have a unit specified")

if unit == NPY_FR_ns:
# Check this before allocating result for perf, might save some memory
if copy:
return arr.copy()
return arr

ivalues = arr.view(np.int64).ravel("K")

result = np.empty_like(arr, dtype=DT64NS_DTYPE)
iresult = result.ravel("K").view(np.int64)

for i in range(n):
if ivalues[i] != NPY_NAT:
pandas_datetime_to_datetimestruct(ivalues[i], unit, &dts)
iresult[i] = dtstruct_to_dt64(&dts)
check_dts_bounds(&dts)
else:
iresult[i] = NPY_NAT

return result
return astype_overflowsafe(arr, DT64NS_DTYPE, copy=copy)


def ensure_timedelta64ns(arr: ndarray, copy: bool = True):
Expand Down
10 changes: 9 additions & 1 deletion pandas/_libs/tslibs/np_datetime.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ cdef extern from "numpy/ndarraytypes.h":
NPY_FR_as
NPY_FR_GENERIC

int64_t NPY_DATETIME_NAT # elsewhere we call this NPY_NAT

cdef extern from "src/datetime/np_datetime.h":
ctypedef struct pandas_timedeltastruct:
int64_t days
Expand All @@ -67,7 +69,7 @@ cdef extern from "src/datetime/np_datetime.h":

cdef bint cmp_scalar(int64_t lhs, int64_t rhs, int op) except -1

cdef check_dts_bounds(npy_datetimestruct *dts)
cdef check_dts_bounds(npy_datetimestruct *dts, NPY_DATETIMEUNIT unit=?)

cdef int64_t dtstruct_to_dt64(npy_datetimestruct* dts) nogil
cdef void dt64_to_dtstruct(int64_t dt64, npy_datetimestruct* out) nogil
Expand All @@ -86,3 +88,9 @@ cdef int _string_to_dts(str val, npy_datetimestruct* dts,
bint want_exc) except? -1

cdef NPY_DATETIMEUNIT get_unit_from_dtype(cnp.dtype dtype)

# Declaration of the datetime64 unit-to-unit cast implemented in
# np_datetime.pyx; that implementation raises TypeError unless both
# ``values`` and ``dtype`` are datetime64.
cpdef cnp.ndarray astype_overflowsafe(
cnp.ndarray values, # ndarray[datetime64[anyunit]]
cnp.dtype dtype, # dtype for datetime64[anyunit] (a dtype, not an ndarray)
bint copy=*,
)
3 changes: 3 additions & 0 deletions pandas/_libs/tslibs/np_datetime.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@ class OutOfBoundsDatetime(ValueError): ...

# only exposed for testing
def py_get_unit_from_dtype(dtype: np.dtype): ...
# NOTE: the first parameter is named ``values`` to match the cpdef
# implementation in np_datetime.pyx, so that keyword calls type-check the
# same way they behave at runtime.
def astype_overflowsafe(
    values: np.ndarray, dtype: np.dtype, copy: bool = ...
) -> np.ndarray: ...
105 changes: 98 additions & 7 deletions pandas/_libs/tslibs/np_datetime.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,10 @@ import_datetime()
cimport numpy as cnp

cnp.import_array()
from numpy cimport int64_t
from numpy cimport (
int64_t,
ndarray,
)

from pandas._libs.tslibs.util cimport get_c_string_buf_and_size

Expand All @@ -36,7 +39,12 @@ cdef extern from "src/datetime/np_datetime.h":
pandas_timedeltastruct *result
) nogil

# AS, FS, PS versions exist but are not imported because they are not used.
npy_datetimestruct _NS_MIN_DTS, _NS_MAX_DTS
npy_datetimestruct _US_MIN_DTS, _US_MAX_DTS
npy_datetimestruct _MS_MIN_DTS, _MS_MAX_DTS
npy_datetimestruct _S_MIN_DTS, _S_MAX_DTS
npy_datetimestruct _M_MIN_DTS, _M_MAX_DTS

PyArray_DatetimeMetaData get_datetime_metadata_from_dtype(cnp.PyArray_Descr *dtype);

Expand Down Expand Up @@ -119,22 +127,40 @@ class OutOfBoundsDatetime(ValueError):
pass


cdef inline check_dts_bounds(npy_datetimestruct *dts):
cdef check_dts_bounds(npy_datetimestruct *dts, NPY_DATETIMEUNIT unit=NPY_FR_ns):
"""Raises OutOfBoundsDatetime if the given date is outside the range that
can be represented by nanosecond-resolution 64-bit integers."""
cdef:
bint error = False

if (dts.year <= 1677 and
cmp_npy_datetimestruct(dts, &_NS_MIN_DTS) == -1):
npy_datetimestruct cmp_upper, cmp_lower

if unit == NPY_FR_ns:
cmp_upper = _NS_MAX_DTS
cmp_lower = _NS_MIN_DTS
elif unit == NPY_FR_us:
cmp_upper = _US_MAX_DTS
cmp_lower = _US_MIN_DTS
elif unit == NPY_FR_ms:
cmp_upper = _MS_MAX_DTS
cmp_lower = _MS_MIN_DTS
elif unit == NPY_FR_s:
cmp_upper = _S_MAX_DTS
cmp_lower = _S_MIN_DTS
elif unit == NPY_FR_m:
cmp_upper = _M_MAX_DTS
cmp_lower = _M_MIN_DTS
else:
raise NotImplementedError(unit)

if cmp_npy_datetimestruct(dts, &cmp_lower) == -1:
error = True
elif (dts.year >= 2262 and
cmp_npy_datetimestruct(dts, &_NS_MAX_DTS) == 1):
elif cmp_npy_datetimestruct(dts, &cmp_upper) == 1:
error = True

if error:
fmt = (f'{dts.year}-{dts.month:02d}-{dts.day:02d} '
f'{dts.hour:02d}:{dts.min:02d}:{dts.sec:02d}')
# TODO: "nanosecond" in the message assumes NPY_FR_ns
raise OutOfBoundsDatetime(f'Out of bounds nanosecond timestamp: {fmt}')


Expand Down Expand Up @@ -202,3 +228,68 @@ cdef inline int _string_to_dts(str val, npy_datetimestruct* dts,
buf = get_c_string_buf_and_size(val, &length)
return parse_iso_8601_datetime(buf, length, want_exc,
dts, out_local, out_tzoffset)


cpdef ndarray astype_overflowsafe(
    ndarray values,
    cnp.dtype dtype,
    bint copy=True,
):
    """
    Convert an ndarray with datetime64[X] to datetime64[Y], raising on overflow.

    Parameters
    ----------
    values : ndarray
        Array with a datetime64 dtype whose unit is not generic.
    dtype : np.dtype
        Target datetime64 dtype whose unit is not generic.
    copy : bool, default True
        Only consulted when ``values`` already has the target unit: a copy
        is returned if True, ``values`` itself otherwise.  When the units
        differ, a newly allocated array is always returned.

    Returns
    -------
    ndarray
        ``values`` (or a copy of it) when the units already match; otherwise
        a new array with the same shape as ``values``, viewed as ``dtype``.

    Raises
    ------
    TypeError
        If ``values`` or ``dtype`` is not datetime64.
    ValueError
        If either unit is generic (unit-less).
    OutOfBoundsDatetime
        Raised by ``check_dts_bounds`` when a non-NaT element falls outside
        the bounds of the target unit.
    NotImplementedError
        Raised by ``check_dts_bounds`` when a non-NaT element has to be
        converted to a unit it has no bounds for.
    """
    if values.descr.type_num != cnp.NPY_DATETIME:
        # aka values.dtype.kind != "M"
        raise TypeError("astype_overflowsafe values must have datetime64 dtype")
    if dtype.type_num != cnp.NPY_DATETIME:
        raise TypeError("astype_overflowsafe dtype must be datetime64")

    cdef:
        NPY_DATETIMEUNIT from_unit = get_unit_from_dtype(values.dtype)
        NPY_DATETIMEUNIT to_unit = get_unit_from_dtype(dtype)

    if (
        from_unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC
        or to_unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC
    ):
        # without raising explicitly here, we end up with a SystemError
        # built-in function [...] returned a result with an error
        raise ValueError("datetime64 values and dtype must have a unit specified")

    if from_unit == to_unit:
        # Check this before allocating result for perf, might save some memory
        if copy:
            return values.copy()
        return values

    cdef:
        # Reinterpret the input as raw int64 ticks (a view, no data copied).
        ndarray i8values = values.view("i8")

        # equiv: result = np.empty((<object>values).shape, dtype="i8")
        ndarray iresult = cnp.PyArray_EMPTY(
            values.ndim, values.shape, cnp.NPY_INT64, 0
        )

        # Walk output (index 0) and input (index 1) in lockstep so that the
        # element pairing is independent of either array's memory layout.
        cnp.broadcast mi = cnp.PyArray_MultiIterNew2(iresult, i8values)
        Py_ssize_t i, N = values.size
        int64_t value, new_value
        npy_datetimestruct dts

    for i in range(N):
        # Analogous to: item = values[i]
        value = (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 1))[0]

        if value == NPY_DATETIME_NAT:
            # NaT passes through unchanged and is never bounds-checked.
            new_value = NPY_DATETIME_NAT
        else:
            # Go through the broken-down struct so the bounds check happens
            # on calendar fields before the value is re-encoded in to_unit.
            pandas_datetime_to_datetimestruct(value, from_unit, &dts)
            check_dts_bounds(&dts, to_unit)
            new_value = npy_datetimestruct_to_datetime(to_unit, &dts)

        # Analogous to: iresult[i] = new_value
        (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 0))[0] = new_value

        cnp.PyArray_MultiIter_NEXT(mi)

    return iresult.view(dtype)
30 changes: 30 additions & 0 deletions pandas/_libs/tslibs/src/datetime/np_datetime.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,40 @@ This file is derived from NumPy 1.7. See NUMPY_LICENSE.txt
#include <numpy/ndarraytypes.h>
#include "np_datetime.h"


/*
 * Lower (_MIN_) and upper (_MAX_) datetime bounds for each resolution,
 * written as npy_datetimestruct initializers.  The ns/us/ms/s/m pairs are
 * the limits check_dts_bounds (np_datetime.pyx) compares against for the
 * matching unit; the as/fs/ps pairs are defined here but are not imported
 * by np_datetime.pyx.
 * NOTE(review): the positional field order is presumably year, month, day,
 * hour, min, sec, us, ps, as (NumPy's npy_datetimestruct) -- confirm against
 * the NumPy header.
 */
const npy_datetimestruct _AS_MIN_DTS = {
1969, 12, 31, 23, 59, 50, 776627, 963145, 224193};
const npy_datetimestruct _FS_MIN_DTS = {
1969, 12, 31, 21, 26, 16, 627963, 145224, 193000};
const npy_datetimestruct _PS_MIN_DTS = {
1969, 9, 16, 5, 57, 7, 963145, 224193, 0};
const npy_datetimestruct _NS_MIN_DTS = {
1677, 9, 21, 0, 12, 43, 145224, 193000, 0};
/* NOTE(review): `05` below is an octal literal in C; its value is still 5. */
const npy_datetimestruct _US_MIN_DTS = {
-290308, 12, 21, 19, 59, 05, 224193, 0, 0};
const npy_datetimestruct _MS_MIN_DTS = {
-292275055, 5, 16, 16, 47, 4, 193000, 0, 0};
const npy_datetimestruct _S_MIN_DTS = {
-292277022657, 1, 27, 8, 29, 53, 0, 0, 0};
const npy_datetimestruct _M_MIN_DTS = {
-17536621475646, 5, 4, 5, 53, 0, 0, 0, 0};

/* Upper bounds, in the same resolution order as the lower bounds above. */
const npy_datetimestruct _AS_MAX_DTS = {
1970, 1, 1, 0, 0, 9, 223372, 36854, 775807};
const npy_datetimestruct _FS_MAX_DTS = {
1970, 1, 1, 2, 33, 43, 372036, 854775, 807000};
const npy_datetimestruct _PS_MAX_DTS = {
1970, 4, 17, 18, 2, 52, 36854, 775807, 0};
const npy_datetimestruct _NS_MAX_DTS = {
2262, 4, 11, 23, 47, 16, 854775, 807000, 0};
const npy_datetimestruct _US_MAX_DTS = {
294247, 1, 10, 4, 0, 54, 775807, 0, 0};
const npy_datetimestruct _MS_MAX_DTS = {
292278994, 8, 17, 7, 12, 55, 807000, 0, 0};
const npy_datetimestruct _S_MAX_DTS = {
292277026596, 12, 4, 15, 30, 7, 0, 0, 0};
const npy_datetimestruct _M_MAX_DTS = {
17536621479585, 8, 30, 18, 7, 0, 0, 0, 0};


const int days_per_month_table[2][12] = {
Expand Down
14 changes: 14 additions & 0 deletions pandas/_libs/tslibs/src/datetime/np_datetime.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,22 @@ typedef struct {
npy_int32 hrs, min, sec, ms, us, ns, seconds, microseconds, nanoseconds;
} pandas_timedeltastruct;

// Per-resolution lower (_MIN_) and upper (_MAX_) datetime bounds.
// The definitions live in np_datetime.c.
extern const npy_datetimestruct _AS_MIN_DTS;
extern const npy_datetimestruct _AS_MAX_DTS;
extern const npy_datetimestruct _FS_MIN_DTS;
extern const npy_datetimestruct _FS_MAX_DTS;
extern const npy_datetimestruct _PS_MIN_DTS;
extern const npy_datetimestruct _PS_MAX_DTS;
extern const npy_datetimestruct _NS_MIN_DTS;
extern const npy_datetimestruct _NS_MAX_DTS;
extern const npy_datetimestruct _US_MIN_DTS;
extern const npy_datetimestruct _US_MAX_DTS;
extern const npy_datetimestruct _MS_MIN_DTS;
extern const npy_datetimestruct _MS_MAX_DTS;
extern const npy_datetimestruct _S_MIN_DTS;
extern const npy_datetimestruct _S_MAX_DTS;
extern const npy_datetimestruct _M_MIN_DTS;
extern const npy_datetimestruct _M_MAX_DTS;

// stuff pandas needs
// ----------------------------------------------------------------------------
Expand Down
2 changes: 1 addition & 1 deletion pandas/_libs/tslibs/tzconversion.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -534,7 +534,7 @@ cdef const int64_t[:] _tz_convert_from_utc(const int64_t[:] stamps, tzinfo tz):

int64_t[::1] result

if is_utc(tz):
if is_utc(tz) or tz is None:
# Much faster than going through the "standard" pattern below
return stamps.copy()

Expand Down
56 changes: 55 additions & 1 deletion pandas/tests/tslibs/test_np_datetime.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
import numpy as np
import pytest

from pandas._libs.tslibs.np_datetime import py_get_unit_from_dtype
from pandas._libs.tslibs.np_datetime import (
OutOfBoundsDatetime,
astype_overflowsafe,
py_get_unit_from_dtype,
)

import pandas._testing as tm


def test_get_unit_from_dtype():
Expand Down Expand Up @@ -35,3 +42,50 @@ def test_get_unit_from_dtype():
assert py_get_unit_from_dtype(np.dtype("m8[ps]")) == 11
assert py_get_unit_from_dtype(np.dtype("m8[fs]")) == 12
assert py_get_unit_from_dtype(np.dtype("m8[as]")) == 13


class TestAstypeOverflowSafe:
    """Input validation and overflow detection of ``astype_overflowsafe``."""

    def test_pass_non_dt64_array(self):
        # A non-datetime64 array must produce a TypeError rather than a
        # segfault, whichever value ``copy`` takes.
        values = np.arange(5)
        target = np.dtype("M8[ns]")
        msg = "astype_overflowsafe values must have datetime64 dtype"

        for copy in (True, False):
            with pytest.raises(TypeError, match=msg):
                astype_overflowsafe(values, target, copy=copy)

    def test_pass_non_dt64_dtype(self):
        # A timedelta64 target dtype must produce a TypeError rather than a
        # segfault, whichever value ``copy`` takes.
        values = np.arange(5, dtype="i8").view("M8[D]")
        target = np.dtype("m8[ns]")
        msg = "astype_overflowsafe dtype must be datetime64"

        for copy in (True, False):
            with pytest.raises(TypeError, match=msg):
                astype_overflowsafe(values, target, copy=copy)

    def test_astype_overflowsafe(self):
        ns_dtype = np.dtype("M8[ns]")

        # Ten consecutive days starting at 2262-04-05.
        start = np.datetime64("2262-04-05", "D")
        days = start + np.arange(10, dtype="m8[D]")

        # numpy's own astype overflows silently: casting to ns and back
        # does not reproduce every element.
        overflowed = days.astype(ns_dtype)
        back_again = overflowed.astype(days.dtype)
        all_match = (overflowed == back_again).all()
        assert not all_match

        # The overflow-safe cast raises instead.
        msg = "Out of bounds nanosecond timestamp"
        with pytest.raises(OutOfBoundsDatetime, match=msg):
            astype_overflowsafe(days, ns_dtype)

        # Microseconds can hold these dates, and the result matches numpy's.
        us_dtype = np.dtype("M8[us]")
        expected = days.astype(us_dtype)
        result = astype_overflowsafe(days, us_dtype)
        tm.assert_numpy_array_equal(result, expected)