Skip to content

Commit f104171

Browse files
committed
feat: add time and date dtypes
Migrated from BigQuery googleapis/python-bigquery#972
1 parent 07758f2 commit f104171

13 files changed

+1552
-13
lines changed

db_dtypes/__init__.py

Lines changed: 171 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,178 @@
1515
Pandas Data Types for SQL systems (BigQuery, Spanner)
1616
"""
1717

18-
from .version import __version__
18+
import datetime
19+
20+
import numpy
21+
import pandas
22+
import pandas.compat.numpy.function
23+
import pandas.core.algorithms
24+
import pandas.core.arrays
25+
import pandas.core.dtypes.base
26+
import pandas.core.dtypes.dtypes
27+
import pandas.core.dtypes.generic
28+
import pandas.core.nanops
29+
import pyarrow
30+
31+
from db_dtypes.version import __version__
32+
from db_dtypes import core
33+
34+
35+
# Name assigned to ``DateDtype.name``; the dtype is registered with pandas
# through ``register_extension_dtype``.
date_dtype_name = "date"
36+
# Name assigned to ``TimeDtype.name``; the dtype is registered with pandas
# through ``register_extension_dtype``.
time_dtype_name = "time"
37+
38+
39+
@pandas.core.dtypes.dtypes.register_extension_dtype
class TimeDtype(core.BaseDatetimeDtype):
    """
    Extension dtype for time data.

    Scalars of this dtype are ``datetime.time`` objects and the associated
    array type is ``TimeArray``.
    """

    name = time_dtype_name
    type = datetime.time

    @classmethod
    def construct_array_type(cls):
        """
        Return the array class associated with this dtype.

        Declared as a classmethod, matching the pandas ``ExtensionDtype``
        interface, so it can be called on the class as well as on instances.
        """
        return TimeArray
50+
51+
52+
class TimeArray(core.BaseDatetimeArray):
    """
    Pandas array type containing time data
    """

    # Data are stored as datetime64 values with a date of Jan 1, 1970

    dtype = TimeDtype()
    _epoch = datetime.datetime(1970, 1, 1)
    _npepoch = numpy.datetime64(_epoch)

    @classmethod
    def _datetime(cls, scalar):
        """
        Convert one scalar to a ``datetime.datetime`` on the epoch date.

        Args:
            scalar: A ``datetime.time`` or an ISO ``HH:MM:SS[.ffffff]`` string.

        Returns:
            A ``datetime.datetime`` dated Jan 1, 1970 with the given time.

        Raises:
            TypeError: If ``scalar`` is neither a ``datetime.time`` nor a str.
            ValueError: If a string cannot be parsed as ``HH:MM:SS[.ffffff]``.
        """
        if isinstance(scalar, datetime.time):
            return datetime.datetime.combine(cls._epoch, scalar)
        elif isinstance(scalar, str):
            # iso string
            # Parse the fields as integers.  Going through ``float`` and
            # truncating ``fraction * 1000000`` can lose a microsecond,
            # because most decimal fractions have no exact binary
            # representation.
            hours, minutes, seconds = scalar.strip().split(":")
            whole_seconds, _, fraction = seconds.partition(".")
            # Digits past microsecond resolution are dropped; short
            # fractions are right-padded ("5" means 500000 microseconds).
            microseconds = int(fraction[:6].ljust(6, "0")) if fraction else 0
            return datetime.datetime(
                1970,
                1,
                1,
                int(hours),
                int(minutes),
                int(whole_seconds),
                microseconds,
            )
        else:
            raise TypeError("Invalid value type", scalar)

    def _box_func(self, x):
        """Convert one stored value to ``datetime.time``, or None when null."""
        if pandas.isnull(x):
            return None

        try:
            return x.astype("<M8[us]").astype(datetime.datetime).time()
        except AttributeError:
            # ``x`` has no ``astype`` (it is not a numpy scalar); wrap it first.
            x = numpy.datetime64(x)
            return x.astype("<M8[us]").astype(datetime.datetime).time()

    # dtype spellings for which ``astype`` returns the raw offsets from the
    # epoch without a further unit conversion.
    __return_deltas = {"timedelta", "timedelta64", "timedelta64[ns]", "<m8", "<m8[ns]"}

    def astype(self, dtype, copy=True):
        """
        Cast to ``dtype``.

        Timedelta dtypes yield the offsets of the stored values from the
        epoch as a numpy array; every other dtype is delegated to the parent
        class.
        """
        stype = str(dtype)
        if stype in self.__return_deltas:
            return self._ndarray - self._npepoch
        elif stype.startswith(("timedelta64[", "<m8[")):
            # The subtraction already produced a fresh array, so the unit
            # conversion does not need to copy again.
            return (self._ndarray - self._npepoch).astype(dtype, copy=False)
        else:
            return super().astype(dtype, copy=copy)

    def __arrow_array__(self, type=None):
        """Convert to a pyarrow array; the type defaults to ``time64("ns")``."""
        return pyarrow.array(
            self.to_numpy(),
            type=type if type is not None else pyarrow.time64("ns"),
        )
103+
104+
105+
@pandas.core.dtypes.dtypes.register_extension_dtype
class DateDtype(core.BaseDatetimeDtype):
    """
    Extension dtype for date data.

    Scalars of this dtype are ``datetime.date`` objects and the associated
    array type is ``DateArray``.
    """

    name = date_dtype_name
    type = datetime.date

    @classmethod
    def construct_array_type(cls):
        """
        Return the array class associated with this dtype.

        Declared as a classmethod, matching the pandas ``ExtensionDtype``
        interface, so it can be called on the class as well as on instances.
        """
        return DateArray
116+
117+
118+
class DateArray(core.BaseDatetimeArray):
    """
    Pandas array type containing date data
    """

    # Each date is converted to a datetime at midnight of that day before
    # being stored (see ``_datetime``).

    dtype = DateDtype()

    @staticmethod
    def _datetime(scalar):
        """
        Convert a ``datetime.date`` or a dash-separated ISO date string into
        a ``datetime.datetime``; any other type raises ``TypeError``.
        """
        if isinstance(scalar, datetime.date):
            return datetime.datetime(scalar.year, scalar.month, scalar.day)

        if isinstance(scalar, str):
            # iso string
            fields = map(int, scalar.split("-"))
            return datetime.datetime(*fields)

        raise TypeError("Invalid value type", scalar)

    def _box_func(self, x):
        """Convert one stored value to ``datetime.date``, or None when null."""
        if pandas.isnull(x):
            return None
        try:
            return x.astype("<M8[us]").astype(datetime.datetime).date()
        except AttributeError:
            # ``x`` has no ``astype`` (it is not a numpy scalar); wrap it first.
            x = numpy.datetime64(x)
            return x.astype("<M8[us]").astype(datetime.datetime).date()

    def astype(self, dtype, copy=True):
        """
        Cast to ``dtype``.

        datetime64 spellings are served straight from the backing numpy
        array; the unit-less spellings keep the array's own dtype.  Anything
        else is delegated to the parent class.
        """
        target = str(dtype)
        if target.startswith(("datetime", "<M8")):
            if target in ("datetime", "datetime64", "<M8"):
                dtype = self._ndarray.dtype
            return self._ndarray.astype(dtype, copy=copy)

        return super().astype(dtype, copy=copy)

    def __arrow_array__(self, type=None):
        """Convert to a pyarrow array; the type defaults to ``date32()``."""
        arrow_type = pyarrow.date32() if type is None else type
        return pyarrow.array(self._ndarray, type=arrow_type)

    def __add__(self, other):
        """Add a ``DateOffset`` or a ``TimeArray``; otherwise defer to the parent."""
        if isinstance(other, pandas.DateOffset):
            return self.astype("object") + other

        if isinstance(other, TimeArray):
            # A time is its offset from the epoch; adding that offset to the
            # stored dates gives a numpy datetime array.
            time_of_day = other._ndarray - other._npepoch
            return time_of_day + self._ndarray

        return super().__add__(other)

    def __radd__(self, other):
        """Reflected addition; handled identically to ``__add__``."""
        return self.__add__(other)

    def __sub__(self, other):
        """Subtract a ``DateOffset`` or another array of this class."""
        if isinstance(other, pandas.DateOffset):
            return self.astype("object") - other

        if isinstance(other, self.__class__):
            return self._ndarray - other._ndarray

        return super().__sub__(other)
184+
19185

20186
# Names exported by ``from db_dtypes import *``.
__all__ = [
    "__version__",
    "DateArray",
    "DateDtype",
    "TimeArray",
    "TimeDtype",
]

db_dtypes/core.py

Lines changed: 210 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,210 @@
1+
# Copyright 2021 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from typing import Any, Optional, Sequence
16+
17+
import numpy
18+
import pandas
19+
from pandas._libs import NaT
20+
from pandas._typing import Scalar
21+
import pandas.compat.numpy.function
22+
import pandas.core.algorithms
23+
import pandas.core.arrays
24+
import pandas.core.dtypes.base
25+
from pandas.core.dtypes.common import is_dtype_equal, is_list_like, pandas_dtype
26+
import pandas.core.dtypes.dtypes
27+
import pandas.core.dtypes.generic
28+
import pandas.core.nanops
29+
30+
from db_dtypes import pandas_backports
31+
32+
33+
# Re-exported from pandas_backports; compared against version tuples such
# as (1, 2) below to decide which methods to define.
pandas_release = pandas_backports.pandas_release
34+
35+
36+
class BaseDatetimeDtype(pandas.core.dtypes.base.ExtensionDtype):
    """
    Common base for the extension dtypes in this package.

    Subclasses are expected to define ``name`` and ``type``.
    """

    na_value = NaT
    # numpy kind codes are case-sensitive; "O" (object) is the valid code
    # and the documented ExtensionDtype default.
    kind = "O"
    names = None

    @classmethod
    def construct_from_string(cls, name):
        """
        Build the dtype from its registered string name.

        Args:
            name: Candidate dtype name.

        Returns:
            A new instance of ``cls`` when ``name`` equals ``cls.name``.

        Raises:
            TypeError: If ``name`` is not a string or does not match
                ``cls.name``.
        """
        if not isinstance(name, str):
            raise TypeError(
                f"'construct_from_string' expects a string, got {type(name)}"
            )

        if name != cls.name:
            raise TypeError(f"Cannot construct a '{cls.__name__}' from '{name}'")

        return cls()
47+
48+
49+
class BaseDatetimeArray(
    pandas_backports.OpsMixin, pandas_backports.NDArrayBackedExtensionArray
):
    """
    Shared implementation for extension arrays backed by a ``datetime64[ns]``
    numpy array.

    Subclasses supply ``dtype``, a ``_datetime(scalar)`` converter that turns
    one scalar into a value numpy can store as ``datetime64[ns]``, and a
    ``_box_func(x)`` that turns one stored value back into a scalar.
    """

    def __init__(self, values, dtype=None, copy: bool = False):
        """
        Args:
            values: Either a ``datetime64[ns]`` numpy array, used as-is (or
                copied when ``copy`` is true), or an iterable of scalars that
                is converted element by element.
            dtype: Accepted but not consulted.
            copy: Copy ``values`` when it is already a ``datetime64[ns]``
                array.
        """
        if not (
            isinstance(values, numpy.ndarray) and values.dtype == numpy.dtype("<M8[ns]")
        ):
            values = self.__ndarray(values)
        elif copy:
            values = values.copy()

        super().__init__(values=values, dtype=values.dtype)

    @staticmethod
    def _is_missing(scalar):
        """Return True when ``scalar`` is None or a scalar NA (NaN, NaT, ...)."""
        # ``pandas.isna`` returns an array for list-likes, so it is only
        # consulted for true scalars; list-likes fall through to
        # ``_datetime``, which rejects them.
        return scalar is None or (not is_list_like(scalar) and pandas.isna(scalar))

    @classmethod
    def __ndarray(cls, scalars):
        """Convert an iterable of scalars to a ``datetime64[ns]`` array."""
        is_missing = cls._is_missing
        return numpy.array(
            [None if is_missing(scalar) else cls._datetime(scalar) for scalar in scalars],
            "M8[ns]",
        )

    @classmethod
    def _from_sequence(cls, scalars, *, dtype=None, copy=False):
        """Build an array from a sequence of scalars (or ISO strings)."""
        if dtype is not None:
            assert dtype.__class__ is cls.dtype.__class__
        return cls(cls.__ndarray(scalars))

    _from_sequence_of_strings = _from_sequence

    def astype(self, dtype, copy=True):
        """
        Cast to ``dtype``; casting to this array's own dtype returns ``self``
        (or a copy when ``copy`` is true).
        """
        dtype = pandas_dtype(dtype)
        if is_dtype_equal(dtype, self.dtype):
            if not copy:
                return self
            else:
                return self.copy()

        return super().astype(dtype, copy=copy)

    def _cmp_method(self, other, op):
        """Compare element-wise with another array of exactly the same type."""
        if type(other) != type(self):
            return NotImplemented
        return op(self._ndarray, other._ndarray)

    def __setitem__(self, key, value):
        """
        Assign ``value`` at ``key``, converting scalars through ``_datetime``.

        Missing entries are passed through as None in both the scalar and
        the list-like case, so assigning a sequence that contains nulls does
        not raise.
        """
        if is_list_like(value):
            _datetime = self._datetime
            is_missing = self._is_missing
            value = [None if is_missing(v) else _datetime(v) for v in value]
        elif not pandas.isna(value):
            value = self._datetime(value)
        return super().__setitem__(key, value)

    @classmethod
    def _from_factorized(cls, unique, original):
        """Rebuild an array from factorized values (``original`` is unused)."""
        return cls(unique)

    def isna(self):
        """Return a boolean numpy array marking missing (NaT) entries."""
        return pandas.isna(self._ndarray)

    def _validate_scalar(self, value):
        """
        Return ``value`` unchanged, or None when it is NA.

        Raises:
            ValueError: If ``value`` is not an instance of ``self.dtype.type``.
        """
        if pandas.isna(value):
            return None

        if not isinstance(value, self.dtype.type):
            raise ValueError(value)

        return value

    def take(
        self,
        indices: Sequence[int],
        *,
        allow_fill: bool = False,
        fill_value: Any = None,
    ):
        """
        Take elements by position.

        Args:
            indices: Positions to take.
            allow_fill: When true, ``-1`` marks a slot to fill with
                ``fill_value`` and any other negative index is an error.
            fill_value: Scalar used for filled slots; None or NA fills with
                NaT.

        Raises:
            ValueError: If ``allow_fill`` is true and an index is below -1,
                or if ``fill_value`` has the wrong type.
        """
        indices = numpy.asarray(indices, dtype=numpy.intp)
        data = self._ndarray
        if allow_fill:
            fill_value = self._validate_scalar(fill_value)
            fill_value = (
                numpy.datetime64()
                if fill_value is None
                else numpy.datetime64(self._datetime(fill_value))
            )
            if (indices < -1).any():
                raise ValueError(
                    "take called with negative indexes other than -1,"
                    " when a fill value is provided."
                )
        out = data.take(indices)
        if allow_fill:
            # numpy treated -1 as "last element"; overwrite those slots.
            out[indices == -1] = fill_value

        return self.__class__(out)

    # TODO: provide implementations of dropna, fillna, unique,
    # factorize, argsort, searchsorted for better performance over
    # abstract implementations.

    def any(
        self,
        *,
        axis: Optional[int] = None,
        out=None,
        keepdims: bool = False,
        skipna: bool = True,
    ):
        """Return the result of ``nanany`` over the backing array."""
        pandas.compat.numpy.function.validate_any(
            (), {"out": out, "keepdims": keepdims}
        )
        result = pandas.core.nanops.nanany(self._ndarray, axis=axis, skipna=skipna)
        return result

    def all(
        self,
        *,
        axis: Optional[int] = None,
        out=None,
        keepdims: bool = False,
        skipna: bool = True,
    ):
        """Return the result of ``nanall`` over the backing array."""
        pandas.compat.numpy.function.validate_all(
            (), {"out": out, "keepdims": keepdims}
        )
        result = pandas.core.nanops.nanall(self._ndarray, axis=axis, skipna=skipna)
        return result

    def min(
        self, *, axis: Optional[int] = None, skipna: bool = True, **kwargs
    ) -> Scalar:
        """Return the minimum, boxed through ``_box_func``."""
        pandas.compat.numpy.function.validate_min((), kwargs)
        result = pandas.core.nanops.nanmin(
            values=self._ndarray, axis=axis, mask=self.isna(), skipna=skipna
        )
        return self._box_func(result)

    def max(
        self, *, axis: Optional[int] = None, skipna: bool = True, **kwargs
    ) -> Scalar:
        """Return the maximum, boxed through ``_box_func``."""
        pandas.compat.numpy.function.validate_max((), kwargs)
        result = pandas.core.nanops.nanmax(
            values=self._ndarray, axis=axis, mask=self.isna(), skipna=skipna
        )
        return self._box_func(result)

    # ``median`` is only defined when running against pandas 1.2 or newer.
    if pandas_release >= (1, 2):

        def median(
            self,
            *,
            axis: Optional[int] = None,
            out=None,
            overwrite_input: bool = False,
            keepdims: bool = False,
            skipna: bool = True,
        ):
            """Return the median, boxed through ``_box_func``."""
            pandas.compat.numpy.function.validate_median(
                (),
                {"out": out, "overwrite_input": overwrite_input, "keepdims": keepdims},
            )
            result = pandas.core.nanops.nanmedian(
                self._ndarray, axis=axis, skipna=skipna
            )
            return self._box_func(result)

0 commit comments

Comments
 (0)