From 084cb34fcd4624b1288a4593f6446e1bf178f9c9 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Sun, 14 Oct 2018 19:34:17 -0700 Subject: [PATCH 1/7] implement take for EA mixins --- pandas/core/arrays/datetimelike.py | 81 +++++++++++-- pandas/core/arrays/datetimes.py | 31 ++++- pandas/core/arrays/period.py | 20 +++- pandas/core/arrays/timedeltas.py | 16 +++ pandas/tests/arrays/test_datetimelike.py | 145 ++++++++++++++++++++++- 5 files changed, 280 insertions(+), 13 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 37fc451ba2a2b..ae69cfb49f3b4 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -11,7 +11,8 @@ from pandas._libs.tslibs.period import ( Period, DIFFERENT_FREQ_INDEX, IncompatibleFrequency) -from pandas.errors import NullFrequencyError, PerformanceWarning +from pandas.errors import ( + NullFrequencyError, PerformanceWarning, AbstractMethodError) from pandas import compat from pandas.tseries import frequencies @@ -34,9 +35,10 @@ is_object_dtype) from pandas.core.dtypes.generic import ABCSeries, ABCDataFrame, ABCIndexClass from pandas.core.dtypes.dtypes import DatetimeTZDtype +from pandas.core.dtypes.missing import isna import pandas.core.common as com -from pandas.core.algorithms import checked_add_with_arr +from pandas.core.algorithms import checked_add_with_arr, take from .base import ExtensionOpsMixin from pandas.util._decorators import deprecate_kwarg @@ -77,12 +79,10 @@ class AttributesMixin(object): @property def _attributes(self): # Inheriting subclass should implement _attributes as a list of strings - from pandas.errors import AbstractMethodError raise AbstractMethodError(self) @classmethod def _simple_new(cls, values, **kwargs): - from pandas.errors import AbstractMethodError raise AbstractMethodError(cls) def _get_attributes_dict(self): @@ -119,7 +119,7 @@ def _box_func(self): """ box function to get object from internal representation """ - raise com.AbstractMethodError(self) + raise AbstractMethodError(self) def _box_values(self, values): """ @@ -140,6 +140,67 @@ def asi8(self): # do not cache or you'll create a memory leak return self.values.view('i8') + # ------------------------------------------------------------------ + # Extension Array Interface + # TODO: + # _from_sequence + # _from_factorized + # __setitem__ + # _values_for_argsort + # argsort + # fillna + # dropna + # shift + # unique + # _values_for_factorize + # factorize + # _formatting_values + # _reduce + + def _validate_fill_value(self, fill_value): + """ + If a fill_value is passed to `take` convert it to an i8 representation, + raising ValueError if this is not possible. + + Parameters + ---------- + fill_value : object + + Returns + ------- + fill_value : np.int64 + + Raises + ------ + ValueError + """ + raise AbstractMethodError(self) + + def take(self, indices, allow_fill=False, fill_value=None): + + if allow_fill: + fill_value = self._validate_fill_value(fill_value) + + new_values = take(self._data, + indices, + allow_fill=allow_fill, + fill_value=fill_value) + + return self._shallow_copy(new_values) + + @classmethod + def _concat_same_type(cls, to_concat): + # for TimedeltaArray and PeriodArray; DatetimeArray overrides + freqs = {x.freq for x in to_concat} + assert len(freqs) == 1 + freq = list(freqs)[0] + values = np.concatenate([x._data for x in to_concat]) + return cls._simple_new(values, freq=freq) + + def copy(self, deep=False): + # TODO: ignoring `deep`? + return self._shallow_copy(self._data.copy()) + # ------------------------------------------------------------------ # Array-like Methods @@ -211,6 +272,10 @@ def astype(self, dtype, copy=True): # ------------------------------------------------------------------ # Null Handling + def isna(self): + # EA Interface + return self._isnan + @property # NB: override with cache_readonly in immutable subclasses def _isnan(self): """ return if each value is nan""" @@ -352,13 +417,13 @@ def _add_datelike(self, other): typ=type(other).__name__)) def _sub_datelike(self, other): - raise com.AbstractMethodError(self) + raise AbstractMethodError(self) def _sub_period(self, other): return NotImplemented def _add_offset(self, offset): - raise com.AbstractMethodError(self) + raise AbstractMethodError(self) def _add_delta(self, other): return NotImplemented @@ -380,7 +445,7 @@ def _add_delta_tdi(self, other): Add a delta of a TimedeltaIndex return the i8 result view """ - if not len(self) == len(other): + if len(self) != len(other): raise ValueError("cannot add indices of unequal length") self_i8 = self.asi8 diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 6cc4922788cf3..2399a4561665e 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -12,7 +12,7 @@ conversion, fields, timezones, resolution as libresolution) -from pandas.util._decorators import cache_readonly +from pandas.util._decorators import cache_readonly, Appender from pandas.errors import PerformanceWarning from pandas import compat @@ -298,6 +298,35 @@ def _generate_range(cls, start, end, periods, freq, tz=None, return cls._simple_new(index.values, freq=freq, tz=tz) + # ---------------------------------------------------------------- + # Extension Array Interface + + @Appender(dtl.DatetimeLikeArrayMixin._validate_fill_value.__doc__) + def _validate_fill_value(self, fill_value): + if isna(fill_value): + fill_value = iNaT + elif isinstance(fill_value, (datetime, np.datetime64)): + self._assert_tzawareness_compat(fill_value) + fill_value = Timestamp(fill_value).value + else: + raise ValueError("'fill_value' should be a Timestamp. " + "Got '{got}'.".format(got=fill_value)) + return fill_value + + @classmethod + def _concat_same_type(cls, to_concat): + # for TimedeltaArray and PeriodArray; DatetimeArray requires tz + freqs = {x.freq for x in to_concat} + assert len(freqs) == 1 + freq = list(freqs)[0] + + tzs = {x.tz for x in to_concat} + assert len(tzs) == 1 + tz = list(tzs)[0] + + values = np.concatenate([x._data for x in to_concat]) + return cls._simple_new(values, freq=freq, tz=tz) + # ----------------------------------------------------------------- # Descriptive Properties diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index d32ff76c0819b..b514911552bf7 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -14,13 +14,14 @@ from pandas._libs.tslibs.fields import isleapyear_arr from pandas import compat -from pandas.util._decorators import (cache_readonly, deprecate_kwarg) +from pandas.util._decorators import cache_readonly, deprecate_kwarg, Appender from pandas.core.dtypes.common import ( is_integer_dtype, is_float_dtype, is_period_dtype, is_datetime64_dtype) from pandas.core.dtypes.dtypes import PeriodDtype from pandas.core.dtypes.generic import ABCSeries +from pandas.core.dtypes.missing import isna import pandas.core.common as com @@ -192,6 +193,23 @@ def _generate_range(cls, start, end, periods, freq, fields): return subarr, freq + # -------------------------------------------------------------------- + # ExtensionArray Interface + + @Appender(DatetimeLikeArrayMixin._validate_fill_value.__doc__) + def _validate_fill_value(self, fill_value): + if isna(fill_value): + fill_value = iNaT + elif isinstance(fill_value, Period): + if fill_value.freq != self.freq: + raise ValueError("'fill_value' freq must match own " + "freq ({freq})".format(freq=self.freq)) + fill_value = fill_value.ordinal + else: + raise ValueError("'fill_value' should be a Period. " + "Got '{got}'.".format(got=fill_value)) + return fill_value + # -------------------------------------------------------------------- # Vectorized analogues of Period properties diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 4904a90ab7b2b..d55ff1a41bc93 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -8,6 +8,8 @@ from pandas._libs.tslibs.fields import get_timedelta_field from pandas._libs.tslibs.timedeltas import array_to_timedelta64 +from pandas.util._decorators import Appender + from pandas import compat from pandas.core.dtypes.common import ( @@ -180,6 +182,20 @@ def _generate_range(cls, start, end, periods, freq, closed=None, **kwargs): return index + # ---------------------------------------------------------------- + # Extension Array Interface + + @Appender(dtl.DatetimeLikeArrayMixin._validate_fill_value.__doc__) + def _validate_fill_value(self, fill_value): + if isna(fill_value): + fill_value = iNaT + elif isinstance(fill_value, (timedelta, np.timedelta64, Tick)): + fill_value = Timedelta(fill_value).value + else: + raise ValueError("'fill_value' should be a Timedelta. " + "Got '{got}'.".format(got=fill_value)) + return fill_value + # ---------------------------------------------------------------- # Arithmetic Methods diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 6bb4241451b3f..eec0a9d8775bf 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -45,7 +45,105 @@ def datetime_index(request): return pi -class TestDatetimeArray(object): +@pytest.fixture +def timedelta_index(request): + """ + A fixture to provide TimedeltaIndex objects with different frequencies. + Most TimedeltaArray behavior is already tested in TimedeltaIndex tests, + so here we just test that the TimedeltaArray behavior matches + the TimedeltaIndex behavior. + """ + # TODO: flesh this out + return pd.TimedeltaIndex(['1 Day', '3 Hours', 'NaT']) + + +def index_to_array(index): + """ + Helper function to construct a Datetime/Timedelta/Period Array from an + instance of the corresponding Index subclass. + """ + if isinstance(index, pd.DatetimeIndex): + return DatetimeArrayMixin(index) + elif isinstance(index, pd.TimedeltaIndex): + return TimedeltaArrayMixin(index) + elif isinstance(index, pd.PeriodIndex): + return PeriodArrayMixin(index) + else: + raise TypeError(type(index)) + + +class SharedTests(object): + index_cls = None + + def test_take(self): + data = np.arange(100, dtype='i8') + np.random.shuffle(data) + + idx = self.index_cls._simple_new(data, freq='D') + arr = index_to_array(idx) + + takers = [1, 4, 94] + result = arr.take(takers) + expected = idx.take(takers) + + tm.assert_index_equal(self.index_cls(result), expected) + + takers = np.array([1, 4, 94]) + result = arr.take(takers) + expected = idx.take(takers) + + tm.assert_index_equal(self.index_cls(result), expected) + + def test_take_fill(self): + data = np.arange(10, dtype='i8') + + idx = self.index_cls._simple_new(data, freq='D') + arr = index_to_array(idx) + + result = arr.take([-1, 1], allow_fill=True, fill_value=None) + assert result[0] is pd.NaT + + result = arr.take([-1, 1], allow_fill=True, fill_value=np.nan) + assert result[0] is pd.NaT + + result = arr.take([-1, 1], allow_fill=True, fill_value=pd.NaT) + assert result[0] is pd.NaT + + with pytest.raises(ValueError): + arr.take([0, 1], allow_fill=True, fill_value=2) + + with pytest.raises(ValueError): + arr.take([0, 1], allow_fill=True, fill_value=2.0) + + with pytest.raises(ValueError): + arr.take([0, 1], allow_fill=True, + fill_value=pd.Timestamp.now().time) + + +class TestDatetimeArray(SharedTests): + index_cls = pd.DatetimeIndex + + def test_take_fill_valid(self, datetime_index, tz_naive_fixture): + dti = datetime_index.tz_localize(tz_naive_fixture) + arr = index_to_array(dti) + + now = pd.Timestamp.now().tz_localize(dti.tz) + result = arr.take([-1, 1], allow_fill=True, fill_value=now) + assert result[0] == now + + with pytest.raises(ValueError): + # fill_value Timedelta invalid + arr.take([-1, 1], allow_fill=True, fill_value=now - now) + + with pytest.raises(ValueError): + # fill_value Period invalid + arr.take([-1, 1], allow_fill=True, fill_value=now.to_period('D')) + + tz = None if dti.tz is not None else 'US/Eastern' + now = pd.Timestamp.now().tz_localize(tz) + with pytest.raises(TypeError): + # Timestamp with mismatched tz-awareness + arr.take([-1, 1], allow_fill=True, fill_value=now) def test_from_dti(self, tz_naive_fixture): tz = tz_naive_fixture @@ -103,7 +201,26 @@ def test_int_properties(self, datetime_index, propname): tm.assert_numpy_array_equal(result, expected) -class TestTimedeltaArray(object): +class TestTimedeltaArray(SharedTests): + index_cls = pd.TimedeltaIndex + + def test_take_fill_valid(self, timedelta_index): + tdi = timedelta_index + arr = index_to_array(tdi) + + td1 = pd.Timedelta(days=1) + result = arr.take([-1, 1], allow_fill=True, fill_value=td1) + assert result[0] == td1 + + now = pd.Timestamp.now() + with pytest.raises(ValueError): + # fill_value Timestamp invalid + arr.take([0, 1], allow_fill=True, fill_value=now) + + with pytest.raises(ValueError): + # fill_value Period invalid + arr.take([0, 1], allow_fill=True, fill_value=now.to_period('D')) + def test_from_tdi(self): tdi = pd.TimedeltaIndex(['1 Day', '3 Hours']) arr = TimedeltaArrayMixin(tdi) @@ -123,7 +240,29 @@ def test_astype_object(self): assert list(asobj) == list(tdi) -class TestPeriodArray(object): +class TestPeriodArray(SharedTests): + index_cls = pd.PeriodIndex + + def test_take_fill_valid(self, period_index): + pi = period_index + arr = index_to_array(pi) + + now = pd.Timestamp.now().to_period(pi.freq) + result = arr.take([-1, 1], allow_fill=True, fill_value=now) + assert result[0] == now + + with pytest.raises(ValueError): + # fill_value Period with mis-matched freq invalid + arr.take([0, 1], allow_fill=True, + fill_value=pd.Timestamp.now().to_period(2*pi.freq)) + + with pytest.raises(ValueError): + # fill_value Timedelta invalid + arr.take([0, 1], allow_fill=True, fill_value=pd.Timedelta(days=1)) + + with pytest.raises(ValueError): + # fill_value Timestamp invalid + arr.take([0, 1], allow_fill=True, fill_value=now.to_timestamp()) def test_from_pi(self, period_index): pi = period_index From 0971615f4a78a99f7f1e8684b1e97f10dae57b41 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Sun, 14 Oct 2018 20:42:58 -0700 Subject: [PATCH 2/7] remove unused import --- pandas/core/arrays/datetimelike.py | 1 - pandas/tests/arrays/test_datetimelike.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index ae69cfb49f3b4..59e780581938c 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -35,7 +35,6 @@ is_object_dtype) from pandas.core.dtypes.generic import ABCSeries, ABCDataFrame, ABCIndexClass from pandas.core.dtypes.dtypes import DatetimeTZDtype -from pandas.core.dtypes.missing import isna import pandas.core.common as com from pandas.core.algorithms import checked_add_with_arr, take diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index eec0a9d8775bf..42aadafcb49ab 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -254,7 +254,7 @@ def test_take_fill_valid(self, period_index): with pytest.raises(ValueError): # fill_value Period with mis-matched freq invalid arr.take([0, 1], allow_fill=True, - fill_value=pd.Timestamp.now().to_period(2*pi.freq)) + fill_value=pd.Timestamp.now().to_period(2 * pi.freq)) with pytest.raises(ValueError): # fill_value Timedelta invalid From a83adad995a18557bfc4eaecd3921120979baf84 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Mon, 15 Oct 2018 07:59:42 -0700 Subject: [PATCH 3/7] un-implement copy --- pandas/core/arrays/datetimelike.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 59e780581938c..18997b69c76a8 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -155,6 +155,7 @@ def asi8(self): # factorize # _formatting_values # _reduce + # copy def _validate_fill_value(self, fill_value): """ @@ -196,10 +197,6 @@ def _concat_same_type(cls, to_concat): values = np.concatenate([x._data for x in to_concat]) return cls._simple_new(values, freq=freq) - def copy(self, deep=False): - # TODO: ignoring `deep`? - return self._shallow_copy(self._data.copy()) - # ------------------------------------------------------------------ # Array-like Methods From 755bc5c7db0bf867f8a522b97048c7401e14ec0a Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Mon, 15 Oct 2018 09:38:47 -0700 Subject: [PATCH 4/7] Avoid timezone-loss warning --- pandas/tests/arrays/test_datetimelike.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 42aadafcb49ab..3a1c4b0d42d95 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -137,7 +137,7 @@ def test_take_fill_valid(self, datetime_index, tz_naive_fixture): with pytest.raises(ValueError): # fill_value Period invalid - arr.take([-1, 1], allow_fill=True, fill_value=now.to_period('D')) + arr.take([-1, 1], allow_fill=True, fill_value=pd.Period('2014Q1')) tz = None if dti.tz is not None else 'US/Eastern' now = pd.Timestamp.now().tz_localize(tz) From 50234e97282b8840897bc5861b61b058b9f6537e Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Mon, 15 Oct 2018 14:30:37 -0700 Subject: [PATCH 5/7] Tests for concat_same_type --- pandas/tests/arrays/test_datetimelike.py | 49 ++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 3a1c4b0d42d95..48d87648c09c1 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -119,6 +119,17 @@ def test_take_fill(self): arr.take([0, 1], allow_fill=True, fill_value=pd.Timestamp.now().time) + def test_concat_same_type(self): + data = np.arange(10, dtype='i8') + + idx = self.index_cls._simple_new(data, freq='D').insert(0, pd.NaT) + arr = index_to_array(idx) + + result = arr._concat_same_type([arr[:-1], arr[1:], arr]) + expected = idx._concat_same_dtype([idx[:-1], idx[1:], idx], None) + + tm.assert_index_equal(self.index_cls(result), expected) + class TestDatetimeArray(SharedTests): index_cls = pd.DatetimeIndex @@ -200,6 +211,19 @@ def test_int_properties(self, datetime_index, propname): tm.assert_numpy_array_equal(result, expected) + def test_concat_same_type_invalid(self, datetime_index): + # different timezones + dti = datetime_index + arr = DatetimeArrayMixin(dti) + + if arr.tz is None: + other = arr.tz_localize('UTC') + else: + other = arr.tz_localize(None) + + with pytest.raises(AssertionError): + arr._concat_same_type([arr, other]) + class TestTimedeltaArray(SharedTests): index_cls = pd.TimedeltaIndex @@ -239,6 +263,19 @@ def test_astype_object(self): assert asobj.dtype == 'O' assert list(asobj) == list(tdi) + def test_concat_same_type_invalid(self, timedelta_index): + # different freqs + tdi = timedelta_index + arr = TimedeltaArrayMixin(tdi) + + other = pd.timedelta_range('1D', periods=5, freq='2D') + # FIXME: TimedeltaArray should inherit freq='2D' without specifying it + other = TimedeltaArrayMixin(other, freq='2D') + assert other.freq != arr.freq + + with pytest.raises(AssertionError): + arr._concat_same_type([arr, other]) + class TestPeriodArray(SharedTests): index_cls = pd.PeriodIndex @@ -315,3 +352,15 @@ def test_int_properties(self, period_index, propname): expected = np.array(getattr(pi, propname)) tm.assert_numpy_array_equal(result, expected) + + def test_concat_same_type_invalid(self, period_index): + # different freqs + pi = period_index + arr = PeriodArrayMixin(pi) + + other = pd.period_range('2016Q3', periods=5, freq='3Q') + other = PeriodArrayMixin(other) + assert other.freq != arr.freq + + with pytest.raises(AssertionError): + arr._concat_same_type([arr, other]) From dcc985e6d93d1841e39f5d09052c2c1ed68e1cf3 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Mon, 15 Oct 2018 15:50:22 -0700 Subject: [PATCH 6/7] use Index.take for Index subclasses --- pandas/core/arrays/datetimelike.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 6924032207582..b57a8365a5322 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -185,8 +185,8 @@ def take(self, indices, allow_fill=False, fill_value=None): indices, allow_fill=allow_fill, fill_value=fill_value) - - return self._shallow_copy(new_values) + freq = self.freq if is_period_dtype(self) else None # TODO: use "infer"? + return self._shallow_copy(new_values, freq=freq) @classmethod def _concat_same_type(cls, to_concat): From a1fea06c6e0d26312dde52a5685c7abc3c64cbdb Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Mon, 15 Oct 2018 18:13:13 -0700 Subject: [PATCH 7/7] troubleshoot --- pandas/core/arrays/datetimelike.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index b57a8365a5322..df5440006ec36 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -185,7 +185,10 @@ def take(self, indices, allow_fill=False, fill_value=None): indices, allow_fill=allow_fill, fill_value=fill_value) - freq = self.freq if is_period_dtype(self) else None # TODO: use "infer"? + + # TODO: use "infer"? Why does not passing freq cause + # failures in py37 but not py27? + freq = self.freq if is_period_dtype(self) else None return self._shallow_copy(new_values, freq=freq) @classmethod