Skip to content

Commit 5fafeba

Browse files
committed
ENH: speed up Series.__getitem__ with timestamps, much more efficient implementation of Series.asof, close #1168
1 parent e9c0186 commit 5fafeba

File tree

11 files changed

+124
-137
lines changed

11 files changed

+124
-137
lines changed

pandas/core/algorithms.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,7 @@ def value_counts(values, sort=True, ascending=False):
140140

141141
return result
142142

143+
143144
def rank(values, axis=0, method='average', na_option='keep',
144145
ascending=True):
145146
"""

pandas/core/daterange.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,3 @@ def __setstate__(self, aug_state):
4343
self.offset = offset
4444
self.tzinfo = tzinfo
4545
Index.__setstate__(self, *index_state)
46-
47-
def interval_range():
48-
"""
49-
Return a fixed frequency interval index
50-
"""

pandas/core/index.py

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
from pandas.util import py3compat
1212
import pandas.core.common as com
1313
import pandas._tseries as lib
14-
import pandas._engines as _gin
1514

1615

1716
__all__ = ['Index']
@@ -65,7 +64,7 @@ class Index(np.ndarray):
6564
name = None
6665
asi8 = None
6766

68-
_engine_type = _gin.ObjectEngine
67+
_engine_type = lib.ObjectEngine
6968

7069
def __new__(cls, data, dtype=None, copy=False, name=None):
7170
if isinstance(data, np.ndarray):
@@ -385,6 +384,20 @@ def asof(self, label):
385384

386385
return label
387386

387+
def asof_locs(self, where, mask):
    """
    Find, for each label in `where`, the position in this index of the
    last entry flagged by `mask` whose label is <= that label.

    Parameters
    ----------
    where : Index
        Labels to look up. Its ``.values`` are passed to ``searchsorted``.
    mask : boolean ndarray
        Same length as this index; True marks the positions that may be
        returned (the caller in Series.asof passes ``notnull(values)``).

    Returns
    -------
    locs : ndarray of integers
        One position into this index per label in `where`. -1 marks
        labels that fall before the first masked-in entry, and every
        label when `mask` selects nothing.

    Notes
    -----
    Relies on ``searchsorted``, so the index values are assumed to be
    sorted in ascending order -- TODO confirm callers guarantee this.
    """
    if not mask.any():
        # No candidate positions at all: taking from an empty array would
        # raise IndexError, so report "not found" for every label instead.
        result = np.empty(len(where), dtype=np.int_)
        result.fill(-1)
        return result

    # Position of each label within the masked-in subset; side='right'
    # followed by the -1 shift yields the last entry <= label.
    locs = self.values[mask].searchsorted(where.values, side='right')
    locs = np.where(locs > 0, locs - 1, 0)

    # Translate subset positions back to positions in the full index.
    result = np.arange(len(self))[mask].take(locs)

    # locs == 0 is ambiguous: either the label matched the first masked-in
    # entry or it precedes it. Only the latter has no as-of position.
    first = mask.argmax()
    result[(locs == 0) & (where < self.values[first])] = -1

    return result
400+
388401
def order(self, return_indexer=False, ascending=True):
389402
"""
390403
Return sorted copy of Index
@@ -611,7 +624,7 @@ def get_value(self, series, key):
611624
raise
612625

613626
try:
614-
return _gin.get_value_at(series, key)
627+
return lib.get_value_at(series, key)
615628
except IndexError:
616629
raise
617630
except TypeError:
@@ -1053,7 +1066,7 @@ class Int64Index(Index):
10531066
_inner_indexer = lib.inner_join_indexer_int64
10541067
_outer_indexer = lib.outer_join_indexer_int64
10551068

1056-
_engine_type = _gin.Int64Engine
1069+
_engine_type = lib.Int64Engine
10571070

10581071
def __new__(cls, data, dtype=None, copy=False, name=None):
10591072
if not isinstance(data, np.ndarray):
@@ -1283,7 +1296,7 @@ def get_value(self, series, key):
12831296
pass
12841297

12851298
try:
1286-
return _gin.get_value_at(series, key)
1299+
return lib.get_value_at(series, key)
12871300
except IndexError:
12881301
raise
12891302
except TypeError:
@@ -2340,3 +2353,4 @@ def _maybe_box_dtindex(idx):
23402353
if isinstance(idx, DatetimeIndex):
23412354
return Index(_dt_box_array(idx.asi8), dtype='object')
23422355
return idx
2356+

pandas/core/series.py

Lines changed: 41 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,7 @@ def f(self, *args, **kwargs):
219219
return f
220220

221221
_stat_doc = """
222-
Return %(name)s of values
222+
Return %(name)s of values
223223
%(na_action)s
224224
225225
Parameters
@@ -238,7 +238,21 @@ def f(self, *args, **kwargs):
238238
_doc_ndarray_interface = ("Extra parameters are to preserve ndarray"
239239
"interface.\n")
240240

241-
#-------------------------------------------------------------------------------
241+
242+
def _make_stat_func(nanop, name, shortname, na_action=_doc_exclude_na,
                    extras=_doc_ndarray_interface):
    """
    Build a Series reduction method around a NaN-aware kernel.

    Parameters
    ----------
    nanop : callable
        Invoked as ``nanop(self.values, skipna=skipna)`` when no level
        is requested.
    name : str
        Value substituted for ``%(name)s`` in the ``_stat_doc`` template.
    shortname : str
        Becomes the ``__name__`` of the generated function and is the
        operation name forwarded to ``self._agg_by_level``.
    na_action : str
        Value substituted for ``%(na_action)s`` in the template.
    extras : str
        Forwarded to ``Substitution`` alongside the other doc fields.

    Returns
    -------
    f : function
        Function with signature
        ``(self, axis=0, dtype=None, out=None, skipna=True, level=None)``.
    """
    fill_template = Substitution(name=name, shortname=shortname,
                                 na_action=na_action, extras=extras)
    add_template = Appender(_stat_doc)

    def f(self, axis=0, dtype=None, out=None, skipna=True, level=None):
        # axis, dtype and out are accepted but not used by this body
        if level is None:
            return nanop(self.values, skipna=skipna)
        return self._agg_by_level(shortname, level=level, skipna=skipna)

    # Same application order as stacked decorators: Appender first,
    # then Substitution on its result.
    f = fill_template(add_template(f))
    f.__name__ = shortname
    return f
254+
255+
#----------------------------------------------------------------------
242256
# Series class
243257

244258
class Series(np.ndarray, generic.PandasObject):
@@ -985,21 +999,10 @@ def nunique(self):
985999
"""
9861000
return len(self.value_counts())
9871001

988-
@Substitution(name='sum', shortname='sum', na_action=_doc_exclude_na,
989-
extras=_doc_ndarray_interface)
990-
@Appender(_stat_doc)
991-
def sum(self, axis=0, dtype=None, out=None, skipna=True, level=None):
992-
if level is not None:
993-
return self._agg_by_level('sum', level=level, skipna=skipna)
994-
return nanops.nansum(self.values, skipna=skipna)
995-
996-
@Substitution(name='mean', shortname='mean', na_action=_doc_exclude_na,
997-
extras=_doc_ndarray_interface)
998-
@Appender(_stat_doc)
999-
def mean(self, axis=0, dtype=None, out=None, skipna=True, level=None):
1000-
if level is not None:
1001-
return self._agg_by_level('mean', level=level, skipna=skipna)
1002-
return nanops.nanmean(self.values, skipna=skipna)
1002+
sum = _make_stat_func(nanops.nansum, 'sum', 'sum')
1003+
mean = _make_stat_func(nanops.nanmean, 'mean', 'mean')
1004+
median = _make_stat_func(nanops.nanmedian, 'median', 'median', extras='')
1005+
prod = _make_stat_func(nanops.nanprod, 'product', 'prod', extras='')
10031006

10041007
@Substitution(name='mean absolute deviation', shortname='mad',
10051008
na_action=_doc_exclude_na, extras='')
@@ -1011,22 +1014,6 @@ def mad(self, skipna=True, level=None):
10111014
demeaned = self - self.mean(skipna=skipna)
10121015
return np.abs(demeaned).mean(skipna=skipna)
10131016

1014-
@Substitution(name='median', shortname='median',
1015-
na_action=_doc_exclude_na, extras='')
1016-
@Appender(_stat_doc)
1017-
def median(self, skipna=True, level=None):
1018-
if level is not None:
1019-
return self._agg_by_level('median', level=level, skipna=skipna)
1020-
return nanops.nanmedian(self.values, skipna=skipna)
1021-
1022-
@Substitution(name='product', shortname='product',
1023-
na_action=_doc_exclude_na, extras='')
1024-
@Appender(_stat_doc)
1025-
def prod(self, axis=None, dtype=None, out=None, skipna=True, level=None):
1026-
if level is not None:
1027-
return self._agg_by_level('prod', level=level, skipna=skipna)
1028-
return nanops.nanprod(self.values, skipna=skipna)
1029-
10301017
@Substitution(name='minimum', shortname='min',
10311018
na_action=_doc_exclude_na, extras='')
10321019
@Appender(_stat_doc)
@@ -2278,7 +2265,7 @@ def shift(self, periods=1, freq=None, **kwds):
22782265
return Series(self, index=self.index.shift(periods, offset),
22792266
name=self.name)
22802267

2281-
def asof(self, date):
2268+
def asof(self, where):
22822269
"""
22832270
Return last good (non-NaN) value in TimeSeries if value is NaN for
22842271
requested date.
@@ -2287,7 +2274,7 @@ def asof(self, date):
22872274
22882275
Parameters
22892276
----------
2290-
date : datetime or similar value
2277+
where : date or array of dates
22912278
22922279
Notes
22932280
-----
@@ -2297,46 +2284,27 @@ def asof(self, date):
22972284
-------
22982285
value or NaN
22992286
"""
2300-
if isinstance(date, basestring):
2301-
date = datetools.to_datetime(date)
2302-
2303-
if not isinstance(date, (list, tuple, np.ndarray, Index)):
2304-
# treat scalar values differently
2305-
v = self.get(date)
2306-
if isnull(v):
2307-
candidates = self.index[notnull(self)]
2308-
index = candidates.searchsorted(lib.Timestamp(date))
2309-
if index > 0:
2310-
asOfDate = candidates[index - 1]
2311-
return self.get(asOfDate)
2312-
return nan
2313-
return v
2287+
if isinstance(where, basestring):
2288+
where = datetools.to_datetime(where)
23142289

2315-
if not isinstance(date, Index):
2316-
date = Index(date)
2317-
2318-
candidates = self.index[notnull(self)]
2319-
2320-
mask = date.isin(candidates)
2321-
2322-
there = self[date[mask]]
2323-
todo = date[-mask]
2324-
2325-
if len(there) == len(date):
2326-
if len(there) == 1:
2327-
return there[0]
2328-
return there
2329-
2330-
index = candidates.searchsorted(todo)
2331-
index = index - 1
2332-
asof_mask = index >= 0
2333-
asof = self.ix[candidates[index[asof_mask]]]
2334-
asof.index = todo[asof_mask]
2335-
2336-
if len(date) == 1 and len(asof) > 0:
2337-
return asof[0]
2290+
values = self.values
23382291

2339-
return there.combine_first(asof).reindex(date)
2292+
if not hasattr(where, '__iter__'):
2293+
if where < self.index[0]:
2294+
return np.nan
2295+
loc = self.index.searchsorted(where, side='right')
2296+
if loc > 0:
2297+
loc -= 1
2298+
while isnull(values[loc]) and loc > 0:
2299+
loc -= 1
2300+
return values[loc]
2301+
2302+
if not isinstance(where, Index):
2303+
where = Index(where)
2304+
2305+
locs = self.index.asof_locs(where, notnull(values))
2306+
new_values = com.take_1d(values, locs)
2307+
return Series(new_values, index=where, name=self.name)
23402308

23412309
def interpolate(self, method='linear'):
23422310
"""

pandas/sparse/array.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
from pandas._sparse import BlockIndex, IntIndex
1616
import pandas._sparse as splib
17-
import pandas._engines as _gin
17+
import pandas._tseries as lib
1818

1919

2020
def _sparse_op_wrap(op, name):
@@ -261,7 +261,7 @@ def _get_val_at(self, loc):
261261
if sp_loc == -1:
262262
return self.fill_value
263263
else:
264-
return _gin.get_value_at(self, sp_loc)
264+
return lib.get_value_at(self, sp_loc)
265265

266266
def take(self, indices, axis=0):
267267
"""

0 commit comments

Comments
 (0)