Skip to content

Commit 498206e

Browse files
committed
ENH: Int64Index class, Cython codegen, refactoring to enable transparent use
1 parent 7dbdd79 commit 498206e

16 files changed, with 1368 additions (+) and 328 deletions (-)

pandas/core/daterange.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -295,6 +295,9 @@ def union(self, other):
295295
else:
296296
return Index.union(self, other)
297297

298+
def _wrap_union_result(self, other, result):
299+
return Index(result)
300+
298301
def tz_normalize(self, tz):
299302
"""
300303
Convert DateRange from one time zone to another (using pytz)

pandas/core/frame.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1575,7 +1575,8 @@ def combine(self, other, func, fill_value=None):
15751575
this = self.reindex(new_index)
15761576
other = other.reindex(new_index)
15771577

1578-
new_columns = _try_sort(set(this.columns + other.columns))
1578+
# sorts if possible
1579+
new_columns = this.columns.union(other.columns)
15791580
do_fill = fill_value is not None
15801581

15811582
result = {}

pandas/core/groupby.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -115,7 +115,7 @@ def groups(self):
115115
to_groupby = Index(to_groupby)
116116

117117
axis = self.obj._get_axis(self.axis)
118-
self._groups = _tseries.groupby(axis, to_groupby)
118+
self._groups = axis.groupby(to_groupby)
119119

120120
return self._groups
121121

@@ -477,7 +477,7 @@ def __init__(self, index, grouper=None, name=None, level=None):
477477
else:
478478
self.grouper = labels
479479

480-
self.index = index.values
480+
self.index = np.asarray(index.values, dtype=object)
481481

482482
# no level passed
483483
if not isinstance(self.grouper, np.ndarray):

pandas/core/index.py

Lines changed: 67 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -42,7 +42,7 @@ class Index(np.ndarray):
4242
"""
4343
def __new__(cls, data, dtype=None, copy=False):
4444
if isinstance(data, np.ndarray):
45-
if issubclass(data.dtype.type, np.integer):
45+
if dtype is None and issubclass(data.dtype.type, np.integer):
4646
subarr = data.astype(np.int64)
4747
return subarr.view(Int64Index)
4848
subarr = np.array(data, dtype=object, copy=copy)
@@ -57,27 +57,33 @@ def __new__(cls, data, dtype=None, copy=False):
5757
subarr[:] = data
5858
return subarr.view(cls)
5959

60+
@property
61+
def dtype(self):
62+
return np.dtype('object')
63+
6064
def summary(self):
6165
if len(self) > 0:
6266
index_summary = ', %s to %s' % (str(self[0]), str(self[-1]))
6367
else:
6468
index_summary = ''
65-
return 'Index: %s entries%s' % (len(self), index_summary)
69+
70+
name = type(self).__name__
71+
return '%s: %s entries%s' % (name, len(self), index_summary)
6672

6773
@property
6874
def values(self):
6975
return np.asarray(self)
7076

7177
@cache_readonly
7278
def is_monotonic(self):
73-
return lib.is_monotonic(self)
79+
return lib.is_monotonic_object(self)
7480

7581
_indexMap = None
7682
@property
7783
def indexMap(self):
7884
"{label -> location}"
7985
if self._indexMap is None:
80-
self._indexMap = lib.map_indices_buf(self)
86+
self._indexMap = lib.map_indices_object(self)
8187
self._verify_integrity()
8288

8389
return self._indexMap
@@ -258,7 +264,14 @@ def union(self, other):
258264
except Exception:
259265
pass
260266

261-
return Index(result)
267+
# for subclasses
268+
return self._wrap_union_result(other, result)
269+
270+
def _wrap_union_result(self, other, result):
271+
if type(self) == type(other):
272+
return type(self)(result)
273+
else:
274+
return Index(result)
262275

263276
def intersection(self, other):
264277
"""
@@ -354,16 +367,24 @@ def get_indexer(self, target, method=None):
354367

355368
method = self._get_method(method)
356369

370+
if self.dtype != target.dtype:
371+
target = Index(target, dtype=object)
372+
357373
if method == 'pad':
358-
indexer = lib.pad(self, target, self.indexMap, target.indexMap)
374+
indexer = lib.pad_object(self, target, self.indexMap,
375+
target.indexMap)
359376
elif method == 'backfill':
360-
indexer = lib.backfill(self, target, self.indexMap, target.indexMap)
377+
indexer = lib.backfill_object(self, target, self.indexMap,
378+
target.indexMap)
361379
elif method is None:
362-
indexer = lib.merge_indexer(target, self.indexMap)
380+
indexer = lib.merge_indexer_object(target, self.indexMap)
363381
else:
364382
raise ValueError('unrecognized method: %s' % method)
365383
return indexer
366384

385+
def groupby(self, to_groupby):
386+
return lib.groupby(self, to_groupby)
387+
367388
def _get_method(self, method):
368389
if method:
369390
method = method.lower()
@@ -487,6 +508,23 @@ def copy(self, order='C'):
487508

488509
class Int64Index(Index):
489510

511+
def __new__(cls, data, dtype=None, copy=False):
512+
if isinstance(data, np.ndarray):
513+
subarr = np.array(data, dtype=np.int64, copy=copy)
514+
elif np.isscalar(data):
515+
raise ValueError('Index(...) must be called with a collection '
516+
'of some kind, %s was passed' % repr(data))
517+
else:
518+
# other iterable of some kind
519+
if not isinstance(data, (list, tuple)):
520+
data = list(data)
521+
subarr = np.asarray(data, dtype=np.int64)
522+
return subarr.view(cls)
523+
524+
@property
525+
def dtype(self):
526+
return np.dtype('int64')
527+
490528
@cache_readonly
491529
def is_monotonic(self):
492530
return lib.is_monotonic_int64(self)
@@ -513,26 +551,36 @@ def equals(self, other):
513551
if self is other:
514552
return True
515553

516-
if not isinstance(other, Int64Index):
517-
return False
554+
# if not isinstance(other, Int64Index):
555+
# return False
518556

519557
return np.array_equal(self, other)
520558

521559
def get_indexer(self, target, method=None):
522560
target = _ensure_index(target)
523561

562+
if self.dtype != target.dtype:
563+
this = Index(self, dtype=object)
564+
target = Index(target, dtype=object)
565+
return this.get_indexer(target, method=method)
566+
524567
method = self._get_method(method)
525568

526569
if method == 'pad':
527-
indexer = lib.pad(self, target, self.indexMap, target.indexMap)
570+
indexer = lib.pad_int64(self, target, self.indexMap,
571+
target.indexMap)
528572
elif method == 'backfill':
529-
indexer = lib.backfill(self, target, self.indexMap, target.indexMap)
573+
indexer = lib.backfill_int64(self, target, self.indexMap,
574+
target.indexMap)
530575
elif method is None:
531576
indexer = lib.merge_indexer_int64(target, self.indexMap)
532577
else:
533578
raise ValueError('unrecognized method: %s' % method)
534579
return indexer
535-
get_indexer.__doc__ = Index.get_indexer__doc__
580+
get_indexer.__doc__ = Index.get_indexer.__doc__
581+
582+
def groupby(self, to_groupby):
583+
return lib.groupby(self.values.astype(object), to_groupby)
536584

537585
class DateIndex(Index):
538586
pass
@@ -965,13 +1013,14 @@ def get_indexer(self, target, method=None):
9651013
self_index = self.get_tuple_index()
9661014

9671015
if method == 'pad':
968-
indexer = lib.pad(self_index, target_index, self_index.indexMap,
969-
target.indexMap)
1016+
indexer = lib.pad_object(self_index, target_index,
1017+
self_index.indexMap, target.indexMap)
9701018
elif method == 'backfill':
971-
indexer = lib.backfill(self_index, target_index, self_index.indexMap,
972-
target.indexMap)
1019+
indexer = lib.backfill_object(self_index, target_index,
1020+
self_index.indexMap, target.indexMap)
9731021
else:
974-
indexer = lib.merge_indexer(target_index, self_index.indexMap)
1022+
indexer = lib.merge_indexer_object(target_index,
1023+
self_index.indexMap)
9751024
return indexer
9761025

9771026
def reindex(self, target, method=None):

pandas/core/internals.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -650,7 +650,8 @@ def join_on(self, other, on, axis=1, lsuffix=None, rsuffix=None):
650650
this, other = self._maybe_rename_join(other, lsuffix, rsuffix)
651651

652652
other_axis = other.axes[axis]
653-
indexer = lib.merge_indexer(on.astype(object), other_axis.indexMap)
653+
indexer = lib.merge_indexer_object(on.astype(object),
654+
other_axis.indexMap)
654655

655656
# TODO: deal with length-0 case? or does it fall out?
656657
mask = indexer == -1

pandas/core/series.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1239,8 +1239,8 @@ def map(self, arg):
12391239
if isinstance(arg, dict):
12401240
arg = Series(arg)
12411241

1242-
indexer = lib.merge_indexer(self.values.astype(object),
1243-
arg.index.indexMap)
1242+
indexer = lib.merge_indexer_object(self.values.astype(object),
1243+
arg.index.indexMap)
12441244

12451245
new_values = common.take_1d(np.asarray(arg), indexer)
12461246
return Series(new_values, index=self.index)

pandas/io/pytables.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -663,12 +663,12 @@ def _read_panel_table(self, group, where=None):
663663

664664
# need a better algorithm
665665
tuple_index = long_index.get_tuple_index()
666-
index_map = lib.map_indices_buf(tuple_index)
666+
index_map = lib.map_indices_object(tuple_index)
667667

668668
unique_tuples = lib.fast_unique(tuple_index)
669669
unique_tuples = _asarray_tuplesafe(unique_tuples)
670670

671-
indexer = lib.merge_indexer(unique_tuples, index_map)
671+
indexer = lib.merge_indexer_object(unique_tuples, index_map)
672672

673673
new_index = long_index.take(indexer)
674674
new_values = lp.values.take(indexer, axis=0)

pandas/src/common.pyx

Lines changed: 1 addition & 107 deletions
Original file line number | Diff line number | Diff line change
@@ -80,76 +80,6 @@ PyDateTime_IMPORT
8080
# initialize numpy
8181
import_array()
8282

83-
cpdef map_indices(ndarray index):
84-
'''
85-
Produce a dict mapping the values of the input array to their respective
86-
locations.
87-
88-
Example:
89-
array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1}
90-
91-
Better to do this with Cython because of the enormous speed boost.
92-
'''
93-
cdef int i, length
94-
cdef flatiter iter
95-
cdef dict result
96-
cdef object idx
97-
98-
result = {}
99-
100-
iter = <flatiter> PyArray_IterNew(index)
101-
length = PyArray_SIZE(index)
102-
103-
for i from 0 <= i < length:
104-
idx = PyArray_GETITEM(index, PyArray_ITER_DATA(iter))
105-
result[idx] = i
106-
PyArray_ITER_NEXT(iter)
107-
108-
return result
109-
110-
@cython.wraparound(False)
111-
@cython.boundscheck(False)
112-
cpdef map_indices_buf(ndarray[object] index):
113-
'''
114-
Produce a dict mapping the values of the input array to their respective
115-
locations.
116-
117-
Example:
118-
array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1}
119-
120-
Better to do this with Cython because of the enormous speed boost.
121-
'''
122-
cdef Py_ssize_t i, length
123-
cdef dict result = {}
124-
125-
length = len(index)
126-
127-
for i from 0 <= i < length:
128-
result[index[i]] = i
129-
130-
return result
131-
132-
@cython.wraparound(False)
133-
@cython.boundscheck(False)
134-
cpdef map_indices_int64(ndarray[int64_t] index):
135-
'''
136-
Produce a dict mapping the values of the input array to their respective
137-
locations.
138-
139-
Example:
140-
array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1}
141-
142-
Better to do this with Cython because of the enormous speed boost.
143-
'''
144-
cdef Py_ssize_t i, length
145-
cdef dict result = {}
146-
147-
length = len(index)
148-
149-
for i from 0 <= i < length:
150-
result[index[i]] = i
151-
152-
return result
15383

15484
cpdef map_indices_list(list index):
15585
'''
@@ -171,6 +101,7 @@ cpdef map_indices_list(list index):
171101

172102
return result
173103

104+
174105
from libc.stdlib cimport malloc, free
175106

176107
cdef class MultiMap:
@@ -235,40 +166,3 @@ def isAllDates(ndarray[object, ndim=1] arr):
235166
return False
236167

237168
return True
238-
239-
def is_monotonic(ndarray[object] arr):
240-
cdef:
241-
Py_ssize_t i, n
242-
object prev, cur
243-
244-
n = len(arr)
245-
246-
if n < 2:
247-
return True
248-
249-
prev = arr[0]
250-
for i from 1 <= i < n:
251-
cur = arr[i]
252-
if cur < prev:
253-
return False
254-
prev = cur
255-
return True
256-
257-
258-
def is_monotonic_int64(ndarray[int64_t] arr):
259-
cdef:
260-
Py_ssize_t i, n
261-
int64_t prev, cur
262-
263-
n = len(arr)
264-
265-
if n < 2:
266-
return True
267-
268-
prev = arr[0]
269-
for i from 1 <= i < n:
270-
cur = arr[i]
271-
if cur < prev:
272-
return False
273-
prev = cur
274-
return True

0 commit comments

Comments
 (0)