From 10c094373a0f9888ac2beb18ae4af3e628755ae4 Mon Sep 17 00:00:00 2001 From: jreback Date: Fri, 3 Jan 2014 18:53:00 -0500 Subject: [PATCH] BUG: Bug in selection with missing values via .ix from a duplicate indexed DataFrame failing (GH5835) --- doc/source/release.rst | 1 + pandas/core/internals.py | 81 +++++++++++++++++++++++------------ pandas/tests/test_indexing.py | 8 ++++ 3 files changed, 63 insertions(+), 27 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 3d4b5e8facfa4..0150b233110a7 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -73,6 +73,7 @@ Bug Fixes ~~~~~~~~~ - Bug in Series replace with timestamp dict (:issue:`5797`) - read_csv/read_table now respects the `prefix` kwarg (:issue:`5732`). + - Bug in selection with missing values via ``.ix`` from a duplicate indexed DataFrame failing (:issue:`5835`) pandas 0.13.0 ------------- diff --git a/pandas/core/internals.py b/pandas/core/internals.py index d636edeec0815..7f49f7c1993bd 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -3119,6 +3119,9 @@ def reindex_indexer(self, new_axis, indexer, axis=1, fill_value=None, if not allow_dups and not self.axes[axis].is_unique: raise ValueError("cannot reindex from a duplicate axis") + if not self.is_consolidated(): + self = self.consolidate() + if axis == 0: return self._reindex_indexer_items(new_axis, indexer, fill_value) @@ -3140,38 +3143,62 @@ def _reindex_indexer_items(self, new_items, indexer, fill_value): new_blocks = [] is_unique = new_items.is_unique + # we have duplicates in the items and what we are reindexing + if not is_unique and not self.items.is_unique: + + rl = self._set_ref_locs(do_refs='force') + for i, idx in enumerate(indexer): + item = new_items.take([i]) + if idx >= 0: + blk, lidx = rl[idx] + blk = make_block(_block_shape(blk.iget(lidx)), item, + new_items, ndim=self.ndim, fastpath=True, + placement=[i]) + + # a missing value + else: + blk = self._make_na_block(item, + new_items, + placement=[i], + fill_value=fill_value) + new_blocks.append(blk) + new_blocks = _consolidate(new_blocks, new_items) + + # keep track of what items aren't found anywhere - l = np.arange(len(item_order)) - mask = np.zeros(len(item_order), dtype=bool) - for blk in self.blocks: - blk_indexer = blk.items.get_indexer(item_order) - selector = blk_indexer != -1 + else: + l = np.arange(len(item_order)) + mask = np.zeros(len(item_order), dtype=bool) - # update with observed items - mask |= selector + for blk in self.blocks: + blk_indexer = blk.items.get_indexer(item_order) + selector = blk_indexer != -1 + + # update with observed items + mask |= selector - if not selector.any(): - continue + if not selector.any(): + continue - new_block_items = new_items.take(selector.nonzero()[0]) - new_values = com.take_nd(blk.values, blk_indexer[selector], axis=0, - allow_fill=False) - placement = l[selector] if not is_unique else None - new_blocks.append(make_block(new_values, - new_block_items, + new_block_items = new_items.take(selector.nonzero()[0]) + new_values = com.take_nd(blk.values, blk_indexer[selector], axis=0, + allow_fill=False) + placement = l[selector] if not is_unique else None + new_blocks.append(make_block(new_values, + new_block_items, new_items, - placement=placement, - fastpath=True)) - - if not mask.all(): - na_items = new_items[-mask] - placement = l[-mask] if not is_unique else None - na_block = self._make_na_block(na_items, - new_items, - placement=placement, - fill_value=fill_value) - new_blocks.append(na_block) - new_blocks = _consolidate(new_blocks, new_items) + placement=placement, + fastpath=True)) + + if not mask.all(): + na_items = new_items[-mask] + placement = l[-mask] if not is_unique else None + na_block = self._make_na_block(na_items, + new_items, + placement=placement, + fill_value=fill_value) + new_blocks.append(na_block) + new_blocks = _consolidate(new_blocks, new_items) return self.__class__(new_blocks, new_axes) diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index fe3aac0e9eeaa..c48b4b84698b6 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -981,6 +981,14 @@ def test_dups_fancy_indexing(self): result = df.ix[['A','A','E']] assert_frame_equal(result, expected) + # GH 5835 + # dups on index and missing values + df = DataFrame(np.random.randn(5,5),columns=['A','B','B','B','A']) + + expected = pd.concat([df.ix[:,['A','B']],DataFrame(np.nan,columns=['C'],index=df.index)],axis=1) + result = df.ix[:,['A','B','C']] + assert_frame_equal(result, expected) + def test_indexing_mixed_frame_bug(self): # GH3492