From b3096bb5dad67a396212f807264efa12c8b35696 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Tue, 14 Mar 2017 12:57:14 +0100 Subject: [PATCH 1/3] BUG: Fix iloc with duplicate labels closes #15686 --- doc/source/whatsnew/v0.20.2.txt | 2 +- pandas/core/dtypes/cast.py | 21 +++++ pandas/core/indexing.py | 138 ++++++++++++++++++++++++-------- 3 files changed, 127 insertions(+), 34 deletions(-) mode change 100755 => 100644 pandas/core/indexing.py diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index 03579dab0d6a3..743307113fa27 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -46,7 +46,7 @@ Indexing ^^^^^^^^ - Bug in ``DataFrame.reset_index(level=)`` with single level index (:issue:`16263`) - +- Bug in ``DataFrame.iloc`` with duplicate labels (:issue:`15686`) I/O ^^^ diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 19d3792f73de7..62ab0ea54c28d 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1026,3 +1026,24 @@ def find_common_type(types): return np.object return np.find_common_type(types, []) + + +def _maybe_convert_indexer(indexer, until): + """ + Convert slice, tuple, list or scalar "indexer" to 1-d array of indices, + using "until" as maximum for upwards open slices. + """ + + if is_scalar(indexer): + return np.array([indexer], dtype=int) + + if isinstance(indexer, np.ndarray): + if indexer.dtype == bool: + return np.where(indexer)[0] + return indexer + + if isinstance(indexer, slice): + stop = until if indexer.stop is None else indexer.stop + return np.arange(stop, dtype=int)[indexer] + + return np.array(indexer, dtype=int) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py old mode 100755 new mode 100644 index a01e3dc46dfe9..5dc10d232c479 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -16,6 +16,7 @@ _is_unorderable_exception, _ensure_platform_int) from pandas.core.dtypes.missing import isnull, _infer_fill_value +from pandas.core.dtypes.cast import _maybe_convert_indexer from pandas.core.index import Index, MultiIndex @@ -81,6 +82,24 @@ def __getitem__(self, arg): IndexSlice = _IndexSlice() +class InfoCleaner: + """ + A context manager which temporarily removes labels on the "info" axis, + replacing them with a RangeIndex, and then puts them back in place. + Used to unambiguously index by position. 
+ """ + def __init__(self, obj): + self._obj = obj + self._info_axis = self._obj._AXIS_NAMES[self._obj._info_axis_number] + + def __enter__(self): + self._old_col = getattr(self._obj, self._info_axis) + setattr(self._obj, self._info_axis, range(len(self._old_col))) + + def __exit__(self, *args): + setattr(self._obj, self._info_axis, self._old_col) + + class IndexingError(Exception): pass @@ -492,29 +511,10 @@ def _setitem_with_indexer(self, indexer, value): else: lplane_indexer = 0 - def setter(item, v): - s = self.obj[item] - pi = plane_indexer[0] if lplane_indexer == 1 else plane_indexer - - # perform the equivalent of a setitem on the info axis - # as we have a null slice or a slice with full bounds - # which means essentially reassign to the columns of a - # multi-dim object - # GH6149 (null slice), GH10408 (full bounds) - if (isinstance(pi, tuple) and - all(is_null_slice(idx) or - is_full_slice(idx, len(self.obj)) - for idx in pi)): - s = v - else: - # set the item, possibly having a dtype change - s._consolidate_inplace() - s = s.copy() - s._data = s._data.setitem(indexer=pi, value=v) - s._maybe_update_cacher(clear=True) - - # reset the sliced object if unique - self.obj[item] = s + setter_kwargs = {'items': labels, + 'indexer': indexer, + 'pi': plane_indexer[0] if lplane_indexer == 1 + else plane_indexer} def can_do_equal_len(): """ return True if we have an equal len settable """ @@ -542,7 +542,7 @@ def can_do_equal_len(): sub_indexer = list(indexer) multiindex_indexer = isinstance(labels, MultiIndex) - for item in labels: + for idx, item in enumerate(labels): if item in value: sub_indexer[info_axis] = item v = self._align_series( @@ -551,7 +551,7 @@ def can_do_equal_len(): else: v = np.nan - setter(item, v) + self._setter(idx, v, force_loc=True, **setter_kwargs) # we have an equal len ndarray/convertible to our labels elif np.array(value).ndim == 2: @@ -563,14 +563,15 @@ def can_do_equal_len(): raise ValueError('Must have equal len keys and value ' 'when setting with an ndarray') - for i, item in enumerate(labels): + for i in range(len(labels)): # setting with a list, recoerces - setter(item, value[:, i].tolist()) + self._setter(i, value[:, i].tolist(), force_loc=True, + **setter_kwargs) # we have an equal len list/ndarray elif can_do_equal_len(): - setter(labels[0], value) + self._setter(0, value, **setter_kwargs) # per label values else: @@ -579,13 +580,12 @@ def can_do_equal_len(): raise ValueError('Must have equal len keys and value ' 'when setting with an iterable') - for item, v in zip(labels, value): - setter(item, v) + for i, v in zip(range(len(labels)), value): + self._setter(i, v, **setter_kwargs) else: - # scalar - for item in labels: - setter(item, value) + for idx in range(len(labels)): + self._setter(idx, value, **setter_kwargs) else: if isinstance(indexer, tuple): @@ -619,6 +619,47 @@ def can_do_equal_len(): value=value) self.obj._maybe_update_cacher(clear=True) + def _setter(self, idx, v, items, pi, **kwargs): + """ + Set a single value on the underlying object. Label-based. 
+
+        Parameters
+        ----------
+        idx : int
+            The index of the desired element inside "items"
+
+        v : any
+            The value to assign to the specified location
+
+        items : list
+            A list of labels
+
+        pi : tuple or list-like
+            Components of original indexer preceding the info axis
+        """
+        item = items[idx]
+        s = self.obj[item]
+
+        # perform the equivalent of a setitem on the info axis
+        # as we have a null slice or a slice with full bounds
+        # which means essentially reassign to the columns of a
+        # multi-dim object
+        # GH6149 (null slice), GH10408 (full bounds)
+        if (isinstance(pi, tuple) and
+                all(is_null_slice(ix) or
+                    is_full_slice(ix, len(self.obj))
+                    for ix in pi)):
+            s = v
+        else:
+            # set the item, possibly having a dtype change
+            s._consolidate_inplace()
+            s = s.copy()
+            s._data = s._data.setitem(indexer=pi, value=v)
+            s._maybe_update_cacher(clear=True)
+
+        # reset the sliced object if unique
+        self.obj[item] = s
+
     def _align_series(self, indexer, ser, multiindex_indexer=False):
         """
         Parameters
@@ -1766,6 +1807,37 @@ def _convert_to_indexer(self, obj, axis=0, is_setter=False):
             raise ValueError("Can only index by location with a [%s]" %
                              self._valid_types)
 
+    def _setter(self, idx, v, indexer, force_loc=False, **kwargs):
+        """
+        Set a single value on the underlying object. Position-based by default.
+
+        Parameters
+        ----------
+        idx : int
+            The index of the desired element
+
+        v : any
+            The value to assign to the specified location
+
+        indexer : list
+            The original indexer
+
+        force_loc : bool
+            If True, use location-based indexing.
+
+        Other keyword arguments are forwarded to _NDFrameIndexer._setter()
+        """
+
+        if force_loc:
+            super(_iLocIndexer, self)._setter(idx, v, **kwargs)
+        else:
+            info_axis = self.obj._info_axis_number
+            max_idx = len(self.obj._get_axis(info_axis))
+            kwargs['items'] = _maybe_convert_indexer(indexer[info_axis],
+                                                     max_idx)
+            with InfoCleaner(self.obj):
+                super(_iLocIndexer, self)._setter(idx, v, **kwargs)
+
 
 class _ScalarAccessIndexer(_NDFrameIndexer):
     """ access scalars quickly """

From 99e44d78fa39dc6a27e16d08afaed48f143492c4 Mon Sep 17 00:00:00 2001
From: Pietro Battiston
Date: Tue, 14 Mar 2017 16:48:55 +0100
Subject: [PATCH 2/3] TST: fixed one test and added two more for #15686

---
 pandas/tests/indexing/test_iloc.py | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py
index af4b9e1f0cc25..4656f8f6f3955 100644
--- a/pandas/tests/indexing/test_iloc.py
+++ b/pandas/tests/indexing/test_iloc.py
@@ -288,15 +288,33 @@ def test_iloc_setitem_dups(self):
         # iloc with a mask aligning from another iloc
         df1 = DataFrame([{'A': None, 'B': 1}, {'A': 2, 'B': 2}])
         df2 = DataFrame([{'A': 3, 'B': 3}, {'A': 4, 'B': 4}])
-        df = concat([df1, df2], axis=1)
+        df_orig = concat([df1, df2], axis=1)
+        df = df_orig.copy()
 
+        # GH 15686
+        # iloc with mask, duplicated columns and multiple blocks
         expected = df.fillna(3)
-        expected['A'] = expected['A'].astype('float64')
+        expected.iloc[:, 0] = expected.iloc[:, 0].astype('float64')
         inds = np.isnan(df.iloc[:, 0])
         mask = inds[inds].index
        df.iloc[mask, 0] = df.iloc[mask, 2]
         tm.assert_frame_equal(df, expected)
 
+        # GH 15686
+        # iloc with scalar, duplicated columns and multiple blocks
+        df = df_orig.copy()
+        expected = df.fillna(15)
+        df.iloc[0, 0] = 15
+        tm.assert_frame_equal(df, expected)
+
+        # GH 15686
+        # iloc with repeated value, duplicated columns and multiple blocks
+        df = df_orig.copy()
+        expected = concat([DataFrame([{'A': 15, 'B': 1}, {'A': 15, 'B': 2}]),
+ df2], axis=1) + df.iloc[:, 0] = 15 + tm.assert_frame_equal(df, expected) + # del a dup column across blocks expected = DataFrame({0: [1, 2], 1: [3, 4]}) expected.columns = ['B', 'B'] From de098e039c8fd3289d66b587e4edcc7a89d6901d Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Tue, 14 Mar 2017 17:12:08 +0100 Subject: [PATCH 3/3] TST: added xfailing test on iloc with dups --- pandas/tests/indexing/test_iloc.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 4656f8f6f3955..2a7cf3e285d44 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -332,6 +332,17 @@ def test_iloc_setitem_dups(self): drop=True) tm.assert_frame_equal(df, expected) + @pytest.mark.xfail(reason="BlockManager.setitem() broken") + def test_iloc_setitem_dups_mixed_df(self): + # GH 12991 + df1 = DataFrame([{'A': None, 'B': 1}, {'A': 2, 'B': 2}]) + df2 = DataFrame([{'A': 3, 'B': 3}, {'A': 4, 'B': 4}]) + df = concat([df1, df2], axis=1) + + expected = df.fillna(15) + df.iloc[0, 0] = 15 + tm.assert_frame_equal(df, expected) + def test_iloc_getitem_frame(self): df = DataFrame(np.random.randn(10, 4), index=lrange(0, 20, 2), columns=lrange(0, 8, 2))
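
A quick end-to-end illustration of the behaviour this series enables (not part of the patch; it simply mirrors the tests above and assumes the patch is applied). The frame has duplicate column labels ('A', 'B', 'A', 'B') backed by more than one internal block, and positional assignment must touch only the targeted column:

    import numpy as np
    from pandas import DataFrame, concat

    # duplicate column labels across two blocks, as in test_iloc_setitem_dups
    df1 = DataFrame([{'A': None, 'B': 1}, {'A': 2, 'B': 2}])
    df2 = DataFrame([{'A': 3, 'B': 3}, {'A': 4, 'B': 4}])
    df_orig = concat([df1, df2], axis=1)

    # scalar into a single cell of the first 'A' column (GH 15686)
    df = df_orig.copy()
    df.iloc[0, 0] = 15

    # repeated value into one of the duplicate columns
    df = df_orig.copy()
    df.iloc[:, 0] = 15

    # boolean mask computed from one positional selection, applied to another
    df = df_orig.copy()
    inds = np.isnan(df.iloc[:, 0])
    mask = inds[inds].index
    df.iloc[mask, 0] = df.iloc[mask, 2]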
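
For reference, a small sketch (not part of the patch) of the normalisation performed by the new private helper _maybe_convert_indexer added to pandas/core/dtypes/cast.py above; the expected outputs in the comments are inferred from the code in the diff, assuming the patch is applied:

    import numpy as np
    from pandas.core.dtypes.cast import _maybe_convert_indexer  # added by this patch

    # scalar -> one-element integer array
    _maybe_convert_indexer(2, until=4)                               # array([2])

    # open-ended slice, bounded above by `until`
    _maybe_convert_indexer(slice(1, None), until=4)                  # array([1, 2, 3])

    # boolean mask -> positions of the True entries
    _maybe_convert_indexer(np.array([True, False, True]), until=3)   # array([0, 2])

    # list of positions passes through as an integer array
    _maybe_convert_indexer([0, 2], until=4)                          # array([0, 2])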
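
Similarly, a minimal sketch (again not part of the patch, and relying on the private pandas.core.indexing module as patched) of how the InfoCleaner context manager temporarily swaps the info-axis labels for a positional range so that lookups become unambiguous even with duplicates:

    from pandas import DataFrame
    from pandas.core.indexing import InfoCleaner  # added by this patch

    df = DataFrame({'A': [1, 2], 'B': [3, 4]})
    with InfoCleaner(df):
        # inside the block the columns (the "info" axis) are just 0, 1, ...
        assert list(df.columns) == [0, 1]
    # on exit the original labels are restored
    assert list(df.columns) == ['A', 'B']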