From 574facc76c898377a11956f1a7051de34965efbe Mon Sep 17 00:00:00 2001 From: jreback Date: Thu, 9 Oct 2014 13:10:45 -0400 Subject: [PATCH] BUG/REGR: bool-like Indexes not properly coercing to object (GH8522) --- doc/source/v0.15.0.txt | 2 +- pandas/core/base.py | 2 +- pandas/core/index.py | 18 ++++++++++------ pandas/tests/test_base.py | 42 +++++++++++++++++++++++++++++++------ pandas/tests/test_index.py | 8 +++++++ pandas/tests/test_series.py | 9 +++++++- pandas/util/testing.py | 7 +++++++ 7 files changed, 73 insertions(+), 15 deletions(-) diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt index eec424f619bde..d972edeb2bbb3 100644 --- a/doc/source/v0.15.0.txt +++ b/doc/source/v0.15.0.txt @@ -642,7 +642,7 @@ Internal Refactoring In 0.15.0 ``Index`` has internally been refactored to no longer sub-class ``ndarray`` but instead subclass ``PandasObject``, similarly to the rest of the pandas objects. This change allows very easy sub-classing and creation of new index types. This should be -a transparent change with only very limited API implications (:issue:`5080`, :issue:`7439`, :issue:`7796`, :issue:`8024`, :issue:`8367`, :issue:`7997`) +a transparent change with only very limited API implications (:issue:`5080`, :issue:`7439`, :issue:`7796`, :issue:`8024`, :issue:`8367`, :issue:`7997`, :issue:`8522`) - you may need to unpickle pandas version < 0.15.0 pickles using ``pd.read_pickle`` rather than ``pickle.load``. See :ref:`pickle docs <io.pickle>` - when plotting with a ``PeriodIndex``. The ``matplotlib`` internal axes will now be arrays of ``Period`` rather than a ``PeriodIndex``. 
(this is similar to how a ``DatetimeIndex`` passes arrays of ``datetimes`` now) diff --git a/pandas/core/base.py b/pandas/core/base.py index 794c05db082c7..5d6f39e1792c3 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -499,7 +499,7 @@ def searchsorted(self, key, side='left'): @Appender(_shared_docs['drop_duplicates'] % _indexops_doc_kwargs) def drop_duplicates(self, take_last=False, inplace=False): duplicated = self.duplicated(take_last=take_last) - result = self[~duplicated.values] + result = self[~(duplicated.values).astype(bool)] if inplace: return self._update_inplace(result) else: diff --git a/pandas/core/index.py b/pandas/core/index.py index 99f1682b133c3..f87b7e982b332 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -148,16 +148,16 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False, data = np.array(data, dtype=dtype, copy=copy) except TypeError: pass - elif isinstance(data, PeriodIndex): - return PeriodIndex(data, copy=copy, name=name, **kwargs) + # maybe coerce to a sub-class + if isinstance(data, PeriodIndex): + return PeriodIndex(data, copy=copy, name=name, **kwargs) if issubclass(data.dtype.type, np.integer): return Int64Index(data, copy=copy, dtype=dtype, name=name) - if issubclass(data.dtype.type, np.floating): + elif issubclass(data.dtype.type, np.floating): return Float64Index(data, copy=copy, dtype=dtype, name=name) - - if com.is_bool_dtype(data): - subarr = data + elif issubclass(data.dtype.type, np.bool) or com.is_bool_dtype(data): + subarr = data.astype('object') else: subarr = com._asarray_tuplesafe(data, dtype=object) @@ -583,6 +583,9 @@ def is_unique(self): """ return if the index has unique values """ return self._engine.is_unique + def is_boolean(self): + return self.inferred_type in ['boolean'] + def is_integer(self): return self.inferred_type in ['integer'] @@ -592,6 +595,9 @@ def is_floating(self): def is_numeric(self): return self.inferred_type in ['integer', 'floating'] + def 
is_object(self): + return self.dtype == np.object_ + def is_mixed(self): return 'mixed' in self.inferred_type diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index f508b8915da1c..814da043d0319 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -180,6 +180,7 @@ def f(): class Ops(tm.TestCase): def setUp(self): + self.bool_index = tm.makeBoolIndex(10) self.int_index = tm.makeIntIndex(10) self.float_index = tm.makeFloatIndex(10) self.dt_index = tm.makeDateIndex(10) @@ -189,14 +190,15 @@ def setUp(self): arr = np.random.randn(10) self.int_series = Series(arr, index=self.int_index) - self.float_series = Series(arr, index=self.int_index) + self.float_series = Series(arr, index=self.float_index) self.dt_series = Series(arr, index=self.dt_index) self.dt_tz_series = self.dt_tz_index.to_series(keep_tz=True) self.period_series = Series(arr, index=self.period_index) self.string_series = Series(arr, index=self.string_index) - types = ['int','float','dt', 'dt_tz', 'period','string'] - self.objs = [ getattr(self,"{0}_{1}".format(t,f)) for t in types for f in ['index','series'] ] + types = ['bool','int','float','dt', 'dt_tz', 'period','string'] + fmts = [ "{0}_{1}".format(t,f) for t in types for f in ['index','series'] ] + self.objs = [ getattr(self,f) for f in fmts if getattr(self,f,None) is not None ] def check_ops_properties(self, props, filter=None, ignore_failures=False): for op in props: @@ -340,6 +342,9 @@ def test_value_counts_unique_nunique(self): # freq must be specified because repeat makes freq ambiguous expected_index = o[::-1] o = klass(np.repeat(values, range(1, len(o) + 1)), freq=o.freq) + # don't test boolean + elif isinstance(o,Index) and o.is_boolean(): + continue elif isinstance(o, Index): expected_index = values[::-1] o = klass(np.repeat(values, range(1, len(o) + 1))) @@ -366,6 +371,10 @@ def test_value_counts_unique_nunique(self): klass = type(o) values = o.values + if isinstance(o,Index) and o.is_boolean(): + # don't 
test boolean + continue + if ((isinstance(o, Int64Index) and not isinstance(o, (DatetimeIndex, PeriodIndex)))): # skips int64 because it doesn't allow to include nan or None @@ -537,7 +546,14 @@ def test_value_counts_inferred(self): def test_factorize(self): for o in self.objs: - exp_arr = np.array(range(len(o))) + + if isinstance(o,Index) and o.is_boolean(): + exp_arr = np.array([0,1] + [0] * 8) + exp_uniques = o + exp_uniques = Index([False,True]) + else: + exp_arr = np.array(range(len(o))) + exp_uniques = o labels, uniques = o.factorize() self.assert_numpy_array_equal(labels, exp_arr) @@ -545,16 +561,22 @@ def test_factorize(self): expected = Index(o.values) self.assert_numpy_array_equal(uniques, expected) else: - self.assertTrue(uniques.equals(o)) + self.assertTrue(uniques.equals(exp_uniques)) for o in self.objs: + + # don't test boolean + if isinstance(o,Index) and o.is_boolean(): + continue + # sort by value, and create duplicates if isinstance(o, Series): o.sort() + n = o.iloc[5:].append(o) else: indexer = o.argsort() o = o.take(indexer) - n = o[5:].append(o) + n = o[5:].append(o) exp_arr = np.array([5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) labels, uniques = n.factorize(sort=True) @@ -582,6 +604,14 @@ def test_duplicated_drop_duplicates(self): for original in self.objs: if isinstance(original, Index): + + # special case + if original.is_boolean(): + result = original.drop_duplicates() + expected = Index([False,True]) + tm.assert_index_equal(result, expected) + continue + # original doesn't have duplicates expected = Index([False] * len(original)) tm.assert_index_equal(original.duplicated(), expected) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 9984ad30612db..a8c4548f462ac 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -94,6 +94,7 @@ def setUp(self): dateIndex = tm.makeDateIndex(100), intIndex = tm.makeIntIndex(100), floatIndex = tm.makeFloatIndex(100), + boolIndex = Index([True,False]), empty = 
Index([]), tuples = MultiIndex.from_tuples(lzip(['foo', 'bar', 'baz'], [1, 2, 3])) @@ -732,6 +733,13 @@ def test_is_numeric(self): self.assertTrue(self.intIndex.is_numeric()) self.assertTrue(self.floatIndex.is_numeric()) + def test_is_object(self): + self.assertTrue(self.strIndex.is_object()) + self.assertTrue(self.boolIndex.is_object()) + self.assertFalse(self.intIndex.is_object()) + self.assertFalse(self.dateIndex.is_object()) + self.assertFalse(self.floatIndex.is_object()) + def test_is_all_dates(self): self.assertTrue(self.dateIndex.is_all_dates) self.assertFalse(self.strIndex.is_all_dates) diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index d3f7414289053..29bdb2c983d61 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -1222,7 +1222,7 @@ def test_getitem_dups(self): expected = Series([3,4],index=['C','C'],dtype=np.int64) result = s['C'] assert_series_equal(result, expected) - + def test_getitem_dataframe(self): rng = list(range(10)) s = pd.Series(10, index=rng) @@ -1817,6 +1817,13 @@ def test_drop(self): # bad axis self.assertRaises(ValueError, s.drop, 'one', axis='columns') + # GH 8522 + s = Series([2,3], index=[True, False]) + self.assertTrue(s.index.is_object()) + result = s.drop(True) + expected = Series([3],index=[False]) + assert_series_equal(result,expected) + def test_ix_setitem(self): inds = self.series.index[[3, 4, 7]] diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 977d445f917a8..d8cc39908a31f 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -738,6 +738,13 @@ def makeStringIndex(k=10): def makeUnicodeIndex(k=10): return Index([randu(10) for _ in range(k)]) +def makeBoolIndex(k=10): + if k == 1: + return Index([True]) + elif k == 2: + return Index([False,True]) + return Index([False,True] + [False]*(k-2)) + def makeIntIndex(k=10): return Index(lrange(k))