Skip to content

BUG: Groupby(sort=False) with datetime-like Categorical raises ValueError #10508

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jul 28, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions doc/source/whatsnew/v0.17.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,22 @@ Other API Changes
- Allow passing `kwargs` to the interpolation methods (:issue:`10378`).
- Serialize metadata properties of subclasses of pandas objects (:issue:`10553`).
- ``Categorical.name`` was removed to make `Categorical` more ``numpy.ndarray`` like. Use ``Series(cat, name="whatever")`` instead (:issue:`10482`).
- ``Categorical.unique`` now returns a new ``Categorical`` whose ``categories`` and ``codes`` are unique, rather than returning ``np.array`` (:issue:`10508`)

- unordered category: values and categories are sorted by appearance order.
- ordered category: values are sorted by appearance order, categories keep their existing order.

.. ipython :: python

cat = pd.Categorical(['C', 'A', 'B', 'C'], categories=['A', 'B', 'C'], ordered=True)
cat
cat.unique()

cat = pd.Categorical(['C', 'A', 'B', 'C'], categories=['A', 'B', 'C'])
cat
cat.unique()

- ``groupby`` using ``Categorical`` follows the same rule as ``Categorical.unique`` described above (:issue:`10508`)
- ``NaT``'s methods now either raise ``ValueError``, or return ``np.nan`` or ``NaT`` (:issue:`9513`)

=============================== ==============================================================
Expand Down Expand Up @@ -365,6 +380,9 @@ Bug Fixes
- Bug in ``DataFrame.interpolate`` with ``axis=1`` and ``inplace=True`` (:issue:`10395`)
- Bug in ``io.sql.get_schema`` when specifying multiple columns as primary
key (:issue:`10385`).

- Bug in ``groupby(sort=False)`` with datetime-like ``Categorical`` raising ``ValueError`` (:issue:`10505`)

- Bug in ``test_categorical`` on big-endian builds (:issue:`10425`)
- Bug in ``Series.shift`` and ``DataFrame.shift`` not supporting categorical data (:issue:`9416`)
- Bug in ``Series.map`` using categorical ``Series`` raises ``AttributeError`` (:issue:`10324`)
Expand Down
21 changes: 16 additions & 5 deletions pandas/core/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1558,19 +1558,30 @@ def mode(self):

def unique(self):
    """
    Return a ``Categorical`` whose ``categories`` and ``codes`` are unique.

    Unused categories are NOT returned. NaN (code ``-1``) is kept among
    the values but never becomes a category.

    - unordered category: values and categories are sorted by appearance
      order.
    - ordered category: values are sorted by appearance order, categories
      keep their existing order.

    Returns
    -------
    unique values : ``Categorical``
    """
    from pandas.core.nanops import unique1d

    # unlike np.unique, unique1d does not sort, so the codes stay in
    # order of first appearance
    unique_codes = unique1d(self.codes)

    cat = self.copy()
    # keep nan (-1) in codes so that it is still present in the values
    cat._codes = unique_codes

    # exclude nan from the indexer used to select the categories
    take_codes = unique_codes[unique_codes != -1]
    if self.ordered:
        # ascending codes == existing category order
        take_codes = sorted(take_codes)
    return cat.set_categories(cat.categories.take(take_codes))

def equals(self, other):
"""
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1959,7 +1959,8 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,

# fix bug #GH8868 sort=False being ignored in categorical groupby
else:
self.grouper = self.grouper.reorder_categories(self.grouper.unique())
cat = self.grouper.unique()
self.grouper = self.grouper.reorder_categories(cat.categories)

# we make a CategoricalIndex out of the cat grouper
# preserving the categories / ordered attributes
Expand Down
49 changes: 44 additions & 5 deletions pandas/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -958,20 +958,59 @@ def test_min_max(self):
self.assertEqual(_max, 1)

def test_unique(self):
    # categories are reordered based on value when ordered=False
    cat = Categorical(["a", "b"])
    exp = np.asarray(["a", "b"])
    res = cat.unique()
    self.assert_numpy_array_equal(res, exp)

    # same unique values when an extra, unused category "c" is declared
    cat = Categorical(["a", "b", "a", "a"], categories=["a", "b", "c"])
    res = cat.unique()
    self.assert_numpy_array_equal(res, exp)
    tm.assert_categorical_equal(res, Categorical(exp))

    # unique should not sort: values and categories follow appearance order
    cat = Categorical(["c", "a", "b", "a", "a"], categories=["a", "b", "c"])
    exp = np.asarray(["c", "a", "b"])
    res = cat.unique()
    self.assert_numpy_array_equal(res, exp)
    tm.assert_categorical_equal(res, Categorical(exp, categories=['c', 'a', 'b']))

    # nan is kept (once) in the unique values but is not a category
    cat = Categorical(["b", np.nan, "b", np.nan, "a"], categories=["a", "b", "c"])
    res = cat.unique()
    exp = np.asarray(["b", np.nan, "a"], dtype=object)
    self.assert_numpy_array_equal(res, exp)
    tm.assert_categorical_equal(res, Categorical(["b", np.nan, "a"], categories=["b", "a"]))

def test_unique_ordered(self):
    # With ordered=True the unique values come back in appearance order,
    # while the categories of the result stay in their declared order.
    # Each case: (values, declared categories,
    #             expected unique values, expected categories)
    cases = [
        (['b', 'a', 'b'], ['a', 'b'],
         np.asarray(['b', 'a']), ['a', 'b']),
        (['c', 'b', 'a', 'a'], ['a', 'b', 'c'],
         np.asarray(['c', 'b', 'a']), ['a', 'b', 'c']),
        (['b', 'a', 'a'], ['a', 'b', 'c'],
         np.asarray(['b', 'a']), ['a', 'b']),
        (['b', 'b', np.nan, 'a'], ['a', 'b', 'c'],
         np.asarray(['b', np.nan, 'a'], dtype=object), ['a', 'b']),
    ]

    for values, declared, expected, expected_categories in cases:
        result = Categorical(values, categories=declared,
                             ordered=True).unique()
        expected_cat = Categorical(expected, categories=expected_categories,
                                   ordered=True)
        self.assert_numpy_array_equal(result, expected)
        tm.assert_categorical_equal(result, expected_cat)

def test_mode(self):
s = Categorical([1,1,2,4,5,5,5], categories=[5,4,3,2,1], ordered=True)
Expand Down
52 changes: 51 additions & 1 deletion pandas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -3413,7 +3413,8 @@ def test_groupby_sort_categorical(self):

col = 'range'
assert_frame_equal(result_sort, df.groupby(col, sort=True).first())
assert_frame_equal(result_nosort, df.groupby(col, sort=False).first())
# when categories is ordered, group is ordered by category's order
assert_frame_equal(result_sort, df.groupby(col, sort=False).first())

df['range'] = Categorical(df['range'],ordered=False)
index = Index(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]', '(7.5, 10]'], dtype='object')
Expand All @@ -3431,6 +3432,55 @@ def test_groupby_sort_categorical(self):
assert_frame_equal(result_sort, df.groupby(col, sort=True).first())
assert_frame_equal(result_nosort, df.groupby(col, sort=False).first())

def test_groupby_sort_categorical_datetimelike(self):
    # GH10505

    # use same data as test_groupby_sort_categorical, which category is
    # corresponding to datetime.month
    df = DataFrame({'dt': [datetime(2011, 7, 1), datetime(2011, 7, 1),
                           datetime(2011, 2, 1), datetime(2011, 5, 1),
                           datetime(2011, 2, 1), datetime(2011, 1, 1),
                           datetime(2011, 5, 1)],
                    'foo': [10, 8, 5, 6, 4, 1, 7],
                    'bar': [10, 20, 30, 40, 50, 60, 70]},
                   columns=['dt', 'foo', 'bar'])
    col = 'dt'

    # group keys in chronological order
    sorted_index = [datetime(2011, 1, 1), datetime(2011, 2, 1),
                    datetime(2011, 5, 1), datetime(2011, 7, 1)]
    # group keys in order of first appearance in df['dt']
    appearance_index = [datetime(2011, 7, 1), datetime(2011, 2, 1),
                        datetime(2011, 5, 1), datetime(2011, 1, 1)]

    # ordered=True
    df['dt'] = Categorical(df['dt'], ordered=True)
    result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]],
                            columns=['foo', 'bar'])
    result_sort.index = CategoricalIndex(sorted_index, name='dt',
                                         ordered=True)

    assert_frame_equal(result_sort, df.groupby(col, sort=True).first())
    # when categories is ordered, group is ordered by category's order,
    # so sort=False is expected to give the same frame as sort=True
    assert_frame_equal(result_sort, df.groupby(col, sort=False).first())

    # ordered=False
    df['dt'] = Categorical(df['dt'], ordered=False)
    result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]],
                            columns=['foo', 'bar'])
    result_sort.index = CategoricalIndex(sorted_index, name='dt')

    result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
                              columns=['foo', 'bar'])
    result_nosort.index = CategoricalIndex(appearance_index,
                                           categories=appearance_index,
                                           name='dt')

    assert_frame_equal(result_sort, df.groupby(col, sort=True).first())
    assert_frame_equal(result_nosort, df.groupby(col, sort=False).first())


def test_groupby_sort_multiindex_series(self):
# series multiindex groupby sort argument was not being passed through _compress_group_index
Expand Down