Skip to content

[WIP]API: CategoricalType for specifying categoricals #14698

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 18 additions & 2 deletions doc/source/categorical.rst
Original file line number Diff line number Diff line change
Expand Up @@ -96,12 +96,14 @@ By passing a :class:`pandas.Categorical` object to a `Series` or assigning it to
df["B"] = raw_cat
df

You can also specify differently ordered categories or make the resulting data ordered, by passing these arguments to ``astype()``:
You can also specify differently ordered categories or make the resulting data
ordered by passing a :class:`CategoricalDtype`:

.. ipython:: python

s = pd.Series(["a","b","c","a"])
s_cat = s.astype("category", categories=["b","c","d"], ordered=False)
cat_type = pd.CategoricalDtype(categories=["b", "c", "d"], ordered=False)
s_cat = s.astype(cat_type)
s_cat

Categorical data has a specific ``category`` :ref:`dtype <basics.dtypes>`:
Expand Down Expand Up @@ -141,6 +143,20 @@ constructor to save the factorize step during normal constructor mode:
splitter = np.random.choice([0,1], 5, p=[0.5,0.5])
s = pd.Series(pd.Categorical.from_codes(splitter, categories=["train", "test"]))


CategoricalDtype
----------------

A categorical's type is fully described by 1.) its categories (an iterable),
and 2.) its orderedness (a boolean).
This information can be stored in a :class:`~pandas.CategoricalDtype` and passed
anywhere pandas expects a ``dtype``, for example :func:`pandas.read_csv`,
:meth:`pandas.DataFrame.astype`, or the ``Series`` constructor.

As a convenience, you can use the string ``'category'`` in place of a
:class:`pandas.CategoricalDtype` when you want the default behavior of
the categories being unordered and equal to the set of values present in the array.

Description
-----------

Expand Down
1 change: 1 addition & 0 deletions pandas/core/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import numpy as np

from pandas.core.algorithms import factorize, match, unique, value_counts
from pandas.types.dtypes import CategoricalDtype
from pandas.types.missing import isnull, notnull
from pandas.core.categorical import Categorical
from pandas.core.groupby import Grouper
Expand Down
6 changes: 6 additions & 0 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -470,6 +470,12 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None,
# may need to convert to categorical
# this is only called for non-categoricals
if self.is_categorical_astype(dtype):
kwargs = kwargs.copy()
categories = getattr(dtype, 'categories', None)
ordered = getattr(dtype, 'ordered', False)
# should we raise if a CategoricalDtype is given and categories/ordered
# are also passed in kwargs?
kwargs.setdefault('categories', categories)
kwargs.setdefault('ordered', ordered)
return self.make_block(Categorical(self.values, **kwargs))

# astype processing
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2837,7 +2837,8 @@ def _try_cast(arr, take_fast_path):
subarr = np.array(subarr, dtype=dtype, copy=copy)
except (ValueError, TypeError):
if is_categorical_dtype(dtype):
subarr = Categorical(arr)
subarr = Categorical(arr, dtype.categories,
ordered=dtype.ordered)
elif dtype is not None and raise_cast_failure:
raise
else:
Expand Down
18 changes: 18 additions & 0 deletions pandas/tests/series/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,24 @@ def test_constructor_categorical(self):
self.assertTrue(is_categorical_dtype(s))
self.assertTrue(is_categorical_dtype(s.dtype))

def test_constructor_categorical_dtype(self):
    # A CategoricalDtype passed as ``dtype`` must determine both the
    # categories and the orderedness of the constructed Series.

    # ordered dtype carrying a category ('c') that is absent from the data
    ordered_dtype = pd.CategoricalDtype(['a', 'b', 'c'], ordered=True)
    ser = pd.Series(['a', 'b'], dtype=ordered_dtype)
    self.assertTrue(is_categorical_dtype(ser))
    tm.assert_index_equal(ser.cat.categories, pd.Index(['a', 'b', 'c']))
    self.assertTrue(ser.cat.ordered)

    # categories listed in a different order than the data; ``ordered``
    # is not given, so the result is expected to be unordered
    swapped_dtype = pd.CategoricalDtype(['b', 'a'])
    ser = pd.Series(['a', 'b'], dtype=swapped_dtype)
    self.assertTrue(is_categorical_dtype(ser))
    tm.assert_index_equal(ser.cat.categories, pd.Index(['b', 'a']))
    self.assertFalse(ser.cat.ordered)

    # a value ('c') outside the dtype's categories is expected to come
    # back as missing
    narrow_dtype = pd.CategoricalDtype(['a', 'b'])
    ser = pd.Series(['a', 'b', 'c'], dtype=narrow_dtype)
    expected = pd.Series(pd.Categorical(['a', 'b', np.nan]))
    tm.assert_series_equal(ser, expected)

def test_constructor_maskedarray(self):
data = ma.masked_all((3, ), dtype=float)
result = Series(data)
Expand Down
19 changes: 18 additions & 1 deletion pandas/tests/series/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from numpy import nan
import numpy as np

from pandas import Series
from pandas import Series, CategoricalDtype, Categorical, Index
from pandas.tseries.index import Timestamp
from pandas.tseries.tdi import Timedelta

Expand Down Expand Up @@ -149,6 +149,23 @@ def test_astype_dict(self):
self.assertRaises(KeyError, s.astype, {'abc': str, 'def': str})
self.assertRaises(KeyError, s.astype, {0: str})

def test_astype_categorical(self):
    # ``astype`` with a CategoricalDtype must apply the dtype's
    # categories and ordered flag to the resulting Series.
    values = ['a', 'b', 'a']
    s = Series(values)

    # categories match the data exactly; only the ordered flag varies
    for ordered in (True, False):
        result = s.astype(CategoricalDtype(['a', 'b'], ordered=ordered))
        expected = Series(Categorical(values, ordered=ordered))
        assert_series_equal(result, expected)

    # the dtype carries an extra category ('c') that is not in the data
    wider = ['a', 'b', 'c']
    result = s.astype(CategoricalDtype(wider, ordered=False))
    expected = Series(Categorical(values, categories=wider, ordered=False))
    assert_series_equal(result, expected)
    tm.assert_index_equal(result.cat.categories, Index(wider))

def test_complexx(self):
# GH4819
# complex access for ndarray compat
Expand Down
5 changes: 4 additions & 1 deletion pandas/types/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,10 @@ def is_complex_dtype(arr_or_dtype):
def _coerce_to_dtype(dtype):
""" coerce a string / np.dtype to a dtype """
if is_categorical_dtype(dtype):
dtype = CategoricalDtype()
categories = getattr(dtype, 'categories', None)
ordered = getattr(dtype, 'ordered', False)
# TODO: pass thru categories, ordered
dtype = CategoricalDtype(categories=categories, ordered=ordered)
elif is_datetime64tz_dtype(dtype):
dtype = DatetimeTZDtype(dtype)
elif is_period_dtype(dtype):
Expand Down
64 changes: 61 additions & 3 deletions pandas/types/dtypes.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
""" define extension dtypes """

import re
import reprlib
import numpy as np
from pandas import compat

Expand Down Expand Up @@ -98,25 +99,73 @@ class CategoricalDtypeType(type):
class CategoricalDtype(ExtensionDtype):

"""
Type for categorical data with the categories and orderedness,
but not the values

.. versionadded:: 0.20.0
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this has been around for quite some time, you are adding parameter support in 0.20.0


Parameters
----------
categories : list or None
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

list-like; use wording similar to what's in Categorical now

ordered : bool, default False

Examples
--------
>>> t = CategoricalDtype(categories=['b', 'a'], ordered=True)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These examples are not relevant. This should not include Series; this is a self-contained type.

>>> s = Series(['a', 'a', 'b', 'b', 'a'])
>>> s.astype(t)
0 a
1 a
2 b
3 b
4 a
dtype: category
Categories (2, object): [b < a]

Notes
-----
An instance of ``CategoricalDtype`` compares equal with any other
instance of ``CategoricalDtype``, regardless of categories or ordered.
In addition they compare equal to the string ``'category'``.
To check whether two instances of a ``CategoricalDtype`` match,
use the ``is`` operator.

>>> t1 = CategoricalDtype(['a', 'b'], ordered=True)
>>> t2 = CategoricalDtype(['a', 'c'], ordered=False)
>>> t1 == t2
True
>>> t1 == 'category'
True
>>> t1 is t2
False
>>> t1 is CategoricalDtype(['a', 'b'], ordered=True)
True

A np.dtype duck-typed class, suitable for holding a custom categorical
dtype.

THIS IS NOT A REAL NUMPY DTYPE, but essentially a sub-class of np.object
"""
# TODO: Document public vs. private API
name = 'category'
type = CategoricalDtypeType
kind = 'O'
str = '|O08'
base = np.dtype('O')
_cache = {}

def __new__(cls):
def __new__(cls, categories=None, ordered=False):
categories_ = categories if categories is None else tuple(categories)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This needs all of the validation logic (from Categorical) applied to the actual categories.

t = (categories_, ordered)

try:
return cls._cache[cls.name]
return cls._cache[t]
except KeyError:
c = object.__new__(cls)
cls._cache[cls.name] = c
c.categories = categories
c.ordered = ordered

cls._cache[t] = c
return c

def __hash__(self):
Expand All @@ -129,6 +178,15 @@ def __eq__(self, other):

return isinstance(other, CategoricalDtype)

# def __unicode__(self):
# tpl = 'CategoricalDtype({!r}, ordered={})'
# return tpl.format(reprlib.repr(self.categories), self.ordered)

# def __repr__(self):
# """ return the base repr for the categories """
# tpl = 'CategoricalDtype({!r}, ordered={})'
# return tpl.format(reprlib.repr(self.categories), self.ordered)

@classmethod
def construct_from_string(cls, string):
""" attempt to construct this type from a string, raise a TypeError if
Expand Down