pandas-dev · jreback · Mar 12, 2020 · Mar 11, 2020
diff --git a/pandas/_testing.py b/pandas/_testing.py
@@ -824,10 +824,14 @@ def assert_categorical_equal(
             left.codes, right.codes, check_dtype=check_dtype, obj=f"{obj}.codes",
         )
     else:
+        try:
+            lc = left.categories.sort_values()
+            rc = right.categories.sort_values()
+        except TypeError:
+            # e.g. '<' not supported between instances of 'int' and 'str'
+            lc, rc = left.categories, right.categories
         assert_index_equal(
-            left.categories.sort_values(),
-            right.categories.sort_values(),
-            obj=f"{obj}.categories",
+            lc, rc, obj=f"{obj}.categories",
         )
         assert_index_equal(
             left.categories.take(left.codes),

diff --git a/pandas/tests/arrays/categorical/test_replace.py b/pandas/tests/arrays/categorical/test_replace.py
@@ -1,48 +1,51 @@
+import numpy as np
 import pytest
 
 import pandas as pd
 import pandas._testing as tm
 
 
 @pytest.mark.parametrize(
-    "to_replace,value,expected,check_types,check_categorical",
+    "to_replace,value,expected,flip_categories",
     [
         # one-to-one
-        (1, 2, [2, 2, 3], True, True),
-        (1, 4, [4, 2, 3], True, True),
-        (4, 1, [1, 2, 3], True, True),
-        (5, 6, [1, 2, 3], True, True),
+        (1, 2, [2, 2, 3], False),
+        (1, 4, [4, 2, 3], False),
+        (4, 1, [1, 2, 3], False),
+        (5, 6, [1, 2, 3], False),
         # many-to-one
-        ([1], 2, [2, 2, 3], True, True),
-        ([1, 2], 3, [3, 3, 3], True, True),
-        ([1, 2], 4, [4, 4, 3], True, True),
-        ((1, 2, 4), 5, [5, 5, 3], True, True),
-        ((5, 6), 2, [1, 2, 3], True, True),
+        ([1], 2, [2, 2, 3], False),
+        ([1, 2], 3, [3, 3, 3], False),
+        ([1, 2], 4, [4, 4, 3], False),
+        ((1, 2, 4), 5, [5, 5, 3], False),
+        ((5, 6), 2, [1, 2, 3], False),
         # many-to-many, handled outside of Categorical and results in separate dtype
-        ([1], [2], [2, 2, 3], False, False),
-        ([1, 4], [5, 2], [5, 2, 3], False, False),
+        ([1], [2], [2, 2, 3], True),
+        ([1, 4], [5, 2], [5, 2, 3], True),
         # check_categorical sorts categories, which crashes on mixed dtypes
-        (3, "4", [1, 2, "4"], True, False),
-        ([1, 2, "3"], "5", ["5", "5", 3], True, False),
+        (3, "4", [1, 2, "4"], False),
+        ([1, 2, "3"], "5", ["5", "5", 3], True),
     ],
 )
-def test_replace(to_replace, value, expected, check_types, check_categorical):
+def test_replace(to_replace, value, expected, flip_categories):
     # GH 31720
+    stays_categorical = not isinstance(value, list)
+
     s = pd.Series([1, 2, 3], dtype="category")
     result = s.replace(to_replace, value)
     expected = pd.Series(expected, dtype="category")
     s.replace(to_replace, value, inplace=True)
+
+    if flip_categories:
+        expected = expected.cat.set_categories(expected.cat.categories[::-1])
+
+    if not stays_categorical:
+        # the replace call loses categorical dtype
+        expected = pd.Series(np.asarray(expected))
+
     tm.assert_series_equal(
-        expected,
-        result,
-        check_dtype=check_types,
-        check_categorical=check_categorical,
-        check_category_order=False,
+        expected, result, check_category_order=False,
     )
     tm.assert_series_equal(
-        expected,
-        s,
-        check_dtype=check_types,
-        check_categorical=check_categorical,
-        check_category_order=False,
+        expected, s, check_category_order=False,
     )
diff --git a/pandas/tests/generic/test_frame.py b/pandas/tests/generic/test_frame.py
@@ -273,17 +273,13 @@ def test_to_xarray_index_types(self, index):
         assert isinstance(result, Dataset)
 
         # idempotency
-        # categoricals are not preserved
         # datetimes w/tz are preserved
         # column names are lost
         expected = df.copy()
         expected["f"] = expected["f"].astype(object)
         expected.columns.name = None
         tm.assert_frame_equal(
-            result.to_dataframe(),
-            expected,
-            check_index_type=False,
-            check_categorical=False,
+            result.to_dataframe(), expected,
         )
 
     @td.skip_if_no("xarray", min_version="0.7.0")

diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py
@@ -13,8 +13,6 @@
 from pandas.compat import is_platform_little_endian, is_platform_windows
 import pandas.util._test_decorators as td
 
-from pandas.core.dtypes.common import is_categorical_dtype
-
 import pandas as pd
 from pandas import (
     Categorical,
@@ -1057,18 +1055,7 @@ def test_latin_encoding(self, setup_path, dtype, val):
 
         s_nan = ser.replace(nan_rep, np.nan)
 
-        if is_categorical_dtype(s_nan):
-            assert is_categorical_dtype(retr)
-            tm.assert_series_equal(
-                s_nan, retr, check_dtype=False, check_categorical=False
-            )
-        else:
-            tm.assert_series_equal(s_nan, retr)
-
-        # FIXME: don't leave commented-out
-        # fails:
-        # for x in examples:
-        #     roundtrip(s, nan_rep=b'\xf8\xfc')
+        tm.assert_series_equal(s_nan, retr)
 
     def test_append_some_nans(self, setup_path):
 

diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
@@ -1026,7 +1026,14 @@ def test_categorical_with_stata_missing_values(self, version):
             original.to_stata(path, version=version)
             written_and_read_again = self.read_dta(path)
             res = written_and_read_again.set_index("index")
-            tm.assert_frame_equal(res, original, check_categorical=False)
+
+            expected = original.copy()
+            for col in expected:
+                cat = expected[col]._values
+                new_cats = cat.remove_unused_categories().categories
+                cat = cat.set_categories(new_cats, ordered=True)
+                expected[col] = cat
+            tm.assert_frame_equal(res, expected)
 
     @pytest.mark.parametrize("file", ["dta19_115", "dta19_117"])
     def test_categorical_order(self, file):
@@ -1044,15 +1051,17 @@ def test_categorical_order(self, file):
         cols = []
         for is_cat, col, labels, codes in expected:
             if is_cat:
-                cols.append((col, pd.Categorical.from_codes(codes, labels)))
+                cols.append(
+                    (col, pd.Categorical.from_codes(codes, labels, ordered=True))
+                )
             else:
                 cols.append((col, pd.Series(labels, dtype=np.float32)))
         expected = DataFrame.from_dict(dict(cols))
 
         # Read with and with out categoricals, ensure order is identical
         file = getattr(self, file)
         parsed = read_stata(file)
-        tm.assert_frame_equal(expected, parsed, check_categorical=False)
+        tm.assert_frame_equal(expected, parsed)
 
         # Check identity of codes
         for col in expected:
@@ -1137,18 +1146,30 @@ def test_read_chunks_117(
                     chunk = itr.read(chunksize)
                 except StopIteration:
                     break
-            from_frame = parsed.iloc[pos : pos + chunksize, :]
+            from_frame = parsed.iloc[pos : pos + chunksize, :].copy()
+            from_frame = self._convert_categorical(from_frame)
             tm.assert_frame_equal(
-                from_frame,
-                chunk,
-                check_dtype=False,
-                check_datetimelike_compat=True,
-                check_categorical=False,
+                from_frame, chunk, check_dtype=False, check_datetimelike_compat=True,
             )
 
             pos += chunksize
         itr.close()
 
+    @staticmethod
+    def _convert_categorical(from_frame: DataFrame) -> DataFrame:
+        """
+        Emulate the categorical casting behavior we expect from roundtripping.
+        """
+        for col in from_frame:
+            ser = from_frame[col]
+            if is_categorical_dtype(ser.dtype):
+                cat = ser._values.remove_unused_categories()
+                if cat.categories.dtype == object:
+                    categories = pd.Index(cat.categories._values)
+                    cat = cat.set_categories(categories)
+                from_frame[col] = cat
+        return from_frame
+
     def test_iterator(self):
 
         fname = self.dta3_117
@@ -1223,13 +1244,10 @@ def test_read_chunks_115(
                     chunk = itr.read(chunksize)
                 except StopIteration:
                     break
-            from_frame = parsed.iloc[pos : pos + chunksize, :]
+            from_frame = parsed.iloc[pos : pos + chunksize, :].copy()
+            from_frame = self._convert_categorical(from_frame)
             tm.assert_frame_equal(
-                from_frame,
-                chunk,
-                check_dtype=False,
-                check_datetimelike_compat=True,
-                check_categorical=False,
+                from_frame, chunk, check_dtype=False, check_datetimelike_compat=True,
             )
 
             pos += chunksize

diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
@@ -2077,8 +2077,7 @@ def test_merge_equal_cat_dtypes(cat_dtype, reverse):
         }
     ).set_index("foo")
 
-    # Categorical is unordered, so don't check ordering.
-    tm.assert_frame_equal(result, expected, check_categorical=False)
+    tm.assert_frame_equal(result, expected)
 
 
 def test_merge_equal_cat_dtypes2():
@@ -2100,8 +2099,7 @@ def test_merge_equal_cat_dtypes2():
         {"left": [1, 2], "right": [3, 2], "foo": Series(["a", "b"]).astype(cat_dtype)}
     ).set_index("foo")
 
-    # Categorical is unordered, so don't check ordering.
-    tm.assert_frame_equal(result, expected, check_categorical=False)
+    tm.assert_frame_equal(result, expected)
 
 
 def test_merge_on_cat_and_ext_array():

diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py
@@ -296,18 +296,18 @@ def cmp(a, b):
         # array conversion
         tm.assert_almost_equal(np.array(s), np.array(s.values))
 
-        # valid conversion
-        for valid in [
-            lambda x: x.astype("category"),
-            lambda x: x.astype(CategoricalDtype()),
-            lambda x: x.astype("object").astype("category"),
-            lambda x: x.astype("object").astype(CategoricalDtype()),
-        ]:
-
-            result = valid(s)
-            # compare series values
-            # internal .categories can't be compared because it is sorted
-            tm.assert_series_equal(result, s, check_categorical=False)
+        tm.assert_series_equal(s.astype("category"), s)
+        tm.assert_series_equal(s.astype(CategoricalDtype()), s)
+
+        roundtrip_expected = s.cat.set_categories(
+            s.cat.categories.sort_values()
+        ).cat.remove_unused_categories()
+        tm.assert_series_equal(
+            s.astype("object").astype("category"), roundtrip_expected
+        )
+        tm.assert_series_equal(
+            s.astype("object").astype(CategoricalDtype()), roundtrip_expected
+        )
 
         # invalid conversion (these are NOT a dtype)
         msg = (