From 170d4bb5248bd01f7dc6b33a63214fbcfb679216 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 11 Mar 2020 13:18:27 -0700 Subject: [PATCH] TST: tighten check_categorical=False tests --- pandas/_testing.py | 10 ++-- .../tests/arrays/categorical/test_replace.py | 53 ++++++++++--------- pandas/tests/generic/test_frame.py | 6 +-- pandas/tests/io/pytables/test_store.py | 15 +----- pandas/tests/io/test_stata.py | 48 +++++++++++------ pandas/tests/reshape/merge/test_merge.py | 6 +-- pandas/tests/series/test_dtypes.py | 24 ++++----- 7 files changed, 84 insertions(+), 78 deletions(-) diff --git a/pandas/_testing.py b/pandas/_testing.py index 136dfbd40276d..dff15c66750ac 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -824,10 +824,14 @@ def assert_categorical_equal( left.codes, right.codes, check_dtype=check_dtype, obj=f"{obj}.codes", ) else: + try: + lc = left.categories.sort_values() + rc = right.categories.sort_values() + except TypeError: + # e.g. '<' not supported between instances of 'int' and 'str' + lc, rc = left.categories, right.categories assert_index_equal( - left.categories.sort_values(), - right.categories.sort_values(), - obj=f"{obj}.categories", + lc, rc, obj=f"{obj}.categories", ) assert_index_equal( left.categories.take(left.codes), diff --git a/pandas/tests/arrays/categorical/test_replace.py b/pandas/tests/arrays/categorical/test_replace.py index 52530123bd52f..b9ac3ce9a37ae 100644 --- a/pandas/tests/arrays/categorical/test_replace.py +++ b/pandas/tests/arrays/categorical/test_replace.py @@ -1,3 +1,4 @@ +import numpy as np import pytest import pandas as pd @@ -5,44 +6,46 @@ @pytest.mark.parametrize( - "to_replace,value,expected,check_types,check_categorical", + "to_replace,value,expected,flip_categories", [ # one-to-one - (1, 2, [2, 2, 3], True, True), - (1, 4, [4, 2, 3], True, True), - (4, 1, [1, 2, 3], True, True), - (5, 6, [1, 2, 3], True, True), + (1, 2, [2, 2, 3], False), + (1, 4, [4, 2, 3], False), + (4, 1, [1, 2, 3], False), + (5, 6, [1, 2, 3], False), # many-to-one - ([1], 2, [2, 2, 3], True, True), - ([1, 2], 3, [3, 3, 3], True, True), - ([1, 2], 4, [4, 4, 3], True, True), - ((1, 2, 4), 5, [5, 5, 3], True, True), - ((5, 6), 2, [1, 2, 3], True, True), + ([1], 2, [2, 2, 3], False), + ([1, 2], 3, [3, 3, 3], False), + ([1, 2], 4, [4, 4, 3], False), + ((1, 2, 4), 5, [5, 5, 3], False), + ((5, 6), 2, [1, 2, 3], False), # many-to-many, handled outside of Categorical and results in separate dtype - ([1], [2], [2, 2, 3], False, False), - ([1, 4], [5, 2], [5, 2, 3], False, False), + ([1], [2], [2, 2, 3], True), + ([1, 4], [5, 2], [5, 2, 3], True), # check_categorical sorts categories, which crashes on mixed dtypes - (3, "4", [1, 2, "4"], True, False), - ([1, 2, "3"], "5", ["5", "5", 3], True, False), + (3, "4", [1, 2, "4"], False), + ([1, 2, "3"], "5", ["5", "5", 3], True), ], ) -def test_replace(to_replace, value, expected, check_types, check_categorical): +def test_replace(to_replace, value, expected, flip_categories): # GH 31720 + stays_categorical = not isinstance(value, list) + s = pd.Series([1, 2, 3], dtype="category") result = s.replace(to_replace, value) expected = pd.Series(expected, dtype="category") s.replace(to_replace, value, inplace=True) + + if flip_categories: + expected = expected.cat.set_categories(expected.cat.categories[::-1]) + + if not stays_categorical: + # the replace call loses categorical dtype + expected = pd.Series(np.asarray(expected)) + tm.assert_series_equal( - expected, - result, - check_dtype=check_types, - check_categorical=check_categorical, - check_category_order=False, + expected, result, check_category_order=False, ) tm.assert_series_equal( - expected, - s, - check_dtype=check_types, - check_categorical=check_categorical, - check_category_order=False, + expected, s, check_category_order=False, ) diff --git a/pandas/tests/generic/test_frame.py b/pandas/tests/generic/test_frame.py index 8fe49b2ec2299..631f484cfc22a 100644 --- a/pandas/tests/generic/test_frame.py +++ b/pandas/tests/generic/test_frame.py @@ -273,17 +273,13 @@ def test_to_xarray_index_types(self, index): assert isinstance(result, Dataset) # idempotency - # categoricals are not preserved # datetimes w/tz are preserved # column names are lost expected = df.copy() expected["f"] = expected["f"].astype(object) expected.columns.name = None tm.assert_frame_equal( - result.to_dataframe(), - expected, - check_index_type=False, - check_categorical=False, + result.to_dataframe(), expected, ) @td.skip_if_no("xarray", min_version="0.7.0") diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index 34f6a73812c97..2702d378fd153 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -13,8 +13,6 @@ from pandas.compat import is_platform_little_endian, is_platform_windows import pandas.util._test_decorators as td -from pandas.core.dtypes.common import is_categorical_dtype - import pandas as pd from pandas import ( Categorical, @@ -1057,18 +1055,7 @@ def test_latin_encoding(self, setup_path, dtype, val): s_nan = ser.replace(nan_rep, np.nan) - if is_categorical_dtype(s_nan): - assert is_categorical_dtype(retr) - tm.assert_series_equal( - s_nan, retr, check_dtype=False, check_categorical=False - ) - else: - tm.assert_series_equal(s_nan, retr) - - # FIXME: don't leave commented-out - # fails: - # for x in examples: - # roundtrip(s, nan_rep=b'\xf8\xfc') + tm.assert_series_equal(s_nan, retr) def test_append_some_nans(self, setup_path): diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 3efac9cd605a8..eaa92fa53d799 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -1026,7 +1026,14 @@ def test_categorical_with_stata_missing_values(self, version): original.to_stata(path, version=version) written_and_read_again = self.read_dta(path) res = written_and_read_again.set_index("index") - tm.assert_frame_equal(res, original, check_categorical=False) + + expected = original.copy() + for col in expected: + cat = expected[col]._values + new_cats = cat.remove_unused_categories().categories + cat = cat.set_categories(new_cats, ordered=True) + expected[col] = cat + tm.assert_frame_equal(res, expected) @pytest.mark.parametrize("file", ["dta19_115", "dta19_117"]) def test_categorical_order(self, file): @@ -1044,7 +1051,9 @@ def test_categorical_order(self, file): cols = [] for is_cat, col, labels, codes in expected: if is_cat: - cols.append((col, pd.Categorical.from_codes(codes, labels))) + cols.append( + (col, pd.Categorical.from_codes(codes, labels, ordered=True)) + ) else: cols.append((col, pd.Series(labels, dtype=np.float32))) expected = DataFrame.from_dict(dict(cols)) @@ -1052,7 +1061,7 @@ def test_categorical_order(self, file): # Read with and with out categoricals, ensure order is identical file = getattr(self, file) parsed = read_stata(file) - tm.assert_frame_equal(expected, parsed, check_categorical=False) + tm.assert_frame_equal(expected, parsed) # Check identity of codes for col in expected: @@ -1137,18 +1146,30 @@ def test_read_chunks_117( chunk = itr.read(chunksize) except StopIteration: break - from_frame = parsed.iloc[pos : pos + chunksize, :] + from_frame = parsed.iloc[pos : pos + chunksize, :].copy() + from_frame = self._convert_categorical(from_frame) tm.assert_frame_equal( - from_frame, - chunk, - check_dtype=False, - check_datetimelike_compat=True, - check_categorical=False, + from_frame, chunk, check_dtype=False, check_datetimelike_compat=True, ) pos += chunksize itr.close() + @staticmethod + def _convert_categorical(from_frame: DataFrame) -> DataFrame: + """ + Emulate the categorical casting behavior we expect from roundtripping. + """ + for col in from_frame: + ser = from_frame[col] + if is_categorical_dtype(ser.dtype): + cat = ser._values.remove_unused_categories() + if cat.categories.dtype == object: + categories = pd.Index(cat.categories._values) + cat = cat.set_categories(categories) + from_frame[col] = cat + return from_frame + def test_iterator(self): fname = self.dta3_117 @@ -1223,13 +1244,10 @@ def test_read_chunks_115( chunk = itr.read(chunksize) except StopIteration: break - from_frame = parsed.iloc[pos : pos + chunksize, :] + from_frame = parsed.iloc[pos : pos + chunksize, :].copy() + from_frame = self._convert_categorical(from_frame) tm.assert_frame_equal( - from_frame, - chunk, - check_dtype=False, - check_datetimelike_compat=True, - check_categorical=False, + from_frame, chunk, check_dtype=False, check_datetimelike_compat=True, ) pos += chunksize diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index d80e2e7afceef..51e6f80df657d 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2077,8 +2077,7 @@ def test_merge_equal_cat_dtypes(cat_dtype, reverse): } ).set_index("foo") - # Categorical is unordered, so don't check ordering. - tm.assert_frame_equal(result, expected, check_categorical=False) + tm.assert_frame_equal(result, expected) def test_merge_equal_cat_dtypes2(): @@ -2100,8 +2099,7 @@ def test_merge_equal_cat_dtypes2(): {"left": [1, 2], "right": [3, 2], "foo": Series(["a", "b"]).astype(cat_dtype)} ).set_index("foo") - # Categorical is unordered, so don't check ordering. - tm.assert_frame_equal(result, expected, check_categorical=False) + tm.assert_frame_equal(result, expected) def test_merge_on_cat_and_ext_array(): diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 80a024eda7848..31f17be2fac7b 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -296,18 +296,18 @@ def cmp(a, b): # array conversion tm.assert_almost_equal(np.array(s), np.array(s.values)) - # valid conversion - for valid in [ - lambda x: x.astype("category"), - lambda x: x.astype(CategoricalDtype()), - lambda x: x.astype("object").astype("category"), - lambda x: x.astype("object").astype(CategoricalDtype()), - ]: - - result = valid(s) - # compare series values - # internal .categories can't be compared because it is sorted - tm.assert_series_equal(result, s, check_categorical=False) + tm.assert_series_equal(s.astype("category"), s) + tm.assert_series_equal(s.astype(CategoricalDtype()), s) + + roundtrip_expected = s.cat.set_categories( + s.cat.categories.sort_values() + ).cat.remove_unused_categories() + tm.assert_series_equal( + s.astype("object").astype("category"), roundtrip_expected + ) + tm.assert_series_equal( + s.astype("object").astype(CategoricalDtype()), roundtrip_expected + ) # invalid conversion (these are NOT a dtype) msg = (