Skip to content

TST: tighten check_categorical=False tests #32636

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 12, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions pandas/_testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -824,10 +824,14 @@ def assert_categorical_equal(
left.codes, right.codes, check_dtype=check_dtype, obj=f"{obj}.codes",
)
else:
try:
lc = left.categories.sort_values()
rc = right.categories.sort_values()
except TypeError:
# e.g. '<' not supported between instances of 'int' and 'str'
lc, rc = left.categories, right.categories
assert_index_equal(
left.categories.sort_values(),
right.categories.sort_values(),
obj=f"{obj}.categories",
lc, rc, obj=f"{obj}.categories",
)
assert_index_equal(
left.categories.take(left.codes),
Expand Down
53 changes: 28 additions & 25 deletions pandas/tests/arrays/categorical/test_replace.py
Original file line number Diff line number Diff line change
@@ -1,48 +1,51 @@
import numpy as np
import pytest

import pandas as pd
import pandas._testing as tm


@pytest.mark.parametrize(
"to_replace,value,expected,check_types,check_categorical",
"to_replace,value,expected,flip_categories",
[
# one-to-one
(1, 2, [2, 2, 3], True, True),
(1, 4, [4, 2, 3], True, True),
(4, 1, [1, 2, 3], True, True),
(5, 6, [1, 2, 3], True, True),
(1, 2, [2, 2, 3], False),
(1, 4, [4, 2, 3], False),
(4, 1, [1, 2, 3], False),
(5, 6, [1, 2, 3], False),
# many-to-one
([1], 2, [2, 2, 3], True, True),
([1, 2], 3, [3, 3, 3], True, True),
([1, 2], 4, [4, 4, 3], True, True),
((1, 2, 4), 5, [5, 5, 3], True, True),
((5, 6), 2, [1, 2, 3], True, True),
([1], 2, [2, 2, 3], False),
([1, 2], 3, [3, 3, 3], False),
([1, 2], 4, [4, 4, 3], False),
((1, 2, 4), 5, [5, 5, 3], False),
((5, 6), 2, [1, 2, 3], False),
# many-to-many, handled outside of Categorical and results in separate dtype
([1], [2], [2, 2, 3], False, False),
([1, 4], [5, 2], [5, 2, 3], False, False),
([1], [2], [2, 2, 3], True),
([1, 4], [5, 2], [5, 2, 3], True),
# check_categorical sorts categories, which crashes on mixed dtypes
(3, "4", [1, 2, "4"], True, False),
([1, 2, "3"], "5", ["5", "5", 3], True, False),
(3, "4", [1, 2, "4"], False),
([1, 2, "3"], "5", ["5", "5", 3], True),
],
)
def test_replace(to_replace, value, expected, check_types, check_categorical):
def test_replace(to_replace, value, expected, flip_categories):
# GH 31720
stays_categorical = not isinstance(value, list)

s = pd.Series([1, 2, 3], dtype="category")
result = s.replace(to_replace, value)
expected = pd.Series(expected, dtype="category")
s.replace(to_replace, value, inplace=True)

if flip_categories:
expected = expected.cat.set_categories(expected.cat.categories[::-1])

if not stays_categorical:
# the replace call loses categorical dtype
expected = pd.Series(np.asarray(expected))

tm.assert_series_equal(
expected,
result,
check_dtype=check_types,
check_categorical=check_categorical,
check_category_order=False,
expected, result, check_category_order=False,
)
tm.assert_series_equal(
expected,
s,
check_dtype=check_types,
check_categorical=check_categorical,
check_category_order=False,
expected, s, check_category_order=False,
)
6 changes: 1 addition & 5 deletions pandas/tests/generic/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,17 +273,13 @@ def test_to_xarray_index_types(self, index):
assert isinstance(result, Dataset)

# idempotency
# categoricals are not preserved
# datetimes w/tz are preserved
# column names are lost
expected = df.copy()
expected["f"] = expected["f"].astype(object)
expected.columns.name = None
tm.assert_frame_equal(
result.to_dataframe(),
expected,
check_index_type=False,
check_categorical=False,
result.to_dataframe(), expected,
)

@td.skip_if_no("xarray", min_version="0.7.0")
Expand Down
15 changes: 1 addition & 14 deletions pandas/tests/io/pytables/test_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,6 @@
from pandas.compat import is_platform_little_endian, is_platform_windows
import pandas.util._test_decorators as td

from pandas.core.dtypes.common import is_categorical_dtype

import pandas as pd
from pandas import (
Categorical,
Expand Down Expand Up @@ -1057,18 +1055,7 @@ def test_latin_encoding(self, setup_path, dtype, val):

s_nan = ser.replace(nan_rep, np.nan)

if is_categorical_dtype(s_nan):
assert is_categorical_dtype(retr)
tm.assert_series_equal(
s_nan, retr, check_dtype=False, check_categorical=False
)
else:
tm.assert_series_equal(s_nan, retr)

# FIXME: don't leave commented-out
# fails:
# for x in examples:
# roundtrip(s, nan_rep=b'\xf8\xfc')
tm.assert_series_equal(s_nan, retr)

def test_append_some_nans(self, setup_path):

Expand Down
48 changes: 33 additions & 15 deletions pandas/tests/io/test_stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -1026,7 +1026,14 @@ def test_categorical_with_stata_missing_values(self, version):
original.to_stata(path, version=version)
written_and_read_again = self.read_dta(path)
res = written_and_read_again.set_index("index")
tm.assert_frame_equal(res, original, check_categorical=False)

expected = original.copy()
for col in expected:
cat = expected[col]._values
new_cats = cat.remove_unused_categories().categories
cat = cat.set_categories(new_cats, ordered=True)
expected[col] = cat
tm.assert_frame_equal(res, expected)

@pytest.mark.parametrize("file", ["dta19_115", "dta19_117"])
def test_categorical_order(self, file):
Expand All @@ -1044,15 +1051,17 @@ def test_categorical_order(self, file):
cols = []
for is_cat, col, labels, codes in expected:
if is_cat:
cols.append((col, pd.Categorical.from_codes(codes, labels)))
cols.append(
(col, pd.Categorical.from_codes(codes, labels, ordered=True))
)
else:
cols.append((col, pd.Series(labels, dtype=np.float32)))
expected = DataFrame.from_dict(dict(cols))

# Read with and without categoricals, ensure order is identical
file = getattr(self, file)
parsed = read_stata(file)
tm.assert_frame_equal(expected, parsed, check_categorical=False)
tm.assert_frame_equal(expected, parsed)

# Check identity of codes
for col in expected:
Expand Down Expand Up @@ -1137,18 +1146,30 @@ def test_read_chunks_117(
chunk = itr.read(chunksize)
except StopIteration:
break
from_frame = parsed.iloc[pos : pos + chunksize, :]
from_frame = parsed.iloc[pos : pos + chunksize, :].copy()
from_frame = self._convert_categorical(from_frame)
tm.assert_frame_equal(
from_frame,
chunk,
check_dtype=False,
check_datetimelike_compat=True,
check_categorical=False,
from_frame, chunk, check_dtype=False, check_datetimelike_compat=True,
)

pos += chunksize
itr.close()

@staticmethod
def _convert_categorical(from_frame: DataFrame) -> DataFrame:
    """
    Emulate the categorical casting behavior we expect from roundtripping.

    Every categorical column is rewritten with its unused categories
    dropped.  When the remaining categories have object dtype, they are
    additionally rebuilt as a plain ``pd.Index`` from the underlying values.
    Non-categorical columns are left untouched.

    Parameters
    ----------
    from_frame : DataFrame
        Frame to normalize.  It is modified in place, so pass a copy if the
        original must be preserved.

    Returns
    -------
    DataFrame
        The same ``from_frame`` object, after conversion.
    """
    for col in from_frame:
        ser = from_frame[col]
        # isinstance check rather than ``is_categorical_dtype``, which is
        # deprecated in newer pandas releases; equivalent for a Series dtype.
        if not isinstance(ser.dtype, pd.CategoricalDtype):
            continue

        cat = ser._values.remove_unused_categories()
        if cat.categories.dtype == object:
            # Re-wrap the raw values so the categories are a plain Index.
            categories = pd.Index(cat.categories._values)
            cat = cat.set_categories(categories)
        from_frame[col] = cat
    return from_frame

def test_iterator(self):

fname = self.dta3_117
Expand Down Expand Up @@ -1223,13 +1244,10 @@ def test_read_chunks_115(
chunk = itr.read(chunksize)
except StopIteration:
break
from_frame = parsed.iloc[pos : pos + chunksize, :]
from_frame = parsed.iloc[pos : pos + chunksize, :].copy()
from_frame = self._convert_categorical(from_frame)
tm.assert_frame_equal(
from_frame,
chunk,
check_dtype=False,
check_datetimelike_compat=True,
check_categorical=False,
from_frame, chunk, check_dtype=False, check_datetimelike_compat=True,
)

pos += chunksize
Expand Down
6 changes: 2 additions & 4 deletions pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -2077,8 +2077,7 @@ def test_merge_equal_cat_dtypes(cat_dtype, reverse):
}
).set_index("foo")

# Categorical is unordered, so don't check ordering.
tm.assert_frame_equal(result, expected, check_categorical=False)
tm.assert_frame_equal(result, expected)


def test_merge_equal_cat_dtypes2():
Expand All @@ -2100,8 +2099,7 @@ def test_merge_equal_cat_dtypes2():
{"left": [1, 2], "right": [3, 2], "foo": Series(["a", "b"]).astype(cat_dtype)}
).set_index("foo")

# Categorical is unordered, so don't check ordering.
tm.assert_frame_equal(result, expected, check_categorical=False)
tm.assert_frame_equal(result, expected)


def test_merge_on_cat_and_ext_array():
Expand Down
24 changes: 12 additions & 12 deletions pandas/tests/series/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,18 +296,18 @@ def cmp(a, b):
# array conversion
tm.assert_almost_equal(np.array(s), np.array(s.values))

# valid conversion
for valid in [
lambda x: x.astype("category"),
lambda x: x.astype(CategoricalDtype()),
lambda x: x.astype("object").astype("category"),
lambda x: x.astype("object").astype(CategoricalDtype()),
]:

result = valid(s)
# compare series values
# internal .categories can't be compared because it is sorted
tm.assert_series_equal(result, s, check_categorical=False)
tm.assert_series_equal(s.astype("category"), s)
tm.assert_series_equal(s.astype(CategoricalDtype()), s)

roundtrip_expected = s.cat.set_categories(
s.cat.categories.sort_values()
).cat.remove_unused_categories()
tm.assert_series_equal(
s.astype("object").astype("category"), roundtrip_expected
)
tm.assert_series_equal(
s.astype("object").astype(CategoricalDtype()), roundtrip_expected
)

# invalid conversion (these are NOT a dtype)
msg = (
Expand Down