diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index 8a915324a72c1..d44667b258414 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -8,272 +8,167 @@ import pandas as pd import pandas._testing as tm +# Each test case consists of a tuple with the data and dtype to create the +# test Series, the default dtype for the expected result (which is valid +# for most cases), and the specific cases where the result deviates from +# this default. Those overrides are defined as a dict with (keyword, val) as +# dictionary key. In case of multiple items, the last override takes precedence. +test_cases = [ + ( + # data + [1, 2, 3], + # original dtype + np.dtype("int32"), + # default expected dtype + "Int32", + # exceptions on expected dtype + {("convert_integer", False): np.dtype("int32")}, + ), + ( + [1, 2, 3], + np.dtype("int64"), + "Int64", + {("convert_integer", False): np.dtype("int64")}, + ), + ( + ["x", "y", "z"], + np.dtype("O"), + pd.StringDtype(), + {("convert_string", False): np.dtype("O")}, + ), + ( + [True, False, np.nan], + np.dtype("O"), + pd.BooleanDtype(), + {("convert_boolean", False): np.dtype("O")}, + ), + ( + ["h", "i", np.nan], + np.dtype("O"), + pd.StringDtype(), + {("convert_string", False): np.dtype("O")}, + ), + ( # GH32117 + ["h", "i", 1], + np.dtype("O"), + np.dtype("O"), + {}, + ), + ( + [10, np.nan, 20], + np.dtype("float"), + "Int64", + {("convert_integer", False): np.dtype("float")}, + ), + ([np.nan, 100.5, 200], np.dtype("float"), np.dtype("float"), {}), + ( + [3, 4, 5], + "Int8", + "Int8", + {}, + ), + ( + [[1, 2], [3, 4], [5]], + None, + np.dtype("O"), + {}, + ), + ( + [4, 5, 6], + np.dtype("uint32"), + "UInt32", + {("convert_integer", False): np.dtype("uint32")}, + ), + ( + [-10, 12, 13], + np.dtype("i1"), + "Int8", + {("convert_integer", False): np.dtype("i1")}, + ), + ( + [1, 2.0], + object, + 
"Int64", + { + ("convert_integer", False): np.dtype("float"), + ("infer_objects", False): np.dtype("object"), + }, + ), + ( + [1, 2.5], + object, + np.dtype("float"), + {("infer_objects", False): np.dtype("object")}, + ), + (["a", "b"], pd.CategoricalDtype(), pd.CategoricalDtype(), {}), + ( + pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), + pd.DatetimeTZDtype(tz="UTC"), + pd.DatetimeTZDtype(tz="UTC"), + {}, + ), + ( + pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), + "datetime64[ns]", + np.dtype("datetime64[ns]"), + {}, + ), + ( + pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), + object, + np.dtype("datetime64[ns]"), + {("infer_objects", False): np.dtype("object")}, + ), + (pd.period_range("1/1/2011", freq="M", periods=3), None, pd.PeriodDtype("M"), {}), + ( + pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]), + None, + pd.IntervalDtype("int64"), + {}, + ), +] + class TestSeriesConvertDtypes: - # The answerdict has keys that have 4 tuples, corresponding to the arguments - # infer_objects, convert_string, convert_integer, convert_boolean - # This allows all 16 possible combinations to be tested. 
Since common - # combinations expect the same answer, this provides an easy way to list - # all the possibilities @pytest.mark.parametrize( - "data, maindtype, answerdict", - [ - ( - [1, 2, 3], - np.dtype("int32"), - { - ((True, False), (True, False), (True,), (True, False)): "Int32", - ((True, False), (True, False), (False,), (True, False)): np.dtype( - "int32" - ), - }, - ), - ( - [1, 2, 3], - np.dtype("int64"), - { - ((True, False), (True, False), (True,), (True, False)): "Int64", - ((True, False), (True, False), (False,), (True, False)): np.dtype( - "int64" - ), - }, - ), - ( - ["x", "y", "z"], - np.dtype("O"), - { - ( - (True, False), - (True,), - (True, False), - (True, False), - ): pd.StringDtype(), - ((True, False), (False,), (True, False), (True, False)): np.dtype( - "O" - ), - }, - ), - ( - [True, False, np.nan], - np.dtype("O"), - { - ( - (True, False), - (True, False), - (True, False), - (True,), - ): pd.BooleanDtype(), - ((True, False), (True, False), (True, False), (False,)): np.dtype( - "O" - ), - }, - ), - ( - ["h", "i", np.nan], - np.dtype("O"), - { - ( - (True, False), - (True,), - (True, False), - (True, False), - ): pd.StringDtype(), - ((True, False), (False,), (True, False), (True, False)): np.dtype( - "O" - ), - }, - ), - ( # GH32117 - ["h", "i", 1], - np.dtype("O"), - { - ( - (True, False), - (True, False), - (True, False), - (True, False), - ): np.dtype("O"), - }, - ), - ( - [10, np.nan, 20], - np.dtype("float"), - { - ((True, False), (True, False), (True,), (True, False)): "Int64", - ((True, False), (True, False), (False,), (True, False)): np.dtype( - "float" - ), - }, - ), - ( - [np.nan, 100.5, 200], - np.dtype("float"), - { - ( - (True, False), - (True, False), - (True, False), - (True, False), - ): np.dtype("float"), - }, - ), - ( - [3, 4, 5], - "Int8", - {((True, False), (True, False), (True, False), (True, False)): "Int8"}, - ), - ( - [[1, 2], [3, 4], [5]], - None, - { - ( - (True, False), - (True, False), - (True, False), - (True, 
False), - ): np.dtype("O"), - }, - ), - ( - [4, 5, 6], - np.dtype("uint32"), - { - ((True, False), (True, False), (True,), (True, False)): "UInt32", - ((True, False), (True, False), (False,), (True, False)): np.dtype( - "uint32" - ), - }, - ), - ( - [-10, 12, 13], - np.dtype("i1"), - { - ((True, False), (True, False), (True,), (True, False)): "Int8", - ((True, False), (True, False), (False,), (True, False)): np.dtype( - "i1" - ), - }, - ), - ( - [1, 2.0], - object, - { - ((True,), (True, False), (True,), (True, False)): "Int64", - ((True,), (True, False), (False,), (True, False)): np.dtype( - "float" - ), - ((False,), (True, False), (True, False), (True, False)): np.dtype( - "object" - ), - }, - ), - ( - [1, 2.5], - object, - { - ((True,), (True, False), (True, False), (True, False)): np.dtype( - "float" - ), - ((False,), (True, False), (True, False), (True, False)): np.dtype( - "object" - ), - }, - ), - ( - ["a", "b"], - pd.CategoricalDtype(), - { - ( - (True, False), - (True, False), - (True, False), - (True, False), - ): pd.CategoricalDtype(), - }, - ), - ( - pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), - pd.DatetimeTZDtype(tz="UTC"), - { - ( - (True, False), - (True, False), - (True, False), - (True, False), - ): pd.DatetimeTZDtype(tz="UTC"), - }, - ), - ( - pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), - "datetime64[ns]", - { - ( - (True, False), - (True, False), - (True, False), - (True, False), - ): np.dtype("datetime64[ns]"), - }, - ), - ( - pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), - object, - { - ((True,), (True, False), (True, False), (True, False)): np.dtype( - "datetime64[ns]" - ), - ((False,), (True, False), (True, False), (True, False)): np.dtype( - "O" - ), - }, - ), - ( - pd.period_range("1/1/2011", freq="M", periods=3), - None, - { - ( - (True, False), - (True, False), - (True, False), - (True, False), - ): pd.PeriodDtype("M"), - }, - ), - ( - pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]), 
- None, - { - ( - (True, False), - (True, False), - (True, False), - (True, False), - ): pd.IntervalDtype("int64"), - }, - ), - ], + "data, maindtype, expected_default, expected_other", + test_cases, ) @pytest.mark.parametrize("params", product(*[(True, False)] * 4)) - def test_convert_dtypes(self, data, maindtype, params, answerdict): + def test_convert_dtypes( + self, data, maindtype, params, expected_default, expected_other + ): if maindtype is not None: series = pd.Series(data, dtype=maindtype) else: series = pd.Series(data) - answers = {k: a for (kk, a) in answerdict.items() for k in product(*kk)} - ns = series.convert_dtypes(*params) - expected_dtype = answers[tuple(params)] - expected = pd.Series(series.values, dtype=expected_dtype) - tm.assert_series_equal(ns, expected) + result = series.convert_dtypes(*params) + + param_names = [ + "infer_objects", + "convert_string", + "convert_integer", + "convert_boolean", + ] + params_dict = dict(zip(param_names, params)) + + expected_dtype = expected_default + for (key, val), dtype in expected_other.items(): + if params_dict[key] is val: + expected_dtype = dtype + + expected = pd.Series(data, dtype=expected_dtype) + tm.assert_series_equal(result, expected) # Test that it is a copy copy = series.copy(deep=True) - if is_interval_dtype(ns.dtype) and ns.dtype.subtype.kind in ["i", "u"]: + if is_interval_dtype(result.dtype) and result.dtype.subtype.kind in ["i", "u"]: msg = "Cannot set float NaN to integer-backed IntervalArray" with pytest.raises(ValueError, match=msg): - ns[ns.notna()] = np.nan + result[result.notna()] = np.nan else: - ns[ns.notna()] = np.nan + result[result.notna()] = np.nan # Make sure original not changed tm.assert_series_equal(series, copy)