Skip to content

ENH: include conversion to nullable float in convert_dtypes() #38117

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Nov 29, 2020
32 changes: 31 additions & 1 deletion pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -1255,6 +1255,7 @@ def convert_dtypes(
convert_string: bool = True,
convert_integer: bool = True,
convert_boolean: bool = True,
convert_floating: bool = True,
) -> Dtype:
"""
Convert objects to best possible type, and optionally,
Expand All @@ -1269,14 +1270,20 @@ def convert_dtypes(
Whether, if possible, conversion can be done to integer extension types.
convert_boolean : bool, defaults True
Whether object dtypes should be converted to ``BooleanDtype()``.
convert_floating : bool, defaults True
Whether, if possible, conversion can be done to floating extension types.
If `convert_integer` is also True, preference will be given to integer
dtypes if the floats can be faithfully cast to integers.

Returns
-------
dtype
new dtype
"""
is_extension = is_extension_array_dtype(input_array.dtype)
if (convert_string or convert_integer or convert_boolean) and not is_extension:
if (
convert_string or convert_integer or convert_boolean or convert_floating
) and not is_extension:
try:
inferred_dtype = lib.infer_dtype(input_array)
except ValueError:
Expand Down Expand Up @@ -1304,6 +1311,29 @@ def convert_dtypes(
if is_integer_dtype(inferred_dtype):
inferred_dtype = input_array.dtype

if convert_floating:
if not is_integer_dtype(input_array.dtype) and is_numeric_dtype(
input_array.dtype
):
from pandas.core.arrays.floating import FLOAT_STR_TO_DTYPE

inferred_float_dtype = FLOAT_STR_TO_DTYPE.get(
input_array.dtype.name, "Float64"
)
# if we could also convert to integer, check if all floats
# are actually integers
if convert_integer:
arr = input_array[notna(input_array)]
if (arr.astype(int) == arr).all():
inferred_dtype = "Int64"
else:
inferred_dtype = inferred_float_dtype
else:
inferred_dtype = inferred_float_dtype
else:
if is_float_dtype(inferred_dtype):
inferred_dtype = input_array.dtype

if convert_boolean:
if is_bool_dtype(input_array.dtype):
inferred_dtype = "boolean"
Expand Down
41 changes: 31 additions & 10 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -6088,6 +6088,7 @@ def convert_dtypes(
convert_string: bool_t = True,
convert_integer: bool_t = True,
convert_boolean: bool_t = True,
convert_floating: bool_t = True,
) -> FrameOrSeries:
"""
Convert columns to best possible dtypes using dtypes supporting ``pd.NA``.
Expand All @@ -6104,6 +6105,12 @@ def convert_dtypes(
Whether, if possible, conversion can be done to integer extension types.
convert_boolean : bool, defaults True
Whether object dtypes should be converted to ``BooleanDtype()``.
convert_floating : bool, defaults True
Whether, if possible, conversion can be done to floating extension types.
If `convert_integer` is also True, preference will be given to integer
dtypes if the floats can be faithfully cast to integers.

.. versionadded:: 1.2.0

Returns
-------
Expand All @@ -6121,19 +6128,25 @@ def convert_dtypes(
-----
By default, ``convert_dtypes`` will attempt to convert a Series (or each
Series in a DataFrame) to dtypes that support ``pd.NA``. By using the options
``convert_string``, ``convert_integer``, and ``convert_boolean``, it is
possible to turn off individual conversions to ``StringDtype``, the integer
extension types or ``BooleanDtype``, respectively.
``convert_string``, ``convert_integer``, ``convert_boolean`` and
``convert_floating``, it is possible to turn off individual conversions
to ``StringDtype``, the integer extension types, ``BooleanDtype``
or floating extension types, respectively.

For object-dtyped columns, if ``infer_objects`` is ``True``, use the inference
rules as during normal Series/DataFrame construction. Then, if possible,
convert to ``StringDtype``, ``BooleanDtype`` or an appropriate integer extension
type, otherwise leave as ``object``.
convert to ``StringDtype``, ``BooleanDtype`` or an appropriate integer
or floating extension type, otherwise leave as ``object``.

If the dtype is integer, convert to an appropriate integer extension type.

If the dtype is numeric, and consists of all integers, convert to an
appropriate integer extension type.
appropriate integer extension type. Otherwise, convert to an
appropriate floating extension type.

.. versionchanged:: 1.2
Starting with pandas 1.2, this method also converts float columns
to the nullable floating extension type.

In the future, as new dtypes are added that support ``pd.NA``, the results
of this method will change to support those new dtypes.
Expand Down Expand Up @@ -6173,7 +6186,7 @@ def convert_dtypes(
>>> dfn = df.convert_dtypes()
>>> dfn
a b c d e f
0 1 x True h 10 NaN
0 1 x True h 10 <NA>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

might be worth mentioning here that this changed in 1.2

1 2 y False i <NA> 100.5
2 3 z <NA> <NA> 20 200.0

Expand All @@ -6183,7 +6196,7 @@ def convert_dtypes(
c boolean
d string
e Int64
f float64
f Float64
dtype: object

Start with a Series of strings and missing data represented by ``np.nan``.
Expand All @@ -6205,12 +6218,20 @@ def convert_dtypes(
"""
if self.ndim == 1:
return self._convert_dtypes(
infer_objects, convert_string, convert_integer, convert_boolean
infer_objects,
convert_string,
convert_integer,
convert_boolean,
convert_floating,
)
else:
results = [
col._convert_dtypes(
infer_objects, convert_string, convert_integer, convert_boolean
infer_objects,
convert_string,
convert_integer,
convert_boolean,
convert_floating,
)
for col_name, col in self.items()
]
Expand Down
9 changes: 7 additions & 2 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -4706,16 +4706,21 @@ def _convert_dtypes(
convert_string: bool = True,
convert_integer: bool = True,
convert_boolean: bool = True,
convert_floating: bool = True,
) -> "Series":
input_series = self
if infer_objects:
input_series = input_series.infer_objects()
if is_object_dtype(input_series):
input_series = input_series.copy()

if convert_string or convert_integer or convert_boolean:
if convert_string or convert_integer or convert_boolean or convert_floating:
inferred_dtype = convert_dtypes(
input_series._values, convert_string, convert_integer, convert_boolean
input_series._values,
convert_string,
convert_integer,
convert_boolean,
convert_floating,
)
try:
result = input_series.astype(inferred_dtype)
Expand Down
35 changes: 27 additions & 8 deletions pandas/tests/series/methods/test_convert_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,17 @@
[10, np.nan, 20],
np.dtype("float"),
"Int64",
{("convert_integer", False): np.dtype("float")},
{
("convert_integer", False, "convert_floating", True): "Float64",
("convert_integer", False, "convert_floating", False): np.dtype("float"),
},
),
(
[np.nan, 100.5, 200],
np.dtype("float"),
"Float64",
{("convert_floating", False): np.dtype("float")},
),
([np.nan, 100.5, 200], np.dtype("float"), np.dtype("float"), {}),
(
[3, 4, 5],
"Int8",
Expand All @@ -85,20 +93,30 @@
"Int8",
{("convert_integer", False): np.dtype("i1")},
),
(
[1.2, 1.3],
np.dtype("float32"),
"Float32",
{("convert_floating", False): np.dtype("float32")},
),
(
[1, 2.0],
object,
"Int64",
{
("convert_integer", False): np.dtype("float"),
("convert_integer", False): "Float64",
("convert_integer", False, "convert_floating", False): np.dtype("float"),
("infer_objects", False): np.dtype("object"),
},
),
(
[1, 2.5],
object,
np.dtype("float"),
{("infer_objects", False): np.dtype("object")},
"Float64",
{
("convert_floating", False): np.dtype("float"),
("infer_objects", False): np.dtype("object"),
},
),
(["a", "b"], pd.CategoricalDtype(), pd.CategoricalDtype(), {}),
(
Expand Down Expand Up @@ -134,7 +152,7 @@ class TestSeriesConvertDtypes:
"data, maindtype, expected_default, expected_other",
test_cases,
)
@pytest.mark.parametrize("params", product(*[(True, False)] * 4))
@pytest.mark.parametrize("params", product(*[(True, False)] * 5))
def test_convert_dtypes(
self, data, maindtype, params, expected_default, expected_other
):
Expand All @@ -150,12 +168,13 @@ def test_convert_dtypes(
"convert_string",
"convert_integer",
"convert_boolean",
"convert_floating",
]
params_dict = dict(zip(param_names, params))

expected_dtype = expected_default
for (key, val), dtype in expected_other.items():
if params_dict[key] is val:
for spec, dtype in expected_other.items():
if all(params_dict[key] is val for key, val in zip(spec[::2], spec[1::2])):
expected_dtype = dtype

expected = pd.Series(data, dtype=expected_dtype)
Expand Down