From d4ea4a28ce7254fb42128139ea1404d62ca21362 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 30 Dec 2022 16:37:38 +0100 Subject: [PATCH 1/5] ENH: Add use_nullable_dtypes to to_numeric --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/core/tools/numeric.py | 38 ++++++++++++++++++++---- pandas/tests/tools/test_to_numeric.py | 42 +++++++++++++++++++++++++++ 3 files changed, 75 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index b1387e9717079..b818357727d9e 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -42,6 +42,7 @@ The ``use_nullable_dtypes`` keyword argument has been expanded to the following * :func:`read_sql` * :func:`read_sql_query` * :func:`read_sql_table` +* :func:`to_numeric` Additionally a new global configuration, ``mode.dtype_backend`` can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in the following functions to select the nullable dtypes implementation. diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index da47aa549dfa3..80da4c3a92c50 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -13,6 +13,7 @@ from pandas.core.dtypes.cast import maybe_downcast_numeric from pandas.core.dtypes.common import ( ensure_object, + is_bool_dtype, is_datetime_or_timedelta_dtype, is_decimal, is_integer_dtype, @@ -34,6 +35,7 @@ def to_numeric( arg, errors: DateTimeErrorChoices = "raise", downcast: Literal["integer", "signed", "unsigned", "float"] | None = None, + use_nullable_dtypes: bool = False, ): """ Convert argument to a numeric type. @@ -47,7 +49,7 @@ def to_numeric( numbers smaller than `-9223372036854775808` (np.iinfo(np.int64).min) or larger than `18446744073709551615` (np.iinfo(np.uint64).max) are passed in, it is very likely they will be converted to float so that - they can stored in an `ndarray`. These warnings apply similarly to + they can be stored in an `ndarray`. These warnings apply similarly to `Series` since it internally leverages `ndarray`. Parameters @@ -78,6 +80,10 @@ def to_numeric( the dtype it is to be cast to, so if none of the dtypes checked satisfy that specification, no downcasting will be performed on the data. + use_nullable_dtypes : bool = False + Whether or not to use nullable dtypes as default when converting data. If + set to True, nullable dtypes are used for all dtypes that have a nullable + implementation, even if no nulls are present. Returns ------- @@ -183,6 +189,7 @@ def to_numeric( values = values._data[~mask] values_dtype = getattr(values, "dtype", None) + new_mask: np.ndarray | None = None if is_numeric_dtype(values_dtype): pass elif is_datetime_or_timedelta_dtype(values_dtype): @@ -191,13 +198,23 @@ def to_numeric( values = ensure_object(values) coerce_numeric = errors not in ("ignore", "raise") try: - values, _ = lib.maybe_convert_numeric( - values, set(), coerce_numeric=coerce_numeric + values, new_mask = lib.maybe_convert_numeric( + values, + set(), + coerce_numeric=coerce_numeric, + convert_to_masked_nullable=use_nullable_dtypes, ) except (ValueError, TypeError): if errors == "raise": raise + if new_mask is not None: + # Remove unnecessary values, is expected later anyway and enables + # downcasting + values = values[~new_mask] + elif use_nullable_dtypes and new_mask is None: + new_mask = np.zeros(values.shape, dtype=np.bool_) + # attempt downcast only if the data has been successfully converted # to a numerical dtype and if a downcast method has been specified if downcast is not None and is_numeric_dtype(values.dtype): @@ -228,17 +245,26 @@ def to_numeric( if values.dtype == dtype: break - # GH33013: for IntegerArray & FloatingArray need to reconstruct masked array - if mask is not None: + # GH33013: for IntegerArray, BooleanArray & FloatingArray need to reconstruct + # masked array + if mask is not None or new_mask is not None: + if mask is None: + mask = new_mask data = np.zeros(mask.shape, dtype=values.dtype) data[~mask] = values from pandas.core.arrays import ( + BooleanArray, FloatingArray, IntegerArray, ) - klass = IntegerArray if is_integer_dtype(data.dtype) else FloatingArray + if is_integer_dtype(data.dtype): + klass = IntegerArray + elif is_bool_dtype(data.dtype): + klass = BooleanArray + else: + klass = FloatingArray values = klass(data, mask.copy()) if is_series: diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py index 1347f6eb50b09..93f6011608fba 100644 --- a/pandas/tests/tools/test_to_numeric.py +++ b/pandas/tests/tools/test_to_numeric.py @@ -807,3 +807,45 @@ def test_to_numeric_large_float_not_downcast_to_float_32(val): expected = Series([val]) result = to_numeric(expected, downcast="float") tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "val, dtype", [(1, "Int64"), (1.5, "Float64"), (True, "boolean")] +) +def test_to_numeric_use_nullable_dtypes(val, dtype): + # GH# + ser = Series([val], dtype=object) + result = to_numeric(ser, use_nullable_dtypes=True) + expected = Series([val], dtype=dtype) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "val, dtype", [(1, "Int64"), (1.5, "Float64"), (True, "boolean")] +) +def test_to_numeric_use_nullable_dtypes_na(val, dtype): + # GH# + ser = Series([val, None], dtype=object) + result = to_numeric(ser, use_nullable_dtypes=True) + expected = Series([val, pd.NA], dtype=dtype) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "val, dtype, downcast", + [(1, "Int8", "integer"), (1.5, "Float32", "float"), (1, "Int8", "signed")], +) +def test_to_numeric_use_nullable_dtypes_downcasting(val, dtype, downcast): + # GH# + ser = Series([val, None], dtype=object) + result = to_numeric(ser, use_nullable_dtypes=True, downcast=downcast) + expected = Series([val, pd.NA], dtype=dtype) + tm.assert_series_equal(result, expected) + + +def test_to_numeric_use_nullable_dtypes_downcasting_uint(): + # GH# + ser = Series([1, pd.NA], dtype="UInt64") + result = to_numeric(ser, use_nullable_dtypes=True, downcast="unsigned") + expected = Series([1, pd.NA], dtype="UInt8") + tm.assert_series_equal(result, expected) From cebccbe8588054a87fea56421387af7666a0396f Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 30 Dec 2022 16:39:00 +0100 Subject: [PATCH 2/5] Add gh ref --- pandas/tests/tools/test_to_numeric.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py index 93f6011608fba..5f1f734cc82d4 100644 --- a/pandas/tests/tools/test_to_numeric.py +++ b/pandas/tests/tools/test_to_numeric.py @@ -813,7 +813,7 @@ def test_to_numeric_large_float_not_downcast_to_float_32(val): "val, dtype", [(1, "Int64"), (1.5, "Float64"), (True, "boolean")] ) def test_to_numeric_use_nullable_dtypes(val, dtype): - # GH# + # GH#50505 ser = Series([val], dtype=object) result = to_numeric(ser, use_nullable_dtypes=True) expected = Series([val], dtype=dtype) @@ -824,7 +824,7 @@ def test_to_numeric_use_nullable_dtypes(val, dtype): "val, dtype", [(1, "Int64"), (1.5, "Float64"), (True, "boolean")] ) def test_to_numeric_use_nullable_dtypes_na(val, dtype): - # GH# + # GH#50505 ser = Series([val, None], dtype=object) result = to_numeric(ser, use_nullable_dtypes=True) expected = Series([val, pd.NA], dtype=dtype) @@ -836,7 +836,7 @@ def test_to_numeric_use_nullable_dtypes_na(val, dtype): [(1, "Int8", "integer"), (1.5, "Float32", "float"), (1, "Int8", "signed")], ) def test_to_numeric_use_nullable_dtypes_downcasting(val, dtype, downcast): - # GH# + # GH#50505 ser = Series([val, None], dtype=object) result = to_numeric(ser, use_nullable_dtypes=True, downcast=downcast) expected = Series([val, pd.NA], dtype=dtype) @@ -844,7 +844,7 @@ def test_to_numeric_use_nullable_dtypes_downcasting(val, dtype, downcast): def test_to_numeric_use_nullable_dtypes_downcasting_uint(): - # GH# + # GH#50505 ser = Series([1, pd.NA], dtype="UInt64") result = to_numeric(ser, use_nullable_dtypes=True, downcast="unsigned") expected = Series([1, pd.NA], dtype="UInt8") From 91df80219ffdff03b8ef1a532b9b659f9435092e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 30 Dec 2022 18:03:12 +0100 Subject: [PATCH 3/5] Fix typing --- pandas/core/tools/numeric.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index 80da4c3a92c50..de1a49870ea24 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -198,7 +198,7 @@ def to_numeric( values = ensure_object(values) coerce_numeric = errors not in ("ignore", "raise") try: - values, new_mask = lib.maybe_convert_numeric( + values, new_mask = lib.maybe_convert_numeric( # type: ignore[call-overload] values, set(), coerce_numeric=coerce_numeric, @@ -250,6 +250,7 @@ def to_numeric( if mask is not None or new_mask is not None: if mask is None: mask = new_mask + assert isinstance(mask, np.ndarray) data = np.zeros(mask.shape, dtype=values.dtype) data[~mask] = values @@ -259,6 +260,7 @@ def to_numeric( IntegerArray, ) + klass: type[IntegerArray] | type[BooleanArray] | type[FloatingArray] if is_integer_dtype(data.dtype): klass = IntegerArray elif is_bool_dtype(data.dtype): From c4c69f0d8a2f83fede850d4df61d06384c3b5a94 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 4 Jan 2023 21:51:45 +0100 Subject: [PATCH 4/5] Fix bugs --- pandas/_libs/lib.pyx | 2 ++ pandas/core/tools/numeric.py | 3 ++- pandas/tests/tools/test_to_numeric.py | 18 ++++++++++++++++++ 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index bc7b876cb5de8..7fd055e0f739b 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2243,6 +2243,7 @@ def maybe_convert_numeric( if convert_empty or seen.coerce_numeric: seen.saw_null() floats[i] = complexes[i] = NaN + mask[i] = 1 else: raise ValueError("Empty string encountered") elif util.is_complex_object(val): @@ -2299,6 +2300,7 @@ def maybe_convert_numeric( seen.saw_null() floats[i] = NaN + mask[i] = 1 if seen.check_uint64_conflict(): return (values, None) diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index de1a49870ea24..9d8f14e101945 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -19,6 +19,7 @@ is_integer_dtype, is_number, is_numeric_dtype, + is_object_dtype, is_scalar, needs_i8_conversion, ) @@ -247,7 +248,7 @@ def to_numeric( # GH33013: for IntegerArray, BooleanArray & FloatingArray need to reconstruct # masked array - if mask is not None or new_mask is not None: + if (mask is not None or new_mask is not None) and not is_object_dtype(values.dtype): if mask is None: mask = new_mask assert isinstance(mask, np.ndarray) diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py index 5f1f734cc82d4..ed29b2cf6919c 100644 --- a/pandas/tests/tools/test_to_numeric.py +++ b/pandas/tests/tools/test_to_numeric.py @@ -849,3 +849,21 @@ def test_to_numeric_use_nullable_dtypes_downcasting_uint(): result = to_numeric(ser, use_nullable_dtypes=True, downcast="unsigned") expected = Series([1, pd.NA], dtype="UInt8") tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "use_nullable_dtypes, dtype", [(True, "Float64"), (False, "float64")] +) +def test_to_numeric_use_nullable_dtypes_error(use_nullable_dtypes, dtype): + # GH#50505 + ser = Series(["a", "b", ""]) + expected = ser.copy() + with pytest.raises(ValueError, match="Unable to parse string"): + to_numeric(ser, use_nullable_dtypes=use_nullable_dtypes) + + result = to_numeric(ser, use_nullable_dtypes=use_nullable_dtypes, errors="ignore") + tm.assert_series_equal(result, expected) + + result = to_numeric(ser, use_nullable_dtypes=use_nullable_dtypes, errors="coerce") + expected = Series([np.nan, np.nan, np.nan], dtype=dtype) + tm.assert_series_equal(result, expected) From 92f20e15996644ce173ea05cb15f3986f23254d2 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 6 Jan 2023 19:57:40 +0100 Subject: [PATCH 5/5] Use array equal fast --- pandas/core/tools/numeric.py | 8 +++++--- pandas/tests/tools/test_to_numeric.py | 9 +++++++++ 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index 9d8f14e101945..a8ae8c47b0d19 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -29,7 +29,7 @@ ) import pandas as pd -from pandas.core.arrays.numeric import NumericArray +from pandas.core.arrays import BaseMaskedArray def to_numeric( @@ -185,7 +185,7 @@ def to_numeric( # GH33013: for IntegerArray & FloatingArray extract non-null values for casting # save mask to reconstruct the full array after casting mask: npt.NDArray[np.bool_] | None = None - if isinstance(values, NumericArray): + if isinstance(values, BaseMaskedArray): mask = values._mask values = values._data[~mask] @@ -251,6 +251,8 @@ def to_numeric( if (mask is not None or new_mask is not None) and not is_object_dtype(values.dtype): if mask is None: mask = new_mask + else: + mask = mask.copy() assert isinstance(mask, np.ndarray) data = np.zeros(mask.shape, dtype=values.dtype) data[~mask] = values @@ -268,7 +270,7 @@ def to_numeric( klass = BooleanArray else: klass = FloatingArray - values = klass(data, mask.copy()) + values = klass(data, mask) if is_series: return arg._constructor(values, index=arg.index, name=arg.name) diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py index ed29b2cf6919c..1c0a8301d65cc 100644 --- a/pandas/tests/tools/test_to_numeric.py +++ b/pandas/tests/tools/test_to_numeric.py @@ -851,6 +851,15 @@ def test_to_numeric_use_nullable_dtypes_downcasting_uint(): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("dtype", ["Int64", "UInt64", "Float64", "boolean"]) +def test_to_numeric_use_nullable_dtypes_already_nullable(dtype): + # GH#50505 + ser = Series([1, pd.NA], dtype=dtype) + result = to_numeric(ser, use_nullable_dtypes=True) + expected = Series([1, pd.NA], dtype=dtype) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( "use_nullable_dtypes, dtype", [(True, "Float64"), (False, "float64")] )