From d36fb98ddba74631812f113050c0a6f945868051 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Thu, 7 Dec 2023 03:30:57 +0100
Subject: [PATCH 01/63] Use DeprecationWarning instead of FutureWarning for is_.._dtype deprecations (#55703)

---
 pandas/core/dtypes/common.py             | 28 ++++++++++++------------
 pandas/tests/dtypes/test_common.py       | 22 ++++++++++---------
 pandas/tests/dtypes/test_dtypes.py       | 20 ++++++++---------
 pandas/tests/dtypes/test_inference.py    |  4 ++--
 pandas/tests/io/test_parquet.py          |  5 ++++-
 pandas/tests/io/test_sql.py              |  4 ++--
 pandas/tests/series/test_constructors.py | 13 +++--------
 7 files changed, 47 insertions(+), 49 deletions(-)

diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
index 3d12e334e7c0f..2245359fd8eac 100644
--- a/pandas/core/dtypes/common.py
+++ b/pandas/core/dtypes/common.py
@@ -211,8 +211,8 @@ def is_sparse(arr) -> bool:
     warnings.warn(
         "is_sparse is deprecated and will be removed in a future "
         "version. Check `isinstance(dtype, pd.SparseDtype)` instead.",
-        FutureWarning,
-        stacklevel=find_stack_level(),
+        DeprecationWarning,
+        stacklevel=2,
     )
 
     dtype = getattr(arr, "dtype", arr)
@@ -329,8 +329,8 @@ def is_datetime64tz_dtype(arr_or_dtype) -> bool:
     warnings.warn(
         "is_datetime64tz_dtype is deprecated and will be removed in a future "
         "version. Check `isinstance(dtype, pd.DatetimeTZDtype)` instead.",
-        FutureWarning,
-        stacklevel=find_stack_level(),
+        DeprecationWarning,
+        stacklevel=2,
     )
     if isinstance(arr_or_dtype, DatetimeTZDtype):
         # GH#33400 fastpath for dtype object
@@ -408,8 +408,8 @@ def is_period_dtype(arr_or_dtype) -> bool:
     warnings.warn(
         "is_period_dtype is deprecated and will be removed in a future version. "
         "Use `isinstance(dtype, pd.PeriodDtype)` instead",
-        FutureWarning,
-        stacklevel=find_stack_level(),
+        DeprecationWarning,
+        stacklevel=2,
     )
     if isinstance(arr_or_dtype, ExtensionDtype):
         # GH#33400 fastpath for dtype object
@@ -454,8 +454,8 @@ def is_interval_dtype(arr_or_dtype) -> bool:
     warnings.warn(
         "is_interval_dtype is deprecated and will be removed in a future version. "
         "Use `isinstance(dtype, pd.IntervalDtype)` instead",
-        FutureWarning,
-        stacklevel=find_stack_level(),
+        DeprecationWarning,
+        stacklevel=2,
    )
     if isinstance(arr_or_dtype, ExtensionDtype):
         # GH#33400 fastpath for dtype object
@@ -499,8 +499,8 @@ def is_categorical_dtype(arr_or_dtype) -> bool:
     warnings.warn(
         "is_categorical_dtype is deprecated and will be removed in a future "
         "version. Use isinstance(dtype, pd.CategoricalDtype) instead",
-        FutureWarning,
-        stacklevel=find_stack_level(),
+        DeprecationWarning,
+        stacklevel=2,
     )
     if isinstance(arr_or_dtype, ExtensionDtype):
         # GH#33400 fastpath for dtype object
@@ -838,8 +838,8 @@ def is_int64_dtype(arr_or_dtype) -> bool:
     warnings.warn(
         "is_int64_dtype is deprecated and will be removed in a future "
         "version. Use dtype == np.int64 instead.",
-        FutureWarning,
-        stacklevel=find_stack_level(),
+        DeprecationWarning,
+        stacklevel=2,
     )
     return _is_dtype_type(arr_or_dtype, classes(np.int64))
@@ -1241,8 +1241,8 @@ def is_bool_dtype(arr_or_dtype) -> bool:
                 "The behavior of is_bool_dtype with an object-dtype Index "
                 "of bool objects is deprecated. In a future version, "
                 "this will return False. Cast the Index to a bool dtype instead.",
-                FutureWarning,
-                stacklevel=find_stack_level(),
+                DeprecationWarning,
+                stacklevel=2,
             )
             return True
         return False
diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py
index 42043f95a7ace..c34c97b6e4f04 100644
--- a/pandas/tests/dtypes/test_common.py
+++ b/pandas/tests/dtypes/test_common.py
@@ -164,7 +164,9 @@ def get_is_dtype_funcs():
     return [getattr(com, fname) for fname in fnames]
 
 
-@pytest.mark.filterwarnings("ignore:is_categorical_dtype is deprecated:FutureWarning")
+@pytest.mark.filterwarnings(
+    "ignore:is_categorical_dtype is deprecated:DeprecationWarning"
+)
 @pytest.mark.parametrize("func", get_is_dtype_funcs(), ids=lambda x: x.__name__)
 def test_get_dtype_error_catch(func):
     # see gh-15941
@@ -180,7 +182,7 @@ def test_get_dtype_error_catch(func):
         or func is com.is_categorical_dtype
         or func is com.is_period_dtype
     ):
-        warn = FutureWarning
+        warn = DeprecationWarning
 
     with tm.assert_produces_warning(warn, match=msg):
         assert not func(None)
@@ -200,7 +202,7 @@ def test_is_object():
 )
 def test_is_sparse(check_scipy):
     msg = "is_sparse is deprecated"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
+    with tm.assert_produces_warning(DeprecationWarning, match=msg):
         assert com.is_sparse(SparseArray([1, 2, 3]))
 
         assert not com.is_sparse(np.array([1, 2, 3]))
@@ -230,7 +232,7 @@ def test_is_datetime64_dtype():
 
 def test_is_datetime64tz_dtype():
     msg = "is_datetime64tz_dtype is deprecated"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
+    with tm.assert_produces_warning(DeprecationWarning, match=msg):
         assert not com.is_datetime64tz_dtype(object)
         assert not com.is_datetime64tz_dtype([1, 2, 3])
         assert not com.is_datetime64tz_dtype(pd.DatetimeIndex([1, 2, 3]))
@@ -246,7 +248,7 @@ def kind(self) -> str:
 
     not_tz_dtype = NotTZDtype()
     msg = "is_datetime64tz_dtype is deprecated"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
+    with tm.assert_produces_warning(DeprecationWarning, match=msg):
         assert not com.is_datetime64tz_dtype(not_tz_dtype)
         assert not com.needs_i8_conversion(not_tz_dtype)
@@ -268,7 +270,7 @@ def test_is_timedelta64_dtype():
 
 def test_is_period_dtype():
     msg = "is_period_dtype is deprecated"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
+    with tm.assert_produces_warning(DeprecationWarning, match=msg):
         assert not com.is_period_dtype(object)
         assert not com.is_period_dtype([1, 2, 3])
        assert not com.is_period_dtype(pd.Period("2017-01-01"))
@@ -279,7 +281,7 @@ def test_is_period_dtype():
 
 def test_is_interval_dtype():
     msg = "is_interval_dtype is deprecated"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
+    with tm.assert_produces_warning(DeprecationWarning, match=msg):
         assert not com.is_interval_dtype(object)
         assert not com.is_interval_dtype([1, 2, 3])
@@ -292,7 +294,7 @@ def test_is_interval_dtype():
 
 def test_is_categorical_dtype():
     msg = "is_categorical_dtype is deprecated"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
+    with tm.assert_produces_warning(DeprecationWarning, match=msg):
         assert not com.is_categorical_dtype(object)
         assert not com.is_categorical_dtype([1, 2, 3])
@@ -442,7 +444,7 @@ def test_is_not_unsigned_integer_dtype(dtype):
 )
 def test_is_int64_dtype(dtype):
     msg = "is_int64_dtype is deprecated"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
+    with tm.assert_produces_warning(DeprecationWarning, match=msg):
         assert com.is_int64_dtype(dtype)
@@ -480,7 +482,7 @@ def test_type_comparison_with_signed_int_ea_dtype_and_signed_int_numpy_dtype(
 )
 def test_is_not_int64_dtype(dtype):
     msg = "is_int64_dtype is deprecated"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
+    with tm.assert_produces_warning(DeprecationWarning, match=msg):
         assert not com.is_int64_dtype(dtype)
diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py
index 4e8f375b31674..0dad0b05303ad 100644
--- a/pandas/tests/dtypes/test_dtypes.py
+++ b/pandas/tests/dtypes/test_dtypes.py
@@ -166,7 +166,7 @@ def test_is_dtype(self, dtype):
 
     def test_basic(self, dtype):
         msg = "is_categorical_dtype is deprecated"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
+        with tm.assert_produces_warning(DeprecationWarning, match=msg):
             assert is_categorical_dtype(dtype)
 
             factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"])
@@ -292,7 +292,7 @@ def test_subclass(self):
 
     def test_compat(self, dtype):
         msg = "is_datetime64tz_dtype is deprecated"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
+        with tm.assert_produces_warning(DeprecationWarning, match=msg):
             assert is_datetime64tz_dtype(dtype)
             assert is_datetime64tz_dtype("datetime64[ns, US/Eastern]")
             assert is_datetime64_any_dtype(dtype)
@@ -353,14 +353,14 @@ def test_equality(self, dtype):
 
     def test_basic(self, dtype):
         msg = "is_datetime64tz_dtype is deprecated"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
+        with tm.assert_produces_warning(DeprecationWarning, match=msg):
             assert is_datetime64tz_dtype(dtype)
 
         dr = date_range("20130101", periods=3, tz="US/Eastern")
         s = Series(dr, name="A")
 
         # dtypes
-        with tm.assert_produces_warning(FutureWarning, match=msg):
+        with tm.assert_produces_warning(DeprecationWarning, match=msg):
             assert is_datetime64tz_dtype(s.dtype)
             assert is_datetime64tz_dtype(s)
             assert not is_datetime64tz_dtype(np.dtype("float64"))
@@ -531,7 +531,7 @@ def test_equality(self, dtype):
 
     def test_basic(self, dtype):
         msg = "is_period_dtype is deprecated"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
+        with tm.assert_produces_warning(DeprecationWarning, match=msg):
             assert is_period_dtype(dtype)
 
             pidx = pd.period_range("2013-01-01 09:00", periods=5, freq="h")
@@ -619,7 +619,7 @@ def test_construction(self, subtype):
         i = IntervalDtype(subtype, closed="right")
         assert i.subtype == np.dtype("int64")
         msg = "is_interval_dtype is deprecated"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
+        with tm.assert_produces_warning(DeprecationWarning, match=msg):
             assert is_interval_dtype(i)
 
     @pytest.mark.parametrize(
@@ -642,7 +642,7 @@ def test_construction_generic(self, subtype):
         i = IntervalDtype(subtype)
         assert i.subtype is None
         msg = "is_interval_dtype is deprecated"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
+        with tm.assert_produces_warning(DeprecationWarning, match=msg):
             assert is_interval_dtype(i)
 
     @pytest.mark.parametrize(
@@ -815,7 +815,7 @@ def test_name_repr_generic(self, subtype):
 
     def test_basic(self, dtype):
         msg = "is_interval_dtype is deprecated"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
+        with tm.assert_produces_warning(DeprecationWarning, match=msg):
             assert is_interval_dtype(dtype)
 
             ii = IntervalIndex.from_breaks(range(3))
@@ -830,7 +830,7 @@ def test_basic(self, dtype):
 
     def test_basic_dtype(self):
         msg = "is_interval_dtype is deprecated"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
+        with tm.assert_produces_warning(DeprecationWarning, match=msg):
             assert is_interval_dtype("interval[int64, both]")
             assert is_interval_dtype(IntervalIndex.from_tuples([(0, 1)]))
             assert is_interval_dtype(IntervalIndex.from_breaks(np.arange(4)))
@@ -1178,7 +1178,7 @@ def test_is_dtype_no_warning(check):
         or check is is_datetime64tz_dtype
         or check is is_period_dtype
     ):
-        warn = FutureWarning
+        warn = DeprecationWarning
 
     with tm.assert_produces_warning(warn, match=msg):
         check(data)
diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py
index 32c8def669c21..ff2cfc1278331 100644
--- a/pandas/tests/dtypes/test_inference.py
+++ b/pandas/tests/dtypes/test_inference.py
@@ -1833,7 +1833,7 @@ def test_is_datetime_dtypes(self):
         assert is_datetime64_any_dtype(ts)
         assert is_datetime64_any_dtype(tsa)
 
-        with tm.assert_produces_warning(FutureWarning, match=msg):
+        with tm.assert_produces_warning(DeprecationWarning, match=msg):
             assert not is_datetime64tz_dtype("datetime64")
             assert not is_datetime64tz_dtype("datetime64[ns]")
             assert not is_datetime64tz_dtype(ts)
@@ -1845,7 +1845,7 @@ def test_is_datetime_dtypes_with_tz(self, tz):
         assert not is_datetime64_dtype(dtype)
 
         msg = "is_datetime64tz_dtype is deprecated"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
+        with tm.assert_produces_warning(DeprecationWarning, match=msg):
             assert is_datetime64tz_dtype(dtype)
         assert is_datetime64_ns_dtype(dtype)
         assert is_datetime64_any_dtype(dtype)
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 9ae2d5cb03afd..ad7cdad363e78 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -364,7 +364,10 @@ def test_parquet_pos_args_deprecation(engine):
     )
     with tm.ensure_clean() as path:
         with tm.assert_produces_warning(
-            FutureWarning, match=msg, check_stacklevel=False
+            FutureWarning,
+            match=msg,
+            check_stacklevel=False,
+            raise_on_extra_warnings=False,
         ):
             df.to_parquet(path, engine)
diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py
index e20c49c072515..e3272e5f5902d 100644
--- a/pandas/tests/io/test_sql.py
+++ b/pandas/tests/io/test_sql.py
@@ -1018,7 +1018,7 @@ def test_dataframe_to_sql_arrow_dtypes(conn, request):
     if conn == "sqlite_adbc_conn":
         df = df.drop(columns=["timedelta"])
         if pa_version_under14p1:
-            exp_warning = FutureWarning
+            exp_warning = DeprecationWarning
             msg = "is_sparse is deprecated"
         else:
             exp_warning = None
@@ -1885,7 +1885,7 @@ def test_api_timedelta(conn, request):
 
     if "adbc" in conn_name:
         if pa_version_under14p1:
-            exp_warning = FutureWarning
+            exp_warning = DeprecationWarning
         else:
             exp_warning = None
     else:
diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
index 502096d41dde2..0e6f1c284a988 100644
--- a/pandas/tests/series/test_constructors.py
+++ b/pandas/tests/series/test_constructors.py
@@ -17,7 +17,6 @@
 from pandas.errors import IntCastingNaNError
 import pandas.util._test_decorators as td
 
-from pandas.core.dtypes.common import is_categorical_dtype
 from pandas.core.dtypes.dtypes import CategoricalDtype
 
 import pandas as pd
@@ -396,18 +395,12 @@ def test_constructor_categorical(self):
 
     def test_construct_from_categorical_with_dtype(self):
         # GH12574
-        cat = Series(Categorical([1, 2, 3]), dtype="category")
-        msg = "is_categorical_dtype is deprecated"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            assert is_categorical_dtype(cat)
-            assert is_categorical_dtype(cat.dtype)
+        ser = Series(Categorical([1, 2, 3]), dtype="category")
+        assert isinstance(ser.dtype, CategoricalDtype)
 
     def test_construct_intlist_values_category_dtype(self):
         ser = Series([1, 2, 3], dtype="category")
-        msg = "is_categorical_dtype is deprecated"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            assert is_categorical_dtype(ser)
-            assert is_categorical_dtype(ser.dtype)
+        assert isinstance(ser.dtype, CategoricalDtype)
 
     def test_constructor_categorical_with_coercion(self):
         factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"])

From 0d853e7707bdc6995e0cb058fc34bd042a6e9ec5 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 6 Dec 2023 19:57:39 -1000
Subject: [PATCH 02/63] BUG: Series.__mul__ for pyarrow strings (#56368)

* BUG: Series.__mul__ for pyarrow strings

* Fix existing tests

* Another test
---
 doc/source/whatsnew/v2.2.0.rst             |  1 +
 pandas/core/arrays/arrow/array.py          | 26 +++++++++++++---------
 pandas/tests/arrays/string_/test_string.py |  7 +-----
 pandas/tests/extension/test_arrow.py       | 26 +++++++++++++++++++++-
 4 files changed, 43 insertions(+), 17 deletions(-)

diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
index 67b4052b386c0..c878fd2664dc4 100644
--- a/doc/source/whatsnew/v2.2.0.rst
+++ b/doc/source/whatsnew/v2.2.0.rst
@@ -576,6 +576,7 @@ Strings
 ^^^^^^^
 - Bug in :func:`pandas.api.types.is_string_dtype` while checking object array with no elements is of the string dtype (:issue:`54661`)
 - Bug in :meth:`DataFrame.apply` failing when ``engine="numba"`` and columns or index have ``StringDtype`` (:issue:`56189`)
+- Bug in :meth:`Series.__mul__` for :class:`ArrowDtype` with ``pyarrow.string`` dtype and ``string[pyarrow]`` for the pyarrow backend (:issue:`51970`)
 - Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for ``string[pyarrow]`` (:issue:`54942`)
 
 Interval
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 1609bf50a834a..e7a50dbba9935 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -668,16 +668,22 @@ def _evaluate_op_method(self, other, op, arrow_funcs):
         pa_type = self._pa_array.type
         other = self._box_pa(other)
 
-        if (pa.types.is_string(pa_type) or pa.types.is_binary(pa_type)) and op in [
-            operator.add,
-            roperator.radd,
-        ]:
-            sep = pa.scalar("", type=pa_type)
-            if op is operator.add:
-                result = pc.binary_join_element_wise(self._pa_array, other, sep)
-            else:
-                result = pc.binary_join_element_wise(other, self._pa_array, sep)
-            return type(self)(result)
+        if pa.types.is_string(pa_type) or pa.types.is_binary(pa_type):
+            if op in [operator.add, roperator.radd, operator.mul, roperator.rmul]:
+                sep = pa.scalar("", type=pa_type)
+                if op is operator.add:
+                    result = pc.binary_join_element_wise(self._pa_array, other, sep)
+                elif op is roperator.radd:
+                    result = pc.binary_join_element_wise(other, self._pa_array, sep)
+                else:
+                    if not (
+                        isinstance(other, pa.Scalar) and pa.types.is_integer(other.type)
+                    ):
+                        raise TypeError("Can only string multiply by an integer.")
+                    result = pc.binary_join_element_wise(
+                        *([self._pa_array] * other.as_py()), sep
+                    )
+                return type(self)(result)
 
         if (
             isinstance(other, pa.Scalar)
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index 524a6632e5544..3e11062b8384e 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -176,12 +176,7 @@ def test_add_sequence(dtype):
     tm.assert_extension_array_equal(result, expected)
 
 
-def test_mul(dtype, request, arrow_string_storage):
-    if dtype.storage in arrow_string_storage:
-        reason = "unsupported operand type(s) for *: 'ArrowStringArray' and 'int'"
-        mark = pytest.mark.xfail(raises=NotImplementedError, reason=reason)
-        request.applymarker(mark)
-
+def test_mul(dtype):
     a = pd.array(["a", "b", None], dtype=dtype)
     result = a * 2
     expected = pd.array(["aa", "bb", None], dtype=dtype)
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index 9e70a59932701..3ce3cee9714e4 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -965,8 +965,16 @@ def _get_arith_xfail_marker(self, opname, pa_dtype):
     def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request):
         pa_dtype = data.dtype.pyarrow_dtype
 
-        if all_arithmetic_operators == "__rmod__" and (pa.types.is_binary(pa_dtype)):
+        if all_arithmetic_operators == "__rmod__" and pa.types.is_binary(pa_dtype):
             pytest.skip("Skip testing Python string formatting")
+        elif all_arithmetic_operators in ("__rmul__", "__mul__") and (
+            pa.types.is_binary(pa_dtype) or pa.types.is_string(pa_dtype)
+        ):
+            request.applymarker(
+                pytest.mark.xfail(
+                    raises=TypeError, reason="Can only string multiply by an integer."
+                )
+            )
 
         mark = self._get_arith_xfail_marker(all_arithmetic_operators, pa_dtype)
         if mark is not None:
@@ -981,6 +989,14 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request):
             pa.types.is_string(pa_dtype) or pa.types.is_binary(pa_dtype)
         ):
             pytest.skip("Skip testing Python string formatting")
+        elif all_arithmetic_operators in ("__rmul__", "__mul__") and (
+            pa.types.is_binary(pa_dtype) or pa.types.is_string(pa_dtype)
+        ):
+            request.applymarker(
+                pytest.mark.xfail(
+                    raises=TypeError, reason="Can only string multiply by an integer."
+                )
+            )
 
         mark = self._get_arith_xfail_marker(all_arithmetic_operators, pa_dtype)
         if mark is not None:
@@ -1004,6 +1020,14 @@ def test_arith_series_with_array(self, data, all_arithmetic_operators, request):
                 ),
             )
         )
+        elif all_arithmetic_operators in ("__rmul__", "__mul__") and (
+            pa.types.is_binary(pa_dtype) or pa.types.is_string(pa_dtype)
+        ):
+            request.applymarker(
+                pytest.mark.xfail(
+                    raises=TypeError, reason="Can only string multiply by an integer."
+                )
+            )
 
         mark = self._get_arith_xfail_marker(all_arithmetic_operators, pa_dtype)
         if mark is not None:

From 2fc264aeb0b906168eab462ba7183aa6fa51da7e Mon Sep 17 00:00:00 2001
From: Luke Manley
Date: Thu, 7 Dec 2023 12:15:34 -0500
Subject: [PATCH 03/63] CLN/TYP: "how" parameter in merge ops (#56372)

cleanup how parameter in merge ops
---
 pandas/core/reshape/merge.py | 30 ++++++++++--------------------
 1 file changed, 10 insertions(+), 20 deletions(-)

diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
index f8575b1b53908..0756b25adedcd 100644
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -718,7 +718,7 @@ class _MergeOperation:
     """
 
     _merge_type = "merge"
-    how: MergeHow | Literal["asof"]
+    how: JoinHow | Literal["asof"]
     on: IndexLabel | None
     # left_on/right_on may be None when passed, but in validate_specification
     # get replaced with non-None.
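The narrowed annotation is safe because a ``how="cross"`` merge never reaches the join-indexer machinery: pandas rewrites it as an inner join on a temporary constant key before ``_MergeOperation`` runs, which is also why the ``"cross"`` branches are deleted below. A rough sketch of the relationship, assuming the aliases defined in ``pandas._typing``; the ``normalize_how`` helper is illustrative only and not part of the patch:

    from typing import Literal, Union

    JoinHow = Literal["left", "right", "inner", "outer"]
    MergeHow = Union[JoinHow, Literal["cross"]]  # the user-facing set of merge kinds

    def normalize_how(how: MergeHow) -> JoinHow:
        # A cross join is implemented as an inner join on a constant dummy key,
        # so only the four plain join kinds ever reach get_join_indexers.
        return "inner" if how == "cross" else how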
@@ -739,7 +739,7 @@ def __init__(
         self,
         left: DataFrame | Series,
         right: DataFrame | Series,
-        how: MergeHow | Literal["asof"] = "inner",
+        how: JoinHow | Literal["asof"] = "inner",
         on: IndexLabel | AnyArrayLike | None = None,
         left_on: IndexLabel | AnyArrayLike | None = None,
         right_on: IndexLabel | AnyArrayLike | None = None,
@@ -1106,6 +1106,8 @@ def _maybe_add_join_keys(
 
     def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
         """return the join indexers"""
+        # make mypy happy
+        assert self.how != "asof"
         return get_join_indexers(
             self.left_join_keys, self.right_join_keys, sort=self.sort, how=self.how
         )
@@ -1114,8 +1116,6 @@ def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]
     def _get_join_info(
         self,
     ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
-        # make mypy happy
-        assert self.how != "cross"
         left_ax = self.left.index
         right_ax = self.right.index
@@ -1658,7 +1658,7 @@ def get_join_indexers(
     left_keys: list[ArrayLike],
     right_keys: list[ArrayLike],
     sort: bool = False,
-    how: MergeHow | Literal["asof"] = "inner",
+    how: JoinHow = "inner",
 ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
     """
 
@@ -1684,12 +1684,12 @@ def get_join_indexers(
     left_n = len(left_keys[0])
     right_n = len(right_keys[0])
     if left_n == 0:
-        if how in ["left", "inner", "cross"]:
+        if how in ["left", "inner"]:
             return _get_empty_indexer()
         elif not sort and how in ["right", "outer"]:
             return _get_no_sort_one_missing_indexer(right_n, True)
     elif right_n == 0:
-        if how in ["right", "inner", "cross"]:
+        if how in ["right", "inner"]:
             return _get_empty_indexer()
         elif not sort and how in ["left", "outer"]:
             return _get_no_sort_one_missing_indexer(left_n, False)
@@ -1699,7 +1699,7 @@ def get_join_indexers(
 
     # get left & right join labels and num. of levels at each location
     mapped = (
-        _factorize_keys(left_keys[n], right_keys[n], sort=sort, how=how)
+        _factorize_keys(left_keys[n], right_keys[n], sort=sort)
         for n in range(len(left_keys))
     )
     zipped = zip(*mapped)
@@ -1712,7 +1712,7 @@ def get_join_indexers(
     # `count` is the num. of unique keys
     # set(lkey) | set(rkey) == range(count)
-    lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort, how=how)
+    lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort)
 
     # preserve left frame order if how == 'left' and sort == False
     kwargs = {}
     if how in ("inner", "left", "right"):
@@ -2166,7 +2166,6 @@ def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]
                 left_join_keys[n],
                 right_join_keys[n],
                 sort=False,
-                how="left",
             )
             for n in range(len(left_join_keys))
         ]
@@ -2310,10 +2309,7 @@ def _left_join_on_index(
 
 
 def _factorize_keys(
-    lk: ArrayLike,
-    rk: ArrayLike,
-    sort: bool = True,
-    how: MergeHow | Literal["asof"] = "inner",
+    lk: ArrayLike, rk: ArrayLike, sort: bool = True
 ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
     """
     Encode left and right keys as enumerated types.
@@ -2329,8 +2325,6 @@ def _factorize_keys(
     sort : bool, defaults to True
         If True, the encoding is done such that the unique elements in the
         keys are sorted.
-    how : {'left', 'right', 'outer', 'inner'}, default 'inner'
-        Type of merge.
 
     Returns
     -------
@@ -2419,8 +2413,6 @@ def _factorize_keys(
         )
         if dc.null_count > 0:
             count += 1
-        if how == "right":
-            return rlab, llab, count
         return llab, rlab, count
 
     if not isinstance(lk, BaseMaskedArray) and not (
@@ -2491,8 +2483,6 @@ def _factorize_keys(
         np.putmask(rlab, rmask, count)
         count += 1
 
-    if how == "right":
-        return rlab, llab, count
     return llab, rlab, count
 

From 9f51d4fb481e0cbee97e0100d39c52d3ff5207e9 Mon Sep 17 00:00:00 2001
From: Xiao Yuan
Date: Fri, 8 Dec 2023 02:04:12 +0800
Subject: [PATCH 04/63] BUG: Fix `Timestamp('now')` and `Timestamp.now` unit inconsistency (#56281)

* BUG: Fix `Timestamp('now')` and `Timestamp.now` unit inconsistency

* Move return

* Update doc/source/whatsnew/v2.1.4.rst

Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>

* Resolve

* not use tuple in cython for perf

---------

Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
---
 doc/source/whatsnew/v2.1.4.rst                     | 1 +
 pandas/_libs/tslibs/conversion.pyx                 | 4 ++--
 pandas/tests/scalar/timestamp/test_constructors.py | 7 +++++++
 pandas/tests/tools/test_to_datetime.py             | 4 ++--
 4 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst
index 723c33280a679..4f96bf967ad99 100644
--- a/doc/source/whatsnew/v2.1.4.rst
+++ b/doc/source/whatsnew/v2.1.4.rst
@@ -23,6 +23,7 @@ Bug fixes
 ~~~~~~~~~
 - Bug in :class:`Series` constructor raising DeprecationWarning when ``index`` is a list of :class:`Series` (:issue:`55228`)
 - Bug in :class:`Series` when trying to cast date-like string inputs to :class:`ArrowDtype` of ``pyarrow.timestamp`` (:issue:`56266`)
+- Bug in :class:`Timestamp` construction with ``ts_input="now"`` or ``ts_input="today"`` giving a different unit from :meth:`Timestamp.now` or :meth:`Timestamp.today` (:issue:`55879`)
 - Bug in :meth:`Index.__getitem__` returning wrong result for Arrow dtypes and negative stepsize (:issue:`55832`)
 - Fixed bug in :func:`to_numeric` converting to extension dtype for ``string[pyarrow_numpy]`` dtype (:issue:`56179`)
 - Fixed bug in :meth:`.DataFrameGroupBy.min` and :meth:`.DataFrameGroupBy.max` not preserving extension dtype for empty object (:issue:`55619`)
diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx
index 8cca5598d0e06..4def5e2c9340e 100644
--- a/pandas/_libs/tslibs/conversion.pyx
+++ b/pandas/_libs/tslibs/conversion.pyx
@@ -599,11 +599,13 @@ cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz,
             # Issue 9000, we short-circuit rather than going
             # into np_datetime_strings which returns utc
             dt = datetime.now(tz)
+            return convert_datetime_to_tsobject(dt, tz, nanos=0, reso=NPY_FR_us)
         elif ts == "today":
             # Issue 9000, we short-circuit rather than going
             # into np_datetime_strings which returns a normalized datetime
             dt = datetime.now(tz)
             # equiv: datetime.today().replace(tzinfo=tz)
+            return convert_datetime_to_tsobject(dt, tz, nanos=0, reso=NPY_FR_us)
         else:
             string_to_dts_failed = string_to_dts(
                 ts, &dts, &out_bestunit, &out_local,
@@ -647,8 +649,6 @@ cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz,
             reso = get_supported_reso(out_bestunit)
             return convert_datetime_to_tsobject(dt, tz, nanos=nanos, reso=reso)
 
-    return convert_datetime_to_tsobject(dt, tz)
-
 
 cdef check_overflows(_TSObject obj, NPY_DATETIMEUNIT reso=NPY_FR_ns):
     """
diff --git a/pandas/tests/scalar/timestamp/test_constructors.py b/pandas/tests/scalar/timestamp/test_constructors.py
index 91314a497b1fb..98e4d581dc104 100644
--- a/pandas/tests/scalar/timestamp/test_constructors.py
+++ b/pandas/tests/scalar/timestamp/test_constructors.py
@@ -464,6 +464,13 @@ def test_constructor_str_infer_reso(self):
         ts = Timestamp("2020-01-01 00+00:00")
         assert ts.unit == "s"
 
+    @pytest.mark.parametrize("method", ["now", "today"])
+    def test_now_today_unit(self, method):
+        # GH#55879
+        ts_from_method = getattr(Timestamp, method)()
+        ts_from_string = Timestamp(method)
+        assert ts_from_method.unit == ts_from_string.unit == "us"
+
 
 class TestTimestampConstructors:
     def test_weekday_but_no_day_raises(self):
diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py
index f74fe459eb4d6..de5d67e6bd25f 100644
--- a/pandas/tests/tools/test_to_datetime.py
+++ b/pandas/tests/tools/test_to_datetime.py
@@ -1040,7 +1040,7 @@ def test_to_datetime_now(self):
         # See GH#18666
         with tm.set_timezone("US/Eastern"):
             # GH#18705
-            now = Timestamp("now")
+            now = Timestamp("now").as_unit("ns")
             pdnow = to_datetime("now")
             pdnow2 = to_datetime(["now"])[0]
 
@@ -1066,7 +1066,7 @@ def test_to_datetime_today(self, tz):
             pdtoday = to_datetime("today")
             pdtoday2 = to_datetime(["today"])[0]
 
-            tstoday = Timestamp("today")
+            tstoday = Timestamp("today").as_unit("ns")
             tstoday2 = Timestamp.today().as_unit("ns")
 
             # These should all be equal with infinite perf; this gives

From 4e2cb22464f63896aa520d225d5708cf30db15c9 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Thu, 7 Dec 2023 11:15:43 -0800
Subject: [PATCH 05/63] TYP: require_matching_freq (#56374)

* TYP: require_matching_freq

* mypy fixup
---
 pandas/_libs/tslibs/period.pyi |  2 +-
 pandas/_libs/tslibs/period.pyx | 17 ++++++-----------
 pandas/core/arrays/period.py   |  9 +++++++--
 3 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/pandas/_libs/tslibs/period.pyi b/pandas/_libs/tslibs/period.pyi
index 846d238beadbd..df6ce675b07fc 100644
--- a/pandas/_libs/tslibs/period.pyi
+++ b/pandas/_libs/tslibs/period.pyi
@@ -63,7 +63,7 @@ class PeriodMixin:
     def end_time(self) -> Timestamp: ...
     @property
     def start_time(self) -> Timestamp: ...
-    def _require_matching_freq(self, other, base: bool = ...) -> None: ...
+    def _require_matching_freq(self, other: BaseOffset, base: bool = ...) -> None: ...
 
 class Period(PeriodMixin):
     ordinal: int  # int64_t
diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx
index 22f96f8f6c3fa..6b105f5974f9b 100644
--- a/pandas/_libs/tslibs/period.pyx
+++ b/pandas/_libs/tslibs/period.pyx
@@ -1714,21 +1714,16 @@ cdef class PeriodMixin:
         """
         return self.to_timestamp(how="end")
 
-    def _require_matching_freq(self, other, base=False):
+    def _require_matching_freq(self, other: BaseOffset, bint base=False):
         # See also arrays.period.raise_on_incompatible
-        if is_offset_object(other):
-            other_freq = other
-        else:
-            other_freq = other.freq
-
         if base:
-            condition = self.freq.base != other_freq.base
+            condition = self.freq.base != other.base
         else:
-            condition = self.freq != other_freq
+            condition = self.freq != other
 
         if condition:
             freqstr = freq_to_period_freqstr(self.freq.n, self.freq.name)
-            other_freqstr = freq_to_period_freqstr(other_freq.n, other_freq.name)
+            other_freqstr = freq_to_period_freqstr(other.n, other.name)
             msg = DIFFERENT_FREQ.format(
                 cls=type(self).__name__,
                 own_freq=freqstr,
@@ -1803,7 +1798,7 @@ cdef class _Period(PeriodMixin):
                 return False
             elif op == Py_NE:
                 return True
-            self._require_matching_freq(other)
+            self._require_matching_freq(other.freq)
             return PyObject_RichCompareBool(self.ordinal, other.ordinal, op)
         elif other is NaT:
             return op == Py_NE
@@ -1893,7 +1888,7 @@ cdef class _Period(PeriodMixin):
         ):
             return self + (-other)
         elif is_period_object(other):
-            self._require_matching_freq(other)
+            self._require_matching_freq(other.freq)
             # GH 23915 - mul by base freq since __add__ is agnostic of n
             return (self.ordinal - other.ordinal) * self.freq.base
         elif other is NaT:
diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py
index a8c21cfbb6e2f..5f267a9f816e7 100644
--- a/pandas/core/arrays/period.py
+++ b/pandas/core/arrays/period.py
@@ -370,10 +370,15 @@ def _unbox_scalar(  # type: ignore[override]
     def _scalar_from_string(self, value: str) -> Period:
         return Period(value, freq=self.freq)
 
-    def _check_compatible_with(self, other) -> None:
+    # error: Argument 1 of "_check_compatible_with" is incompatible with
+    # supertype "DatetimeLikeArrayMixin"; supertype defines the argument type
+    # as "Period | Timestamp | Timedelta | NaTType"
+    def _check_compatible_with(self, other: Period | NaTType | PeriodArray) -> None:  # type: ignore[override]
         if other is NaT:
             return
-        self._require_matching_freq(other)
+        # error: Item "NaTType" of "Period | NaTType | PeriodArray" has no
+        # attribute "freq"
+        self._require_matching_freq(other.freq)  # type: ignore[union-attr]
 
     # --------------------------------------------------------------------
     # Data / Attributes

From c369d93cf7c82455bd3dd98233c7c09d8c567025 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Thu, 7 Dec 2023 20:17:10 +0100
Subject: [PATCH 06/63] BUG: ne comparison returns False for NA and other value (#56123)

* BUG: ne comparison returns False for NA and other value

* Fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 doc/source/whatsnew/v2.1.4.rst             |  1 +
 pandas/core/arrays/string_arrow.py         |  6 ++++-
 pandas/tests/arithmetic/test_object.py     | 29 ++++++++++++++--------
 pandas/tests/arrays/string_/test_string.py | 26 ++++++++++++++-----
 4 files changed, 44 insertions(+), 18 deletions(-)

diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst
index 4f96bf967ad99..ee2d8efdbcc6b 100644
--- a/doc/source/whatsnew/v2.1.4.rst
+++ b/doc/source/whatsnew/v2.1.4.rst
@@ -30,6 +30,7 @@ Bug fixes
 - Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`)
 - Fixed bug in :meth:`DataFrame.to_hdf` raising when columns have ``StringDtype`` (:issue:`55088`)
 - Fixed bug in :meth:`Index.insert` casting object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`)
+- Fixed bug in :meth:`Series.__ne__` resulting in False for comparison between ``NA`` and string value for ``dtype="string[pyarrow_numpy]"`` (:issue:`56122`)
 - Fixed bug in :meth:`Series.mode` not keeping object dtype when ``infer_string`` is set (:issue:`56183`)
 - Fixed bug in :meth:`Series.str.split` and :meth:`Series.str.rsplit` when ``pat=None`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56271`)
 - Fixed bug in :meth:`Series.str.translate` losing object dtype when string option is set (:issue:`56152`)
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 96ebb4901f797..976a8d3c32b23 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 from functools import partial
+import operator
 import re
 from typing import (
     TYPE_CHECKING,
@@ -663,7 +664,10 @@ def _convert_int_dtype(self, result):
 
     def _cmp_method(self, other, op):
         result = super()._cmp_method(other, op)
-        return result.to_numpy(np.bool_, na_value=False)
+        if op == operator.ne:
+            return result.to_numpy(np.bool_, na_value=True)
+        else:
+            return result.to_numpy(np.bool_, na_value=False)
 
     def value_counts(self, dropna: bool = True) -> Series:
         from pandas import Series
diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py
index 7d27f940daa4c..4ffd76722286a 100644
--- a/pandas/tests/arithmetic/test_object.py
+++ b/pandas/tests/arithmetic/test_object.py
@@ -10,10 +10,13 @@
 
 from pandas._config import using_pyarrow_string_dtype
 
+import pandas.util._test_decorators as td
+
 import pandas as pd
 from pandas import (
     Series,
     Timestamp,
+    option_context,
 )
 import pandas._testing as tm
 from pandas.core import ops
@@ -33,20 +36,24 @@ def test_comparison_object_numeric_nas(self, comparison_op):
         expected = func(ser.astype(float), shifted.astype(float))
         tm.assert_series_equal(result, expected)
 
-    def test_object_comparisons(self):
-        ser = Series(["a", "b", np.nan, "c", "a"])
+    @pytest.mark.parametrize(
+        "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
+    )
+    def test_object_comparisons(self, infer_string):
+        with option_context("future.infer_string", infer_string):
+            ser = Series(["a", "b", np.nan, "c", "a"])
 
-        result = ser == "a"
-        expected = Series([True, False, False, False, True])
-        tm.assert_series_equal(result, expected)
+            result = ser == "a"
+            expected = Series([True, False, False, False, True])
+            tm.assert_series_equal(result, expected)
 
-        result = ser < "a"
-        expected = Series([False, False, False, False, False])
-        tm.assert_series_equal(result, expected)
+            result = ser < "a"
+            expected = Series([False, False, False, False, False])
+            tm.assert_series_equal(result, expected)
 
-        result = ser != "a"
-        expected = -(ser == "a")
-        tm.assert_series_equal(result, expected)
+            result = ser != "a"
+            expected = -(ser == "a")
+            tm.assert_series_equal(result, expected)
 
     @pytest.mark.parametrize("dtype", [None, object])
     def test_more_na_comparisons(self, dtype):
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index 3e11062b8384e..8dcda44aa68e5 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -2,6 +2,8 @@
 This module tests the functionality of StringArray and ArrowStringArray.
 Tests for the str accessors are in pandas/tests/strings/test_string_array.py
 """
+import operator
+
 import numpy as np
 import pytest
 
@@ -224,7 +226,10 @@ def test_comparison_methods_scalar(comparison_op, dtype):
     result = getattr(a, op_name)(other)
     if dtype.storage == "pyarrow_numpy":
         expected = np.array([getattr(item, op_name)(other) for item in a])
-        expected[1] = False
+        if comparison_op == operator.ne:
+            expected[1] = True
+        else:
+            expected[1] = False
         tm.assert_numpy_array_equal(result, expected.astype(np.bool_))
     else:
         expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean"
@@ -239,7 +244,10 @@ def test_comparison_methods_scalar_pd_na(comparison_op, dtype):
     result = getattr(a, op_name)(pd.NA)
 
     if dtype.storage == "pyarrow_numpy":
-        expected = np.array([False, False, False])
+        if operator.ne == comparison_op:
+            expected = np.array([True, True, True])
+        else:
+            expected = np.array([False, False, False])
         tm.assert_numpy_array_equal(result, expected)
     else:
         expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean"
@@ -265,7 +273,7 @@ def test_comparison_methods_scalar_not_string(comparison_op, dtype):
     if dtype.storage == "pyarrow_numpy":
         expected_data = {
             "__eq__": [False, False, False],
-            "__ne__": [True, False, True],
+            "__ne__": [True, True, True],
         }[op_name]
         expected = np.array(expected_data)
         tm.assert_numpy_array_equal(result, expected)
@@ -285,12 +293,18 @@ def test_comparison_methods_array(comparison_op, dtype):
     other = [None, None, "c"]
     result = getattr(a, op_name)(other)
     if dtype.storage == "pyarrow_numpy":
-        expected = np.array([False, False, False])
-        expected[-1] = getattr(other[-1], op_name)(a[-1])
+        if operator.ne == comparison_op:
+            expected = np.array([True, True, False])
+        else:
+            expected = np.array([False, False, False])
+        expected[-1] = getattr(other[-1], op_name)(a[-1])
         tm.assert_numpy_array_equal(result, expected)
 
         result = getattr(a, op_name)(pd.NA)
-        expected = np.array([False, False, False])
+        if operator.ne == comparison_op:
+            expected = np.array([True, True, True])
+        else:
+            expected = np.array([False, False, False])
         tm.assert_numpy_array_equal(result, expected)
 
     else:

From 5e0df27fadfeb29e76336b527f4c01cd2f575136 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Thu, 7 Dec 2023 21:05:12 +0100
Subject: [PATCH 07/63] BUG: reset_index not preserving object dtype for string option (#56160)

* BUG: reset_index not preserving object dtype for string option

* Fixup

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 doc/source/whatsnew/v2.1.4.rst                  | 1 +
 pandas/core/series.py                           | 2 +-
 pandas/tests/series/methods/test_reset_index.py | 9 +++++++++
 3 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst
index ee2d8efdbcc6b..041b8fc866edf 100644
--- a/doc/source/whatsnew/v2.1.4.rst
+++ b/doc/source/whatsnew/v2.1.4.rst
@@ -32,6 +32,7 @@ Bug fixes
 - Fixed bug in :meth:`Index.insert` casting object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`)
 - Fixed bug in :meth:`Series.__ne__` resulting in False for comparison between ``NA`` and string value for ``dtype="string[pyarrow_numpy]"`` (:issue:`56122`)
 - Fixed bug in :meth:`Series.mode` not keeping object dtype when ``infer_string`` is set (:issue:`56183`)
+- Fixed bug in :meth:`Series.reset_index` not preserving object dtype when ``infer_string`` is set (:issue:`56160`)
 - Fixed bug in :meth:`Series.str.split` and :meth:`Series.str.rsplit` when ``pat=None`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56271`)
 - Fixed bug in :meth:`Series.str.translate` losing object dtype when string option is set (:issue:`56152`)
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 464e066b4e86a..f884e61fac27b 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -1719,7 +1719,7 @@ def reset_index(
                 return new_ser.__finalize__(self, method="reset_index")
             else:
                 return self._constructor(
-                    self._values.copy(), index=new_index, copy=False
+                    self._values.copy(), index=new_index, copy=False, dtype=self.dtype
                 ).__finalize__(self, method="reset_index")
         elif inplace:
             raise TypeError(
diff --git a/pandas/tests/series/methods/test_reset_index.py b/pandas/tests/series/methods/test_reset_index.py
index 634b8699a89e6..48e2608a1032a 100644
--- a/pandas/tests/series/methods/test_reset_index.py
+++ b/pandas/tests/series/methods/test_reset_index.py
@@ -11,6 +11,7 @@
     RangeIndex,
     Series,
     date_range,
+    option_context,
 )
 import pandas._testing as tm
 
@@ -167,6 +168,14 @@ def test_reset_index_inplace_and_drop_ignore_name(self):
         expected = Series(range(2), name="old")
         tm.assert_series_equal(ser, expected)
 
+    def test_reset_index_drop_infer_string(self):
+        # GH#56160
+        pytest.importorskip("pyarrow")
+        ser = Series(["a", "b", "c"], dtype=object)
+        with option_context("future.infer_string", True):
+            result = ser.reset_index(drop=True)
+        tm.assert_series_equal(result, ser)
+
 
 @pytest.mark.parametrize(
     "array, dtype",

From aed4df1f609a434a37e3b5bed5c02481e0a406b3 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Thu, 7 Dec 2023 22:15:38 +0100
Subject: [PATCH 08/63] CoW: Warn for transform inplace modification (#56381)

---
 pandas/core/frame.py                   | 1 -
 pandas/core/series.py                  | 6 +++++-
 pandas/tests/copy_view/test_methods.py | 4 ++--
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 1630c2c31920d..179279cc08bab 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -4895,7 +4895,6 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None:
         inplace = validate_bool_kwarg(inplace, "inplace")
         kwargs["level"] = kwargs.pop("level", 0) + 1
 
-        # TODO(CoW) those index/column resolvers create unnecessary refs to `self`
         index_resolvers = self._get_index_resolvers()
         column_resolvers = self._get_cleaned_column_resolvers()
         resolvers = column_resolvers, index_resolvers
diff --git a/pandas/core/series.py b/pandas/core/series.py
index f884e61fac27b..e4dca97bc645d 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -4740,7 +4740,11 @@ def transform(
     ) -> DataFrame | Series:
         # Validate axis argument
         self._get_axis_number(axis)
-        ser = self.copy(deep=False) if using_copy_on_write() else self
+        ser = (
+            self.copy(deep=False)
+            if using_copy_on_write() or warn_copy_on_write()
+            else self
+        )
         result = SeriesApply(ser, func=func, args=args, kwargs=kwargs).transform()
         return result
diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py
index 1e3c95dbc1ca3..558b483933f25 100644
--- a/pandas/tests/copy_view/test_methods.py
+++ b/pandas/tests/copy_view/test_methods.py
@@ -1938,8 +1938,8 @@ def func(ser):
         ser.iloc[0] = 100
         return ser
 
-    # TODO(CoW-warn) should warn?
-    ser.transform(func)
+    with tm.assert_cow_warning(warn_copy_on_write):
+        ser.transform(func)
     if using_copy_on_write:
         tm.assert_series_equal(ser, ser_orig)

From 3b9d1f6ee5480691410c0702dc4e616fa8aa1bb0 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Thu, 7 Dec 2023 16:15:01 -0800
Subject: [PATCH 09/63] TYP: tighter typing in _maybe_convert_freq, _from_ordinal (#56389)

---
 pandas/_libs/tslibs/period.pyi            |  2 +-
 pandas/_libs/tslibs/period.pyx            | 22 ++++++++--------------
 pandas/core/arrays/period.py              |  7 +++++--
 pandas/tests/scalar/period/test_period.py |  2 +-
 4 files changed, 15 insertions(+), 18 deletions(-)

diff --git a/pandas/_libs/tslibs/period.pyi b/pandas/_libs/tslibs/period.pyi
index df6ce675b07fc..22f3bdbe668de 100644
--- a/pandas/_libs/tslibs/period.pyi
+++ b/pandas/_libs/tslibs/period.pyi
@@ -87,7 +87,7 @@ class Period(PeriodMixin):
     @classmethod
     def _maybe_convert_freq(cls, freq) -> BaseOffset: ...
     @classmethod
-    def _from_ordinal(cls, ordinal: int, freq) -> Period: ...
+    def _from_ordinal(cls, ordinal: int, freq: BaseOffset) -> Period: ...
     @classmethod
     def now(cls, freq: Frequency) -> Period: ...
     def strftime(self, fmt: str | None) -> str: ...
diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx
index 6b105f5974f9b..eeaf472c23b60 100644
--- a/pandas/_libs/tslibs/period.pyx
+++ b/pandas/_libs/tslibs/period.pyx
@@ -1756,21 +1756,12 @@ cdef class _Period(PeriodMixin):
     @classmethod
     def _maybe_convert_freq(cls, object freq) -> BaseOffset:
         """
-        Internally we allow integer and tuple representations (for now) that
-        are not recognized by to_offset, so we convert them here.  Also, a
-        Period's freq attribute must have `freq.n > 0`, which we check for here.
+        A Period's freq attribute must have `freq.n > 0`, which we check for here.
 
         Returns
        -------
         DateOffset
         """
-        if isinstance(freq, int):
-            # We already have a dtype code
-            dtype = PeriodDtypeBase(freq, 1)
-            freq = dtype._freqstr
-        elif isinstance(freq, PeriodDtypeBase):
-            freq = freq._freqstr
-
         freq = to_offset(freq, is_period=True)
 
         if freq.n <= 0:
@@ -1780,7 +1771,7 @@ cdef class _Period(PeriodMixin):
         return freq
 
     @classmethod
-    def _from_ordinal(cls, ordinal: int64_t, freq) -> "Period":
+    def _from_ordinal(cls, ordinal: int64_t, freq: BaseOffset) -> "Period":
         """
         Fast creation from an ordinal and freq that are already validated!
         """
@@ -1988,8 +1979,10 @@ cdef class _Period(PeriodMixin):
             return endpoint - np.timedelta64(1, "ns")
 
         if freq is None:
-            freq = self._dtype._get_to_timestamp_base()
-            base = freq
+            freq_code = self._dtype._get_to_timestamp_base()
+            dtype = PeriodDtypeBase(freq_code, 1)
+            freq = dtype._freqstr
+            base = freq_code
         else:
             freq = self._maybe_convert_freq(freq)
             base = freq._period_dtype_code
@@ -2836,7 +2829,8 @@ class Period(_Period):
                     FutureWarning,
                     stacklevel=find_stack_level(),
                 )
-
+            if ordinal == NPY_NAT:
+                return NaT
             return cls._from_ordinal(ordinal, freq)
diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py
index 5f267a9f816e7..1ff3896eea798 100644
--- a/pandas/core/arrays/period.py
+++ b/pandas/core/arrays/period.py
@@ -35,6 +35,7 @@
 )
 from pandas._libs.tslibs.dtypes import (
     FreqGroup,
+    PeriodDtypeBase,
     freq_to_period_freqstr,
 )
 from pandas._libs.tslibs.fields import isleapyear_arr
@@ -652,8 +653,10 @@ def to_timestamp(self, freq=None, how: str = "start") -> DatetimeArray:
             return (self + self.freq).to_timestamp(how="start") - adjust
 
         if freq is None:
-            freq = self._dtype._get_to_timestamp_base()
-            base = freq
+            freq_code = self._dtype._get_to_timestamp_base()
+            dtype = PeriodDtypeBase(freq_code, 1)
+            freq = dtype._freqstr
+            base = freq_code
         else:
             freq = Period._maybe_convert_freq(freq)
             base = freq._period_dtype_code
diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py
index 3e91264fdb3b1..06d82ccd72b8c 100644
--- a/pandas/tests/scalar/period/test_period.py
+++ b/pandas/tests/scalar/period/test_period.py
@@ -418,7 +418,7 @@ def test_parse_week_str_roundstrip(self):
 
     def test_period_from_ordinal(self):
         p = Period("2011-01", freq="M")
-        res = Period._from_ordinal(p.ordinal, freq="M")
+        res = Period._from_ordinal(p.ordinal, freq=p.freq)
         assert p == res
         assert isinstance(res, Period)

From 3d492f175077788df89b6fc82608c201164d81e2 Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Thu, 7 Dec 2023 22:09:32 -0500
Subject: [PATCH 10/63] DOC: Add release date for 2.1.4 (#56393)

* DOC: Add release date for 2.1.4

* DOC: Add release date for 2.1.4
---
 doc/source/whatsnew/v2.1.4.rst | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst
index 041b8fc866edf..9cc79b7090499 100644
--- a/doc/source/whatsnew/v2.1.4.rst
+++ b/doc/source/whatsnew/v2.1.4.rst
@@ -1,6 +1,6 @@
 .. _whatsnew_214:
 
-What's new in 2.1.4 (December ??, 2023)
+What's new in 2.1.4 (December 8, 2023)
 ---------------------------------------
 
 These are the changes in pandas 2.1.4. See :ref:`release` for a full changelog
@@ -14,7 +14,6 @@ including other versions of pandas.
 Fixed regressions
 ~~~~~~~~~~~~~~~~~
 - Fixed regression when trying to read a pickled pandas :class:`DataFrame` from pandas 1.3 (:issue:`55137`)
-
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_214.bug_fixes:
@@ -36,14 +35,6 @@ Bug fixes
 - Fixed bug in :meth:`Series.str.split` and :meth:`Series.str.rsplit` when ``pat=None`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56271`)
 - Fixed bug in :meth:`Series.str.translate` losing object dtype when string option is set (:issue:`56152`)
 
-.. ---------------------------------------------------------------------------
-.. _whatsnew_214.other:
-
-Other
-~~~~~
--
--
-
 .. ---------------------------------------------------------------------------
 .. _whatsnew_214.contributors:

From 025ccb5675c4773409d6d0e8145b21da17e0e011 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Fri, 8 Dec 2023 09:32:45 +0100
Subject: [PATCH 11/63] CoW: Remove filterwarnings and todo (#56390)

---
 pandas/tests/frame/test_subclass.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py
index f19e31002c877..ef78ae62cb4d6 100644
--- a/pandas/tests/frame/test_subclass.py
+++ b/pandas/tests/frame/test_subclass.py
@@ -524,8 +524,6 @@ def test_subclassed_wide_to_long(self):
 
         tm.assert_frame_equal(long_frame, expected)
 
-    # TODO(CoW-warn) should not need to warn
-    @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning")
     def test_subclassed_apply(self):
         # GH 19822

From 91e251c3234333efadd0b467020bfcd300d9b6d8 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Fri, 8 Dec 2023 10:38:38 +0100
Subject: [PATCH 12/63] CoW: Avoid warning in case of expansion (#56391)

---
 pandas/core/internals/managers.py            | 17 ++++++++++++-----
 pandas/tests/copy_view/test_indexing.py      |  3 +--
 pandas/tests/frame/indexing/test_indexing.py |  4 +---
 3 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index a221d02b75bb2..cc88312d5b58f 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -1325,16 +1325,16 @@ def column_setitem(
         This is a method on the BlockManager level, to avoid creating an
         intermediate Series at the DataFrame level (`s = df[loc]; s[idx] = value`)
         """
+        needs_to_warn = False
        if warn_copy_on_write() and not self._has_no_reference(loc):
             if not isinstance(
                 self.blocks[self.blknos[loc]].values,
                 (ArrowExtensionArray, ArrowStringArray),
             ):
-                warnings.warn(
-                    COW_WARNING_GENERAL_MSG,
-                    FutureWarning,
-                    stacklevel=find_stack_level(),
-                )
+                # We might raise if we are in an expansion case, so defer
+                # warning till we actually updated
+                needs_to_warn = True
+
         elif using_copy_on_write() and not self._has_no_reference(loc):
             blkno = self.blknos[loc]
             # Split blocks to only copy the column we want to modify
@@ -1358,6 +1358,13 @@ def column_setitem(
             new_mgr = col_mgr.setitem((idx,), value)
             self.iset(loc, new_mgr._block.values, inplace=True)
 
+        if needs_to_warn:
+            warnings.warn(
+                COW_WARNING_GENERAL_MSG,
+                FutureWarning,
+                stacklevel=find_stack_level(),
+            )
+
     def insert(self, loc: int, item: Hashable, value: ArrayLike, refs=None) -> None:
         """
         Insert item at selected position.
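The deferred ``needs_to_warn`` flag above matters because the setitem may raise or expand the frame before any shared data is actually modified, so warning eagerly produced false positives. A short illustration of the now-silent enlargement case, adapted from the regression test below (a sketch of the expected behavior under the copy-on-write warning mode, not part of the patch itself):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(columns=["a", "b"])
    view = df[:]  # a second reference that would normally trigger the CoW warning
    # Enlarging the empty column previously emitted a spurious FutureWarning;
    # with the warning deferred until after the setitem succeeds, it stays silent.
    df.iloc[:, 0] = np.array([1, 2], dtype=np.float64)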
diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py
index 33b8ce218f029..6f3850ab64daa 100644
--- a/pandas/tests/copy_view/test_indexing.py
+++ b/pandas/tests/copy_view/test_indexing.py
@@ -1145,13 +1145,12 @@ def test_set_value_copy_only_necessary_column(
     view = df[:]
 
     if val == "a" and indexer[0] != slice(None):
-        # TODO(CoW-warn) assert the FutureWarning for CoW is also raised
         with tm.assert_produces_warning(
             FutureWarning, match="Setting an item of incompatible dtype is deprecated"
         ):
             indexer_func(df)[indexer] = val
     else:
-        with tm.assert_cow_warning(warn_copy_on_write):
+        with tm.assert_cow_warning(warn_copy_on_write and val == 100):
             indexer_func(df)[indexer] = val
 
     if using_copy_on_write:
diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py
index dfb4a3092789a..40c6b8e180c5b 100644
--- a/pandas/tests/frame/indexing/test_indexing.py
+++ b/pandas/tests/frame/indexing/test_indexing.py
@@ -1397,9 +1397,7 @@ def test_iloc_setitem_enlarge_no_warning(self, warn_copy_on_write):
         df = DataFrame(columns=["a", "b"])
         expected = df.copy()
         view = df[:]
-        # TODO(CoW-warn) false positive: shouldn't warn in case of enlargement?
-        with tm.assert_produces_warning(FutureWarning if warn_copy_on_write else None):
-            df.iloc[:, 0] = np.array([1, 2], dtype=np.float64)
+        df.iloc[:, 0] = np.array([1, 2], dtype=np.float64)
         tm.assert_frame_equal(view, expected)
 
     def test_loc_internals_not_updated_correctly(self):

From 657da071c86fdef5e35ba3bfd45cda2ebc35fad4 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Fri, 8 Dec 2023 10:39:35 +0100
Subject: [PATCH 13/63] CoW: Add warnings for interpolate (#56289)

Co-authored-by: Joris Van den Bossche
---
 pandas/core/internals/base.py                 |  7 ++++-
 pandas/core/internals/blocks.py               | 31 ++++++++++++++++++-
 .../test_chained_assignment_deprecation.py    | 13 +++++---
 pandas/tests/copy_view/test_interp_fillna.py  | 30 ++++++++++++++++--
 pandas/tests/copy_view/test_methods.py        |  3 +-
 5 files changed, 75 insertions(+), 9 deletions(-)

diff --git a/pandas/core/internals/base.py b/pandas/core/internals/base.py
index 0fd91b163aba9..b03b98d89ccd5 100644
--- a/pandas/core/internals/base.py
+++ b/pandas/core/internals/base.py
@@ -284,7 +284,11 @@ def replace_list(
 
     def interpolate(self, inplace: bool, **kwargs) -> Self:
         return self.apply_with_block(
-            "interpolate", inplace=inplace, **kwargs, using_cow=using_copy_on_write()
+            "interpolate",
+            inplace=inplace,
+            **kwargs,
+            using_cow=using_copy_on_write(),
+            already_warned=_AlreadyWarned(),
         )
 
     def pad_or_backfill(self, inplace: bool, **kwargs) -> Self:
@@ -293,6 +297,7 @@ def pad_or_backfill(self, inplace: bool, **kwargs) -> Self:
             inplace=inplace,
             **kwargs,
             using_cow=using_copy_on_write(),
+            already_warned=_AlreadyWarned(),
         )
 
     def shift(self, periods: int, fill_value) -> Self:
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index 85780bad1e403..f0f8430c991ad 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -1657,6 +1657,7 @@ def pad_or_backfill(
         limit_area: Literal["inside", "outside"] | None = None,
         downcast: Literal["infer"] | None = None,
         using_cow: bool = False,
+        already_warned=None,
     ) -> list[Block]:
         if not self._can_hold_na:
             # If there are no NAs, then interpolate is a no-op
@@ -1677,6 +1678,19 @@ def pad_or_backfill(
             limit_area=limit_area,
             copy=copy,
         )
+        if (
+            not copy
+            and warn_copy_on_write()
+            and already_warned is not None
+            and not already_warned.warned_already
+        ):
+            if self.refs.has_reference():
+                warnings.warn(
+                    COW_WARNING_GENERAL_MSG,
+                    FutureWarning,
+                    stacklevel=find_stack_level(),
+                )
+            already_warned.warned_already = True
 
         if axis == 1:
             new_values = new_values.T
@@ -1697,6 +1711,7 @@ def interpolate(
         limit_area: Literal["inside", "outside"] | None = None,
         downcast: Literal["infer"] | None = None,
         using_cow: bool = False,
+        already_warned=None,
         **kwargs,
     ) -> list[Block]:
         inplace = validate_bool_kwarg(inplace, "inplace")
@@ -1735,6 +1750,20 @@ def interpolate(
         )
         data = extract_array(new_values, extract_numpy=True)
 
+        if (
+            not copy
+            and warn_copy_on_write()
+            and already_warned is not None
+            and not already_warned.warned_already
+        ):
+            if self.refs.has_reference():
+                warnings.warn(
+                    COW_WARNING_GENERAL_MSG,
+                    FutureWarning,
+                    stacklevel=find_stack_level(),
+                )
+            already_warned.warned_already = True
+
         nb = self.make_block_same_class(data, refs=refs)
         return nb._maybe_downcast([nb], downcast, using_cow, caller="interpolate")
@@ -2178,9 +2207,9 @@ def pad_or_backfill(
         limit_area: Literal["inside", "outside"] | None = None,
         downcast: Literal["infer"] | None = None,
         using_cow: bool = False,
+        already_warned=None,
     ) -> list[Block]:
         values = self.values
-
         copy, refs = self._get_refs_and_copy(using_cow, inplace)
 
         if values.ndim == 2 and axis == 1:
             # NDArrayBackedExtensionArray.fillna assumes axis=0
diff --git a/pandas/tests/copy_view/test_chained_assignment_deprecation.py b/pandas/tests/copy_view/test_chained_assignment_deprecation.py
index b51a5920917d6..2829617c84cd2 100644
--- a/pandas/tests/copy_view/test_chained_assignment_deprecation.py
+++ b/pandas/tests/copy_view/test_chained_assignment_deprecation.py
@@ -42,11 +42,16 @@ def test_methods_iloc_warn(using_copy_on_write):
         ("ffill", ()),
     ],
 )
-def test_methods_iloc_getitem_item_cache(func, args, using_copy_on_write):
-    df = DataFrame({"a": [1, 2, 3], "b": 1})
+def test_methods_iloc_getitem_item_cache(
+    func, args, using_copy_on_write, warn_copy_on_write
+):
+    df = DataFrame({"a": [1.5, 2, 3], "b": 1.5})
     ser = df.iloc[:, 0]
-    # TODO(CoW-warn) should warn about updating a view
-    getattr(ser, func)(*args, inplace=True)
+    # TODO(CoW-warn) should warn about updating a view for all methods
+    with tm.assert_cow_warning(
+        warn_copy_on_write and func not in ("replace", "fillna")
+    ):
+        getattr(ser, func)(*args, inplace=True)
 
     # parent that holds item_cache is dead, so don't increase ref count
     ser = df.copy()["a"]
diff --git a/pandas/tests/copy_view/test_interp_fillna.py b/pandas/tests/copy_view/test_interp_fillna.py
index 7849799eb2cc4..ddc5879a56d54 100644
--- a/pandas/tests/copy_view/test_interp_fillna.py
+++ b/pandas/tests/copy_view/test_interp_fillna.py
@@ -91,12 +91,13 @@ def test_interpolate_inplace_no_reference_no_copy(using_copy_on_write, vals):
 @pytest.mark.parametrize(
     "vals", [[1, np.nan, 2], [Timestamp("2019-12-31"), NaT, Timestamp("2020-12-31")]]
 )
-def test_interpolate_inplace_with_refs(using_copy_on_write, vals):
+def test_interpolate_inplace_with_refs(using_copy_on_write, vals, warn_copy_on_write):
     df = DataFrame({"a": [1, np.nan, 2]})
     df_orig = df.copy()
     arr = get_array(df, "a")
     view = df[:]
-    df.interpolate(method="linear", inplace=True)
+    with tm.assert_cow_warning(warn_copy_on_write):
+        df.interpolate(method="linear", inplace=True)
 
     if using_copy_on_write:
         # Check that copy was triggered in interpolate and that we don't
@@ -109,6 +110,31 @@ def test_interpolate_inplace_with_refs(using_copy_on_write, vals):
         assert np.shares_memory(arr, get_array(df, "a"))
 
 
+@pytest.mark.parametrize("func", ["ffill", "bfill"])
+@pytest.mark.parametrize("dtype", ["float64", "Float64"])
+def test_interp_fill_functions_inplace(
+    using_copy_on_write, func, warn_copy_on_write, dtype
+):
+    # Check that these takes the same code paths as interpolate
+    df = DataFrame({"a": [1, np.nan, 2]}, dtype=dtype)
+    df_orig = df.copy()
+    arr = get_array(df, "a")
+    view = df[:]
+
+    with tm.assert_cow_warning(warn_copy_on_write and dtype == "float64"):
+        getattr(df, func)(inplace=True)
+
+    if using_copy_on_write:
+        # Check that copy was triggered in interpolate and that we don't
+        # have any references left
+        assert not np.shares_memory(arr, get_array(df, "a"))
+        tm.assert_frame_equal(df_orig, view)
+        assert df._mgr._has_no_reference(0)
+        assert view._mgr._has_no_reference(0)
+    else:
+        assert np.shares_memory(arr, get_array(df, "a")) is (dtype == "float64")
+
+
 def test_interpolate_cleaned_fill_method(using_copy_on_write):
     # Check that "method is set to None" case works correctly
     df = DataFrame({"a": ["a", np.nan, "c"], "b": 1})
diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py
index 558b483933f25..862aebdc70a9d 100644
--- a/pandas/tests/copy_view/test_methods.py
+++ b/pandas/tests/copy_view/test_methods.py
@@ -1607,7 +1607,8 @@ def test_interpolate_creates_copy(using_copy_on_write, warn_copy_on_write):
     view = df[:]
     expected = df.copy()
 
-    df.ffill(inplace=True)
+    with tm.assert_cow_warning(warn_copy_on_write):
+        df.ffill(inplace=True)
     with tm.assert_cow_warning(warn_copy_on_write):
         df.iloc[0, 0] = 100.5

From 8399185ad3618cd7d4bc5018e8afad5cd357e813 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Fri, 8 Dec 2023 12:01:44 +0100
Subject: [PATCH 14/63] CoW: Warn for inplace replace (#56297)

Co-authored-by: Joris Van den Bossche
---
 pandas/core/internals/base.py                 |  7 ++-
 pandas/core/internals/blocks.py               | 45 +++++++++++++++++++
 .../test_chained_assignment_deprecation.py    |  7 +--
 pandas/tests/copy_view/test_replace.py        | 20 +++++----
 4 files changed, 65 insertions(+), 14 deletions(-)

diff --git a/pandas/core/internals/base.py b/pandas/core/internals/base.py
index b03b98d89ccd5..33f5b9feb1387 100644
--- a/pandas/core/internals/base.py
+++ b/pandas/core/internals/base.py
@@ -252,12 +252,16 @@ def replace(self, to_replace, value, inplace: bool) -> Self:
             value=value,
             inplace=inplace,
             using_cow=using_copy_on_write(),
+            already_warned=_AlreadyWarned(),
         )
 
     @final
     def replace_regex(self, **kwargs) -> Self:
         return self.apply_with_block(
-            "_replace_regex", **kwargs, using_cow=using_copy_on_write()
+            "_replace_regex",
+            **kwargs,
+            using_cow=using_copy_on_write(),
+            already_warned=_AlreadyWarned(),
         )
 
     @final
@@ -278,6 +282,7 @@ def replace_list(
             inplace=inplace,
             regex=regex,
             using_cow=using_copy_on_write(),
+            already_warned=_AlreadyWarned(),
         )
         bm._consolidate_inplace()
         return bm
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index f0f8430c991ad..1af2d9e739038 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -830,6 +830,7 @@ def replace(
         # mask may be pre-computed if we're called from replace_list
         mask: npt.NDArray[np.bool_] | None = None,
         using_cow: bool = False,
+        already_warned=None,
     ) -> list[Block]:
         """
         replace the to_replace value with value, possible to create new
@@ -874,6 +875,20 @@ def replace(
             # and rest?
             blk = self._maybe_copy(using_cow, inplace)
             putmask_inplace(blk.values, mask, value)
+            if (
+                inplace
+                and warn_copy_on_write()
+                and already_warned is not None
+                and not already_warned.warned_already
+            ):
+                if self.refs.has_reference():
+                    warnings.warn(
+                        COW_WARNING_GENERAL_MSG,
+                        FutureWarning,
+                        stacklevel=find_stack_level(),
+                    )
+                already_warned.warned_already = True
+
             if not (self.is_object and value is None):
                 # if the user *explicitly* gave None, we keep None, otherwise
                 # may downcast to NaN
@@ -934,6 +949,7 @@ def _replace_regex(
         inplace: bool = False,
         mask=None,
         using_cow: bool = False,
+        already_warned=None,
     ) -> list[Block]:
         """
         Replace elements by the given value.
@@ -968,6 +984,20 @@ def _replace_regex(
 
         replace_regex(block.values, rx, value, mask)
 
+        if (
+            inplace
+            and warn_copy_on_write()
+            and already_warned is not None
+            and not already_warned.warned_already
+        ):
+            if self.refs.has_reference():
+                warnings.warn(
+                    COW_WARNING_GENERAL_MSG,
+                    FutureWarning,
+                    stacklevel=find_stack_level(),
+                )
+            already_warned.warned_already = True
+
         nbs = block.convert(copy=False, using_cow=using_cow)
         opt = get_option("future.no_silent_downcasting")
         if (len(nbs) > 1 or nbs[0].dtype != block.dtype) and not opt:
@@ -992,6 +1022,7 @@ def replace_list(
         inplace: bool = False,
         regex: bool = False,
         using_cow: bool = False,
+        already_warned=None,
     ) -> list[Block]:
         """
         See BlockManager.replace_list docstring.
@@ -1048,6 +1079,20 @@ def replace_list(
         else:
             rb = [self if inplace else self.copy()]
 
+        if (
+            inplace
+            and warn_copy_on_write()
+            and already_warned is not None
+            and not already_warned.warned_already
+        ):
+            if self.refs.has_reference():
+                warnings.warn(
+                    COW_WARNING_GENERAL_MSG,
+                    FutureWarning,
+                    stacklevel=find_stack_level(),
+                )
+            already_warned.warned_already = True
+
         opt = get_option("future.no_silent_downcasting")
         for i, ((src, dest), mask) in enumerate(zip(pairs, masks)):
             convert = i == src_len  # only convert once at the end
diff --git a/pandas/tests/copy_view/test_chained_assignment_deprecation.py b/pandas/tests/copy_view/test_chained_assignment_deprecation.py
index 2829617c84cd2..25935d9489730 100644
--- a/pandas/tests/copy_view/test_chained_assignment_deprecation.py
+++ b/pandas/tests/copy_view/test_chained_assignment_deprecation.py
@@ -45,12 +45,9 @@ def test_methods_iloc_warn(using_copy_on_write):
 def test_methods_iloc_getitem_item_cache(
     func, args, using_copy_on_write, warn_copy_on_write
 ):
-    df = DataFrame({"a": [1.5, 2, 3], "b": 1.5})
+    df = DataFrame({"a": [1, 2, 3], "b": 1})
     ser = df.iloc[:, 0]
-    # TODO(CoW-warn) should warn about updating a view for all methods
-    with tm.assert_cow_warning(
-        warn_copy_on_write and func not in ("replace", "fillna")
-    ):
+    with tm.assert_cow_warning(warn_copy_on_write and func == "replace"):
         getattr(ser, func)(*args, inplace=True)
 
     # parent that holds item_cache is dead, so don't increase ref count
diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py
index eb3b1a5ef68e8..268e859e782ec 100644
--- a/pandas/tests/copy_view/test_replace.py
+++ b/pandas/tests/copy_view/test_replace.py
@@ -48,12 +48,13 @@ def test_replace(using_copy_on_write, replace_kwargs):
     tm.assert_frame_equal(df, df_orig)
 
 
-def test_replace_regex_inplace_refs(using_copy_on_write):
+def test_replace_regex_inplace_refs(using_copy_on_write, warn_copy_on_write):
     df = DataFrame({"a": ["aaa", "bbb"]})
     df_orig = df.copy()
     view = df[:]
     arr = get_array(df, "a")
-    df.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True)
+    with
tm.assert_cow_warning(warn_copy_on_write): + df.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True) if using_copy_on_write: assert not np.shares_memory(arr, get_array(df, "a")) assert df._mgr._has_no_reference(0) @@ -202,11 +203,12 @@ def test_replace_inplace(using_copy_on_write, to_replace): @pytest.mark.parametrize("to_replace", [1.5, [1.5]]) -def test_replace_inplace_reference(using_copy_on_write, to_replace): +def test_replace_inplace_reference(using_copy_on_write, to_replace, warn_copy_on_write): df = DataFrame({"a": [1.5, 2, 3]}) arr_a = get_array(df, "a") view = df[:] - df.replace(to_replace=to_replace, value=15.5, inplace=True) + with tm.assert_cow_warning(warn_copy_on_write): + df.replace(to_replace=to_replace, value=15.5, inplace=True) if using_copy_on_write: assert not np.shares_memory(get_array(df, "a"), arr_a) @@ -354,12 +356,13 @@ def test_replace_list_none(using_copy_on_write): assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) -def test_replace_list_none_inplace_refs(using_copy_on_write): +def test_replace_list_none_inplace_refs(using_copy_on_write, warn_copy_on_write): df = DataFrame({"a": ["a", "b", "c"]}) arr = get_array(df, "a") df_orig = df.copy() view = df[:] - df.replace(["a"], value=None, inplace=True) + with tm.assert_cow_warning(warn_copy_on_write): + df.replace(["a"], value=None, inplace=True) if using_copy_on_write: assert df._mgr._has_no_reference(0) assert not np.shares_memory(arr, get_array(df, "a")) @@ -431,7 +434,7 @@ def test_replace_listlike(using_copy_on_write): tm.assert_frame_equal(df, df_orig) -def test_replace_listlike_inplace(using_copy_on_write): +def test_replace_listlike_inplace(using_copy_on_write, warn_copy_on_write): df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) arr = get_array(df, "a") df.replace([200, 2], [10, 11], inplace=True) @@ -439,7 +442,8 @@ def test_replace_listlike_inplace(using_copy_on_write): view = df[:] df_orig = df.copy() - df.replace([200, 3], [10, 11], inplace=True) + with tm.assert_cow_warning(warn_copy_on_write): + df.replace([200, 3], [10, 11], inplace=True) if using_copy_on_write: assert not np.shares_memory(get_array(df, "a"), arr) tm.assert_frame_equal(view, df_orig) From 68c1af5358561b4655861f99ac1dfb27ac5d4d56 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 8 Dec 2023 14:55:30 +0100 Subject: [PATCH 15/63] TST: clean CoW chained assignment warning iloc test case (#56400) --- .../test_chained_assignment_deprecation.py | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/pandas/tests/copy_view/test_chained_assignment_deprecation.py b/pandas/tests/copy_view/test_chained_assignment_deprecation.py index 25935d9489730..80e38380ed27c 100644 --- a/pandas/tests/copy_view/test_chained_assignment_deprecation.py +++ b/pandas/tests/copy_view/test_chained_assignment_deprecation.py @@ -35,36 +35,38 @@ def test_methods_iloc_warn(using_copy_on_write): @pytest.mark.parametrize( "func, args", [ - ("replace", (1, 5)), + ("replace", (4, 5)), ("fillna", (1,)), ("interpolate", ()), ("bfill", ()), ("ffill", ()), ], ) -def test_methods_iloc_getitem_item_cache( - func, args, using_copy_on_write, warn_copy_on_write -): - df = DataFrame({"a": [1, 2, 3], "b": 1}) +def test_methods_iloc_getitem_item_cache(func, args, using_copy_on_write): + # ensure we don't incorrectly raise chained assignment warning because + # of the item cache / iloc not setting the item cache + df_orig = DataFrame({"a": [1, 2, 3], "b": 1}) + + df = df_orig.copy() ser = df.iloc[:, 0] - with 
tm.assert_cow_warning(warn_copy_on_write and func == "replace"): - getattr(ser, func)(*args, inplace=True) + getattr(ser, func)(*args, inplace=True) # parent that holds item_cache is dead, so don't increase ref count + df = df_orig.copy() ser = df.copy()["a"] getattr(ser, func)(*args, inplace=True) - df = df.copy() - + df = df_orig.copy() df["a"] # populate the item_cache ser = df.iloc[:, 0] # iloc creates a new object - ser.fillna(0, inplace=True) + getattr(ser, func)(*args, inplace=True) + df = df_orig.copy() df["a"] # populate the item_cache ser = df["a"] - ser.fillna(0, inplace=True) + getattr(ser, func)(*args, inplace=True) - df = df.copy() + df = df_orig.copy() df["a"] # populate the item_cache if using_copy_on_write: with tm.raises_chained_assignment_error(): From 46c8da3e5b987154d2e0a44562c71be635ae826a Mon Sep 17 00:00:00 2001 From: rohanjain101 <38412262+rohanjain101@users.noreply.github.com> Date: Fri, 8 Dec 2023 15:06:04 -0500 Subject: [PATCH 16/63] Fix negative n for str.replace with arrow string (#56406) --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/arrays/arrow/array.py | 10 +++++++++- pandas/tests/extension/test_arrow.py | 8 ++++++++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index c878fd2664dc4..919ac8b03f936 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -577,6 +577,7 @@ Strings - Bug in :func:`pandas.api.types.is_string_dtype` while checking object array with no elements is of the string dtype (:issue:`54661`) - Bug in :meth:`DataFrame.apply` failing when ``engine="numba"`` and columns or index have ``StringDtype`` (:issue:`56189`) - Bug in :meth:`Series.__mul__` for :class:`ArrowDtype` with ``pyarrow.string`` dtype and ``string[pyarrow]`` for the pyarrow backend (:issue:`51970`) +- Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56404`) - Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for ``string[pyarrow]`` (:issue:`54942`) Interval diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index e7a50dbba9935..ae6942db11fae 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2155,7 +2155,15 @@ def _str_replace( ) func = pc.replace_substring_regex if regex else pc.replace_substring - result = func(self._pa_array, pattern=pat, replacement=repl, max_replacements=n) + # https://github.com/apache/arrow/issues/39149 + # GH 56404, unexpected behavior with negative max_replacements with pyarrow. 
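# A negative ``n`` in pandas means "replace all occurrences", while pyarrow's
# ``max_replacements`` treats None as unlimited and mishandles negative values
# (see the arrow issue above), hence the translation below. A sketch of the
# fixed behavior, mirroring the new test further down:
#
#     import pyarrow as pa
#     import pandas as pd
#
#     ser = pd.Series(["aaaaaa"], dtype=pd.ArrowDtype(pa.string()))
#     ser.str.replace("a", "", n=-3)  # every "a" is replaced -> [""]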
+        pa_max_replacements = None if n < 0 else n
+        result = func(
+            self._pa_array,
+            pattern=pat,
+            replacement=repl,
+            max_replacements=pa_max_replacements,
+        )
         return type(self)(result)
 
     def _str_repeat(self, repeats: int | Sequence[int]):
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index 3ce3cee9714e4..1941e359299b6 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -1776,6 +1776,14 @@ def test_str_replace(pat, repl, n, regex, exp):
     tm.assert_series_equal(result, expected)
 
 
+def test_str_replace_negative_n():
+    # GH 56404
+    ser = pd.Series(["abc", "aaaaaa"], dtype=ArrowDtype(pa.string()))
+    actual = ser.str.replace("a", "", -3, True)
+    expected = pd.Series(["bc", ""], dtype=ArrowDtype(pa.string()))
+    tm.assert_series_equal(expected, actual)
+
+
 def test_str_repeat_unsupported():
     ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
     with pytest.raises(NotImplementedError, match="repeat is not"):

From c2f06598460dd3996833ea3d225e002bd1fabf18 Mon Sep 17 00:00:00 2001
From: William Andrea
Date: Fri, 8 Dec 2023 18:24:24 -0500
Subject: [PATCH 17/63] DOC: Fix typo in resample.py, "as_freq" (#56407)

Fix typo in resample.py, "as_freq"
---
 pandas/core/resample.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/resample.py b/pandas/core/resample.py
index 8af81cd43d62e..31e41acbf1774 100644
--- a/pandas/core/resample.py
+++ b/pandas/core/resample.py
@@ -1061,7 +1061,7 @@ def interpolate(
         2023-03-01 07:00:04.000    3.0
         Freq: 500ms, dtype: float64
 
-        Internal reindexing with ``as_freq()`` prior to interpolation leads to
+        Internal reindexing with ``asfreq()`` prior to interpolation leads to
         an interpolated timeseries on the basis of the reindexed timestamps (anchors).
Since not all datapoints from original series become anchors, it can lead to misleading interpolation results as in the following example: From 45361a48582bde02941b02050fb4a9ae096faadd Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 8 Dec 2023 15:25:25 -0800 Subject: [PATCH 18/63] CLN: generate_range (#56416) --- pandas/core/arrays/datetimelike.py | 4 +++- pandas/core/arrays/datetimes.py | 25 ++++++++++--------------- 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index a88f40013b3f6..eb1c2ecc0b0fe 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -2103,7 +2103,9 @@ def _validate_frequency(cls, index, freq: BaseOffset, **kwargs): ) from err @classmethod - def _generate_range(cls, start, end, periods, freq, *args, **kwargs) -> Self: + def _generate_range( + cls, start, end, periods: int | None, freq, *args, **kwargs + ) -> Self: raise AbstractMethodError(cls) # -------------------------------------------------------------- diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 496a6987c3264..64f08adcd48c4 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -405,7 +405,7 @@ def _generate_range( # type: ignore[override] cls, start, end, - periods, + periods: int | None, freq, tz=None, normalize: bool = False, @@ -441,9 +441,9 @@ def _generate_range( # type: ignore[override] else: unit = "ns" - if start is not None and unit is not None: + if start is not None: start = start.as_unit(unit, round_ok=False) - if end is not None and unit is not None: + if end is not None: end = end.as_unit(unit, round_ok=False) left_inclusive, right_inclusive = validate_inclusive(inclusive) @@ -452,14 +452,8 @@ def _generate_range( # type: ignore[override] if tz is not None: # Localize the start and end arguments - start_tz = None if start is None else start.tz - end_tz = None if end is None else end.tz - start = _maybe_localize_point( - start, start_tz, start, freq, tz, ambiguous, nonexistent - ) - end = _maybe_localize_point( - end, end_tz, end, freq, tz, ambiguous, nonexistent - ) + start = _maybe_localize_point(start, freq, tz, ambiguous, nonexistent) + end = _maybe_localize_point(end, freq, tz, ambiguous, nonexistent) if freq is not None: # We break Day arithmetic (fixed 24 hour) here and opt for @@ -505,6 +499,7 @@ def _generate_range( # type: ignore[override] # Nanosecond-granularity timestamps aren't always correctly # representable with doubles, so we limit the range that we # pass to np.linspace as much as possible + periods = cast(int, periods) i8values = ( np.linspace(0, end._value - start._value, periods, dtype="int64") + start._value @@ -2688,7 +2683,9 @@ def _maybe_normalize_endpoints( return start, end -def _maybe_localize_point(ts, is_none, is_not_none, freq, tz, ambiguous, nonexistent): +def _maybe_localize_point( + ts: Timestamp | None, freq, tz, ambiguous, nonexistent +) -> Timestamp | None: """ Localize a start or end Timestamp to the timezone of the corresponding start or end Timestamp @@ -2696,8 +2693,6 @@ def _maybe_localize_point(ts, is_none, is_not_none, freq, tz, ambiguous, nonexis Parameters ---------- ts : start or end Timestamp to potentially localize - is_none : argument that should be None - is_not_none : argument that should not be None freq : Tick, DateOffset, or None tz : str, timezone object or None ambiguous: str, localization behavior for ambiguous times @@ -2710,7 +2705,7 @@ def 
_maybe_localize_point(ts, is_none, is_not_none, freq, tz, ambiguous, nonexis # Make sure start and end are timezone localized if: # 1) freq = a Timedelta-like frequency (Tick) # 2) freq = None i.e. generating a linspaced range - if is_none is None and is_not_none is not None: + if ts is not None and ts.tzinfo is None: # Note: We can't ambiguous='infer' a singular ambiguous time; however, # we have historically defaulted ambiguous=False ambiguous = ambiguous if ambiguous != "infer" else False From a3626f27f297b1d22b0690548fd3c5016e7522d4 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 9 Dec 2023 00:28:19 +0100 Subject: [PATCH 19/63] BUG: Index.str.cat casting result always to object (#56157) * BUG: Index.str.cat casting result always to object * Update accessor.py * Fix further bugs * Fix * Update accessor.py * Update v2.1.4.rst * Update v2.2.0.rst --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/strings/accessor.py | 10 ++- pandas/tests/strings/test_api.py | 2 + pandas/tests/strings/test_cat.py | 135 ++++++++++++++++++------------- 4 files changed, 89 insertions(+), 59 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 919ac8b03f936..a273a2c054c90 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -576,6 +576,7 @@ Strings ^^^^^^^ - Bug in :func:`pandas.api.types.is_string_dtype` while checking object array with no elements is of the string dtype (:issue:`54661`) - Bug in :meth:`DataFrame.apply` failing when ``engine="numba"`` and columns or index have ``StringDtype`` (:issue:`56189`) +- Bug in :meth:`Index.str.cat` always casting result to object dtype (:issue:`56157`) - Bug in :meth:`Series.__mul__` for :class:`ArrowDtype` with ``pyarrow.string`` dtype and ``string[pyarrow]`` for the pyarrow backend (:issue:`51970`) - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56404`) - Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for ``string[pyarrow]`` (:issue:`54942`) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 9fa6e9973291d..127aee24e094f 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -44,6 +44,7 @@ ) from pandas.core.dtypes.missing import isna +from pandas.core.arrays import ExtensionArray from pandas.core.base import NoNewAttributesMixin from pandas.core.construction import extract_array @@ -456,7 +457,7 @@ def _get_series_list(self, others): # in case of list-like `others`, all elements must be # either Series/Index/np.ndarray (1-dim)... if all( - isinstance(x, (ABCSeries, ABCIndex)) + isinstance(x, (ABCSeries, ABCIndex, ExtensionArray)) or (isinstance(x, np.ndarray) and x.ndim == 1) for x in others ): @@ -690,12 +691,15 @@ def cat( out: Index | Series if isinstance(self._orig, ABCIndex): # add dtype for case that result is all-NA + dtype = None + if isna(result).all(): + dtype = object - out = Index(result, dtype=object, name=self._orig.name) + out = Index(result, dtype=dtype, name=self._orig.name) else: # Series if isinstance(self._orig.dtype, CategoricalDtype): # We need to infer the new categories. 
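# With this change the result dtype follows the caller's categories instead of
# being re-inferred from scratch. A rough illustration (values hypothetical,
# behavior as exercised by the updated test_str_cat_categorical):
#
#     s = pd.Series(["a", "b"], dtype="category")
#     s.str.cat(["x", "y"])  # result dtype is s.cat.categories.dtype, e.g.
#                            # object here, or a string dtype when
#                            # future.infer_string is enabled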
- dtype = None + dtype = self._orig.dtype.categories.dtype # type: ignore[assignment] else: dtype = self._orig.dtype res_ser = Series( diff --git a/pandas/tests/strings/test_api.py b/pandas/tests/strings/test_api.py index 2914b22a52e94..fd2501835318d 100644 --- a/pandas/tests/strings/test_api.py +++ b/pandas/tests/strings/test_api.py @@ -2,6 +2,7 @@ import pytest from pandas import ( + CategoricalDtype, DataFrame, Index, MultiIndex, @@ -178,6 +179,7 @@ def test_api_for_categorical(any_string_method, any_string_dtype): s = Series(list("aabb"), dtype=any_string_dtype) s = s + " " + s c = s.astype("category") + c = c.astype(CategoricalDtype(c.dtype.categories.astype("object"))) assert isinstance(c.str, StringMethods) method_name, args, kwargs = any_string_method diff --git a/pandas/tests/strings/test_cat.py b/pandas/tests/strings/test_cat.py index 3e620b7664335..284932491a65e 100644 --- a/pandas/tests/strings/test_cat.py +++ b/pandas/tests/strings/test_cat.py @@ -3,6 +3,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import ( DataFrame, Index, @@ -10,6 +12,7 @@ Series, _testing as tm, concat, + option_context, ) @@ -26,45 +29,49 @@ def test_str_cat_name(index_or_series, other): assert result.name == "name" -def test_str_cat(index_or_series): - box = index_or_series - # test_cat above tests "str_cat" from ndarray; - # here testing "str.cat" from Series/Index to ndarray/list - s = box(["a", "a", "b", "b", "c", np.nan]) +@pytest.mark.parametrize( + "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] +) +def test_str_cat(index_or_series, infer_string): + with option_context("future.infer_string", infer_string): + box = index_or_series + # test_cat above tests "str_cat" from ndarray; + # here testing "str.cat" from Series/Index to ndarray/list + s = box(["a", "a", "b", "b", "c", np.nan]) - # single array - result = s.str.cat() - expected = "aabbc" - assert result == expected + # single array + result = s.str.cat() + expected = "aabbc" + assert result == expected - result = s.str.cat(na_rep="-") - expected = "aabbc-" - assert result == expected + result = s.str.cat(na_rep="-") + expected = "aabbc-" + assert result == expected - result = s.str.cat(sep="_", na_rep="NA") - expected = "a_a_b_b_c_NA" - assert result == expected + result = s.str.cat(sep="_", na_rep="NA") + expected = "a_a_b_b_c_NA" + assert result == expected - t = np.array(["a", np.nan, "b", "d", "foo", np.nan], dtype=object) - expected = box(["aa", "a-", "bb", "bd", "cfoo", "--"]) + t = np.array(["a", np.nan, "b", "d", "foo", np.nan], dtype=object) + expected = box(["aa", "a-", "bb", "bd", "cfoo", "--"]) - # Series/Index with array - result = s.str.cat(t, na_rep="-") - tm.assert_equal(result, expected) + # Series/Index with array + result = s.str.cat(t, na_rep="-") + tm.assert_equal(result, expected) - # Series/Index with list - result = s.str.cat(list(t), na_rep="-") - tm.assert_equal(result, expected) + # Series/Index with list + result = s.str.cat(list(t), na_rep="-") + tm.assert_equal(result, expected) - # errors for incorrect lengths - rgx = r"If `others` contains arrays or lists \(or other list-likes.*" - z = Series(["1", "2", "3"]) + # errors for incorrect lengths + rgx = r"If `others` contains arrays or lists \(or other list-likes.*" + z = Series(["1", "2", "3"]) - with pytest.raises(ValueError, match=rgx): - s.str.cat(z.values) + with pytest.raises(ValueError, match=rgx): + s.str.cat(z.values) - with pytest.raises(ValueError, match=rgx): - 
s.str.cat(list(z))
+        with pytest.raises(ValueError, match=rgx):
+            s.str.cat(list(z))
 
 
 def test_str_cat_raises_intuitive_error(index_or_series):
@@ -78,39 +85,54 @@ def test_str_cat_raises_intuitive_error(index_or_series):
         s.str.cat(" ")
 
 
+@pytest.mark.parametrize(
+    "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
+)
 @pytest.mark.parametrize("sep", ["", None])
 @pytest.mark.parametrize("dtype_target", ["object", "category"])
 @pytest.mark.parametrize("dtype_caller", ["object", "category"])
-def test_str_cat_categorical(index_or_series, dtype_caller, dtype_target, sep):
+def test_str_cat_categorical(
+    index_or_series, dtype_caller, dtype_target, sep, infer_string
+):
     box = index_or_series
 
-    s = Index(["a", "a", "b", "a"], dtype=dtype_caller)
-    s = s if box == Index else Series(s, index=s)
-    t = Index(["b", "a", "b", "c"], dtype=dtype_target)
-
-    expected = Index(["ab", "aa", "bb", "ac"])
-    expected = expected if box == Index else Series(expected, index=s)
+    with option_context("future.infer_string", infer_string):
+        s = Index(["a", "a", "b", "a"], dtype=dtype_caller)
+        s = s if box == Index else Series(s, index=s)
+        t = Index(["b", "a", "b", "c"], dtype=dtype_target)
 
-    # Series/Index with unaligned Index -> t.values
-    result = s.str.cat(t.values, sep=sep)
-    tm.assert_equal(result, expected)
-
-    # Series/Index with Series having matching Index
-    t = Series(t.values, index=s)
-    result = s.str.cat(t, sep=sep)
-    tm.assert_equal(result, expected)
-
-    # Series/Index with Series.values
-    result = s.str.cat(t.values, sep=sep)
-    tm.assert_equal(result, expected)
+        expected = Index(["ab", "aa", "bb", "ac"])
+        expected = (
+            expected
+            if box == Index
+            else Series(expected, index=Index(s, dtype=dtype_caller))
+        )
 
-    # Series/Index with Series having different Index
-    t = Series(t.values, index=t.values)
-    expected = Index(["aa", "aa", "bb", "bb", "aa"])
-    expected = expected if box == Index else Series(expected, index=expected.str[:1])
+        # Series/Index with unaligned Index -> t.values
+        result = s.str.cat(t.values, sep=sep)
+        tm.assert_equal(result, expected)
+
+        # Series/Index with Series having matching Index
+        t = Series(t.values, index=Index(s, dtype=dtype_caller))
+        result = s.str.cat(t, sep=sep)
+        tm.assert_equal(result, expected)
+
+        # Series/Index with Series.values
+        result = s.str.cat(t.values, sep=sep)
+        tm.assert_equal(result, expected)
+
+        # Series/Index with Series having different Index
+        t = Series(t.values, index=t.values)
+        expected = Index(["aa", "aa", "bb", "bb", "aa"])
+        dtype = object if dtype_caller == "object" else s.dtype.categories.dtype
+        expected = (
+            expected
+            if box == Index
+            else Series(expected, index=Index(expected.str[:1], dtype=dtype))
+        )
 
-    result = s.str.cat(t, sep=sep)
-    tm.assert_equal(result, expected)
+        result = s.str.cat(t, sep=sep)
+        tm.assert_equal(result, expected)
 
 
 @pytest.mark.parametrize(
@@ -321,8 +343,9 @@ def test_str_cat_all_na(index_or_series, index_or_series2):
     # all-NA target
     if box == Series:
-        expected = Series([np.nan] * 4, index=s.index, dtype=object)
+        expected = Series([np.nan] * 4, index=s.index, dtype=s.dtype)
     else:  # box == Index
+        # TODO: String option, this should return string dtype
         expected = Index([np.nan] * 4, dtype=object)
     result = s.str.cat(t, join="left")
     tm.assert_equal(result, expected)

From e3073b503527c17ed8bed848f534db8188428b71 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Sat, 9 Dec 2023 00:30:40 +0100
Subject: [PATCH 20/63] Adjust tests in
extension folder for new string option (#56191) * Adjust tests in extension folder for new string option * Fix typing * Update setitem.py --- pandas/tests/extension/base/dtype.py | 7 ++++++- pandas/tests/extension/base/groupby.py | 7 ++++++- pandas/tests/extension/base/missing.py | 2 +- pandas/tests/extension/base/ops.py | 22 +++++++++++++++++----- pandas/tests/extension/base/setitem.py | 4 ++-- pandas/tests/extension/test_categorical.py | 6 +++++- pandas/tests/extension/test_numpy.py | 4 ++-- 7 files changed, 39 insertions(+), 13 deletions(-) diff --git a/pandas/tests/extension/base/dtype.py b/pandas/tests/extension/base/dtype.py index 5ba65ceaeeada..c7b768f6e3c88 100644 --- a/pandas/tests/extension/base/dtype.py +++ b/pandas/tests/extension/base/dtype.py @@ -59,7 +59,12 @@ def test_check_dtype(self, data): # check equivalency for using .dtypes df = pd.DataFrame( - {"A": pd.Series(data, dtype=dtype), "B": data, "C": "foo", "D": 1} + { + "A": pd.Series(data, dtype=dtype), + "B": data, + "C": pd.Series(["foo"] * len(data), dtype=object), + "D": 1, + } ) result = df.dtypes == str(dtype) assert np.dtype("int64") != "Int64" diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index 5c21c4f7137a5..4e8221f67a74d 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -21,7 +21,12 @@ class BaseGroupbyTests: def test_grouping_grouper(self, data_for_grouping): df = pd.DataFrame( - {"A": ["B", "B", None, None, "A", "A", "B", "C"], "B": data_for_grouping} + { + "A": pd.Series( + ["B", "B", None, None, "A", "A", "B", "C"], dtype=object + ), + "B": data_for_grouping, + } ) gr1 = df.groupby("A").grouper.groupings[0] gr2 = df.groupby("B").grouper.groupings[0] diff --git a/pandas/tests/extension/base/missing.py b/pandas/tests/extension/base/missing.py index 40cc952d44200..ffb7a24b4b390 100644 --- a/pandas/tests/extension/base/missing.py +++ b/pandas/tests/extension/base/missing.py @@ -44,7 +44,7 @@ def test_dropna_series(self, data_missing): tm.assert_series_equal(result, expected) def test_dropna_frame(self, data_missing): - df = pd.DataFrame({"A": data_missing}) + df = pd.DataFrame({"A": data_missing}, columns=pd.Index(["A"], dtype=object)) # defaults result = df.dropna() diff --git a/pandas/tests/extension/base/ops.py b/pandas/tests/extension/base/ops.py index 40fab5ec11d7d..5cd66d8a874c7 100644 --- a/pandas/tests/extension/base/ops.py +++ b/pandas/tests/extension/base/ops.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas.core.dtypes.common import is_string_dtype import pandas as pd @@ -27,13 +29,23 @@ def _get_expected_exception( # The self.obj_bar_exc pattern isn't great in part because it can depend # on op_name or dtypes, but we use it here for backward-compatibility. 
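# With the future string option enabled, object columns become pyarrow-backed
# strings, so an op that used to raise e.g. TypeError may instead surface an
# Arrow-side error; widening the expectation to a tuple keeps the assertion
# valid either way. Roughly (hypothetical example, errors may vary by op):
#
#     with pd.option_context("future.infer_string", True):
#         pd.Series(["a", "b"]) - pd.Series(["c", "d"])
#         # may raise pa.lib.ArrowNotImplementedError instead of TypeError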
if op_name in ["__divmod__", "__rdivmod__"]: - return self.divmod_exc - if isinstance(obj, pd.Series) and isinstance(other, pd.Series): - return self.series_array_exc + result = self.divmod_exc + elif isinstance(obj, pd.Series) and isinstance(other, pd.Series): + result = self.series_array_exc elif isinstance(obj, pd.Series): - return self.series_scalar_exc + result = self.series_scalar_exc else: - return self.frame_scalar_exc + result = self.frame_scalar_exc + + if using_pyarrow_string_dtype() and result is not None: + import pyarrow as pa + + result = ( # type: ignore[assignment] + result, + pa.lib.ArrowNotImplementedError, + NotImplementedError, + ) + return result def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): # In _check_op we check that the result of a pointwise operation diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index 067b401ce2f23..187da89729f0e 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -351,11 +351,11 @@ def test_setitem_preserves_views(self, data): def test_setitem_with_expansion_dataframe_column(self, data, full_indexer): # https://github.com/pandas-dev/pandas/issues/32395 - df = expected = pd.DataFrame({"data": pd.Series(data)}) + df = expected = pd.DataFrame({0: pd.Series(data)}) result = pd.DataFrame(index=df.index) key = full_indexer(df) - result.loc[key, "data"] = df["data"] + result.loc[key, 0] = df[0] tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 5cde5df4bc007..6f33b18b19c51 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -18,6 +18,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + import pandas as pd from pandas import Categorical import pandas._testing as tm @@ -100,7 +102,9 @@ def test_contains(self, data, data_missing): if na_value_obj is na_value: continue assert na_value_obj not in data - assert na_value_obj in data_missing # this line differs from super method + # this section suffers from super method + if not using_pyarrow_string_dtype(): + assert na_value_obj in data_missing def test_empty(self, dtype): cls = dtype.construct_array_type() diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index f1939ea174841..c0692064cfaec 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -196,7 +196,7 @@ def test_series_constructor_scalar_with_index(self, data, dtype): class TestDtype(BaseNumPyTests, base.BaseDtypeTests): - def test_check_dtype(self, data, request): + def test_check_dtype(self, data, request, using_infer_string): if data.dtype.numpy_dtype == "object": request.applymarker( pytest.mark.xfail( @@ -429,7 +429,7 @@ def test_setitem_with_expansion_dataframe_column(self, data, full_indexer): if data.dtype.numpy_dtype != object: if not isinstance(key, slice) or key != slice(None): expected = pd.DataFrame({"data": data.to_numpy()}) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_column_type=False) @skip_nested From 124b671787bdf726274af3ce225c79a8090c9bf0 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 8 Dec 2023 15:39:11 -0800 Subject: [PATCH 21/63] DEPR: Index.insert dtype-inference (#55257) --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/indexes/base.py | 18 ++++++++++++++---- 
pandas/core/indexing.py | 19 +++++++++++++++++-- pandas/core/internals/managers.py | 10 ++++++++-- pandas/tests/indexes/test_old_base.py | 11 +++++++++-- 5 files changed, 49 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index a273a2c054c90..55e40b3811d28 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -454,6 +454,7 @@ Other Deprecations - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_string` except ``buf``. (:issue:`54229`) - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_xml` except ``path_or_buffer``. (:issue:`54229`) - Deprecated allowing passing :class:`BlockManager` objects to :class:`DataFrame` or :class:`SingleBlockManager` objects to :class:`Series` (:issue:`52419`) +- Deprecated behavior of :meth:`Index.insert` with an object-dtype index silently performing type inference on the result, explicitly call ``result.infer_objects(copy=False)`` for the old behavior instead (:issue:`51363`) - Deprecated downcasting behavior in :meth:`Series.where`, :meth:`DataFrame.where`, :meth:`Series.mask`, :meth:`DataFrame.mask`, :meth:`Series.clip`, :meth:`DataFrame.clip`; in a future version these will not infer object-dtype columns to non-object dtype, or all-round floats to integer dtype. Call ``result.infer_objects(copy=False)`` on the result for object inference, or explicitly cast floats to ints. To opt in to the future version, use ``pd.set_option("future.no_silent_downcasting", True)`` (:issue:`53656`) - Deprecated including the groups in computations when using :meth:`.DataFrameGroupBy.apply` and :meth:`.DataFrameGroupBy.resample`; pass ``include_groups=False`` to exclude the groups (:issue:`7155`) - Deprecated indexing an :class:`Index` with a boolean indexer of length zero (:issue:`55820`) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 6c9f93d3482a7..3abe77b97fe58 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -6939,14 +6939,24 @@ def insert(self, loc: int, item) -> Index: loc = loc if loc >= 0 else loc - 1 new_values[loc] = item - idx = Index._with_infer(new_values, name=self.name) + out = Index._with_infer(new_values, name=self.name) if ( using_pyarrow_string_dtype() - and is_string_dtype(idx.dtype) + and is_string_dtype(out.dtype) and new_values.dtype == object ): - idx = idx.astype(new_values.dtype) - return idx + out = out.astype(new_values.dtype) + if self.dtype == object and out.dtype != object: + # GH#51363 + warnings.warn( + "The behavior of Index.insert with object-dtype is deprecated, " + "in a future version this will return an object-dtype Index " + "instead of inferring a non-object dtype. To retain the old " + "behavior, do `idx.insert(loc, item).infer_objects(copy=False)`", + FutureWarning, + stacklevel=find_stack_level(), + ) + return out def drop( self, diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index e3928621a4e48..c233295b25700 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1893,7 +1893,15 @@ def _setitem_with_indexer(self, indexer, value, name: str = "iloc"): # just replacing the block manager here # so the object is the same index = self.obj._get_axis(i) - labels = index.insert(len(index), key) + with warnings.catch_warnings(): + # TODO: re-issue this with setitem-specific message? 
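# Setitem-with-expansion reaches Index.insert internally, so without this
# filter a plain ``df.loc[new_key] = value`` would surface the new deprecation.
# What the warning looks like when Index.insert is called directly (sketch,
# message abbreviated from the hunk in pandas/core/indexes/base.py above):
#
#     idx = pd.Index([True, False], dtype=object)
#     idx.insert(0, False)
#     # FutureWarning: The behavior of Index.insert with object-dtype is
#     # deprecated ... do `idx.insert(loc, item).infer_objects(copy=False)`
#     # to retain the old behavior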
+ warnings.filterwarnings( + "ignore", + "The behavior of Index.insert with object-dtype " + "is deprecated", + category=FutureWarning, + ) + labels = index.insert(len(index), key) # We are expanding the Series/DataFrame values to match # the length of thenew index `labels`. GH#40096 ensure @@ -2186,7 +2194,14 @@ def _setitem_with_indexer_missing(self, indexer, value): # and set inplace if self.ndim == 1: index = self.obj.index - new_index = index.insert(len(index), indexer) + with warnings.catch_warnings(): + # TODO: re-issue this with setitem-specific message? + warnings.filterwarnings( + "ignore", + "The behavior of Index.insert with object-dtype is deprecated", + category=FutureWarning, + ) + new_index = index.insert(len(index), indexer) # we have a coerced indexer, e.g. a float # that matches in an int64 Index, so diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index cc88312d5b58f..6eb4099b4d830 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1376,8 +1376,14 @@ def insert(self, loc: int, item: Hashable, value: ArrayLike, refs=None) -> None: value : np.ndarray or ExtensionArray refs : The reference tracking object of the value to set. """ - # insert to the axis; this could possibly raise a TypeError - new_axis = self.items.insert(loc, item) + with warnings.catch_warnings(): + # TODO: re-issue this with setitem-specific message? + warnings.filterwarnings( + "ignore", + "The behavior of Index.insert with object-dtype is deprecated", + category=FutureWarning, + ) + new_axis = self.items.insert(loc, item) if value.ndim == 2: value = value.T diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index f08de8e65451c..0fff6abcfc6a5 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -407,13 +407,20 @@ def test_where(self, listlike_box, simple_index): tm.assert_index_equal(result, expected) def test_insert_base(self, index): - result = index[1:4] + trimmed = index[1:4] if not len(index): pytest.skip("Not applicable for empty index") # test 0th element - assert index[0:4].equals(result.insert(0, index[0])) + warn = None + if index.dtype == object and index.inferred_type == "boolean": + # GH#51363 + warn = FutureWarning + msg = "The behavior of Index.insert with object-dtype is deprecated" + with tm.assert_produces_warning(warn, match=msg): + result = trimmed.insert(0, index[0]) + assert index[0:4].equals(result) def test_insert_out_of_bounds(self, index): # TypeError/IndexError matches what np.insert raises in these cases From 9893c437648f3c675016fec035e640d9c92b4806 Mon Sep 17 00:00:00 2001 From: rohanjain101 <38412262+rohanjain101@users.noreply.github.com> Date: Fri, 8 Dec 2023 18:41:56 -0500 Subject: [PATCH 22/63] Series.str.find fix for arrow strings when start < 0 (#56412) --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/arrays/arrow/array.py | 3 ++- pandas/tests/extension/test_arrow.py | 10 +++++++++- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 55e40b3811d28..8a1906d20c243 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -579,6 +579,7 @@ Strings - Bug in :meth:`DataFrame.apply` failing when ``engine="numba"`` and columns or index have ``StringDtype`` (:issue:`56189`) - Bug in :meth:`Index.str.cat` always casting result to object dtype (:issue:`56157`) - Bug in :meth:`Series.__mul__` for :class:`ArrowDtype` 
with ``pyarrow.string`` dtype and ``string[pyarrow]`` for the pyarrow backend (:issue:`51970`) +- Bug in :meth:`Series.str.find` when ``start < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56411`) - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56404`) - Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for ``string[pyarrow]`` (:issue:`54942`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index ae6942db11fae..5abdfe69e52c0 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2193,7 +2193,8 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None): slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end) result = pc.find_substring(slices, sub) not_found = pc.equal(result, -1) - offset_result = pc.add(result, end - start) + start_offset = max(0, start) + offset_result = pc.add(result, start_offset) result = pc.if_else(not_found, result, offset_result) elif start == 0 and end is None: slices = self._pa_array diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 1941e359299b6..75e5fb00586e6 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1835,7 +1835,7 @@ def test_str_fullmatch(pat, case, na, exp): @pytest.mark.parametrize( "sub, start, end, exp, exp_typ", - [["ab", 0, None, [0, None], pa.int32()], ["bc", 1, 3, [2, None], pa.int64()]], + [["ab", 0, None, [0, None], pa.int32()], ["bc", 1, 3, [1, None], pa.int64()]], ) def test_str_find(sub, start, end, exp, exp_typ): ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string())) @@ -1844,6 +1844,14 @@ def test_str_find(sub, start, end, exp, exp_typ): tm.assert_series_equal(result, expected) +def test_str_find_negative_start(): + # GH 56411 + ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string())) + result = ser.str.find(sub="b", start=-1000, end=3) + expected = pd.Series([1, None], dtype=ArrowDtype(pa.int64())) + tm.assert_series_equal(result, expected) + + def test_str_find_notimplemented(): ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string())) with pytest.raises(NotImplementedError, match="find not implemented"): From 114f067a93c0e2c120d6538696d32d46a94f8eb8 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 9 Dec 2023 02:00:55 +0100 Subject: [PATCH 23/63] CLN: Remove unnecessary copy keyword (#56420) * CLN: Remove unnecessary copy keyword * CLN: Remove unnecessary copy keyword * Fixup --- pandas/_testing/asserters.py | 4 +- pandas/core/frame.py | 4 +- pandas/core/groupby/generic.py | 2 +- pandas/core/internals/managers.py | 44 +++++-------------- .../frame/constructors/test_from_records.py | 4 +- .../frame/methods/test_to_dict_of_blocks.py | 18 +------- pandas/tests/internals/test_internals.py | 32 -------------- 7 files changed, 19 insertions(+), 89 deletions(-) diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index 6cad71b3dfd18..d9db2bc5cddb4 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -1206,8 +1206,8 @@ def assert_frame_equal( # compare by blocks if by_blocks: - rblocks = right._to_dict_of_blocks(copy=False) - lblocks = left._to_dict_of_blocks(copy=False) + rblocks = right._to_dict_of_blocks() + lblocks = left._to_dict_of_blocks() for dtype in list(set(list(lblocks.keys()) + 
list(rblocks.keys()))): assert dtype in lblocks assert dtype in rblocks diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 179279cc08bab..24b7951e3bb85 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -12484,7 +12484,7 @@ def isin_(x): # ---------------------------------------------------------------------- # Internal Interface Methods - def _to_dict_of_blocks(self, copy: bool = True): + def _to_dict_of_blocks(self): """ Return a dict of dtype -> Constructor Types that each is a homogeneous dtype. @@ -12496,7 +12496,7 @@ def _to_dict_of_blocks(self, copy: bool = True): mgr = cast(BlockManager, mgr_to_mgr(mgr, "block")) return { k: self._constructor_from_mgr(v, axes=v.axes).__finalize__(self) - for k, v, in mgr.to_dict(copy=copy).items() + for k, v, in mgr.to_dict().items() } @property diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 5a2f8d8454526..204083ac6c04e 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -2012,7 +2012,7 @@ def _get_data_to_aggregate( mgr = obj._mgr if numeric_only: - mgr = mgr.get_numeric_data(copy=False) + mgr = mgr.get_numeric_data() return mgr def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 6eb4099b4d830..14d05c59272e8 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -495,17 +495,12 @@ def is_view(self) -> bool: def _get_data_subset(self, predicate: Callable) -> Self: blocks = [blk for blk in self.blocks if predicate(blk.values)] - return self._combine(blocks, copy=False) + return self._combine(blocks) - def get_bool_data(self, copy: bool = False) -> Self: + def get_bool_data(self) -> Self: """ Select blocks that are bool-dtype and columns from object-dtype blocks that are all-bool. 
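        Under copy-on-write the selected blocks are returned as shallow
        copies (``_combine`` uses ``b.copy(deep=False)``), so callers no
        longer need an eager ``copy`` keyword here.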
- - Parameters - ---------- - copy : bool, default False - Whether to copy the blocks """ new_blocks = [] @@ -518,26 +513,16 @@ def get_bool_data(self, copy: bool = False) -> Self: nbs = blk._split() new_blocks.extend(nb for nb in nbs if nb.is_bool) - return self._combine(new_blocks, copy) + return self._combine(new_blocks) - def get_numeric_data(self, copy: bool = False) -> Self: - """ - Parameters - ---------- - copy : bool, default False - Whether to copy the blocks - """ + def get_numeric_data(self) -> Self: numeric_blocks = [blk for blk in self.blocks if blk.is_numeric] if len(numeric_blocks) == len(self.blocks): # Avoid somewhat expensive _combine - if copy: - return self.copy(deep=True) return self - return self._combine(numeric_blocks, copy) + return self._combine(numeric_blocks) - def _combine( - self, blocks: list[Block], copy: bool = True, index: Index | None = None - ) -> Self: + def _combine(self, blocks: list[Block], index: Index | None = None) -> Self: """return a new manager with the blocks""" if len(blocks) == 0: if self.ndim == 2: @@ -554,11 +539,8 @@ def _combine( inv_indexer = lib.get_reverse_indexer(indexer, self.shape[0]) new_blocks: list[Block] = [] - # TODO(CoW) we could optimize here if we know that the passed blocks - # are fully "owned" (eg created from an operation, not coming from - # an existing manager) for b in blocks: - nb = b.copy(deep=copy) + nb = b.copy(deep=False) nb.mgr_locs = BlockPlacement(inv_indexer[nb.mgr_locs.indexer]) new_blocks.append(nb) @@ -1636,14 +1618,10 @@ def unstack(self, unstacker, fill_value) -> BlockManager: bm = BlockManager(new_blocks, [new_columns, new_index], verify_integrity=False) return bm - def to_dict(self, copy: bool = True) -> dict[str, Self]: + def to_dict(self) -> dict[str, Self]: """ Return a dict of str(dtype) -> BlockManager - Parameters - ---------- - copy : bool, default True - Returns ------- values : a dict of dtype -> BlockManager @@ -1654,7 +1632,7 @@ def to_dict(self, copy: bool = True) -> dict[str, Self]: bd.setdefault(str(b.dtype), []).append(b) # TODO(EA2D): the combine will be unnecessary with 2D EAs - return {dtype: self._combine(blocks, copy=copy) for dtype, blocks in bd.items()} + return {dtype: self._combine(blocks) for dtype, blocks in bd.items()} def as_array( self, @@ -2034,9 +2012,9 @@ def array_values(self) -> ExtensionArray: """The array that Series.array returns""" return self._block.array_values - def get_numeric_data(self, copy: bool = False) -> Self: + def get_numeric_data(self) -> Self: if self._block.is_numeric: - return self.copy(deep=copy) + return self.copy(deep=False) return self.make_empty() @property diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py index bcf4e8fb0e64a..edb21fb92f6a2 100644 --- a/pandas/tests/frame/constructors/test_from_records.py +++ b/pandas/tests/frame/constructors/test_from_records.py @@ -80,7 +80,7 @@ def test_from_records_sequencelike(self): # this is actually tricky to create the recordlike arrays and # have the dtypes be intact - blocks = df._to_dict_of_blocks(copy=False) + blocks = df._to_dict_of_blocks() tuples = [] columns = [] dtypes = [] @@ -169,7 +169,7 @@ def test_from_records_dictlike(self): # columns is in a different order here than the actual items iterated # from the dict - blocks = df._to_dict_of_blocks(copy=False) + blocks = df._to_dict_of_blocks() columns = [] for b in blocks.values(): columns.extend(b.columns) diff --git a/pandas/tests/frame/methods/test_to_dict_of_blocks.py 
b/pandas/tests/frame/methods/test_to_dict_of_blocks.py index 28973fe0d7900..f7d9dc914a2ee 100644 --- a/pandas/tests/frame/methods/test_to_dict_of_blocks.py +++ b/pandas/tests/frame/methods/test_to_dict_of_blocks.py @@ -14,22 +14,6 @@ class TestToDictOfBlocks: - def test_copy_blocks(self, float_frame): - # GH#9607 - df = DataFrame(float_frame, copy=True) - column = df.columns[0] - - # use the default copy=True, change a column - _last_df = None - blocks = df._to_dict_of_blocks(copy=True) - for _df in blocks.values(): - _last_df = _df - if column in _df: - _df.loc[:, column] = _df[column] + 1 - - # make sure we did not change the original DataFrame - assert _last_df is not None and not _last_df[column].equals(df[column]) - @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") def test_no_copy_blocks(self, float_frame, using_copy_on_write): # GH#9607 @@ -38,7 +22,7 @@ def test_no_copy_blocks(self, float_frame, using_copy_on_write): _last_df = None # use the copy=False, change a column - blocks = df._to_dict_of_blocks(copy=False) + blocks = df._to_dict_of_blocks() for _df in blocks.values(): _last_df = _df if column in _df: diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index a9dbdb21f59fb..ce88bae6e02f2 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -790,24 +790,6 @@ def test_get_numeric_data(self, using_copy_on_write): np.array([100.0, 200.0, 300.0]), ) - numeric2 = mgr.get_numeric_data(copy=True) - tm.assert_index_equal(numeric.items, Index(["int", "float", "complex", "bool"])) - numeric2.iset( - numeric2.items.get_loc("float"), - np.array([1000.0, 2000.0, 3000.0]), - inplace=True, - ) - if using_copy_on_write: - tm.assert_almost_equal( - mgr.iget(mgr.items.get_loc("float")).internal_values(), - np.array([1.0, 1.0, 1.0]), - ) - else: - tm.assert_almost_equal( - mgr.iget(mgr.items.get_loc("float")).internal_values(), - np.array([100.0, 200.0, 300.0]), - ) - def test_get_bool_data(self, using_copy_on_write): mgr = create_mgr( "int: int; float: float; complex: complex;" @@ -835,20 +817,6 @@ def test_get_bool_data(self, using_copy_on_write): np.array([True, False, True]), ) - # Check sharing - bools2 = mgr.get_bool_data(copy=True) - bools2.iset(0, np.array([False, True, False])) - if using_copy_on_write: - tm.assert_numpy_array_equal( - mgr.iget(mgr.items.get_loc("bool")).internal_values(), - np.array([True, True, True]), - ) - else: - tm.assert_numpy_array_equal( - mgr.iget(mgr.items.get_loc("bool")).internal_values(), - np.array([True, False, True]), - ) - def test_unicode_repr_doesnt_raise(self): repr(create_mgr("b,\u05d0: object")) From aa7b17e1f5d596f41950f85e9f00aa9258b5f2a7 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 8 Dec 2023 15:06:33 -1000 Subject: [PATCH 24/63] BUG: resample with ArrowDtype (#56371) * BUG: resample with ArrowDtype * Typing * xfail for windows * Fix again? 
* Avoid tuple * Add gh numbers --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/groupby/grouper.py | 1 - pandas/core/resample.py | 21 ++++++++++++++-- pandas/tests/resample/test_datetime_index.py | 26 ++++++++++++++++++++ pandas/tests/resample/test_timedelta.py | 11 +++++++++ 5 files changed, 57 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 8a1906d20c243..bb16590b172ef 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -648,6 +648,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.asfreq` and :meth:`Series.asfreq` with a :class:`DatetimeIndex` with non-nanosecond resolution incorrectly converting to nanosecond resolution (:issue:`55958`) - Bug in :meth:`DataFrame.ewm` when passed ``times`` with non-nanosecond ``datetime64`` or :class:`DatetimeTZDtype` dtype (:issue:`56262`) - Bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`) +- Bug in :meth:`DataFrame.resample` when resampling on a :class:`ArrowDtype` of ``pyarrow.timestamp`` or ``pyarrow.duration`` type (:issue:`55989`) - Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`) - Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.MonthBegin` (:issue:`55271`) - diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index fc914831b7a72..4703c12db602d 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -330,7 +330,6 @@ def _get_grouper( return grouper, obj - @final def _set_grouper( self, obj: NDFrameT, sort: bool = False, *, gpr_index: Index | None = None ) -> tuple[NDFrameT, Index, npt.NDArray[np.intp] | None]: diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 31e41acbf1774..7e9aa2d8a9e3c 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -38,6 +38,7 @@ rewrite_warning, ) +from pandas.core.dtypes.dtypes import ArrowDtype from pandas.core.dtypes.generic import ( ABCDataFrame, ABCSeries, @@ -48,6 +49,7 @@ ResamplerWindowApply, warn_alias_replacement, ) +from pandas.core.arrays import ArrowExtensionArray from pandas.core.base import ( PandasObject, SelectionMixin, @@ -68,6 +70,7 @@ from pandas.core.groupby.grouper import Grouper from pandas.core.groupby.ops import BinGrouper from pandas.core.indexes.api import MultiIndex +from pandas.core.indexes.base import Index from pandas.core.indexes.datetimes import ( DatetimeIndex, date_range, @@ -109,7 +112,6 @@ from pandas import ( DataFrame, - Index, Series, ) @@ -511,6 +513,9 @@ def _wrap_result(self, result): result.index = _asfreq_compat(obj.index[:0], freq=self.freq) result.name = getattr(obj, "name", None) + if self._timegrouper._arrow_dtype is not None: + result.index = result.index.astype(self._timegrouper._arrow_dtype) + return result @final @@ -2163,6 +2168,7 @@ def __init__( self.fill_method = fill_method self.limit = limit self.group_keys = group_keys + self._arrow_dtype: ArrowDtype | None = None if origin in ("epoch", "start", "start_day", "end", "end_day"): # error: Incompatible types in assignment (expression has type "Union[Union[ @@ -2213,7 +2219,7 @@ def _get_resampler(self, obj: NDFrame, kind=None) -> Resampler: TypeError if incompatible axis """ - _, ax, indexer = self._set_grouper(obj, gpr_index=None) + _, ax, _ = self._set_grouper(obj, gpr_index=None) if 
isinstance(ax, DatetimeIndex): return DatetimeIndexResampler( obj, @@ -2495,6 +2501,17 @@ def _get_period_bins(self, ax: PeriodIndex): return binner, bins, labels + def _set_grouper( + self, obj: NDFrameT, sort: bool = False, *, gpr_index: Index | None = None + ) -> tuple[NDFrameT, Index, npt.NDArray[np.intp] | None]: + obj, ax, indexer = super()._set_grouper(obj, sort, gpr_index=gpr_index) + if isinstance(ax.dtype, ArrowDtype) and ax.dtype.kind in "Mm": + self._arrow_dtype = ax.dtype + ax = Index( + cast(ArrowExtensionArray, ax.array)._maybe_convert_datelike_array() + ) + return obj, ax, indexer + def _take_new_index( obj: NDFrameT, indexer: npt.NDArray[np.intp], new_index: Index, axis: AxisInt = 0 diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 8a725c6e51e3f..760ed35bab678 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -7,6 +7,8 @@ from pandas._libs import lib from pandas._typing import DatetimeNaTType +from pandas.compat import is_platform_windows +import pandas.util._test_decorators as td import pandas as pd from pandas import ( @@ -2195,3 +2197,27 @@ def test_resample_b_55282(unit): index=exp_dti, ) tm.assert_series_equal(result, expected) + + +@td.skip_if_no("pyarrow") +@pytest.mark.parametrize( + "tz", + [ + None, + pytest.param( + "UTC", + marks=pytest.mark.xfail( + condition=is_platform_windows(), + reason="TODO: Set ARROW_TIMEZONE_DATABASE env var in CI", + ), + ), + ], +) +def test_arrow_timestamp_resample(tz): + # GH 56371 + idx = Series(date_range("2020-01-01", periods=5), dtype="timestamp[ns][pyarrow]") + if tz is not None: + idx = idx.dt.tz_localize(tz) + expected = Series(np.arange(5, dtype=np.float64), index=idx) + result = expected.resample("1D").mean() + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/resample/test_timedelta.py b/pandas/tests/resample/test_timedelta.py index 5d6876343a0c9..7c70670d42908 100644 --- a/pandas/tests/resample/test_timedelta.py +++ b/pandas/tests/resample/test_timedelta.py @@ -3,6 +3,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( DataFrame, @@ -207,3 +209,12 @@ def test_resample_closed_right(): ), ) tm.assert_series_equal(result, expected) + + +@td.skip_if_no("pyarrow") +def test_arrow_duration_resample(): + # GH 56371 + idx = pd.Index(timedelta_range("1 day", periods=5), dtype="duration[ns][pyarrow]") + expected = Series(np.arange(5, dtype=np.float64), index=idx) + result = expected.resample("1D").mean() + tm.assert_series_equal(result, expected) From 726e8e8a61cb3f103bc89e773089645a748ce4a4 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 9 Dec 2023 02:08:03 +0100 Subject: [PATCH 25/63] BUG: reindex not matching categoricals and new string dtypes (#56106) * Fix string option tests in indexing * Update v2.1.4.rst * Fixup * Update whatsnew --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/indexes/base.py | 13 ++++- pandas/tests/indexing/test_categorical.py | 69 +++++++++++++---------- 3 files changed, 51 insertions(+), 32 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index bb16590b172ef..7a0075ab88a3a 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -577,6 +577,7 @@ Strings ^^^^^^^ - Bug in :func:`pandas.api.types.is_string_dtype` while checking object array with no elements is of the string 
dtype (:issue:`54661`) - Bug in :meth:`DataFrame.apply` failing when ``engine="numba"`` and columns or index have ``StringDtype`` (:issue:`56189`) +- Bug in :meth:`DataFrame.reindex` not matching :class:`Index` with ``string[pyarrow_numpy]`` dtype (:issue:`56106`) - Bug in :meth:`Index.str.cat` always casting result to object dtype (:issue:`56157`) - Bug in :meth:`Series.__mul__` for :class:`ArrowDtype` with ``pyarrow.string`` dtype and ``string[pyarrow]`` for the pyarrow backend (:issue:`51970`) - Bug in :meth:`Series.str.find` when ``start < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56411`) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 3abe77b97fe58..3d3056f47f15e 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -159,7 +159,10 @@ ExtensionArray, TimedeltaArray, ) -from pandas.core.arrays.string_ import StringArray +from pandas.core.arrays.string_ import ( + StringArray, + StringDtype, +) from pandas.core.base import ( IndexOpsMixin, PandasObject, @@ -5574,6 +5577,14 @@ def equals(self, other: Any) -> bool: # quickly return if the lengths are different return False + if ( + isinstance(self.dtype, StringDtype) + and self.dtype.storage == "pyarrow_numpy" + and other.dtype != self.dtype + ): + # special case for object behavior + return other.equals(self.astype(object)) + if is_object_dtype(self.dtype) and not is_object_dtype(other.dtype): # if other is not object, use other's logic for coercion return other.equals(self) diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 6f0ef0b357269..1b58f8e8b9831 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -3,6 +3,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( Categorical, @@ -14,6 +16,7 @@ Series, Timedelta, Timestamp, + option_context, ) import pandas._testing as tm @@ -428,38 +431,42 @@ def test_ix_categorical_index(self): expect = DataFrame(df.loc[:, ["X", "Y"]], index=cdf.index, columns=exp_columns) tm.assert_frame_equal(cdf.loc[:, ["X", "Y"]], expect) - def test_ix_categorical_index_non_unique(self): + @pytest.mark.parametrize( + "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] + ) + def test_ix_categorical_index_non_unique(self, infer_string): # non-unique - df = DataFrame( - np.random.default_rng(2).standard_normal((3, 3)), - index=list("ABA"), - columns=list("XYX"), - ) - cdf = df.copy() - cdf.index = CategoricalIndex(df.index) - cdf.columns = CategoricalIndex(df.columns) - - exp_index = CategoricalIndex(list("AA"), categories=["A", "B"]) - expect = DataFrame(df.loc["A", :], columns=cdf.columns, index=exp_index) - tm.assert_frame_equal(cdf.loc["A", :], expect) - - exp_columns = CategoricalIndex(list("XX"), categories=["X", "Y"]) - expect = DataFrame(df.loc[:, "X"], index=cdf.index, columns=exp_columns) - tm.assert_frame_equal(cdf.loc[:, "X"], expect) - - expect = DataFrame( - df.loc[["A", "B"], :], - columns=cdf.columns, - index=CategoricalIndex(list("AAB")), - ) - tm.assert_frame_equal(cdf.loc[["A", "B"], :], expect) - - expect = DataFrame( - df.loc[:, ["X", "Y"]], - index=cdf.index, - columns=CategoricalIndex(list("XXY")), - ) - tm.assert_frame_equal(cdf.loc[:, ["X", "Y"]], expect) + with option_context("future.infer_string", infer_string): + df = DataFrame( + np.random.default_rng(2).standard_normal((3, 3)), + index=list("ABA"), + 
columns=list("XYX"), + ) + cdf = df.copy() + cdf.index = CategoricalIndex(df.index) + cdf.columns = CategoricalIndex(df.columns) + + exp_index = CategoricalIndex(list("AA"), categories=["A", "B"]) + expect = DataFrame(df.loc["A", :], columns=cdf.columns, index=exp_index) + tm.assert_frame_equal(cdf.loc["A", :], expect) + + exp_columns = CategoricalIndex(list("XX"), categories=["X", "Y"]) + expect = DataFrame(df.loc[:, "X"], index=cdf.index, columns=exp_columns) + tm.assert_frame_equal(cdf.loc[:, "X"], expect) + + expect = DataFrame( + df.loc[["A", "B"], :], + columns=cdf.columns, + index=CategoricalIndex(list("AAB")), + ) + tm.assert_frame_equal(cdf.loc[["A", "B"], :], expect) + + expect = DataFrame( + df.loc[:, ["X", "Y"]], + index=cdf.index, + columns=CategoricalIndex(list("XXY")), + ) + tm.assert_frame_equal(cdf.loc[:, ["X", "Y"]], expect) def test_loc_slice(self, df): # GH9748 From 1ab4d031db3bdfe30ebe385f019946f930099543 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Fri, 8 Dec 2023 17:17:48 -0800 Subject: [PATCH 26/63] Compiled pandas with -Wextra (#56327) * Compiled pandas with -Wextra * np_datetime_strings.c fallthrough * test fixes * win compat * win compat again? * size_t arg fixup * casts * size_t fixes * more size_t fix * json fixes * builtin Py_UNUSED --- meson.build | 3 +- .../include/pandas/datetime/pd_datetime.h | 2 +- pandas/_libs/include/pandas/portable.h | 12 ++ .../numpy/datetime/np_datetime_strings.h | 2 +- pandas/_libs/src/datetime/pd_datetime.c | 3 +- pandas/_libs/src/parser/pd_parser.c | 2 +- pandas/_libs/src/parser/tokenizer.c | 36 +++--- .../src/vendored/numpy/datetime/np_datetime.c | 119 +++++++++--------- .../numpy/datetime/np_datetime_strings.c | 18 ++- .../src/vendored/ujson/lib/ultrajsonenc.c | 3 + .../src/vendored/ujson/python/JSONtoObj.c | 46 ++++--- .../src/vendored/ujson/python/objToJSON.c | 76 +++++------ .../_libs/src/vendored/ujson/python/ujson.c | 6 +- 13 files changed, 189 insertions(+), 139 deletions(-) diff --git a/meson.build b/meson.build index 0bc04c59d8716..06623a305ab54 100644 --- a/meson.build +++ b/meson.build @@ -7,7 +7,8 @@ project( meson_version: '>=1.2.1', default_options: [ 'buildtype=release', - 'c_std=c11' + 'c_std=c11', + 'warning_level=2', ] ) diff --git a/pandas/_libs/include/pandas/datetime/pd_datetime.h b/pandas/_libs/include/pandas/datetime/pd_datetime.h index a51f8cea71513..98e5521af2506 100644 --- a/pandas/_libs/include/pandas/datetime/pd_datetime.h +++ b/pandas/_libs/include/pandas/datetime/pd_datetime.h @@ -50,7 +50,7 @@ typedef struct { NPY_DATETIMEUNIT *, int *, int *, const char *, int, FormatRequirement); int (*get_datetime_iso_8601_strlen)(int, NPY_DATETIMEUNIT); - int (*make_iso_8601_datetime)(npy_datetimestruct *, char *, int, int, + int (*make_iso_8601_datetime)(npy_datetimestruct *, char *, size_t, int, NPY_DATETIMEUNIT); int (*make_iso_8601_timedelta)(pandas_timedeltastruct *, char *, size_t *); } PandasDateTime_CAPI; diff --git a/pandas/_libs/include/pandas/portable.h b/pandas/_libs/include/pandas/portable.h index be9080172fe42..1d0509d9e9724 100644 --- a/pandas/_libs/include/pandas/portable.h +++ b/pandas/_libs/include/pandas/portable.h @@ -23,3 +23,15 @@ The full license is in the LICENSE file, distributed with this software. #define isspace_ascii(c) (((c) == ' ') || (((unsigned)(c) - '\t') < 5)) #define toupper_ascii(c) ((((unsigned)(c) - 'a') < 26) ? ((c) & 0x5f) : (c)) #define tolower_ascii(c) ((((unsigned)(c) - 'A') < 26) ? 
((c) | 0x20) : (c)) + +#if defined(_WIN32) +#define PD_FALLTHROUGH \ + do { \ + } while (0) /* fallthrough */ +#elif __has_attribute(__fallthrough__) +#define PD_FALLTHROUGH __attribute__((__fallthrough__)) +#else +#define PD_FALLTHROUGH \ + do { \ + } while (0) /* fallthrough */ +#endif diff --git a/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime_strings.h b/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime_strings.h index d96ca79d70cb7..75e69f30ada1e 100644 --- a/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime_strings.h +++ b/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime_strings.h @@ -85,7 +85,7 @@ int get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base); * Returns 0 on success, -1 on failure (for example if the output * string was too short). */ -int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, +int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, size_t outlen, int utc, NPY_DATETIMEUNIT base); /* diff --git a/pandas/_libs/src/datetime/pd_datetime.c b/pandas/_libs/src/datetime/pd_datetime.c index 606edf1184aad..19de51be6e1b2 100644 --- a/pandas/_libs/src/datetime/pd_datetime.c +++ b/pandas/_libs/src/datetime/pd_datetime.c @@ -21,6 +21,7 @@ This file is derived from NumPy 1.7. See NUMPY_LICENSE.txt #include "datetime.h" #include "pandas/datetime/pd_datetime.h" +#include "pandas/portable.h" static void pandas_datetime_destructor(PyObject *op) { void *ptr = PyCapsule_GetPointer(op, PandasDateTime_CAPSULE_NAME); @@ -188,7 +189,7 @@ static npy_datetime PyDateTimeToEpoch(PyObject *dt, NPY_DATETIMEUNIT base) { return npy_dt; } -static int pandas_datetime_exec(PyObject *module) { +static int pandas_datetime_exec(PyObject *Py_UNUSED(module)) { PyDateTime_IMPORT; PandasDateTime_CAPI *capi = PyMem_Malloc(sizeof(PandasDateTime_CAPI)); if (capi == NULL) { diff --git a/pandas/_libs/src/parser/pd_parser.c b/pandas/_libs/src/parser/pd_parser.c index 2e16f5b756dd0..48f3cd14cbc30 100644 --- a/pandas/_libs/src/parser/pd_parser.c +++ b/pandas/_libs/src/parser/pd_parser.c @@ -100,7 +100,7 @@ static void pandas_parser_destructor(PyObject *op) { PyMem_Free(ptr); } -static int pandas_parser_exec(PyObject *module) { +static int pandas_parser_exec(PyObject *Py_UNUSED(module)) { PandasParser_CAPI *capi = PyMem_Malloc(sizeof(PandasParser_CAPI)); if (capi == NULL) { PyErr_NoMemory(); diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 74a3a51c61e15..0e4188bea4dc7 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -795,7 +795,7 @@ static int tokenize_bytes(parser_t *self, size_t line_limit, break; } else if (!isblank(c)) { self->state = START_FIELD; - // fall through to subsequent state + PD_FALLTHROUGH; // fall through to subsequent state } else { // if whitespace char, keep slurping break; @@ -849,12 +849,12 @@ static int tokenize_bytes(parser_t *self, size_t line_limit, self->state = WHITESPACE_LINE; break; } - // fall through } // normal character - fall through // to handle as START_FIELD self->state = START_FIELD; + PD_FALLTHROUGH; } case START_FIELD: // expecting field @@ -1130,10 +1130,10 @@ int parser_consume_rows(parser_t *self, size_t nrows) { /* if word_deletions == 0 (i.e. this case) then char_count must * be 0 too, as no data needs to be skipped */ - const int64_t char_count = word_deletions >= 1 - ? 
(self->word_starts[word_deletions - 1] + - strlen(self->words[word_deletions - 1]) + 1) - : 0; + const uint64_t char_count = + word_deletions >= 1 ? (self->word_starts[word_deletions - 1] + + strlen(self->words[word_deletions - 1]) + 1) + : 0; TRACE(("parser_consume_rows: Deleting %d words, %d chars\n", word_deletions, char_count)); @@ -1415,9 +1415,11 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, int negative = 0; switch (*p) { case '-': - negative = 1; // Fall through to increment position. + negative = 1; + PD_FALLTHROUGH; // Fall through to increment position. case '+': p++; + break; } int exponent = 0; @@ -1485,9 +1487,11 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, negative = 0; switch (*++p) { case '-': - negative = 1; // Fall through to increment pos. + negative = 1; + PD_FALLTHROUGH; // Fall through to increment position. case '+': p++; + break; } // Process string of digits. @@ -1595,9 +1599,11 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, int negative = 0; switch (*p) { case '-': - negative = 1; // Fall through to increment position. + negative = 1; + PD_FALLTHROUGH; // Fall through to increment position. case '+': p++; + break; } double number = 0.; @@ -1656,9 +1662,11 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, negative = 0; switch (*++p) { case '-': - negative = 1; // Fall through to increment pos. + negative = 1; + PD_FALLTHROUGH; // Fall through to increment position. case '+': p++; + break; } // Process string of digits. @@ -1762,8 +1770,8 @@ static char *_str_copy_decimal_str_c(const char *s, char **endpos, char decimal, return s_copy; } -double round_trip(const char *p, char **q, char decimal, char sci, char tsep, - int skip_trailing, int *error, int *maybe_int) { +double round_trip(const char *p, char **q, char decimal, char Py_UNUSED(sci), + char tsep, int skip_trailing, int *error, int *maybe_int) { // 'normalize' representation to C-locale; replace decimal with '.' and // remove thousands separator. 
char *endptr; @@ -1975,7 +1983,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, break; } if ((number < pre_max) || - ((number == pre_max) && (d - '0' <= dig_pre_max))) { + ((number == pre_max) && ((uint64_t)(d - '0') <= dig_pre_max))) { number = number * 10 + (d - '0'); d = *++p; @@ -1987,7 +1995,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, } else { while (isdigit_ascii(d)) { if ((number < pre_max) || - ((number == pre_max) && (d - '0' <= dig_pre_max))) { + ((number == pre_max) && ((uint64_t)(d - '0') <= dig_pre_max))) { number = number * 10 + (d - '0'); d = *++p; diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c index ab24752203c57..06e3251db8315 100644 --- a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c +++ b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c @@ -243,7 +243,7 @@ static void set_datetimestruct_days(npy_int64 days, npy_datetimestruct *dts) { for (i = 0; i < 12; ++i) { if (days < month_lengths[i]) { dts->month = i + 1; - dts->day = days + 1; + dts->day = (npy_int32)days + 1; return; } else { days -= month_lengths[i]; @@ -568,7 +568,7 @@ void pandas_datetime_to_datetimestruct(npy_datetime dt, NPY_DATETIMEUNIT base, case NPY_FR_M: out->year = 1970 + extract_unit(&dt, 12); - out->month = dt + 1; + out->month = (npy_int32)dt + 1; break; case NPY_FR_W: @@ -584,72 +584,72 @@ void pandas_datetime_to_datetimestruct(npy_datetime dt, NPY_DATETIMEUNIT base, perday = 24LL; set_datetimestruct_days(extract_unit(&dt, perday), out); - out->hour = dt; + out->hour = (npy_int32)dt; break; case NPY_FR_m: perday = 24LL * 60; set_datetimestruct_days(extract_unit(&dt, perday), out); - out->hour = (int)extract_unit(&dt, 60); - out->min = (int)dt; + out->hour = (npy_int32)extract_unit(&dt, 60); + out->min = (npy_int32)dt; break; case NPY_FR_s: perday = 24LL * 60 * 60; set_datetimestruct_days(extract_unit(&dt, perday), out); - out->hour = (int)extract_unit(&dt, 60 * 60); - out->min = (int)extract_unit(&dt, 60); - out->sec = (int)dt; + out->hour = (npy_int32)extract_unit(&dt, 60 * 60); + out->min = (npy_int32)extract_unit(&dt, 60); + out->sec = (npy_int32)dt; break; case NPY_FR_ms: perday = 24LL * 60 * 60 * 1000; set_datetimestruct_days(extract_unit(&dt, perday), out); - out->hour = (int)extract_unit(&dt, 1000LL * 60 * 60); - out->min = (int)extract_unit(&dt, 1000LL * 60); - out->sec = (int)extract_unit(&dt, 1000LL); - out->us = (int)(dt * 1000); + out->hour = (npy_int32)extract_unit(&dt, 1000LL * 60 * 60); + out->min = (npy_int32)extract_unit(&dt, 1000LL * 60); + out->sec = (npy_int32)extract_unit(&dt, 1000LL); + out->us = (npy_int32)(dt * 1000); break; case NPY_FR_us: perday = 24LL * 60LL * 60LL * 1000LL * 1000LL; set_datetimestruct_days(extract_unit(&dt, perday), out); - out->hour = (int)extract_unit(&dt, 1000LL * 1000 * 60 * 60); - out->min = (int)extract_unit(&dt, 1000LL * 1000 * 60); - out->sec = (int)extract_unit(&dt, 1000LL * 1000); - out->us = (int)dt; + out->hour = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 60 * 60); + out->min = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 60); + out->sec = (npy_int32)extract_unit(&dt, 1000LL * 1000); + out->us = (npy_int32)dt; break; case NPY_FR_ns: perday = 24LL * 60LL * 60LL * 1000LL * 1000LL * 1000LL; set_datetimestruct_days(extract_unit(&dt, perday), out); - out->hour = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 60 * 60); - out->min = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 60); - 
out->sec = (int)extract_unit(&dt, 1000LL * 1000 * 1000); - out->us = (int)extract_unit(&dt, 1000LL); - out->ps = (int)(dt * 1000); + out->hour = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 60 * 60); + out->min = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 60); + out->sec = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000); + out->us = (npy_int32)extract_unit(&dt, 1000LL); + out->ps = (npy_int32)(dt * 1000); break; case NPY_FR_ps: perday = 24LL * 60 * 60 * 1000 * 1000 * 1000 * 1000; set_datetimestruct_days(extract_unit(&dt, perday), out); - out->hour = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 60 * 60); - out->min = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 60); - out->sec = (int)extract_unit(&dt, 1000LL * 1000 * 1000); - out->us = (int)extract_unit(&dt, 1000LL); - out->ps = (int)(dt * 1000); + out->hour = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 60 * 60); + out->min = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 60); + out->sec = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000); + out->us = (npy_int32)extract_unit(&dt, 1000LL); + out->ps = (npy_int32)(dt * 1000); break; case NPY_FR_fs: /* entire range is only +- 2.6 hours */ - out->hour = - (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * 1000 * 60 * 60); + out->hour = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * + 1000 * 60 * 60); if (out->hour < 0) { out->year = 1969; out->month = 12; @@ -657,17 +657,18 @@ void pandas_datetime_to_datetimestruct(npy_datetime dt, NPY_DATETIMEUNIT base, out->hour += 24; assert(out->hour >= 0); } - out->min = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * 1000 * 60); - out->sec = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * 1000); - out->us = (int)extract_unit(&dt, 1000LL * 1000 * 1000); - out->ps = (int)extract_unit(&dt, 1000LL); - out->as = (int)(dt * 1000); + out->min = + (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * 1000 * 60); + out->sec = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * 1000); + out->us = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000); + out->ps = (npy_int32)extract_unit(&dt, 1000LL); + out->as = (npy_int32)(dt * 1000); break; case NPY_FR_as: /* entire range is only +- 9.2 seconds */ out->sec = - (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * 1000 * 1000); + (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * 1000 * 1000); if (out->sec < 0) { out->year = 1969; out->month = 12; @@ -677,9 +678,9 @@ void pandas_datetime_to_datetimestruct(npy_datetime dt, NPY_DATETIMEUNIT base, out->sec += 60; assert(out->sec >= 0); } - out->us = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000); - out->ps = (int)extract_unit(&dt, 1000LL * 1000); - out->as = (int)dt; + out->us = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000); + out->ps = (npy_int32)extract_unit(&dt, 1000LL * 1000); + out->as = (npy_int32)dt; break; default: @@ -741,21 +742,21 @@ void pandas_timedelta_to_timedeltastruct(npy_timedelta td, } if (frac >= 3600) { - out->hrs = frac / 3600LL; + out->hrs = (npy_int32)(frac / 3600LL); frac -= out->hrs * 3600LL; } else { out->hrs = 0; } if (frac >= 60) { - out->min = frac / 60LL; + out->min = (npy_int32)(frac / 60LL); frac -= out->min * 60LL; } else { out->min = 0; } if (frac >= 0) { - out->sec = frac; + out->sec = (npy_int32)frac; frac -= out->sec; } else { out->sec = 0; @@ -769,11 +770,11 @@ void pandas_timedelta_to_timedeltastruct(npy_timedelta td, ifrac = td - (out->days * per_day + sfrac); if (ifrac != 0) { - out->ms = ifrac / (1000LL * 1000LL); + out->ms = 
(npy_int32)(ifrac / (1000LL * 1000LL)); ifrac -= out->ms * 1000LL * 1000LL; - out->us = ifrac / 1000LL; + out->us = (npy_int32)(ifrac / 1000LL); ifrac -= out->us * 1000LL; - out->ns = ifrac; + out->ns = (npy_int32)ifrac; } else { out->ms = 0; out->us = 0; @@ -813,21 +814,21 @@ void pandas_timedelta_to_timedeltastruct(npy_timedelta td, } if (frac >= 3600) { - out->hrs = frac / 3600LL; + out->hrs = (npy_int32)(frac / 3600LL); frac -= out->hrs * 3600LL; } else { out->hrs = 0; } if (frac >= 60) { - out->min = frac / 60LL; + out->min = (npy_int32)(frac / 60LL); frac -= out->min * 60LL; } else { out->min = 0; } if (frac >= 0) { - out->sec = frac; + out->sec = (npy_int32)frac; frac -= out->sec; } else { out->sec = 0; @@ -841,11 +842,11 @@ void pandas_timedelta_to_timedeltastruct(npy_timedelta td, ifrac = td - (out->days * per_day + sfrac); if (ifrac != 0) { - out->ms = ifrac / 1000LL; + out->ms = (npy_int32)(ifrac / 1000LL); ifrac -= out->ms * 1000LL; - out->us = ifrac / 1L; + out->us = (npy_int32)(ifrac / 1L); ifrac -= out->us * 1L; - out->ns = ifrac; + out->ns = (npy_int32)ifrac; } else { out->ms = 0; out->us = 0; @@ -885,21 +886,21 @@ void pandas_timedelta_to_timedeltastruct(npy_timedelta td, } if (frac >= 3600) { - out->hrs = frac / 3600LL; + out->hrs = (npy_int32)(frac / 3600LL); frac -= out->hrs * 3600LL; } else { out->hrs = 0; } if (frac >= 60) { - out->min = frac / 60LL; + out->min = (npy_int32)(frac / 60LL); frac -= out->min * 60LL; } else { out->min = 0; } if (frac >= 0) { - out->sec = frac; + out->sec = (npy_int32)frac; frac -= out->sec; } else { out->sec = 0; @@ -913,7 +914,7 @@ void pandas_timedelta_to_timedeltastruct(npy_timedelta td, ifrac = td - (out->days * per_day + sfrac); if (ifrac != 0) { - out->ms = ifrac; + out->ms = (npy_int32)ifrac; out->us = 0; out->ns = 0; } else { @@ -956,21 +957,21 @@ void pandas_timedelta_to_timedeltastruct(npy_timedelta td, } if (frac >= 3600) { - out->hrs = frac / 3600LL; + out->hrs = (npy_int32)(frac / 3600LL); frac -= out->hrs * 3600LL; } else { out->hrs = 0; } if (frac >= 60) { - out->min = frac / 60LL; + out->min = (npy_int32)(frac / 60LL); frac -= out->min * 60LL; } else { out->min = 0; } if (frac >= 0) { - out->sec = frac; + out->sec = (npy_int32)frac; frac -= out->sec; } else { out->sec = 0; @@ -998,9 +999,9 @@ void pandas_timedelta_to_timedeltastruct(npy_timedelta td, out->days = td / 1440LL; td -= out->days * 1440LL; - out->hrs = td / 60LL; + out->hrs = (npy_int32)(td / 60LL); td -= out->hrs * 60LL; - out->min = td; + out->min = (npy_int32)td; out->sec = 0; out->ms = 0; @@ -1011,7 +1012,7 @@ void pandas_timedelta_to_timedeltastruct(npy_timedelta td, case NPY_FR_h: out->days = td / 24LL; td -= out->days * 24LL; - out->hrs = td; + out->hrs = (npy_int32)td; out->min = 0; out->sec = 0; diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c index 73bce9ab27a8b..a46f5bc467c5d 100644 --- a/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c +++ b/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c @@ -35,6 +35,7 @@ This file implements string parsing and creation for NumPy datetime. 
#include #include +#include "pandas/portable.h" #include "pandas/vendored/numpy/datetime/np_datetime.h" #include "pandas/vendored/numpy/datetime/np_datetime_strings.h" @@ -767,27 +768,38 @@ int get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base) { /* return 4;*/ case NPY_FR_as: len += 3; /* "###" */ + PD_FALLTHROUGH; case NPY_FR_fs: len += 3; /* "###" */ + PD_FALLTHROUGH; case NPY_FR_ps: len += 3; /* "###" */ + PD_FALLTHROUGH; case NPY_FR_ns: len += 3; /* "###" */ + PD_FALLTHROUGH; case NPY_FR_us: len += 3; /* "###" */ + PD_FALLTHROUGH; case NPY_FR_ms: len += 4; /* ".###" */ + PD_FALLTHROUGH; case NPY_FR_s: len += 3; /* ":##" */ + PD_FALLTHROUGH; case NPY_FR_m: len += 3; /* ":##" */ + PD_FALLTHROUGH; case NPY_FR_h: len += 3; /* "T##" */ + PD_FALLTHROUGH; case NPY_FR_D: case NPY_FR_W: len += 3; /* "-##" */ + PD_FALLTHROUGH; case NPY_FR_M: len += 3; /* "-##" */ + PD_FALLTHROUGH; case NPY_FR_Y: len += 21; /* 64-bit year */ break; @@ -823,10 +835,10 @@ int get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base) { * Returns 0 on success, -1 on failure (for example if the output * string was too short). */ -int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, +int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, size_t outlen, int utc, NPY_DATETIMEUNIT base) { char *substr = outstr; - int sublen = outlen; + size_t sublen = outlen; int tmplen; /* @@ -851,7 +863,7 @@ int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, tmplen = snprintf(substr, sublen, "%04" NPY_INT64_FMT, dts->year); #endif // _WIN32 /* If it ran out of space or there isn't space for the NULL terminator */ - if (tmplen < 0 || tmplen > sublen) { + if (tmplen < 0 || (size_t)tmplen > sublen) { goto string_too_short; } substr += tmplen; diff --git a/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c b/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c index 876d9f276f5d2..c8d8b5ab6bd6e 100644 --- a/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c +++ b/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c @@ -40,6 +40,7 @@ Numeric decoder derived from TCL library // Licence at LICENSES/ULTRAJSON_LICENSE +#include "pandas/portable.h" #include "pandas/vendored/ujson/lib/ultrajson.h" #include #include @@ -461,6 +462,7 @@ int Buffer_EscapeStringUnvalidated(JSONObjectEncoder *enc, const char *io, { if (enc->encodeHTMLChars) { // Fall through to \u00XX case below. + PD_FALLTHROUGH; } else { // Same as default case below. (*of++) = (*io); @@ -645,6 +647,7 @@ int Buffer_EscapeStringValidated(JSOBJ obj, JSONObjectEncoder *enc, case 29: { if (enc->encodeHTMLChars) { // Fall through to \u00XX case 30 below. + PD_FALLTHROUGH; } else { // Same as case 1 above. *(of++) = (*io++); diff --git a/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c b/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c index b7ee58c63a275..7cc20a52f1849 100644 --- a/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c +++ b/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c @@ -42,66 +42,74 @@ Numeric decoder derived from TCL library #define PY_SSIZE_T_CLEAN #include -static int Object_objectAddKey(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value) { +static int Object_objectAddKey(void *Py_UNUSED(prv), JSOBJ obj, JSOBJ name, + JSOBJ value) { int ret = PyDict_SetItem(obj, name, value); Py_DECREF((PyObject *)name); Py_DECREF((PyObject *)value); return ret == 0 ? 
1 : 0; } -static int Object_arrayAddItem(void *prv, JSOBJ obj, JSOBJ value) { +static int Object_arrayAddItem(void *Py_UNUSED(prv), JSOBJ obj, JSOBJ value) { int ret = PyList_Append(obj, value); Py_DECREF((PyObject *)value); return ret == 0 ? 1 : 0; } -static JSOBJ Object_newString(void *prv, wchar_t *start, wchar_t *end) { +static JSOBJ Object_newString(void *Py_UNUSED(prv), wchar_t *start, + wchar_t *end) { return PyUnicode_FromWideChar(start, (end - start)); } -static JSOBJ Object_newTrue(void *prv) { Py_RETURN_TRUE; } +static JSOBJ Object_newTrue(void *Py_UNUSED(prv)) { Py_RETURN_TRUE; } -static JSOBJ Object_newFalse(void *prv) { Py_RETURN_FALSE; } +static JSOBJ Object_newFalse(void *Py_UNUSED(prv)) { Py_RETURN_FALSE; } -static JSOBJ Object_newNull(void *prv) { Py_RETURN_NONE; } +static JSOBJ Object_newNull(void *Py_UNUSED(prv)) { Py_RETURN_NONE; } -static JSOBJ Object_newPosInf(void *prv) { +static JSOBJ Object_newPosInf(void *Py_UNUSED(prv)) { return PyFloat_FromDouble(Py_HUGE_VAL); } -static JSOBJ Object_newNegInf(void *prv) { +static JSOBJ Object_newNegInf(void *Py_UNUSED(prv)) { return PyFloat_FromDouble(-Py_HUGE_VAL); } -static JSOBJ Object_newObject(void *prv, void *decoder) { return PyDict_New(); } +static JSOBJ Object_newObject(void *Py_UNUSED(prv), void *Py_UNUSED(decoder)) { + return PyDict_New(); +} -static JSOBJ Object_endObject(void *prv, JSOBJ obj) { return obj; } +static JSOBJ Object_endObject(void *Py_UNUSED(prv), JSOBJ obj) { return obj; } -static JSOBJ Object_newArray(void *prv, void *decoder) { return PyList_New(0); } +static JSOBJ Object_newArray(void *Py_UNUSED(prv), void *Py_UNUSED(decoder)) { + return PyList_New(0); +} -static JSOBJ Object_endArray(void *prv, JSOBJ obj) { return obj; } +static JSOBJ Object_endArray(void *Py_UNUSED(prv), JSOBJ obj) { return obj; } -static JSOBJ Object_newInteger(void *prv, JSINT32 value) { - return PyLong_FromLong((long)value); +static JSOBJ Object_newInteger(void *Py_UNUSED(prv), JSINT32 value) { + return PyLong_FromLong(value); } -static JSOBJ Object_newLong(void *prv, JSINT64 value) { +static JSOBJ Object_newLong(void *Py_UNUSED(prv), JSINT64 value) { return PyLong_FromLongLong(value); } -static JSOBJ Object_newUnsignedLong(void *prv, JSUINT64 value) { +static JSOBJ Object_newUnsignedLong(void *Py_UNUSED(prv), JSUINT64 value) { return PyLong_FromUnsignedLongLong(value); } -static JSOBJ Object_newDouble(void *prv, double value) { +static JSOBJ Object_newDouble(void *Py_UNUSED(prv), double value) { return PyFloat_FromDouble(value); } -static void Object_releaseObject(void *prv, JSOBJ obj, void *_decoder) { +static void Object_releaseObject(void *Py_UNUSED(prv), JSOBJ obj, + void *Py_UNUSED(decoder)) { Py_XDECREF(((PyObject *)obj)); } -PyObject *JSONToObj(PyObject *self, PyObject *args, PyObject *kwargs) { +PyObject *JSONToObj(PyObject *Py_UNUSED(self), PyObject *args, + PyObject *kwargs) { JSONObjectDecoder dec = {.newString = Object_newString, .objectAddKey = Object_objectAddKey, .arrayAddItem = Object_arrayAddItem, diff --git a/pandas/_libs/src/vendored/ujson/python/objToJSON.c b/pandas/_libs/src/vendored/ujson/python/objToJSON.c index ef88b97918d76..8bba95dd456de 100644 --- a/pandas/_libs/src/vendored/ujson/python/objToJSON.c +++ b/pandas/_libs/src/vendored/ujson/python/objToJSON.c @@ -66,9 +66,9 @@ int object_is_na_type(PyObject *obj); typedef struct __NpyArrContext { PyObject *array; char *dataptr; - int curdim; // current dimension in array's order - int stridedim; // dimension we are striding over - int inc; // stride 
dimension increment (+/- 1) + npy_intp curdim; // current dimension in array's order + npy_intp stridedim; // dimension we are striding over + int inc; // stride dimension increment (+/- 1) npy_intp dim; npy_intp stride; npy_intp ndim; @@ -81,8 +81,8 @@ typedef struct __NpyArrContext { } NpyArrContext; typedef struct __PdBlockContext { - int colIdx; - int ncols; + Py_ssize_t colIdx; + Py_ssize_t ncols; int transpose; NpyArrContext **npyCtxts; // NpyArrContext for each column @@ -1934,40 +1934,42 @@ PyObject *objToJSON(PyObject *Py_UNUSED(self), PyObject *args, PyObject *odefHandler = 0; int indent = 0; - PyObjectEncoder pyEncoder = {{ - Object_beginTypeContext, - Object_endTypeContext, - Object_getStringValue, - Object_getLongValue, - NULL, // getIntValue is unused - Object_getDoubleValue, - Object_getBigNumStringValue, - Object_iterBegin, - Object_iterNext, - Object_iterEnd, - Object_iterGetValue, - Object_iterGetName, - Object_releaseObject, - PyObject_Malloc, - PyObject_Realloc, - PyObject_Free, - -1, // recursionMax - idoublePrecision, - 1, // forceAscii - 0, // encodeHTMLChars - indent, // indent - }}; + PyObjectEncoder pyEncoder = { + { + .beginTypeContext = Object_beginTypeContext, + .endTypeContext = Object_endTypeContext, + .getStringValue = Object_getStringValue, + .getLongValue = Object_getLongValue, + .getIntValue = NULL, + .getDoubleValue = Object_getDoubleValue, + .getBigNumStringValue = Object_getBigNumStringValue, + .iterBegin = Object_iterBegin, + .iterNext = Object_iterNext, + .iterEnd = Object_iterEnd, + .iterGetValue = Object_iterGetValue, + .iterGetName = Object_iterGetName, + .releaseObject = Object_releaseObject, + .malloc = PyObject_Malloc, + .realloc = PyObject_Realloc, + .free = PyObject_Free, + .recursionMax = -1, + .doublePrecision = idoublePrecision, + .forceASCII = 1, + .encodeHTMLChars = 0, + .indent = indent, + .errorMsg = NULL, + }, + .npyCtxtPassthru = NULL, + .blkCtxtPassthru = NULL, + .npyType = -1, + .npyValue = NULL, + .datetimeIso = 0, + .datetimeUnit = NPY_FR_ms, + .outputFormat = COLUMNS, + .defaultHandler = NULL, + }; JSONObjectEncoder *encoder = (JSONObjectEncoder *)&pyEncoder; - pyEncoder.npyCtxtPassthru = NULL; - pyEncoder.blkCtxtPassthru = NULL; - pyEncoder.npyType = -1; - pyEncoder.npyValue = NULL; - pyEncoder.datetimeIso = 0; - pyEncoder.datetimeUnit = NPY_FR_ms; - pyEncoder.outputFormat = COLUMNS; - pyEncoder.defaultHandler = 0; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OiOssOOi", kwlist, &oinput, &oensureAscii, &idoublePrecision, &oencodeHTMLChars, &sOrient, &sdateFormat, diff --git a/pandas/_libs/src/vendored/ujson/python/ujson.c b/pandas/_libs/src/vendored/ujson/python/ujson.c index 351f5226b57c9..075411a23b075 100644 --- a/pandas/_libs/src/vendored/ujson/python/ujson.c +++ b/pandas/_libs/src/vendored/ujson/python/ujson.c @@ -56,9 +56,11 @@ PyObject *JSONToObj(PyObject *self, PyObject *args, PyObject *kwargs); "encode_html_chars=True to encode < > & as unicode escape sequences." static PyMethodDef ujsonMethods[] = { - {"ujson_dumps", (PyCFunction)objToJSON, METH_VARARGS | METH_KEYWORDS, + {"ujson_dumps", (PyCFunction)(void (*)(void))objToJSON, + METH_VARARGS | METH_KEYWORDS, "Converts arbitrary object recursively into JSON. " ENCODER_HELP_TEXT}, - {"ujson_loads", (PyCFunction)JSONToObj, METH_VARARGS | METH_KEYWORDS, + {"ujson_loads", (PyCFunction)(void (*)(void))JSONToObj, + METH_VARARGS | METH_KEYWORDS, "Converts JSON as string to dict object structure. 
Use precise_float=True " "to use high precision float decoder."}, {NULL, NULL, 0, NULL} /* Sentinel */ From 04307e717d3d227cbdf250695f2698c2cae7752d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 8 Dec 2023 15:43:34 -1000 Subject: [PATCH 27/63] STY: Fix doctest and docstring formatting errors (#56408) * STY: Fix doctest and docstring formatting errors * ensure stderr is output too * Fix more failures * Don't add redirects for single page, fix example * A few more * Remove e flag --- ci/code_checks.sh | 2 ++ doc/make.py | 5 ++-- pandas/core/arrays/sparse/accessor.py | 2 +- pandas/core/generic.py | 4 +-- pandas/core/groupby/generic.py | 30 +++++++++------------ pandas/core/groupby/groupby.py | 18 ++++++++----- pandas/core/indexes/multi.py | 3 ++- pandas/core/resample.py | 13 +++------ pandas/core/shared_docs.py | 4 +-- pandas/core/window/rolling.py | 39 ++++++++++++++------------- pandas/io/sql.py | 2 +- pandas/plotting/_core.py | 16 +++++------ pandas/plotting/_misc.py | 2 +- scripts/validate_docstrings.py | 11 ++++---- 14 files changed, 76 insertions(+), 75 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index e91629744463f..e41f625e583c0 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -14,6 +14,8 @@ # $ ./ci/code_checks.sh single-docs # check single-page docs build warning-free # $ ./ci/code_checks.sh notebooks # check execution of documentation notebooks +set -uo pipefail + [[ -z "$1" || "$1" == "code" || "$1" == "doctests" || "$1" == "docstrings" || "$1" == "single-docs" || "$1" == "notebooks" ]] || \ { echo "Unknown command $1. Usage: $0 [code|doctests|docstrings|single-docs|notebooks]"; exit 9999; } diff --git a/doc/make.py b/doc/make.py index dfa8ae6c1e34c..c4b7ab124f68f 100755 --- a/doc/make.py +++ b/doc/make.py @@ -236,8 +236,9 @@ def html(self): os.remove(zip_fname) if ret_code == 0: - if self.single_doc_html is not None and not self.no_browser: - self._open_browser(self.single_doc_html) + if self.single_doc_html is not None: + if not self.no_browser: + self._open_browser(self.single_doc_html) else: self._add_redirects() if self.whatsnew and not self.no_browser: diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index 6eb1387c63a0a..c98fbb836cc6d 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -270,7 +270,7 @@ def from_spmatrix(cls, data, index=None, columns=None) -> DataFrame: Examples -------- >>> import scipy.sparse - >>> mat = scipy.sparse.eye(3) + >>> mat = scipy.sparse.eye(3, dtype=float) >>> pd.DataFrame.sparse.from_spmatrix(mat) 0 1 2 0 1.0 0.0 0.0 diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 90946b8d9b5f4..ef10958ac1153 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3955,7 +3955,7 @@ def to_csv( >>> df = pd.DataFrame({{'name': ['Raphael', 'Donatello'], ... 'mask': ['red', 'purple'], ... 
'weapon': ['sai', 'bo staff']}}) - >>> df.to_csv('out.csv', index=False) # doctest: +SKIP + >>> df.to_csv('out.csv', index=False) # doctest: +SKIP Create 'out.zip' containing 'out.csv' @@ -8972,7 +8972,7 @@ def clip( Clips using specific lower and upper thresholds per column: - >>> df.clip([-2, -1], [4,5]) + >>> df.clip([-2, -1], [4, 5]) col_0 col_1 0 4 -1 1 -2 -1 diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 204083ac6c04e..c4cc30f9631ea 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -470,10 +470,9 @@ def _aggregate_named(self, func, *args, **kwargs): __examples_series_doc = dedent( """ - >>> ser = pd.Series( - ... [390.0, 350.0, 30.0, 20.0], - ... index=["Falcon", "Falcon", "Parrot", "Parrot"], - ... name="Max Speed") + >>> ser = pd.Series([390.0, 350.0, 30.0, 20.0], + ... index=["Falcon", "Falcon", "Parrot", "Parrot"], + ... name="Max Speed") >>> grouped = ser.groupby([1, 1, 2, 2]) >>> grouped.transform(lambda x: (x - x.mean()) / x.std()) Falcon 0.707107 @@ -1331,14 +1330,10 @@ class DataFrameGroupBy(GroupBy[DataFrame]): """ Examples -------- - >>> df = pd.DataFrame( - ... { - ... "A": [1, 1, 2, 2], + >>> data = {"A": [1, 1, 2, 2], ... "B": [1, 2, 3, 4], - ... "C": [0.362838, 0.227877, 1.267767, -0.562860], - ... } - ... ) - + ... "C": [0.362838, 0.227877, 1.267767, -0.562860]} + >>> df = pd.DataFrame(data) >>> df A B C 0 1 1 0.362838 @@ -1393,7 +1388,8 @@ class DataFrameGroupBy(GroupBy[DataFrame]): >>> df.groupby("A").agg( ... b_min=pd.NamedAgg(column="B", aggfunc="min"), - ... c_sum=pd.NamedAgg(column="C", aggfunc="sum")) + ... c_sum=pd.NamedAgg(column="C", aggfunc="sum") + ... ) b_min c_sum A 1 1 0.590715 @@ -2154,7 +2150,7 @@ def idxmax( >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48], ... 'co2_emissions': [37.2, 19.66, 1712]}, - ... index=['Pork', 'Wheat Products', 'Beef']) + ... index=['Pork', 'Wheat Products', 'Beef']) >>> df consumption co2_emissions @@ -2236,7 +2232,7 @@ def idxmin( >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48], ... 'co2_emissions': [37.2, 19.66, 1712]}, - ... index=['Pork', 'Wheat Products', 'Beef']) + ... index=['Pork', 'Wheat Products', 'Beef']) >>> df consumption co2_emissions @@ -2319,9 +2315,9 @@ def value_counts( Examples -------- >>> df = pd.DataFrame({ - ... 'gender': ['male', 'male', 'female', 'male', 'female', 'male'], - ... 'education': ['low', 'medium', 'high', 'low', 'high', 'low'], - ... 'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR'] + ... 'gender': ['male', 'male', 'female', 'male', 'female', 'male'], + ... 'education': ['low', 'medium', 'high', 'low', 'high', 'low'], + ... 'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR'] ... }) >>> df diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 7d284db4eba2c..e51983f0aabb7 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -232,8 +232,8 @@ class providing the base-class of operations. """, "dataframe_examples": """ >>> df = pd.DataFrame({'A': 'a a b'.split(), - ... 'B': [1,2,3], - ... 'C': [4,6,5]}) + ... 'B': [1, 2, 3], + ... 'C': [4, 6, 5]}) >>> g1 = df.groupby('A', group_keys=False) >>> g2 = df.groupby('A', group_keys=True) @@ -313,7 +313,7 @@ class providing the base-class of operations. The resulting dtype will reflect the return value of the passed ``func``. 
- >>> g1.apply(lambda x: x*2 if x.name == 'a' else x/2) + >>> g1.apply(lambda x: x * 2 if x.name == 'a' else x / 2) a 0.0 a 2.0 b 1.0 @@ -322,7 +322,7 @@ class providing the base-class of operations. In the above, the groups are not part of the index. We can have them included by using ``g2`` where ``group_keys=True``: - >>> g2.apply(lambda x: x*2 if x.name == 'a' else x/2) + >>> g2.apply(lambda x: x * 2 if x.name == 'a' else x / 2) a a 0.0 a 2.0 b b 1.0 @@ -421,14 +421,18 @@ class providing the base-class of operations. functions that expect Series, DataFrames, GroupBy or Resampler objects. Instead of writing ->>> h(g(f(df.groupby('group')), arg1=a), arg2=b, arg3=c) # doctest: +SKIP +>>> h = lambda x, arg2, arg3: x + 1 - arg2 * arg3 +>>> g = lambda x, arg1: x * 5 / arg1 +>>> f = lambda x: x ** 4 +>>> df = pd.DataFrame([["a", 4], ["b", 5]], columns=["group", "value"]) +>>> h(g(f(df.groupby('group')), arg1=1), arg2=2, arg3=3) # doctest: +SKIP You can write >>> (df.groupby('group') ... .pipe(f) -... .pipe(g, arg1=a) -... .pipe(h, arg2=b, arg3=c)) # doctest: +SKIP +... .pipe(g, arg1=1) +... .pipe(h, arg2=2, arg3=3)) # doctest: +SKIP which is much more readable. diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 86693f241ddb1..46343b84afb43 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -862,7 +862,8 @@ def levels(self) -> FrozenList: Examples -------- >>> index = pd.MultiIndex.from_product([['mammal'], - ... ('goat', 'human', 'cat', 'dog')], names=['Category', 'Animals']) + ... ('goat', 'human', 'cat', 'dog')], + ... names=['Category', 'Animals']) >>> leg_num = pd.DataFrame(data=(4, 2, 4, 4), index=index, columns=['Legs']) >>> leg_num Legs diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 7e9aa2d8a9e3c..d54f6d31f6144 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -859,7 +859,7 @@ def fillna(self, method, limit: int | None = None): Missing values present before the upsampling are not affected. >>> sm = pd.Series([1, None, 3], - ... index=pd.date_range('20180101', periods=3, freq='h')) + ... index=pd.date_range('20180101', periods=3, freq='h')) >>> sm 2018-01-01 00:00:00 1.0 2018-01-01 01:00:00 NaN @@ -1028,13 +1028,8 @@ def interpolate( Examples -------- - >>> import datetime as dt - >>> timesteps = [ - ... dt.datetime(2023, 3, 1, 7, 0, 0), - ... dt.datetime(2023, 3, 1, 7, 0, 1), - ... dt.datetime(2023, 3, 1, 7, 0, 2), - ... dt.datetime(2023, 3, 1, 7, 0, 3), - ... dt.datetime(2023, 3, 1, 7, 0, 4)] + >>> start = "2023-03-01T07:00:00" + >>> timesteps = pd.date_range(start, periods=5, freq="s") >>> series = pd.Series(data=[1, -1, 2, 1, 3], index=timesteps) >>> series 2023-03-01 07:00:00 1 @@ -1042,7 +1037,7 @@ def interpolate( 2023-03-01 07:00:02 2 2023-03-01 07:00:03 1 2023-03-01 07:00:04 3 - dtype: int64 + Freq: s, dtype: int64 Upsample the dataframe to 0.5Hz by providing the period time of 2s. diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index fdb8ef1cc5dad..25f7e7e9f832b 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -797,7 +797,7 @@ ... 'B': ['a', 'b', 'c', 'd', 'e'], ... 'C': ['f', 'g', 'h', 'i', 'j']}}) - >>> df.replace(to_replace='^[a-g]', value = 'e', regex=True) + >>> df.replace(to_replace='^[a-g]', value='e', regex=True) A B C 0 0 e e 1 1 e e @@ -808,7 +808,7 @@ If ``value`` is not ``None`` and `to_replace` is a dictionary, the dictionary keys will be the DataFrame columns that the replacement will be applied. 
- >>> df.replace(to_replace={{'B': '^[a-c]', 'C': '^[h-j]'}}, value = 'e', regex=True) + >>> df.replace(to_replace={{'B': '^[a-c]', 'C': '^[h-j]'}}, value='e', regex=True) A B C 0 0 e f 1 1 e g diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index f90863a8ea1ef..f268d36d7fdc4 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -2439,14 +2439,14 @@ def var( create_section_header("Examples"), dedent( """\ - >>> ser = pd.Series([1, 5, 2, 7, 12, 6]) + >>> ser = pd.Series([1, 5, 2, 7, 15, 6]) >>> ser.rolling(3).skew().round(6) 0 NaN 1 NaN 2 1.293343 3 -0.585583 - 4 0.000000 - 5 1.545393 + 4 0.670284 + 5 1.652317 dtype: float64 """ ), @@ -2794,12 +2794,12 @@ def cov( >>> v1 = [3, 3, 3, 5, 8] >>> v2 = [3, 4, 4, 4, 8] - >>> # numpy returns a 2X2 array, the correlation coefficient - >>> # is the number at entry [0][1] - >>> print(f"{{np.corrcoef(v1[:-1], v2[:-1])[0][1]:.6f}}") - 0.333333 - >>> print(f"{{np.corrcoef(v1[1:], v2[1:])[0][1]:.6f}}") - 0.916949 + >>> np.corrcoef(v1[:-1], v2[:-1]) + array([[1. , 0.33333333], + [0.33333333, 1. ]]) + >>> np.corrcoef(v1[1:], v2[1:]) + array([[1. , 0.9169493], + [0.9169493, 1. ]]) >>> s1 = pd.Series(v1) >>> s2 = pd.Series(v2) >>> s1.rolling(4).corr(s2) @@ -2813,15 +2813,18 @@ def cov( The below example shows a similar rolling calculation on a DataFrame using the pairwise option. - >>> matrix = np.array([[51., 35.], [49., 30.], [47., 32.],\ - [46., 31.], [50., 36.]]) - >>> print(np.corrcoef(matrix[:-1,0], matrix[:-1,1]).round(7)) - [[1. 0.6263001] - [0.6263001 1. ]] - >>> print(np.corrcoef(matrix[1:,0], matrix[1:,1]).round(7)) - [[1. 0.5553681] - [0.5553681 1. ]] - >>> df = pd.DataFrame(matrix, columns=['X','Y']) + >>> matrix = np.array([[51., 35.], + ... [49., 30.], + ... [47., 32.], + ... [46., 31.], + ... [50., 36.]]) + >>> np.corrcoef(matrix[:-1, 0], matrix[:-1, 1]) + array([[1. , 0.6263001], + [0.6263001, 1. ]]) + >>> np.corrcoef(matrix[1:, 0], matrix[1:, 1]) + array([[1. , 0.55536811], + [0.55536811, 1. ]]) + >>> df = pd.DataFrame(matrix, columns=['X', 'Y']) >>> df X Y 0 51.0 35.0 diff --git a/pandas/io/sql.py b/pandas/io/sql.py index d4b6602d9f0eb..a83c2bf241450 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -680,7 +680,7 @@ def read_sql( pandas now supports reading via ADBC drivers - >>> from adbc_driver_postgresql import dbapi + >>> from adbc_driver_postgresql import dbapi # doctest:+SKIP >>> with dbapi.connect('postgres:///db_name') as conn: # doctest:+SKIP ... pd.read_sql('SELECT int_column FROM test_data', conn) int_column diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index a017787f2dc2d..bd04e812e0be1 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -241,10 +241,10 @@ def hist_frame( .. plot:: :context: close-figs - >>> df = pd.DataFrame({ - ... 'length': [1.5, 0.5, 1.2, 0.9, 3], - ... 'width': [0.7, 0.2, 0.15, 0.2, 1.1] - ... }, index=['pig', 'rabbit', 'duck', 'chicken', 'horse']) + >>> data = {'length': [1.5, 0.5, 1.2, 0.9, 3], + ... 
'width': [0.7, 0.2, 0.15, 0.2, 1.1]}
+        >>> index = ['pig', 'rabbit', 'duck', 'chicken', 'horse']
+        >>> df = pd.DataFrame(data, index=index)
         >>> hist = df.hist(bins=3)
     """
     plot_backend = _get_plot_backend(backend)
@@ -607,10 +607,10 @@ def boxplot_frame_groupby(
     >>> import itertools
     >>> tuples = [t for t in itertools.product(range(1000), range(4))]
     >>> index = pd.MultiIndex.from_tuples(tuples, names=['lvl0', 'lvl1'])
-    >>> data = np.random.randn(len(index),4)
+    >>> data = np.random.randn(len(index), 4)
     >>> df = pd.DataFrame(data, columns=list('ABCD'), index=index)
     >>> grouped = df.groupby(level='lvl1')
-    >>> grouped.boxplot(rot=45, fontsize=12, figsize=(8,10))  # doctest: +SKIP
+    >>> grouped.boxplot(rot=45, fontsize=12, figsize=(8, 10))  # doctest: +SKIP
 
     The ``subplots=False`` option shows the boxplots in a single figure.
 
@@ -1400,9 +1400,7 @@ def hist(
         .. plot::
             :context: close-figs
 
-            >>> df = pd.DataFrame(
-            ...     np.random.randint(1, 7, 6000),
-            ...     columns = ['one'])
+            >>> df = pd.DataFrame(np.random.randint(1, 7, 6000), columns=['one'])
             >>> df['two'] = df['one'] + np.random.randint(1, 7, 6000)
             >>> ax = df.plot.hist(bins=12, alpha=0.5)
 
diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py
index 5e5a55b4d0a98..18db460d388a4 100644
--- a/pandas/plotting/_misc.py
+++ b/pandas/plotting/_misc.py
@@ -439,7 +439,7 @@ def bootstrap_plot(
         :context: close-figs
 
         >>> s = pd.Series(np.random.uniform(size=100))
-        >>> pd.plotting.bootstrap_plot(s)
+        >>> pd.plotting.bootstrap_plot(s)  # doctest: +SKIP
""" plot_backend = _get_plot_backend("matplotlib") diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index 0a6a852bb0f85..98b55f8d690cf 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -228,11 +228,12 @@ def validate_pep8(self): file.name, ] response = subprocess.run(cmd, capture_output=True, check=False, text=True) - stdout = response.stdout - stdout = stdout.replace(file.name, "") - messages = stdout.strip("\n").splitlines() - if messages: - error_messages.extend(messages) + for output in ("stdout", "stderr"): + out = getattr(response, output) + out = out.replace(file.name, "") + messages = out.strip("\n").splitlines() + if messages: + error_messages.extend(messages) finally: file.close() os.unlink(file.name) From ebde35478a86776caed68098a83c52a3ad83f9c7 Mon Sep 17 00:00:00 2001 From: ccccjone <144291871+ccccjone@users.noreply.github.com> Date: Sat, 9 Dec 2023 05:34:47 -0800 Subject: [PATCH 28/63] TST: Add test for large integer result in groupby.prod (#56384) --- pandas/tests/groupby/test_reductions.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 3e78e728f5ea9..425079f943aba 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -1057,3 +1057,27 @@ def test_regression_allowlist_methods(op, axis, skipna, sort): if sort: expected = expected.sort_index(axis=axis) tm.assert_frame_equal(result, expected) + + +def test_groupby_prod_with_int64_dtype(): + # GH#46573 + data = [ + [1, 11], + [1, 41], + [1, 17], + [1, 37], + [1, 7], + [1, 29], + [1, 31], + [1, 2], + [1, 3], + [1, 43], + [1, 5], + [1, 47], + [1, 19], + [1, 88], + ] + df = DataFrame(data, columns=["A", "B"], dtype="int64") + result = df.groupby(["A"]).prod().reset_index() + expected = DataFrame({"A": [1], "B": [180970905912331920]}, dtype="int64") + tm.assert_frame_equal(result, expected) From 536ce302fb4b801b9d6690076b538a5c9c20ac17 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 9 Dec 2023 10:27:41 -0800 Subject: [PATCH 29/63] DEPR: Series[categorical].replace special-casing (#56385) --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/arrays/categorical.py | 13 ++++++ .../tests/arrays/categorical/test_replace.py | 28 +++++++++++-- pandas/tests/copy_view/test_replace.py | 41 ++++++++++++++++--- pandas/tests/frame/methods/test_replace.py | 25 ++++++++--- pandas/tests/groupby/test_groupby_dropna.py | 4 +- .../tests/io/pytables/test_file_handling.py | 10 ++++- pandas/tests/series/methods/test_replace.py | 17 ++++++-- 8 files changed, 116 insertions(+), 23 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 7a0075ab88a3a..fb91219582b14 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -474,6 +474,7 @@ Other Deprecations - Deprecated the ``kind`` keyword in :meth:`Series.resample` and :meth:`DataFrame.resample`, explicitly cast the object's ``index`` instead (:issue:`55895`) - Deprecated the ``ordinal`` keyword in :class:`PeriodIndex`, use :meth:`PeriodIndex.from_ordinals` instead (:issue:`55960`) - Deprecated the ``unit`` keyword in :class:`TimedeltaIndex` construction, use :func:`to_timedelta` instead (:issue:`55499`) +- Deprecated the behavior of :meth:`DataFrame.replace` and :meth:`Series.replace` with :class:`CategoricalDtype`; in a future version replace will change the values while preserving the categories. 
To change the categories, use ``ser.cat.rename_categories`` instead (:issue:`55147`) - Deprecated the behavior of :meth:`Series.value_counts` and :meth:`Index.value_counts` with object dtype; in a future version these will not perform dtype inference on the resulting :class:`Index`, do ``result.index = result.index.infer_objects()`` to retain the old behavior (:issue:`56161`) - Deprecated the default of ``observed=False`` in :meth:`DataFrame.pivot_table`; will be ``True`` in a future version (:issue:`56236`) - Deprecated the extension test classes ``BaseNoReduceTests``, ``BaseBooleanReduceTests``, and ``BaseNumericReduceTests``, use ``BaseReduceTests`` instead (:issue:`54663`) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index eec833c600177..f0aabbb863a79 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2626,6 +2626,8 @@ def isin(self, values) -> npt.NDArray[np.bool_]: def _replace(self, *, to_replace, value, inplace: bool = False): from pandas import Index + orig_dtype = self.dtype + inplace = validate_bool_kwarg(inplace, "inplace") cat = self if inplace else self.copy() @@ -2656,6 +2658,17 @@ def _replace(self, *, to_replace, value, inplace: bool = False): new_dtype = CategoricalDtype(new_categories, ordered=self.dtype.ordered) NDArrayBacked.__init__(cat, new_codes, new_dtype) + if new_dtype != orig_dtype: + warnings.warn( + # GH#55147 + "The behavior of Series.replace (and DataFrame.replace) with " + "CategoricalDtype is deprecated. In a future version, replace " + "will only be used for cases that preserve the categories. " + "To change the categories, use ser.cat.rename_categories " + "instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) if not inplace: return cat diff --git a/pandas/tests/arrays/categorical/test_replace.py b/pandas/tests/arrays/categorical/test_replace.py index 0611d04d36d10..3c677142846d7 100644 --- a/pandas/tests/arrays/categorical/test_replace.py +++ b/pandas/tests/arrays/categorical/test_replace.py @@ -31,6 +31,9 @@ ([1, 2, "3"], "5", ["5", "5", 3], True), ], ) +@pytest.mark.filterwarnings( + "ignore:.*with CategoricalDtype is deprecated:FutureWarning" +) def test_replace_categorical_series(to_replace, value, expected, flip_categories): # GH 31720 @@ -60,7 +63,13 @@ def test_replace_categorical(to_replace, value, result, expected_error_msg): # GH#26988 cat = Categorical(["a", "b"]) expected = Categorical(result) - result = pd.Series(cat, copy=False).replace(to_replace, value)._values + msg = ( + r"The behavior of Series\.replace \(and DataFrame.replace\) " + "with CategoricalDtype" + ) + warn = FutureWarning if expected_error_msg is not None else None + with tm.assert_produces_warning(warn, match=msg): + result = pd.Series(cat, copy=False).replace(to_replace, value)._values tm.assert_categorical_equal(result, expected) if to_replace == "b": # the "c" test is supposed to be unchanged @@ -69,14 +78,20 @@ def test_replace_categorical(to_replace, value, result, expected_error_msg): tm.assert_categorical_equal(cat, expected) ser = pd.Series(cat, copy=False) - ser.replace(to_replace, value, inplace=True) + with tm.assert_produces_warning(warn, match=msg): + ser.replace(to_replace, value, inplace=True) tm.assert_categorical_equal(cat, expected) def test_replace_categorical_ea_dtype(): # GH49404 cat = Categorical(pd.array(["a", "b"], dtype="string")) - result = pd.Series(cat).replace(["a", "b"], ["c", pd.NA])._values + msg = ( + r"The behavior of Series\.replace \(and 
DataFrame.replace\) " + "with CategoricalDtype" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = pd.Series(cat).replace(["a", "b"], ["c", pd.NA])._values expected = Categorical(pd.array(["c", pd.NA], dtype="string")) tm.assert_categorical_equal(result, expected) @@ -85,7 +100,12 @@ def test_replace_maintain_ordering(): # GH51016 dtype = pd.CategoricalDtype([0, 1, 2], ordered=True) ser = pd.Series([0, 1, 2], dtype=dtype) - result = ser.replace(0, 2) + msg = ( + r"The behavior of Series\.replace \(and DataFrame.replace\) " + "with CategoricalDtype" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ser.replace(0, 2) expected_dtype = pd.CategoricalDtype([1, 2], ordered=True) expected = pd.Series([2, 1, 2], dtype=expected_dtype) tm.assert_series_equal(expected, result, check_category_order=True) diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py index 268e859e782ec..6d16bc3083883 100644 --- a/pandas/tests/copy_view/test_replace.py +++ b/pandas/tests/copy_view/test_replace.py @@ -162,13 +162,19 @@ def test_replace_to_replace_wrong_dtype(using_copy_on_write): def test_replace_list_categorical(using_copy_on_write): df = DataFrame({"a": ["a", "b", "c"]}, dtype="category") arr = get_array(df, "a") - df.replace(["c"], value="a", inplace=True) + msg = ( + r"The behavior of Series\.replace \(and DataFrame.replace\) " + "with CategoricalDtype" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + df.replace(["c"], value="a", inplace=True) assert np.shares_memory(arr.codes, get_array(df, "a").codes) if using_copy_on_write: assert df._mgr._has_no_reference(0) df_orig = df.copy() - df2 = df.replace(["b"], value="a") + with tm.assert_produces_warning(FutureWarning, match=msg): + df2 = df.replace(["b"], value="a") assert not np.shares_memory(arr.codes, get_array(df2, "a").codes) tm.assert_frame_equal(df, df_orig) @@ -178,7 +184,12 @@ def test_replace_list_inplace_refs_categorical(using_copy_on_write): df = DataFrame({"a": ["a", "b", "c"]}, dtype="category") view = df[:] df_orig = df.copy() - df.replace(["c"], value="a", inplace=True) + msg = ( + r"The behavior of Series\.replace \(and DataFrame.replace\) " + "with CategoricalDtype" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + df.replace(["c"], value="a", inplace=True) if using_copy_on_write: assert not np.shares_memory( get_array(view, "a").codes, get_array(df, "a").codes @@ -238,7 +249,13 @@ def test_replace_categorical_inplace_reference(using_copy_on_write, val, to_repl df_orig = df.copy() arr_a = get_array(df, "a") view = df[:] - df.replace(to_replace=to_replace, value=val, inplace=True) + msg = ( + r"The behavior of Series\.replace \(and DataFrame.replace\) " + "with CategoricalDtype" + ) + warn = FutureWarning if val == 1.5 else None + with tm.assert_produces_warning(warn, match=msg): + df.replace(to_replace=to_replace, value=val, inplace=True) if using_copy_on_write: assert not np.shares_memory(get_array(df, "a").codes, arr_a.codes) @@ -253,7 +270,13 @@ def test_replace_categorical_inplace_reference(using_copy_on_write, val, to_repl def test_replace_categorical_inplace(using_copy_on_write, val): df = DataFrame({"a": Categorical([1, 2, 3])}) arr_a = get_array(df, "a") - df.replace(to_replace=1, value=val, inplace=True) + msg = ( + r"The behavior of Series\.replace \(and DataFrame.replace\) " + "with CategoricalDtype" + ) + warn = FutureWarning if val == 1.5 else None + with tm.assert_produces_warning(warn, match=msg): + 
df.replace(to_replace=1, value=val, inplace=True) assert np.shares_memory(get_array(df, "a").codes, arr_a.codes) if using_copy_on_write: @@ -267,7 +290,13 @@ def test_replace_categorical_inplace(using_copy_on_write, val): def test_replace_categorical(using_copy_on_write, val): df = DataFrame({"a": Categorical([1, 2, 3])}) df_orig = df.copy() - df2 = df.replace(to_replace=1, value=val) + msg = ( + r"The behavior of Series\.replace \(and DataFrame.replace\) " + "with CategoricalDtype" + ) + warn = FutureWarning if val == 1.5 else None + with tm.assert_produces_warning(warn, match=msg): + df2 = df.replace(to_replace=1, value=val) if using_copy_on_write: assert df._mgr._has_no_reference(0) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 13e2c1a249ac2..53c45a5f4b5c6 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -1279,7 +1279,9 @@ def test_categorical_replace_with_dict(self, replace_dict, final_data): b = pd.Categorical(final_data[:, 1], categories=ex_cat) expected = DataFrame({"a": a, "b": b}) - result = df.replace(replace_dict, 3) + msg2 = "with CategoricalDtype is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg2): + result = df.replace(replace_dict, 3) tm.assert_frame_equal(result, expected) msg = ( r"Attributes of DataFrame.iloc\[:, 0\] \(column name=\"a\"\) are " @@ -1288,7 +1290,8 @@ def test_categorical_replace_with_dict(self, replace_dict, final_data): with pytest.raises(AssertionError, match=msg): # ensure non-inplace call does not affect original tm.assert_frame_equal(df, expected) - return_value = df.replace(replace_dict, 3, inplace=True) + with tm.assert_produces_warning(FutureWarning, match=msg2): + return_value = df.replace(replace_dict, 3, inplace=True) assert return_value is None tm.assert_frame_equal(df, expected) @@ -1438,9 +1441,14 @@ def test_replace_value_category_type(self): ) # replace values in input dataframe - input_df = input_df.replace("d", "z") - input_df = input_df.replace("obj1", "obj9") - result = input_df.replace("cat2", "catX") + msg = ( + r"The behavior of Series\.replace \(and DataFrame.replace\) " + "with CategoricalDtype" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + input_df = input_df.replace("d", "z") + input_df = input_df.replace("obj1", "obj9") + result = input_df.replace("cat2", "catX") tm.assert_frame_equal(result, expected) @@ -1466,7 +1474,12 @@ def test_replace_dict_category_type(self): ) # replace values in input dataframe using a dict - result = input_df.replace({"a": "z", "obj1": "obj9", "cat1": "catX"}) + msg = ( + r"The behavior of Series\.replace \(and DataFrame.replace\) " + "with CategoricalDtype" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = input_df.replace({"a": "z", "obj1": "obj9", "cat1": "catX"}) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index b40c8f45b6d19..4f54621b19b64 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -546,9 +546,9 @@ def test_categorical_reducers(reduction_func, observed, sort, as_index, index_ki gb_filled = df_filled.groupby(keys, observed=observed, sort=sort, as_index=True) expected = getattr(gb_filled, reduction_func)(*args_filled).reset_index() - expected["x"] = expected["x"].replace(4, None) + expected["x"] = expected["x"].cat.remove_categories([4]) if 
index_kind == "multi": - expected["x2"] = expected["x2"].replace(4, None) + expected["x2"] = expected["x2"].cat.remove_categories([4]) if as_index: if index_kind == "multi": expected = expected.set_index(["x", "x2"]) diff --git a/pandas/tests/io/pytables/test_file_handling.py b/pandas/tests/io/pytables/test_file_handling.py index e292bc7fe251e..d93de16816725 100644 --- a/pandas/tests/io/pytables/test_file_handling.py +++ b/pandas/tests/io/pytables/test_file_handling.py @@ -341,7 +341,15 @@ def test_latin_encoding(tmp_path, setup_path, dtype, val): ser.to_hdf(store, key=key, format="table", encoding=enc, nan_rep=nan_rep) retr = read_hdf(store, key) - s_nan = ser.replace(nan_rep, np.nan) + # TODO:(3.0): once Categorical replace deprecation is enforced, + # we may be able to re-simplify the construction of s_nan + if dtype == "category": + if nan_rep in ser.cat.categories: + s_nan = ser.cat.remove_categories([nan_rep]) + else: + s_nan = ser + else: + s_nan = ser.replace(nan_rep, np.nan) tm.assert_series_equal(s_nan, retr) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 477f36bdf4214..4330153c186ca 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -403,6 +403,7 @@ def test_replace_categorical(self, categorical, numeric): # GH 24971, GH#23305 ser = pd.Series(categorical) msg = "Downcasting behavior in `replace`" + msg = "with CategoricalDtype is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): result = ser.replace({"A": 1, "B": 2}) expected = pd.Series(numeric).astype("category") @@ -418,7 +419,9 @@ def test_replace_categorical(self, categorical, numeric): def test_replace_categorical_inplace(self, data, data_exp): # GH 53358 result = pd.Series(data, dtype="category") - result.replace(to_replace="a", value="b", inplace=True) + msg = "with CategoricalDtype is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result.replace(to_replace="a", value="b", inplace=True) expected = pd.Series(data_exp, dtype="category") tm.assert_series_equal(result, expected) @@ -434,16 +437,22 @@ def test_replace_categorical_single(self): expected = expected.cat.remove_unused_categories() assert c[2] != "foo" - result = c.replace(c[2], "foo") + msg = "with CategoricalDtype is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = c.replace(c[2], "foo") tm.assert_series_equal(expected, result) assert c[2] != "foo" # ensure non-inplace call does not alter original - return_value = c.replace(c[2], "foo", inplace=True) + msg = "with CategoricalDtype is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + return_value = c.replace(c[2], "foo", inplace=True) assert return_value is None tm.assert_series_equal(expected, c) first_value = c[0] - return_value = c.replace(c[1], c[0], inplace=True) + msg = "with CategoricalDtype is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + return_value = c.replace(c[1], c[0], inplace=True) assert return_value is None assert c[0] == c[1] == first_value # test replacing with existing value From 4f8bb2b0ff03b83d8783de0918c336c4ab000327 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 9 Dec 2023 10:28:36 -0800 Subject: [PATCH 30/63] TST: misplaced tests (#56424) --- pandas/tests/io/parser/test_parse_dates.py | 46 ---------------------- pandas/tests/io/sas/test_sas7bdat.py | 22 +++++------ pandas/tests/io/test_feather.py | 3 +- 
pandas/tests/tslibs/test_parsing.py | 45 +++++++++++++++++++++ 4 files changed, 57 insertions(+), 59 deletions(-) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 113402cda1b9a..9355d6089b742 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -12,13 +12,11 @@ from io import StringIO from dateutil.parser import parse as du_parse -from hypothesis import given import numpy as np import pytest import pytz from pandas._libs.tslibs import parsing -from pandas._libs.tslibs.parsing import py_parse_datetime_string import pandas as pd from pandas import ( @@ -30,7 +28,6 @@ Timestamp, ) import pandas._testing as tm -from pandas._testing._hypothesis import DATETIME_NO_TZ from pandas.core.indexes.datetimes import date_range from pandas.core.tools.datetimes import start_caching_at @@ -1838,49 +1835,6 @@ def test_parse_multiple_delimited_dates_with_swap_warnings(): pd.to_datetime(["01/01/2000", "31/05/2000", "31/05/2001", "01/02/2000"]) -def _helper_hypothesis_delimited_date(call, date_string, **kwargs): - msg, result = None, None - try: - result = call(date_string, **kwargs) - except ValueError as err: - msg = str(err) - return msg, result - - -@given(DATETIME_NO_TZ) -@pytest.mark.parametrize("delimiter", list(" -./")) -@pytest.mark.parametrize("dayfirst", [True, False]) -@pytest.mark.parametrize( - "date_format", - ["%d %m %Y", "%m %d %Y", "%m %Y", "%Y %m %d", "%y %m %d", "%Y%m%d", "%y%m%d"], -) -def test_hypothesis_delimited_date( - request, date_format, dayfirst, delimiter, test_datetime -): - if date_format == "%m %Y" and delimiter == ".": - request.applymarker( - pytest.mark.xfail( - reason="parse_datetime_string cannot reliably tell whether " - "e.g. 
%m.%Y is a float or a date" - ) - ) - date_string = test_datetime.strftime(date_format.replace(" ", delimiter)) - - except_out_dateutil, result = _helper_hypothesis_delimited_date( - py_parse_datetime_string, date_string, dayfirst=dayfirst - ) - except_in_dateutil, expected = _helper_hypothesis_delimited_date( - du_parse, - date_string, - default=datetime(1, 1, 1), - dayfirst=dayfirst, - yearfirst=False, - ) - - assert except_out_dateutil == except_in_dateutil - assert result == expected - - # ArrowKeyError: Column 'fdate1' in include_columns does not exist in CSV file @skip_pyarrow @pytest.mark.parametrize( diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index 181405dbfee36..2c4fe10ea97a8 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -41,15 +41,15 @@ def data_test_ix(request, dirpath): class TestSAS7BDAT: @pytest.mark.slow def test_from_file(self, dirpath, data_test_ix): - df0, test_ix = data_test_ix + expected, test_ix = data_test_ix for k in test_ix: fname = os.path.join(dirpath, f"test{k}.sas7bdat") df = pd.read_sas(fname, encoding="utf-8") - tm.assert_frame_equal(df, df0) + tm.assert_frame_equal(df, expected) @pytest.mark.slow def test_from_buffer(self, dirpath, data_test_ix): - df0, test_ix = data_test_ix + expected, test_ix = data_test_ix for k in test_ix: fname = os.path.join(dirpath, f"test{k}.sas7bdat") with open(fname, "rb") as f: @@ -59,37 +59,37 @@ def test_from_buffer(self, dirpath, data_test_ix): buf, format="sas7bdat", iterator=True, encoding="utf-8" ) as rdr: df = rdr.read() - tm.assert_frame_equal(df, df0) + tm.assert_frame_equal(df, expected) @pytest.mark.slow def test_from_iterator(self, dirpath, data_test_ix): - df0, test_ix = data_test_ix + expected, test_ix = data_test_ix for k in test_ix: fname = os.path.join(dirpath, f"test{k}.sas7bdat") with pd.read_sas(fname, iterator=True, encoding="utf-8") as rdr: df = rdr.read(2) - tm.assert_frame_equal(df, df0.iloc[0:2, :]) + tm.assert_frame_equal(df, expected.iloc[0:2, :]) df = rdr.read(3) - tm.assert_frame_equal(df, df0.iloc[2:5, :]) + tm.assert_frame_equal(df, expected.iloc[2:5, :]) @pytest.mark.slow def test_path_pathlib(self, dirpath, data_test_ix): - df0, test_ix = data_test_ix + expected, test_ix = data_test_ix for k in test_ix: fname = Path(os.path.join(dirpath, f"test{k}.sas7bdat")) df = pd.read_sas(fname, encoding="utf-8") - tm.assert_frame_equal(df, df0) + tm.assert_frame_equal(df, expected) @td.skip_if_no("py.path") @pytest.mark.slow def test_path_localpath(self, dirpath, data_test_ix): from py.path import local as LocalPath - df0, test_ix = data_test_ix + expected, test_ix = data_test_ix for k in test_ix: fname = LocalPath(os.path.join(dirpath, f"test{k}.sas7bdat")) df = pd.read_sas(fname, encoding="utf-8") - tm.assert_frame_equal(df, df0) + tm.assert_frame_equal(df, expected) @pytest.mark.slow @pytest.mark.parametrize("chunksize", (3, 5, 10, 11)) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 572abbf7c48f7..15c5953e79bda 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -15,7 +15,7 @@ "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) -pyarrow = pytest.importorskip("pyarrow") +pa = pytest.importorskip("pyarrow") @pytest.mark.single_cpu @@ -169,7 +169,6 @@ def test_http_path(self, feather_file, httpserver): def test_read_feather_dtype_backend(self, string_storage, dtype_backend): # GH#50765 - pa = pytest.importorskip("pyarrow") df = 
pd.DataFrame( { "a": pd.Series([1, np.nan, 3], dtype="Int64"), diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py index 425decc14251a..d8f23156bd4d4 100644 --- a/pandas/tests/tslibs/test_parsing.py +++ b/pandas/tests/tslibs/test_parsing.py @@ -6,6 +6,7 @@ from dateutil.parser import parse as du_parse from dateutil.tz import tzlocal +from hypothesis import given import numpy as np import pytest @@ -21,6 +22,7 @@ import pandas.util._test_decorators as td import pandas._testing as tm +from pandas._testing._hypothesis import DATETIME_NO_TZ @pytest.mark.skipif( @@ -367,3 +369,46 @@ def test_guess_datetime_format_f(input): result = parsing.guess_datetime_format(input) expected = "%Y-%m-%dT%H:%M:%S.%f" assert result == expected + + +def _helper_hypothesis_delimited_date(call, date_string, **kwargs): + msg, result = None, None + try: + result = call(date_string, **kwargs) + except ValueError as err: + msg = str(err) + return msg, result + + +@given(DATETIME_NO_TZ) +@pytest.mark.parametrize("delimiter", list(" -./")) +@pytest.mark.parametrize("dayfirst", [True, False]) +@pytest.mark.parametrize( + "date_format", + ["%d %m %Y", "%m %d %Y", "%m %Y", "%Y %m %d", "%y %m %d", "%Y%m%d", "%y%m%d"], +) +def test_hypothesis_delimited_date( + request, date_format, dayfirst, delimiter, test_datetime +): + if date_format == "%m %Y" and delimiter == ".": + request.applymarker( + pytest.mark.xfail( + reason="parse_datetime_string cannot reliably tell whether " + "e.g. %m.%Y is a float or a date" + ) + ) + date_string = test_datetime.strftime(date_format.replace(" ", delimiter)) + + except_out_dateutil, result = _helper_hypothesis_delimited_date( + parsing.py_parse_datetime_string, date_string, dayfirst=dayfirst + ) + except_in_dateutil, expected = _helper_hypothesis_delimited_date( + du_parse, + date_string, + default=datetime(1, 1, 1), + dayfirst=dayfirst, + yearfirst=False, + ) + + assert except_out_dateutil == except_in_dateutil + assert result == expected From 2dcb963b6ab609b864cd5214478f272783bdf4f8 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 9 Dec 2023 10:29:50 -0800 Subject: [PATCH 31/63] TYP: EA.isin (#56423) --- pandas/core/arrays/arrow/array.py | 2 +- pandas/core/arrays/base.py | 4 ++-- pandas/core/arrays/categorical.py | 11 ++--------- pandas/core/arrays/datetimelike.py | 20 ++++++++++++-------- pandas/core/arrays/interval.py | 8 ++------ pandas/core/arrays/masked.py | 2 +- pandas/core/arrays/string_arrow.py | 3 ++- 7 files changed, 22 insertions(+), 28 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 5abdfe69e52c0..d8b074fe61322 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1023,7 +1023,7 @@ def fillna( return super().fillna(value=value, method=method, limit=limit, copy=copy) - def isin(self, values) -> npt.NDArray[np.bool_]: + def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: # short-circuit to return all False array. 
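        # A minimal illustration of this fast path (hypothetical values,
        # assuming pyarrow is installed):
        #   arr = pd.array([1, 2, 3], dtype="int64[pyarrow]")
        #   arr.isin([])  # -> array([False, False, False])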
if not len(values): return np.zeros(len(self), dtype=bool) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index e61e374009163..3272a594f4cf4 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1355,7 +1355,7 @@ def equals(self, other: object) -> bool: equal_na = self.isna() & other.isna() # type: ignore[operator] return bool((equal_values | equal_na).all()) - def isin(self, values) -> npt.NDArray[np.bool_]: + def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: """ Pointwise comparison for set containment in the given values. @@ -1363,7 +1363,7 @@ def isin(self, values) -> npt.NDArray[np.bool_]: Parameters ---------- - values : Sequence + values : np.ndarray or ExtensionArray Returns ------- diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index f0aabbb863a79..20aec52b606b6 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2570,7 +2570,7 @@ def describe(self) -> DataFrame: return result - def isin(self, values) -> npt.NDArray[np.bool_]: + def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: """ Check whether `values` are contained in Categorical. @@ -2580,7 +2580,7 @@ def isin(self, values) -> npt.NDArray[np.bool_]: Parameters ---------- - values : set or list-like + values : np.ndarray or ExtensionArray The sequence of values to test. Passing in a single string will raise a ``TypeError``. Instead, turn a single string into a list of one element. @@ -2611,13 +2611,6 @@ def isin(self, values) -> npt.NDArray[np.bool_]: >>> s.isin(['lama']) array([ True, False, True, False, True, False]) """ - if not is_list_like(values): - values_type = type(values).__name__ - raise TypeError( - "only list-like objects are allowed to be passed " - f"to isin(), you passed a `{values_type}`" - ) - values = sanitize_array(values, None, None) null_mask = np.asarray(isna(values)) code_values = self.categories.get_indexer_for(values) code_values = code_values[null_mask | (code_values >= 0)] diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index eb1c2ecc0b0fe..2a6a45ad18421 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -734,22 +734,19 @@ def map(self, mapper, na_action=None): else: return result.array - def isin(self, values) -> npt.NDArray[np.bool_]: + def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: """ Compute boolean array of whether each value is found in the passed set of values. 
Parameters ---------- - values : set or sequence of values + values : np.ndarray or ExtensionArray Returns ------- ndarray[bool] """ - if not hasattr(values, "dtype"): - values = np.asarray(values) - if values.dtype.kind in "fiuc": # TODO: de-duplicate with equals, validate_comparison_value return np.zeros(self.shape, dtype=bool) @@ -781,15 +778,22 @@ def isin(self, values) -> npt.NDArray[np.bool_]: if self.dtype.kind in "mM": self = cast("DatetimeArray | TimedeltaArray", self) - values = values.as_unit(self.unit) + # error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]" + # has no attribute "as_unit" + values = values.as_unit(self.unit) # type: ignore[union-attr] try: - self._check_compatible_with(values) + # error: Argument 1 to "_check_compatible_with" of "DatetimeLikeArrayMixin" + # has incompatible type "ExtensionArray | ndarray[Any, Any]"; expected + # "Period | Timestamp | Timedelta | NaTType" + self._check_compatible_with(values) # type: ignore[arg-type] except (TypeError, ValueError): # Includes tzawareness mismatch and IncompatibleFrequencyError return np.zeros(self.shape, dtype=bool) - return isin(self.asi8, values.asi8) + # error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]" + # has no attribute "asi8" + return isin(self.asi8, values.asi8) # type: ignore[union-attr] # ------------------------------------------------------------------ # Null Handling diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 126484ed4a2a0..383f8a49fd02c 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1789,12 +1789,8 @@ def contains(self, other): other < self._right if self.open_right else other <= self._right ) - def isin(self, values) -> npt.NDArray[np.bool_]: - if not hasattr(values, "dtype"): - values = np.array(values) - values = extract_array(values, extract_numpy=True) - - if isinstance(values.dtype, IntervalDtype): + def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: + if isinstance(values, IntervalArray): if self.closed != values.closed: # not comparable -> no overlap return np.zeros(self.shape, dtype=bool) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 201ce44ed0163..2f0cf7a67c1cc 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -955,7 +955,7 @@ def take( # error: Return type "BooleanArray" of "isin" incompatible with return type # "ndarray" in supertype "ExtensionArray" - def isin(self, values) -> BooleanArray: # type: ignore[override] + def isin(self, values: ArrayLike) -> BooleanArray: # type: ignore[override] from pandas.core.arrays import BooleanArray # algorithms.isin will eventually convert values to an ndarray, so no extra diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 976a8d3c32b23..21fe7cd8180ad 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -54,6 +54,7 @@ from collections.abc import Sequence from pandas._typing import ( + ArrayLike, AxisInt, Dtype, Scalar, @@ -212,7 +213,7 @@ def _maybe_convert_setitem_value(self, value): raise TypeError("Scalar must be NA or str") return super()._maybe_convert_setitem_value(value) - def isin(self, values) -> npt.NDArray[np.bool_]: + def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: value_set = [ pa_scalar.as_py() for pa_scalar in [pa.scalar(value, from_pandas=True) for value in values] From d95a7a79fd9626b46c29509860893954e2889638 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: 
Sat, 9 Dec 2023 10:32:18 -0800 Subject: [PATCH 32/63] DEPR: casting in datetimelike isin (#56427) * DEPR: casting in datetimelike isin * GH ref * update doctest --- doc/source/whatsnew/v2.2.0.rst | 2 ++ pandas/_libs/lib.pyx | 7 ++++-- pandas/core/arrays/datetimelike.py | 21 ++++++++++++++++ pandas/core/indexes/base.py | 12 --------- pandas/tests/test_algos.py | 39 ++++++++++++++++++++++++++++++ 5 files changed, 67 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index fb91219582b14..e51a347dec46c 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -455,6 +455,7 @@ Other Deprecations - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_xml` except ``path_or_buffer``. (:issue:`54229`) - Deprecated allowing passing :class:`BlockManager` objects to :class:`DataFrame` or :class:`SingleBlockManager` objects to :class:`Series` (:issue:`52419`) - Deprecated behavior of :meth:`Index.insert` with an object-dtype index silently performing type inference on the result, explicitly call ``result.infer_objects(copy=False)`` for the old behavior instead (:issue:`51363`) +- Deprecated casting non-datetimelike values (mainly strings) in :meth:`Series.isin` and :meth:`Index.isin` with ``datetime64``, ``timedelta64``, and :class:`PeriodDtype` dtypes (:issue:`53111`) - Deprecated downcasting behavior in :meth:`Series.where`, :meth:`DataFrame.where`, :meth:`Series.mask`, :meth:`DataFrame.mask`, :meth:`Series.clip`, :meth:`DataFrame.clip`; in a future version these will not infer object-dtype columns to non-object dtype, or all-round floats to integer dtype. Call ``result.infer_objects(copy=False)`` on the result for object inference, or explicitly cast floats to ints. To opt in to the future version, use ``pd.set_option("future.no_silent_downcasting", True)`` (:issue:`53656`) - Deprecated including the groups in computations when using :meth:`.DataFrameGroupBy.apply` and :meth:`.DataFrameGroupBy.resample`; pass ``include_groups=False`` to exclude the groups (:issue:`7155`) - Deprecated indexing an :class:`Index` with a boolean indexer of length zero (:issue:`55820`) @@ -526,6 +527,7 @@ Datetimelike ^^^^^^^^^^^^ - Bug in :class:`DatetimeIndex` construction when passing both a ``tz`` and either ``dayfirst`` or ``yearfirst`` ignoring dayfirst/yearfirst (:issue:`55813`) - Bug in :class:`DatetimeIndex` when passing an object-dtype ndarray of float objects and a ``tz`` incorrectly localizing the result (:issue:`55780`) +- Bug in :func:`Series.isin` with :class:`DatetimeTZDtype` dtype and comparison values that are all ``NaT`` incorrectly returning all-``False`` even if the series contains ``NaT`` entries (:issue:`56427`) - Bug in :func:`concat` raising ``AttributeError`` when concatenating all-NA DataFrame with :class:`DatetimeTZDtype` dtype DataFrame. (:issue:`52093`) - Bug in :func:`testing.assert_extension_array_equal` that could use the wrong unit when comparing resolutions (:issue:`55730`) - Bug in :func:`to_datetime` and :class:`DatetimeIndex` when passing a list of mixed-string-and-numeric types incorrectly raising (:issue:`55780`) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 8493f8bd066e0..c483f35513a40 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2756,8 +2756,11 @@ def maybe_convert_objects(ndarray[object] objects, res[:] = NPY_NAT return res elif dtype is not None: - # EA, we don't expect to get here, but _could_ implement - raise NotImplementedError(dtype) + # i.e. 
PeriodDtype, DatetimeTZDtype + cls = dtype.construct_array_type() + obj = cls._from_sequence([], dtype=dtype) + taker = -np.ones((objects).shape, dtype=np.intp) + return obj.take(taker, allow_fill=True) else: # we don't guess seen.object_ = True diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 2a6a45ad18421..3249f432e22a9 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -751,6 +751,8 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: # TODO: de-duplicate with equals, validate_comparison_value return np.zeros(self.shape, dtype=bool) + values = ensure_wrapped_if_datetimelike(values) + if not isinstance(values, type(self)): inferable = [ "timedelta", @@ -761,6 +763,14 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: "period", ] if values.dtype == object: + values = lib.maybe_convert_objects( + values, + convert_non_numeric=True, + dtype_if_all_nat=self.dtype, + ) + if values.dtype != object: + return self.isin(values) + inferred = lib.infer_dtype(values, skipna=False) if inferred not in inferable: if inferred == "string": @@ -775,6 +785,17 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: values = type(self)._from_sequence(values) except ValueError: return isin(self.astype(object), values) + else: + warnings.warn( + # GH#53111 + f"The behavior of 'isin' with dtype={self.dtype} and " + "castable values (e.g. strings) is deprecated. In a " + "future version, these will not be considered matching " + "by isin. Explicitly cast to the appropriate dtype before " + "calling isin instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) if self.dtype.kind in "mM": self = cast("DatetimeArray | TimedeltaArray", self) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 3d3056f47f15e..febceeb7623b5 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -6534,18 +6534,6 @@ def isin(self, values, level=None) -> npt.NDArray[np.bool_]: >>> midx.isin([(1, 'red'), (3, 'red')]) array([ True, False, False]) - - For a DatetimeIndex, string values in `values` are converted to - Timestamps. 
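A minimal sketch of the forward-compatible spelling this deprecation (GH#53111)
asks for, assuming pandas 2.2; casting explicitly avoids the FutureWarning::

    dti = pd.to_datetime(["2000-03-11", "2000-03-12", "2000-03-13"])
    dti.isin(["2000-03-11"])                  # deprecated: the string is cast first, with a FutureWarning
    dti.isin(pd.to_datetime(["2000-03-11"]))  # cast explicitly; matches the future behavior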
- - >>> dates = ['2000-03-11', '2000-03-12', '2000-03-13'] - >>> dti = pd.to_datetime(dates) - >>> dti - DatetimeIndex(['2000-03-11', '2000-03-12', '2000-03-13'], - dtype='datetime64[ns]', freq=None) - - >>> dti.isin(['2000-03-11']) - array([ True, False, False]) """ if level is not None: self._validate_index_level(level) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 5356704cc64a2..d9162c428b492 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -992,6 +992,45 @@ def test_large(self): expected[1] = True tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize("dtype", ["m8[ns]", "M8[ns]", "M8[ns, UTC]", "period[D]"]) + def test_isin_datetimelike_all_nat(self, dtype): + # GH#56427 + dta = date_range("2013-01-01", periods=3)._values + arr = Series(dta.view("i8")).array.view(dtype) + + arr[0] = NaT + result = algos.isin(arr, [NaT]) + expected = np.array([True, False, False], dtype=bool) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize("dtype", ["m8[ns]", "M8[ns]", "M8[ns, UTC]"]) + def test_isin_datetimelike_strings_deprecated(self, dtype): + # GH#53111 + dta = date_range("2013-01-01", periods=3)._values + arr = Series(dta.view("i8")).array.view(dtype) + + vals = [str(x) for x in arr] + msg = "The behavior of 'isin' with dtype=.* is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = algos.isin(arr, vals) + assert res.all() + + vals2 = np.array(vals, dtype=str) + with tm.assert_produces_warning(FutureWarning, match=msg): + res2 = algos.isin(arr, vals2) + assert res2.all() + + def test_isin_dt64tz_with_nat(self): + # the all-NaT values used to get inferred to tznaive, which was evaluated + # as non-matching GH#56427 + dti = date_range("2016-01-01", periods=3, tz="UTC") + ser = Series(dti) + ser[0] = NaT + + res = algos.isin(ser._values, [NaT]) + exp = np.array([True, False, False], dtype=bool) + tm.assert_numpy_array_equal(res, exp) + def test_categorical_from_codes(self): # GH 16639 vals = np.array([0, 1, 2, 0]) From a6c0ae48fe4da5ee6356e8370f0b401be0f16b1a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 9 Dec 2023 10:36:28 -0800 Subject: [PATCH 33/63] PERF: datetimelike addition (#56373) * PERF: datetimelike addition * re-add import * remove no-longer-used * mypy fixup * troubleshoot 32bit builds --- asv_bench/benchmarks/arithmetic.py | 37 ------------ pandas/_libs/tslibs/__init__.py | 2 + pandas/_libs/tslibs/np_datetime.pxd | 2 + pandas/_libs/tslibs/np_datetime.pyi | 4 ++ pandas/_libs/tslibs/np_datetime.pyx | 41 +++++++++++++ pandas/core/algorithms.py | 92 ----------------------------- pandas/core/arrays/datetimelike.py | 20 ++----- pandas/core/arrays/period.py | 11 +--- pandas/tests/test_algos.py | 51 ---------------- pandas/tests/tslibs/test_api.py | 1 + 10 files changed, 59 insertions(+), 202 deletions(-) diff --git a/asv_bench/benchmarks/arithmetic.py b/asv_bench/benchmarks/arithmetic.py index 5e23cba2e1074..6b1f75187f887 100644 --- a/asv_bench/benchmarks/arithmetic.py +++ b/asv_bench/benchmarks/arithmetic.py @@ -12,7 +12,6 @@ date_range, to_timedelta, ) -from pandas.core.algorithms import checked_add_with_arr from .pandas_vb_common import numeric_dtypes @@ -389,42 +388,6 @@ def time_add_timedeltas(self, df): df["timedelta"] + df["timedelta"] -class AddOverflowScalar: - params = [1, -1, 0] - param_names = ["scalar"] - - def setup(self, scalar): - N = 10**6 - self.arr = np.arange(N) - - def time_add_overflow_scalar(self, scalar): - 
checked_add_with_arr(self.arr, scalar) - - -class AddOverflowArray: - def setup(self): - N = 10**6 - self.arr = np.arange(N) - self.arr_rev = np.arange(-N, 0) - self.arr_mixed = np.array([1, -1]).repeat(N / 2) - self.arr_nan_1 = np.random.choice([True, False], size=N) - self.arr_nan_2 = np.random.choice([True, False], size=N) - - def time_add_overflow_arr_rev(self): - checked_add_with_arr(self.arr, self.arr_rev) - - def time_add_overflow_arr_mask_nan(self): - checked_add_with_arr(self.arr, self.arr_mixed, arr_mask=self.arr_nan_1) - - def time_add_overflow_b_mask_nan(self): - checked_add_with_arr(self.arr, self.arr_mixed, b_mask=self.arr_nan_1) - - def time_add_overflow_both_arg_nan(self): - checked_add_with_arr( - self.arr, self.arr_mixed, arr_mask=self.arr_nan_1, b_mask=self.arr_nan_2 - ) - - hcal = pd.tseries.holiday.USFederalHolidayCalendar() # These offsets currently raise a NotImplementedError with .apply_index() non_apply = [ diff --git a/pandas/_libs/tslibs/__init__.py b/pandas/_libs/tslibs/__init__.py index c622121578dcb..b626959203295 100644 --- a/pandas/_libs/tslibs/__init__.py +++ b/pandas/_libs/tslibs/__init__.py @@ -34,6 +34,7 @@ "npy_unit_to_abbrev", "get_supported_reso", "guess_datetime_format", + "add_overflowsafe", ] from pandas._libs.tslibs import dtypes # pylint: disable=import-self @@ -55,6 +56,7 @@ from pandas._libs.tslibs.np_datetime import ( OutOfBoundsDatetime, OutOfBoundsTimedelta, + add_overflowsafe, astype_overflowsafe, is_unitless, py_get_unit_from_dtype as get_unit_from_dtype, diff --git a/pandas/_libs/tslibs/np_datetime.pxd b/pandas/_libs/tslibs/np_datetime.pxd index a87c3d3f0955d..cb2658d343772 100644 --- a/pandas/_libs/tslibs/np_datetime.pxd +++ b/pandas/_libs/tslibs/np_datetime.pxd @@ -118,3 +118,5 @@ cdef int64_t convert_reso( NPY_DATETIMEUNIT to_reso, bint round_ok, ) except? -1 + +cpdef cnp.ndarray add_overflowsafe(cnp.ndarray left, cnp.ndarray right) diff --git a/pandas/_libs/tslibs/np_datetime.pyi b/pandas/_libs/tslibs/np_datetime.pyi index c42bc43ac9d89..5a4ba673dbeff 100644 --- a/pandas/_libs/tslibs/np_datetime.pyi +++ b/pandas/_libs/tslibs/np_datetime.pyi @@ -19,3 +19,7 @@ def is_unitless(dtype: np.dtype) -> bool: ... def compare_mismatched_resolutions( left: np.ndarray, right: np.ndarray, op ) -> npt.NDArray[np.bool_]: ... +def add_overflowsafe( + left: npt.NDArray[np.int64], + right: npt.NDArray[np.int64], +) -> npt.NDArray[np.int64]: ... diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 9958206c51b7a..5f5e75b1e64d0 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -1,3 +1,4 @@ +cimport cython from cpython.datetime cimport ( PyDateTime_CheckExact, PyDateTime_DATE_GET_HOUR, @@ -678,3 +679,43 @@ cdef int64_t _convert_reso_with_dtstruct( raise OutOfBoundsDatetime from err return result + + +@cython.overflowcheck(True) +cpdef cnp.ndarray add_overflowsafe(cnp.ndarray left, cnp.ndarray right): + """ + Overflow-safe addition for datetime64/timedelta64 dtypes. + + `right` may either be zero-dim or of the same shape as `left`. + """ + cdef: + Py_ssize_t N = left.size + int64_t lval, rval, res_value + ndarray iresult = cnp.PyArray_EMPTY( + left.ndim, left.shape, cnp.NPY_INT64, 0 + ) + cnp.broadcast mi = cnp.PyArray_MultiIterNew3(iresult, left, right) + + # Note: doing this try/except outside the loop improves performance over + # doing it inside the loop. 
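    # e.g. (illustrative; iNaT is the int64 sentinel for NaT):
    #   add_overflowsafe(np.array([1, iNaT], dtype="i8"),
    #                    np.array([2, 2], dtype="i8"))
    #   # -> array([3, iNaT]); NaT entries propagate instead of being added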
+    try:
+        for i in range(N):
+            # Analogous to: lval = lvalues[i]
+            lval = (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 1))[0]
+
+            # Analogous to: rval = rvalues[i]
+            rval = (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 2))[0]
+
+            if lval == NPY_DATETIME_NAT or rval == NPY_DATETIME_NAT:
+                res_value = NPY_DATETIME_NAT
+            else:
+                res_value = lval + rval
+
+            # Analogous to: result[i] = res_value
+            (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 0))[0] = res_value
+
+            cnp.PyArray_MultiIter_NEXT(mi)
+    except OverflowError as err:
+        raise OverflowError("Overflow in int64 addition") from err
+
+    return iresult
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 82de8ae96160f..03f06da5f84e1 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -1119,98 +1119,6 @@ def rank(
     return ranks


-def checked_add_with_arr(
-    arr: npt.NDArray[np.int64],
-    b: int | npt.NDArray[np.int64],
-    arr_mask: npt.NDArray[np.bool_] | None = None,
-    b_mask: npt.NDArray[np.bool_] | None = None,
-) -> npt.NDArray[np.int64]:
-    """
-    Perform array addition that checks for underflow and overflow.
-
-    Performs the addition of an int64 array and an int64 integer (or array)
-    but checks that they do not result in overflow first. For elements that
-    are indicated to be NaN, whether or not there is overflow for that element
-    is automatically ignored.
-
-    Parameters
-    ----------
-    arr : np.ndarray[int64] addend.
-    b : array or scalar addend.
-    arr_mask : np.ndarray[bool] or None, default None
-        array indicating which elements to exclude from checking
-    b_mask : np.ndarray[bool] or None, default None
-        array or scalar indicating which element(s) to exclude from checking
-
-    Returns
-    -------
-    sum : An array for elements x + b for each element x in arr if b is
-          a scalar or an array for elements x + y for each element pair
-          (x, y) in (arr, b).
-
-    Raises
-    ------
-    OverflowError if any x + y exceeds the maximum or minimum int64 value.
-    """
-    # For performance reasons, we broadcast 'b' to the new array 'b2'
-    # so that it has the same size as 'arr'.
-    b2 = np.broadcast_to(b, arr.shape)
-    if b_mask is not None:
-        # We do the same broadcasting for b_mask as well.
-        b2_mask = np.broadcast_to(b_mask, arr.shape)
-    else:
-        b2_mask = None
-
-    # For elements that are NaN, regardless of their value, we should
-    # ignore whether they overflow or not when doing the checked add.
-    if arr_mask is not None and b2_mask is not None:
-        not_nan = np.logical_not(arr_mask | b2_mask)
-    elif arr_mask is not None:
-        not_nan = np.logical_not(arr_mask)
-    elif b_mask is not None:
-        # error: Argument 1 to "__call__" of "_UFunc_Nin1_Nout1" has
-        # incompatible type "Optional[ndarray[Any, dtype[bool_]]]";
-        # expected "Union[_SupportsArray[dtype[Any]], _NestedSequence
-        # [_SupportsArray[dtype[Any]]], bool, int, float, complex, str
-        # , bytes, _NestedSequence[Union[bool, int, float, complex, str
-        # , bytes]]]"
-        not_nan = np.logical_not(b2_mask)  # type: ignore[arg-type]
-    else:
-        not_nan = np.empty(arr.shape, dtype=bool)
-        not_nan.fill(True)
-
-    # gh-14324: For each element in 'arr' and its corresponding element
-    # in 'b2', we check the sign of the element in 'b2'. If it is positive,
-    # we then check whether its sum with the element in 'arr' exceeds
-    # np.iinfo(np.int64).max. If so, we have an overflow error. If it
-    # it is negative, we then check whether its sum with the element in
-    # 'arr' exceeds np.iinfo(np.int64).min. If so, we have an overflow
-    # error as well.
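# For instance (hypothetical values), with m = np.iinfo(np.int64).max the
# helper raised instead of silently wrapping around, a contract the new
# add_overflowsafe keeps:
#   add_overflowsafe(np.array([m], dtype="i8"), np.array([1], dtype="i8"))
#   # -> OverflowError: Overflow in int64 addition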
- i8max = lib.i8max - i8min = iNaT - - mask1 = b2 > 0 - mask2 = b2 < 0 - - if not mask1.any(): - to_raise = ((i8min - b2 > arr) & not_nan).any() - elif not mask2.any(): - to_raise = ((i8max - b2 < arr) & not_nan).any() - else: - to_raise = ((i8max - b2[mask1] < arr[mask1]) & not_nan[mask1]).any() or ( - (i8min - b2[mask2] > arr[mask2]) & not_nan[mask2] - ).any() - - if to_raise: - raise OverflowError("Overflow in int64 addition") - - result = arr + b - if arr_mask is not None or b2_mask is not None: - np.putmask(result, ~not_nan, iNaT) - - return result - - # ---- # # take # # ---- # diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 3249f432e22a9..879f477106ae9 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -35,6 +35,7 @@ Tick, Timedelta, Timestamp, + add_overflowsafe, astype_overflowsafe, get_unit_from_dtype, iNaT, @@ -112,7 +113,6 @@ ops, ) from pandas.core.algorithms import ( - checked_add_with_arr, isin, map_array, unique1d, @@ -1038,7 +1038,7 @@ def _get_i8_values_and_mask( self, other ) -> tuple[int | npt.NDArray[np.int64], None | npt.NDArray[np.bool_]]: """ - Get the int64 values and b_mask to pass to checked_add_with_arr. + Get the int64 values and b_mask to pass to add_overflowsafe. """ if isinstance(other, Period): i8values = other.ordinal @@ -1094,9 +1094,7 @@ def _add_datetimelike_scalar(self, other) -> DatetimeArray: self = cast("TimedeltaArray", self) other_i8, o_mask = self._get_i8_values_and_mask(other) - result = checked_add_with_arr( - self.asi8, other_i8, arr_mask=self._isnan, b_mask=o_mask - ) + result = add_overflowsafe(self.asi8, np.asarray(other_i8, dtype="i8")) res_values = result.view(f"M8[{self.unit}]") dtype = tz_to_dtype(tz=other.tz, unit=self.unit) @@ -1159,9 +1157,7 @@ def _sub_datetimelike(self, other: Timestamp | DatetimeArray) -> TimedeltaArray: raise type(err)(new_message) from err other_i8, o_mask = self._get_i8_values_and_mask(other) - res_values = checked_add_with_arr( - self.asi8, -other_i8, arr_mask=self._isnan, b_mask=o_mask - ) + res_values = add_overflowsafe(self.asi8, np.asarray(-other_i8, dtype="i8")) res_m8 = res_values.view(f"timedelta64[{self.unit}]") new_freq = self._get_arithmetic_result_freq(other) @@ -1227,9 +1223,7 @@ def _add_timedeltalike(self, other: Timedelta | TimedeltaArray): self = cast("DatetimeArray | TimedeltaArray", self) other_i8, o_mask = self._get_i8_values_and_mask(other) - new_values = checked_add_with_arr( - self.asi8, other_i8, arr_mask=self._isnan, b_mask=o_mask - ) + new_values = add_overflowsafe(self.asi8, np.asarray(other_i8, dtype="i8")) res_values = new_values.view(self._ndarray.dtype) new_freq = self._get_arithmetic_result_freq(other) @@ -1297,9 +1291,7 @@ def _sub_periodlike(self, other: Period | PeriodArray) -> npt.NDArray[np.object_ self._check_compatible_with(other) other_i8, o_mask = self._get_i8_values_and_mask(other) - new_i8_data = checked_add_with_arr( - self.asi8, -other_i8, arr_mask=self._isnan, b_mask=o_mask - ) + new_i8_data = add_overflowsafe(self.asi8, np.asarray(-other_i8, dtype="i8")) new_data = np.array([self.freq.base * x for x in new_i8_data]) if o_mask is None: diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 1ff3896eea798..e3492dd21ea57 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -25,6 +25,7 @@ NaT, NaTType, Timedelta, + add_overflowsafe, astype_overflowsafe, dt64arr_to_periodarr as c_dt64arr_to_periodarr, get_unit_from_dtype, @@ -72,7 +73,6 
@@ ) from pandas.core.dtypes.missing import isna -import pandas.core.algorithms as algos from pandas.core.arrays import datetimelike as dtl import pandas.core.common as com @@ -855,7 +855,7 @@ def _addsub_int_array_or_scalar( assert op in [operator.add, operator.sub] if op is operator.sub: other = -other - res_values = algos.checked_add_with_arr(self.asi8, other, arr_mask=self._isnan) + res_values = add_overflowsafe(self.asi8, np.asarray(other, dtype="i8")) return type(self)(res_values, dtype=self.dtype) def _add_offset(self, other: BaseOffset): @@ -920,12 +920,7 @@ def _add_timedelta_arraylike( "not an integer multiple of the PeriodArray's freq." ) from err - b_mask = np.isnat(delta) - - res_values = algos.checked_add_with_arr( - self.asi8, delta.view("i8"), arr_mask=self._isnan, b_mask=b_mask - ) - np.putmask(res_values, self._isnan | b_mask, iNaT) + res_values = add_overflowsafe(self.asi8, np.asarray(delta.view("i8"))) return type(self)(res_values, dtype=self.dtype) def _check_timedeltalike_freq_compat(self, other): diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index d9162c428b492..718d1b3ee2e83 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1837,57 +1837,6 @@ def test_pct_max_many_rows(self): assert result == 1 -def test_int64_add_overflow(): - # see gh-14068 - msg = "Overflow in int64 addition" - m = np.iinfo(np.int64).max - n = np.iinfo(np.int64).min - - with pytest.raises(OverflowError, match=msg): - algos.checked_add_with_arr(np.array([m, m]), m) - with pytest.raises(OverflowError, match=msg): - algos.checked_add_with_arr(np.array([m, m]), np.array([m, m])) - with pytest.raises(OverflowError, match=msg): - algos.checked_add_with_arr(np.array([n, n]), n) - with pytest.raises(OverflowError, match=msg): - algos.checked_add_with_arr(np.array([n, n]), np.array([n, n])) - with pytest.raises(OverflowError, match=msg): - algos.checked_add_with_arr(np.array([m, n]), np.array([n, n])) - with pytest.raises(OverflowError, match=msg): - algos.checked_add_with_arr( - np.array([m, m]), np.array([m, m]), arr_mask=np.array([False, True]) - ) - with pytest.raises(OverflowError, match=msg): - algos.checked_add_with_arr( - np.array([m, m]), np.array([m, m]), b_mask=np.array([False, True]) - ) - with pytest.raises(OverflowError, match=msg): - algos.checked_add_with_arr( - np.array([m, m]), - np.array([m, m]), - arr_mask=np.array([False, True]), - b_mask=np.array([False, True]), - ) - with pytest.raises(OverflowError, match=msg): - algos.checked_add_with_arr(np.array([m, m]), np.array([np.nan, m])) - - # Check that the nan boolean arrays override whether or not - # the addition overflows. We don't check the result but just - # the fact that an OverflowError is not raised. 
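# Note: this mask-based escape hatch has no direct analogue in
# add_overflowsafe; there, missing entries are encoded as iNaT in the int64
# data itself and are skipped before the addition, so no masks are needed.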
- algos.checked_add_with_arr( - np.array([m, m]), np.array([m, m]), arr_mask=np.array([True, True]) - ) - algos.checked_add_with_arr( - np.array([m, m]), np.array([m, m]), b_mask=np.array([True, True]) - ) - algos.checked_add_with_arr( - np.array([m, m]), - np.array([m, m]), - arr_mask=np.array([True, False]), - b_mask=np.array([False, True]), - ) - - class TestMode: def test_no_mode(self): exp = Series([], dtype=np.float64, index=Index([], dtype=int)) diff --git a/pandas/tests/tslibs/test_api.py b/pandas/tests/tslibs/test_api.py index b52bc78d58296..e02cea2fef426 100644 --- a/pandas/tests/tslibs/test_api.py +++ b/pandas/tests/tslibs/test_api.py @@ -58,6 +58,7 @@ def test_namespace(): "get_supported_reso", "npy_unit_to_abbrev", "guess_datetime_format", + "add_overflowsafe", ] expected = set(submodules + api) From 8614088df2338c1db170bcd0961348220669e082 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 9 Dec 2023 10:38:17 -0800 Subject: [PATCH 34/63] BUG: fillna with mixed-resolution dt64/td64 (#56413) * BUG: fillna with mixed-resolution dt64/td64 * mypy fixup * troubleshoot docbuild * typo fixup in whatsnew --- doc/source/whatsnew/v2.2.0.rst | 2 +- pandas/core/arrays/_mixins.py | 6 ++ pandas/core/arrays/datetimelike.py | 10 +- pandas/tests/series/methods/test_fillna.py | 111 +++++++++++++++++---- 4 files changed, 109 insertions(+), 20 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index e51a347dec46c..6006fcbcdbf20 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -537,6 +537,7 @@ Datetimelike - Bug in :meth:`Index.is_monotonic_increasing` and :meth:`Index.is_monotonic_decreasing` always caching :meth:`Index.is_unique` as ``True`` when first value in index is ``NaT`` (:issue:`55755`) - Bug in :meth:`Index.view` to a datetime64 dtype with non-supported resolution incorrectly raising (:issue:`55710`) - Bug in :meth:`Series.dt.round` with non-nanosecond resolution and ``NaT`` entries incorrectly raising ``OverflowError`` (:issue:`56158`) +- Bug in :meth:`Series.fillna` with non-nanosecond resolution dtypes and higher-resolution vector values returning incorrect (internally-corrupted) results (:issue:`56410`) - Bug in :meth:`Tick.delta` with very large ticks raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`) - Bug in :meth:`Timestamp.unit` being inferred incorrectly from an ISO8601 format string with minute or hour resolution and a timezone offset (:issue:`56208`) - Bug in ``.astype`` converting from a higher-resolution ``datetime64`` dtype to a lower-resolution ``datetime64`` dtype (e.g. 
``datetime64[us]->datetime64[ms]``) silently overflowing with values near the lower implementation bound (:issue:`55979`)
@@ -550,7 +551,6 @@ Datetimelike
 - Bug in parsing datetime strings with nanosecond resolution with non-ISO8601 formats incorrectly truncating sub-microsecond components (:issue:`56051`)
 - Bug in parsing datetime strings with sub-second resolution and trailing zeros incorrectly inferring second or millisecond resolution (:issue:`55737`)
 - Bug in the results of :func:`to_datetime` with a floating-dtype argument with ``unit`` not matching the pointwise results of :class:`Timestamp` (:issue:`56037`)
--

 Timedelta
 ^^^^^^^^^
diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py
index d6f4dbfe7f549..8d1f5262e7911 100644
--- a/pandas/core/arrays/_mixins.py
+++ b/pandas/core/arrays/_mixins.py
@@ -430,6 +430,12 @@ def _where(self: Self, mask: npt.NDArray[np.bool_], value) -> Self:

         value = self._validate_setitem_value(value)
         res_values = np.where(mask, self._ndarray, value)
+        if res_values.dtype != self._ndarray.dtype:
+            raise AssertionError(
+                # GH#56410
+                "Something has gone wrong, please report a bug at "
+                "github.com/pandas-dev/pandas/"
+            )
         return self._from_backing_data(res_values)

     # ------------------------------------------------------------------------
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index 879f477106ae9..8928c72de750c 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -646,6 +646,9 @@ def _validation_error_message(self, value, allow_listlike: bool = False) -> str:

     def _validate_listlike(self, value, allow_object: bool = False):
         if isinstance(value, type(self)):
+            if self.dtype.kind in "mM" and not allow_object:
+                # error: "DatetimeLikeArrayMixin" has no attribute "as_unit"
+                value = value.as_unit(self.unit, round_ok=False)  # type: ignore[attr-defined]
             return value

         if isinstance(value, list) and len(value) == 0:
@@ -694,6 +697,9 @@ def _validate_listlike(self, value, allow_object: bool = False):
             msg = self._validation_error_message(value, True)
             raise TypeError(msg)

+        if self.dtype.kind in "mM" and not allow_object:
+            # error: "DatetimeLikeArrayMixin" has no attribute "as_unit"
+            value = value.as_unit(self.unit, round_ok=False)  # type: ignore[attr-defined]
         return value

     def _validate_setitem_value(self, value):
@@ -2138,12 +2144,12 @@ def unit(self) -> str:
         # "ExtensionDtype"; expected "Union[DatetimeTZDtype, dtype[Any]]"
         return dtype_to_unit(self.dtype)  # type: ignore[arg-type]

-    def as_unit(self, unit: str) -> Self:
+    def as_unit(self, unit: str, round_ok: bool = True) -> Self:
         if unit not in ["s", "ms", "us", "ns"]:
             raise ValueError("Supported units are 's', 'ms', 'us', 'ns'")
         dtype = np.dtype(f"{self.dtype.kind}8[{unit}]")

-        new_values = astype_overflowsafe(self._ndarray, dtype, round_ok=True)
+        new_values = astype_overflowsafe(self._ndarray, dtype, round_ok=round_ok)

         if isinstance(self.dtype, np.dtype):
             new_dtype = new_values.dtype
diff --git a/pandas/tests/series/methods/test_fillna.py b/pandas/tests/series/methods/test_fillna.py
index a5170898b1720..acc5805578f22 100644
--- a/pandas/tests/series/methods/test_fillna.py
+++ b/pandas/tests/series/methods/test_fillna.py
@@ -19,6 +19,7 @@
     Timestamp,
     date_range,
     isna,
+    timedelta_range,
 )
 import pandas._testing as tm
 from pandas.core.arrays import period_array
@@ -239,7 +240,7 @@ def test_fillna_downcast_infer_objects_to_numeric(self):
         expected = Series([0, 1, 2.5, 4, 4], dtype=np.float64)
tm.assert_series_equal(res, expected) - def test_timedelta_fillna(self, frame_or_series): + def test_timedelta_fillna(self, frame_or_series, unit): # GH#3371 ser = Series( [ @@ -247,7 +248,8 @@ def test_timedelta_fillna(self, frame_or_series): Timestamp("20130101"), Timestamp("20130102"), Timestamp("20130103 9:01:01"), - ] + ], + dtype=f"M8[{unit}]", ) td = ser.diff() obj = frame_or_series(td).copy() @@ -260,7 +262,8 @@ def test_timedelta_fillna(self, frame_or_series): timedelta(0), timedelta(1), timedelta(days=1, seconds=9 * 3600 + 60 + 1), - ] + ], + dtype=f"m8[{unit}]", ) expected = frame_or_series(expected) tm.assert_equal(result, expected) @@ -279,7 +282,8 @@ def test_timedelta_fillna(self, frame_or_series): timedelta(0), timedelta(1), timedelta(days=1, seconds=9 * 3600 + 60 + 1), - ] + ], + dtype=f"m8[{unit}]", ) expected = frame_or_series(expected) tm.assert_equal(result, expected) @@ -291,7 +295,8 @@ def test_timedelta_fillna(self, frame_or_series): timedelta(0), timedelta(1), timedelta(days=1, seconds=9 * 3600 + 60 + 1), - ] + ], + dtype=f"m8[{unit}]", ) expected = frame_or_series(expected) tm.assert_equal(result, expected) @@ -303,7 +308,8 @@ def test_timedelta_fillna(self, frame_or_series): timedelta(0), timedelta(1), timedelta(days=1, seconds=9 * 3600 + 60 + 1), - ] + ], + dtype=f"m8[{unit}]", ) expected = frame_or_series(expected) tm.assert_equal(result, expected) @@ -316,7 +322,7 @@ def test_timedelta_fillna(self, frame_or_series): timedelta(1), timedelta(days=1, seconds=9 * 3600 + 60 + 1), ], - dtype="m8[ns]", + dtype=f"m8[{unit}]", ) expected = frame_or_series(expected) tm.assert_equal(result, expected) @@ -375,6 +381,72 @@ def test_datetime64_fillna(self): ) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( + "scalar", + [ + False, + pytest.param( + True, + marks=pytest.mark.xfail( + reason="GH#56410 scalar case not yet addressed" + ), + ), + ], + ) + @pytest.mark.parametrize("tz", [None, "UTC"]) + def test_datetime64_fillna_mismatched_reso_no_rounding(self, tz, scalar): + # GH#56410 + dti = date_range("2016-01-01", periods=3, unit="s", tz=tz) + item = Timestamp("2016-02-03 04:05:06.789", tz=tz) + vec = date_range(item, periods=3, unit="ms") + + exp_dtype = "M8[ms]" if tz is None else "M8[ms, UTC]" + expected = Series([item, dti[1], dti[2]], dtype=exp_dtype) + + ser = Series(dti) + ser[0] = NaT + ser2 = ser.copy() + + res = ser.fillna(item) + res2 = ser2.fillna(Series(vec)) + + if scalar: + tm.assert_series_equal(res, expected) + else: + tm.assert_series_equal(res2, expected) + + @pytest.mark.parametrize( + "scalar", + [ + False, + pytest.param( + True, + marks=pytest.mark.xfail( + reason="GH#56410 scalar case not yet addressed" + ), + ), + ], + ) + def test_timedelta64_fillna_mismatched_reso_no_rounding(self, scalar): + # GH#56410 + tdi = date_range("2016-01-01", periods=3, unit="s") - Timestamp("1970-01-01") + item = Timestamp("2016-02-03 04:05:06.789") - Timestamp("1970-01-01") + vec = timedelta_range(item, periods=3, unit="ms") + + expected = Series([item, tdi[1], tdi[2]], dtype="m8[ms]") + + ser = Series(tdi) + ser[0] = NaT + ser2 = ser.copy() + + res = ser.fillna(item) + res2 = ser2.fillna(Series(vec)) + + if scalar: + tm.assert_series_equal(res, expected) + else: + tm.assert_series_equal(res2, expected) + def test_datetime64_fillna_backfill(self): # GH#6587 # make sure that we are treating as integer when filling @@ -392,7 +464,7 @@ def test_datetime64_fillna_backfill(self): tm.assert_series_equal(result, expected) @pytest.mark.parametrize("tz", 
["US/Eastern", "Asia/Tokyo"]) - def test_datetime64_tz_fillna(self, tz): + def test_datetime64_tz_fillna(self, tz, unit): # DatetimeLikeBlock ser = Series( [ @@ -400,7 +472,8 @@ def test_datetime64_tz_fillna(self, tz): NaT, Timestamp("2011-01-03 10:00"), NaT, - ] + ], + dtype=f"M8[{unit}]", ) null_loc = Series([False, True, False, True]) @@ -411,7 +484,8 @@ def test_datetime64_tz_fillna(self, tz): Timestamp("2011-01-02 10:00"), Timestamp("2011-01-03 10:00"), Timestamp("2011-01-02 10:00"), - ] + ], + dtype=f"M8[{unit}]", ) tm.assert_series_equal(expected, result) # check s is not changed @@ -468,15 +542,18 @@ def test_datetime64_tz_fillna(self, tz): Timestamp("2011-01-02 10:00"), Timestamp("2011-01-03 10:00"), Timestamp("2011-01-04 10:00"), - ] + ], + dtype=f"M8[{unit}]", ) tm.assert_series_equal(expected, result) tm.assert_series_equal(isna(ser), null_loc) # DatetimeTZBlock - idx = DatetimeIndex(["2011-01-01 10:00", NaT, "2011-01-03 10:00", NaT], tz=tz) + idx = DatetimeIndex( + ["2011-01-01 10:00", NaT, "2011-01-03 10:00", NaT], tz=tz + ).as_unit(unit) ser = Series(idx) - assert ser.dtype == f"datetime64[ns, {tz}]" + assert ser.dtype == f"datetime64[{unit}, {tz}]" tm.assert_series_equal(isna(ser), null_loc) result = ser.fillna(Timestamp("2011-01-02 10:00")) @@ -500,7 +577,7 @@ def test_datetime64_tz_fillna(self, tz): "2011-01-02 10:00", ], tz=tz, - ) + ).as_unit(unit) expected = Series(idx) tm.assert_series_equal(expected, result) tm.assert_series_equal(isna(ser), null_loc) @@ -514,7 +591,7 @@ def test_datetime64_tz_fillna(self, tz): "2011-01-02 10:00", ], tz=tz, - ) + ).as_unit(unit) expected = Series(idx) tm.assert_series_equal(expected, result) tm.assert_series_equal(isna(ser), null_loc) @@ -562,7 +639,7 @@ def test_datetime64_tz_fillna(self, tz): Timestamp("2011-01-03 10:00", tz=tz), Timestamp("2011-01-04 10:00", tz=tz), ] - ) + ).dt.as_unit(unit) tm.assert_series_equal(expected, result) tm.assert_series_equal(isna(ser), null_loc) @@ -589,7 +666,7 @@ def test_datetime64_tz_fillna(self, tz): Timestamp("2011-01-03 10:00", tz=tz), Timestamp("2013-01-01", tz="US/Pacific").tz_convert(tz), ] - ) + ).dt.as_unit(unit) tm.assert_series_equal(expected, result) tm.assert_series_equal(isna(ser), null_loc) From cb56347d1078e1c683f29e7e99ca593901d63a42 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 9 Dec 2023 10:57:27 -0800 Subject: [PATCH 35/63] DEPR: type argument in Index.view (#56421) --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/indexes/base.py | 10 ++++++++++ pandas/tests/indexes/datetimes/test_setops.py | 2 +- pandas/tests/indexes/numeric/test_numeric.py | 4 +++- pandas/tests/indexes/ranges/test_range.py | 8 ++++++-- pandas/tests/indexes/test_datetimelike.py | 4 +++- pandas/tests/indexes/test_old_base.py | 4 +++- 7 files changed, 27 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 6006fcbcdbf20..486440af46b79 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -438,6 +438,7 @@ Other Deprecations - Deprecated :meth:`Series.view`, use :meth:`Series.astype` instead to change the dtype (:issue:`20251`) - Deprecated ``core.internals`` members ``Block``, ``ExtensionBlock``, and ``DatetimeTZBlock``, use public APIs instead (:issue:`55139`) - Deprecated ``year``, ``month``, ``quarter``, ``day``, ``hour``, ``minute``, and ``second`` keywords in the :class:`PeriodIndex` constructor, use :meth:`PeriodIndex.from_fields` instead (:issue:`55960`) +- Deprecated accepting a type as an argument in 
:meth:`Index.view`, call without any arguments instead (:issue:`55709`) - Deprecated allowing non-integer ``periods`` argument in :func:`date_range`, :func:`timedelta_range`, :func:`period_range`, and :func:`interval_range` (:issue:`56036`) - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_clipboard`. (:issue:`54229`) - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_csv` except ``path_or_buf``. (:issue:`54229`) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index febceeb7623b5..f9f42b9788a25 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1015,6 +1015,16 @@ def view(self, cls=None): result = self._data.view(cls) else: + if cls is not None: + warnings.warn( + # GH#55709 + f"Passing a type in {type(self).__name__}.view is deprecated " + "and will raise in a future version. " + "Call view without any argument to retain the old behavior.", + FutureWarning, + stacklevel=find_stack_level(), + ) + result = self._view() if isinstance(result, Index): result._id = self._id diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index 3ed7fcc027a06..fc3a1d4721841 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -535,7 +535,7 @@ def test_intersection(self): assert isinstance(the_int, DatetimeIndex) assert the_int.freq == rng.freq - the_int = rng1.intersection(rng2.view(DatetimeIndex)) + the_int = rng1.intersection(rng2) tm.assert_index_equal(the_int, expected) # non-overlapping diff --git a/pandas/tests/indexes/numeric/test_numeric.py b/pandas/tests/indexes/numeric/test_numeric.py index 944e215ee17bd..7ce55db6c0bbc 100644 --- a/pandas/tests/indexes/numeric/test_numeric.py +++ b/pandas/tests/indexes/numeric/test_numeric.py @@ -318,7 +318,9 @@ def test_cant_or_shouldnt_cast(self, dtype): def test_view_index(self, simple_index): index = simple_index - index.view(Index) + msg = "Passing a type in .*Index.view is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + index.view(Index) def test_prevent_casting(self, simple_index): index = simple_index diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py index ffb2dac840198..06e19eeca6766 100644 --- a/pandas/tests/indexes/ranges/test_range.py +++ b/pandas/tests/indexes/ranges/test_range.py @@ -199,7 +199,9 @@ def test_view(self): i_view = i.view("i8") tm.assert_numpy_array_equal(i.values, i_view) - i_view = i.view(RangeIndex) + msg = "Passing a type in RangeIndex.view is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + i_view = i.view(RangeIndex) tm.assert_index_equal(i, i_view) def test_dtype(self, simple_index): @@ -382,7 +384,9 @@ def test_cant_or_shouldnt_cast(self, start, stop, step): def test_view_index(self, simple_index): index = simple_index - index.view(Index) + msg = "Passing a type in RangeIndex.view is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + index.view(Index) def test_prevent_casting(self, simple_index): index = simple_index diff --git a/pandas/tests/indexes/test_datetimelike.py b/pandas/tests/indexes/test_datetimelike.py index 27e01427006ec..21a686e8bc05b 100644 --- a/pandas/tests/indexes/test_datetimelike.py +++ b/pandas/tests/indexes/test_datetimelike.py @@ -89,7 +89,9 @@ def test_view(self, simple_index): result = type(simple_index)(idx) tm.assert_index_equal(result, idx) - idx_view = idx.view(type(simple_index)) + 
msg = "Passing a type in .*Index.view is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + idx_view = idx.view(type(simple_index)) result = type(simple_index)(idx) tm.assert_index_equal(result, idx_view) diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index 0fff6abcfc6a5..b467e93b75e96 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -960,7 +960,9 @@ def test_view(self, simple_index): idx_view = idx.view(dtype) tm.assert_index_equal(idx, index_cls(idx_view, name="Foo"), exact=True) - idx_view = idx.view(index_cls) + msg = "Passing a type in .*Index.view is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + idx_view = idx.view(index_cls) tm.assert_index_equal(idx, index_cls(idx_view, name="Foo"), exact=True) def test_format(self, simple_index): From 23c20deb7a6e7857b57abba72c15a21d8dbdfe7c Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sat, 9 Dec 2023 14:00:17 -0500 Subject: [PATCH 36/63] BUG: outer join on equal indexes not sorting (#56426) * outer join on equal indexes to sort by default * whatsnew * fix test * remove Index._join_precedence --- doc/source/whatsnew/v2.2.0.rst | 2 +- pandas/core/computation/align.py | 2 +- pandas/core/indexes/base.py | 31 +++++++++--------------- pandas/core/indexes/datetimelike.py | 2 -- pandas/core/reshape/merge.py | 5 +--- pandas/tests/indexes/multi/test_join.py | 13 ++++------ pandas/tests/indexes/test_base.py | 11 +++++---- pandas/tests/indexes/test_old_base.py | 6 ++++- pandas/tests/reshape/merge/test_merge.py | 20 +++++++-------- pandas/tests/series/test_arithmetic.py | 3 +++ 10 files changed, 43 insertions(+), 52 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 486440af46b79..49c28e3dc5d85 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -246,7 +246,7 @@ These are bug fixes that might have notable behavior changes. In previous versions of pandas, :func:`merge` and :meth:`DataFrame.join` did not always return a result that followed the documented sort behavior. pandas now -follows the documented sort behavior in merge and join operations (:issue:`54611`). +follows the documented sort behavior in merge and join operations (:issue:`54611`, :issue:`56426`). As documented, ``sort=True`` sorts the join keys lexicographically in the resulting :class:`DataFrame`. With ``sort=False``, the order of the join keys depends on the diff --git a/pandas/core/computation/align.py b/pandas/core/computation/align.py index 85d412d044ba8..cd852ba9249cf 100644 --- a/pandas/core/computation/align.py +++ b/pandas/core/computation/align.py @@ -110,7 +110,7 @@ def _align_core(terms): ax, itm = axis, items if not axes[ax].is_(itm): - axes[ax] = axes[ax].join(itm, how="outer") + axes[ax] = axes[ax].union(itm) for i, ndim in ndims.items(): for axis, items in zip(range(ndim), axes): diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index f9f42b9788a25..9d998b46dbeed 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -368,9 +368,6 @@ class Index(IndexOpsMixin, PandasObject): Index([1, 2, 3], dtype='uint8') """ - # To hand over control to subclasses - _join_precedence = 1 - # similar to __array_priority__, positions Index after Series and DataFrame # but before ExtensionArray. Should NOT be overridden by subclasses. 
__pandas_priority__ = 2000 @@ -4564,6 +4561,7 @@ def join( Index([1, 2, 3, 4, 5, 6], dtype='int64') """ other = ensure_index(other) + sort = sort or how == "outer" if isinstance(self, ABCDatetimeIndex) and isinstance(other, ABCDatetimeIndex): if (self.tz is None) ^ (other.tz is None): @@ -4614,15 +4612,6 @@ def join( rindexer = np.array([]) return join_index, None, rindexer - if self._join_precedence < other._join_precedence: - flip: dict[JoinHow, JoinHow] = {"right": "left", "left": "right"} - how = flip.get(how, how) - join_index, lidx, ridx = other.join( - self, how=how, level=level, return_indexers=True - ) - lidx, ridx = ridx, lidx - return join_index, lidx, ridx - if self.dtype != other.dtype: dtype = self._find_common_type_compat(other) this = self.astype(dtype, copy=False) @@ -4666,18 +4655,20 @@ def _join_via_get_indexer( # Note: at this point we have checked matching dtypes if how == "left": - join_index = self + join_index = self.sort_values() if sort else self elif how == "right": - join_index = other + join_index = other.sort_values() if sort else other elif how == "inner": join_index = self.intersection(other, sort=sort) elif how == "outer": - # TODO: sort=True here for backwards compat. It may - # be better to use the sort parameter passed into join - join_index = self.union(other) - - if sort and how in ["left", "right"]: - join_index = join_index.sort_values() + try: + join_index = self.union(other, sort=sort) + except TypeError: + join_index = self.union(other) + try: + join_index = _maybe_try_sort(join_index, sort) + except TypeError: + pass if join_index is self: lindexer = None diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 264ca8aa11495..2b03a64236128 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -442,8 +442,6 @@ class DatetimeTimedeltaMixin(DatetimeIndexOpsMixin, ABC): _is_monotonic_decreasing = Index.is_monotonic_decreasing _is_unique = Index.is_unique - _join_precedence = 10 - @property def unit(self) -> str: return self._data.unit diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 0756b25adedcd..f07c4fb8f7d5f 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -759,7 +759,7 @@ def __init__( self.on = com.maybe_make_list(on) self.suffixes = suffixes - self.sort = sort + self.sort = sort or how == "outer" self.left_index = left_index self.right_index = right_index @@ -1694,9 +1694,6 @@ def get_join_indexers( elif not sort and how in ["left", "outer"]: return _get_no_sort_one_missing_indexer(left_n, False) - if not sort and how == "outer": - sort = True - # get left & right join labels and num. 
of levels at each location mapped = ( _factorize_keys(left_keys[n], right_keys[n], sort=sort) diff --git a/pandas/tests/indexes/multi/test_join.py b/pandas/tests/indexes/multi/test_join.py index 700af142958b3..edd0feaaa1159 100644 --- a/pandas/tests/indexes/multi/test_join.py +++ b/pandas/tests/indexes/multi/test_join.py @@ -51,8 +51,11 @@ def test_join_level_corner_case(idx): def test_join_self(idx, join_type): - joined = idx.join(idx, how=join_type) - tm.assert_index_equal(joined, idx) + result = idx.join(idx, how=join_type) + expected = idx + if join_type == "outer": + expected = expected.sort_values() + tm.assert_index_equal(result, expected) def test_join_multi(): @@ -89,12 +92,6 @@ def test_join_multi(): tm.assert_numpy_array_equal(ridx, exp_ridx) -def test_join_self_unique(idx, join_type): - if idx.is_unique: - joined = idx.join(idx, how=join_type) - assert (idx == joined).all() - - def test_join_multi_wrong_order(): # GH 25760 # GH 28956 diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index bb8822f047330..2b0bb884f0706 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -987,8 +987,11 @@ def test_slice_keep_name(self): indirect=True, ) def test_join_self(self, index, join_type): - joined = index.join(index, how=join_type) - assert index is joined + result = index.join(index, how=join_type) + expected = index + if join_type == "outer": + expected = expected.sort_values() + tm.assert_index_equal(result, expected) @pytest.mark.parametrize("method", ["strip", "rstrip", "lstrip"]) def test_str_attribute(self, method): @@ -1072,10 +1075,8 @@ def test_outer_join_sort(self): with tm.assert_produces_warning(RuntimeWarning): result = left_index.join(right_index, how="outer") - # right_index in this case because DatetimeIndex has join precedence - # over int64 Index with tm.assert_produces_warning(RuntimeWarning): - expected = right_index.astype(object).union(left_index.astype(object)) + expected = left_index.astype(object).union(right_index.astype(object)) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index b467e93b75e96..fd0d984053bd6 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -30,6 +30,7 @@ period_range, ) import pandas._testing as tm +import pandas.core.algorithms as algos from pandas.core.arrays import BaseMaskedArray @@ -653,7 +654,10 @@ def test_join_self_unique(self, join_type, simple_index): idx = simple_index if idx.is_unique: joined = idx.join(idx, how=join_type) - assert (idx == joined).all() + expected = simple_index + if join_type == "outer": + expected = algos.safe_sort(expected) + tm.assert_index_equal(joined, expected) def test_map(self, simple_index): # callable diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 7538894bbf1c9..d7a343ae9f152 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1838,6 +1838,9 @@ def test_merge_empty(self, left_empty, how, exp): elif exp == "empty_cross": expected = DataFrame(columns=["A_x", "B", "A_y", "C"], dtype="int64") + if how == "outer": + expected = expected.sort_values("A", ignore_index=True) + tm.assert_frame_equal(result, expected) @@ -2913,16 +2916,13 @@ def test_merge_combinations( expected = expected["key"].repeat(repeats.values) expected = expected.to_frame() elif how == "outer": - if on_index and left_unique and 
left["key"].equals(right["key"]): - expected = DataFrame({"key": left["key"]}) - else: - left_counts = left["key"].value_counts() - right_counts = right["key"].value_counts() - expected_counts = left_counts.mul(right_counts, fill_value=1) - expected_counts = expected_counts.astype(np.intp) - expected = expected_counts.index.values.repeat(expected_counts.values) - expected = DataFrame({"key": expected}) - expected = expected.sort_values("key") + left_counts = left["key"].value_counts() + right_counts = right["key"].value_counts() + expected_counts = left_counts.mul(right_counts, fill_value=1) + expected_counts = expected_counts.astype(np.intp) + expected = expected_counts.index.values.repeat(expected_counts.values) + expected = DataFrame({"key": expected}) + expected = expected.sort_values("key") if on_index: expected = expected.set_index("key") diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index 75237be212030..b40e2e99dae2e 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -754,6 +754,9 @@ def test_series_add_tz_mismatch_converts_to_utc(self): uts2 = ser2.tz_convert("utc") expected = uts1 + uts2 + # sort since input indexes are not equal + expected = expected.sort_index() + assert result.index.tz is timezone.utc tm.assert_series_equal(result, expected) From 71a3e3c12abd934159f43fc4d925803e063c8c6d Mon Sep 17 00:00:00 2001 From: Rob <124158982+rob-sil@users.noreply.github.com> Date: Sat, 9 Dec 2023 13:07:44 -0600 Subject: [PATCH 37/63] BUG: `concat` should keep series names unless `ignore_index=True` (#56365) * Keep series names when not ignoring them * Split test into two shorter tests * whatsnew * tolist * Split test for concat on index --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/reshape/concat.py | 22 +++++++------- pandas/tests/reshape/concat/test_concat.py | 35 +++++++++++++++++++--- 3 files changed, 44 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 49c28e3dc5d85..ffefb9f41fb56 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -661,6 +661,7 @@ Groupby/resample/rolling Reshaping ^^^^^^^^^ - Bug in :func:`concat` ignoring ``sort`` parameter when passed :class:`DatetimeIndex` indexes (:issue:`54769`) +- Bug in :func:`concat` renaming :class:`Series` when ``ignore_index=False`` (:issue:`15047`) - Bug in :func:`merge_asof` raising ``TypeError`` when ``by`` dtype is not ``object``, ``int64``, or ``uint64`` (:issue:`22794`) - Bug in :func:`merge` returning columns in incorrect order when left and/or right is empty (:issue:`51929`) - Bug in :meth:`DataFrame.melt` where an exception was raised if ``var_name`` was not a string (:issue:`55948`) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 1bc548de91f01..d46348fff7a02 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -464,7 +464,7 @@ def __init__( # if we have mixed ndims, then convert to highest ndim # creating column numbers as needed if len(ndims) > 1: - objs, sample = self._sanitize_mixed_ndim(objs, sample, ignore_index, axis) + objs = self._sanitize_mixed_ndim(objs, sample, ignore_index, axis) self.objs = objs @@ -580,7 +580,7 @@ def _sanitize_mixed_ndim( sample: Series | DataFrame, ignore_index: bool, axis: AxisInt, - ) -> tuple[list[Series | DataFrame], Series | DataFrame]: + ) -> list[Series | DataFrame]: # if we have mixed ndims, then convert to highest ndim # creating 
column numbers as needed @@ -601,19 +601,21 @@ def _sanitize_mixed_ndim( else: name = getattr(obj, "name", None) if ignore_index or name is None: - name = current_column - current_column += 1 - - # doing a row-wise concatenation so need everything - # to line up - if self._is_frame and axis == 1: - name = 0 + if axis == 1: + # doing a row-wise concatenation so need everything + # to line up + name = 0 + else: + # doing a column-wise concatenation so need series + # to have unique names + name = current_column + current_column += 1 obj = sample._constructor({name: obj}, copy=False) new_objs.append(obj) - return new_objs, sample + return new_objs def get_result(self): cons: Callable[..., DataFrame | Series] diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index ea0d510d2b8f8..9e34d02091e69 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -267,11 +267,10 @@ def test_with_mixed_tuples(self, sort): # it works concat([df1, df2], sort=sort) - def test_concat_mixed_objs(self): - # concat mixed series/frames + def test_concat_mixed_objs_columns(self): + # Test column-wise concat for mixed series/frames (axis=1) # G2385 - # axis 1 index = date_range("01-Jan-2013", periods=10, freq="h") arr = np.arange(10, dtype="int64") s1 = Series(arr, index=index) @@ -324,13 +323,41 @@ def test_concat_mixed_objs(self): result = concat([s1, df, s2], axis=1, ignore_index=True) tm.assert_frame_equal(result, expected) - # axis 0 + def test_concat_mixed_objs_index(self): + # Test row-wise concat for mixed series/frames with a common name + # GH2385, GH15047 + + index = date_range("01-Jan-2013", periods=10, freq="h") + arr = np.arange(10, dtype="int64") + s1 = Series(arr, index=index) + s2 = Series(arr, index=index) + df = DataFrame(arr.reshape(-1, 1), index=index) + expected = DataFrame( np.tile(arr, 3).reshape(-1, 1), index=index.tolist() * 3, columns=[0] ) result = concat([s1, df, s2]) tm.assert_frame_equal(result, expected) + def test_concat_mixed_objs_index_names(self): + # Test row-wise concat for mixed series/frames with distinct names + # GH2385, GH15047 + + index = date_range("01-Jan-2013", periods=10, freq="h") + arr = np.arange(10, dtype="int64") + s1 = Series(arr, index=index, name="foo") + s2 = Series(arr, index=index, name="bar") + df = DataFrame(arr.reshape(-1, 1), index=index) + + expected = DataFrame( + np.kron(np.where(np.identity(3) == 1, 1, np.nan), arr).T, + index=index.tolist() * 3, + columns=["foo", 0, "bar"], + ) + result = concat([s1, df, s2]) + tm.assert_frame_equal(result, expected) + + # Rename all series to 0 when ignore_index=True expected = DataFrame(np.tile(arr, 3).reshape(-1, 1), columns=[0]) result = concat([s1, df, s2], ignore_index=True) tm.assert_frame_equal(result, expected) From ce4169ac51d20786864157912072b68ae331dc52 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 9 Dec 2023 20:16:53 +0100 Subject: [PATCH 38/63] Fix new string dtype tests for frame folder (#55409) * Start fixing string tests * BUG: interpolate raising wrong error for ea * Fix more tests * REGR: join segfaulting for arrow string with nulls * Fix more tests * Fix more tests * BUG: rank raising for arrow string dtypes * BUG: eq not implemented for categorical and arrow backed strings * More tests * BUG: ndim of string block incorrect with string inference * Fix test * Fix tests * Fix tests * Fix more indexing tests * BUG: Index.insert raising when 
inserting None into new string dtype * Fix tests * BUG: Inserting ndim=0 array does not infer string dtype * Fix tests * Fix tests * Fix more tests * Fix more tests * BUG: idxmax raising for arrow strings * Fix * Fix more tests * Fix more tests * Fix more tests * Fix remaining tests * Fix remaining tests * Change default * BUG: Groupby not keeping string dtype for empty objects * Start fixing gb tests * Fix tests * Merge main * Update config_init.py * Fixup * Update --- .../frame/constructors/test_from_dict.py | 5 + .../frame/constructors/test_from_records.py | 5 + pandas/tests/frame/indexing/test_getitem.py | 2 +- pandas/tests/frame/indexing/test_indexing.py | 35 ++++-- pandas/tests/frame/indexing/test_set_value.py | 8 +- pandas/tests/frame/indexing/test_setitem.py | 2 +- pandas/tests/frame/indexing/test_where.py | 6 +- pandas/tests/frame/methods/test_align.py | 4 +- pandas/tests/frame/methods/test_astype.py | 22 ++-- .../tests/frame/methods/test_combine_first.py | 6 +- .../frame/methods/test_convert_dtypes.py | 6 +- pandas/tests/frame/methods/test_cov_corr.py | 12 +- pandas/tests/frame/methods/test_drop.py | 2 +- .../frame/methods/test_drop_duplicates.py | 2 +- pandas/tests/frame/methods/test_dtypes.py | 7 +- pandas/tests/frame/methods/test_duplicated.py | 2 +- pandas/tests/frame/methods/test_equals.py | 4 +- pandas/tests/frame/methods/test_explode.py | 2 +- pandas/tests/frame/methods/test_fillna.py | 26 +++- .../frame/methods/test_get_numeric_data.py | 6 +- .../tests/frame/methods/test_interpolate.py | 13 +- .../methods/test_is_homogeneous_dtype.py | 3 +- pandas/tests/frame/methods/test_nlargest.py | 2 +- pandas/tests/frame/methods/test_rank.py | 14 ++- pandas/tests/frame/methods/test_reindex.py | 6 +- pandas/tests/frame/methods/test_replace.py | 114 +++++++++++++++--- .../tests/frame/methods/test_reset_index.py | 12 +- .../tests/frame/methods/test_select_dtypes.py | 12 +- pandas/tests/frame/methods/test_to_csv.py | 7 +- pandas/tests/frame/methods/test_update.py | 14 ++- pandas/tests/frame/test_api.py | 2 + pandas/tests/frame/test_arithmetic.py | 16 ++- pandas/tests/frame/test_block_internals.py | 4 +- pandas/tests/frame/test_constructors.py | 39 +++--- pandas/tests/frame/test_logical_ops.py | 12 +- pandas/tests/frame/test_query_eval.py | 19 ++- pandas/tests/frame/test_reductions.py | 64 +++++++--- pandas/tests/frame/test_repr.py | 3 + pandas/tests/frame/test_stack_unstack.py | 23 ++-- pandas/tests/frame/test_unary.py | 20 ++- pandas/tests/groupby/test_apply.py | 29 +++-- pandas/tests/groupby/test_categorical.py | 5 +- pandas/tests/groupby/test_groupby.py | 35 ++++-- pandas/tests/groupby/test_numeric_only.py | 13 +- pandas/tests/groupby/test_raises.py | 2 +- 45 files changed, 477 insertions(+), 170 deletions(-) diff --git a/pandas/tests/frame/constructors/test_from_dict.py b/pandas/tests/frame/constructors/test_from_dict.py index 845174bbf600e..60a8e688b3b8a 100644 --- a/pandas/tests/frame/constructors/test_from_dict.py +++ b/pandas/tests/frame/constructors/test_from_dict.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas import ( DataFrame, Index, @@ -42,6 +44,9 @@ def test_constructor_single_row(self): ) tm.assert_frame_equal(result, expected) + @pytest.mark.skipif( + using_pyarrow_string_dtype(), reason="columns inferring logic broken" + ) def test_constructor_list_of_series(self): data = [ OrderedDict([["a", 1.5], ["b", 3.0], ["c", 4.0]]), diff --git a/pandas/tests/frame/constructors/test_from_records.py 
b/pandas/tests/frame/constructors/test_from_records.py index edb21fb92f6a2..3622571f1365d 100644 --- a/pandas/tests/frame/constructors/test_from_records.py +++ b/pandas/tests/frame/constructors/test_from_records.py @@ -6,6 +6,8 @@ import pytest import pytz +from pandas._config import using_pyarrow_string_dtype + from pandas.compat import is_platform_little_endian from pandas import ( @@ -56,6 +58,9 @@ def test_from_records_with_datetimes(self): expected["EXPIRY"] = expected["EXPIRY"].astype("M8[s]") tm.assert_frame_equal(result, expected) + @pytest.mark.skipif( + using_pyarrow_string_dtype(), reason="dtype checking logic doesn't work" + ) def test_from_records_sequencelike(self): df = DataFrame( { diff --git a/pandas/tests/frame/indexing/test_getitem.py b/pandas/tests/frame/indexing/test_getitem.py index 8502f98df5962..a36b0c0e850b3 100644 --- a/pandas/tests/frame/indexing/test_getitem.py +++ b/pandas/tests/frame/indexing/test_getitem.py @@ -103,7 +103,7 @@ def test_getitem_list_duplicates(self): def test_getitem_dupe_cols(self): df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "a", "b"]) - msg = "\"None of [Index(['baf'], dtype='object')] are in the [columns]\"" + msg = "\"None of [Index(['baf'], dtype=" with pytest.raises(KeyError, match=re.escape(msg)): df[["baf"]] diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 40c6b8e180c5b..4be5be77b015c 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -288,7 +288,9 @@ def test_setattr_column(self): df.foobar = 5 assert (df.foobar == 5).all() - def test_setitem(self, float_frame, using_copy_on_write, warn_copy_on_write): + def test_setitem( + self, float_frame, using_copy_on_write, warn_copy_on_write, using_infer_string + ): # not sure what else to do here series = float_frame["A"][::2] float_frame["col5"] = series @@ -331,7 +333,10 @@ def test_setitem(self, float_frame, using_copy_on_write, warn_copy_on_write): with pytest.raises(SettingWithCopyError, match=msg): smaller["col10"] = ["1", "2"] - assert smaller["col10"].dtype == np.object_ + if using_infer_string: + assert smaller["col10"].dtype == "string" + else: + assert smaller["col10"].dtype == np.object_ assert (smaller["col10"] == ["1", "2"]).all() def test_setitem2(self): @@ -426,7 +431,7 @@ def test_setitem_cast(self, float_frame): float_frame["something"] = 2.5 assert float_frame["something"].dtype == np.float64 - def test_setitem_corner(self, float_frame): + def test_setitem_corner(self, float_frame, using_infer_string): # corner case df = DataFrame({"B": [1.0, 2.0, 3.0], "C": ["a", "b", "c"]}, index=np.arange(3)) del df["B"] @@ -463,10 +468,16 @@ def test_setitem_corner(self, float_frame): dm["foo"] = "bar" del dm["foo"] dm["foo"] = "bar" - assert dm["foo"].dtype == np.object_ + if using_infer_string: + assert dm["foo"].dtype == "string" + else: + assert dm["foo"].dtype == np.object_ dm["coercible"] = ["1", "2", "3"] - assert dm["coercible"].dtype == np.object_ + if using_infer_string: + assert dm["coercible"].dtype == "string" + else: + assert dm["coercible"].dtype == np.object_ def test_setitem_corner2(self): data = { @@ -483,7 +494,7 @@ def test_setitem_corner2(self): assert df.loc[1, "title"] == "foobar" assert df.loc[1, "cruft"] == 0 - def test_setitem_ambig(self): + def test_setitem_ambig(self, using_infer_string): # Difficulties with mixed-type data # Created as float type dm = DataFrame(index=range(3), columns=range(3)) @@ -499,18 +510,22 @@ def 
test_setitem_ambig(self): dm[2] = uncoercable_series assert len(dm.columns) == 3 - assert dm[2].dtype == np.object_ + if using_infer_string: + assert dm[2].dtype == "string" + else: + assert dm[2].dtype == np.object_ - def test_setitem_None(self, float_frame): + def test_setitem_None(self, float_frame, using_infer_string): # GH #766 float_frame[None] = float_frame["A"] + key = None if not using_infer_string else np.nan tm.assert_series_equal( float_frame.iloc[:, -1], float_frame["A"], check_names=False ) tm.assert_series_equal( - float_frame.loc[:, None], float_frame["A"], check_names=False + float_frame.loc[:, key], float_frame["A"], check_names=False ) - tm.assert_series_equal(float_frame[None], float_frame["A"], check_names=False) + tm.assert_series_equal(float_frame[key], float_frame["A"], check_names=False) def test_loc_setitem_boolean_mask_allfalse(self): # GH 9596 diff --git a/pandas/tests/frame/indexing/test_set_value.py b/pandas/tests/frame/indexing/test_set_value.py index 32312868adacb..1e3c793c8449f 100644 --- a/pandas/tests/frame/indexing/test_set_value.py +++ b/pandas/tests/frame/indexing/test_set_value.py @@ -16,7 +16,7 @@ def test_set_value(self, float_frame): float_frame._set_value(idx, col, 1) assert float_frame[col][idx] == 1 - def test_set_value_resize(self, float_frame): + def test_set_value_resize(self, float_frame, using_infer_string): res = float_frame._set_value("foobar", "B", 0) assert res is None assert float_frame.index[-1] == "foobar" @@ -27,8 +27,10 @@ def test_set_value_resize(self, float_frame): res = float_frame.copy() res._set_value("foobar", "baz", "sam") - assert res["baz"].dtype == np.object_ - + if using_infer_string: + assert res["baz"].dtype == "string" + else: + assert res["baz"].dtype == np.object_ res = float_frame.copy() with tm.assert_produces_warning( FutureWarning, match="Setting an item of incompatible dtype" diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index c0ba2f245efed..d0caaa3756170 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -1319,7 +1319,7 @@ def test_setitem_column_frame_as_category(self): df["col2"] = Series([1, 2, 3], dtype="category") expected_types = Series( - ["int64", "category", "category"], index=[0, "col1", "col2"] + ["int64", "category", "category"], index=[0, "col1", "col2"], dtype=object ) tm.assert_series_equal(df.dtypes, expected_types) diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index 103ec67951a01..3d36d0471f02f 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -1077,9 +1077,13 @@ def test_where_producing_ea_cond_for_np_dtype(): @pytest.mark.parametrize( "replacement", [0.001, True, "snake", None, datetime(2022, 5, 4)] ) -def test_where_int_overflow(replacement): +def test_where_int_overflow(replacement, using_infer_string, request): # GH 31687 df = DataFrame([[1.0, 2e25, "nine"], [np.nan, 0.1, None]]) + if using_infer_string and replacement not in (None, "snake"): + request.node.add_marker( + pytest.mark.xfail(reason="Can't set non-string into string column") + ) result = df.where(pd.notnull(df), replacement) expected = DataFrame([[1.0, 2e25, "nine"], [replacement, 0.1, replacement]]) diff --git a/pandas/tests/frame/methods/test_align.py b/pandas/tests/frame/methods/test_align.py index 312d6f6d37dde..5a9c47866dae8 100644 --- a/pandas/tests/frame/methods/test_align.py +++ 
b/pandas/tests/frame/methods/test_align.py @@ -107,7 +107,7 @@ def test_align_float(self, float_frame, using_copy_on_write): af, bf = float_frame.align( other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=None ) - tm.assert_index_equal(bf.index, Index([])) + tm.assert_index_equal(bf.index, Index([]).astype(bf.index.dtype)) msg = ( "The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align " @@ -117,7 +117,7 @@ def test_align_float(self, float_frame, using_copy_on_write): af, bf = float_frame.align( other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=0 ) - tm.assert_index_equal(bf.index, Index([])) + tm.assert_index_equal(bf.index, Index([]).astype(bf.index.dtype)) # Try to align DataFrame to Series along bad axis msg = "No axis named 2 for object type DataFrame" diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 2578dfb622fbf..5a1e3cd786f84 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -166,7 +166,8 @@ def test_astype_str(self): "c": [Timedelta(x)._repr_base() for x in c._values], "d": list(map(str, d._values)), "e": list(map(str, e._values)), - } + }, + dtype="object", ) tm.assert_frame_equal(result, expected) @@ -174,13 +175,13 @@ def test_astype_str(self): def test_astype_str_float(self): # see GH#11302 result = DataFrame([np.nan]).astype(str) - expected = DataFrame(["nan"]) + expected = DataFrame(["nan"], dtype="object") tm.assert_frame_equal(result, expected) result = DataFrame([1.12345678901234567890]).astype(str) val = "1.1234567890123457" - expected = DataFrame([val]) + expected = DataFrame([val], dtype="object") tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("dtype_class", [dict, Series]) @@ -199,7 +200,7 @@ def test_astype_dict_like(self, dtype_class): expected = DataFrame( { "a": a, - "b": Series(["0", "1", "2", "3", "4"]), + "b": Series(["0", "1", "2", "3", "4"], dtype="object"), "c": c, "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float32"), } @@ -282,7 +283,7 @@ def test_astype_duplicate_col_series_arg(self): result = df.astype(dtypes) expected = DataFrame( { - 0: vals[:, 0].astype(str), + 0: Series(vals[:, 0].astype(str), dtype=object), 1: vals[:, 1], 2: pd.array(vals[:, 2], dtype="Float64"), 3: vals[:, 3], @@ -620,6 +621,7 @@ def test_astype_arg_for_errors_dictlist(self): {"a": 2.2, "b": "15.3", "c": "another_test"}, ] ) + expected["c"] = expected["c"].astype("object") type_dict = {"a": "float64", "b": "float64", "c": "object"} result = df.astype(dtype=type_dict, errors="ignore") @@ -680,6 +682,7 @@ def test_astype_dt64tz_to_str(self, timezone_frame): ], ], columns=timezone_frame.columns, + dtype="object", ) tm.assert_frame_equal(result, expected) @@ -754,7 +757,9 @@ def test_astype_tz_object_conversion(self, tz): result = result.astype({"tz": "datetime64[ns, Europe/London]"}) tm.assert_frame_equal(result, expected) - def test_astype_dt64_to_string(self, frame_or_series, tz_naive_fixture): + def test_astype_dt64_to_string( + self, frame_or_series, tz_naive_fixture, using_infer_string + ): # GH#41409 tz = tz_naive_fixture @@ -772,7 +777,10 @@ def test_astype_dt64_to_string(self, frame_or_series, tz_naive_fixture): item = result.iloc[0] if frame_or_series is DataFrame: item = item.iloc[0] - assert item is pd.NA + if using_infer_string: + assert item is np.nan + else: + assert item is pd.NA # For non-NA values, we should match what we get for non-EA str alt = obj.astype(str) diff --git 
a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index 0335279b3a123..941e4c03464ea 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -30,7 +30,7 @@ def test_combine_first_mixed(self): combined = f.combine_first(g) tm.assert_frame_equal(combined, exp) - def test_combine_first(self, float_frame): + def test_combine_first(self, float_frame, using_infer_string): # disjoint head, tail = float_frame[:5], float_frame[5:] @@ -76,7 +76,9 @@ def test_combine_first(self, float_frame): tm.assert_series_equal(combined["A"].reindex(g.index), g["A"]) # corner cases - comb = float_frame.combine_first(DataFrame()) + warning = FutureWarning if using_infer_string else None + with tm.assert_produces_warning(warning, match="empty entries"): + comb = float_frame.combine_first(DataFrame()) tm.assert_frame_equal(comb, float_frame) comb = DataFrame().combine_first(float_frame) diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index 4c371afcc4e00..a181a271181ca 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -11,9 +11,13 @@ class TestConvertDtypes: @pytest.mark.parametrize( "convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")] ) - def test_convert_dtypes(self, convert_integer, expected, string_storage): + def test_convert_dtypes( + self, convert_integer, expected, string_storage, using_infer_string + ): # Specific types are tested in tests/series/test_dtypes.py # Just check that it works for DataFrame here + if using_infer_string: + string_storage = "pyarrow_numpy" df = pd.DataFrame( { "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")), diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 108816697ef3e..04a08c8b9bc52 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -326,7 +326,7 @@ def test_corrwith(self, datetime_frame, dtype): for row in index[:4]: tm.assert_almost_equal(correls[row], df1.loc[row].corr(df2.loc[row])) - def test_corrwith_with_objects(self): + def test_corrwith_with_objects(self, using_infer_string): df1 = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), columns=Index(list("ABCD"), dtype=object), @@ -338,8 +338,14 @@ def test_corrwith_with_objects(self): df1["obj"] = "foo" df2["obj"] = "bar" - with pytest.raises(TypeError, match="Could not convert"): - df1.corrwith(df2) + if using_infer_string: + import pyarrow as pa + + with pytest.raises(pa.lib.ArrowNotImplementedError, match="has no kernel"): + df1.corrwith(df2) + else: + with pytest.raises(TypeError, match="Could not convert"): + df1.corrwith(df2) result = df1.corrwith(df2, numeric_only=True) expected = df1.loc[:, cols].corrwith(df2.loc[:, cols]) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_drop.py b/pandas/tests/frame/methods/test_drop.py index f72c0594fa1f7..06cd51b43a0aa 100644 --- a/pandas/tests/frame/methods/test_drop.py +++ b/pandas/tests/frame/methods/test_drop.py @@ -510,7 +510,7 @@ def test_drop_with_duplicate_columns2(self): def test_drop_inplace_no_leftover_column_reference(self): # GH 13934 - df = DataFrame({"a": [1, 2, 3]}) + df = DataFrame({"a": [1, 2, 3]}, columns=Index(["a"], dtype="object")) a = df.a df.drop(["a"], axis=1, inplace=True) tm.assert_index_equal(df.columns, Index([], 
dtype="object")) diff --git a/pandas/tests/frame/methods/test_drop_duplicates.py b/pandas/tests/frame/methods/test_drop_duplicates.py index df12139258a6d..6bea97b2cf189 100644 --- a/pandas/tests/frame/methods/test_drop_duplicates.py +++ b/pandas/tests/frame/methods/test_drop_duplicates.py @@ -16,7 +16,7 @@ def test_drop_duplicates_with_misspelled_column_name(subset): # GH 19730 df = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]}) - msg = re.escape("Index(['a'], dtype='object')") + msg = re.escape("Index(['a'], dtype=") with pytest.raises(KeyError, match=msg): df.drop_duplicates(subset) diff --git a/pandas/tests/frame/methods/test_dtypes.py b/pandas/tests/frame/methods/test_dtypes.py index 4bdf16977dae6..ab632ac17318e 100644 --- a/pandas/tests/frame/methods/test_dtypes.py +++ b/pandas/tests/frame/methods/test_dtypes.py @@ -142,9 +142,12 @@ def test_dtypes_timedeltas(self): ) tm.assert_series_equal(result, expected) - def test_frame_apply_np_array_return_type(self): + def test_frame_apply_np_array_return_type(self, using_infer_string): # GH 35517 df = DataFrame([["foo"]]) result = df.apply(lambda col: np.array("bar")) - expected = Series(["bar"]) + if using_infer_string: + expected = Series([np.array(["bar"])]) + else: + expected = Series(["bar"]) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_duplicated.py b/pandas/tests/frame/methods/test_duplicated.py index 788aede805110..6052b61ea8db5 100644 --- a/pandas/tests/frame/methods/test_duplicated.py +++ b/pandas/tests/frame/methods/test_duplicated.py @@ -16,7 +16,7 @@ def test_duplicated_with_misspelled_column_name(subset): # GH 19730 df = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]}) - msg = re.escape("Index(['a'], dtype='object')") + msg = re.escape("Index(['a'], dtype=") with pytest.raises(KeyError, match=msg): df.duplicated(subset) diff --git a/pandas/tests/frame/methods/test_equals.py b/pandas/tests/frame/methods/test_equals.py index 6fcf670f96ef0..d0b9d96cafa0d 100644 --- a/pandas/tests/frame/methods/test_equals.py +++ b/pandas/tests/frame/methods/test_equals.py @@ -14,11 +14,11 @@ def test_dataframe_not_equal(self): df2 = DataFrame({"a": ["s", "d"], "b": [1, 2]}) assert df1.equals(df2) is False - def test_equals_different_blocks(self, using_array_manager): + def test_equals_different_blocks(self, using_array_manager, using_infer_string): # GH#9330 df0 = DataFrame({"A": ["x", "y"], "B": [1, 2], "C": ["w", "z"]}) df1 = df0.reset_index()[["A", "B", "C"]] - if not using_array_manager: + if not using_array_manager and not using_infer_string: # this assert verifies that the above operations have # induced a block rearrangement assert df0._mgr.blocks[0].dtype != df1._mgr.blocks[0].dtype diff --git a/pandas/tests/frame/methods/test_explode.py b/pandas/tests/frame/methods/test_explode.py index d1e4a603c5710..5cd54db62d783 100644 --- a/pandas/tests/frame/methods/test_explode.py +++ b/pandas/tests/frame/methods/test_explode.py @@ -203,7 +203,7 @@ def test_usecase(): ) def test_duplicate_index(input_dict, input_index, expected_dict, expected_index): # GH 28005 - df = pd.DataFrame(input_dict, index=input_index) + df = pd.DataFrame(input_dict, index=input_index, dtype=object) result = df.explode("col1") expected = pd.DataFrame(expected_dict, index=expected_index, dtype=object) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index 960f05a6457a4..1403a45a5cccd 100644 --- 
a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + import pandas.util._test_decorators as td from pandas import ( @@ -89,6 +91,7 @@ def test_fillna_datetime(self, datetime_frame): with pytest.raises(ValueError, match=msg): datetime_frame.fillna(5, method="ffill") + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 0 in string") def test_fillna_mixed_type(self, float_string_frame): mf = float_string_frame mf.loc[mf.index[5:20], "foo"] = np.nan @@ -122,19 +125,27 @@ def test_fillna_empty(self, using_copy_on_write): df.x.fillna(method=m, inplace=True) df.x.fillna(method=m) - def test_fillna_different_dtype(self): + def test_fillna_different_dtype(self, using_infer_string): # with different dtype (GH#3386) df = DataFrame( [["a", "a", np.nan, "a"], ["b", "b", np.nan, "b"], ["c", "c", np.nan, "c"]] ) - result = df.fillna({2: "foo"}) + if using_infer_string: + with tm.assert_produces_warning(FutureWarning, match="Downcasting"): + result = df.fillna({2: "foo"}) + else: + result = df.fillna({2: "foo"}) expected = DataFrame( [["a", "a", "foo", "a"], ["b", "b", "foo", "b"], ["c", "c", "foo", "c"]] ) tm.assert_frame_equal(result, expected) - return_value = df.fillna({2: "foo"}, inplace=True) + if using_infer_string: + with tm.assert_produces_warning(FutureWarning, match="Downcasting"): + return_value = df.fillna({2: "foo"}, inplace=True) + else: + return_value = df.fillna({2: "foo"}, inplace=True) tm.assert_frame_equal(df, expected) assert return_value is None @@ -358,7 +369,7 @@ def test_fillna_dictlike_value_duplicate_colnames(self, columns): expected["A"] = 0.0 tm.assert_frame_equal(result, expected) - def test_fillna_dtype_conversion(self): + def test_fillna_dtype_conversion(self, using_infer_string): # make sure that fillna on an empty frame works df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5]) result = df.dtypes @@ -373,7 +384,11 @@ def test_fillna_dtype_conversion(self): # empty block df = DataFrame(index=range(3), columns=["A", "B"], dtype="float64") - result = df.fillna("nan") + if using_infer_string: + with tm.assert_produces_warning(FutureWarning, match="Downcasting"): + result = df.fillna("nan") + else: + result = df.fillna("nan") expected = DataFrame("nan", index=range(3), columns=["A", "B"]) tm.assert_frame_equal(result, expected) @@ -649,6 +664,7 @@ def test_fillna_col_reordering(self): filled = df.fillna(method="ffill") assert df.columns.tolist() == filled.columns.tolist() + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 0 in string") def test_fill_corner(self, float_frame, float_string_frame): mf = float_string_frame mf.loc[mf.index[5:20], "foo"] = np.nan diff --git a/pandas/tests/frame/methods/test_get_numeric_data.py b/pandas/tests/frame/methods/test_get_numeric_data.py index ec1c768603a59..c5d32d56d03c1 100644 --- a/pandas/tests/frame/methods/test_get_numeric_data.py +++ b/pandas/tests/frame/methods/test_get_numeric_data.py @@ -15,12 +15,12 @@ class TestGetNumericData: def test_get_numeric_data_preserve_dtype(self): # get the numeric data - obj = DataFrame({"A": [1, "2", 3.0]}) + obj = DataFrame({"A": [1, "2", 3.0]}, columns=Index(["A"], dtype="object")) result = obj._get_numeric_data() expected = DataFrame(dtype=object, index=pd.RangeIndex(3), columns=[]) tm.assert_frame_equal(result, expected) - def test_get_numeric_data(self): + def test_get_numeric_data(self, using_infer_string): 
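        # The using_infer_string fixture toggles pandas' opt-in string
        # inference, under which the object column "c" below is stored as the
        # Arrow-backed "string" dtype instead of object. Roughly (a hedged
        # illustration, not part of the original test):
        #
        #     pd.set_option("future.infer_string", True)
        #     pd.Series(["a", "b"]).dtype  # string[pyarrow_numpy]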
datetime64name = np.dtype("M8[s]").name objectname = np.dtype(np.object_).name @@ -33,7 +33,7 @@ def test_get_numeric_data(self): [ np.dtype("float64"), np.dtype("int64"), - np.dtype(objectname), + np.dtype(objectname) if not using_infer_string else "string", np.dtype(datetime64name), ], index=["a", "b", "c", "f"], diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index 5f37ed6d9e18a..e0641fcb65bd3 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas.errors import ChainedAssignmentError import pandas.util._test_decorators as td @@ -67,6 +69,9 @@ def test_interpolate_inplace(self, frame_or_series, using_array_manager, request assert np.shares_memory(orig, obj.values) assert orig.squeeze()[1] == 1.5 + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="interpolate doesn't work for string" + ) def test_interp_basic(self, using_copy_on_write): df = DataFrame( { @@ -108,7 +113,10 @@ def test_interp_basic(self, using_copy_on_write): assert np.shares_memory(df["C"]._values, cvalues) assert np.shares_memory(df["D"]._values, dvalues) - def test_interp_basic_with_non_range_index(self): + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="interpolate doesn't work for string" + ) + def test_interp_basic_with_non_range_index(self, using_infer_string): df = DataFrame( { "A": [1, 2, np.nan, 4], @@ -119,7 +127,8 @@ def test_interp_basic_with_non_range_index(self): ) msg = "DataFrame.interpolate with object dtype" - with tm.assert_produces_warning(FutureWarning, match=msg): + warning = FutureWarning if not using_infer_string else None + with tm.assert_produces_warning(warning, match=msg): result = df.set_index("C").interpolate() expected = df.set_index("C") expected.loc[3, "A"] = 3 diff --git a/pandas/tests/frame/methods/test_is_homogeneous_dtype.py b/pandas/tests/frame/methods/test_is_homogeneous_dtype.py index a5f285d31301b..1fe28cb8eb856 100644 --- a/pandas/tests/frame/methods/test_is_homogeneous_dtype.py +++ b/pandas/tests/frame/methods/test_is_homogeneous_dtype.py @@ -25,7 +25,8 @@ { "A": np.array([1, 2], dtype=object), "B": np.array(["a", "b"], dtype=object), - } + }, + dtype="object", ), True, ), diff --git a/pandas/tests/frame/methods/test_nlargest.py b/pandas/tests/frame/methods/test_nlargest.py index 1196f8cd3886a..3ba893501914a 100644 --- a/pandas/tests/frame/methods/test_nlargest.py +++ b/pandas/tests/frame/methods/test_nlargest.py @@ -86,7 +86,7 @@ def test_nlargest_n(self, df_strings, nselect_method, n, order): df = df_strings if "b" in order: error_msg = ( - f"Column 'b' has dtype object, " + f"Column 'b' has dtype (object|string), " f"cannot use method '{nselect_method}' with this dtype" ) with pytest.raises(TypeError, match=error_msg): diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index b5b5e42691e59..8d7a0b373f5f8 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -13,6 +13,7 @@ from pandas import ( DataFrame, + Index, Series, ) import pandas._testing as tm @@ -469,21 +470,28 @@ def test_rank_inf_nans_na_option( ("top", False, [2.0, 3.0, 1.0, 4.0]), ], ) - def test_rank_object_first(self, frame_or_series, na_option, ascending, expected): + def test_rank_object_first( + self, frame_or_series, na_option, ascending, expected, using_infer_string 
+ ): obj = frame_or_series(["foo", "foo", None, "foo"]) result = obj.rank(method="first", na_option=na_option, ascending=ascending) expected = frame_or_series(expected) + if using_infer_string and isinstance(obj, Series): + expected = expected.astype("uint64") tm.assert_equal(result, expected) @pytest.mark.parametrize( "data,expected", [ - ({"a": [1, 2, "a"], "b": [4, 5, 6]}, DataFrame({"b": [1.0, 2.0, 3.0]})), + ( + {"a": [1, 2, "a"], "b": [4, 5, 6]}, + DataFrame({"b": [1.0, 2.0, 3.0]}, columns=Index(["b"], dtype=object)), + ), ({"a": [1, 2, "a"]}, DataFrame(index=range(3), columns=[])), ], ) def test_rank_mixed_axis_zero(self, data, expected): - df = DataFrame(data) + df = DataFrame(data, columns=Index(list(data.keys()), dtype=object)) with pytest.raises(TypeError, match="'<' not supported between instances of"): df.rank() result = df.rank(numeric_only=True) diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py index d0d971e29204a..d862e14ce86cb 100644 --- a/pandas/tests/frame/methods/test_reindex.py +++ b/pandas/tests/frame/methods/test_reindex.py @@ -120,7 +120,7 @@ def test_reindex_timestamp_with_fold(self, timezone, year, month, day, hour): exp = DataFrame({"index": ["1", "2"], "vals": [np.nan, np.nan]}).set_index( "index" ) - exp = exp.astype(object) + exp = exp.astype(df.vals.dtype) tm.assert_frame_equal( df, exp, @@ -840,8 +840,8 @@ def test_reindex_fill_value(self): # other dtypes df["foo"] = "foo" - result = df.reindex(range(15), fill_value=0) - expected = df.reindex(range(15)).fillna(0) + result = df.reindex(range(15), fill_value="0") + expected = df.reindex(range(15)).fillna("0") tm.assert_frame_equal(result, expected) def test_reindex_uint_dtypes_fill_value(self, any_unsigned_int_numpy_dtype): diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 53c45a5f4b5c6..8bfa98042eb07 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -28,6 +30,9 @@ def mix_abc() -> dict[str, list[float | str]]: class TestDataFrameReplace: + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_replace_inplace(self, datetime_frame, float_string_frame): datetime_frame.loc[datetime_frame.index[:5], "A"] = np.nan datetime_frame.loc[datetime_frame.index[-5:], "A"] = np.nan @@ -278,14 +283,25 @@ def test_regex_replace_dict_nested(self, mix_abc): tm.assert_frame_equal(res3, expec) tm.assert_frame_equal(res4, expec) - def test_regex_replace_dict_nested_non_first_character(self, any_string_dtype): + def test_regex_replace_dict_nested_non_first_character( + self, any_string_dtype, using_infer_string + ): # GH 25259 dtype = any_string_dtype df = DataFrame({"first": ["abc", "bca", "cab"]}, dtype=dtype) - expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype) - result = df.replace({"a": "."}, regex=True) + if using_infer_string and any_string_dtype == "object": + with tm.assert_produces_warning(FutureWarning, match="Downcasting"): + result = df.replace({"a": "."}, regex=True) + expected = DataFrame({"first": [".bc", "bc.", "c.b"]}) + + else: + result = df.replace({"a": "."}, regex=True) + expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail( + 
using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_regex_replace_dict_nested_gh4115(self): df = DataFrame({"Type": ["Q", "T", "Q", "Q", "T"], "tmp": 2}) expected = DataFrame({"Type": [0, 1, 0, 0, 1], "tmp": 2}) @@ -294,6 +310,9 @@ def test_regex_replace_dict_nested_gh4115(self): result = df.replace({"Type": {"Q": 0, "T": 1}}) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_regex_replace_list_to_scalar(self, mix_abc): df = DataFrame(mix_abc) expec = DataFrame( @@ -322,6 +341,9 @@ def test_regex_replace_list_to_scalar(self, mix_abc): tm.assert_frame_equal(res2, expec) tm.assert_frame_equal(res3, expec) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_regex_replace_str_to_numeric(self, mix_abc): # what happens when you try to replace a numeric value with a regex? df = DataFrame(mix_abc) @@ -337,6 +359,9 @@ def test_regex_replace_str_to_numeric(self, mix_abc): tm.assert_frame_equal(res2, expec) tm.assert_frame_equal(res3, expec) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_regex_replace_regex_list_to_numeric(self, mix_abc): df = DataFrame(mix_abc) res = df.replace([r"\s*\.\s*", "b"], 0, regex=True) @@ -415,12 +440,31 @@ def test_replace_regex_metachar(self, metachar): ], ) def test_regex_replace_string_types( - self, data, to_replace, expected, frame_or_series, any_string_dtype + self, + data, + to_replace, + expected, + frame_or_series, + any_string_dtype, + using_infer_string, + request, ): # GH-41333, GH-35977 dtype = any_string_dtype obj = frame_or_series(data, dtype=dtype) - result = obj.replace(to_replace, regex=True) + if using_infer_string and any_string_dtype == "object": + if len(to_replace) > 1 and isinstance(obj, DataFrame): + request.node.add_marker( + pytest.mark.xfail( + reason="object input array that gets downcasted raises on " + "second pass" + ) + ) + with tm.assert_produces_warning(FutureWarning, match="Downcasting"): + result = obj.replace(to_replace, regex=True) + dtype = "string[pyarrow_numpy]" + else: + result = obj.replace(to_replace, regex=True) expected = frame_or_series(expected, dtype=dtype) tm.assert_equal(result, expected) @@ -522,6 +566,9 @@ def test_replace_series_dict(self): result = df.replace(s, df.mean()) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_replace_convert(self): # gh 3907 df = DataFrame([["foo", "bar", "bah"], ["bar", "foo", "bah"]]) @@ -533,6 +580,9 @@ def test_replace_convert(self): res = rep.dtypes tm.assert_series_equal(expec, res) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_replace_mixed(self, float_string_frame): mf = float_string_frame mf.iloc[5:20, mf.columns.get_loc("foo")] = np.nan @@ -588,7 +638,7 @@ def test_replace_mixed_int_block_splitting(self): result = df.replace(0, 0.5) tm.assert_frame_equal(result, expected) - def test_replace_mixed2(self): + def test_replace_mixed2(self, using_infer_string): # to object block upcasting df = DataFrame( { @@ -607,11 +657,15 @@ def test_replace_mixed2(self): expected = DataFrame( { - "A": Series(["foo", "bar"], dtype="object"), + "A": Series(["foo", "bar"]), "B": Series([0, "foo"], dtype="object"), } ) - result = df.replace([1, 2], ["foo", "bar"]) + if using_infer_string: + with 
tm.assert_produces_warning(FutureWarning, match="Downcasting"): + result = df.replace([1, 2], ["foo", "bar"]) + else: + result = df.replace([1, 2], ["foo", "bar"]) tm.assert_frame_equal(result, expected) def test_replace_mixed3(self): @@ -892,6 +946,9 @@ def test_replace_input_formats_listlike(self): with pytest.raises(ValueError, match=msg): df.replace(to_rep, values[1:]) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_replace_input_formats_scalar(self): df = DataFrame( {"A": [np.nan, 0, np.inf], "B": [0, 2, 5], "C": ["", "asdf", "fd"]} @@ -920,6 +977,9 @@ def test_replace_limit(self): # TODO pass + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_replace_dict_no_regex(self): answer = Series( { @@ -943,6 +1003,9 @@ def test_replace_dict_no_regex(self): result = answer.replace(weights) tm.assert_series_equal(result, expected) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_replace_series_no_regex(self): answer = Series( { @@ -1049,7 +1112,10 @@ def test_nested_dict_overlapping_keys_replace_str(self): expected = df.replace({"a": dict(zip(astr, bstr))}) tm.assert_frame_equal(result, expected) - def test_replace_swapping_bug(self): + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) + def test_replace_swapping_bug(self, using_infer_string): df = DataFrame({"a": [True, False, True]}) res = df.replace({"a": {True: "Y", False: "N"}}) expect = DataFrame({"a": ["Y", "N", "Y"]}) @@ -1060,6 +1126,9 @@ def test_replace_swapping_bug(self): expect = DataFrame({"a": ["Y", "N", "Y"]}) tm.assert_frame_equal(res, expect) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_replace_period(self): d = { "fname": { @@ -1096,6 +1165,9 @@ def test_replace_period(self): result = df.replace(d) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_replace_datetime(self): d = { "fname": { @@ -1321,6 +1393,9 @@ def test_replace_commutative(self, df, to_replace, exp): result = df.replace(to_replace) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) @pytest.mark.parametrize( "replacer", [ @@ -1491,10 +1566,12 @@ def test_replace_with_compiled_regex(self): expected = DataFrame(["z", "b", "c"]) tm.assert_frame_equal(result, expected) - def test_replace_intervals(self): + def test_replace_intervals(self, using_infer_string): # https://github.com/pandas-dev/pandas/issues/35931 df = DataFrame({"a": [pd.Interval(0, 1), pd.Interval(0, 1)]}) - result = df.replace({"a": {pd.Interval(0, 1): "x"}}) + warning = FutureWarning if using_infer_string else None + with tm.assert_produces_warning(warning, match="Downcasting"): + result = df.replace({"a": {pd.Interval(0, 1): "x"}}) expected = DataFrame({"a": ["x", "x"]}) tm.assert_frame_equal(result, expected) @@ -1595,6 +1672,9 @@ def test_regex_replace_scalar( expected.loc[expected["a"] == ".", "a"] = expected_replace_val tm.assert_frame_equal(result, expected) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) @pytest.mark.parametrize("regex", [False, True]) def test_replace_regex_dtype_frame(self, regex): # GH-48644 @@ -1632,9 +1712,15 @@ def test_replace_categorical_no_replacement(self): result = 
df.replace(to_replace=[".", "def"], value=["_", None]) tm.assert_frame_equal(result, expected) - def test_replace_object_splitting(self): + def test_replace_object_splitting(self, using_infer_string): # GH#53977 df = DataFrame({"a": ["a"], "b": "b"}) - assert len(df._mgr.blocks) == 1 + if using_infer_string: + assert len(df._mgr.blocks) == 2 + else: + assert len(df._mgr.blocks) == 1 df.replace(to_replace=r"^\s*$", value="", inplace=True, regex=True) - assert len(df._mgr.blocks) == 1 + if using_infer_string: + assert len(df._mgr.blocks) == 2 + else: + assert len(df._mgr.blocks) == 1 diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index 20f0dcc816408..fbf36dbc4fb02 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -654,10 +654,14 @@ def test_rest_index_multiindex_categorical_with_missing_values(self, codes): ), ], ) -def test_reset_index_dtypes_on_empty_frame_with_multiindex(array, dtype): +def test_reset_index_dtypes_on_empty_frame_with_multiindex( + array, dtype, using_infer_string +): # GH 19602 - Preserve dtype on empty DataFrame with MultiIndex idx = MultiIndex.from_product([[0, 1], [0.5, 1.0], array]) result = DataFrame(index=idx)[:0].reset_index().dtypes + if using_infer_string and dtype == object: + dtype = "string" expected = Series({"level_0": np.int64, "level_1": np.float64, "level_2": dtype}) tm.assert_series_equal(result, expected) @@ -676,7 +680,9 @@ def test_reset_index_empty_frame_with_datetime64_multiindex(): tm.assert_frame_equal(result, expected) -def test_reset_index_empty_frame_with_datetime64_multiindex_from_groupby(): +def test_reset_index_empty_frame_with_datetime64_multiindex_from_groupby( + using_infer_string, +): # https://github.com/pandas-dev/pandas/issues/35657 dti = pd.DatetimeIndex(["2020-01-01"], dtype="M8[ns]") df = DataFrame({"c1": [10.0], "c2": ["a"], "c3": dti}) @@ -687,6 +693,8 @@ def test_reset_index_empty_frame_with_datetime64_multiindex_from_groupby(): ) expected["c3"] = expected["c3"].astype("datetime64[ns]") expected["c1"] = expected["c1"].astype("float64") + if using_infer_string: + expected["c2"] = expected["c2"].astype("string[pyarrow_numpy]") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py index e2759c5d5b7b7..47c479faed1ef 100644 --- a/pandas/tests/frame/methods/test_select_dtypes.py +++ b/pandas/tests/frame/methods/test_select_dtypes.py @@ -282,7 +282,7 @@ def test_select_dtypes_duplicate_columns(self): result = df.select_dtypes(include=[np.number], exclude=["floating"]) tm.assert_frame_equal(result, expected) - def test_select_dtypes_not_an_attr_but_still_valid_dtype(self): + def test_select_dtypes_not_an_attr_but_still_valid_dtype(self, using_infer_string): df = DataFrame( { "a": list("abc"), @@ -296,11 +296,17 @@ def test_select_dtypes_not_an_attr_but_still_valid_dtype(self): df["g"] = df.f.diff() assert not hasattr(np, "u8") r = df.select_dtypes(include=["i8", "O"], exclude=["timedelta"]) - e = df[["a", "b"]] + if using_infer_string: + e = df[["b"]] + else: + e = df[["a", "b"]] tm.assert_frame_equal(r, e) r = df.select_dtypes(include=["i8", "O", "timedelta64[ns]"]) - e = df[["a", "b", "g"]] + if using_infer_string: + e = df[["b", "g"]] + else: + e = df[["a", "b", "g"]] tm.assert_frame_equal(r, e) def test_select_dtypes_empty(self): diff --git a/pandas/tests/frame/methods/test_to_csv.py 
b/pandas/tests/frame/methods/test_to_csv.py index 97fbe597d1dab..250567eafc670 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -682,7 +682,7 @@ def _make_frame(names=None): tm.assert_index_equal(recons.columns, exp.columns) assert len(recons) == 0 - def test_to_csv_interval_index(self): + def test_to_csv_interval_index(self, using_infer_string): # GH 28210 df = DataFrame({"A": list("abc"), "B": range(3)}, index=pd.interval_range(0, 3)) @@ -692,7 +692,10 @@ def test_to_csv_interval_index(self): # can't roundtrip intervalindex via read_csv so check string repr (GH 23595) expected = df.copy() - expected.index = expected.index.astype(str) + if using_infer_string: + expected.index = expected.index.astype("string[pyarrow_numpy]") + else: + expected.index = expected.index.astype(str) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_update.py b/pandas/tests/frame/methods/test_update.py index c79a37b5b30f0..7c7a0d23ff75f 100644 --- a/pandas/tests/frame/methods/test_update.py +++ b/pandas/tests/frame/methods/test_update.py @@ -166,11 +166,19 @@ def test_update_with_different_dtype(self, using_copy_on_write): with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): df["c"].update(Series(["foo"], index=[0])) - expected = DataFrame({"a": [1, 3], "b": [np.nan, 2], "c": ["foo", np.nan]}) + expected = DataFrame( + { + "a": [1, 3], + "b": [np.nan, 2], + "c": Series(["foo", np.nan], dtype="object"), + } + ) tm.assert_frame_equal(df, expected) @td.skip_array_manager_invalid_test - def test_update_modify_view(self, using_copy_on_write, warn_copy_on_write): + def test_update_modify_view( + self, using_copy_on_write, warn_copy_on_write, using_infer_string + ): # GH#47188 df = DataFrame({"A": ["1", np.nan], "B": ["100", np.nan]}) df2 = DataFrame({"A": ["a", "x"], "B": ["100", "200"]}) @@ -181,7 +189,7 @@ def test_update_modify_view(self, using_copy_on_write, warn_copy_on_write): df2.update(df) expected = DataFrame({"A": ["1", "x"], "B": ["100", "200"]}) tm.assert_frame_equal(df2, expected) - if using_copy_on_write: + if using_copy_on_write or using_infer_string: tm.assert_frame_equal(result_view, df2_orig) else: tm.assert_frame_equal(result_view, expected) diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 2b392ddcfb44d..c7b444045a0f2 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -5,6 +5,7 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype from pandas._config.config import option_context import pandas as pd @@ -112,6 +113,7 @@ def test_not_hashable(self): with pytest.raises(TypeError, match=msg): hash(empty_frame) + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="surrogates not allowed") def test_column_name_contains_unicode_surrogate(self): # GH 25509 colname = "\ud83d" diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index a4825c80ee815..ec3222efab5a8 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -11,6 +11,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + import pandas.util._test_decorators as td import pandas as pd @@ -251,6 +253,9 @@ def test_timestamp_compare(self, left, right): with pytest.raises(TypeError, match=msg): right_f(pd.Timestamp("nat"), df) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't compare string and int" 
+ ) def test_mixed_comparison(self): # GH#13128, GH#22163 != datetime64 vs non-dt64 should be False, # not raise TypeError @@ -432,8 +437,8 @@ def test_bool_flex_frame_complex_dtype(self): def test_bool_flex_frame_object_dtype(self): # corner, dtype=object - df1 = DataFrame({"col": ["foo", np.nan, "bar"]}) - df2 = DataFrame({"col": ["foo", datetime.now(), "bar"]}) + df1 = DataFrame({"col": ["foo", np.nan, "bar"]}, dtype=object) + df2 = DataFrame({"col": ["foo", datetime.now(), "bar"]}, dtype=object) result = df1.ne(df2) exp = DataFrame({"col": [False, True, False]}) tm.assert_frame_equal(result, exp) @@ -1976,7 +1981,12 @@ def test_dataframe_blockwise_slicelike(): "df, col_dtype", [ (DataFrame([[1.0, 2.0], [4.0, 5.0]], columns=list("ab")), "float64"), - (DataFrame([[1.0, "b"], [4.0, "b"]], columns=list("ab")), "object"), + ( + DataFrame([[1.0, "b"], [4.0, "b"]], columns=list("ab")).astype( + {"b": object} + ), + "object", + ), ], ) def test_dataframe_operation_with_non_numeric_types(df, col_dtype): diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index b132f136e9741..712494ef15f97 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -183,7 +183,7 @@ def test_constructor_with_convert(self): ) tm.assert_series_equal(result, expected) - def test_construction_with_mixed(self, float_string_frame): + def test_construction_with_mixed(self, float_string_frame, using_infer_string): # test construction edge cases with mixed types # f7u12, this does not work without extensive workaround @@ -206,7 +206,7 @@ def test_construction_with_mixed(self, float_string_frame): expected = Series( [np.dtype("float64")] * 4 + [ - np.dtype("object"), + np.dtype("object") if not using_infer_string else "string", np.dtype("datetime64[us]"), np.dtype("timedelta64[us]"), ], diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index c6fe3a154905c..e1abd0344e356 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -21,6 +21,8 @@ import pytest import pytz +from pandas._config import using_pyarrow_string_dtype + from pandas._libs import lib from pandas.errors import IntCastingNaNError import pandas.util._test_decorators as td @@ -79,7 +81,7 @@ def test_constructor_from_ndarray_with_str_dtype(self): # with an array of strings each of which is e.g. 
"[0 1 2]" arr = np.arange(12).reshape(4, 3) df = DataFrame(arr, dtype=str) - expected = DataFrame(arr.astype(str)) + expected = DataFrame(arr.astype(str), dtype=object) tm.assert_frame_equal(df, expected) def test_constructor_from_2d_datetimearray(self, using_array_manager): @@ -261,8 +263,9 @@ def test_emptylike_constructor(self, emptylike, expected_index, expected_columns result = DataFrame(emptylike) tm.assert_frame_equal(result, expected) - def test_constructor_mixed(self, float_string_frame): - assert float_string_frame["foo"].dtype == np.object_ + def test_constructor_mixed(self, float_string_frame, using_infer_string): + dtype = "string" if using_infer_string else np.object_ + assert float_string_frame["foo"].dtype == dtype def test_constructor_cast_failure(self): # as of 2.0, we raise if we can't respect "dtype", previously we @@ -323,6 +326,7 @@ def test_constructor_dtype_nocast_view_2d_array( assert df2._mgr.arrays[0].flags.c_contiguous @td.skip_array_manager_invalid_test + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="conversion copies") def test_1d_object_array_does_not_copy(self): # https://github.com/pandas-dev/pandas/issues/39272 arr = np.array(["a", "b"], dtype="object") @@ -330,6 +334,7 @@ def test_1d_object_array_does_not_copy(self): assert np.shares_memory(df.values, arr) @td.skip_array_manager_invalid_test + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="conversion copies") def test_2d_object_array_does_not_copy(self): # https://github.com/pandas-dev/pandas/issues/39272 arr = np.array([["a", "b"], ["c", "d"]], dtype="object") @@ -773,7 +778,7 @@ def test_constructor_dict_block(self): ) tm.assert_numpy_array_equal(df.values, expected) - def test_constructor_dict_cast(self): + def test_constructor_dict_cast(self, using_infer_string): # cast float tests test_data = {"A": {"1": 1, "2": 2}, "B": {"1": "1", "2": "2", "3": "3"}} frame = DataFrame(test_data, dtype=float) @@ -783,7 +788,7 @@ def test_constructor_dict_cast(self): frame = DataFrame(test_data) assert len(frame) == 3 - assert frame["B"].dtype == np.object_ + assert frame["B"].dtype == np.object_ if not using_infer_string else "string" assert frame["A"].dtype == np.float64 def test_constructor_dict_cast2(self): @@ -1195,7 +1200,7 @@ def test_constructor_dtype_nullable_extension_arrays( df = DataFrame({"a": data}, dtype=input_dtype) assert df["a"].dtype == expected_dtype() - def test_constructor_scalar_inference(self): + def test_constructor_scalar_inference(self, using_infer_string): data = {"int": 1, "bool": True, "float": 3.0, "complex": 4j, "object": "foo"} df = DataFrame(data, index=np.arange(10)) @@ -1203,7 +1208,7 @@ def test_constructor_scalar_inference(self): assert df["bool"].dtype == np.bool_ assert df["float"].dtype == np.float64 assert df["complex"].dtype == np.complex128 - assert df["object"].dtype == np.object_ + assert df["object"].dtype == np.object_ if not using_infer_string else "string" def test_constructor_arrays_and_scalars(self): df = DataFrame({"a": np.random.default_rng(2).standard_normal(10), "b": True}) @@ -1282,11 +1287,11 @@ def empty_gen(): df = DataFrame(empty_gen(), columns=["A", "B"]) tm.assert_frame_equal(df, expected) - def test_constructor_list_of_lists(self): + def test_constructor_list_of_lists(self, using_infer_string): # GH #484 df = DataFrame(data=[[1, "a"], [2, "b"]], columns=["num", "str"]) assert is_integer_dtype(df["num"]) - assert df["str"].dtype == np.object_ + assert df["str"].dtype == np.object_ if not using_infer_string else "string" # GH 4851 # 
list of 0-dim ndarrays @@ -1835,7 +1840,7 @@ def test_constructor_single_value(self): with pytest.raises(TypeError, match=msg): DataFrame("a", [1, 2], ["a", "c"], float) - def test_constructor_with_datetimes(self): + def test_constructor_with_datetimes(self, using_infer_string): intname = np.dtype(int).name floatname = np.dtype(np.float64).name objectname = np.dtype(np.object_).name @@ -1854,7 +1859,7 @@ def test_constructor_with_datetimes(self): result = df.dtypes expected = Series( [np.dtype("int64")] - + [np.dtype(objectname)] * 2 + + [np.dtype(objectname) if not using_infer_string else "string"] * 2 + [np.dtype("M8[s]"), np.dtype("M8[us]")], index=list("ABCDE"), ) @@ -1876,7 +1881,7 @@ def test_constructor_with_datetimes(self): expected = Series( [np.dtype("float64")] + [np.dtype("int64")] - + [np.dtype("object")] + + [np.dtype("object") if not using_infer_string else "string"] + [np.dtype("float64")] + [np.dtype(intname)], index=["a", "b", "c", floatname, intname], @@ -1898,7 +1903,7 @@ def test_constructor_with_datetimes(self): expected = Series( [np.dtype("float64")] + [np.dtype("int64")] - + [np.dtype("object")] + + [np.dtype("object") if not using_infer_string else "string"] + [np.dtype("float64")] + [np.dtype(intname)], index=["a", "b", "c", floatname, intname], @@ -1935,13 +1940,13 @@ def test_constructor_with_datetimes3(self): df = DataFrame({"End Date": dt}, index=[0]) assert df.iat[0, 0] == dt tm.assert_series_equal( - df.dtypes, Series({"End Date": "datetime64[us, US/Eastern]"}) + df.dtypes, Series({"End Date": "datetime64[us, US/Eastern]"}, dtype=object) ) df = DataFrame([{"End Date": dt}]) assert df.iat[0, 0] == dt tm.assert_series_equal( - df.dtypes, Series({"End Date": "datetime64[ns, US/Eastern]"}) + df.dtypes, Series({"End Date": "datetime64[ns, US/Eastern]"}, dtype=object) ) def test_constructor_with_datetimes4(self): @@ -2066,7 +2071,7 @@ def test_constructor_timedelta_non_ns(self, order, unit): # dtype=exp_dtype. 
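# ----------------------------------------------------------------------
# Illustrative aside, not part of the upstream diff: the recurring
# pattern ``np.dtype("object") if not using_infer_string else "string"``
# in these constructor tests encodes that, once ``future.infer_string``
# is enabled, columns of Python strings are inferred as the
# pyarrow-backed string dtype rather than object. A minimal sketch,
# assuming a pandas build with pyarrow installed:

import pandas as pd

with pd.option_context("future.infer_string", True):
    sketch = pd.DataFrame({"num": [1, 2], "txt": ["a", "b"]})
    assert sketch["txt"].dtype == "string"  # pyarrow-backed string dtype
    assert sketch["num"].dtype == "int64"   # numeric inference unchanged
# ----------------------------------------------------------------------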
tm.assert_frame_equal(df, expected) - def test_constructor_for_list_with_dtypes(self): + def test_constructor_for_list_with_dtypes(self, using_infer_string): # test list of lists/ndarrays df = DataFrame([np.arange(5) for x in range(5)]) result = df.dtypes @@ -2117,7 +2122,7 @@ def test_constructor_for_list_with_dtypes(self): [ np.dtype("int64"), np.dtype("float64"), - np.dtype("object"), + np.dtype("object") if not using_infer_string else "string", np.dtype("datetime64[ns]"), np.dtype("float64"), ], diff --git a/pandas/tests/frame/test_logical_ops.py b/pandas/tests/frame/test_logical_ops.py index a15d7d7f93f01..16ca3a202f1e0 100644 --- a/pandas/tests/frame/test_logical_ops.py +++ b/pandas/tests/frame/test_logical_ops.py @@ -96,7 +96,7 @@ def test_logical_ops_int_frame(self): res_ser = df1a_int["A"] | df1a_bool["A"] tm.assert_series_equal(res_ser, df1a_bool["A"]) - def test_logical_ops_invalid(self): + def test_logical_ops_invalid(self, using_infer_string): # GH#5808 df1 = DataFrame(1.0, index=[1], columns=["A"]) @@ -108,8 +108,14 @@ def test_logical_ops_invalid(self): df1 = DataFrame("foo", index=[1], columns=["A"]) df2 = DataFrame(True, index=[1], columns=["A"]) msg = re.escape("unsupported operand type(s) for |: 'str' and 'bool'") - with pytest.raises(TypeError, match=msg): - df1 | df2 + if using_infer_string: + import pyarrow as pa + + with pytest.raises(pa.lib.ArrowNotImplementedError, match="|has no kernel"): + df1 | df2 + else: + with pytest.raises(TypeError, match=msg): + df1 | df2 def test_logical_operators(self): def _check_bin_op(op): diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 6353546648156..a498296e09c52 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -1035,7 +1035,7 @@ def test_query_with_string_columns(self, parser, engine): with pytest.raises(NotImplementedError, match=msg): df.query("a in b and c < d", parser=parser, engine=engine) - def test_object_array_eq_ne(self, parser, engine): + def test_object_array_eq_ne(self, parser, engine, using_infer_string): df = DataFrame( { "a": list("aaaabbbbcccc"), @@ -1044,11 +1044,14 @@ def test_object_array_eq_ne(self, parser, engine): "d": np.random.default_rng(2).integers(9, size=12), } ) - res = df.query("a == b", parser=parser, engine=engine) + warning = RuntimeWarning if using_infer_string and engine == "numexpr" else None + with tm.assert_produces_warning(warning): + res = df.query("a == b", parser=parser, engine=engine) exp = df[df.a == df.b] tm.assert_frame_equal(res, exp) - res = df.query("a != b", parser=parser, engine=engine) + with tm.assert_produces_warning(warning): + res = df.query("a != b", parser=parser, engine=engine) exp = df[df.a != df.b] tm.assert_frame_equal(res, exp) @@ -1087,12 +1090,16 @@ def test_query_with_nested_special_character(self, parser, engine): [">=", operator.ge], ], ) - def test_query_lex_compare_strings(self, parser, engine, op, func): + def test_query_lex_compare_strings( + self, parser, engine, op, func, using_infer_string + ): a = Series(np.random.default_rng(2).choice(list("abcde"), 20)) b = Series(np.arange(a.size)) df = DataFrame({"X": a, "Y": b}) - res = df.query(f'X {op} "d"', engine=engine, parser=parser) + warning = RuntimeWarning if using_infer_string and engine == "numexpr" else None + with tm.assert_produces_warning(warning): + res = df.query(f'X {op} "d"', engine=engine, parser=parser) expected = df[func(df.X, "d")] tm.assert_frame_equal(res, expected) @@ -1166,7 +1173,7 @@ def 
test_bool_arith_expr(self, frame, parser, engine): @pytest.mark.parametrize("op", ["+", "-", "*", "/"]) def test_invalid_type_for_operator_raises(self, parser, engine, op): df = DataFrame({"a": [1, 2], "b": ["c", "d"]}) - msg = r"unsupported operand type\(s\) for .+: '.+' and '.+'" + msg = r"unsupported operand type\(s\) for .+: '.+' and '.+'|Cannot" with pytest.raises(TypeError, match=msg): df.eval(f"a {op} b", engine=engine, parser=parser) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 3b1a751a738f9..66145c32c18d7 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas.compat import ( IS64, is_platform_windows, @@ -243,11 +245,17 @@ class TestDataFrameAnalytics: pytest.param("kurt", marks=td.skip_if_no("scipy")), ], ) - def test_stat_op_api_float_string_frame(self, float_string_frame, axis, opname): - if (opname in ("sum", "min", "max") and axis == 0) or opname in ( - "count", - "nunique", - ): + def test_stat_op_api_float_string_frame( + self, float_string_frame, axis, opname, using_infer_string + ): + if ( + (opname in ("sum", "min", "max") and axis == 0) + or opname + in ( + "count", + "nunique", + ) + ) and not (using_infer_string and opname == "sum"): getattr(float_string_frame, opname)(axis=axis) else: if opname in ["var", "std", "sem", "skew", "kurt"]: @@ -273,7 +281,11 @@ def test_stat_op_api_float_string_frame(self, float_string_frame, axis, opname): elif opname in ["min", "max"]: msg = "'[><]=' not supported between instances of 'float' and 'str'" elif opname == "median": - msg = re.compile(r"Cannot convert \[.*\] to numeric", flags=re.S) + msg = re.compile( + r"Cannot convert \[.*\] to numeric|does not support", flags=re.S + ) + if not isinstance(msg, re.Pattern): + msg = msg + "|does not support" with pytest.raises(TypeError, match=msg): getattr(float_string_frame, opname)(axis=axis) if opname != "nunique": @@ -434,6 +446,7 @@ def test_mixed_ops(self, op): "Could not convert", "could not convert", "can't multiply sequence by non-int", + "does not support", ] ) with pytest.raises(TypeError, match=msg): @@ -445,11 +458,15 @@ def test_mixed_ops(self, op): "Could not convert", "could not convert", "can't multiply sequence by non-int", + "does not support", ] ) with pytest.raises(TypeError, match=msg): getattr(df, op)() + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="sum doesn't work for arrow strings" + ) def test_reduce_mixed_frame(self): # GH 6806 df = DataFrame( @@ -516,7 +533,9 @@ def test_mean_mixed_string_decimal(self): df = DataFrame(d) - with pytest.raises(TypeError, match="unsupported operand type"): + with pytest.raises( + TypeError, match="unsupported operand type|does not support" + ): df.mean() result = df[["A", "C"]].mean() expected = Series([2.7, 681.6], index=["A", "C"], dtype=object) @@ -652,7 +671,7 @@ def test_mode_dropna(self, dropna, expected): "A": [12, 12, 19, 11], "B": [10, 10, np.nan, 3], "C": [1, np.nan, np.nan, np.nan], - "D": [np.nan, np.nan, "a", np.nan], + "D": Series([np.nan, np.nan, "a", np.nan], dtype=object), "E": Categorical([np.nan, np.nan, "a", np.nan]), "F": DatetimeIndex(["NaT", "2000-01-02", "NaT", "NaT"], dtype="M8[ns]"), "G": to_timedelta(["1 days", "nan", "nan", "nan"]), @@ -672,14 +691,15 @@ def test_mode_dropna(self, dropna, expected): expected = DataFrame(expected) tm.assert_frame_equal(result, expected) - def 
test_mode_sortwarning(self): + def test_mode_sortwarning(self, using_infer_string): # Check for the warning that is raised when the mode # results cannot be sorted df = DataFrame({"A": [np.nan, np.nan, "a", "a"]}) expected = DataFrame({"A": ["a", np.nan]}) - with tm.assert_produces_warning(UserWarning): + warning = None if using_infer_string else UserWarning + with tm.assert_produces_warning(warning): result = df.mode(dropna=False) result = result.sort_values(by="A").reset_index(drop=True) @@ -969,7 +989,8 @@ def test_sum_mixed_datetime(self): def test_mean_corner(self, float_frame, float_string_frame): # unit test when have object data - with pytest.raises(TypeError, match="Could not convert"): + msg = "Could not convert|does not support" + with pytest.raises(TypeError, match=msg): float_string_frame.mean(axis=0) # xs sum mixed type, just want to know it works... @@ -1341,7 +1362,9 @@ def test_any_all_extra(self): @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("bool_agg_func", ["any", "all"]) @pytest.mark.parametrize("skipna", [True, False]) - def test_any_all_object_dtype(self, axis, bool_agg_func, skipna): + def test_any_all_object_dtype( + self, axis, bool_agg_func, skipna, using_infer_string + ): # GH#35450 df = DataFrame( data=[ @@ -1351,8 +1374,13 @@ def test_any_all_object_dtype(self, axis, bool_agg_func, skipna): [np.nan, np.nan, "5", np.nan], ] ) + if using_infer_string: + # na in object is True while in string pyarrow numpy it's false + val = not axis == 0 and not skipna and bool_agg_func == "all" + else: + val = True result = getattr(df, bool_agg_func)(axis=axis, skipna=skipna) - expected = Series([True, True, True, True]) + expected = Series([True, True, val, True]) tm.assert_series_equal(result, expected) # GH#50947 deprecates this but it is not emitting a warning in some builds. 
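# ----------------------------------------------------------------------
# Illustrative aside, not part of the upstream diff: the flipped
# expectation in ``test_any_all_object_dtype`` above comes down to how a
# missing value behaves in a boolean context. An object column holds a
# plain ``np.nan``, which is truthy; the pyarrow-backed string dtype
# holds a real null, which does not count as True. A minimal sketch:

import numpy as np
import pandas as pd

assert bool(np.nan)  # NaN is a truthy Python object
try:
    bool(pd.NA)  # a real missing value has no defined truth value
except TypeError as err:
    print(err)  # "boolean value of NA is ambiguous"
# ----------------------------------------------------------------------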
@@ -1378,7 +1406,8 @@ def test_any_datetime(self): def test_any_all_bool_only(self): # GH 25101 df = DataFrame( - {"col1": [1, 2, 3], "col2": [4, 5, 6], "col3": [None, None, None]} + {"col1": [1, 2, 3], "col2": [4, 5, 6], "col3": [None, None, None]}, + columns=Index(["col1", "col2", "col3"], dtype=object), ) result = df.all(bool_only=True) @@ -1931,6 +1960,9 @@ def test_sum_timedelta64_skipna_false(using_array_manager, request): tm.assert_series_equal(result, expected) +@pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="sum doesn't work with arrow strings" +) def test_mixed_frame_with_integer_sum(): # https://github.com/pandas-dev/pandas/issues/34520 df = DataFrame([["a", 1]], columns=list("ab")) @@ -1951,7 +1983,7 @@ def test_minmax_extensionarray(method, numeric_only): expected = Series( [getattr(int64_info, method)], dtype="Int64", - index=Index(["Int64"], dtype="object"), + index=Index(["Int64"]), ) tm.assert_series_equal(result, expected) @@ -1969,7 +2001,7 @@ def test_prod_sum_min_count_mixed_object(): df = DataFrame([1, "a", True]) result = df.prod(axis=0, min_count=1, numeric_only=False) - expected = Series(["a"]) + expected = Series(["a"], dtype=object) tm.assert_series_equal(result, expected) msg = re.escape("unsupported operand type(s) for +: 'int' and 'str'") diff --git a/pandas/tests/frame/test_repr.py b/pandas/tests/frame/test_repr.py index 6184e791cab5d..776007fb9691d 100644 --- a/pandas/tests/frame/test_repr.py +++ b/pandas/tests/frame/test_repr.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas import ( NA, Categorical, @@ -174,6 +176,7 @@ def test_repr_mixed_big(self): repr(biggie) + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="/r in") def test_repr(self): # columns but no index no_index = DataFrame(columns=[0, 1, 3]) diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 2e7e8eba270c0..554a9d4ce2d5d 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -619,7 +619,7 @@ def test_unstack_to_series(self, float_frame): data = data.unstack() tm.assert_frame_equal(old_data, data) - def test_unstack_dtypes(self): + def test_unstack_dtypes(self, using_infer_string): # GH 2929 rows = [[1, 1, 3, 4], [1, 2, 3, 4], [2, 1, 3, 4], [2, 2, 3, 4]] @@ -655,8 +655,9 @@ def test_unstack_dtypes(self): df2["D"] = "foo" df3 = df2.unstack("B") result = df3.dtypes + dtype = "string" if using_infer_string else np.dtype("object") expected = Series( - [np.dtype("float64")] * 2 + [np.dtype("object")] * 2, + [np.dtype("float64")] * 2 + [dtype] * 2, index=MultiIndex.from_arrays( [["C", "C", "D", "D"], [1, 2, 1, 2]], names=(None, "B") ), @@ -1359,14 +1360,16 @@ def test_unstack_fill_frame_object(): # By default missing values will be NaN result = data.unstack() expected = DataFrame( - {"a": ["a", np.nan, "a"], "b": ["b", "c", np.nan]}, index=list("xyz") + {"a": ["a", np.nan, "a"], "b": ["b", "c", np.nan]}, + index=list("xyz"), + dtype=object, ) tm.assert_frame_equal(result, expected) # Fill with any value replaces missing values as expected result = data.unstack(fill_value="d") expected = DataFrame( - {"a": ["a", "d", "a"], "b": ["b", "c", "d"]}, index=list("xyz") + {"a": ["a", "d", "a"], "b": ["b", "c", "d"]}, index=list("xyz"), dtype=object ) tm.assert_frame_equal(result, expected) @@ -2083,7 +2086,7 @@ def test_stack_multiple_bug(self, future_stack): multi = df.set_index(["DATE", "ID"]) multi.columns.name = 
"Params" unst = multi.unstack("ID") - msg = re.escape("agg function failed [how->mean,dtype->object]") + msg = re.escape("agg function failed [how->mean,dtype->") with pytest.raises(TypeError, match=msg): unst.resample("W-THU").mean() down = unst.resample("W-THU").mean(numeric_only=True) @@ -2298,7 +2301,7 @@ def test_stack_unstack_unordered_multiindex(self, future_stack): tm.assert_frame_equal(result, expected) def test_unstack_preserve_types( - self, multiindex_year_month_day_dataframe_random_data + self, multiindex_year_month_day_dataframe_random_data, using_infer_string ): # GH#403 ymd = multiindex_year_month_day_dataframe_random_data @@ -2307,7 +2310,11 @@ def test_unstack_preserve_types( unstacked = ymd.unstack("month") assert unstacked["A", 1].dtype == np.float64 - assert unstacked["E", 1].dtype == np.object_ + assert ( + unstacked["E", 1].dtype == np.object_ + if not using_infer_string + else "string" + ) assert unstacked["F", 1].dtype == np.float64 def test_unstack_group_index_overflow(self, future_stack): @@ -2367,7 +2374,7 @@ def test_unstack_with_missing_int_cast_to_float(self, using_array_manager): expected = DataFrame( [[10.0, 10.0, 1.0, 1.0], [np.nan, 10.0, 0.0, 1.0]], - index=Index(["A", "B"], dtype="object", name="a"), + index=Index(["A", "B"], name="a"), columns=MultiIndex.from_tuples( [("v", "ca"), ("v", "cb"), ("is_", "ca"), ("is_", "cb")], names=[None, "b"], diff --git a/pandas/tests/frame/test_unary.py b/pandas/tests/frame/test_unary.py index 5e29d3c868983..850c92013694f 100644 --- a/pandas/tests/frame/test_unary.py +++ b/pandas/tests/frame/test_unary.py @@ -48,15 +48,25 @@ def test_neg_object(self, df, expected): pd.DataFrame({"a": pd.to_datetime(["2017-01-22", "1970-01-01"])}), ], ) - def test_neg_raises(self, df): + def test_neg_raises(self, df, using_infer_string): msg = ( "bad operand type for unary -: 'str'|" r"bad operand type for unary -: 'DatetimeArray'" ) - with pytest.raises(TypeError, match=msg): - (-df) - with pytest.raises(TypeError, match=msg): - (-df["a"]) + if using_infer_string and df.dtypes.iloc[0] == "string": + import pyarrow as pa + + msg = "has no kernel" + with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): + (-df) + with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): + (-df["a"]) + + else: + with pytest.raises(TypeError, match=msg): + (-df) + with pytest.raises(TypeError, match=msg): + (-df["a"]) def test_invert(self, float_frame): df = float_frame diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 60b386adb664a..34b6e7c4cde5f 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -37,7 +37,7 @@ def store(group): tm.assert_frame_equal(groups[0], expected_value) -def test_apply_index_date(): +def test_apply_index_date(using_infer_string): # GH 5788 ts = [ "2011-05-16 00:00", @@ -77,7 +77,7 @@ def test_apply_index_date(): tm.assert_frame_equal(result, expected) -def test_apply_index_date_object(): +def test_apply_index_date_object(using_infer_string): # GH 5789 # don't auto coerce dates ts = [ @@ -109,8 +109,9 @@ def test_apply_index_date_object(): 1.40750, 1.40649, ] + dtype = "string[pyarrow_numpy]" if using_infer_string else object exp_idx = Index( - ["2011-05-16", "2011-05-17", "2011-05-18"], dtype=object, name="date" + ["2011-05-16", "2011-05-17", "2011-05-18"], dtype=dtype, name="date" ) expected = Series(["00:00", "02:00", "02:00"], index=exp_idx) msg = "DataFrameGroupBy.apply operated on the grouping columns" @@ -121,14 +122,15 @@ def 
test_apply_index_date_object(): tm.assert_series_equal(result, expected) -def test_apply_trivial(): +def test_apply_trivial(using_infer_string): # GH 20066 # trivial apply: ignore input and return a constant dataframe. df = DataFrame( {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=["key", "data"], ) - expected = pd.concat([df.iloc[1:], df.iloc[1:]], axis=1, keys=["float64", "object"]) + dtype = "string" if using_infer_string else "object" + expected = pd.concat([df.iloc[1:], df.iloc[1:]], axis=1, keys=["float64", dtype]) msg = "DataFrame.groupby with axis=1 is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): @@ -138,13 +140,14 @@ def test_apply_trivial(): tm.assert_frame_equal(result, expected) -def test_apply_trivial_fail(): +def test_apply_trivial_fail(using_infer_string): # GH 20066 df = DataFrame( {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=["key", "data"], ) - expected = pd.concat([df, df], axis=1, keys=["float64", "object"]) + dtype = "string" if using_infer_string else "object" + expected = pd.concat([df, df], axis=1, keys=["float64", dtype]) msg = "DataFrame.groupby with axis=1 is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): gb = df.groupby([str(x) for x in df.dtypes], axis=1, group_keys=True) @@ -941,7 +944,7 @@ def test_func_returns_object(): "group_column_dtlike", [datetime.today(), datetime.today().date(), datetime.today().time()], ) -def test_apply_datetime_issue(group_column_dtlike): +def test_apply_datetime_issue(group_column_dtlike, using_infer_string): # GH-28247 # groupby-apply throws an error if one of the columns in the DataFrame # is a datetime object and the column labels are different from @@ -952,9 +955,8 @@ def test_apply_datetime_issue(group_column_dtlike): with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby("a").apply(lambda x: Series(["spam"], index=[42])) - expected = DataFrame( - ["spam"], Index(["foo"], dtype="object", name="a"), columns=[42] - ) + dtype = "string" if using_infer_string else "object" + expected = DataFrame(["spam"], Index(["foo"], dtype=dtype, name="a"), columns=[42]) tm.assert_frame_equal(result, expected) @@ -1021,7 +1023,7 @@ def test_apply_multi_level_name(category): assert df.index.names == ["A", "B"] -def test_groupby_apply_datetime_result_dtypes(): +def test_groupby_apply_datetime_result_dtypes(using_infer_string): # GH 14849 data = DataFrame.from_records( [ @@ -1035,8 +1037,9 @@ def test_groupby_apply_datetime_result_dtypes(): msg = "DataFrameGroupBy.apply operated on the grouping columns" with tm.assert_produces_warning(FutureWarning, match=msg): result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes + dtype = "string" if using_infer_string else object expected = Series( - [np.dtype("datetime64[ns]"), object, object, np.int64, object], + [np.dtype("datetime64[ns]"), dtype, dtype, np.int64, dtype], index=["observation", "color", "mood", "intensity", "score"], ) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index d4ccbe4c1c376..7a91601bf688f 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -82,7 +82,7 @@ def get_stats(group): assert result.index.names[0] == "C" -def test_basic(): # TODO: split this test +def test_basic(using_infer_string): # TODO: split this test cats = Categorical( ["a", "a", "a", "b", "b", "b", "c", "c", "c"], 
categories=["a", "b", "c", "d"], @@ -129,7 +129,8 @@ def f(x): result = g.apply(f) expected = x.iloc[[0, 1]].copy() expected.index = Index([1, 2], name="person_id") - expected["person_name"] = expected["person_name"].astype("object") + dtype = "string[pyarrow_numpy]" if using_infer_string else object + expected["person_name"] = expected["person_name"].astype(dtype) tm.assert_frame_equal(result, expected) # GH 9921 diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 62347ec1d3d6a..802cae9ff65f0 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -11,6 +11,8 @@ ) import pandas.util._test_decorators as td +from pandas.core.dtypes.common import is_string_dtype + import pandas as pd from pandas import ( Categorical, @@ -687,7 +689,7 @@ def test_frame_multi_key_function_list_partial_failure(): grouped = data.groupby(["A", "B"]) funcs = ["mean", "std"] - msg = re.escape("agg function failed [how->mean,dtype->object]") + msg = re.escape("agg function failed [how->mean,dtype->") with pytest.raises(TypeError, match=msg): grouped.agg(funcs) @@ -980,7 +982,7 @@ def test_groupby_multi_corner(df): def test_raises_on_nuisance(df): grouped = df.groupby("A") - msg = re.escape("agg function failed [how->mean,dtype->object]") + msg = re.escape("agg function failed [how->mean,dtype->") with pytest.raises(TypeError, match=msg): grouped.agg("mean") with pytest.raises(TypeError, match=msg): @@ -1036,7 +1038,7 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only): msg = "could not convert string to float: 'one'" else: klass = TypeError - msg = re.escape(f"agg function failed [how->{agg_function},dtype->object]") + msg = re.escape(f"agg function failed [how->{agg_function},dtype->") with pytest.raises(klass, match=msg): getattr(grouped, agg_function)(numeric_only=numeric_only) else: @@ -1061,7 +1063,7 @@ def test_raise_on_nuisance_python_single(df): def test_raise_on_nuisance_python_multiple(three_group): grouped = three_group.groupby(["A", "B"]) - msg = re.escape("agg function failed [how->mean,dtype->object]") + msg = re.escape("agg function failed [how->mean,dtype->") with pytest.raises(TypeError, match=msg): grouped.agg("mean") with pytest.raises(TypeError, match=msg): @@ -1104,7 +1106,7 @@ def test_wrap_aggregated_output_multindex(multiindex_dataframe_random_data): df["baz", "two"] = "peekaboo" keys = [np.array([0, 0, 1]), np.array([0, 0, 1])] - msg = re.escape("agg function failed [how->mean,dtype->object]") + msg = re.escape("agg function failed [how->mean,dtype->") with pytest.raises(TypeError, match=msg): df.groupby(keys).agg("mean") agged = df.drop(columns=("baz", "two")).groupby(keys).agg("mean") @@ -1193,7 +1195,7 @@ def test_groupby_complex(): tm.assert_series_equal(result, expected) -def test_groupby_complex_numbers(): +def test_groupby_complex_numbers(using_infer_string): # GH 17927 df = DataFrame( [ @@ -1202,10 +1204,11 @@ def test_groupby_complex_numbers(): {"a": 4, "b": 1}, ] ) + dtype = "string[pyarrow_numpy]" if using_infer_string else object expected = DataFrame( np.array([1, 1, 1], dtype=np.int64), index=Index([(1 + 1j), (1 + 2j), (1 + 0j)], name="b"), - columns=Index(["a"], dtype="object"), + columns=Index(["a"], dtype=dtype), ) result = df.groupby("b", sort=False).count() tm.assert_frame_equal(result, expected) @@ -1720,14 +1723,18 @@ def g(group): @pytest.mark.parametrize("grouper", ["A", ["A", "B"]]) -def test_set_group_name(df, grouper): +def test_set_group_name(df, grouper, 
using_infer_string): def f(group): assert group.name is not None return group def freduce(group): assert group.name is not None - return group.sum() + if using_infer_string and grouper == "A" and is_string_dtype(group.dtype): + with pytest.raises(TypeError, match="does not support"): + group.sum() + else: + return group.sum() def freducex(x): return freduce(x) @@ -2024,7 +2031,9 @@ def test_pivot_table_values_key_error(): @pytest.mark.parametrize( "op", ["idxmax", "idxmin", "min", "max", "sum", "prod", "skew"] ) -def test_empty_groupby(columns, keys, values, method, op, using_array_manager, dropna): +def test_empty_groupby( + columns, keys, values, method, op, using_array_manager, dropna, using_infer_string +): # GH8093 & GH26411 override_dtype = None @@ -2065,7 +2074,11 @@ def get_categorical_invalid_expected(): # Categorical is special without 'observed=True' idx = Index(lev, name=keys[0]) - expected = DataFrame([], columns=[], index=idx) + if using_infer_string: + columns = Index([], dtype="string[pyarrow_numpy]") + else: + columns = [] + expected = DataFrame([], columns=columns, index=idx) return expected is_per = isinstance(df.dtypes.iloc[0], pd.PeriodDtype) diff --git a/pandas/tests/groupby/test_numeric_only.py b/pandas/tests/groupby/test_numeric_only.py index 0141adf44c86b..ff4685b1e412d 100644 --- a/pandas/tests/groupby/test_numeric_only.py +++ b/pandas/tests/groupby/test_numeric_only.py @@ -180,6 +180,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): [ "category type does not support sum operations", re.escape(f"agg function failed [how->{method},dtype->object]"), + re.escape(f"agg function failed [how->{method},dtype->string]"), ] ) with pytest.raises(exception, match=msg): @@ -196,6 +197,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): "function is not implemented for this dtype", f"Cannot perform {method} with non-ordered Categorical", re.escape(f"agg function failed [how->{method},dtype->object]"), + re.escape(f"agg function failed [how->{method},dtype->string]"), ] ) with pytest.raises(exception, match=msg): @@ -206,7 +208,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): @pytest.mark.parametrize("numeric_only", [True, False, None]) -def test_axis1_numeric_only(request, groupby_func, numeric_only): +def test_axis1_numeric_only(request, groupby_func, numeric_only, using_infer_string): if groupby_func in ("idxmax", "idxmin"): pytest.skip("idxmax and idx_min tested in test_idxmin_idxmax_axis1") if groupby_func in ("corrwith", "skew"): @@ -268,8 +270,15 @@ def test_axis1_numeric_only(request, groupby_func, numeric_only): "can't multiply sequence by non-int of type 'float'", # cumsum, diff, pct_change "unsupported operand type", + "has no kernel", ) - with pytest.raises(TypeError, match=f"({'|'.join(msgs)})"): + if using_infer_string: + import pyarrow as pa + + errs = (TypeError, pa.lib.ArrowNotImplementedError) + else: + errs = TypeError + with pytest.raises(errs, match=f"({'|'.join(msgs)})"): with tm.assert_produces_warning(FutureWarning, match=warn_msg): method(*args, **kwargs) else: diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 2800f08b5fd90..0b451ce73db89 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -189,7 +189,7 @@ def test_groupby_raises_string( "sum": (None, ""), "var": ( TypeError, - re.escape("agg function failed [how->var,dtype->object]"), + re.escape("agg function failed 
[how->var,dtype->"), ), }[groupby_func] From fb05cc7781637906728e95be9b418e774b0e67b2 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 9 Dec 2023 20:18:22 +0100 Subject: [PATCH 39/63] BUG: read_csv not respecting object dtype when option is set (#56047) * BUG: read_csv not respecting object dtype when option is set * Update readers.py * Cover str too * Adjust * Fixup * Fixup * Update readers.py --- doc/source/whatsnew/v2.1.4.rst | 1 + pandas/io/parsers/arrow_parser_wrapper.py | 14 +----- pandas/io/parsers/readers.py | 44 ++++++++++++++++++- .../io/parser/dtypes/test_dtypes_basic.py | 35 +++++++++++++++ 4 files changed, 80 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst index 9cc79b7090499..57b83a294963b 100644 --- a/doc/source/whatsnew/v2.1.4.rst +++ b/doc/source/whatsnew/v2.1.4.rst @@ -24,6 +24,7 @@ Bug fixes - Bug in :class:`Series` when trying to cast date-like string inputs to :class:`ArrowDtype` of ``pyarrow.timestamp`` (:issue:`56266`) - Bug in :class:`Timestamp` construction with ``ts_input="now"`` or ``ts_input="today"`` giving a different unit from :meth:`Timestamp.now` or :meth:`Timestamp.today` (:issue:`55879`) - Bug in :meth:`Index.__getitem__` returning wrong result for Arrow dtypes and negative stepsize (:issue:`55832`) +- Fixed bug in :func:`read_csv` not respecting object dtype when ``infer_string`` option is set (:issue:`56047`) - Fixed bug in :func:`to_numeric` converting to extension dtype for ``string[pyarrow_numpy]`` dtype (:issue:`56179`) - Fixed bug in :meth:`.DataFrameGroupBy.min` and :meth:`.DataFrameGroupBy.max` not preserving extension dtype for empty object (:issue:`55619`) - Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 1c79392d54771..66a7ccacf675b 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -296,18 +296,8 @@ def read(self) -> DataFrame: dtype_mapping[pa.null()] = pd.Int64Dtype() frame = table.to_pandas(types_mapper=dtype_mapping.get) elif using_pyarrow_string_dtype(): - - def types_mapper(dtype): - dtype_dict = self.kwds["dtype"] - if dtype_dict is not None and dtype_dict.get(dtype, None) is not None: - return dtype_dict.get(dtype) - return arrow_string_types_mapper()(dtype) - - frame = table.to_pandas(types_mapper=types_mapper) + frame = table.to_pandas(types_mapper=arrow_string_types_mapper()) else: - if isinstance(self.kwds.get("dtype"), dict): - frame = table.to_pandas(types_mapper=self.kwds["dtype"].get) - else: - frame = table.to_pandas() + frame = table.to_pandas() return self._finalize_pandas_output(frame) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 66990de6d3b89..2f9243c895ae8 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -5,7 +5,10 @@ """ from __future__ import annotations -from collections import abc +from collections import ( + abc, + defaultdict, +) import csv import sys from textwrap import fill @@ -23,6 +26,8 @@ import numpy as np +from pandas._config import using_copy_on_write + from pandas._libs import lib from pandas._libs.parsers import STR_NA_VALUES from pandas.errors import ( @@ -38,8 +43,10 @@ is_float, is_integer, is_list_like, + pandas_dtype, ) +from pandas import Series from 
pandas.core.frame import DataFrame from pandas.core.indexes.api import RangeIndex from pandas.core.shared_docs import _shared_docs @@ -1846,7 +1853,40 @@ def read(self, nrows: int | None = None) -> DataFrame: else: new_rows = len(index) - df = DataFrame(col_dict, columns=columns, index=index) + if hasattr(self, "orig_options"): + dtype_arg = self.orig_options.get("dtype", None) + else: + dtype_arg = None + + if isinstance(dtype_arg, dict): + dtype = defaultdict(lambda: None) # type: ignore[var-annotated] + dtype.update(dtype_arg) + elif dtype_arg is not None and pandas_dtype(dtype_arg) in ( + np.str_, + np.object_, + ): + dtype = defaultdict(lambda: dtype_arg) + else: + dtype = None + + if dtype is not None: + new_col_dict = {} + for k, v in col_dict.items(): + d = ( + dtype[k] + if pandas_dtype(dtype[k]) in (np.str_, np.object_) + else None + ) + new_col_dict[k] = Series(v, index=index, dtype=d, copy=False) + else: + new_col_dict = col_dict + + df = DataFrame( + new_col_dict, + columns=columns, + index=index, + copy=not using_copy_on_write(), + ) self._currow += new_rows return df diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 0deafda750904..ce02e752fb90b 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -574,6 +574,41 @@ def test_string_inference(all_parsers): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("dtype", ["O", object, "object", np.object_, str, np.str_]) +def test_string_inference_object_dtype(all_parsers, dtype): + # GH#56047 + pytest.importorskip("pyarrow") + + data = """a,b +x,a +y,a +z,a""" + parser = all_parsers + with pd.option_context("future.infer_string", True): + result = parser.read_csv(StringIO(data), dtype=dtype) + + expected = DataFrame( + { + "a": pd.Series(["x", "y", "z"], dtype=object), + "b": pd.Series(["a", "a", "a"], dtype=object), + }, + columns=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"), + ) + tm.assert_frame_equal(result, expected) + + with pd.option_context("future.infer_string", True): + result = parser.read_csv(StringIO(data), dtype={"a": dtype}) + + expected = DataFrame( + { + "a": pd.Series(["x", "y", "z"], dtype=object), + "b": pd.Series(["a", "a", "a"], dtype="string[pyarrow_numpy]"), + }, + columns=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"), + ) + tm.assert_frame_equal(result, expected) + + def test_accurate_parsing_of_large_integers(all_parsers): # GH#52505 data = """SYMBOL,MOMENT,ID,ID_DEAL From 91ddc8b66a0eef9312014387ce0108f1be29a3a9 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 9 Dec 2023 20:20:50 +0100 Subject: [PATCH 40/63] Adjust Index specific tests for string option (#56074) * BUG: setitem casting object Index to arrow strings * Fix * Start fixing index tests * BUG: Index.isin raising for arrow strings and null set * Fix more tests * TST: Fix shares_memory for arrow string dtype * TST: Fix shares_memory for arrow string dtype * TST: Fix shares_memory for arrow string dtype * Fix more tests * BUG: Index.getitem returning wrong result with negative step for arrow * Update * Update * Fix * Update array.py * Fix * Move * Move * Fix * Add gh ref * Update v2.1.4.rst * Finish * Update * Update test_base.py * Update test_old_base.py * Update conftest.py * Update conftest.py * Update test_old_base.py * Update * Update test_setops.py * Fix pre-commit --- pandas/conftest.py | 6 +- 
.../tests/indexes/base_class/test_formats.py | 3 + .../tests/indexes/base_class/test_reshape.py | 10 ++-- .../tests/indexes/base_class/test_setops.py | 2 +- .../tests/indexes/categorical/test_astype.py | 2 +- .../indexes/categorical/test_category.py | 5 +- .../tests/indexes/categorical/test_formats.py | 4 ++ .../tests/indexes/categorical/test_reindex.py | 2 +- .../indexes/datetimes/methods/test_map.py | 2 +- pandas/tests/indexes/interval/test_formats.py | 7 ++- .../tests/indexes/multi/test_constructors.py | 7 ++- pandas/tests/indexes/multi/test_get_set.py | 16 ++++-- pandas/tests/indexes/multi/test_reindex.py | 7 ++- pandas/tests/indexes/multi/test_setops.py | 25 +++++--- pandas/tests/indexes/object/test_astype.py | 2 +- pandas/tests/indexes/object/test_indexing.py | 51 ++++++++++++----- .../indexes/period/methods/test_astype.py | 2 +- pandas/tests/indexes/test_base.py | 57 +++++++++++++------ pandas/tests/indexes/test_old_base.py | 33 +++++++---- pandas/tests/indexes/test_setops.py | 4 +- .../indexes/timedeltas/methods/test_astype.py | 2 +- 21 files changed, 173 insertions(+), 76 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 003474b57c8e1..7c829ed4b8cb9 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1903,7 +1903,7 @@ def using_copy_on_write() -> bool: @pytest.fixture def warn_copy_on_write() -> bool: """ - Fixture to check if Copy-on-Write is enabled. + Fixture to check if Copy-on-Write is in warning mode. """ return ( pd.options.mode.copy_on_write == "warn" @@ -1914,9 +1914,9 @@ def warn_copy_on_write() -> bool: @pytest.fixture def using_infer_string() -> bool: """ - Fixture to check if infer_string is enabled. + Fixture to check if infer string option is enabled. """ - return pd.options.future.infer_string + return pd.options.future.infer_string is True warsaws = ["Europe/Warsaw", "dateutil/Europe/Warsaw"] diff --git a/pandas/tests/indexes/base_class/test_formats.py b/pandas/tests/indexes/base_class/test_formats.py index 379aea8826414..f30b578cfcf56 100644 --- a/pandas/tests/indexes/base_class/test_formats.py +++ b/pandas/tests/indexes/base_class/test_formats.py @@ -1,6 +1,7 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype import pandas._config.config as cf from pandas import Index @@ -15,6 +16,7 @@ def test_repr_is_valid_construction_code(self): res = eval(repr(idx)) tm.assert_index_equal(res, idx) + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr different") @pytest.mark.parametrize( "index,expected", [ @@ -79,6 +81,7 @@ def test_string_index_repr(self, index, expected): result = repr(index) assert result == expected + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr different") @pytest.mark.parametrize( "index,expected", [ diff --git a/pandas/tests/indexes/base_class/test_reshape.py b/pandas/tests/indexes/base_class/test_reshape.py index 6586f5f9de480..814a6a516904b 100644 --- a/pandas/tests/indexes/base_class/test_reshape.py +++ b/pandas/tests/indexes/base_class/test_reshape.py @@ -33,13 +33,15 @@ def test_insert(self): # test empty null_index = Index([]) - tm.assert_index_equal(Index(["a"]), null_index.insert(0, "a")) + tm.assert_index_equal(Index(["a"], dtype=object), null_index.insert(0, "a")) - def test_insert_missing(self, nulls_fixture): + def test_insert_missing(self, nulls_fixture, using_infer_string): # GH#22295 # test there is no mangling of NA values - expected = Index(["a", nulls_fixture, "b", "c"]) - result = Index(list("abc")).insert(1, nulls_fixture) + expected 
= Index(["a", nulls_fixture, "b", "c"], dtype=object) + result = Index(list("abc"), dtype=object).insert( + 1, Index([nulls_fixture], dtype=object) + ) tm.assert_index_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/indexes/base_class/test_setops.py b/pandas/tests/indexes/base_class/test_setops.py index e538ad512d691..3ef3f3ad4d3a2 100644 --- a/pandas/tests/indexes/base_class/test_setops.py +++ b/pandas/tests/indexes/base_class/test_setops.py @@ -154,7 +154,7 @@ def test_intersection_str_dates(self, sort): def test_intersection_non_monotonic_non_unique(self, index2, expected_arr, sort): # non-monotonic non-unique index1 = Index(["A", "B", "A", "C"]) - expected = Index(expected_arr, dtype="object") + expected = Index(expected_arr) result = index1.intersection(index2, sort=sort) if sort is None: expected = expected.sort_values() diff --git a/pandas/tests/indexes/categorical/test_astype.py b/pandas/tests/indexes/categorical/test_astype.py index da1d692f9eb2d..a17627b7515b2 100644 --- a/pandas/tests/indexes/categorical/test_astype.py +++ b/pandas/tests/indexes/categorical/test_astype.py @@ -18,7 +18,7 @@ def test_astype(self): ci = CategoricalIndex(list("aabbca"), categories=list("cab"), ordered=False) result = ci.astype(object) - tm.assert_index_equal(result, Index(np.array(ci))) + tm.assert_index_equal(result, Index(np.array(ci), dtype=object)) # this IS equal, but not the same class assert result.equals(ci) diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index 142a00d32815a..03a298a13dc2b 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas._libs import index as libindex from pandas._libs.arrays import NDArrayBacked @@ -47,7 +49,7 @@ def test_insert(self, simple_index): # invalid -> cast to object expected = ci.astype(object).insert(0, "d") - result = ci.insert(0, "d") + result = ci.insert(0, "d").astype(object) tm.assert_index_equal(result, expected, exact=True) # GH 18295 (test missing) @@ -194,6 +196,7 @@ def test_unique(self, data, categories, expected_data, ordered): expected = CategoricalIndex(expected_data, dtype=dtype) tm.assert_index_equal(idx.unique(), expected) + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr doesn't roundtrip") def test_repr_roundtrip(self): ci = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True) str(ci) diff --git a/pandas/tests/indexes/categorical/test_formats.py b/pandas/tests/indexes/categorical/test_formats.py index ea3e4ce213e67..522ca1bc2afde 100644 --- a/pandas/tests/indexes/categorical/test_formats.py +++ b/pandas/tests/indexes/categorical/test_formats.py @@ -1,6 +1,9 @@ """ Tests for CategoricalIndex.__repr__ and related methods. 
""" +import pytest + +from pandas._config import using_pyarrow_string_dtype import pandas._config.config as cf from pandas import CategoricalIndex @@ -16,6 +19,7 @@ def test_format_different_scalar_lengths(self): with tm.assert_produces_warning(FutureWarning, match=msg): assert idx.format() == expected + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr different") def test_string_categorical_index_repr(self): # short idx = CategoricalIndex(["a", "bb", "ccc"]) diff --git a/pandas/tests/indexes/categorical/test_reindex.py b/pandas/tests/indexes/categorical/test_reindex.py index 8ca5c6099b4e7..5b1f2b9fb159a 100644 --- a/pandas/tests/indexes/categorical/test_reindex.py +++ b/pandas/tests/indexes/categorical/test_reindex.py @@ -40,7 +40,7 @@ def test_reindex_duplicate_target(self): # See GH25459 cat = CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c", "d"]) res, indexer = cat.reindex(["a", "c", "c"]) - exp = Index(["a", "c", "c"], dtype="object") + exp = Index(["a", "c", "c"]) tm.assert_index_equal(res, exp, exact=True) tm.assert_numpy_array_equal(indexer, np.array([0, 2, 2], dtype=np.intp)) diff --git a/pandas/tests/indexes/datetimes/methods/test_map.py b/pandas/tests/indexes/datetimes/methods/test_map.py index c31e2407190ea..f35f07bd32068 100644 --- a/pandas/tests/indexes/datetimes/methods/test_map.py +++ b/pandas/tests/indexes/datetimes/methods/test_map.py @@ -16,7 +16,7 @@ def test_map(self): f = lambda x: x.strftime("%Y%m%d") result = rng.map(f) - exp = Index([f(x) for x in rng], dtype=" Date: Sat, 9 Dec 2023 20:24:30 +0100 Subject: [PATCH 41/63] Adjust tests in strings folder for new string option (#56159) * Adjust tests in strings folder for new string option * BUG: translate losing object dtype with new string dtype * Fix * BUG: Index.str.cat casting result always to object * Update accessor.py * Fix further bugs * Fix * Fix tests * Update accessor.py --- pandas/core/strings/accessor.py | 33 ++++++++++++++---- pandas/tests/strings/test_api.py | 4 ++- pandas/tests/strings/test_case_justify.py | 35 ++++++++++++++------ pandas/tests/strings/test_extract.py | 15 ++++++--- pandas/tests/strings/test_find_replace.py | 24 ++++++++------ pandas/tests/strings/test_split_partition.py | 8 +++-- pandas/tests/strings/test_string_array.py | 4 ++- pandas/tests/strings/test_strings.py | 20 ++++++----- 8 files changed, 97 insertions(+), 46 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 127aee24e094f..75866c6f6013a 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -918,7 +918,13 @@ def split( if is_re(pat): regex = True result = self._data.array._str_split(pat, n, expand, regex) - return self._wrap_result(result, returns_string=expand, expand=expand) + if self._data.dtype == "category": + dtype = self._data.dtype.categories.dtype + else: + dtype = object if self._data.dtype == object else None + return self._wrap_result( + result, expand=expand, returns_string=expand, dtype=dtype + ) @Appender( _shared_docs["str_split"] @@ -936,7 +942,10 @@ def split( @forbid_nonstring_types(["bytes"]) def rsplit(self, pat=None, *, n=-1, expand: bool = False): result = self._data.array._str_rsplit(pat, n=n) - return self._wrap_result(result, expand=expand, returns_string=expand) + dtype = object if self._data.dtype == object else None + return self._wrap_result( + result, expand=expand, returns_string=expand, dtype=dtype + ) _shared_docs[ "str_partition" @@ -1032,7 +1041,13 @@ def rsplit(self, pat=None, *, n=-1, 
expand: bool = False): @forbid_nonstring_types(["bytes"]) def partition(self, sep: str = " ", expand: bool = True): result = self._data.array._str_partition(sep, expand) - return self._wrap_result(result, expand=expand, returns_string=expand) + if self._data.dtype == "category": + dtype = self._data.dtype.categories.dtype + else: + dtype = object if self._data.dtype == object else None + return self._wrap_result( + result, expand=expand, returns_string=expand, dtype=dtype + ) @Appender( _shared_docs["str_partition"] @@ -1046,7 +1061,13 @@ def partition(self, sep: str = " ", expand: bool = True): @forbid_nonstring_types(["bytes"]) def rpartition(self, sep: str = " ", expand: bool = True): result = self._data.array._str_rpartition(sep, expand) - return self._wrap_result(result, expand=expand, returns_string=expand) + if self._data.dtype == "category": + dtype = self._data.dtype.categories.dtype + else: + dtype = object if self._data.dtype == object else None + return self._wrap_result( + result, expand=expand, returns_string=expand, dtype=dtype + ) def get(self, i): """ @@ -2752,7 +2773,7 @@ def extract( else: name = _get_single_group_name(regex) result = self._data.array._str_extract(pat, flags=flags, expand=returns_df) - return self._wrap_result(result, name=name) + return self._wrap_result(result, name=name, dtype=result_dtype) @forbid_nonstring_types(["bytes"]) def extractall(self, pat, flags: int = 0) -> DataFrame: @@ -3492,7 +3513,7 @@ def str_extractall(arr, pat, flags: int = 0) -> DataFrame: raise ValueError("pattern contains no capture groups") if isinstance(arr, ABCIndex): - arr = arr.to_series().reset_index(drop=True) + arr = arr.to_series().reset_index(drop=True).astype(arr.dtype) columns = _get_group_names(regex) match_list = [] diff --git a/pandas/tests/strings/test_api.py b/pandas/tests/strings/test_api.py index fd2501835318d..31e005466af7b 100644 --- a/pandas/tests/strings/test_api.py +++ b/pandas/tests/strings/test_api.py @@ -8,6 +8,7 @@ MultiIndex, Series, _testing as tm, + option_context, ) from pandas.core.strings.accessor import StringMethods @@ -163,7 +164,8 @@ def test_api_per_method( if inferred_dtype in allowed_types: # xref GH 23555, GH 23556 - method(*args, **kwargs) # works! + with option_context("future.no_silent_downcasting", True): + method(*args, **kwargs) # works! 
else: # GH 23011, GH 23163 msg = ( diff --git a/pandas/tests/strings/test_case_justify.py b/pandas/tests/strings/test_case_justify.py index 1dee25e631648..41aedae90ca76 100644 --- a/pandas/tests/strings/test_case_justify.py +++ b/pandas/tests/strings/test_case_justify.py @@ -21,7 +21,8 @@ def test_title_mixed_object(): s = Series(["FOO", np.nan, "bar", True, datetime.today(), "blah", None, 1, 2.0]) result = s.str.title() expected = Series( - ["Foo", np.nan, "Bar", np.nan, np.nan, "Blah", None, np.nan, np.nan] + ["Foo", np.nan, "Bar", np.nan, np.nan, "Blah", None, np.nan, np.nan], + dtype=object, ) tm.assert_almost_equal(result, expected) @@ -41,11 +42,15 @@ def test_lower_upper_mixed_object(): s = Series(["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0]) result = s.str.upper() - expected = Series(["A", np.nan, "B", np.nan, np.nan, "FOO", None, np.nan, np.nan]) + expected = Series( + ["A", np.nan, "B", np.nan, np.nan, "FOO", None, np.nan, np.nan], dtype=object + ) tm.assert_series_equal(result, expected) result = s.str.lower() - expected = Series(["a", np.nan, "b", np.nan, np.nan, "foo", None, np.nan, np.nan]) + expected = Series( + ["a", np.nan, "b", np.nan, np.nan, "foo", None, np.nan, np.nan], dtype=object + ) tm.assert_series_equal(result, expected) @@ -71,7 +76,8 @@ def test_capitalize_mixed_object(): s = Series(["FOO", np.nan, "bar", True, datetime.today(), "blah", None, 1, 2.0]) result = s.str.capitalize() expected = Series( - ["Foo", np.nan, "Bar", np.nan, np.nan, "Blah", None, np.nan, np.nan] + ["Foo", np.nan, "Bar", np.nan, np.nan, "Blah", None, np.nan, np.nan], + dtype=object, ) tm.assert_series_equal(result, expected) @@ -87,7 +93,8 @@ def test_swapcase_mixed_object(): s = Series(["FOO", np.nan, "bar", True, datetime.today(), "Blah", None, 1, 2.0]) result = s.str.swapcase() expected = Series( - ["foo", np.nan, "BAR", np.nan, np.nan, "bLAH", None, np.nan, np.nan] + ["foo", np.nan, "BAR", np.nan, np.nan, "bLAH", None, np.nan, np.nan], + dtype=object, ) tm.assert_series_equal(result, expected) @@ -138,19 +145,22 @@ def test_pad_mixed_object(): result = s.str.pad(5, side="left") expected = Series( - [" a", np.nan, " b", np.nan, np.nan, " ee", None, np.nan, np.nan] + [" a", np.nan, " b", np.nan, np.nan, " ee", None, np.nan, np.nan], + dtype=object, ) tm.assert_series_equal(result, expected) result = s.str.pad(5, side="right") expected = Series( - ["a ", np.nan, "b ", np.nan, np.nan, "ee ", None, np.nan, np.nan] + ["a ", np.nan, "b ", np.nan, np.nan, "ee ", None, np.nan, np.nan], + dtype=object, ) tm.assert_series_equal(result, expected) result = s.str.pad(5, side="both") expected = Series( - [" a ", np.nan, " b ", np.nan, np.nan, " ee ", None, np.nan, np.nan] + [" a ", np.nan, " b ", np.nan, np.nan, " ee ", None, np.nan, np.nan], + dtype=object, ) tm.assert_series_equal(result, expected) @@ -238,7 +248,8 @@ def test_center_ljust_rjust_mixed_object(): None, np.nan, np.nan, - ] + ], + dtype=object, ) tm.assert_series_equal(result, expected) @@ -255,7 +266,8 @@ def test_center_ljust_rjust_mixed_object(): None, np.nan, np.nan, - ] + ], + dtype=object, ) tm.assert_series_equal(result, expected) @@ -272,7 +284,8 @@ def test_center_ljust_rjust_mixed_object(): None, np.nan, np.nan, - ] + ], + dtype=object, ) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/strings/test_extract.py b/pandas/tests/strings/test_extract.py index 9ad9b1eca41d9..77d008c650264 100644 --- a/pandas/tests/strings/test_extract.py +++ b/pandas/tests/strings/test_extract.py @@ -47,13 +47,16 
@@ def test_extract_expand_False_mixed_object(): # two groups result = ser.str.extract(".*(BAD[_]+).*(BAD)", expand=False) er = [np.nan, np.nan] # empty row - expected = DataFrame([["BAD_", "BAD"], er, ["BAD_", "BAD"], er, er, er, er, er, er]) + expected = DataFrame( + [["BAD_", "BAD"], er, ["BAD_", "BAD"], er, er, er, er, er, er], dtype=object + ) tm.assert_frame_equal(result, expected) # single group result = ser.str.extract(".*(BAD[_]+).*BAD", expand=False) expected = Series( - ["BAD_", np.nan, "BAD_", np.nan, np.nan, np.nan, None, np.nan, np.nan] + ["BAD_", np.nan, "BAD_", np.nan, np.nan, np.nan, None, np.nan, np.nan], + dtype=object, ) tm.assert_series_equal(result, expected) @@ -238,7 +241,9 @@ def test_extract_expand_True_mixed_object(): ) result = mixed.str.extract(".*(BAD[_]+).*(BAD)", expand=True) - expected = DataFrame([["BAD_", "BAD"], er, ["BAD_", "BAD"], er, er, er, er, er, er]) + expected = DataFrame( + [["BAD_", "BAD"], er, ["BAD_", "BAD"], er, er, er, er, er, er], dtype=object + ) tm.assert_frame_equal(result, expected) @@ -603,8 +608,8 @@ def test_extractall_stringindex(any_string_dtype): # index.name doesn't affect to the result if any_string_dtype == "object": for idx in [ - Index(["a1a2", "b1", "c1"]), - Index(["a1a2", "b1", "c1"], name="xxx"), + Index(["a1a2", "b1", "c1"], dtype=object), + Index(["a1a2", "b1", "c1"], name="xxx", dtype=object), ]: result = idx.str.extractall(r"[ab](?P\d)") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index bd64a5dce3b9a..3f58c6d703f8f 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -242,7 +242,7 @@ def test_contains_nan(any_string_dtype): @pytest.mark.parametrize("pat", ["foo", ("foo", "baz")]) -@pytest.mark.parametrize("dtype", [None, "category"]) +@pytest.mark.parametrize("dtype", ["object", "category"]) @pytest.mark.parametrize("null_value", [None, np.nan, pd.NA]) @pytest.mark.parametrize("na", [True, False]) def test_startswith(pat, dtype, null_value, na): @@ -254,10 +254,10 @@ def test_startswith(pat, dtype, null_value, na): result = values.str.startswith(pat) exp = Series([False, np.nan, True, False, False, np.nan, True]) - if dtype is None and null_value is pd.NA: + if dtype == "object" and null_value is pd.NA: # GH#18463 exp = exp.fillna(null_value) - elif dtype is None and null_value is None: + elif dtype == "object" and null_value is None: exp[exp.isna()] = None tm.assert_series_equal(result, exp) @@ -300,7 +300,7 @@ def test_startswith_nullable_string_dtype(nullable_string_dtype, na): @pytest.mark.parametrize("pat", ["foo", ("foo", "baz")]) -@pytest.mark.parametrize("dtype", [None, "category"]) +@pytest.mark.parametrize("dtype", ["object", "category"]) @pytest.mark.parametrize("null_value", [None, np.nan, pd.NA]) @pytest.mark.parametrize("na", [True, False]) def test_endswith(pat, dtype, null_value, na): @@ -312,10 +312,10 @@ def test_endswith(pat, dtype, null_value, na): result = values.str.endswith(pat) exp = Series([False, np.nan, False, False, True, np.nan, True]) - if dtype is None and null_value is pd.NA: + if dtype == "object" and null_value is pd.NA: # GH#18463 - exp = exp.fillna(pd.NA) - elif dtype is None and null_value is None: + exp = exp.fillna(null_value) + elif dtype == "object" and null_value is None: exp[exp.isna()] = None tm.assert_series_equal(result, exp) @@ -382,7 +382,9 @@ def test_replace_mixed_object(): ["aBAD", np.nan, "bBAD", True, datetime.today(), 
"fooBAD", None, 1, 2.0] ) result = Series(ser).str.replace("BAD[_]*", "", regex=True) - expected = Series(["a", np.nan, "b", np.nan, np.nan, "foo", None, np.nan, np.nan]) + expected = Series( + ["a", np.nan, "b", np.nan, np.nan, "foo", None, np.nan, np.nan], dtype=object + ) tm.assert_series_equal(result, expected) @@ -469,7 +471,9 @@ def test_replace_compiled_regex_mixed_object(): ["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0] ) result = Series(ser).str.replace(pat, "", regex=True) - expected = Series(["a", np.nan, "b", np.nan, np.nan, "foo", None, np.nan, np.nan]) + expected = Series( + ["a", np.nan, "b", np.nan, np.nan, "foo", None, np.nan, np.nan], dtype=object + ) tm.assert_series_equal(result, expected) @@ -913,7 +917,7 @@ def test_translate_mixed_object(): # Series with non-string values s = Series(["a", "b", "c", 1.2]) table = str.maketrans("abc", "cde") - expected = Series(["c", "d", "e", np.nan]) + expected = Series(["c", "d", "e", np.nan], dtype=object) result = s.str.translate(table) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py index 0a7d409773dd6..9ff1fc0e13ae9 100644 --- a/pandas/tests/strings/test_split_partition.py +++ b/pandas/tests/strings/test_split_partition.py @@ -681,14 +681,16 @@ def test_partition_sep_kwarg(any_string_dtype, method): def test_get(): ser = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]) result = ser.str.split("_").str.get(1) - expected = Series(["b", "d", np.nan, "g"]) + expected = Series(["b", "d", np.nan, "g"], dtype=object) tm.assert_series_equal(result, expected) def test_get_mixed_object(): ser = Series(["a_b_c", np.nan, "c_d_e", True, datetime.today(), None, 1, 2.0]) result = ser.str.split("_").str.get(1) - expected = Series(["b", np.nan, "d", np.nan, np.nan, None, np.nan, np.nan]) + expected = Series( + ["b", np.nan, "d", np.nan, np.nan, None, np.nan, np.nan], dtype=object + ) tm.assert_series_equal(result, expected) @@ -696,7 +698,7 @@ def test_get_mixed_object(): def test_get_bounds(idx): ser = Series(["1_2_3_4_5", "6_7_8_9_10", "11_12"]) result = ser.str.split("_").str.get(idx) - expected = Series(["3", "8", np.nan]) + expected = Series(["3", "8", np.nan], dtype=object) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/strings/test_string_array.py b/pandas/tests/strings/test_string_array.py index a88dcc8956931..0b3f368afea5e 100644 --- a/pandas/tests/strings/test_string_array.py +++ b/pandas/tests/strings/test_string_array.py @@ -8,6 +8,7 @@ DataFrame, Series, _testing as tm, + option_context, ) @@ -56,7 +57,8 @@ def test_string_array(nullable_string_dtype, any_string_method): columns = expected.select_dtypes(include="object").columns assert all(result[columns].dtypes == nullable_string_dtype) result[columns] = result[columns].astype(object) - expected[columns] = expected[columns].fillna(NA) # GH#18463 + with option_context("future.no_silent_downcasting", True): + expected[columns] = expected[columns].fillna(NA) # GH#18463 tm.assert_equal(result, expected) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 4315835b70a40..f662dfd7e2b14 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -76,7 +76,8 @@ def test_repeat_mixed_object(): ser = Series(["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0]) result = ser.str.repeat(3) expected = Series( - ["aaa", np.nan, "bbb", np.nan, np.nan, "foofoofoo", None, 
np.nan, np.nan] + ["aaa", np.nan, "bbb", np.nan, np.nan, "foofoofoo", None, np.nan, np.nan], + dtype=object, ) tm.assert_series_equal(result, expected) @@ -270,7 +271,8 @@ def test_spilt_join_roundtrip_mixed_object(): ) result = ser.str.split("_").str.join("_") expected = Series( - ["a_b", np.nan, "asdf_cas_asdf", np.nan, np.nan, "foo", None, np.nan, np.nan] + ["a_b", np.nan, "asdf_cas_asdf", np.nan, np.nan, "foo", None, np.nan, np.nan], + dtype=object, ) tm.assert_series_equal(result, expected) @@ -398,7 +400,7 @@ def test_slice(start, stop, step, expected, any_string_dtype): def test_slice_mixed_object(start, stop, step, expected): ser = Series(["aafootwo", np.nan, "aabartwo", True, datetime.today(), None, 1, 2.0]) result = ser.str.slice(start, stop, step) - expected = Series(expected) + expected = Series(expected, dtype=object) tm.assert_series_equal(result, expected) @@ -453,7 +455,7 @@ def test_strip_lstrip_rstrip_mixed_object(method, exp): ser = Series([" aa ", np.nan, " bb \t\n", True, datetime.today(), None, 1, 2.0]) result = getattr(ser.str, method)() - expected = Series(exp + [np.nan, np.nan, None, np.nan, np.nan]) + expected = Series(exp + [np.nan, np.nan, None, np.nan, np.nan], dtype=object) tm.assert_series_equal(result, expected) @@ -529,7 +531,7 @@ def test_string_slice_out_of_bounds(any_string_dtype): def test_encode_decode(any_string_dtype): ser = Series(["a", "b", "a\xe4"], dtype=any_string_dtype).str.encode("utf-8") result = ser.str.decode("utf-8") - expected = ser.map(lambda x: x.decode("utf-8")) + expected = ser.map(lambda x: x.decode("utf-8")).astype(object) tm.assert_series_equal(result, expected) @@ -559,7 +561,7 @@ def test_decode_errors_kwarg(): ser.str.decode("cp1252") result = ser.str.decode("cp1252", "ignore") - expected = ser.map(lambda x: x.decode("cp1252", "ignore")) + expected = ser.map(lambda x: x.decode("cp1252", "ignore")).astype(object) tm.assert_series_equal(result, expected) @@ -672,7 +674,7 @@ def test_str_accessor_in_apply_func(): def test_zfill(): # https://github.com/pandas-dev/pandas/issues/20868 value = Series(["-1", "1", "1000", 10, np.nan]) - expected = Series(["-01", "001", "1000", np.nan, np.nan]) + expected = Series(["-01", "001", "1000", np.nan, np.nan], dtype=object) tm.assert_series_equal(value.str.zfill(3), expected) value = Series(["-2", "+5"]) @@ -704,10 +706,10 @@ def test_get_with_dict_label(): ] ) result = s.str.get("name") - expected = Series(["Hello", "Goodbye", None]) + expected = Series(["Hello", "Goodbye", None], dtype=object) tm.assert_series_equal(result, expected) result = s.str.get("value") - expected = Series(["World", "Planet", "Sea"]) + expected = Series(["World", "Planet", "Sea"], dtype=object) tm.assert_series_equal(result, expected) From ee6a0626a42d531ece2de7fefdc514123690c191 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 9 Dec 2023 20:26:09 +0100 Subject: [PATCH 42/63] DEPR: Disallow dtype inference when setting Index into DataFrame (#56102) --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/frame.py | 16 +++++++++++++++- pandas/tests/frame/indexing/test_setitem.py | 18 ++++++++++++++++++ 3 files changed, 34 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index ffefb9f41fb56..ad44e87cacf82 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -458,6 +458,7 @@ Other Deprecations - Deprecated behavior of :meth:`Index.insert` with an object-dtype index silently performing type 
inference on the result, explicitly call ``result.infer_objects(copy=False)`` for the old behavior instead (:issue:`51363`) - Deprecated casting non-datetimelike values (mainly strings) in :meth:`Series.isin` and :meth:`Index.isin` with ``datetime64``, ``timedelta64``, and :class:`PeriodDtype` dtypes (:issue:`53111`) - Deprecated downcasting behavior in :meth:`Series.where`, :meth:`DataFrame.where`, :meth:`Series.mask`, :meth:`DataFrame.mask`, :meth:`Series.clip`, :meth:`DataFrame.clip`; in a future version these will not infer object-dtype columns to non-object dtype, or all-round floats to integer dtype. Call ``result.infer_objects(copy=False)`` on the result for object inference, or explicitly cast floats to ints. To opt in to the future version, use ``pd.set_option("future.no_silent_downcasting", True)`` (:issue:`53656`) +- Deprecated dtype inference when setting a :class:`Index` into a :class:`DataFrame`, cast explicitly instead (:issue:`56102`) - Deprecated including the groups in computations when using :meth:`.DataFrameGroupBy.apply` and :meth:`.DataFrameGroupBy.resample`; pass ``include_groups=False`` to exclude the groups (:issue:`7155`) - Deprecated indexing an :class:`Index` with a boolean indexer of length zero (:issue:`55820`) - Deprecated not passing a tuple to :class:`.DataFrameGroupBy.get_group` or :class:`.SeriesGroupBy.get_group` when grouping by a length-1 list-like (:issue:`25971`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 24b7951e3bb85..e741fa7b37f33 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5219,7 +5219,21 @@ def _sanitize_column(self, value) -> tuple[ArrayLike, BlockValuesRefs | None]: if is_list_like(value): com.require_length_match(value, self.index) - return sanitize_array(value, self.index, copy=True, allow_2d=True), None + arr = sanitize_array(value, self.index, copy=True, allow_2d=True) + if ( + isinstance(value, Index) + and value.dtype == "object" + and arr.dtype != value.dtype + ): # + # TODO: Remove kludge in sanitize_array for string mode when enforcing + # this deprecation + warnings.warn( + "Setting an Index with object dtype into a DataFrame will no longer " + "infer another dtype. 
Cast the Index explicitly before setting.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return arr, None @property def _series(self): diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index d0caaa3756170..e802a56ecbc81 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -786,6 +786,24 @@ def test_loc_setitem_ea_dtype(self): df.iloc[:, 0] = Series([11], dtype="Int64") tm.assert_frame_equal(df, expected) + def test_setitem_object_inferring(self): + # GH#56102 + idx = Index([Timestamp("2019-12-31")], dtype=object) + df = DataFrame({"a": [1]}) + with tm.assert_produces_warning(FutureWarning, match="infer"): + df.loc[:, "b"] = idx + with tm.assert_produces_warning(FutureWarning, match="infer"): + df["c"] = idx + + expected = DataFrame( + { + "a": [1], + "b": Series([Timestamp("2019-12-31")], dtype="datetime64[ns]"), + "c": Series([Timestamp("2019-12-31")], dtype="datetime64[ns]"), + } + ) + tm.assert_frame_equal(df, expected) + class TestSetitemTZAwareValues: @pytest.fixture From f0b61c561858da28d22a1df7c03787ce8f1b482f Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 9 Dec 2023 22:19:54 +0100 Subject: [PATCH 43/63] CI: Fix mypy (#56428) --- pandas/core/arrays/datetimelike.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 8928c72de750c..91dd40c2deced 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -770,7 +770,7 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: ] if values.dtype == object: values = lib.maybe_convert_objects( - values, + values, # type: ignore[arg-type] convert_non_numeric=True, dtype_if_all_nat=self.dtype, ) From 8aa7a96765ff4f5e2e157fdbacedc442381292b4 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 10 Dec 2023 00:35:25 +0100 Subject: [PATCH 44/63] Adjust tests in array folder for new string option (#56188) * Adjust tests in array directory for new string option * BUG: value_counts not preserving object dtype * Adjust tests in array folder for new string option * Fixup * Fix * Fix * Revert "BUG: value_counts not preserving object dtype" This reverts commit f570a4ff --- pandas/core/algorithms.py | 5 ++- .../tests/arrays/boolean/test_arithmetic.py | 16 +++++++-- .../tests/arrays/categorical/test_astype.py | 2 +- .../arrays/categorical/test_constructors.py | 3 ++ .../arrays/categorical/test_operators.py | 6 ++-- pandas/tests/arrays/categorical/test_repr.py | 30 ++++++++++++----- .../tests/arrays/floating/test_arithmetic.py | 21 +++++++++--- .../tests/arrays/integer/test_arithmetic.py | 33 ++++++++++++++----- pandas/tests/arrays/integer/test_reduction.py | 4 ++- pandas/tests/arrays/string_/test_string.py | 22 +++++++++++-- .../tests/arrays/string_/test_string_arrow.py | 32 +++++++++++------- pandas/tests/arrays/test_array.py | 2 +- 12 files changed, 129 insertions(+), 47 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 03f06da5f84e1..ec0fe32163ec8 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -932,7 +932,10 @@ def value_counts_internal( idx = Index(keys) if idx.dtype == bool and keys.dtype == object: idx = idx.astype(object) - elif idx.dtype != keys.dtype: + elif ( + idx.dtype != keys.dtype # noqa: PLR1714 # # pylint: disable=R1714 + and 
idx.dtype != "string[pyarrow_numpy]" + ): warnings.warn( # GH#56161 "The behavior of value_counts with object-dtype is deprecated. " diff --git a/pandas/tests/arrays/boolean/test_arithmetic.py b/pandas/tests/arrays/boolean/test_arithmetic.py index 197e83121567e..0c4fcf149eb20 100644 --- a/pandas/tests/arrays/boolean/test_arithmetic.py +++ b/pandas/tests/arrays/boolean/test_arithmetic.py @@ -90,9 +90,16 @@ def test_op_int8(left_array, right_array, opname): # ----------------------------------------------------------------------------- -def test_error_invalid_values(data, all_arithmetic_operators): +def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string): # invalid ops + if using_infer_string: + import pyarrow as pa + + err = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError) + else: + err = TypeError + op = all_arithmetic_operators s = pd.Series(data) ops = getattr(s, op) @@ -110,9 +117,10 @@ def test_error_invalid_values(data, all_arithmetic_operators): [ r"unsupported operand type\(s\) for", "Concatenation operation is not implemented for NumPy arrays", + "has no kernel", ] ) - with pytest.raises(TypeError, match=msg): + with pytest.raises(err, match=msg): ops(pd.Timestamp("20180101")) # invalid array-likes @@ -123,7 +131,9 @@ def test_error_invalid_values(data, all_arithmetic_operators): r"unsupported operand type\(s\) for", "can only concatenate str", "not all arguments converted during string formatting", + "has no kernel", + "not implemented", ] ) - with pytest.raises(TypeError, match=msg): + with pytest.raises(err, match=msg): ops(pd.Series("foo", index=s.index)) diff --git a/pandas/tests/arrays/categorical/test_astype.py b/pandas/tests/arrays/categorical/test_astype.py index 7fba150c9113f..a2a53af6ab1ad 100644 --- a/pandas/tests/arrays/categorical/test_astype.py +++ b/pandas/tests/arrays/categorical/test_astype.py @@ -89,7 +89,7 @@ def test_astype(self, ordered): expected = np.array(cat) tm.assert_numpy_array_equal(result, expected) - msg = r"Cannot cast object dtype to float64" + msg = r"Cannot cast object|string dtype to float64" with pytest.raises(ValueError, match=msg): cat.astype(float) diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index e25e31e2f2e9e..50aaa42e09f22 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas.core.dtypes.common import ( is_float_dtype, is_integer_dtype, @@ -447,6 +449,7 @@ def test_constructor_str_unknown(self): with pytest.raises(ValueError, match="Unknown dtype"): Categorical([1, 2], dtype="foo") + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="Can't be NumPy strings") def test_constructor_np_strs(self): # GH#31499 Hashtable.map_locations needs to work on np.str_ objects cat = Categorical(["1", "0", "1"], [np.str_("0"), np.str_("1")]) diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index a1e50917fed98..16b941eab4830 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -92,7 +92,7 @@ def test_comparisons(self, factor): cat > cat_unordered # comparison (in both directions) with Series will raise - s = Series(["b", "b", "b"]) + s = Series(["b", "b", "b"], dtype=object) msg = ( "Cannot compare a Categorical for 
op __gt__ with type " r"" @@ -108,7 +108,7 @@ def test_comparisons(self, factor): # comparison with numpy.array will raise in both direction, but only on # newer numpy versions - a = np.array(["b", "b", "b"]) + a = np.array(["b", "b", "b"], dtype=object) with pytest.raises(TypeError, match=msg): cat > a with pytest.raises(TypeError, match=msg): @@ -248,7 +248,7 @@ def test_comparisons(self, data, reverse, base): cat_base = Series( Categorical(base, categories=cat.cat.categories, ordered=True) ) - s = Series(base) + s = Series(base, dtype=object if base == list("bbb") else None) a = np.array(base) # comparisons need to take categories ordering into account diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py index dca171bf81047..d6f93fbbd912f 100644 --- a/pandas/tests/arrays/categorical/test_repr.py +++ b/pandas/tests/arrays/categorical/test_repr.py @@ -1,9 +1,13 @@ import numpy as np +import pytest + +from pandas._config import using_pyarrow_string_dtype from pandas import ( Categorical, CategoricalDtype, CategoricalIndex, + Index, Series, date_range, option_context, @@ -13,11 +17,17 @@ class TestCategoricalReprWithFactor: - def test_print(self, factor): - expected = [ - "['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']", - "Categories (3, object): ['a' < 'b' < 'c']", - ] + def test_print(self, factor, using_infer_string): + if using_infer_string: + expected = [ + "['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']", + "Categories (3, string): [a < b < c]", + ] + else: + expected = [ + "['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']", + "Categories (3, object): ['a' < 'b' < 'c']", + ] expected = "\n".join(expected) actual = repr(factor) assert actual == expected @@ -26,7 +36,7 @@ def test_print(self, factor): class TestCategoricalRepr: def test_big_print(self): codes = np.array([0, 1, 2, 0, 1, 2] * 100) - dtype = CategoricalDtype(categories=["a", "b", "c"]) + dtype = CategoricalDtype(categories=Index(["a", "b", "c"], dtype=object)) factor = Categorical.from_codes(codes, dtype=dtype) expected = [ "['a', 'b', 'c', 'a', 'b', ..., 'b', 'c', 'a', 'b', 'c']", @@ -40,13 +50,13 @@ def test_big_print(self): assert actual == expected def test_empty_print(self): - factor = Categorical([], ["a", "b", "c"]) + factor = Categorical([], Index(["a", "b", "c"], dtype=object)) expected = "[], Categories (3, object): ['a', 'b', 'c']" actual = repr(factor) assert actual == expected assert expected == actual - factor = Categorical([], ["a", "b", "c"], ordered=True) + factor = Categorical([], Index(["a", "b", "c"], dtype=object), ordered=True) expected = "[], Categories (3, object): ['a' < 'b' < 'c']" actual = repr(factor) assert expected == actual @@ -66,6 +76,10 @@ def test_print_none_width(self): with option_context("display.width", None): assert exp == repr(a) + @pytest.mark.skipif( + using_pyarrow_string_dtype(), + reason="Change once infer_string is set to True by default", + ) def test_unicode_print(self): c = Categorical(["aaaaa", "bb", "cccc"] * 20) expected = """\ diff --git a/pandas/tests/arrays/floating/test_arithmetic.py b/pandas/tests/arrays/floating/test_arithmetic.py index 056c22d8c1131..ba081bd01062a 100644 --- a/pandas/tests/arrays/floating/test_arithmetic.py +++ b/pandas/tests/arrays/floating/test_arithmetic.py @@ -122,11 +122,18 @@ def test_arith_zero_dim_ndarray(other): # ----------------------------------------------------------------------------- -def test_error_invalid_values(data, all_arithmetic_operators): +def test_error_invalid_values(data, 
all_arithmetic_operators, using_infer_string): op = all_arithmetic_operators s = pd.Series(data) ops = getattr(s, op) + if using_infer_string: + import pyarrow as pa + + errs = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError) + else: + errs = TypeError + # invalid scalars msg = "|".join( [ @@ -140,15 +147,17 @@ def test_error_invalid_values(data, all_arithmetic_operators): "ufunc '.*' not supported for the input types, and the inputs could not", "ufunc '.*' did not contain a loop with signature matching types", "Concatenation operation is not implemented for NumPy arrays", + "has no kernel", + "not implemented", ] ) - with pytest.raises(TypeError, match=msg): + with pytest.raises(errs, match=msg): ops("foo") - with pytest.raises(TypeError, match=msg): + with pytest.raises(errs, match=msg): ops(pd.Timestamp("20180101")) # invalid array-likes - with pytest.raises(TypeError, match=msg): + with pytest.raises(errs, match=msg): ops(pd.Series("foo", index=s.index)) msg = "|".join( @@ -167,9 +176,11 @@ def test_error_invalid_values(data, all_arithmetic_operators): ), r"ufunc 'add' cannot use operands with types dtype\('float\d{2}'\)", "cannot subtract DatetimeArray from ndarray", + "has no kernel", + "not implemented", ] ) - with pytest.raises(TypeError, match=msg): + with pytest.raises(errs, match=msg): ops(pd.Series(pd.date_range("20180101", periods=len(s)))) diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py index ce6c245cd0f37..d979dd445a61a 100644 --- a/pandas/tests/arrays/integer/test_arithmetic.py +++ b/pandas/tests/arrays/integer/test_arithmetic.py @@ -172,11 +172,18 @@ def test_numpy_zero_dim_ndarray(other): # ----------------------------------------------------------------------------- -def test_error_invalid_values(data, all_arithmetic_operators): +def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string): op = all_arithmetic_operators s = pd.Series(data) ops = getattr(s, op) + if using_infer_string: + import pyarrow as pa + + errs = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError) + else: + errs = TypeError + # invalid scalars msg = "|".join( [ @@ -188,20 +195,26 @@ def test_error_invalid_values(data, all_arithmetic_operators): "ufunc '.*' not supported for the input types, and the inputs could not", "ufunc '.*' did not contain a loop with signature matching types", "Addition/subtraction of integers and integer-arrays with Timestamp", + "has no kernel", + "not implemented", ] ) - with pytest.raises(TypeError, match=msg): + with pytest.raises(errs, match=msg): ops("foo") - with pytest.raises(TypeError, match=msg): + with pytest.raises(errs, match=msg): ops(pd.Timestamp("20180101")) # invalid array-likes str_ser = pd.Series("foo", index=s.index) # with pytest.raises(TypeError, match=msg): - if all_arithmetic_operators in [ - "__mul__", - "__rmul__", - ]: # (data[~data.isna()] >= 0).all(): + if ( + all_arithmetic_operators + in [ + "__mul__", + "__rmul__", + ] + and not using_infer_string + ): # (data[~data.isna()] >= 0).all(): res = ops(str_ser) expected = pd.Series(["foo" * x for x in data], index=s.index) expected = expected.fillna(np.nan) @@ -210,7 +223,7 @@ def test_error_invalid_values(data, all_arithmetic_operators): # more-correct than np.nan here. 
tm.assert_series_equal(res, expected) else: - with pytest.raises(TypeError, match=msg): + with pytest.raises(errs, match=msg): ops(str_ser) msg = "|".join( @@ -223,9 +236,11 @@ def test_error_invalid_values(data, all_arithmetic_operators): r"can only concatenate str \(not \"int\"\) to str", "not all arguments converted during string", "cannot subtract DatetimeArray from ndarray", + "has no kernel", + "not implemented", ] ) - with pytest.raises(TypeError, match=msg): + with pytest.raises(errs, match=msg): ops(pd.Series(pd.date_range("20180101", periods=len(s)))) diff --git a/pandas/tests/arrays/integer/test_reduction.py b/pandas/tests/arrays/integer/test_reduction.py index 1c91cd25ba69c..db04862e4ea07 100644 --- a/pandas/tests/arrays/integer/test_reduction.py +++ b/pandas/tests/arrays/integer/test_reduction.py @@ -102,7 +102,9 @@ def test_groupby_reductions(op, expected): ["all", Series([True, True, True], index=["A", "B", "C"], dtype="boolean")], ], ) -def test_mixed_reductions(op, expected): +def test_mixed_reductions(op, expected, using_infer_string): + if op in ["any", "all"] and using_infer_string: + expected = expected.astype("bool") df = DataFrame( { "A": ["a", "b", "b"], diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 8dcda44aa68e5..d015e899c4231 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -191,7 +191,7 @@ def test_mul(dtype): @pytest.mark.xfail(reason="GH-28527") def test_add_strings(dtype): arr = pd.array(["a", "b", "c", "d"], dtype=dtype) - df = pd.DataFrame([["t", "y", "v", "w"]]) + df = pd.DataFrame([["t", "y", "v", "w"]], dtype=object) assert arr.__add__(df) is NotImplemented result = arr + df @@ -498,10 +498,17 @@ def test_arrow_array(dtype): @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") -def test_arrow_roundtrip(dtype, string_storage2): +def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): # roundtrip possible from arrow 1.0.0 pa = pytest.importorskip("pyarrow") + if using_infer_string and string_storage2 != "pyarrow_numpy": + request.applymarker( + pytest.mark.xfail( + reason="infer_string takes precedence over string storage" + ) + ) + data = pd.array(["a", "b", None], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) @@ -516,10 +523,19 @@ def test_arrow_roundtrip(dtype, string_storage2): @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") -def test_arrow_load_from_zero_chunks(dtype, string_storage2): +def test_arrow_load_from_zero_chunks( + dtype, string_storage2, request, using_infer_string +): # GH-41040 pa = pytest.importorskip("pyarrow") + if using_infer_string and string_storage2 != "pyarrow_numpy": + request.applymarker( + pytest.mark.xfail( + reason="infer_string takes precedence over string storage" + ) + ) + data = pd.array([], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index a801a845bc7be..a022dfffbdd2b 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -26,7 +26,9 @@ def test_eq_all_na(): tm.assert_extension_array_equal(result, expected) -def test_config(string_storage): +def test_config(string_storage, request, using_infer_string): + if using_infer_string and string_storage != "pyarrow_numpy": + 
request.applymarker(pytest.mark.xfail(reason="infer string takes precedence")) with pd.option_context("string_storage", string_storage): assert StringDtype().storage == string_storage result = pd.array(["a", "b"]) @@ -101,7 +103,7 @@ def test_constructor_from_list(): assert result.dtype.storage == "pyarrow" -def test_from_sequence_wrong_dtype_raises(): +def test_from_sequence_wrong_dtype_raises(using_infer_string): pytest.importorskip("pyarrow") with pd.option_context("string_storage", "python"): ArrowStringArray._from_sequence(["a", None, "c"], dtype="string") @@ -114,15 +116,19 @@ def test_from_sequence_wrong_dtype_raises(): ArrowStringArray._from_sequence(["a", None, "c"], dtype="string[pyarrow]") - with pytest.raises(AssertionError, match=None): - with pd.option_context("string_storage", "python"): - ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype()) + if not using_infer_string: + with pytest.raises(AssertionError, match=None): + with pd.option_context("string_storage", "python"): + ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype()) with pd.option_context("string_storage", "pyarrow"): ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype()) - with pytest.raises(AssertionError, match=None): - ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype("python")) + if not using_infer_string: + with pytest.raises(AssertionError, match=None): + ArrowStringArray._from_sequence( + ["a", None, "c"], dtype=StringDtype("python") + ) ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype("pyarrow")) @@ -137,13 +143,15 @@ def test_from_sequence_wrong_dtype_raises(): with pytest.raises(AssertionError, match=None): StringArray._from_sequence(["a", None, "c"], dtype="string[pyarrow]") - with pd.option_context("string_storage", "python"): - StringArray._from_sequence(["a", None, "c"], dtype=StringDtype()) - - with pytest.raises(AssertionError, match=None): - with pd.option_context("string_storage", "pyarrow"): + if not using_infer_string: + with pd.option_context("string_storage", "python"): StringArray._from_sequence(["a", None, "c"], dtype=StringDtype()) + if not using_infer_string: + with pytest.raises(AssertionError, match=None): + with pd.option_context("string_storage", "pyarrow"): + StringArray._from_sequence(["a", None, "c"], dtype=StringDtype()) + StringArray._from_sequence(["a", None, "c"], dtype=StringDtype("python")) with pytest.raises(AssertionError, match=None): diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index e2b8ebcb79a3b..b0ec2787097f0 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -440,7 +440,7 @@ def test_array_unboxes(index_or_series): def test_array_to_numpy_na(): # GH#40638 - arr = pd.array([pd.NA, 1], dtype="string") + arr = pd.array([pd.NA, 1], dtype="string[python]") result = arr.to_numpy(na_value=True, dtype=bool) expected = np.array([True, True]) tm.assert_numpy_array_equal(result, expected) From 9b51ab2ae6d16b4d599b99fd253aba00c959b41d Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 10 Dec 2023 20:11:13 +0100 Subject: [PATCH 45/63] ENH: Make get_dummies return ea booleans for ea inputs (#56291) * ENH: Make get_dummies return ea booleans for ea inputs * ENH: Make get_dummies return ea booleans for ea inputs * Update * Update pandas/tests/reshape/test_get_dummies.py Co-authored-by: Thomas Baumann * Update test_get_dummies.py * Update test_get_dummies.py * Fixup 
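For reviewers, a rough sketch of the resulting dtype mapping (the values here are illustrative and the snippet assumes a pandas build with pyarrow installed; the authoritative expectations are the new tests below):

    import pandas as pd
    import pyarrow as pa

    # Masked string input now produces masked "boolean" dummy columns.
    s = pd.Series(["a", "b", None], dtype="string[pyarrow]")
    print(pd.get_dummies(s).dtypes)  # boolean

    # The numpy-semantics string dtype keeps plain numpy bool.
    s2 = pd.Series(["a", "b"], dtype="string[pyarrow_numpy]")
    print(pd.get_dummies(s2).dtypes)  # bool

    # ArrowDtype input maps to bool[pyarrow].
    s3 = pd.Series(["a", "b"], dtype=pd.ArrowDtype(pa.string()))
    print(pd.get_dummies(s3).dtypes)  # bool[pyarrow]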
--------- Co-authored-by: Thomas Baumann --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/reshape/encoding.py | 24 ++++++++++++- pandas/tests/reshape/test_get_dummies.py | 45 ++++++++++++++++++++++++ 3 files changed, 69 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index ad44e87cacf82..acec8379ee5b3 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -218,6 +218,7 @@ Other enhancements - :meth:`~DataFrame.to_sql` with method parameter set to ``multi`` works with Oracle on the backend - :attr:`Series.attrs` / :attr:`DataFrame.attrs` now uses a deepcopy for propagating ``attrs`` (:issue:`54134`). +- :func:`get_dummies` now returning extension dtypes ``boolean`` or ``bool[pyarrow]`` that are compatible with the input dtype (:issue:`56273`) - :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. (:issue:`54480`) - :func:`read_sas` returns ``datetime64`` dtypes with resolutions better matching those stored natively in SAS, and avoids returning object-dtype in cases that cannot be stored with ``datetime64[ns]`` dtype (:issue:`56127`) - :func:`read_spss` now returns a :class:`DataFrame` that stores the metadata in :attr:`DataFrame.attrs`. (:issue:`54264`) diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index 6963bf677bcfb..3ed67bb7b7c02 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -21,9 +21,14 @@ is_object_dtype, pandas_dtype, ) +from pandas.core.dtypes.dtypes import ( + ArrowDtype, + CategoricalDtype, +) from pandas.core.arrays import SparseArray from pandas.core.arrays.categorical import factorize_from_iterable +from pandas.core.arrays.string_ import StringDtype from pandas.core.frame import DataFrame from pandas.core.indexes.api import ( Index, @@ -244,8 +249,25 @@ def _get_dummies_1d( # Series avoids inconsistent NaN handling codes, levels = factorize_from_iterable(Series(data, copy=False)) - if dtype is None: + if dtype is None and hasattr(data, "dtype"): + input_dtype = data.dtype + if isinstance(input_dtype, CategoricalDtype): + input_dtype = input_dtype.categories.dtype + + if isinstance(input_dtype, ArrowDtype): + import pyarrow as pa + + dtype = ArrowDtype(pa.bool_()) # type: ignore[assignment] + elif ( + isinstance(input_dtype, StringDtype) + and input_dtype.storage != "pyarrow_numpy" + ): + dtype = pandas_dtype("boolean") # type: ignore[assignment] + else: + dtype = np.dtype(bool) + elif dtype is None: dtype = np.dtype(bool) + _dtype = pandas_dtype(dtype) if is_object_dtype(_dtype): diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py index 3bfff56cfedf2..9b7aefac60969 100644 --- a/pandas/tests/reshape/test_get_dummies.py +++ b/pandas/tests/reshape/test_get_dummies.py @@ -4,13 +4,18 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas.core.dtypes.common import is_integer_dtype import pandas as pd from pandas import ( + ArrowDtype, Categorical, + CategoricalDtype, CategoricalIndex, DataFrame, + Index, RangeIndex, Series, SparseDtype, @@ -19,6 +24,11 @@ import pandas._testing as tm from pandas.core.arrays.sparse import SparseArray +try: + import pyarrow as pa +except ImportError: + pa = None + class TestGetDummies: @pytest.fixture @@ -217,6 +227,7 @@ def test_dataframe_dummies_string_dtype(self, df): }, dtype=bool, ) + expected[["B_b", "B_c"]] = expected[["B_b", "B_c"]].astype("boolean") 
tm.assert_frame_equal(result, expected) def test_dataframe_dummies_mix_default(self, df, sparse, dtype): @@ -693,3 +704,37 @@ def test_get_dummies_ea_dtype_dataframe(self, any_numeric_ea_and_arrow_dtype): dtype=any_numeric_ea_and_arrow_dtype, ) tm.assert_frame_equal(result, expected) + + @td.skip_if_no("pyarrow") + def test_get_dummies_ea_dtype(self): + # GH#56273 + for dtype, exp_dtype in [ + ("string[pyarrow]", "boolean"), + ("string[pyarrow_numpy]", "bool"), + (CategoricalDtype(Index(["a"], dtype="string[pyarrow]")), "boolean"), + (CategoricalDtype(Index(["a"], dtype="string[pyarrow_numpy]")), "bool"), + ]: + df = DataFrame({"name": Series(["a"], dtype=dtype), "x": 1}) + result = get_dummies(df) + expected = DataFrame({"x": 1, "name_a": Series([True], dtype=exp_dtype)}) + tm.assert_frame_equal(result, expected) + + @td.skip_if_no("pyarrow") + def test_get_dummies_arrow_dtype(self): + # GH#56273 + df = DataFrame({"name": Series(["a"], dtype=ArrowDtype(pa.string())), "x": 1}) + result = get_dummies(df) + expected = DataFrame({"x": 1, "name_a": Series([True], dtype="bool[pyarrow]")}) + tm.assert_frame_equal(result, expected) + + df = DataFrame( + { + "name": Series( + ["a"], + dtype=CategoricalDtype(Index(["a"], dtype=ArrowDtype(pa.string()))), + ), + "x": 1, + } + ) + result = get_dummies(df) + tm.assert_frame_equal(result, expected) From b4c9df811a234d50646540f9b4dc92ad39d7c77b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 11 Dec 2023 00:45:49 +0100 Subject: [PATCH 46/63] WEB: Add timeout for ci (#56440) --- web/pandas_web.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/web/pandas_web.py b/web/pandas_web.py index 1cd3be456bfe0..407debf5828be 100755 --- a/web/pandas_web.py +++ b/web/pandas_web.py @@ -175,7 +175,9 @@ def maintainers_add_info(context): context["maintainers"]["active"] + context["maintainers"]["inactive"] ): resp = requests.get( - f"https://api.github.com/users/{user}", headers=GITHUB_API_HEADERS + f"https://api.github.com/users/{user}", + headers=GITHUB_API_HEADERS, + timeout=5, ) if resp.status_code == 403: sys.stderr.write( @@ -184,7 +186,7 @@ def maintainers_add_info(context): # if we exceed github api quota, we use the github info # of maintainers saved with the website resp_bkp = requests.get( - context["main"]["production_url"] + "maintainers.json" + context["main"]["production_url"] + "maintainers.json", timeout=5 ) resp_bkp.raise_for_status() maintainers_info = resp_bkp.json() @@ -214,10 +216,13 @@ def home_add_releases(context): resp = requests.get( f"https://api.github.com/repos/{github_repo_url}/releases", headers=GITHUB_API_HEADERS, + timeout=5, ) if resp.status_code == 403: sys.stderr.write("WARN: GitHub API quota exceeded when fetching releases\n") - resp_bkp = requests.get(context["main"]["production_url"] + "releases.json") + resp_bkp = requests.get( + context["main"]["production_url"] + "releases.json", timeout=5 + ) resp_bkp.raise_for_status() releases = resp_bkp.json() else: @@ -302,10 +307,13 @@ def roadmap_pdeps(context): "https://api.github.com/search/issues?" 
f"q=is:pr is:open label:PDEP repo:{github_repo_url}", headers=GITHUB_API_HEADERS, + timeout=5, ) if resp.status_code == 403: sys.stderr.write("WARN: GitHub API quota exceeded when fetching pdeps\n") - resp_bkp = requests.get(context["main"]["production_url"] + "pdeps.json") + resp_bkp = requests.get( + context["main"]["production_url"] + "pdeps.json", timeout=5 + ) resp_bkp.raise_for_status() pdeps = resp_bkp.json() else: From 400ae748367cf9a356b8433c1aa59f34033f02a4 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 11 Dec 2023 14:05:59 +0100 Subject: [PATCH 47/63] CoW: Update test with read-only array (#56418) --- pandas/tests/frame/methods/test_to_dict_of_blocks.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/pandas/tests/frame/methods/test_to_dict_of_blocks.py b/pandas/tests/frame/methods/test_to_dict_of_blocks.py index f7d9dc914a2ee..f64cfd5fe6a2d 100644 --- a/pandas/tests/frame/methods/test_to_dict_of_blocks.py +++ b/pandas/tests/frame/methods/test_to_dict_of_blocks.py @@ -35,9 +35,7 @@ def test_no_copy_blocks(self, float_frame, using_copy_on_write): assert _last_df is not None and not _last_df[column].equals(df[column]) -def test_to_dict_of_blocks_item_cache(request, using_copy_on_write, warn_copy_on_write): - if using_copy_on_write: - request.applymarker(pytest.mark.xfail(reason="CoW - not yet implemented")) +def test_to_dict_of_blocks_item_cache(using_copy_on_write, warn_copy_on_write): # Calling to_dict_of_blocks should not poison item_cache df = DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]}) df["c"] = NumpyExtensionArray(np.array([1, 2, None, 3], dtype=object)) @@ -49,10 +47,8 @@ def test_to_dict_of_blocks_item_cache(request, using_copy_on_write, warn_copy_on df._to_dict_of_blocks() if using_copy_on_write: - # TODO(CoW) we should disallow this, so `df` doesn't get updated, - # this currently still updates df, so this test fails - ser.values[0] = "foo" - assert df.loc[0, "b"] == "a" + with pytest.raises(ValueError, match="read-only"): + ser.values[0] = "foo" elif warn_copy_on_write: ser.values[0] = "foo" assert df.loc[0, "b"] == "foo" From 87c40c7dedddde1bf022d6d8f1d4a897acdf11a2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 11 Dec 2023 09:27:18 -0800 Subject: [PATCH 48/63] Bump actions/stale from 8 to 9 (#56449) Bumps [actions/stale](https://github.com/actions/stale) from 8 to 9. - [Release notes](https://github.com/actions/stale/releases) - [Changelog](https://github.com/actions/stale/blob/main/CHANGELOG.md) - [Commits](https://github.com/actions/stale/compare/v8...v9) --- updated-dependencies: - dependency-name: actions/stale dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/stale-pr.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/stale-pr.yml b/.github/workflows/stale-pr.yml index 11b81d11f7876..792afe8f4faf5 100644 --- a/.github/workflows/stale-pr.yml +++ b/.github/workflows/stale-pr.yml @@ -14,7 +14,7 @@ jobs: if: github.repository_owner == 'pandas-dev' runs-on: ubuntu-22.04 steps: - - uses: actions/stale@v8 + - uses: actions/stale@v9 with: repo-token: ${{ secrets.GITHUB_TOKEN }} stale-pr-message: "This pull request is stale because it has been open for thirty days with no activity. 
Please [update](https://pandas.pydata.org/pandas-docs/stable/development/contributing.html#updating-your-pull-request) and respond to this comment if you're still interested in working on this." From 5ad2251aa5181958f1c8163aeb4e09262c489703 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 11 Dec 2023 09:27:54 -0800 Subject: [PATCH 49/63] Bump actions/setup-python from 4 to 5 (#56450) Bumps [actions/setup-python](https://github.com/actions/setup-python) from 4 to 5. - [Release notes](https://github.com/actions/setup-python/releases) - [Commits](https://github.com/actions/setup-python/compare/v4...v5) --- updated-dependencies: - dependency-name: actions/setup-python dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/code-checks.yml | 2 +- .github/workflows/package-checks.yml | 2 +- .github/workflows/unit-tests.yml | 2 +- .github/workflows/wheels.yml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 4260c0836bbea..b49b9a67c4743 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -170,7 +170,7 @@ jobs: - name: Setup Python id: setup_python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: '3.10' cache: 'pip' diff --git a/.github/workflows/package-checks.yml b/.github/workflows/package-checks.yml index a2c42af53c3a8..04d8b8e006985 100644 --- a/.github/workflows/package-checks.yml +++ b/.github/workflows/package-checks.yml @@ -40,7 +40,7 @@ jobs: - name: Setup Python id: setup_python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: '3.10' diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index ffcd2ae32c09c..88d705dbd9251 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -354,7 +354,7 @@ jobs: fetch-depth: 0 - name: Set up Python Dev Version - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: '3.12-dev' diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 67c6f9bedd9c9..ca5dacad1b599 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -53,7 +53,7 @@ jobs: fetch-depth: 0 - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: '3.11' From 0dd6954284a74bbfb11350acff81a788afebdebc Mon Sep 17 00:00:00 2001 From: Linus Sommer <95619282+linus-md@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:32:42 +0100 Subject: [PATCH 50/63] DOC: Improve ``io/excel/_base.py`` (#56453) * Improve error message and add test * Remove empty line * update * Fix line length * Update test_indexing.py * Add spaces * Remove changes in test_indexing.py * Update base.py --- pandas/io/excel/_base.py | 55 ++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 6d66830ab1dfd..2884294377ec9 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -86,7 +86,7 @@ ) _read_excel_doc = ( """ -Read an Excel file into a pandas DataFrame. +Read an Excel file into a ``pandas`` ``DataFrame``. 
Supports `xls`, `xlsx`, `xlsm`, `xlsb`, `odf`, `ods` and `odt` file
 extensions read from a local filesystem or URL. Supports an option to read
@@ -112,7 +112,7 @@
     Strings are used for sheet names. Integers are used in zero-indexed
     sheet positions (chart sheets do not count as a sheet position).
     Lists of strings/integers are used to request multiple sheets.
-    Specify None to get all worksheets.
+    Specify ``None`` to get all worksheets.

     Available cases:

@@ -121,7 +121,7 @@
     * ``"Sheet1"``: Load sheet with name "Sheet1"
     * ``[0, 1, "Sheet5"]``: Load first, second and sheet named "Sheet5"
       as a dict of `DataFrame`
-    * None: All worksheets.
+    * ``None``: All worksheets.

 header : int, list of int, default 0
     Row (0-indexed) to use for the column labels of the parsed
@@ -155,21 +155,21 @@
     Returns a subset of the columns according to behavior above.
 dtype : Type name or dict of column -> type, default None
     Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32}}
-    Use `object` to preserve data as stored in Excel and not interpret dtype,
-    which will necessarily result in `object` dtype.
+    Use ``object`` to preserve data as stored in Excel and not interpret dtype,
+    which will necessarily result in ``object`` dtype.
     If converters are specified, they will be applied INSTEAD
     of dtype conversion.
-    If you use `None`, it will infer the dtype of each column based on the data.
+    If you use ``None``, it will infer the dtype of each column based on the data.
 engine : str, default None
     If io is not a buffer or path, this must be set to identify io.
     Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb", "calamine".
     Engine compatibility :

-    - "xlrd" supports old-style Excel files (.xls).
-    - "openpyxl" supports newer Excel file formats.
-    - "odf" supports OpenDocument file formats (.odf, .ods, .odt).
-    - "pyxlsb" supports Binary Excel files.
-    - "calamine" supports Excel (.xls, .xlsx, .xlsm, .xlsb)
+    - ``xlrd`` supports old-style Excel files (.xls).
+    - ``openpyxl`` supports newer Excel file formats.
+    - ``odf`` supports OpenDocument file formats (.odf, .ods, .odt).
+    - ``pyxlsb`` supports Binary Excel files.
+    - ``calamine`` supports Excel (.xls, .xlsx, .xlsm, .xlsb)
       and OpenDocument (.ods) file formats.

     .. versionchanged:: 1.2.0
@@ -215,34 +215,34 @@
 + """'.
 keep_default_na : bool, default True
     Whether or not to include the default NaN values when parsing the data.
-    Depending on whether `na_values` is passed in, the behavior is as follows:
+    Depending on whether ``na_values`` is passed in, the behavior is as follows:

-    * If `keep_default_na` is True, and `na_values` are specified, `na_values`
-      is appended to the default NaN values used for parsing.
-    * If `keep_default_na` is True, and `na_values` are not specified, only
+    * If ``keep_default_na`` is True, and ``na_values`` are specified,
+      ``na_values`` is appended to the default NaN values used for parsing.
+    * If ``keep_default_na`` is True, and ``na_values`` are not specified, only
       the default NaN values are used for parsing.
-    * If `keep_default_na` is False, and `na_values` are specified, only
-      the NaN values specified `na_values` are used for parsing.
-    * If `keep_default_na` is False, and `na_values` are not specified, no
+    * If ``keep_default_na`` is False, and ``na_values`` are specified, only
+      the NaN values specified ``na_values`` are used for parsing.
+    * If ``keep_default_na`` is False, and ``na_values`` are not specified, no
       strings will be parsed as NaN.
- Note that if `na_filter` is passed in as False, the `keep_default_na` and - `na_values` parameters will be ignored. + Note that if `na_filter` is passed in as False, the ``keep_default_na`` and + ``na_values`` parameters will be ignored. na_filter : bool, default True Detect missing value markers (empty strings and the value of na_values). In - data without any NAs, passing na_filter=False can improve the performance - of reading a large file. + data without any NAs, passing ``na_filter=False`` can improve the + performance of reading a large file. verbose : bool, default False Indicate number of NA values placed in non-numeric columns. parse_dates : bool, list-like, or dict, default False The behavior is as follows: - * bool. If True -> try parsing the index. - * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 + * ``bool``. If True -> try parsing the index. + * ``list`` of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 each as a separate date column. - * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as + * ``list`` of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as a single date column. - * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call + * ``dict``, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call result 'foo' If a column or index contains an unparsable date, the entire column or @@ -372,7 +372,8 @@ 1 NaN 2 2 #Comment 3 -Comment lines in the excel input file can be skipped using the `comment` kwarg +Comment lines in the excel input file can be skipped using the +``comment`` kwarg. >>> pd.read_excel('tmp.xlsx', index_col=0, comment='#') # doctest: +SKIP Name Value From 224ea88399c250226bea0cc8eedf2f3d857f0bc0 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:34:39 +0100 Subject: [PATCH 51/63] Adjust concat tests for string option (#56446) --- pandas/tests/reshape/concat/test_append_common.py | 4 +++- pandas/tests/reshape/concat/test_categorical.py | 6 ++++-- pandas/tests/reshape/concat/test_empty.py | 10 ++++++---- pandas/tests/reshape/concat/test_index.py | 8 +++++--- 4 files changed, 18 insertions(+), 10 deletions(-) diff --git a/pandas/tests/reshape/concat/test_append_common.py b/pandas/tests/reshape/concat/test_append_common.py index ab20e8c8f6930..31c3ef3176222 100644 --- a/pandas/tests/reshape/concat/test_append_common.py +++ b/pandas/tests/reshape/concat/test_append_common.py @@ -57,10 +57,12 @@ class TestConcatAppendCommon: Test common dtype coercion rules between concat and append. 
""" - def test_dtypes(self, item, index_or_series): + def test_dtypes(self, item, index_or_series, using_infer_string): # to confirm test case covers intended dtypes typ, vals = item obj = index_or_series(vals) + if typ == "object" and using_infer_string: + typ = "string" if isinstance(obj, Index): assert obj.dtype == typ elif isinstance(obj, Series): diff --git a/pandas/tests/reshape/concat/test_categorical.py b/pandas/tests/reshape/concat/test_categorical.py index 7acd0ff4f4c56..bbaaf0abecfbd 100644 --- a/pandas/tests/reshape/concat/test_categorical.py +++ b/pandas/tests/reshape/concat/test_categorical.py @@ -51,7 +51,7 @@ def test_categorical_concat(self, sort): exp["h"] = exp["h"].astype(df2["h"].dtype) tm.assert_frame_equal(res, exp) - def test_categorical_concat_dtypes(self): + def test_categorical_concat_dtypes(self, using_infer_string): # GH8143 index = ["cat", "obj", "num"] cat = Categorical(["a", "b", "c"]) @@ -59,7 +59,9 @@ def test_categorical_concat_dtypes(self): num = Series([1, 2, 3]) df = pd.concat([Series(cat), obj, num], axis=1, keys=index) - result = df.dtypes == "object" + result = df.dtypes == ( + object if not using_infer_string else "string[pyarrow_numpy]" + ) expected = Series([False, True, False], index=index) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/reshape/concat/test_empty.py b/pandas/tests/reshape/concat/test_empty.py index c80f3244dccaf..30ef0a934157b 100644 --- a/pandas/tests/reshape/concat/test_empty.py +++ b/pandas/tests/reshape/concat/test_empty.py @@ -13,7 +13,7 @@ class TestEmptyConcat: - def test_handle_empty_objects(self, sort): + def test_handle_empty_objects(self, sort, using_infer_string): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), columns=list("abcd") ) @@ -26,7 +26,9 @@ def test_handle_empty_objects(self, sort): concatted = concat(frames, axis=0, sort=sort) expected = df.reindex(columns=["a", "b", "c", "d", "foo"]) - expected["foo"] = expected["foo"].astype("O") + expected["foo"] = expected["foo"].astype( + object if not using_infer_string else "string[pyarrow_numpy]" + ) expected.loc[0:4, "foo"] = "bar" tm.assert_frame_equal(concatted, expected) @@ -275,14 +277,14 @@ def test_concat_empty_dataframe(self): expected = DataFrame(columns=["a", "b"]) tm.assert_frame_equal(result, expected) - def test_concat_empty_dataframe_different_dtypes(self): + def test_concat_empty_dataframe_different_dtypes(self, using_infer_string): # 39037 df1 = DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) df2 = DataFrame({"a": [1, 2, 3]}) result = concat([df1[:0], df2[:0]]) assert result["a"].dtype == np.int64 - assert result["b"].dtype == np.object_ + assert result["b"].dtype == np.object_ if not using_infer_string else "string" def test_concat_to_empty_ea(self): """48510 `concat` to an empty EA should maintain type EA dtype.""" diff --git a/pandas/tests/reshape/concat/test_index.py b/pandas/tests/reshape/concat/test_index.py index f835bb953ce6c..52bb9fa0f151b 100644 --- a/pandas/tests/reshape/concat/test_index.py +++ b/pandas/tests/reshape/concat/test_index.py @@ -447,12 +447,14 @@ def test_concat_index_find_common(self, dtype): ) tm.assert_frame_equal(result, expected) - def test_concat_axis_1_sort_false_rangeindex(self): + def test_concat_axis_1_sort_false_rangeindex(self, using_infer_string): # GH 46675 s1 = Series(["a", "b", "c"]) s2 = Series(["a", "b"]) s3 = Series(["a", "b", "c", "d"]) - s4 = Series([], dtype=object) + s4 = Series( + [], dtype=object if not using_infer_string else "string[pyarrow_numpy]" + ) 
result = concat( [s1, s2, s3, s4], sort=False, join="outer", ignore_index=False, axis=1 ) @@ -463,7 +465,7 @@ def test_concat_axis_1_sort_false_rangeindex(self): ["c", np.nan] * 2, [np.nan] * 2 + ["d"] + [np.nan], ], - dtype=object, + dtype=object if not using_infer_string else "string[pyarrow_numpy]", ) tm.assert_frame_equal( result, expected, check_index_type=True, check_column_type=True From fbd4fcddcdc0bd5e83d6d135111003fa6d2681a0 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:36:36 +0100 Subject: [PATCH 52/63] BUG: merge_asof raising incorrect error for strings (#56444) --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/reshape/merge.py | 7 ++++++- pandas/tests/reshape/merge/test_merge_asof.py | 21 ++++++++++++------- 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index acec8379ee5b3..908cd528dc11f 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -665,6 +665,7 @@ Reshaping - Bug in :func:`concat` ignoring ``sort`` parameter when passed :class:`DatetimeIndex` indexes (:issue:`54769`) - Bug in :func:`concat` renaming :class:`Series` when ``ignore_index=False`` (:issue:`15047`) - Bug in :func:`merge_asof` raising ``TypeError`` when ``by`` dtype is not ``object``, ``int64``, or ``uint64`` (:issue:`22794`) +- Bug in :func:`merge_asof` raising incorrect error for string dtype (:issue:`56444`) - Bug in :func:`merge` returning columns in incorrect order when left and/or right is empty (:issue:`51929`) - Bug in :meth:`DataFrame.melt` where an exception was raised if ``var_name`` was not a string (:issue:`55948`) - Bug in :meth:`DataFrame.melt` where it would not preserve the datetime (:issue:`55254`) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index f07c4fb8f7d5f..c43c16cded852 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1986,7 +1986,12 @@ def _validate_left_right_on(self, left_on, right_on): else: ro_dtype = self.right.index.dtype - if is_object_dtype(lo_dtype) or is_object_dtype(ro_dtype): + if ( + is_object_dtype(lo_dtype) + or is_object_dtype(ro_dtype) + or is_string_dtype(lo_dtype) + or is_string_dtype(ro_dtype) + ): raise MergeError( f"Incompatible merge dtype, {repr(ro_dtype)} and " f"{repr(lo_dtype)}, both sides must have numeric dtype" diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index 6d0a405430c9f..f6278e5e2f38b 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -11,6 +11,7 @@ Index, Timedelta, merge_asof, + option_context, to_datetime, ) import pandas._testing as tm @@ -3372,6 +3373,9 @@ def test_left_index_right_index_tolerance(self, unit): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] +) @pytest.mark.parametrize( "kwargs", [{"on": "x"}, {"left_index": True, "right_index": True}] ) @@ -3379,15 +3383,16 @@ def test_left_index_right_index_tolerance(self, unit): "data", [["2019-06-01 00:09:12", "2019-06-01 00:10:29"], [1.0, "2019-06-01 00:10:29"]], ) -def test_merge_asof_non_numerical_dtype(kwargs, data): +def test_merge_asof_non_numerical_dtype(kwargs, data, infer_string): # GH#29130 - left = pd.DataFrame({"x": data}, index=data) - right = pd.DataFrame({"x": data}, index=data) - with pytest.raises( - MergeError, - 
match=r"Incompatible merge dtype, .*, both sides must have numeric dtype", - ): - merge_asof(left, right, **kwargs) + with option_context("future.infer_string", infer_string): + left = pd.DataFrame({"x": data}, index=data) + right = pd.DataFrame({"x": data}, index=data) + with pytest.raises( + MergeError, + match=r"Incompatible merge dtype, .*, both sides must have numeric dtype", + ): + merge_asof(left, right, **kwargs) def test_merge_asof_non_numerical_dtype_object(): From 5b6723cdb8cbf591761756f5c6f181df820e780a Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Mon, 11 Dec 2023 12:37:56 -0500 Subject: [PATCH 53/63] BUG: DataFrame.join with left or right empty not respecting sort=True (#56443) * join with empty not respecting sort param * whatsnew --- doc/source/whatsnew/v2.2.0.rst | 2 +- pandas/core/indexes/base.py | 36 ++++++++++++------- .../tests/frame/methods/test_combine_first.py | 2 +- pandas/tests/frame/test_arithmetic.py | 6 ++-- pandas/tests/reshape/merge/test_join.py | 2 ++ pandas/tests/series/test_logical_ops.py | 8 ++--- 6 files changed, 35 insertions(+), 21 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 908cd528dc11f..3385e28c029a9 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -247,7 +247,7 @@ These are bug fixes that might have notable behavior changes. In previous versions of pandas, :func:`merge` and :meth:`DataFrame.join` did not always return a result that followed the documented sort behavior. pandas now -follows the documented sort behavior in merge and join operations (:issue:`54611`, :issue:`56426`). +follows the documented sort behavior in merge and join operations (:issue:`54611`, :issue:`56426`, :issue:`56443`). As documented, ``sort=True`` sorts the join keys lexicographically in the resulting :class:`DataFrame`. 
With ``sort=False``, the order of the join keys depends on the diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 9d998b46dbeed..5dc4a85ba9792 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4576,9 +4576,6 @@ def join( pother, how=how, level=level, return_indexers=True, sort=sort ) - lindexer: np.ndarray | None - rindexer: np.ndarray | None - # try to figure out the join level # GH3662 if level is None and (self._is_multi or other._is_multi): @@ -4592,25 +4589,38 @@ def join( if level is not None and (self._is_multi or other._is_multi): return self._join_level(other, level, how=how) + lidx: np.ndarray | None + ridx: np.ndarray | None + if len(other) == 0: if how in ("left", "outer"): - join_index = self._view() - rindexer = np.broadcast_to(np.intp(-1), len(join_index)) - return join_index, None, rindexer + if sort and not self.is_monotonic_increasing: + lidx = self.argsort() + join_index = self.take(lidx) + else: + lidx = None + join_index = self._view() + ridx = np.broadcast_to(np.intp(-1), len(join_index)) + return join_index, lidx, ridx elif how in ("right", "inner", "cross"): join_index = other._view() - lindexer = np.array([]) - return join_index, lindexer, None + lidx = np.array([], dtype=np.intp) + return join_index, lidx, None if len(self) == 0: if how in ("right", "outer"): - join_index = other._view() - lindexer = np.broadcast_to(np.intp(-1), len(join_index)) - return join_index, lindexer, None + if sort and not other.is_monotonic_increasing: + ridx = other.argsort() + join_index = other.take(ridx) + else: + ridx = None + join_index = other._view() + lidx = np.broadcast_to(np.intp(-1), len(join_index)) + return join_index, lidx, ridx elif how in ("left", "inner", "cross"): join_index = self._view() - rindexer = np.array([]) - return join_index, None, rindexer + ridx = np.array([], dtype=np.intp) + return join_index, None, ridx if self.dtype != other.dtype: dtype = self._find_common_type_compat(other) diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index 941e4c03464ea..8aeab5dacd8b4 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -82,7 +82,7 @@ def test_combine_first(self, float_frame, using_infer_string): tm.assert_frame_equal(comb, float_frame) comb = DataFrame().combine_first(float_frame) - tm.assert_frame_equal(comb, float_frame) + tm.assert_frame_equal(comb, float_frame.sort_index()) comb = float_frame.combine_first(DataFrame(index=["faz", "boo"])) assert "faz" in comb.index diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index ec3222efab5a8..42ce658701355 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -629,10 +629,12 @@ def test_arith_flex_frame_corner(self, float_frame): # corner cases result = float_frame.add(float_frame[:0]) - tm.assert_frame_equal(result, float_frame * np.nan) + expected = float_frame.sort_index() * np.nan + tm.assert_frame_equal(result, expected) result = float_frame[:0].add(float_frame) - tm.assert_frame_equal(result, float_frame * np.nan) + expected = float_frame.sort_index() * np.nan + tm.assert_frame_equal(result, expected) with pytest.raises(NotImplementedError, match="fill_value"): float_frame.add(float_frame.iloc[0], fill_value=3) diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 3408e6e4731bd..4cc887c32b585 
100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -1023,6 +1023,8 @@ def test_join_empty(left_empty, how, exp): expected = DataFrame(columns=["B", "C"], dtype="int64") if how != "cross": expected = expected.rename_axis("A") + if how == "outer": + expected = expected.sort_index() tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py index 166f52181fed4..153b4bfaaf444 100644 --- a/pandas/tests/series/test_logical_ops.py +++ b/pandas/tests/series/test_logical_ops.py @@ -40,11 +40,11 @@ def test_logical_operators_bool_dtype_with_empty(self): s_empty = Series([], dtype=object) res = s_tft & s_empty - expected = s_fff + expected = s_fff.sort_index() tm.assert_series_equal(res, expected) res = s_tft | s_empty - expected = s_tft + expected = s_tft.sort_index() tm.assert_series_equal(res, expected) def test_logical_operators_int_dtype_with_int_dtype(self): @@ -397,11 +397,11 @@ def test_logical_ops_label_based(self, using_infer_string): empty = Series([], dtype=object) result = a & empty.copy() - expected = Series([False, False, False], list("bca")) + expected = Series([False, False, False], list("abc")) tm.assert_series_equal(result, expected) result = a | empty.copy() - expected = Series([True, False, True], list("bca")) + expected = Series([True, True, False], list("abc")) tm.assert_series_equal(result, expected) # vs non-matching From 8d259c09b88b4a154abc83d723d31538fe843769 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 11 Dec 2023 09:40:18 -0800 Subject: [PATCH 54/63] REF: get_supported_reso->get_supported_dtype (#56439) --- pandas/_libs/tslibs/__init__.py | 10 ++++------ pandas/_libs/tslibs/dtypes.pxd | 6 +++--- pandas/_libs/tslibs/dtypes.pyi | 3 --- pandas/_libs/tslibs/dtypes.pyx | 6 +++--- pandas/_libs/tslibs/np_datetime.pyi | 2 ++ pandas/_libs/tslibs/np_datetime.pyx | 24 ++++++++++++++++++++++++ pandas/core/arrays/_mixins.py | 13 +++---------- pandas/core/arrays/datetimes.py | 20 ++++++++------------ pandas/core/arrays/masked.py | 9 ++------- pandas/core/arrays/numpy_.py | 9 ++------- pandas/core/arrays/timedeltas.py | 17 ++++++----------- pandas/core/construction.py | 14 ++++++++------ pandas/core/dtypes/cast.py | 6 ++---- pandas/core/ops/array_ops.py | 20 ++++++++------------ pandas/core/tools/datetimes.py | 5 ++--- pandas/tests/tslibs/test_api.py | 5 ++--- 16 files changed, 79 insertions(+), 90 deletions(-) diff --git a/pandas/_libs/tslibs/__init__.py b/pandas/_libs/tslibs/__init__.py index b626959203295..88a9a259ac8ec 100644 --- a/pandas/_libs/tslibs/__init__.py +++ b/pandas/_libs/tslibs/__init__.py @@ -30,20 +30,16 @@ "get_unit_from_dtype", "periods_per_day", "periods_per_second", - "is_supported_unit", - "npy_unit_to_abbrev", - "get_supported_reso", "guess_datetime_format", "add_overflowsafe", + "get_supported_dtype", + "is_supported_dtype", ] from pandas._libs.tslibs import dtypes # pylint: disable=import-self from pandas._libs.tslibs.conversion import localize_pydatetime from pandas._libs.tslibs.dtypes import ( Resolution, - get_supported_reso, - is_supported_unit, - npy_unit_to_abbrev, periods_per_day, periods_per_second, ) @@ -58,6 +54,8 @@ OutOfBoundsTimedelta, add_overflowsafe, astype_overflowsafe, + get_supported_dtype, + is_supported_dtype, is_unitless, py_get_unit_from_dtype as get_unit_from_dtype, ) diff --git a/pandas/_libs/tslibs/dtypes.pxd b/pandas/_libs/tslibs/dtypes.pxd index bda4fcf04234b..88cfa6ca60d93 100644 --- 
a/pandas/_libs/tslibs/dtypes.pxd +++ b/pandas/_libs/tslibs/dtypes.pxd @@ -3,13 +3,13 @@ from numpy cimport int64_t from pandas._libs.tslibs.np_datetime cimport NPY_DATETIMEUNIT -cpdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit) +cdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit) cpdef NPY_DATETIMEUNIT abbrev_to_npy_unit(str abbrev) cdef NPY_DATETIMEUNIT freq_group_code_to_npy_unit(int freq) noexcept nogil cpdef int64_t periods_per_day(NPY_DATETIMEUNIT reso=*) except? -1 cpdef int64_t periods_per_second(NPY_DATETIMEUNIT reso) except? -1 -cpdef NPY_DATETIMEUNIT get_supported_reso(NPY_DATETIMEUNIT reso) -cpdef bint is_supported_unit(NPY_DATETIMEUNIT reso) +cdef NPY_DATETIMEUNIT get_supported_reso(NPY_DATETIMEUNIT reso) +cdef bint is_supported_unit(NPY_DATETIMEUNIT reso) cpdef freq_to_period_freqstr(freq_n, freq_name) cdef dict c_OFFSET_TO_PERIOD_FREQSTR diff --git a/pandas/_libs/tslibs/dtypes.pyi b/pandas/_libs/tslibs/dtypes.pyi index 76649aaaa41bf..72d12ca2d9dc7 100644 --- a/pandas/_libs/tslibs/dtypes.pyi +++ b/pandas/_libs/tslibs/dtypes.pyi @@ -4,9 +4,6 @@ OFFSET_TO_PERIOD_FREQSTR: dict[str, str] def periods_per_day(reso: int = ...) -> int: ... def periods_per_second(reso: int) -> int: ... -def is_supported_unit(reso: int) -> bool: ... -def npy_unit_to_abbrev(unit: int) -> str: ... -def get_supported_reso(reso: int) -> int: ... def abbrev_to_npy_unit(abbrev: str) -> int: ... def freq_to_period_freqstr(freq_n: int, freq_name: str) -> str: ... diff --git a/pandas/_libs/tslibs/dtypes.pyx b/pandas/_libs/tslibs/dtypes.pyx index 17f517e5e7264..52e1133e596c5 100644 --- a/pandas/_libs/tslibs/dtypes.pyx +++ b/pandas/_libs/tslibs/dtypes.pyx @@ -559,7 +559,7 @@ class NpyDatetimeUnit(Enum): NPY_FR_GENERIC = NPY_DATETIMEUNIT.NPY_FR_GENERIC -cpdef NPY_DATETIMEUNIT get_supported_reso(NPY_DATETIMEUNIT reso): +cdef NPY_DATETIMEUNIT get_supported_reso(NPY_DATETIMEUNIT reso): # If we have an unsupported reso, return the nearest supported reso. if reso == NPY_DATETIMEUNIT.NPY_FR_GENERIC: # TODO: or raise ValueError? trying this gives unraisable errors, but @@ -572,7 +572,7 @@ cpdef NPY_DATETIMEUNIT get_supported_reso(NPY_DATETIMEUNIT reso): return reso -cpdef bint is_supported_unit(NPY_DATETIMEUNIT reso): +cdef bint is_supported_unit(NPY_DATETIMEUNIT reso): return ( reso == NPY_DATETIMEUNIT.NPY_FR_ns or reso == NPY_DATETIMEUNIT.NPY_FR_us @@ -581,7 +581,7 @@ cpdef bint is_supported_unit(NPY_DATETIMEUNIT reso): ) -cpdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit): +cdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit): if unit == NPY_DATETIMEUNIT.NPY_FR_ns or unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC: # generic -> default to nanoseconds return "ns" diff --git a/pandas/_libs/tslibs/np_datetime.pyi b/pandas/_libs/tslibs/np_datetime.pyi index 5a4ba673dbeff..00ef35c50e532 100644 --- a/pandas/_libs/tslibs/np_datetime.pyi +++ b/pandas/_libs/tslibs/np_datetime.pyi @@ -23,3 +23,5 @@ def add_overflowsafe( left: npt.NDArray[np.int64], right: npt.NDArray[np.int64], ) -> npt.NDArray[np.int64]: ... +def get_supported_dtype(dtype: np.dtype) -> np.dtype: ... +def is_supported_dtype(dtype: np.dtype) -> bool: ... 
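
Reviewer note: to make the intended mapping of the new helpers in the stubs
above concrete, here is a hypothetical sanity-check snippet against a local
build of this branch (the helpers are re-exported from ``pandas._libs.tslibs``
via the ``__init__.py`` change above; the expected results follow the
"nearest supported reso" comments in the implementation diff below):

    import numpy as np
    from pandas._libs.tslibs import get_supported_dtype, is_supported_dtype

    # Day resolution is coarser than what pandas internals support, so it
    # is clamped to the nearest supported unit, seconds.
    assert not is_supported_dtype(np.dtype("M8[D]"))
    assert get_supported_dtype(np.dtype("M8[D]")) == np.dtype("M8[s]")

    # Supported resolutions (s, ms, us, ns) round-trip unchanged, for
    # timedelta64 as well as datetime64.
    assert is_supported_dtype(np.dtype("m8[ns]"))
    assert get_supported_dtype(np.dtype("m8[ns]")) == np.dtype("m8[ns]")

Compared with the old ``is_supported_unit(get_unit_from_dtype(dtype))``
spelling, call sites now pass the dtype straight through, as the call-site
diffs below show.
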
diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 5f5e75b1e64d0..54a5bcf3164ee 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -39,6 +39,8 @@ from numpy cimport ( ) from pandas._libs.tslibs.dtypes cimport ( + get_supported_reso, + is_supported_unit, npy_unit_to_abbrev, npy_unit_to_attrname, ) @@ -91,6 +93,28 @@ def py_get_unit_from_dtype(dtype): return get_unit_from_dtype(dtype) +def get_supported_dtype(dtype: cnp.dtype) -> cnp.dtype: + reso = get_unit_from_dtype(dtype) + new_reso = get_supported_reso(reso) + new_unit = npy_unit_to_abbrev(new_reso) + + # Accessing dtype.kind here incorrectly(?) gives "" instead of "m"/"M", + # so we check type_num instead + if dtype.type_num == cnp.NPY_DATETIME: + new_dtype = np.dtype(f"M8[{new_unit}]") + else: + new_dtype = np.dtype(f"m8[{new_unit}]") + return new_dtype + + +def is_supported_dtype(dtype: cnp.dtype) -> bool: + if dtype.type_num not in [cnp.NPY_DATETIME, cnp.NPY_TIMEDELTA]: + raise ValueError("is_unitless dtype must be datetime64 or timedelta64") + cdef: + NPY_DATETIMEUNIT unit = get_unit_from_dtype(dtype) + return is_supported_unit(unit) + + def is_unitless(dtype: cnp.dtype) -> bool: """ Check if a datetime64 or timedelta64 dtype has no attached unit. diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 8d1f5262e7911..cb8f802239146 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -13,10 +13,7 @@ from pandas._libs import lib from pandas._libs.arrays import NDArrayBacked -from pandas._libs.tslibs import ( - get_unit_from_dtype, - is_supported_unit, -) +from pandas._libs.tslibs import is_supported_dtype from pandas._typing import ( ArrayLike, AxisInt, @@ -141,16 +138,12 @@ def view(self, dtype: Dtype | None = None) -> ArrayLike: cls = dtype.construct_array_type() # type: ignore[assignment] dt64_values = arr.view(f"M8[{dtype.unit}]") return cls(dt64_values, dtype=dtype) - elif lib.is_np_dtype(dtype, "M") and is_supported_unit( - get_unit_from_dtype(dtype) - ): + elif lib.is_np_dtype(dtype, "M") and is_supported_dtype(dtype): from pandas.core.arrays import DatetimeArray dt64_values = arr.view(dtype) return DatetimeArray(dt64_values, dtype=dtype) - elif lib.is_np_dtype(dtype, "m") and is_supported_unit( - get_unit_from_dtype(dtype) - ): + elif lib.is_np_dtype(dtype, "m") and is_supported_dtype(dtype): from pandas.core.arrays import TimedeltaArray td64_values = arr.view(dtype) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 64f08adcd48c4..0074645a482b2 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -27,14 +27,13 @@ astype_overflowsafe, fields, get_resolution, - get_supported_reso, + get_supported_dtype, get_unit_from_dtype, ints_to_pydatetime, is_date_array_normalized, - is_supported_unit, + is_supported_dtype, is_unitless, normalize_i8_timestamps, - npy_unit_to_abbrev, timezones, to_offset, tz_convert_from_utc, @@ -712,7 +711,7 @@ def astype(self, dtype, copy: bool = True): self.tz is None and lib.is_np_dtype(dtype, "M") and not is_unitless(dtype) - and is_supported_unit(get_unit_from_dtype(dtype)) + and is_supported_dtype(dtype) ): # unit conversion e.g. 
datetime64[s] res_values = astype_overflowsafe(self._ndarray, dtype, copy=True) @@ -2307,7 +2306,7 @@ def _sequence_to_dt64( assert isinstance(result, np.ndarray), type(result) assert result.dtype.kind == "M" assert result.dtype != "M8" - assert is_supported_unit(get_unit_from_dtype(result.dtype)) + assert is_supported_dtype(result.dtype) return result, tz @@ -2321,14 +2320,10 @@ def _construct_from_dt64_naive( # lib.is_np_dtype(data.dtype) new_dtype = data.dtype - data_unit = get_unit_from_dtype(new_dtype) - if not is_supported_unit(data_unit): + if not is_supported_dtype(new_dtype): # Cast to the nearest supported unit, generally "s" - new_reso = get_supported_reso(data_unit) - new_unit = npy_unit_to_abbrev(new_reso) - new_dtype = np.dtype(f"M8[{new_unit}]") + new_dtype = get_supported_dtype(new_dtype) data = astype_overflowsafe(data, dtype=new_dtype, copy=False) - data_unit = get_unit_from_dtype(new_dtype) copy = False if data.dtype.byteorder == ">": @@ -2346,6 +2341,7 @@ def _construct_from_dt64_naive( if data.ndim > 1: data = data.ravel() + data_unit = get_unit_from_dtype(new_dtype) data = tzconversion.tz_localize_to_utc( data.view("i8"), tz, ambiguous=ambiguous, creso=data_unit ) @@ -2552,7 +2548,7 @@ def _validate_dt64_dtype(dtype): if ( isinstance(dtype, np.dtype) - and (dtype.kind != "M" or not is_supported_unit(get_unit_from_dtype(dtype))) + and (dtype.kind != "M" or not is_supported_dtype(dtype)) ) or not isinstance(dtype, (np.dtype, DatetimeTZDtype)): raise ValueError( f"Unexpected value for 'dtype': '{dtype}'. " diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 2f0cf7a67c1cc..b35c1033df384 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -15,10 +15,7 @@ lib, missing as libmissing, ) -from pandas._libs.tslibs import ( - get_unit_from_dtype, - is_supported_unit, -) +from pandas._libs.tslibs import is_supported_dtype from pandas._typing import ( ArrayLike, AstypeArg, @@ -876,9 +873,7 @@ def _maybe_mask_result( return BooleanArray(result, mask, copy=False) - elif lib.is_np_dtype(result.dtype, "m") and is_supported_unit( - get_unit_from_dtype(result.dtype) - ): + elif lib.is_np_dtype(result.dtype, "m") and is_supported_dtype(result.dtype): # e.g. 
test_numeric_arr_mul_tdscalar_numexpr_path from pandas.core.arrays import TimedeltaArray diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index efe0c0df45e00..d83a37088daec 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -8,10 +8,7 @@ import numpy as np from pandas._libs import lib -from pandas._libs.tslibs import ( - get_unit_from_dtype, - is_supported_unit, -) +from pandas._libs.tslibs import is_supported_dtype from pandas.compat.numpy import function as nv from pandas.core.dtypes.astype import astype_array @@ -553,9 +550,7 @@ def _cmp_method(self, other, op): def _wrap_ndarray_result(self, result: np.ndarray): # If we have timedelta64[ns] result, return a TimedeltaArray instead # of a NumpyExtensionArray - if result.dtype.kind == "m" and is_supported_unit( - get_unit_from_dtype(result.dtype) - ): + if result.dtype.kind == "m" and is_supported_dtype(result.dtype): from pandas.core.arrays import TimedeltaArray return TimedeltaArray._simple_new(result, dtype=result.dtype) diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index f55d3de8878ad..ccb63d6677b1a 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -19,11 +19,9 @@ Tick, Timedelta, astype_overflowsafe, - get_supported_reso, - get_unit_from_dtype, + get_supported_dtype, iNaT, - is_supported_unit, - npy_unit_to_abbrev, + is_supported_dtype, periods_per_second, ) from pandas._libs.tslibs.conversion import cast_from_unit_vectorized @@ -352,7 +350,7 @@ def astype(self, dtype, copy: bool = True): return self.copy() return self - if is_supported_unit(get_unit_from_dtype(dtype)): + if is_supported_dtype(dtype): # unit conversion e.g. timedelta64[s] res_values = astype_overflowsafe(self._ndarray, dtype, copy=False) return type(self)._simple_new( @@ -1064,12 +1062,9 @@ def sequence_to_td64ns( copy = False elif lib.is_np_dtype(data.dtype, "m"): - data_unit = get_unit_from_dtype(data.dtype) - if not is_supported_unit(data_unit): + if not is_supported_dtype(data.dtype): # cast to closest supported unit, i.e. s or ns - new_reso = get_supported_reso(data_unit) - new_unit = npy_unit_to_abbrev(new_reso) - new_dtype = np.dtype(f"m8[{new_unit}]") + new_dtype = get_supported_dtype(data.dtype) data = astype_overflowsafe(data, dtype=new_dtype, copy=False) copy = False @@ -1173,7 +1168,7 @@ def _validate_td64_dtype(dtype) -> DtypeObj: if not lib.is_np_dtype(dtype, "m"): raise ValueError(f"dtype '{dtype}' is invalid, should be np.timedelta64 dtype") - elif not is_supported_unit(get_unit_from_dtype(dtype)): + elif not is_supported_dtype(dtype): raise ValueError("Supported timedelta64 resolutions are 's', 'ms', 'us', 'ns'") return dtype diff --git a/pandas/core/construction.py b/pandas/core/construction.py index a0a92a99abe51..8cb76e57eba7e 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -24,8 +24,8 @@ from pandas._libs import lib from pandas._libs.tslibs import ( Period, - get_unit_from_dtype, - is_supported_unit, + get_supported_dtype, + is_supported_dtype, ) from pandas._typing import ( AnyArrayLike, @@ -370,9 +370,9 @@ def array( # 1. datetime64[ns,us,ms,s] # 2. timedelta64[ns,us,ms,s] # so that a DatetimeArray is returned. 
- if lib.is_np_dtype(dtype, "M") and is_supported_unit(get_unit_from_dtype(dtype)): + if lib.is_np_dtype(dtype, "M") and is_supported_dtype(dtype): return DatetimeArray._from_sequence(data, dtype=dtype, copy=copy) - if lib.is_np_dtype(dtype, "m") and is_supported_unit(get_unit_from_dtype(dtype)): + if lib.is_np_dtype(dtype, "m") and is_supported_dtype(dtype): return TimedeltaArray._from_sequence(data, dtype=dtype, copy=copy) elif lib.is_np_dtype(dtype, "mM"): @@ -490,12 +490,14 @@ def ensure_wrapped_if_datetimelike(arr): if arr.dtype.kind == "M": from pandas.core.arrays import DatetimeArray - return DatetimeArray._from_sequence(arr) + dtype = get_supported_dtype(arr.dtype) + return DatetimeArray._from_sequence(arr, dtype=dtype) elif arr.dtype.kind == "m": from pandas.core.arrays import TimedeltaArray - return TimedeltaArray._from_sequence(arr) + dtype = get_supported_dtype(arr.dtype) + return TimedeltaArray._from_sequence(arr, dtype=dtype) return arr diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 320f028f4484c..d5144174d3c71 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -36,8 +36,7 @@ OutOfBoundsTimedelta, Timedelta, Timestamp, - get_unit_from_dtype, - is_supported_unit, + is_supported_dtype, ) from pandas._libs.tslibs.timedeltas import array_to_timedelta64 from pandas.errors import ( @@ -1266,8 +1265,7 @@ def _ensure_nanosecond_dtype(dtype: DtypeObj) -> None: pass elif dtype.kind in "mM": - reso = get_unit_from_dtype(dtype) - if not is_supported_unit(reso): + if not is_supported_dtype(dtype): # pre-2.0 we would silently swap in nanos for lower-resolutions, # raise for above-nano resolutions if dtype.name in ["datetime64", "timedelta64"]: diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index b39930da9f711..d8a772aac6082 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -24,11 +24,9 @@ ) from pandas._libs.tslibs import ( BaseOffset, - get_supported_reso, - get_unit_from_dtype, - is_supported_unit, + get_supported_dtype, + is_supported_dtype, is_unitless, - npy_unit_to_abbrev, ) from pandas.util._exceptions import find_stack_level @@ -543,10 +541,9 @@ def maybe_prepare_scalar_for_op(obj, shape: Shape): # GH 52295 if is_unitless(obj.dtype): obj = obj.astype("datetime64[ns]") - elif not is_supported_unit(get_unit_from_dtype(obj.dtype)): - unit = get_unit_from_dtype(obj.dtype) - closest_unit = npy_unit_to_abbrev(get_supported_reso(unit)) - obj = obj.astype(f"datetime64[{closest_unit}]") + elif not is_supported_dtype(obj.dtype): + new_dtype = get_supported_dtype(obj.dtype) + obj = obj.astype(new_dtype) right = np.broadcast_to(obj, shape) return DatetimeArray(right) @@ -562,10 +559,9 @@ def maybe_prepare_scalar_for_op(obj, shape: Shape): # GH 52295 if is_unitless(obj.dtype): obj = obj.astype("timedelta64[ns]") - elif not is_supported_unit(get_unit_from_dtype(obj.dtype)): - unit = get_unit_from_dtype(obj.dtype) - closest_unit = npy_unit_to_abbrev(get_supported_reso(unit)) - obj = obj.astype(f"timedelta64[{closest_unit}]") + elif not is_supported_dtype(obj.dtype): + new_dtype = get_supported_dtype(obj.dtype) + obj = obj.astype(new_dtype) right = np.broadcast_to(obj, shape) return TimedeltaArray(right) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 26cbc77e4e8ae..5ebf1e442733e 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -25,8 +25,7 @@ Timedelta, Timestamp, astype_overflowsafe, - get_unit_from_dtype, - 
is_supported_unit, + is_supported_dtype, timezones as libtimezones, ) from pandas._libs.tslibs.conversion import cast_from_unit_vectorized @@ -385,7 +384,7 @@ def _convert_listlike_datetimes( return arg elif lib.is_np_dtype(arg_dtype, "M"): - if not is_supported_unit(get_unit_from_dtype(arg_dtype)): + if not is_supported_dtype(arg_dtype): # We go to closest supported reso, i.e. "s" arg = astype_overflowsafe( # TODO: looks like we incorrectly raise with errors=="ignore" diff --git a/pandas/tests/tslibs/test_api.py b/pandas/tests/tslibs/test_api.py index e02cea2fef426..42d055326c2a5 100644 --- a/pandas/tests/tslibs/test_api.py +++ b/pandas/tests/tslibs/test_api.py @@ -54,11 +54,10 @@ def test_namespace(): "get_unit_from_dtype", "periods_per_day", "periods_per_second", - "is_supported_unit", - "get_supported_reso", - "npy_unit_to_abbrev", "guess_datetime_format", "add_overflowsafe", + "get_supported_dtype", + "is_supported_dtype", ] expected = set(submodules + api) From feacb6faccb6e1ce016b61e091b46c11ebbe86c9 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 11 Dec 2023 09:41:45 -0800 Subject: [PATCH 55/63] TST: de-duplicate PeriodIndex constructor tests (#56435) * rename test_period_asfreq->test_period * TST: de-duplicate PeriodIndex constructor tests --- .../tests/indexes/period/test_constructors.py | 159 +++++++++++------- pandas/tests/indexes/period/test_period.py | 136 --------------- .../tests/indexes/period/test_period_range.py | 137 ++++++++++----- pandas/tests/scalar/period/test_period.py | 57 ++++--- .../{test_period_asfreq.py => test_period.py} | 7 + 5 files changed, 232 insertions(+), 264 deletions(-) rename pandas/tests/tslibs/{test_period_asfreq.py => test_period.py} (92%) diff --git a/pandas/tests/indexes/period/test_constructors.py b/pandas/tests/indexes/period/test_constructors.py index aecd3b3bace9a..d53581fab40c7 100644 --- a/pandas/tests/indexes/period/test_constructors.py +++ b/pandas/tests/indexes/period/test_constructors.py @@ -19,7 +19,54 @@ from pandas.core.arrays import PeriodArray +class TestPeriodIndexDisallowedFreqs: + @pytest.mark.parametrize( + "freq,freq_depr", + [ + ("2M", "2ME"), + ("2Q-MAR", "2QE-MAR"), + ("2Y-FEB", "2YE-FEB"), + ], + ) + def test_period_index_frequency_ME_error_message(self, freq, freq_depr): + # GH#52064 + msg = f"for Period, please use '{freq[1:]}' instead of '{freq_depr[1:]}'" + + with pytest.raises(ValueError, match=msg): + PeriodIndex(["2020-01-01", "2020-01-02"], freq=freq_depr) + + with pytest.raises(ValueError, match=msg): + period_range(start="2020-01-01", end="2020-01-02", freq=freq_depr) + + @pytest.mark.parametrize("freq_depr", ["2SME", "2CBME", "2BYE"]) + def test_period_index_frequency_invalid_freq(self, freq_depr): + # GH#9586 + msg = f"Invalid frequency: {freq_depr[1:]}" + + with pytest.raises(ValueError, match=msg): + period_range("2020-01", "2020-05", freq=freq_depr) + with pytest.raises(ValueError, match=msg): + PeriodIndex(["2020-01", "2020-05"], freq=freq_depr) + + class TestPeriodIndex: + def test_from_ordinals(self): + Period(ordinal=-1000, freq="Y") + Period(ordinal=0, freq="Y") + + msg = "The 'ordinal' keyword in PeriodIndex is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + idx1 = PeriodIndex(ordinal=[-1, 0, 1], freq="Y") + with tm.assert_produces_warning(FutureWarning, match=msg): + idx2 = PeriodIndex(ordinal=np.array([-1, 0, 1]), freq="Y") + tm.assert_index_equal(idx1, idx2) + + alt1 = PeriodIndex.from_ordinals([-1, 0, 1], freq="Y") + tm.assert_index_equal(alt1, idx1) + + alt2 = 
PeriodIndex.from_ordinals(np.array([-1, 0, 1]), freq="Y") + tm.assert_index_equal(alt2, idx2) + def test_keyword_mismatch(self): # GH#55961 we should get exactly one of data/ordinals/**fields per = Period("2016-01-01", "D") @@ -131,11 +178,6 @@ def test_constructor_field_arrays(self): exp = period_range("2007-01", periods=3, freq="M") tm.assert_index_equal(idx, exp) - def test_constructor_U(self): - # U was used as undefined period - with pytest.raises(ValueError, match="Invalid frequency: X"): - period_range("2007-1-1", periods=500, freq="X") - def test_constructor_nano(self): idx = period_range( start=Period(ordinal=1, freq="ns"), @@ -371,49 +413,12 @@ def test_constructor_mixed(self): exp = PeriodIndex(["2011-01-01", "NaT", "2012-01-01"], freq="D") tm.assert_index_equal(idx, exp) - def test_constructor_simple_new(self): - idx = period_range("2007-01", name="p", periods=2, freq="M") - - with pytest.raises(AssertionError, match=""): - idx._simple_new(idx, name="p") - - result = idx._simple_new(idx._data, name="p") - tm.assert_index_equal(result, idx) - - msg = "Should be numpy array of type i8" - with pytest.raises(AssertionError, match=msg): - # Need ndarray, not int64 Index - type(idx._data)._simple_new(Index(idx.asi8), dtype=idx.dtype) - - arr = type(idx._data)._simple_new(idx.asi8, dtype=idx.dtype) - result = idx._simple_new(arr, name="p") - tm.assert_index_equal(result, idx) - - def test_constructor_simple_new_empty(self): - # GH13079 - idx = PeriodIndex([], freq="M", name="p") - with pytest.raises(AssertionError, match=""): - idx._simple_new(idx, name="p") - - result = idx._simple_new(idx._data, name="p") - tm.assert_index_equal(result, idx) - @pytest.mark.parametrize("floats", [[1.1, 2.1], np.array([1.1, 2.1])]) def test_constructor_floats(self, floats): - with pytest.raises(AssertionError, match=" Date: Mon, 11 Dec 2023 09:43:41 -0800 Subject: [PATCH 56/63] DEPR: make_block (#56422) * DEPR: make_block * lint fixup --- doc/source/whatsnew/v2.2.0.rst | 2 +- pandas/core/internals/api.py | 11 +++++++++- pandas/tests/internals/test_api.py | 4 +++- pandas/tests/internals/test_internals.py | 20 +++++++++++++------ .../tests/io/parser/common/test_chunksize.py | 1 + pandas/tests/io/parser/test_parse_dates.py | 9 ++++++--- 6 files changed, 35 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 3385e28c029a9..8209525721b98 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -432,6 +432,7 @@ Other Deprecations ^^^^^^^^^^^^^^^^^^ - Changed :meth:`Timedelta.resolution_string` to return ``h``, ``min``, ``s``, ``ms``, ``us``, and ``ns`` instead of ``H``, ``T``, ``S``, ``L``, ``U``, and ``N``, for compatibility with respective deprecations in frequency aliases (:issue:`52536`) - Deprecated :func:`pandas.api.types.is_interval` and :func:`pandas.api.types.is_period`, use ``isinstance(obj, pd.Interval)`` and ``isinstance(obj, pd.Period)`` instead (:issue:`55264`) +- Deprecated :func:`pd.core.internals.api.make_block`, use public APIs instead (:issue:`40226`) - Deprecated :func:`read_gbq` and :meth:`DataFrame.to_gbq`. 
Use ``pandas_gbq.read_gbq`` and ``pandas_gbq.to_gbq`` instead https://pandas-gbq.readthedocs.io/en/latest/api.html (:issue:`55525`) - Deprecated :meth:`.DataFrameGroupBy.fillna` and :meth:`.SeriesGroupBy.fillna`; use :meth:`.DataFrameGroupBy.ffill`, :meth:`.DataFrameGroupBy.bfill` for forward and backward filling or :meth:`.DataFrame.fillna` to fill with a single value (or the Series equivalents) (:issue:`55718`) - Deprecated :meth:`Index.format`, use ``index.astype(str)`` or ``index.map(formatter)`` instead (:issue:`55413`) @@ -484,7 +485,6 @@ Other Deprecations - Deprecated the extension test classes ``BaseNoReduceTests``, ``BaseBooleanReduceTests``, and ``BaseNumericReduceTests``, use ``BaseReduceTests`` instead (:issue:`54663`) - Deprecated the option ``mode.data_manager`` and the ``ArrayManager``; only the ``BlockManager`` will be available in future versions (:issue:`55043`) - Deprecated the previous implementation of :class:`DataFrame.stack`; specify ``future_stack=True`` to adopt the future version (:issue:`53515`) -- .. --------------------------------------------------------------------------- .. _whatsnew_220.performance: diff --git a/pandas/core/internals/api.py b/pandas/core/internals/api.py index b0b3937ca47ea..e5ef44d07061e 100644 --- a/pandas/core/internals/api.py +++ b/pandas/core/internals/api.py @@ -9,10 +9,12 @@ from __future__ import annotations from typing import TYPE_CHECKING +import warnings import numpy as np from pandas._libs.internals import BlockPlacement +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import pandas_dtype from pandas.core.dtypes.dtypes import ( @@ -50,6 +52,14 @@ def make_block( - Block.make_block_same_class - Block.__init__ """ + warnings.warn( + # GH#40226 + "make_block is deprecated and will be removed in a future version. " + "Use public APIs instead.", + DeprecationWarning, + stacklevel=find_stack_level(), + ) + if dtype is not None: dtype = pandas_dtype(dtype) @@ -113,7 +123,6 @@ def maybe_infer_ndim(values, placement: BlockPlacement, ndim: int | None) -> int def __getattr__(name: str): # GH#55139 - import warnings if name in [ "Block", diff --git a/pandas/tests/internals/test_api.py b/pandas/tests/internals/test_api.py index 1251a6ae97a1c..f816cef38b9ab 100644 --- a/pandas/tests/internals/test_api.py +++ b/pandas/tests/internals/test_api.py @@ -68,7 +68,9 @@ def test_deprecations(name): def test_make_block_2d_with_dti(): # GH#41168 dti = pd.date_range("2012", periods=3, tz="UTC") - blk = api.make_block(dti, placement=[0]) + msg = "make_block is deprecated" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + blk = api.make_block(dti, placement=[0]) assert blk.shape == (1, 3) assert blk.values.shape == (1, 3) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index ce88bae6e02f2..2265522bc7ecb 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -1383,9 +1383,11 @@ def test_validate_ndim(): values = np.array([1.0, 2.0]) placement = BlockPlacement(slice(2)) msg = r"Wrong number of dimensions. 
values.ndim != ndim \[1 != 2\]" + depr_msg = "make_block is deprecated" with pytest.raises(ValueError, match=msg): - make_block(values, placement, ndim=2) + with tm.assert_produces_warning(DeprecationWarning, match=depr_msg): + make_block(values, placement, ndim=2) def test_block_shape(): @@ -1400,8 +1402,12 @@ def test_make_block_no_pandas_array(block_maker): # https://github.com/pandas-dev/pandas/pull/24866 arr = pd.arrays.NumpyExtensionArray(np.array([1, 2])) + warn = None if block_maker is not make_block else DeprecationWarning + msg = "make_block is deprecated and will be removed in a future version" + # NumpyExtensionArray, no dtype - result = block_maker(arr, BlockPlacement(slice(len(arr))), ndim=arr.ndim) + with tm.assert_produces_warning(warn, match=msg): + result = block_maker(arr, BlockPlacement(slice(len(arr))), ndim=arr.ndim) assert result.dtype.kind in ["i", "u"] if block_maker is make_block: @@ -1409,14 +1415,16 @@ def test_make_block_no_pandas_array(block_maker): assert result.is_extension is False # NumpyExtensionArray, NumpyEADtype - result = block_maker(arr, slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim) + with tm.assert_produces_warning(warn, match=msg): + result = block_maker(arr, slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim) assert result.dtype.kind in ["i", "u"] assert result.is_extension is False # new_block no longer taked dtype keyword # ndarray, NumpyEADtype - result = block_maker( - arr.to_numpy(), slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim - ) + with tm.assert_produces_warning(warn, match=msg): + result = block_maker( + arr.to_numpy(), slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim + ) assert result.dtype.kind in ["i", "u"] assert result.is_extension is False diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py index baed74fc212e4..5e47bcc1c5b0e 100644 --- a/pandas/tests/io/parser/common/test_chunksize.py +++ b/pandas/tests/io/parser/common/test_chunksize.py @@ -233,6 +233,7 @@ def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch): assert result.a.dtype == float +@pytest.mark.filterwarnings("ignore:make_block is deprecated:FutureWarning") def test_warn_if_chunks_have_mismatched_type(all_parsers): warning_type = None parser = all_parsers diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 9355d6089b742..d65961f9483d8 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -33,9 +33,12 @@ from pandas.io.parsers import read_csv -pytestmark = pytest.mark.filterwarnings( - "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" -) +pytestmark = [ + pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" + ), + pytest.mark.filterwarnings("ignore:make_block is deprecated:DeprecationWarning"), +] xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") From d352d5a012a59ca41203c7d9b49554d43de9f903 Mon Sep 17 00:00:00 2001 From: ccccjone <144291871+ccccjone@users.noreply.github.com> Date: Mon, 11 Dec 2023 11:06:17 -0800 Subject: [PATCH 57/63] TST: Improved test coverage for Styler.bar error conditions (#56341) * Improved test coverage for Styler.bar error conditions * Fixed the code style issue causing test failure * Fixed a Styler.bar bug of missing value * Adopted another tricky way to fix the bug * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see 
https://pre-commit.ci * Added a test for Styler.bar with pyarrow * Updated with code style --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- pandas/io/formats/style.py | 5 ++- pandas/tests/io/formats/style/test_bar.py | 53 ++++++++++++++++++++++- 2 files changed, 55 insertions(+), 3 deletions(-) diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 39d5b45862a8f..7d5c354aef002 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -4082,8 +4082,9 @@ def css_calc(x, left: float, right: float, align: str, color: str | list | tuple return ret values = data.to_numpy() - left = np.nanmin(values) if vmin is None else vmin - right = np.nanmax(values) if vmax is None else vmax + # A tricky way to address the issue where np.nanmin/np.nanmax fail to handle pd.NA. + left = np.nanmin(data.min(skipna=True)) if vmin is None else vmin + right = np.nanmax(data.max(skipna=True)) if vmax is None else vmax z: float = 0 # adjustment to translate data if align == "mid": diff --git a/pandas/tests/io/formats/style/test_bar.py b/pandas/tests/io/formats/style/test_bar.py index 19884aaac86a7..b0e4712e8bb3d 100644 --- a/pandas/tests/io/formats/style/test_bar.py +++ b/pandas/tests/io/formats/style/test_bar.py @@ -1,7 +1,13 @@ +import io + import numpy as np import pytest -from pandas import DataFrame +from pandas import ( + NA, + DataFrame, + read_csv, +) pytest.importorskip("jinja2") @@ -305,3 +311,48 @@ def test_bar_value_error_raises(): msg = r"`height` must be a value in \[0, 100\]" with pytest.raises(ValueError, match=msg): df.style.bar(height=200).to_html() + + +def test_bar_color_and_cmap_error_raises(): + df = DataFrame({"A": [1, 2, 3, 4]}) + msg = "`color` and `cmap` cannot both be given" + # Test that providing both color and cmap raises a ValueError + with pytest.raises(ValueError, match=msg): + df.style.bar(color="#d65f5f", cmap="viridis").to_html() + + +def test_bar_invalid_color_type_error_raises(): + df = DataFrame({"A": [1, 2, 3, 4]}) + msg = ( + r"`color` must be string or list or tuple of 2 strings," + r"\(eg: color=\['#d65f5f', '#5fba7d'\]\)" + ) + # Test that providing an invalid color type raises a ValueError + with pytest.raises(ValueError, match=msg): + df.style.bar(color=123).to_html() + + # Test that providing a color list with more than two elements raises a ValueError + with pytest.raises(ValueError, match=msg): + df.style.bar(color=["#d65f5f", "#5fba7d", "#abcdef"]).to_html() + + +def test_styler_bar_with_NA_values(): + df1 = DataFrame({"A": [1, 2, NA, 4]}) + df2 = DataFrame([[NA, NA], [NA, NA]]) + expected_substring = "style type=" + html_output1 = df1.style.bar(subset="A").to_html() + html_output2 = df2.style.bar(align="left", axis=None).to_html() + assert expected_substring in html_output1 + assert expected_substring in html_output2 + + +def test_style_bar_with_pyarrow_NA_values(): + data = """name,age,test1,test2,teacher + Adam,15,95.0,80,Ashby + Bob,16,81.0,82,Ashby + Dave,16,89.0,84,Jones + Fred,15,,88,Jones""" + df = read_csv(io.StringIO(data), dtype_backend="pyarrow") + expected_substring = "style type=" + html_output = df.style.bar(subset="test1").to_html() + assert expected_substring in html_output From 7710240d74259f555dec7448f2d3378a11172a72 Mon Sep 17 00:00:00 2001 From: MainHanzo <33153091+MainHanzo@users.noreply.github.com> Date: Wed, 6 Dec 2023 21:22:56 +0100 Subject: [PATCH 58/63] Update nlargest nsmallest doc --- pandas/core/frame.py | 24 ++++++++++++++++++++---- 1 file 
changed, 20 insertions(+), 4 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e741fa7b37f33..1329cc451eec9 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7505,7 +7505,7 @@ def nlargest( - ``first`` : prioritize the first occurrence(s) - ``last`` : prioritize the last occurrence(s) - - ``all`` : do not drop any duplicates, even it means + - ``all`` : keep all the ties of the smallest item even it means selecting more than `n` items. Returns @@ -7568,7 +7568,9 @@ def nlargest( Italy 59000000 1937894 IT Brunei 434000 12128 BN - When using ``keep='all'``, all duplicate items are maintained: + When using ``keep='all'``, the number of element kept can go beyond n + if there are duplicates value for the smallest element, all the + ties are kept: >>> df.nlargest(3, 'population', keep='all') population GDP alpha-2 @@ -7578,6 +7580,17 @@ def nlargest( Maldives 434000 4520 MV Brunei 434000 12128 BN + However, ``nlargest`` does not keep n distinct largest elements: + + >>> df.nlargest(5, 'population', keep='all') + population GDP alpha-2 + France 65000000 2583560 FR + Italy 59000000 1937894 IT + Malta 434000 12011 MT + Maldives 434000 4520 MV + Brunei 434000 12128 BN + + To order by the largest values in column "population" and then "GDP", we can specify multiple columns like in the next example. @@ -7614,7 +7627,7 @@ def nsmallest( - ``first`` : take the first occurrence. - ``last`` : take the last occurrence. - - ``all`` : do not drop any duplicates, even it means + - ``all`` : keep all the ties of the largest item even it means selecting more than `n` items. Returns @@ -7669,7 +7682,10 @@ def nsmallest( Tuvalu 11300 38 TV Nauru 337000 182 NR - When using ``keep='all'``, all duplicate items are maintained: + When using ``keep='all'``, the number of element kept can go beyond n + if there are duplicates value for the largest element, all the + ties are kept. However, ``nsmallest`` does not keep n distinct + smallest elements: >>> df.nsmallest(3, 'population', keep='all') population GDP alpha-2 From da0026c75b072bc101cb71307244365aec9609e8 Mon Sep 17 00:00:00 2001 From: MainHanzo <33153091+MainHanzo@users.noreply.github.com> Date: Wed, 6 Dec 2023 22:21:09 +0100 Subject: [PATCH 59/63] double line break --- pandas/core/frame.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1329cc451eec9..9514a6684ab75 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7590,7 +7590,6 @@ def nlargest( Maldives 434000 4520 MV Brunei 434000 12128 BN - To order by the largest values in column "population" and then "GDP", we can specify multiple columns like in the next example. From 479728f2a855bb21941fc6b0ae25f960d5200928 Mon Sep 17 00:00:00 2001 From: MainHanzo <33153091+MainHanzo@users.noreply.github.com> Date: Thu, 7 Dec 2023 16:28:59 +0100 Subject: [PATCH 60/63] code review --- pandas/core/frame.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9514a6684ab75..a22142904e73d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7505,8 +7505,8 @@ def nlargest( - ``first`` : prioritize the first occurrence(s) - ``last`` : prioritize the last occurrence(s) - - ``all`` : keep all the ties of the smallest item even it means - selecting more than `n` items. + - ``all`` : keep all the ties of the smallest item even if it means + selecting more than ``n`` items. 
From da0026c75b072bc101cb71307244365aec9609e8 Mon Sep 17 00:00:00 2001
From: MainHanzo <33153091+MainHanzo@users.noreply.github.com>
Date: Wed, 6 Dec 2023 22:21:09 +0100
Subject: [PATCH 59/63] double line break

---
 pandas/core/frame.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 1329cc451eec9..9514a6684ab75 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -7590,7 +7590,6 @@ def nlargest(
         Maldives      434000     4520      MV
         Brunei        434000    12128      BN

-
         To order by the largest values in column "population" and then
         "GDP", we can specify multiple columns like in the next example.

From 479728f2a855bb21941fc6b0ae25f960d5200928 Mon Sep 17 00:00:00 2001
From: MainHanzo <33153091+MainHanzo@users.noreply.github.com>
Date: Thu, 7 Dec 2023 16:28:59 +0100
Subject: [PATCH 60/63] code review

---
 pandas/core/frame.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 9514a6684ab75..a22142904e73d 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -7505,8 +7505,8 @@ def nlargest(
         - ``first`` : prioritize the first occurrence(s)
         - ``last`` : prioritize the last occurrence(s)
-        - ``all`` : keep all the ties of the smallest item even it means
-          selecting more than `n` items.
+        - ``all`` : keep all the ties of the smallest item even if it means
+          selecting more than ``n`` items.

         Returns
         -------
@@ -7568,8 +7568,8 @@ def nlargest(
         Italy     59000000  1937894      IT
         Brunei      434000    12128      BN

-        When using ``keep='all'``, the number of element kept can go beyond n
-        if there are duplicates value for the smallest element, all the
+        When using ``keep='all'``, the number of element kept can go beyond ``n``
+        if there are duplicate values for the smallest element, all the
         ties are kept:

         >>> df.nlargest(3, 'population', keep='all')
@@ -7580,7 +7580,7 @@ def nlargest(
         Maldives      434000     4520      MV
         Brunei        434000    12128      BN

-        However, ``nlargest`` does not keep n distinct largest elements:
+        However, ``nlargest`` does not keep ``n`` distinct largest elements:

         >>> df.nlargest(5, 'population', keep='all')
                   population      GDP alpha-2
@@ -7626,8 +7626,8 @@ def nsmallest(
         - ``first`` : take the first occurrence.
         - ``last`` : take the last occurrence.
-        - ``all`` : keep all the ties of the largest item even it means
-          selecting more than `n` items.
+        - ``all`` : keep all the ties of the largest item even if it means
+          selecting more than ``n`` items.

         Returns
         -------
@@ -7681,9 +7681,9 @@ def nsmallest(
         Tuvalu     11300      38      TV
         Nauru     337000     182      NR

-        When using ``keep='all'``, the number of element kept can go beyond n
-        if there are duplicates value for the largest element, all the
-        ties are kept. However, ``nsmallest`` does not keep n distinct
+        When using ``keep='all'``, the number of element kept can go beyond ``n``
+        if there are duplicate values for the largest element, all the
+        ties are kept. However, ``nsmallest`` does not keep ``n`` distinct
         smallest elements:

         >>> df.nsmallest(3, 'population', keep='all')
                   population    GDP alpha-2

From 56ff72417b4076132e6947c03673c10f964e1a9c Mon Sep 17 00:00:00 2001
From: MainHanzo <33153091+MainHanzo@users.noreply.github.com>
Date: Sat, 9 Dec 2023 01:54:54 +0100
Subject: [PATCH 61/63] code review for nsmallest

---
 pandas/core/frame.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index a22142904e73d..49f57c47d3ecd 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -7683,8 +7683,7 @@ def nsmallest(
         When using ``keep='all'``, the number of element kept can go beyond ``n``
         if there are duplicate values for the largest element, all the
-        ties are kept. However, ``nsmallest`` does not keep ``n`` distinct
-        smallest elements:
+        ties are kept.

         >>> df.nsmallest(3, 'population', keep='all')
                   population    GDP alpha-2
@@ -7692,6 +7691,16 @@ def nsmallest(
         Anguilla   11300     311      AI
         Iceland   337000   17036      IS
         Nauru     337000     182      NR
+
+        However, ``nsmallest`` does not keep ``n`` distinct
+        smallest elements:
+
+        >>> df.nsmallest(4, 'population', keep='all')
+                  population    GDP alpha-2
+        Tuvalu     11300      38      TV
+        Anguilla   11300     311      AI
+        Iceland   337000   17036      IS
+        Nauru     337000     182      NR

         To order by the smallest values in column "population" and then
         "GDP", we can specify multiple columns like in the next example.
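The nsmallest side behaves symmetrically, and the point of the reordered example above — that ``n`` counts rows, not distinct values — is easy to verify. A minimal sketch with the doctests' data, again trimmed to one column:

    import pandas as pd

    df = pd.DataFrame(
        {"population": [11_300, 11_300, 337_000, 337_000]},
        index=["Tuvalu", "Anguilla", "Iceland", "Nauru"],
    )

    # nsmallest(3, ...) returns 4 rows, because Nauru ties with Iceland at
    # 337_000; nsmallest(1, ...) returns the two rows tied at 11_300 rather
    # than one row per distinct value.
    print(df.nsmallest(3, "population", keep="all"))
    print(df.nsmallest(1, "population", keep="all"))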
From 07399030aabe13df1e15e3fe30dfb400e04efafb Mon Sep 17 00:00:00 2001
From: MainHanzo <33153091+MainHanzo@users.noreply.github.com>
Date: Mon, 11 Dec 2023 21:42:14 +0100
Subject: [PATCH 62/63] whitespace

---
 pandas/core/frame.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 49f57c47d3ecd..ce4f10b1bea1b 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -7568,7 +7568,7 @@ def nlargest(
         Italy     59000000  1937894      IT
         Brunei      434000    12128      BN

-        When using ``keep='all'``, the number of element kept can go beyond ``n`` 
+        When using ``keep='all'``, the number of element kept can go beyond ``n``
         if there are duplicate values for the smallest element, all the
         ties are kept:

@@ -7581,7 +7581,7 @@ def nlargest(
         Maldives      434000     4520      MV
         Brunei        434000    12128      BN

         However, ``nlargest`` does not keep ``n`` distinct largest elements:
-        
+
         >>> df.nlargest(5, 'population', keep='all')
                   population      GDP alpha-2
         France      65000000  2583560      FR
@@ -7589,7 +7589,7 @@ def nlargest(
         Malta         434000    12011      MT
         Maldives      434000     4520      MV
         Brunei        434000    12128      BN
-        
+
         To order by the largest values in column "population" and then
         "GDP", we can specify multiple columns like in the next example.
@@ -7691,7 +7691,7 @@ def nsmallest(
         Anguilla   11300     311      AI
         Iceland   337000   17036      IS
         Nauru     337000     182      NR
-        
+
         However, ``nsmallest`` does not keep ``n`` distinct
         smallest elements:

From a803dbfa4043c3ba2ac75e91a3bbb5718374c296 Mon Sep 17 00:00:00 2001
From: MainHanzo <33153091+MainHanzo@users.noreply.github.com>
Date: Mon, 11 Dec 2023 22:18:16 +0100
Subject: [PATCH 63/63] whitespace

---
 pandas/core/frame.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index ce4f10b1bea1b..8ba9926c054ba 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -7681,9 +7681,9 @@ def nsmallest(
         Tuvalu     11300      38      TV
         Nauru     337000     182      NR

-        When using ``keep='all'``, the number of element kept can go beyond ``n`` 
+        When using ``keep='all'``, the number of element kept can go beyond ``n``
         if there are duplicate values for the largest element, all the
-        ties are kept. 
+        ties are kept.

         >>> df.nsmallest(3, 'population', keep='all')
                   population    GDP alpha-2