diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 341ac2c0b48ec..c810af32f7480 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -9,16 +9,23 @@ from pandas.compat import pa_version_under10p1 +from pandas.core.dtypes.missing import isna + if not pa_version_under10p1: import pyarrow as pa import pyarrow.compute as pc if TYPE_CHECKING: - from pandas._typing import Self + from collections.abc import Sized + + from pandas._typing import ( + Scalar, + Self, + ) class ArrowStringArrayMixin: - _pa_array = None + _pa_array: Sized def __init__(self, *args, **kwargs) -> None: raise NotImplementedError @@ -97,3 +104,37 @@ def _str_removesuffix(self, suffix: str): removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) result = pc.if_else(ends_with, removed, self._pa_array) return type(self)(result) + + def _str_startswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): + if isinstance(pat, str): + result = pc.starts_with(self._pa_array, pattern=pat) + else: + if len(pat) == 0: + # For empty tuple we return null for missing values and False + # for valid values. + result = pc.if_else(pc.is_null(self._pa_array), None, False) + else: + result = pc.starts_with(self._pa_array, pattern=pat[0]) + + for p in pat[1:]: + result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p)) + if not isna(na): # pyright: ignore [reportGeneralTypeIssues] + result = result.fill_null(na) + return self._convert_bool_result(result) + + def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): + if isinstance(pat, str): + result = pc.ends_with(self._pa_array, pattern=pat) + else: + if len(pat) == 0: + # For empty tuple we return null for missing values and False + # for valid values. + result = pc.if_else(pc.is_null(self._pa_array), None, False) + else: + result = pc.ends_with(self._pa_array, pattern=pat[0]) + + for p in pat[1:]: + result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p)) + if not isna(na): # pyright: ignore [reportGeneralTypeIssues] + result = result.fill_null(na) + return self._convert_bool_result(result) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index fbffb4a0a9990..fa778aab71349 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2337,38 +2337,7 @@ def _str_contains( result = result.fill_null(na) return type(self)(result) - def _str_startswith(self, pat: str | tuple[str, ...], na=None) -> Self: - if isinstance(pat, str): - result = pc.starts_with(self._pa_array, pattern=pat) - else: - if len(pat) == 0: - # For empty tuple, pd.StringDtype() returns null for missing values - # and false for valid values. - result = pc.if_else(pc.is_null(self._pa_array), None, False) - else: - result = pc.starts_with(self._pa_array, pattern=pat[0]) - - for p in pat[1:]: - result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p)) - if not isna(na): - result = result.fill_null(na) - return type(self)(result) - - def _str_endswith(self, pat: str | tuple[str, ...], na=None) -> Self: - if isinstance(pat, str): - result = pc.ends_with(self._pa_array, pattern=pat) - else: - if len(pat) == 0: - # For empty tuple, pd.StringDtype() returns null for missing values - # and false for valid values. - result = pc.if_else(pc.is_null(self._pa_array), None, False) - else: - result = pc.ends_with(self._pa_array, pattern=pat[0]) - - for p in pat[1:]: - result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p)) - if not isna(na): - result = result.fill_null(na) + def _result_converter(self, result): return type(self)(result) def _str_replace( diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 15807c365ecfd..cfc892b9e3648 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -280,6 +280,8 @@ def astype(self, dtype, copy: bool = True): # String methods interface _str_map = BaseStringArray._str_map + _str_startswith = ArrowStringArrayMixin._str_startswith + _str_endswith = ArrowStringArrayMixin._str_endswith def _str_contains( self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True @@ -298,44 +300,6 @@ def _str_contains( result[isna(result)] = bool(na) return result - def _str_startswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): - if isinstance(pat, str): - result = pc.starts_with(self._pa_array, pattern=pat) - else: - if len(pat) == 0: - # mimic existing behaviour of string extension array - # and python string method - result = pa.array( - np.zeros(len(self._pa_array), dtype=bool), mask=isna(self._pa_array) - ) - else: - result = pc.starts_with(self._pa_array, pattern=pat[0]) - - for p in pat[1:]: - result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p)) - if not isna(na): - result = result.fill_null(na) - return self._convert_bool_result(result) - - def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): - if isinstance(pat, str): - result = pc.ends_with(self._pa_array, pattern=pat) - else: - if len(pat) == 0: - # mimic existing behaviour of string extension array - # and python string method - result = pa.array( - np.zeros(len(self._pa_array), dtype=bool), mask=isna(self._pa_array) - ) - else: - result = pc.ends_with(self._pa_array, pattern=pat[0]) - - for p in pat[1:]: - result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p)) - if not isna(na): - result = result.fill_null(na) - return self._convert_bool_result(result) - def _str_replace( self, pat: str | re.Pattern,