|
9 | 9 |
|
10 | 10 | from pandas.compat import pa_version_under10p1
|
11 | 11 |
|
| 12 | +from pandas.core.dtypes.missing import isna |
| 13 | + |
12 | 14 | if not pa_version_under10p1:
|
13 | 15 | import pyarrow as pa
|
14 | 16 | import pyarrow.compute as pc
|
15 | 17 |
|
16 | 18 | if TYPE_CHECKING:
|
17 |
| - from pandas._typing import Self |
| 19 | + from collections.abc import Sized |
| 20 | + |
| 21 | + from pandas._typing import ( |
| 22 | + Scalar, |
| 23 | + Self, |
| 24 | + ) |
18 | 25 |
|
19 | 26 |
|
20 | 27 | class ArrowStringArrayMixin:
|
21 |
| - _pa_array = None |
| 28 | + # _object_compat specifies whether we should 1) attempt to match behaviors |
| 29 | + # of the object-backed StringDtype and 2) fall back to object-based |
| 30 | + # computation for cases that pyarrow does not support natively. |
| 31 | + _object_compat = False |
| 32 | + _pa_array: Sized |
22 | 33 |
|
    def __init__(self, *args, **kwargs) -> None:
        """Always raise ``NotImplementedError``; any arguments are ignored.

        NOTE(review): presumably the concrete array classes that use this
        mixin provide their own constructor -- confirm against subclasses.
        """
        raise NotImplementedError
|
25 | 36 |
|
    def _result_converter(self, values, na=None):
        # Convert a bool-dtype pyarrow result to appropriate output type.
        # This base version performs no conversion and always raises.
        # NOTE(review): presumably overridden by the classes that mix this
        # in; the string-matching methods call it as
        # ``self._result_converter(result)`` -- confirm the overrides.
        raise NotImplementedError
| 40 | + |
26 | 41 | def _str_pad(
|
27 | 42 | self,
|
28 | 43 | width: int,
|
@@ -89,3 +104,53 @@ def _str_removesuffix(self, suffix: str):
|
89 | 104 | removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix))
|
90 | 105 | result = pc.if_else(ends_with, removed, self._pa_array)
|
91 | 106 | return type(self)(result)
|
| 107 | + |
| 108 | + def _str_startswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): |
| 109 | + if isinstance(pat, str): |
| 110 | + result = pc.starts_with(self._pa_array, pattern=pat) |
| 111 | + else: |
| 112 | + if len(pat) == 0: |
| 113 | + if self._object_compat: |
| 114 | + # mimic existing behaviour of string extension array |
| 115 | + # and python string method |
| 116 | + result = pa.array( |
| 117 | + np.zeros(len(self._pa_array), dtype=np.bool_), |
| 118 | + mask=isna(self._pa_array), |
| 119 | + ) |
| 120 | + else: |
| 121 | + # For empty tuple, pd.StringDtype() returns null for missing values |
| 122 | + # and false for valid values. |
| 123 | + result = pc.if_else(pc.is_null(self._pa_array), None, False) |
| 124 | + else: |
| 125 | + result = pc.starts_with(self._pa_array, pattern=pat[0]) |
| 126 | + |
| 127 | + for p in pat[1:]: |
| 128 | + result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p)) |
| 129 | + if not isna(na): |
| 130 | + result = result.fill_null(na) |
| 131 | + return self._result_converter(result) |
| 132 | + |
| 133 | + def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): |
| 134 | + if isinstance(pat, str): |
| 135 | + result = pc.ends_with(self._pa_array, pattern=pat) |
| 136 | + else: |
| 137 | + if len(pat) == 0: |
| 138 | + if self._object_compat: |
| 139 | + # mimic existing behaviour of string extension array |
| 140 | + # and python string method |
| 141 | + result = pa.array( |
| 142 | + np.zeros(len(self._pa_array), dtype=np.bool_), |
| 143 | + mask=isna(self._pa_array), |
| 144 | + ) |
| 145 | + else: |
| 146 | + # For empty tuple, pd.StringDtype() returns null for missing values |
| 147 | + # and false for valid values. |
| 148 | + result = pc.if_else(pc.is_null(self._pa_array), None, False) |
| 149 | + else: |
| 150 | + result = pc.ends_with(self._pa_array, pattern=pat[0]) |
| 151 | + |
| 152 | + for p in pat[1:]: |
| 153 | + result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p)) |
| 154 | + if not isna(na): |
| 155 | + result = result.fill_null(na) |
| 156 | + return self._result_converter(result) |
0 commit comments