Skip to content

Commit 9800fbc

Browse files
committed
REF (string): de-duplicate str_endswith, startswith
1 parent 59bb3f4 commit 9800fbc

File tree

3 files changed

+69
-72
lines changed

3 files changed

+69
-72
lines changed

pandas/core/arrays/_arrow_string_mixins.py

+67 -2
Original file line number | Diff line number | Diff line change
@@ -9,20 +9,35 @@
99

1010
from pandas.compat import pa_version_under10p1
1111

12+
from pandas.core.dtypes.missing import isna
13+
1214
if not pa_version_under10p1:
1315
import pyarrow as pa
1416
import pyarrow.compute as pc
1517

1618
if TYPE_CHECKING:
17-
from pandas._typing import Self
19+
from collections.abc import Sized
20+
21+
from pandas._typing import (
22+
Scalar,
23+
Self,
24+
)
1825

1926

2027
class ArrowStringArrayMixin:
21-
_pa_array = None
28+
# _object_compat specifies whether we should 1) attempt to match behaviors
29+
# of the object-backed StringDtype and 2) fall back to object-based
30+
# computation for cases that pyarrow does not support natively.
31+
_object_compat = False
32+
_pa_array: Sized
2233

2334
def __init__(self, *args, **kwargs) -> None:
2435
raise NotImplementedError
2536

37+
def _result_converter(self, values, na=None):
38+
# Convert a bool-dtype pyarrow result to appropriate output type.
39+
raise NotImplementedError
40+
2641
def _str_pad(
2742
self,
2843
width: int,
@@ -89,3 +104,53 @@ def _str_removesuffix(self, suffix: str):
89104
removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix))
90105
result = pc.if_else(ends_with, removed, self._pa_array)
91106
return type(self)(result)
107+
108+
def _str_startswith(self, pat: str | tuple[str, ...], na: Scalar | None = None):
109+
if isinstance(pat, str):
110+
result = pc.starts_with(self._pa_array, pattern=pat)
111+
else:
112+
if len(pat) == 0:
113+
if self._object_compat:
114+
# mimic existing behaviour of string extension array
115+
# and python string method
116+
result = pa.array(
117+
np.zeros(len(self._pa_array), dtype=np.bool_),
118+
mask=isna(self._pa_array),
119+
)
120+
else:
121+
# For empty tuple, pd.StringDtype() returns null for missing values
122+
# and false for valid values.
123+
result = pc.if_else(pc.is_null(self._pa_array), None, False)
124+
else:
125+
result = pc.starts_with(self._pa_array, pattern=pat[0])
126+
127+
for p in pat[1:]:
128+
result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p))
129+
if not isna(na):
130+
result = result.fill_null(na)
131+
return self._result_converter(result)
132+
133+
def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None):
134+
if isinstance(pat, str):
135+
result = pc.ends_with(self._pa_array, pattern=pat)
136+
else:
137+
if len(pat) == 0:
138+
if self._object_compat:
139+
# mimic existing behaviour of string extension array
140+
# and python string method
141+
result = pa.array(
142+
np.zeros(len(self._pa_array), dtype=np.bool_),
143+
mask=isna(self._pa_array),
144+
)
145+
else:
146+
# For empty tuple, pd.StringDtype() returns null for missing values
147+
# and false for valid values.
148+
result = pc.if_else(pc.is_null(self._pa_array), None, False)
149+
else:
150+
result = pc.ends_with(self._pa_array, pattern=pat[0])
151+
152+
for p in pat[1:]:
153+
result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p))
154+
if not isna(na):
155+
result = result.fill_null(na)
156+
return self._result_converter(result)

pandas/core/arrays/arrow/array.py

+1 -32
Original file line number | Diff line number | Diff line change
@@ -2305,38 +2305,7 @@ def _str_contains(
23052305
result = result.fill_null(na)
23062306
return type(self)(result)
23072307

2308-
def _str_startswith(self, pat: str | tuple[str, ...], na=None) -> Self:
2309-
if isinstance(pat, str):
2310-
result = pc.starts_with(self._pa_array, pattern=pat)
2311-
else:
2312-
if len(pat) == 0:
2313-
# For empty tuple, pd.StringDtype() returns null for missing values
2314-
# and false for valid values.
2315-
result = pc.if_else(pc.is_null(self._pa_array), None, False)
2316-
else:
2317-
result = pc.starts_with(self._pa_array, pattern=pat[0])
2318-
2319-
for p in pat[1:]:
2320-
result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p))
2321-
if not isna(na):
2322-
result = result.fill_null(na)
2323-
return type(self)(result)
2324-
2325-
def _str_endswith(self, pat: str | tuple[str, ...], na=None) -> Self:
2326-
if isinstance(pat, str):
2327-
result = pc.ends_with(self._pa_array, pattern=pat)
2328-
else:
2329-
if len(pat) == 0:
2330-
# For empty tuple, pd.StringDtype() returns null for missing values
2331-
# and false for valid values.
2332-
result = pc.if_else(pc.is_null(self._pa_array), None, False)
2333-
else:
2334-
result = pc.ends_with(self._pa_array, pattern=pat[0])
2335-
2336-
for p in pat[1:]:
2337-
result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p))
2338-
if not isna(na):
2339-
result = result.fill_null(na)
2308+
def _result_converter(self, result):
23402309
return type(self)(result)
23412310

23422311
def _str_replace(

pandas/core/arrays/string_arrow.py

+1 -38
Original file line number | Diff line number | Diff line change
@@ -278,6 +278,7 @@ def astype(self, dtype, copy: bool = True):
278278

279279
# ------------------------------------------------------------------------
280280
# String methods interface
281+
_object_compat = True
281282

282283
_str_map = BaseStringArray._str_map
283284

@@ -298,44 +299,6 @@ def _str_contains(
298299
result[isna(result)] = bool(na)
299300
return result
300301

301-
def _str_startswith(self, pat: str | tuple[str, ...], na: Scalar | None = None):
302-
if isinstance(pat, str):
303-
result = pc.starts_with(self._pa_array, pattern=pat)
304-
else:
305-
if len(pat) == 0:
306-
# mimic existing behaviour of string extension array
307-
# and python string method
308-
result = pa.array(
309-
np.zeros(len(self._pa_array), dtype=bool), mask=isna(self._pa_array)
310-
)
311-
else:
312-
result = pc.starts_with(self._pa_array, pattern=pat[0])
313-
314-
for p in pat[1:]:
315-
result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p))
316-
if not isna(na):
317-
result = result.fill_null(na)
318-
return self._result_converter(result)
319-
320-
def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None):
321-
if isinstance(pat, str):
322-
result = pc.ends_with(self._pa_array, pattern=pat)
323-
else:
324-
if len(pat) == 0:
325-
# mimic existing behaviour of string extension array
326-
# and python string method
327-
result = pa.array(
328-
np.zeros(len(self._pa_array), dtype=bool), mask=isna(self._pa_array)
329-
)
330-
else:
331-
result = pc.ends_with(self._pa_array, pattern=pat[0])
332-
333-
for p in pat[1:]:
334-
result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p))
335-
if not isna(na):
336-
result = result.fill_null(na)
337-
return self._result_converter(result)
338-
339302
def _str_replace(
340303
self,
341304
pat: str | re.Pattern,

0 commit comments

Comments
 (0)