From cbb1fbdd349be9bbb8c89f8c506ab6c36ddfb4e5 Mon Sep 17 00:00:00 2001
From: Simon Hawkins
Date: Thu, 29 Apr 2021 12:32:16 +0100
Subject: [PATCH 1/2] [ArrowStringArray] use pyarrow.compute.match_substring_regex if available

---
 pandas/core/arrays/string_arrow.py | 32 +++++++++++++++++++++++-------
 1 file changed, 25 insertions(+), 7 deletions(-)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 180ed51e7fd2b..e27d5c2bc789b 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -1,12 +1,14 @@
 from __future__ import annotations
 
 from distutils.version import LooseVersion
+import re
 from typing import (
     TYPE_CHECKING,
     Any,
     Sequence,
     cast,
 )
+import warnings
 
 import numpy as np
 
@@ -754,15 +756,31 @@ def _str_map(self, f, na_value=None, dtype: Dtype | None = None):
             return lib.map_infer_mask(arr, f, mask.view("uint8"))
 
     def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex=True):
-        if not regex and case:
-            result = pc.match_substring(self._data, pat)
-            result = BooleanDtype().__from_arrow__(result)
-            if not isna(na):
-                result[isna(result)] = bool(na)
-            return result
-        else:
+        if flags:
             return super()._str_contains(pat, case, flags, na, regex)
 
+        if regex:
+            if hasattr(pc, "match_substring_regex") and case:
+                if re.compile(pat).groups:
+                    warnings.warn(
+                        "This pattern has match groups. To actually get the "
+                        "groups, use str.extract.",
+                        UserWarning,
+                        stacklevel=3,
+                    )
+                result = pc.match_substring_regex(self._data, pat)
+            else:
+                return super()._str_contains(pat, case, flags, na, regex)
+        else:
+            if case:
+                result = pc.match_substring(self._data, pat)
+            else:
+                result = pc.match_substring(pc.utf8_upper(self._data), pat.upper())
+        result = BooleanDtype().__from_arrow__(result)
+        if not isna(na):
+            result[isna(result)] = bool(na)
+        return result
+
     def _str_isalnum(self):
         if hasattr(pc, "utf8_is_alnum"):
             result = pc.utf8_is_alnum(self._data)

From ed9e30ff41606ad50997299e60e60f4d930cb056 Mon Sep 17 00:00:00 2001
From: Simon Hawkins
Date: Fri, 30 Apr 2021 14:34:20 +0100
Subject: [PATCH 2/2] add comments

---
 pandas/core/arrays/string_arrow.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 0c1036a310dd6..63c77f418d056 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -760,6 +760,7 @@ def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex=True):
             return super()._str_contains(pat, case, flags, na, regex)
 
         if regex:
+            # match_substring_regex added in pyarrow 4.0.0
             if hasattr(pc, "match_substring_regex") and case:
                 if re.compile(pat).groups:
                     warnings.warn(
@@ -782,6 +783,7 @@ def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex=True):
         return result
 
     def _str_startswith(self, pat, na=None):
+        # match_substring_regex added in pyarrow 4.0.0
         if hasattr(pc, "match_substring_regex"):
             result = pc.match_substring_regex(self._data, "^" + re.escape(pat))
             result = BooleanDtype().__from_arrow__(result)
@@ -792,6 +794,7 @@ def _str_startswith(self, pat, na=None):
             return super()._str_startswith(pat, na)
 
     def _str_endswith(self, pat, na=None):
+        # match_substring_regex added in pyarrow 4.0.0
         if hasattr(pc, "match_substring_regex"):
             result = pc.match_substring_regex(self._data, re.escape(pat) + "$")
             result = BooleanDtype().__from_arrow__(result)