From cbb1fbdd349be9bbb8c89f8c506ab6c36ddfb4e5 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Thu, 29 Apr 2021 12:32:16 +0100
Subject: [PATCH 1/2] [ArrowStringArray] use
 pyarrow.compute.match_substring_regex if available

---
 pandas/core/arrays/string_arrow.py | 32 +++++++++++++++++++++++-------
 1 file changed, 25 insertions(+), 7 deletions(-)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 180ed51e7fd2b..e27d5c2bc789b 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -1,12 +1,14 @@
 from __future__ import annotations
 
 from distutils.version import LooseVersion
+import re
 from typing import (
     TYPE_CHECKING,
     Any,
     Sequence,
     cast,
 )
+import warnings
 
 import numpy as np
 
@@ -754,15 +756,31 @@ def _str_map(self, f, na_value=None, dtype: Dtype | None = None):
             return lib.map_infer_mask(arr, f, mask.view("uint8"))
 
     def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex=True):
-        if not regex and case:
-            result = pc.match_substring(self._data, pat)
-            result = BooleanDtype().__from_arrow__(result)
-            if not isna(na):
-                result[isna(result)] = bool(na)
-            return result
-        else:
+        if flags:
             return super()._str_contains(pat, case, flags, na, regex)
 
+        if regex:
+            if hasattr(pc, "match_substring_regex") and case:
+                if re.compile(pat).groups:
+                    warnings.warn(
+                        "This pattern has match groups. To actually get the "
+                        "groups, use str.extract.",
+                        UserWarning,
+                        stacklevel=3,
+                    )
+                result = pc.match_substring_regex(self._data, pat)
+            else:
+                return super()._str_contains(pat, case, flags, na, regex)
+        else:
+            if case:
+                result = pc.match_substring(self._data, pat)
+            else:
+                result = pc.match_substring(pc.utf8_upper(self._data), pat.upper())
+        result = BooleanDtype().__from_arrow__(result)
+        if not isna(na):
+            result[isna(result)] = bool(na)
+        return result
+
     def _str_isalnum(self):
         if hasattr(pc, "utf8_is_alnum"):
             result = pc.utf8_is_alnum(self._data)

From ed9e30ff41606ad50997299e60e60f4d930cb056 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Fri, 30 Apr 2021 14:34:20 +0100
Subject: [PATCH 2/2] add comments noting match_substring_regex was added in
 pyarrow 4.0.0

---
 pandas/core/arrays/string_arrow.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 0c1036a310dd6..63c77f418d056 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -760,6 +760,7 @@ def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex=True):
             return super()._str_contains(pat, case, flags, na, regex)
 
         if regex:
+            # match_substring_regex added in pyarrow 4.0.0
             if hasattr(pc, "match_substring_regex") and case:
                 if re.compile(pat).groups:
                     warnings.warn(
@@ -782,6 +783,7 @@ def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex=True):
         return result
 
     def _str_startswith(self, pat, na=None):
+        # match_substring_regex added in pyarrow 4.0.0
         if hasattr(pc, "match_substring_regex"):
             result = pc.match_substring_regex(self._data, "^" + re.escape(pat))
             result = BooleanDtype().__from_arrow__(result)
@@ -792,6 +794,7 @@ def _str_startswith(self, pat, na=None):
             return super()._str_startswith(pat, na)
 
     def _str_endswith(self, pat, na=None):
+        # match_substring_regex added in pyarrow 4.0.0
         if hasattr(pc, "match_substring_regex"):
             result = pc.match_substring_regex(self._data, re.escape(pat) + "$")
             result = BooleanDtype().__from_arrow__(result)