[ArrowStringArray] REF: extract/extractall column names (#41417)

simonjayhawkins · web-flow · commit fb70ca4f4be8 · 2021-05-11T21:07:02.000-04:00
diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
@@ -3,8 +3,10 @@
 import re
 from typing import (
     Dict,
+    Hashable,
     List,
     Optional,
+    Pattern,
 )
 import warnings
 
@@ -3036,13 +3038,31 @@ def _result_dtype(arr):
         return object
 
 
-def _get_single_group_name(rx):
-    try:
-        return list(rx.groupindex.keys()).pop()
-    except IndexError:
+def _get_single_group_name(regex: Pattern) -> Hashable:
+    if regex.groupindex:
+        return next(iter(regex.groupindex))
+    else:
         return None
 
 
+def _get_group_names(regex: Pattern) -> List[Hashable]:
+    """
+    Get named groups from compiled regex.
+
+    Unnamed groups are numbered.
+
+    Parameters
+    ----------
+    regex : compiled regex
+
+    Returns
+    -------
+    list of column labels
+    """
+    names = {v: k for k, v in regex.groupindex.items()}
+    return [names.get(1 + i, i) for i in range(regex.groups)]
+
+
 def _str_extract_noexpand(arr, pat, flags=0):
     """
     Find groups in each string in the Series using passed regular
@@ -3069,8 +3089,7 @@ def _str_extract_noexpand(arr, pat, flags=0):
         if isinstance(arr, ABCIndex):
             raise ValueError("only one regex group is supported with Index")
         name = None
-        names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
-        columns = [names.get(1 + i, i) for i in range(regex.groups)]
+        columns = _get_group_names(regex)
         if arr.size == 0:
             # error: Incompatible types in assignment (expression has type
             # "DataFrame", variable has type "ndarray")
@@ -3101,8 +3120,7 @@ def _str_extract_frame(arr, pat, flags=0):
 
     regex = re.compile(pat, flags=flags)
     groups_or_na = _groups_or_na_fun(regex)
-    names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
-    columns = [names.get(1 + i, i) for i in range(regex.groups)]
+    columns = _get_group_names(regex)
 
     if len(arr) == 0:
         return DataFrame(columns=columns, dtype=object)
@@ -3139,8 +3157,7 @@ def str_extractall(arr, pat, flags=0):
     if isinstance(arr, ABCIndex):
         arr = arr.to_series().reset_index(drop=True)
 
-    names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
-    columns = [names.get(1 + i, i) for i in range(regex.groups)]
+    columns = _get_group_names(regex)
     match_list = []
     index_list = []
     is_mi = arr.index.nlevels > 1