3
3
import re
4
4
from typing import (
5
5
Dict ,
6
+ Hashable ,
6
7
List ,
7
8
Optional ,
9
+ Pattern ,
8
10
)
9
11
import warnings
10
12
@@ -3036,13 +3038,31 @@ def _result_dtype(arr):
3036
3038
return object
3037
3039
3038
3040
3039
- def _get_single_group_name (rx ) :
3040
- try :
3041
- return list ( rx .groupindex . keys ()). pop ( )
3042
- except IndexError :
3041
+ def _get_single_group_name (regex : Pattern ) -> Hashable :
3042
+ if regex . groupindex :
3043
+ return next ( iter ( regex .groupindex ) )
3044
+ else :
3043
3045
return None
3044
3046
3045
3047
3048
+ def _get_group_names (regex : Pattern ) -> List [Hashable ]:
3049
+ """
3050
+ Get named groups from compiled regex.
3051
+
3052
+ Unnamed groups are numbered.
3053
+
3054
+ Parameters
3055
+ ----------
3056
+ regex : compiled regex
3057
+
3058
+ Returns
3059
+ -------
3060
+ list of column labels
3061
+ """
3062
+ names = {v : k for k , v in regex .groupindex .items ()}
3063
+ return [names .get (1 + i , i ) for i in range (regex .groups )]
3064
+
3065
+
3046
3066
def _str_extract_noexpand (arr , pat , flags = 0 ):
3047
3067
"""
3048
3068
Find groups in each string in the Series using passed regular
@@ -3069,8 +3089,7 @@ def _str_extract_noexpand(arr, pat, flags=0):
3069
3089
if isinstance (arr , ABCIndex ):
3070
3090
raise ValueError ("only one regex group is supported with Index" )
3071
3091
name = None
3072
- names = dict (zip (regex .groupindex .values (), regex .groupindex .keys ()))
3073
- columns = [names .get (1 + i , i ) for i in range (regex .groups )]
3092
+ columns = _get_group_names (regex )
3074
3093
if arr .size == 0 :
3075
3094
# error: Incompatible types in assignment (expression has type
3076
3095
# "DataFrame", variable has type "ndarray")
@@ -3101,8 +3120,7 @@ def _str_extract_frame(arr, pat, flags=0):
3101
3120
3102
3121
regex = re .compile (pat , flags = flags )
3103
3122
groups_or_na = _groups_or_na_fun (regex )
3104
- names = dict (zip (regex .groupindex .values (), regex .groupindex .keys ()))
3105
- columns = [names .get (1 + i , i ) for i in range (regex .groups )]
3123
+ columns = _get_group_names (regex )
3106
3124
3107
3125
if len (arr ) == 0 :
3108
3126
return DataFrame (columns = columns , dtype = object )
@@ -3139,8 +3157,7 @@ def str_extractall(arr, pat, flags=0):
3139
3157
if isinstance (arr , ABCIndex ):
3140
3158
arr = arr .to_series ().reset_index (drop = True )
3141
3159
3142
- names = dict (zip (regex .groupindex .values (), regex .groupindex .keys ()))
3143
- columns = [names .get (1 + i , i ) for i in range (regex .groups )]
3160
+ columns = _get_group_names (regex )
3144
3161
match_list = []
3145
3162
index_list = []
3146
3163
is_mi = arr .index .nlevels > 1
0 commit comments