@@ -421,24 +421,39 @@ def str_extract(arr, pat, flags=0):
421
421
Pattern or regular expression
422
422
flags : int, default 0 (no flags)
423
423
re module flags, e.g. re.IGNORECASE
424
+ expand : bool, default True
425
+ * If True, return DataFrame/MultiIndex expanding dimensionality.
426
+ * If False, return Series/Index.
424
427
425
428
Returns
426
429
-------
427
- extracted groups : Series (one group) or DataFrame (multiple groups)
430
+ extracted groups : Deprecated: Series (one group) or DataFrame (multiple groups)
428
431
Note that dtype of the result is always object, even when no match is
429
432
found and the result is a Series or DataFrame containing only NaN
430
433
values.
431
434
435
+ In a future version, the return type will be determined by the ``expand``
436
+ option: Series/Index of objects if False, DataFrame/MultiIndex if True.
437
+
432
438
Examples
433
439
--------
434
- A pattern with one group will return a Series. Non-matches will be NaN.
440
+ Deprecated: A pattern with one group returns a Series. Non-matches will be NaN.
441
+ This will be changed to return a DataFrame by default in a future version.
435
442
436
443
>>> Series(['a1', 'b2', 'c3']).str.extract('[ab](\d)')
437
444
0 1
438
445
1 2
439
446
2 NaN
440
447
dtype: object
441
448
449
+ Specify ``expand=False`` to return Series.
450
+
451
+ >>> Series(['a1', 'b2', 'c3']).str.extract('[ab](\d)', expand=False)
452
+ 0 1
453
+ 1 2
454
+ 2 NaN
455
+ dtype: object
456
+
442
457
A pattern with more than one group will return a DataFrame.
443
458
444
459
>>> Series(['a1', 'b2', 'c3']).str.extract('([ab])(\d)')
@@ -462,12 +477,7 @@ def str_extract(arr, pat, flags=0):
462
477
0 a 1
463
478
1 b 2
464
479
2 NaN NaN
465
-
466
480
"""
467
- from pandas .core .series import Series
468
- from pandas .core .frame import DataFrame
469
- from pandas .core .index import Index
470
-
471
481
regex = re .compile (pat , flags = flags )
472
482
# just to be safe, check this
473
483
if regex .groups == 0 :
@@ -487,18 +497,9 @@ def f(x):
487
497
result = np .array ([f (val )[0 ] for val in arr ], dtype = object )
488
498
name = _get_single_group_name (regex )
489
499
else :
490
- if isinstance (arr , Index ):
491
- raise ValueError ("only one regex group is supported with Index" )
492
- name = None
493
500
names = dict (zip (regex .groupindex .values (), regex .groupindex .keys ()))
494
- columns = [names .get (1 + i , i ) for i in range (regex .groups )]
495
- if arr .empty :
496
- result = DataFrame (columns = columns , dtype = object )
497
- else :
498
- result = DataFrame ([f (val ) for val in arr ],
499
- columns = columns ,
500
- index = arr .index ,
501
- dtype = object )
501
+ name = [names .get (1 + i , i ) for i in range (regex .groups )]
502
+ result = np .array ([f (val ) for val in arr ], dtype = object )
502
503
return result , name
503
504
504
505
@@ -511,6 +512,9 @@ def str_get_dummies(arr, sep='|'):
511
512
----------
512
513
sep : string, default "|"
513
514
String to split on.
515
+ expand : bool, default True
516
+ * If True, return DataFrame/MultiIndex expanding dimensionality.
517
+ * If False, return Series/Index.
514
518
515
519
Returns
516
520
-------
@@ -534,15 +538,15 @@ def str_get_dummies(arr, sep='|'):
534
538
--------
535
539
pandas.get_dummies
536
540
"""
537
- from pandas .core .frame import DataFrame
538
541
from pandas .core .index import Index
539
-
540
- # GH9980, Index.str does not support get_dummies() as it returns a frame
542
+ # TODO: Add fillna GH 10089
541
543
if isinstance (arr , Index ):
542
- raise TypeError ("get_dummies is not supported for string methods on Index" )
543
-
544
- # TODO remove this hack?
545
- arr = arr .fillna ('' )
544
+ # temp hack
545
+ values = arr .values
546
+ values [isnull (values )] = ''
547
+ arr = Index (values )
548
+ else :
549
+ arr = arr .fillna ('' )
546
550
try :
547
551
arr = sep + arr + sep
548
552
except TypeError :
@@ -558,7 +562,7 @@ def str_get_dummies(arr, sep='|'):
558
562
for i , t in enumerate (tags ):
559
563
pat = sep + t + sep
560
564
dummies [:, i ] = lib .map_infer (arr .values , lambda x : pat in x )
561
- return DataFrame ( dummies , arr . index , tags )
565
+ return dummies , tags
562
566
563
567
564
568
def str_join (arr , sep ):
@@ -1043,40 +1047,19 @@ def __iter__(self):
1043
1047
i += 1
1044
1048
g = self .get (i )
1045
1049
1046
- def _wrap_result (self , result , ** kwargs ):
1047
-
1048
- # leave as it is to keep extract and get_dummies results
1049
- # can be merged to _wrap_result_expand in v0.17
1050
- from pandas .core .series import Series
1051
- from pandas .core .frame import DataFrame
1052
- from pandas .core .index import Index
1053
-
1054
- if not hasattr (result , 'ndim' ):
1055
- return result
1056
- name = kwargs .get ('name' ) or getattr (result , 'name' , None ) or self .series .name
1057
-
1058
- if result .ndim == 1 :
1059
- if isinstance (self .series , Index ):
1060
- # if result is a boolean np.array, return the np.array
1061
- # instead of wrapping it into a boolean Index (GH 8875)
1062
- if is_bool_dtype (result ):
1063
- return result
1064
- return Index (result , name = name )
1065
- return Series (result , index = self .series .index , name = name )
1066
- else :
1067
- assert result .ndim < 3
1068
- return DataFrame (result , index = self .series .index )
1050
+ def _wrap_result (self , result , expand = False , name = None ):
1051
+ from pandas .core .index import Index , MultiIndex
1069
1052
1070
- def _wrap_result_expand (self , result , expand = False ):
1071
1053
if not isinstance (expand , bool ):
1072
1054
raise ValueError ("expand must be True or False" )
1073
1055
1074
- from pandas .core .index import Index , MultiIndex
1056
+ if name is None :
1057
+ name = getattr (result , 'name' , None ) or self .series .name
1058
+
1075
1059
if not hasattr (result , 'ndim' ):
1076
1060
return result
1077
1061
1078
1062
if isinstance (self .series , Index ):
1079
- name = getattr (result , 'name' , None )
1080
1063
# if result is a boolean np.array, return the np.array
1081
1064
# instead of wrapping it into a boolean Index (GH 8875)
1082
1065
if hasattr (result , 'dtype' ) and is_bool_dtype (result ):
@@ -1092,10 +1075,12 @@ def _wrap_result_expand(self, result, expand=False):
1092
1075
if expand :
1093
1076
cons_row = self .series ._constructor
1094
1077
cons = self .series ._constructor_expanddim
1095
- data = [cons_row (x ) for x in result ]
1096
- return cons (data , index = index )
1078
+ data = [cons_row (x , index = name ) for x in result ]
1079
+ return cons (data , index = index , columns = name ,
1080
+ dtype = result .dtype )
1097
1081
else :
1098
- name = getattr (result , 'name' , None )
1082
+ if result .ndim > 1 :
1083
+ result = list (result )
1099
1084
cons = self .series ._constructor
1100
1085
return cons (result , name = name , index = index )
1101
1086
@@ -1109,7 +1094,7 @@ def cat(self, others=None, sep=None, na_rep=None):
1109
1094
@copy (str_split )
1110
1095
def split (self , pat = None , n = - 1 , expand = False ):
1111
1096
result = str_split (self .series , pat , n = n )
1112
- return self ._wrap_result_expand (result , expand = expand )
1097
+ return self ._wrap_result (result , expand = expand )
1113
1098
1114
1099
_shared_docs ['str_partition' ] = ("""
1115
1100
Split the string at the %(side)s occurrence of `sep`, and return 3 elements
@@ -1160,15 +1145,15 @@ def split(self, pat=None, n=-1, expand=False):
1160
1145
def partition (self , pat = ' ' , expand = True ):
1161
1146
f = lambda x : x .partition (pat )
1162
1147
result = _na_map (f , self .series )
1163
- return self ._wrap_result_expand (result , expand = expand )
1148
+ return self ._wrap_result (result , expand = expand )
1164
1149
1165
1150
@Appender (_shared_docs ['str_partition' ] % {'side' : 'last' ,
1166
1151
'return' : '3 elements containing two empty strings, followed by the string itself' ,
1167
1152
'also' : 'partition : Split the string at the first occurrence of `sep`' })
1168
1153
def rpartition (self , pat = ' ' , expand = True ):
1169
1154
f = lambda x : x .rpartition (pat )
1170
1155
result = _na_map (f , self .series )
1171
- return self ._wrap_result_expand (result , expand = expand )
1156
+ return self ._wrap_result (result , expand = expand )
1172
1157
1173
1158
@copy (str_get )
1174
1159
def get (self , i ):
@@ -1309,9 +1294,9 @@ def wrap(self, width, **kwargs):
1309
1294
return self ._wrap_result (result )
1310
1295
1311
1296
@copy (str_get_dummies )
1312
- def get_dummies (self , sep = '|' ):
1313
- result = str_get_dummies (self .series , sep )
1314
- return self ._wrap_result (result )
1297
+ def get_dummies (self , sep = '|' , expand = True ):
1298
+ result , name = str_get_dummies (self .series , sep )
1299
+ return self ._wrap_result (result , name = name , expand = expand )
1315
1300
1316
1301
@copy (str_translate )
1317
1302
def translate (self , table , deletechars = None ):
@@ -1324,9 +1309,26 @@ def translate(self, table, deletechars=None):
1324
1309
findall = _pat_wrapper (str_findall , flags = True )
1325
1310
1326
1311
@copy (str_extract )
1327
- def extract (self , pat , flags = 0 ):
1312
+ def extract (self , pat , flags = 0 , expand = None ):
1328
1313
result , name = str_extract (self .series , pat , flags = flags )
1329
- return self ._wrap_result (result , name = name )
1314
+
1315
+ if expand is None and hasattr (result , 'ndim' ):
1316
+ # to be compat with previous behavior
1317
+ msg = ("Extracting with single group returns DataFrame in future version. "
1318
+ "Specify expand=False to return Series." )
1319
+ if len (result ) == 0 :
1320
+ # for empty input
1321
+ if isinstance (name , list ):
1322
+ expand = True
1323
+ else :
1324
+ warnings .warn (msg , UserWarning )
1325
+ expand = False
1326
+ elif result .ndim > 1 :
1327
+ expand = True
1328
+ else :
1329
+ warnings .warn (msg , UserWarning )
1330
+ expand = False
1331
+ return self ._wrap_result (result , name = name , expand = expand )
1330
1332
1331
1333
_shared_docs ['find' ] = ("""
1332
1334
Return %(side)s indexes in each strings in the Series/Index
0 commit comments