@@ -213,6 +213,14 @@ def untokenize(self, iterable):
                     self.tokens.append(indent)
                     self.prev_col = len(indent)
                 startline = False
+            elif tok_type == FSTRING_MIDDLE:
+                if '{' in token or '}' in token:
+                    end_line, end_col = end
+                    end = (end_line, end_col + token.count('{') + token.count('}'))
+                    token = re.sub('{', '{{', token)
+                    token = re.sub('}', '}}', token)
+
+
             self.add_whitespace(start)
             self.tokens.append(token)
             self.prev_row, self.prev_col = end
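A minimal sketch, not part of the patch, of what the new FSTRING_MIDDLE branch does to one token value (the token text and position below are made up):

```python
import re

# An FSTRING_MIDDLE token carries the literal text between replacement
# fields with braces unescaped, so writing it back out requires doubling
# them; the end column moves right once per doubled brace.
token = "a}b"
end = (1, 10)   # (row, col) as a tokenizer might report it
end = (end[0], end[1] + token.count('{') + token.count('}'))
token = re.sub('{', '{{', token)
token = re.sub('}', '}}', token)
print(token, end)   # a}}b (1, 11)
```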
@@ -255,6 +263,11 @@ def compat(self, token, iterable):
             elif startline and indents:
                 toks_append(indents[-1])
                 startline = False
+            elif toknum == FSTRING_MIDDLE:
+                if '{' in tokval or '}' in tokval:
+                    tokval = re.sub('{', '{{', tokval)
+                    tokval = re.sub('}', '}}', tokval)
+
             toks_append(tokval)
 
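The same escaping matters on the degraded path, where compat() only sees (type, string) pairs. A rough round-trip sketch, assuming Python 3.12+ with this change applied; whitespace in the output may differ from the input:

```python
import io
from tokenize import generate_tokens, untokenize

source = 'f"hello {name} and {{literal}}"\n'
pairs = [(tok.type, tok.string) for tok in generate_tokens(io.StringIO(source).readline)]
# The braces inside FSTRING_MIDDLE tokens come back doubled, so the
# reassembled text is still a valid f-string even if whitespace shifts.
print(untokenize(pairs))
```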
@@ -404,36 +417,6 @@ def open(filename):
         buffer.close()
         raise
 
-def tokenize2(readline):
-    encoding, consumed = detect_encoding(readline)
-    rl_gen = _itertools.chain(consumed, iter(readline, b""))
-    if encoding is not None:
-        if encoding == "utf-8-sig":
-            # BOM will already have been stripped.
-            encoding = "utf-8"
-        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
-    yield from _tokenize2(rl_gen, encoding)
-
-def _tokenize2(rl_gen, encoding):
-    source = b"".join(rl_gen)
-    token = None
-    for token in _generate_tokens_from_c_tokenizer(source.decode(encoding), extra_tokens=True):
-        # TODO: Marta -> clean this up
-        if 6 < token.type <= 54:
-            token = token._replace(type=OP)
-        if token.type in {ASYNC, AWAIT}:
-            token = token._replace(type=NAME)
-        if token.type == NEWLINE:
-            l_start, c_start = token.start
-            l_end, c_end = token.end
-            token = token._replace(string='\n', start=(l_start, c_start), end=(l_end, c_end + 1))
-
-        yield token
-    if token is not None:
-        last_line, _ = token.start
-        yield TokenInfo(ENDMARKER, '', (last_line + 1, 0), (last_line + 1, 0), '')
-
-
 def tokenize(readline):
     """
     The tokenize() generator requires one argument, readline, which
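The tokenize2()/_tokenize2() pair deleted here reappears almost verbatim under the public tokenize()/_tokenize() names in the next hunk. The encoding bootstrap both versions share looks roughly like this (sketch only, with a made-up buffer):

```python
import io
import itertools
from tokenize import detect_encoding

buf = io.BytesIO(b"# coding: utf-8\nx = 1\n")
encoding, consumed = detect_encoding(buf.readline)            # peeks at most two lines
rl_gen = itertools.chain(consumed, iter(buf.readline, b""))   # put them back in front
source = b"".join(rl_gen).decode(encoding)
print(encoding)          # utf-8
print(source, end="")    # the original two lines, re-joined
```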
@@ -454,194 +437,33 @@ def tokenize(readline):
     which tells you which encoding was used to decode the bytes stream.
     """
     encoding, consumed = detect_encoding(readline)
-    empty = _itertools.repeat(b"")
-    rl_gen = _itertools.chain(consumed, iter(readline, b""), empty)
-    return _tokenize(rl_gen.__next__, encoding)
-
-
-def _tokenize(readline, encoding):
-    lnum = parenlev = continued = 0
-    numchars = '0123456789'
-    contstr, needcont = '', 0
-    contline = None
-    indents = [0]
-
+    rl_gen = _itertools.chain(consumed, iter(readline, b""))
     if encoding is not None:
         if encoding == "utf-8-sig":
             # BOM will already have been stripped.
             encoding = "utf-8"
         yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
-    last_line = b''
-    line = b''
-    while True:                                # loop over lines in stream
-        try:
-            # We capture the value of the line variable here because
-            # readline uses the empty string '' to signal end of input,
-            # hence `line` itself will always be overwritten at the end
-            # of this loop.
-            last_line = line
-            line = readline()
-        except StopIteration:
-            line = b''
-
-        if encoding is not None:
-            line = line.decode(encoding)
-        lnum += 1
-        pos, max = 0, len(line)
-
-        if contstr:                            # continued string
-            if not line:
-                raise TokenError("EOF in multi-line string", strstart)
-            endmatch = endprog.match(line)
-            if endmatch:
-                pos = end = endmatch.end(0)
-                yield TokenInfo(STRING, contstr + line[:end],
-                       strstart, (lnum, end), contline + line)
-                contstr, needcont = '', 0
-                contline = None
-            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
-                yield TokenInfo(ERRORTOKEN, contstr + line,
-                           strstart, (lnum, len(line)), contline)
-                contstr = ''
-                contline = None
-                continue
-            else:
-                contstr = contstr + line
-                contline = contline + line
-                continue
-
-        elif parenlev == 0 and not continued:  # new statement
-            if not line: break
-            column = 0
-            while pos < max:                   # measure leading whitespace
-                if line[pos] == ' ':
-                    column += 1
-                elif line[pos] == '\t':
-                    column = (column//tabsize + 1)*tabsize
-                elif line[pos] == '\f':
-                    column = 0
-                else:
-                    break
-                pos += 1
-            if pos == max:
-                break
-
-            if line[pos] in '#\r\n':           # skip comments or blank lines
-                if line[pos] == '#':
-                    comment_token = line[pos:].rstrip('\r\n')
-                    yield TokenInfo(COMMENT, comment_token,
-                           (lnum, pos), (lnum, pos + len(comment_token)), line)
-                    pos += len(comment_token)
-
-                yield TokenInfo(NL, line[pos:],
-                           (lnum, pos), (lnum, len(line)), line)
-                continue
-
-            if column > indents[-1]:           # count indents or dedents
-                indents.append(column)
-                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
-            while column < indents[-1]:
-                if column not in indents:
-                    raise IndentationError(
-                        "unindent does not match any outer indentation level",
-                        ("<tokenize>", lnum, pos, line))
-                indents = indents[:-1]
-
-                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)
-
-        else:                                  # continued statement
-            if not line:
-                raise TokenError("EOF in multi-line statement", (lnum, 0))
-            continued = 0
-
-        while pos < max:
-            pseudomatch = _compile(PseudoToken).match(line, pos)
-            if pseudomatch:                                # scan for tokens
-                start, end = pseudomatch.span(1)
-                spos, epos, pos = (lnum, start), (lnum, end), end
-                if start == end:
-                    continue
-                token, initial = line[start:end], line[start]
-
-                if (initial in numchars or                 # ordinary number
-                    (initial == '.' and token != '.' and token != '...')):
-                    yield TokenInfo(NUMBER, token, spos, epos, line)
-                elif initial in '\r\n':
-                    if parenlev > 0:
-                        yield TokenInfo(NL, token, spos, epos, line)
-                    else:
-                        yield TokenInfo(NEWLINE, token, spos, epos, line)
-
-                elif initial == '#':
-                    assert not token.endswith("\n")
-                    yield TokenInfo(COMMENT, token, spos, epos, line)
-
-                elif token in triple_quoted:
-                    endprog = _compile(endpats[token])
-                    endmatch = endprog.match(line, pos)
-                    if endmatch:                           # all on one line
-                        pos = endmatch.end(0)
-                        token = line[start:pos]
-                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
-                    else:
-                        strstart = (lnum, start)           # multiple lines
-                        contstr = line[start:]
-                        contline = line
-                        break
-
-                # Check up to the first 3 chars of the token to see if
-                # they're in the single_quoted set. If so, they start
-                # a string.
-                # We're using the first 3, because we're looking for
-                # "rb'" (for example) at the start of the token. If
-                # we switch to longer prefixes, this needs to be
-                # adjusted.
-                # Note that initial == token[:1].
-                # Also note that single quote checking must come after
-                # triple quote checking (above).
-                elif (initial in single_quoted or
-                      token[:2] in single_quoted or
-                      token[:3] in single_quoted):
-                    if token[-1] == '\n':                  # continued string
-                        strstart = (lnum, start)
-                        # Again, using the first 3 chars of the
-                        # token. This is looking for the matching end
-                        # regex for the correct type of quote
-                        # character. So it's really looking for
-                        # endpats["'"] or endpats['"'], by trying to
-                        # skip string prefix characters, if any.
-                        endprog = _compile(endpats.get(initial) or
-                                           endpats.get(token[1]) or
-                                           endpats.get(token[2]))
-                        contstr, needcont = line[start:], 1
-                        contline = line
-                        break
-                    else:                                  # ordinary string
-                        yield TokenInfo(STRING, token, spos, epos, line)
-
-                elif initial.isidentifier():               # ordinary name
-                    yield TokenInfo(NAME, token, spos, epos, line)
-                elif initial == '\\':                      # continued stmt
-                    continued = 1
-                else:
-                    if initial in '([{':
-                        parenlev += 1
-                    elif initial in ')]}':
-                        parenlev -= 1
-                    yield TokenInfo(OP, token, spos, epos, line)
-            else:
-                yield TokenInfo(ERRORTOKEN, line[pos],
-                           (lnum, pos), (lnum, pos + 1), line)
-                pos += 1
+    yield from _tokenize(rl_gen, encoding)
+
+def _tokenize(rl_gen, encoding):
+    source = b"".join(rl_gen).decode(encoding)
+    token = None
+    for token in _generate_tokens_from_c_tokenizer(source, extra_tokens=True):
+        # TODO: Marta -> clean this up
+        if 6 < token.type <= 54:
+            token = token._replace(type=OP)
+        if token.type in {ASYNC, AWAIT}:
+            token = token._replace(type=NAME)
+        if token.type == NEWLINE:
+            l_start, c_start = token.start
+            l_end, c_end = token.end
+            token = token._replace(string='\n', start=(l_start, c_start), end=(l_end, c_end + 1))
 
-    # Add an implicit NEWLINE if the input doesn't end in one
-    if last_line and last_line[-1] not in '\r\n' and not last_line.strip().startswith("#"):
-        yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '')
-    for indent in indents[1:]:                 # pop remaining indent levels
-        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
-    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
+        yield token
+    if token is not None:
+        last_line, _ = token.start
+        yield TokenInfo(ENDMARKER, '', (last_line + 1, 0), (last_line + 1, 0), '')
 
-tokenize = tokenize2
 
 def generate_tokens(readline):
     """Tokenize a source reading Python code as unicode strings.
@@ -658,7 +480,7 @@ def _gen():
         if not line:
             return
         yield line.encode()
-    return _tokenize2(_gen(), 'utf-8')
+    return _tokenize(_gen(), 'utf-8')
 
 def main():
     import argparse
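A usage sketch for the str-based entry point touched by this last hunk; `_gen()` re-encodes each line so the shared `_tokenize()` helper can hand the source to the C tokenizer as UTF-8:

```python
import io
from tokenize import generate_tokens

for tok in generate_tokens(io.StringIO("if x:\n    pass\n").readline):
    print(tok.type, repr(tok.string))
```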