Commit 73af4bb

Fix more bugs
1 parent 68bf188 commit 73af4bb

5 files changed: +53 -242 lines changed (diffs for three of the files are shown below)


Lib/inspect.py (-1)

@@ -2185,7 +2185,6 @@ def _signature_strip_non_python_syntax(signature):
             if string == ',':
                 current_parameter += 1

-            # if (type == ERRORTOKEN) and (string == '$'):
            if (type == OP) and (string == '$'):
                assert self_parameter is None
                self_parameter = current_parameter
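
Context for the hunk above: the '$' handled here is the marker placed before the self/module parameter in a builtin's __text_signature__, which _signature_strip_non_python_syntax() removes before the signature text is parsed. A minimal sketch of where that marker comes from, using only public APIs and assuming a CPython build where list.append exposes a text signature (the exact string can vary by version):

    import inspect

    # Raw text signature of a builtin; the leading '$' marks the self slot.
    print(list.append.__text_signature__)   # e.g. "($self, object, /)"

    # inspect.signature() depends on that '$' marker being stripped out.
    print(inspect.signature(list.append))   # (self, object, /)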

Lib/test/test_tokenize.py (+14 -28)

@@ -1,9 +1,9 @@
 from test import support
 from test.support import os_helper
-from tokenize import (tokenize, tokenize2, _tokenize, untokenize, NUMBER, NAME, OP,
+from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                      STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
                      open as tokenize_open, Untokenizer, generate_tokens,
-                     NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT)
+                     NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT, TokenInfo)
 from io import BytesIO, StringIO
 import unittest
 from textwrap import dedent
@@ -46,7 +46,7 @@ def check_tokenize(self, s, expected):
         # Format the tokens in s in a table format.
         # The ENDMARKER and final NEWLINE are omitted.
         f = BytesIO(s.encode('utf-8'))
-        result = stringify_tokens_from_source(tokenize2(f.readline), s)
+        result = stringify_tokens_from_source(tokenize(f.readline), s)
         self.assertEqual(result,
                          [" ENCODING 'utf-8' (0, 0) (0, 0)"] +
                          expected.rstrip().splitlines())
@@ -1128,33 +1128,16 @@ def readline():
             nonlocal first
             if not first:
                 first = True
-                return line
+                yield line
             else:
-                return b''
+                yield b''

         # skip the initial encoding token and the end tokens
-        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-2]
-        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
+        tokens = list(_tokenize(readline(), encoding='utf-8'))[:-2]
+        expected_tokens = [TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"\n')]
         self.assertEqual(tokens, expected_tokens,
                          "bytes not decoded with encoding")

-    def test__tokenize_does_not_decode_with_encoding_none(self):
-        literal = '"ЉЊЈЁЂ"'
-        first = False
-        def readline():
-            nonlocal first
-            if not first:
-                first = True
-                return literal
-            else:
-                return b''
-
-        # skip the end tokens
-        tokens = list(_tokenize(readline, encoding=None))[:-2]
-        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
-        self.assertEqual(tokens, expected_tokens,
-                         "string not tokenized when encoding is None")
-

 class TestDetectEncoding(TestCase):

@@ -1412,7 +1395,7 @@ def test_open_error(self):

 class TestTokenize(TestCase):

-    def test_tokenizee(self):
+    def test_tokenize(self):
         import tokenize as tokenize_module
         encoding = "utf-8"
         encoding_used = None
@@ -1424,7 +1407,10 @@ def mock__tokenize(readline, encoding):
             encoding_used = encoding
             out = []
             while True:
-                next_line = readline()
+                try:
+                    next_line = next(readline)
+                except StopIteration:
+                    return out
                 if next_line:
                     out.append(next_line)
                     continue
@@ -1444,7 +1430,7 @@ def mock_readline():
         tokenize_module._tokenize = mock__tokenize
         try:
             results = tokenize(mock_readline)
-            self.assertEqual(list(results),
+            self.assertEqual(list(results)[1:],
                             [b'first', b'second', b'1', b'2', b'3', b'4'])
         finally:
             tokenize_module.detect_encoding = orig_detect_encoding
@@ -1740,7 +1726,7 @@ def test_random_files(self):
             if support.verbose >= 2:
                 print('tokenize', testfile)
             with open(testfile, 'rb') as f:
-                with self.subTest(file=testfile):
+                # with self.subTest(file=testfile):
                     self.check_roundtrip(f)

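
The tests above now feed _tokenize a generator of byte lines and compare full TokenInfo tuples, while the public entry point keeps its old shape; the [1:] slice in test_tokenize is consistent with the ENCODING token now being yielded by tokenize() itself. A minimal sketch of that unchanged public API, using only documented behaviour:

    import io
    import tokenize

    source = b"x = 1\n"
    # tokenize() still takes a readline callable over bytes and yields an
    # ENCODING token first, followed by the regular token stream.
    tokens = list(tokenize.tokenize(io.BytesIO(source).readline))
    print(tokens[0].type == tokenize.ENCODING)          # True
    print([tokenize.tok_name[t.type] for t in tokens])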

Lib/tokenize.py (+34 -212)
@@ -213,6 +213,14 @@ def untokenize(self, iterable):
                     self.tokens.append(indent)
                     self.prev_col = len(indent)
                 startline = False
+            elif tok_type == FSTRING_MIDDLE:
+                if '{' in token or '}' in token:
+                    end_line, end_col = end
+                    end = (end_line, end_col + token.count('{') + token.count('}'))
+                    token = re.sub('{', '{{', token)
+                    token = re.sub('}', '}}', token)
+
+
             self.add_whitespace(start)
             self.tokens.append(token)
             self.prev_row, self.prev_col = end
@@ -255,6 +263,11 @@ def compat(self, token, iterable):
             elif startline and indents:
                 toks_append(indents[-1])
                 startline = False
+            elif toknum == FSTRING_MIDDLE:
+                if '{' in tokval or '}' in tokval:
+                    tokval = re.sub('{', '{{', tokval)
+                    tokval = re.sub('}', '}}', tokval)
+
             toks_append(tokval)

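Both hunks above re-escape braces because the tokenizer reports FSTRING_MIDDLE text with escaped braces already decoded to single '{' / '}', so untokenize() has to double them again to emit valid f-string source. A round-trip sketch, assuming an interpreter whose tokenizer emits FSTRING_MIDDLE (3.12+); on older versions the f-string is a single STRING token and this branch never runs:

    import io
    import tokenize

    source = 'x = f"a{{b}}c {y}"\n'
    toks = list(tokenize.generate_tokens(io.StringIO(source).readline))
    # The intent of the hunks above: without re-doubling braces in
    # FSTRING_MIDDLE tokens, the literal "a{{b}}c " part would come back
    # as "a{b}c " and change the meaning of the f-string.
    print(tokenize.untokenize(toks))
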
@@ -404,36 +417,6 @@ def open(filename):
         buffer.close()
         raise

-def tokenize2(readline):
-    encoding, consumed = detect_encoding(readline)
-    rl_gen = _itertools.chain(consumed, iter(readline, b""))
-    if encoding is not None:
-        if encoding == "utf-8-sig":
-            # BOM will already have been stripped.
-            encoding = "utf-8"
-        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
-    yield from _tokenize2(rl_gen, encoding)
-
-def _tokenize2(rl_gen, encoding):
-    source = b"".join(rl_gen)
-    token = None
-    for token in _generate_tokens_from_c_tokenizer(source.decode(encoding), extra_tokens=True):
-        # TODO: Marta -> clean this up
-        if 6 < token.type <= 54:
-            token = token._replace(type=OP)
-        if token.type in {ASYNC, AWAIT}:
-            token = token._replace(type=NAME)
-        if token.type == NEWLINE:
-            l_start, c_start = token.start
-            l_end, c_end = token.end
-            token = token._replace(string='\n', start=(l_start, c_start), end=(l_end, c_end+1))
-
-        yield token
-    if token is not None:
-        last_line, _ = token.start
-        yield TokenInfo(ENDMARKER, '', (last_line + 1, 0), (last_line + 1, 0), '')
-
-
 def tokenize(readline):
     """
     The tokenize() generator requires one argument, readline, which
@@ -454,194 +437,33 @@ def tokenize(readline):
     which tells you which encoding was used to decode the bytes stream.
     """
     encoding, consumed = detect_encoding(readline)
-    empty = _itertools.repeat(b"")
-    rl_gen = _itertools.chain(consumed, iter(readline, b""), empty)
-    return _tokenize(rl_gen.__next__, encoding)
-
-
-def _tokenize(readline, encoding):
-    lnum = parenlev = continued = 0
-    numchars = '0123456789'
-    contstr, needcont = '', 0
-    contline = None
-    indents = [0]
-
+    rl_gen = _itertools.chain(consumed, iter(readline, b""))
     if encoding is not None:
         if encoding == "utf-8-sig":
             # BOM will already have been stripped.
             encoding = "utf-8"
         yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
-    last_line = b''
-    line = b''
-    while True:                                # loop over lines in stream
-        try:
-            # We capture the value of the line variable here because
-            # readline uses the empty string '' to signal end of input,
-            # hence `line` itself will always be overwritten at the end
-            # of this loop.
-            last_line = line
-            line = readline()
-        except StopIteration:
-            line = b''
-
-        if encoding is not None:
-            line = line.decode(encoding)
-        lnum += 1
-        pos, max = 0, len(line)
-
-        if contstr:                            # continued string
-            if not line:
-                raise TokenError("EOF in multi-line string", strstart)
-            endmatch = endprog.match(line)
-            if endmatch:
-                pos = end = endmatch.end(0)
-                yield TokenInfo(STRING, contstr + line[:end],
-                       strstart, (lnum, end), contline + line)
-                contstr, needcont = '', 0
-                contline = None
-            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
-                yield TokenInfo(ERRORTOKEN, contstr + line,
-                           strstart, (lnum, len(line)), contline)
-                contstr = ''
-                contline = None
-                continue
-            else:
-                contstr = contstr + line
-                contline = contline + line
-                continue
-
-        elif parenlev == 0 and not continued:  # new statement
-            if not line: break
-            column = 0
-            while pos < max:                   # measure leading whitespace
-                if line[pos] == ' ':
-                    column += 1
-                elif line[pos] == '\t':
-                    column = (column//tabsize + 1)*tabsize
-                elif line[pos] == '\f':
-                    column = 0
-                else:
-                    break
-                pos += 1
-            if pos == max:
-                break
-
-            if line[pos] in '#\r\n':           # skip comments or blank lines
-                if line[pos] == '#':
-                    comment_token = line[pos:].rstrip('\r\n')
-                    yield TokenInfo(COMMENT, comment_token,
-                           (lnum, pos), (lnum, pos + len(comment_token)), line)
-                    pos += len(comment_token)
-
-                yield TokenInfo(NL, line[pos:],
-                           (lnum, pos), (lnum, len(line)), line)
-                continue
-
-            if column > indents[-1]:           # count indents or dedents
-                indents.append(column)
-                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
-            while column < indents[-1]:
-                if column not in indents:
-                    raise IndentationError(
-                        "unindent does not match any outer indentation level",
-                        ("<tokenize>", lnum, pos, line))
-                indents = indents[:-1]
-
-                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)
-
-        else:                                  # continued statement
-            if not line:
-                raise TokenError("EOF in multi-line statement", (lnum, 0))
-            continued = 0
-
-        while pos < max:
-            pseudomatch = _compile(PseudoToken).match(line, pos)
-            if pseudomatch:                                # scan for tokens
-                start, end = pseudomatch.span(1)
-                spos, epos, pos = (lnum, start), (lnum, end), end
-                if start == end:
-                    continue
-                token, initial = line[start:end], line[start]
-
-                if (initial in numchars or                 # ordinary number
-                    (initial == '.' and token != '.' and token != '...')):
-                    yield TokenInfo(NUMBER, token, spos, epos, line)
-                elif initial in '\r\n':
-                    if parenlev > 0:
-                        yield TokenInfo(NL, token, spos, epos, line)
-                    else:
-                        yield TokenInfo(NEWLINE, token, spos, epos, line)
-
-                elif initial == '#':
-                    assert not token.endswith("\n")
-                    yield TokenInfo(COMMENT, token, spos, epos, line)
-
-                elif token in triple_quoted:
-                    endprog = _compile(endpats[token])
-                    endmatch = endprog.match(line, pos)
-                    if endmatch:                           # all on one line
-                        pos = endmatch.end(0)
-                        token = line[start:pos]
-                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
-                    else:
-                        strstart = (lnum, start)           # multiple lines
-                        contstr = line[start:]
-                        contline = line
-                        break
-
-                # Check up to the first 3 chars of the token to see if
-                # they're in the single_quoted set. If so, they start
-                # a string.
-                # We're using the first 3, because we're looking for
-                # "rb'" (for example) at the start of the token. If
-                # we switch to longer prefixes, this needs to be
-                # adjusted.
-                # Note that initial == token[:1].
-                # Also note that single quote checking must come after
-                # triple quote checking (above).
-                elif (initial in single_quoted or
-                      token[:2] in single_quoted or
-                      token[:3] in single_quoted):
-                    if token[-1] == '\n':                  # continued string
-                        strstart = (lnum, start)
-                        # Again, using the first 3 chars of the
-                        # token. This is looking for the matching end
-                        # regex for the correct type of quote
-                        # character. So it's really looking for
-                        # endpats["'"] or endpats['"'], by trying to
-                        # skip string prefix characters, if any.
-                        endprog = _compile(endpats.get(initial) or
-                                           endpats.get(token[1]) or
-                                           endpats.get(token[2]))
-                        contstr, needcont = line[start:], 1
-                        contline = line
-                        break
-                    else:                                  # ordinary string
-                        yield TokenInfo(STRING, token, spos, epos, line)
-
-                elif initial.isidentifier():               # ordinary name
-                    yield TokenInfo(NAME, token, spos, epos, line)
-                elif initial == '\\':                      # continued stmt
-                    continued = 1
-                else:
-                    if initial in '([{':
-                        parenlev += 1
-                    elif initial in ')]}':
-                        parenlev -= 1
-                    yield TokenInfo(OP, token, spos, epos, line)
-            else:
-                yield TokenInfo(ERRORTOKEN, line[pos],
-                           (lnum, pos), (lnum, pos+1), line)
-                pos += 1
+    yield from _tokenize(rl_gen, encoding)
+
+def _tokenize(rl_gen, encoding):
+    source = b"".join(rl_gen).decode(encoding)
+    token = None
+    for token in _generate_tokens_from_c_tokenizer(source, extra_tokens=True):
+        # TODO: Marta -> clean this up
+        if 6 < token.type <= 54:
+            token = token._replace(type=OP)
+        if token.type in {ASYNC, AWAIT}:
+            token = token._replace(type=NAME)
+        if token.type == NEWLINE:
+            l_start, c_start = token.start
+            l_end, c_end = token.end
+            token = token._replace(string='\n', start=(l_start, c_start), end=(l_end, c_end+1))

-    # Add an implicit NEWLINE if the input doesn't end in one
-    if last_line and last_line[-1] not in '\r\n' and not last_line.strip().startswith("#"):
-        yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '')
-    for indent in indents[1:]:                 # pop remaining indent levels
-        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
-    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
+        yield token
+    if token is not None:
+        last_line, _ = token.start
+        yield TokenInfo(ENDMARKER, '', (last_line + 1, 0), (last_line + 1, 0), '')

-tokenize = tokenize2

 def generate_tokens(readline):
     """Tokenize a source reading Python code as unicode strings.
@@ -658,7 +480,7 @@ def _gen():
             if not line:
                 return
             yield line.encode()
-    return _tokenize2(_gen(), 'utf-8')
+    return _tokenize(_gen(), 'utf-8')

 def main():
     import argparse
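
With this change generate_tokens() simply encodes each line from its str-based readline and reuses the same _tokenize() helper. A short sketch contrasting the two public entry points, using only documented behaviour:

    import io
    import tokenize

    text = "x = 1\n"

    # generate_tokens(): str readline, no ENCODING token is emitted.
    str_tokens = list(tokenize.generate_tokens(io.StringIO(text).readline))

    # tokenize(): bytes readline, the stream starts with an ENCODING token.
    byte_tokens = list(tokenize.tokenize(io.BytesIO(text.encode()).readline))

    print(tokenize.tok_name[str_tokens[0].type])    # NAME
    print(tokenize.tok_name[byte_tokens[0].type])   # ENCODING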
