Commit 19a58c5

gh-104976: Ensure trailing dedent tokens are emitted as the previous tokenizer
Signed-off-by: Pablo Galindo <[email protected]>
1 parent 2cb4456 commit 19a58c5
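
In behavioral terms, trailing DEDENT tokens for a source that ends inside an
indented block are again reported on the line after the last input line, at
column 0, followed by an explicit ENDMARKER, matching what the pure-Python
tokenizer did in 3.11. A minimal sketch of the expected tail of the token
stream (not part of the commit; it only illustrates the behavior the tests
below assert):

    import io
    import tokenize

    # A two-line snippet that ends while still inside an indented block.
    source = "def f():\n    pass\n"
    tokens = list(tokenize.generate_tokens(io.StringIO(source).readline))

    for tok in tokens[-3:]:
        print(tokenize.tok_name[tok.type], tok.start, tok.end)
    # Expected tail with this fix (as in 3.11):
    #   NEWLINE   (2, 8) (2, 9)
    #   DEDENT    (3, 0) (3, 0)   <- start of the line after the source
    #   ENDMARKER (3, 0) (3, 0)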

File tree

4 files changed: +23 -14 lines changed

Lib/test/test_tokenize.py (+9 -8)
@@ -82,7 +82,7 @@ def test_basic(self):
 NAME 'False' (4, 11) (4, 16)
 COMMENT '# NEWLINE' (4, 17) (4, 26)
 NEWLINE '\\n' (4, 26) (4, 27)
-DEDENT '' (4, 27) (4, 27)
+DEDENT '' (5, 0) (5, 0)
 """)
 indent_error_file = b"""\
 def k(x):
@@ -755,8 +755,8 @@ def test_tabs(self):
 NEWLINE '\\n' (2, 5) (2, 6)
 INDENT ' \\t' (3, 0) (3, 9)
 NAME 'pass' (3, 9) (3, 13)
-DEDENT '' (3, 14) (3, 14)
-DEDENT '' (3, 14) (3, 14)
+DEDENT '' (4, 0) (4, 0)
+DEDENT '' (4, 0) (4, 0)
 """)

 def test_non_ascii_identifiers(self):
@@ -968,7 +968,7 @@ async def foo():
 NUMBER '1' (2, 17) (2, 18)
 OP ':' (2, 18) (2, 19)
 NAME 'pass' (2, 20) (2, 24)
-DEDENT '' (2, 25) (2, 25)
+DEDENT '' (3, 0) (3, 0)
 """)

 self.check_tokenize('''async def foo(async): await''', """\
@@ -1016,7 +1016,7 @@ async def bar(): pass
 NAME 'await' (6, 2) (6, 7)
 OP '=' (6, 8) (6, 9)
 NUMBER '2' (6, 10) (6, 11)
-DEDENT '' (6, 12) (6, 12)
+DEDENT '' (7, 0) (7, 0)
 """)

 self.check_tokenize('''\
@@ -1054,7 +1054,7 @@ async def bar(): pass
 NAME 'await' (6, 2) (6, 7)
 OP '=' (6, 8) (6, 9)
 NUMBER '2' (6, 10) (6, 11)
-DEDENT '' (6, 12) (6, 12)
+DEDENT '' (7, 0) (7, 0)
 """)

 def test_newline_after_parenthesized_block_with_comment(self):
@@ -1174,7 +1174,7 @@ def readline():

 # skip the initial encoding token and the end tokens
 tokens = list(_tokenize(readline(), encoding='utf-8'))[:-2]
-expected_tokens = [TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
+expected_tokens = [TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"\n')]
 self.assertEqual(tokens, expected_tokens,
                  "bytes not decoded with encoding")

@@ -2669,7 +2669,8 @@ def generate_source(indents):

 valid = generate_source(MAXINDENT - 1)
 tokens = list(_generate_tokens_from_c_tokenizer(valid))
-self.assertEqual(tokens[-1].type, DEDENT)
+self.assertEqual(tokens[-2].type, DEDENT)
+self.assertEqual(tokens[-1].type, ENDMARKER)
 compile(valid, "<string>", "exec")

 invalid = generate_source(MAXINDENT)

Lib/tokenize.py (-5)
@@ -447,13 +447,8 @@ def tokenize(readline):

 def _tokenize(rl_gen, encoding):
     source = b"".join(rl_gen).decode(encoding)
-    token = None
     for token in _generate_tokens_from_c_tokenizer(source, extra_tokens=True):
         yield token
-    if token is not None:
-        last_line, _ = token.start
-        yield TokenInfo(ENDMARKER, '', (last_line + 1, 0), (last_line + 1, 0), '')
-

 def generate_tokens(readline):
     """Tokenize a source reading Python code as unicode strings.
NEWS entry (new file, +3)

@@ -0,0 +1,3 @@
+Ensure that trailing ``DEDENT`` :class:`tokenize.TokenInfo` objects emitted
+by the :mod:`tokenize` module are reported as in Python 3.11. Patch by Pablo
+Galindo
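
As the updated deep-indentation test above shows, the token stream now closes
with the DEDENT token(s) followed by an ENDMARKER rather than stopping at the
DEDENT, so code that inspected tokens[-1] for the dedent must look at
tokens[-2] instead. A small sketch of that invariant using the public
generate_tokens() API (the test itself exercises the private
_generate_tokens_from_c_tokenizer helper):

    import io
    import tokenize

    source = "if True:\n    x = 1\n"
    tokens = list(tokenize.generate_tokens(io.StringIO(source).readline))

    # The dedent closing the block is the second-to-last token; the stream
    # always ends with an explicit ENDMARKER.
    assert tokens[-2].type == tokenize.DEDENT
    assert tokens[-1].type == tokenize.ENDMARKER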

Python/Python-tokenize.c (+11 -1)
@@ -30,6 +30,7 @@ class _tokenizer.tokenizeriter "tokenizeriterobject *" "_tokenize_get_state_by_t
 typedef struct
 {
     PyObject_HEAD struct tok_state *tok;
+    int done;
 } tokenizeriterobject;

 /*[clinic input]
@@ -63,6 +64,7 @@ tokenizeriter_new_impl(PyTypeObject *type, const char *source,
     if (extra_tokens) {
         self->tok->tok_extra_tokens = 1;
     }
+    self->done = 0;
     return (PyObject *)self;
 }

@@ -179,8 +181,9 @@ tokenizeriter_next(tokenizeriterobject *it)
         }
         goto exit;
     }
-    if (type == ERRORTOKEN || type == ENDMARKER) {
+    if (it->done || type == ERRORTOKEN) {
         PyErr_SetString(PyExc_StopIteration, "EOF");
+        it->done = 1;
         goto exit;
     }
     PyObject *str = NULL;
@@ -215,6 +218,10 @@ tokenizeriter_next(tokenizeriterobject *it)
     }

     if (it->tok->tok_extra_tokens) {
+        if (type == ENDMARKER || (type == DEDENT && it->tok->done == E_EOF)) {
+            lineno = end_lineno = lineno + 1;
+            col_offset = end_col_offset = 0;
+        }
         // Necessary adjustments to match the original Python tokenize
         // implementation
         if (type > DEDENT && type < OP) {
@@ -232,6 +239,9 @@ tokenizeriter_next(tokenizeriterobject *it)
     result = Py_BuildValue("(iN(nn)(nn)N)", type, str, lineno, col_offset, end_lineno, end_col_offset, line);
 exit:
     _PyToken_Free(&token);
+    if (type == ENDMARKER) {
+        it->done = 1;
+    }
     return result;
 }
