Commit 19a58c5

gh-104976: Ensure trailing dedent tokens are emitted as the previous tokenizer
Signed-off-by: Pablo Galindo <[email protected]>
1 parent 2cb4456 commit 19a58c5
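
In behavioral terms, trailing DEDENT tokens for a source that ends inside an
indented block are again reported on the line after the last input line, at
column 0, followed by an explicit ENDMARKER, matching what the pure-Python
tokenizer did in 3.11. A minimal sketch of the expected tail of the token
stream (not part of the commit; it only illustrates the behavior the tests
below assert):

    import io
    import tokenize

    # A two-line snippet that ends while still inside an indented block.
    source = "def f():\n    pass\n"
    tokens = list(tokenize.generate_tokens(io.StringIO(source).readline))

    for tok in tokens[-3:]:
        print(tokenize.tok_name[tok.type], tok.start, tok.end)
    # Expected tail with this fix (as in 3.11):
    #   NEWLINE   (2, 8) (2, 9)
    #   DEDENT    (3, 0) (3, 0)   <- start of the line after the source
    #   ENDMARKER (3, 0) (3, 0)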

File tree

4 files changed: +23 -14 lines changed

Lib/test/test_tokenize.py (+9 -8)
@@ -82,7 +82,7 @@ def test_basic(self):
 NAME 'False' (4, 11) (4, 16)
 COMMENT '# NEWLINE' (4, 17) (4, 26)
 NEWLINE '\\n' (4, 26) (4, 27)
-DEDENT '' (4, 27) (4, 27)
+DEDENT '' (5, 0) (5, 0)
 """)
 indent_error_file = b"""\
 def k(x):
@@ -755,8 +755,8 @@ def test_tabs(self):
 NEWLINE '\\n' (2, 5) (2, 6)
 INDENT ' \\t' (3, 0) (3, 9)
 NAME 'pass' (3, 9) (3, 13)
-DEDENT '' (3, 14) (3, 14)
-DEDENT '' (3, 14) (3, 14)
+DEDENT '' (4, 0) (4, 0)
+DEDENT '' (4, 0) (4, 0)
 """)

 def test_non_ascii_identifiers(self):
@@ -968,7 +968,7 @@ async def foo():
 NUMBER '1' (2, 17) (2, 18)
 OP ':' (2, 18) (2, 19)
 NAME 'pass' (2, 20) (2, 24)
-DEDENT '' (2, 25) (2, 25)
+DEDENT '' (3, 0) (3, 0)
 """)

 self.check_tokenize('''async def foo(async): await''', """\
@@ -1016,7 +1016,7 @@ async def bar(): pass
 NAME 'await' (6, 2) (6, 7)
 OP '=' (6, 8) (6, 9)
 NUMBER '2' (6, 10) (6, 11)
-DEDENT '' (6, 12) (6, 12)
+DEDENT '' (7, 0) (7, 0)
 """)

 self.check_tokenize('''\
@@ -1054,7 +1054,7 @@ async def bar(): pass
 NAME 'await' (6, 2) (6, 7)
 OP '=' (6, 8) (6, 9)
 NUMBER '2' (6, 10) (6, 11)
-DEDENT '' (6, 12) (6, 12)
+DEDENT '' (7, 0) (7, 0)
 """)

 def test_newline_after_parenthesized_block_with_comment(self):
@@ -1174,7 +1174,7 @@ def readline():

 # skip the initial encoding token and the end tokens
 tokens = list(_tokenize(readline(), encoding='utf-8'))[:-2]
-expected_tokens = [TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
+expected_tokens = [TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"\n')]
 self.assertEqual(tokens, expected_tokens,
                  "bytes not decoded with encoding")

@@ -2669,7 +2669,8 @@ def generate_source(indents):

 valid = generate_source(MAXINDENT - 1)
 tokens = list(_generate_tokens_from_c_tokenizer(valid))
-self.assertEqual(tokens[-1].type, DEDENT)
+self.assertEqual(tokens[-2].type, DEDENT)
+self.assertEqual(tokens[-1].type, ENDMARKER)
 compile(valid, "<string>", "exec")

 invalid = generate_source(MAXINDENT)

Lib/tokenize.py (-5)
@@ -447,13 +447,8 @@ def tokenize(readline):

 def _tokenize(rl_gen, encoding):
     source = b"".join(rl_gen).decode(encoding)
-    token = None
     for token in _generate_tokens_from_c_tokenizer(source, extra_tokens=True):
         yield token
-    if token is not None:
-        last_line, _ = token.start
-        yield TokenInfo(ENDMARKER, '', (last_line + 1, 0), (last_line + 1, 0), '')
-

 def generate_tokens(readline):
     """Tokenize a source reading Python code as unicode strings.
NEWS entry (new file, +3)

@@ -0,0 +1,3 @@
+Ensure that trailing ``DEDENT`` :class:`tokenize.TokenInfo` objects emitted
+by the :mod:`tokenize` module are reported as in Python 3.11. Patch by Pablo
+Galindo
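
As the updated deep-indentation test above shows, the token stream now closes
with the DEDENT token(s) followed by an ENDMARKER rather than stopping at the
DEDENT, so code that inspected tokens[-1] for the dedent must look at
tokens[-2] instead. A small sketch of that invariant using the public
generate_tokens() API (the test itself exercises the private
_generate_tokens_from_c_tokenizer helper):

    import io
    import tokenize

    source = "if True:\n    x = 1\n"
    tokens = list(tokenize.generate_tokens(io.StringIO(source).readline))

    # The dedent closing the block is the second-to-last token; the stream
    # always ends with an explicit ENDMARKER.
    assert tokens[-2].type == tokenize.DEDENT
    assert tokens[-1].type == tokenize.ENDMARKER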

Python/Python-tokenize.c (+11 -1)
@@ -30,6 +30,7 @@ class _tokenizer.tokenizeriter "tokenizeriterobject *" "_tokenize_get_state_by_t
 typedef struct
 {
     PyObject_HEAD struct tok_state *tok;
+    int done;
 } tokenizeriterobject;

 /*[clinic input]
@@ -63,6 +64,7 @@ tokenizeriter_new_impl(PyTypeObject *type, const char *source,
     if (extra_tokens) {
         self->tok->tok_extra_tokens = 1;
     }
+    self->done = 0;
     return (PyObject *)self;
 }

@@ -179,8 +181,9 @@ tokenizeriter_next(tokenizeriterobject *it)
         }
         goto exit;
     }
-    if (type == ERRORTOKEN || type == ENDMARKER) {
+    if (it->done || type == ERRORTOKEN) {
         PyErr_SetString(PyExc_StopIteration, "EOF");
+        it->done = 1;
         goto exit;
     }
     PyObject *str = NULL;
@@ -215,6 +218,10 @@ tokenizeriter_next(tokenizeriterobject *it)
     }

     if (it->tok->tok_extra_tokens) {
+        if (type == ENDMARKER || (type == DEDENT && it->tok->done == E_EOF)) {
+            lineno = end_lineno = lineno + 1;
+            col_offset = end_col_offset = 0;
+        }
         // Necessary adjustments to match the original Python tokenize
         // implementation
         if (type > DEDENT && type < OP) {
@@ -232,6 +239,9 @@ tokenizeriter_next(tokenizeriterobject *it)
     result = Py_BuildValue("(iN(nn)(nn)N)", type, str, lineno, col_offset, end_lineno, end_col_offset, line);
 exit:
     _PyToken_Free(&token);
+    if (type == ENDMARKER) {
+        it->done = 1;
+    }
     return result;
 }
