Skip to content

Commit b8f2ff0

Browse files
hugovk and tomasr8
authored
[3.12] gh-125553: Fix backslash continuation in untokenize (GH-126010) (#130579)
(cherry picked from commit 7ad793e) Co-authored-by: Tomas R. <[email protected]>
1 parent 245ca26 commit b8f2ff0

File tree

3 files changed

+53
-11
lines changed

3 files changed

+53
-11
lines changed

Lib/test/test_tokenize.py

Lines changed: 31 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,20 @@
1-
from test import support
2-
from test.support import os_helper
1+
import os
2+
import re
3+
import token
4+
import unittest
35
from tokenize import (tokenize, untokenize, NUMBER, NAME, OP,
46
STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
57
open as tokenize_open, Untokenizer, generate_tokens,
68
NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT, TokenInfo,
79
TokenError)
810
from io import BytesIO, StringIO
9-
import unittest
1011
from textwrap import dedent
1112
from unittest import TestCase, mock
13+
from test import support
1214
from test.test_grammar import (VALID_UNDERSCORE_LITERALS,
1315
INVALID_UNDERSCORE_LITERALS)
1416
from test.support import os_helper
1517
from test.support.script_helper import run_test_script, make_script, run_python_until_end
16-
import os
17-
import token
1818

1919
# Converts a source string into a list of textual representation
2020
# of the tokens such as:
@@ -1816,6 +1816,22 @@ def test_iter_compat(self):
18161816
self.assertEqual(untokenize(iter(tokens)), b'Hello ')
18171817

18181818

1819+
def contains_ambiguous_backslash(source):
1820+
"""Return `True` if the source contains a backslash on a
1821+
line by itself. For example:
1822+
1823+
a = (1
1824+
\\
1825+
)
1826+
1827+
Code like this cannot be untokenized exactly. This is because
1828+
the tokenizer does not produce any tokens for the line containing
1829+
the backslash and so there is no way to know its indent.
1830+
"""
1831+
pattern = re.compile(br'\n\s*\\\r?\n')
1832+
return pattern.search(source) is not None
1833+
1834+
18191835
class TestRoundtrip(TestCase):
18201836

18211837
def check_roundtrip(self, f):
@@ -1826,6 +1842,9 @@ def check_roundtrip(self, f):
18261842
tokenize.untokenize(), and the latter tokenized again to 2-tuples.
18271843
The test fails if the 3 pair tokenizations do not match.
18281844
1845+
If the source code can be untokenized unambiguously, the
1846+
untokenized code must match the original code exactly.
1847+
18291848
When untokenize bugs are fixed, untokenize with 5-tuples should
18301849
reproduce code that does not contain a backslash continuation
18311850
following spaces. A proper test should test this.
@@ -1849,6 +1868,13 @@ def check_roundtrip(self, f):
18491868
tokens2_from5 = [tok[:2] for tok in tokenize(readline5)]
18501869
self.assertEqual(tokens2_from5, tokens2)
18511870

1871+
if not contains_ambiguous_backslash(code):
1872+
# The BOM does not produce a token so there is no way to preserve it.
1873+
code_without_bom = code.removeprefix(b'\xef\xbb\xbf')
1874+
readline = iter(code_without_bom.splitlines(keepends=True)).__next__
1875+
untokenized_code = untokenize(tokenize(readline))
1876+
self.assertEqual(code_without_bom, untokenized_code)
1877+
18521878
def check_line_extraction(self, f):
18531879
if isinstance(f, str):
18541880
code = f.encode('utf-8')

Lib/tokenize.py

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -171,21 +171,36 @@ def __init__(self):
171171
self.prev_row = 1
172172
self.prev_col = 0
173173
self.prev_type = None
174+
self.prev_line = ""
174175
self.encoding = None
175176

176177
def add_whitespace(self, start):
177178
row, col = start
178179
if row < self.prev_row or row == self.prev_row and col < self.prev_col:
179180
raise ValueError("start ({},{}) precedes previous end ({},{})"
180181
.format(row, col, self.prev_row, self.prev_col))
181-
row_offset = row - self.prev_row
182-
if row_offset:
183-
self.tokens.append("\\\n" * row_offset)
184-
self.prev_col = 0
182+
self.add_backslash_continuation(start)
185183
col_offset = col - self.prev_col
186184
if col_offset:
187185
self.tokens.append(" " * col_offset)
188186

187+
def add_backslash_continuation(self, start):
188+
"""Add backslash continuation characters if the row has increased
189+
without encountering a newline token.
190+
191+
This also inserts the correct amount of whitespace before the backslash.
192+
"""
193+
row = start[0]
194+
row_offset = row - self.prev_row
195+
if row_offset == 0:
196+
return
197+
198+
newline = '\r\n' if self.prev_line.endswith('\r\n') else '\n'
199+
line = self.prev_line.rstrip('\\\r\n')
200+
ws = ''.join(_itertools.takewhile(str.isspace, reversed(line)))
201+
self.tokens.append(ws + f"\\{newline}" * row_offset)
202+
self.prev_col = 0
203+
189204
def escape_brackets(self, token):
190205
characters = []
191206
consume_until_next_bracket = False
@@ -245,8 +260,6 @@ def untokenize(self, iterable):
245260
end_line, end_col = end
246261
extra_chars = last_line.count("{{") + last_line.count("}}")
247262
end = (end_line, end_col + extra_chars)
248-
elif tok_type in (STRING, FSTRING_START) and self.prev_type in (STRING, FSTRING_END):
249-
self.tokens.append(" ")
250263

251264
self.add_whitespace(start)
252265
self.tokens.append(token)
@@ -255,6 +268,7 @@ def untokenize(self, iterable):
255268
self.prev_row += 1
256269
self.prev_col = 0
257270
self.prev_type = tok_type
271+
self.prev_line = line
258272
return "".join(self.tokens)
259273

260274
def compat(self, token, iterable):
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Fix round-trip invariance for backslash continuations in
2+
:func:`tokenize.untokenize`.

0 commit comments

Comments (0)