@@ -1,20 +1,20 @@
-from test import support
-from test.support import os_helper
+import os
+import re
+import token
+import unittest
 from tokenize import (tokenize, untokenize, NUMBER, NAME, OP,
                      STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
                      open as tokenize_open, Untokenizer, generate_tokens,
                      NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT, TokenInfo,
                      TokenError)
 from io import BytesIO, StringIO
-import unittest
 from textwrap import dedent
 from unittest import TestCase, mock
+from test import support
 from test.test_grammar import (VALID_UNDERSCORE_LITERALS,
                                INVALID_UNDERSCORE_LITERALS)
 from test.support import os_helper
 from test.support.script_helper import run_test_script, make_script, run_python_until_end
-import os
-import token
 
 
 # Converts a source string into a list of textual representation
 # of the tokens such as:
@@ -1816,6 +1816,22 @@ def test_iter_compat(self):
         self.assertEqual(untokenize(iter(tokens)), b'Hello ')
 
 
+def contains_ambiguous_backslash(source):
+    """Return `True` if the source contains a backslash on a
+    line by itself. For example:
+
+    a = (1
+        \\
+    )
+
+    Code like this cannot be untokenized exactly. This is because
+    the tokenizer does not produce any tokens for the line containing
+    the backslash and so there is no way to know its indent.
+    """
+    pattern = re.compile(br'\n\s*\\\r?\n')
+    return pattern.search(source) is not None
+
+
 class TestRoundtrip(TestCase):
 
     def check_roundtrip(self, f):
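
An illustrative sketch (not part of the patch), assuming only the stdlib tokenize module, of the ambiguity the helper detects: the tokenizer emits no token for a line that holds nothing but a backslash continuation, so untokenize() has no record of that line's indentation.

    import io
    import re
    from tokenize import tokenize, untokenize

    source = b"a = (1\n    \\\n)\n"   # backslash on a line by itself

    # Same pattern as contains_ambiguous_backslash() above.
    print(re.search(br'\n\s*\\\r?\n', source) is not None)   # True

    readline = io.BytesIO(source).readline
    roundtripped = untokenize(tokenize(readline))
    # The result is equivalent Python, but the lone-backslash line loses its
    # indentation, so an exact byte-for-byte match is not guaranteed.
    print(roundtripped == source)
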
@@ -1826,6 +1842,9 @@ def check_roundtrip(self, f):
         tokenize.untokenize(), and the latter tokenized again to 2-tuples.
         The test fails if the 3 pair tokenizations do not match.
 
+        If the source code can be untokenized unambiguously, the
+        untokenized code must match the original code exactly.
+
         When untokenize bugs are fixed, untokenize with 5-tuples should
         reproduce code that does not contain a backslash continuation
         following spaces. A proper test should test this.
@@ -1849,6 +1868,13 @@ def check_roundtrip(self, f):
         tokens2_from5 = [tok[:2] for tok in tokenize(readline5)]
         self.assertEqual(tokens2_from5, tokens2)
 
+        if not contains_ambiguous_backslash(code):
+            # The BOM does not produce a token so there is no way to preserve it.
+            code_without_bom = code.removeprefix(b'\xef\xbb\xbf')
+            readline = iter(code_without_bom.splitlines(keepends=True)).__next__
+            untokenized_code = untokenize(tokenize(readline))
+            self.assertEqual(code_without_bom, untokenized_code)
+
     def check_line_extraction(self, f):
         if isinstance(f, str):
             code = f.encode('utf-8')
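
A hedged usage sketch of the exact-roundtrip invariant the new check enforces, mirroring the lines added above; the helper name roundtrips_exactly is hypothetical and not part of the test suite.

    from tokenize import tokenize, untokenize

    def roundtrips_exactly(code):
        # The UTF-8 BOM produces no token, so it cannot survive untokenize().
        code_without_bom = code.removeprefix(b'\xef\xbb\xbf')
        readline = iter(code_without_bom.splitlines(keepends=True)).__next__
        return untokenize(tokenize(readline)) == code_without_bom

    print(roundtrips_exactly(b'\xef\xbb\xbfx = 1  # comment\n'))  # expected True
    print(roundtrips_exactly(b'a = (1\n    \\\n)\n'))             # expected False: ambiguous backslash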