Skip to content

Commit 96fff35

Browse files
gh-105017: Include CRLF lines in strings and column numbers (#105030)
Co-authored-by: Pablo Galindo <[email protected]>
1 parent 3821b92 commit 96fff35

File tree

6 files changed: +74 lines added, -26 lines removed

Lib/test/test_tokenize.py

+36 -8
Original file line numberDiff line numberDiff line change
@@ -85,11 +85,29 @@ def test_basic(self):
8585
DEDENT '' (5, 0) (5, 0)
8686
""")
8787

88-
self.check_tokenize("foo='bar'\r\n", """\
89-
NAME 'foo' (1, 0) (1, 3)
90-
OP '=' (1, 3) (1, 4)
91-
STRING "'bar'" (1, 4) (1, 9)
92-
NEWLINE '\\n' (1, 9) (1, 10)
88+
self.check_tokenize("if True:\r\n # NL\r\n foo='bar'\r\n\r\n", """\
89+
NAME 'if' (1, 0) (1, 2)
90+
NAME 'True' (1, 3) (1, 7)
91+
OP ':' (1, 7) (1, 8)
92+
NEWLINE '\\r\\n' (1, 8) (1, 10)
93+
COMMENT '# NL' (2, 4) (2, 8)
94+
NL '\\r\\n' (2, 8) (2, 10)
95+
INDENT ' ' (3, 0) (3, 4)
96+
NAME 'foo' (3, 4) (3, 7)
97+
OP '=' (3, 7) (3, 8)
98+
STRING "\'bar\'" (3, 8) (3, 13)
99+
NEWLINE '\\r\\n' (3, 13) (3, 15)
100+
NL '\\r\\n' (4, 0) (4, 2)
101+
DEDENT '' (5, 0) (5, 0)
102+
""")
103+
104+
self.check_tokenize("x = 1 + \\\r\n1\r\n", """\
105+
NAME 'x' (1, 0) (1, 1)
106+
OP '=' (1, 2) (1, 3)
107+
NUMBER '1' (1, 4) (1, 5)
108+
OP '+' (1, 6) (1, 7)
109+
NUMBER '1' (2, 0) (2, 1)
110+
NEWLINE '\\r\\n' (2, 1) (2, 3)
93111
""")
94112

95113
indent_error_file = b"""\
@@ -1784,9 +1802,9 @@ def test_random_files(self):
17841802
if support.verbose >= 2:
17851803
print('tokenize', testfile)
17861804
with open(testfile, 'rb') as f:
1787-
# with self.subTest(file=testfile):
1788-
self.check_roundtrip(f)
1789-
self.check_line_extraction(f)
1805+
with self.subTest(file=testfile):
1806+
self.check_roundtrip(f)
1807+
self.check_line_extraction(f)
17901808

17911809

17921810
def roundtrip(self, code):
@@ -2084,6 +2102,10 @@ def test_string(self):
20842102
b\
20852103
c"""', """\
20862104
STRING 'rb"\""a\\\\\\nb\\\\\\nc"\""' (1, 0) (3, 4)
2105+
""")
2106+
2107+
self.check_tokenize(r'"hola\\\r\ndfgf"', """\
2108+
STRING \'"hola\\\\\\\\\\\\r\\\\ndfgf"\' (1, 0) (1, 16)
20872109
""")
20882110

20892111
self.check_tokenize('f"abc"', """\
@@ -2120,6 +2142,12 @@ def test_string(self):
21202142
FSTRING_START 'Rf"' (1, 0) (1, 3)
21212143
FSTRING_MIDDLE 'abc\\\\\\ndef' (1, 3) (2, 3)
21222144
FSTRING_END '"' (2, 3) (2, 4)
2145+
""")
2146+
2147+
self.check_tokenize(r'f"hola\\\r\ndfgf"', """\
2148+
FSTRING_START \'f"\' (1, 0) (1, 2)
2149+
FSTRING_MIDDLE 'hola\\\\\\\\\\\\r\\\\ndfgf' (1, 2) (1, 16)
2150+
FSTRING_END \'"\' (1, 16) (1, 17)
21232151
""")
21242152

21252153
def test_function(self):
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Show CRLF lines in the tokenize string attribute in both NL and NEWLINE tokens. Patch by Marta Gómez.

Parser/pegen.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -924,9 +924,9 @@ _PyPegen_run_parser_from_string(const char *str, int start_rule, PyObject *filen
924924

925925
struct tok_state *tok;
926926
if (flags != NULL && flags->cf_flags & PyCF_IGNORE_COOKIE) {
927-
tok = _PyTokenizer_FromUTF8(str, exec_input);
927+
tok = _PyTokenizer_FromUTF8(str, exec_input, 0);
928928
} else {
929-
tok = _PyTokenizer_FromString(str, exec_input);
929+
tok = _PyTokenizer_FromString(str, exec_input, 0);
930930
}
931931
if (tok == NULL) {
932932
if (PyErr_Occurred()) {

Parser/tokenizer.c

+26-12
Original file line numberDiff line numberDiff line change
@@ -772,7 +772,8 @@ translate_into_utf8(const char* str, const char* enc) {
772772

773773

774774
static char *
775-
translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
775+
translate_newlines(const char *s, int exec_input, int preserve_crlf,
776+
struct tok_state *tok) {
776777
int skip_next_lf = 0;
777778
size_t needed_length = strlen(s) + 2, final_length;
778779
char *buf, *current;
@@ -792,7 +793,7 @@ translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
792793
break;
793794
}
794795
}
795-
if (c == '\r') {
796+
if (!preserve_crlf && c == '\r') {
796797
skip_next_lf = 1;
797798
c = '\n';
798799
}
@@ -822,14 +823,14 @@ translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
822823
inside TOK. */
823824

824825
static char *
825-
decode_str(const char *input, int single, struct tok_state *tok)
826+
decode_str(const char *input, int single, struct tok_state *tok, int preserve_crlf)
826827
{
827828
PyObject* utf8 = NULL;
828829
char *str;
829830
const char *s;
830831
const char *newl[2] = {NULL, NULL};
831832
int lineno = 0;
832-
tok->input = str = translate_newlines(input, single, tok);
833+
tok->input = str = translate_newlines(input, single, preserve_crlf, tok);
833834
if (str == NULL)
834835
return NULL;
835836
tok->enc = NULL;
@@ -881,14 +882,14 @@ decode_str(const char *input, int single, struct tok_state *tok)
881882
/* Set up tokenizer for string */
882883

883884
struct tok_state *
884-
_PyTokenizer_FromString(const char *str, int exec_input)
885+
_PyTokenizer_FromString(const char *str, int exec_input, int preserve_crlf)
885886
{
886887
struct tok_state *tok = tok_new();
887888
char *decoded;
888889

889890
if (tok == NULL)
890891
return NULL;
891-
decoded = decode_str(str, exec_input, tok);
892+
decoded = decode_str(str, exec_input, tok, preserve_crlf);
892893
if (decoded == NULL) {
893894
_PyTokenizer_Free(tok);
894895
return NULL;
@@ -902,13 +903,13 @@ _PyTokenizer_FromString(const char *str, int exec_input)
902903
/* Set up tokenizer for UTF-8 string */
903904

904905
struct tok_state *
905-
_PyTokenizer_FromUTF8(const char *str, int exec_input)
906+
_PyTokenizer_FromUTF8(const char *str, int exec_input, int preserve_crlf)
906907
{
907908
struct tok_state *tok = tok_new();
908909
char *translated;
909910
if (tok == NULL)
910911
return NULL;
911-
tok->input = translated = translate_newlines(str, exec_input, tok);
912+
tok->input = translated = translate_newlines(str, exec_input, preserve_crlf, tok);
912913
if (translated == NULL) {
913914
_PyTokenizer_Free(tok);
914915
return NULL;
@@ -1050,7 +1051,7 @@ tok_underflow_interactive(struct tok_state *tok) {
10501051
}
10511052
char *newtok = PyOS_Readline(tok->fp ? tok->fp : stdin, stdout, tok->prompt);
10521053
if (newtok != NULL) {
1053-
char *translated = translate_newlines(newtok, 0, tok);
1054+
char *translated = translate_newlines(newtok, 0, 0, tok);
10541055
PyMem_Free(newtok);
10551056
if (translated == NULL) {
10561057
return 0;
@@ -1594,6 +1595,9 @@ tok_decimal_tail(struct tok_state *tok)
15941595
static inline int
15951596
tok_continuation_line(struct tok_state *tok) {
15961597
int c = tok_nextc(tok);
1598+
if (c == '\r') {
1599+
c = tok_nextc(tok);
1600+
}
15971601
if (c != '\n') {
15981602
tok->done = E_LINECONT;
15991603
return -1;
@@ -1693,7 +1697,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
16931697
}
16941698
}
16951699
tok_backup(tok, c);
1696-
if (c == '#' || c == '\n') {
1700+
if (c == '#' || c == '\n' || c == '\r') {
16971701
/* Lines with only whitespace and/or comments
16981702
shouldn't affect the indentation and are
16991703
not passed to the parser as NEWLINE tokens,
@@ -1822,7 +1826,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
18221826
const char *prefix, *type_start;
18231827
int current_starting_col_offset;
18241828

1825-
while (c != EOF && c != '\n') {
1829+
while (c != EOF && c != '\n' && c != '\r') {
18261830
c = tok_nextc(tok);
18271831
}
18281832

@@ -2002,6 +2006,10 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
20022006
return MAKE_TOKEN(NAME);
20032007
}
20042008

2009+
if (c == '\r') {
2010+
c = tok_nextc(tok);
2011+
}
2012+
20052013
/* Newline */
20062014
if (c == '\n') {
20072015
tok->atbol = 1;
@@ -2405,7 +2413,10 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
24052413
else {
24062414
end_quote_size = 0;
24072415
if (c == '\\') {
2408-
tok_nextc(tok); /* skip escaped char */
2416+
c = tok_nextc(tok); /* skip escaped char */
2417+
if (c == '\r') {
2418+
c = tok_nextc(tok);
2419+
}
24092420
}
24102421
}
24112422
}
@@ -2696,6 +2707,9 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct
26962707
return MAKE_TOKEN(FSTRING_MIDDLE);
26972708
} else if (c == '\\') {
26982709
int peek = tok_nextc(tok);
2710+
if (peek == '\r') {
2711+
peek = tok_nextc(tok);
2712+
}
26992713
// Special case when the backslash is right before a curly
27002714
// brace. We have to restore and return the control back
27012715
// to the loop for the next iteration.

Parser/tokenizer.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -135,8 +135,8 @@ struct tok_state {
135135
#endif
136136
};
137137

138-
extern struct tok_state *_PyTokenizer_FromString(const char *, int);
139-
extern struct tok_state *_PyTokenizer_FromUTF8(const char *, int);
138+
extern struct tok_state *_PyTokenizer_FromString(const char *, int, int);
139+
extern struct tok_state *_PyTokenizer_FromUTF8(const char *, int, int);
140140
extern struct tok_state *_PyTokenizer_FromFile(FILE *, const char*,
141141
const char *, const char *);
142142
extern void _PyTokenizer_Free(struct tok_state *);

Python/Python-tokenize.c

+7-2
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ tokenizeriter_new_impl(PyTypeObject *type, const char *source,
5555
if (filename == NULL) {
5656
return NULL;
5757
}
58-
self->tok = _PyTokenizer_FromUTF8(source, 1);
58+
self->tok = _PyTokenizer_FromUTF8(source, 1, 1);
5959
if (self->tok == NULL) {
6060
Py_DECREF(filename);
6161
return NULL;
@@ -240,7 +240,12 @@ tokenizeriter_next(tokenizeriterobject *it)
240240
type = NAME;
241241
}
242242
else if (type == NEWLINE) {
243-
str = PyUnicode_FromString("\n");
243+
Py_DECREF(str);
244+
if (it->tok->start[0] == '\r') {
245+
str = PyUnicode_FromString("\r\n");
246+
} else {
247+
str = PyUnicode_FromString("\n");
248+
}
244249
end_col_offset++;
245250
}
246251
}

0 commit comments

Comments (0)