[3.12] gh-105017: Include CRLF lines in strings and column numbers (GH-105030) #105041

Merged
merged 1 commit on May 28, 2023

44 changes: 36 additions & 8 deletions Lib/test/test_tokenize.py
@@ -85,11 +85,29 @@ def test_basic(self):
DEDENT '' (5, 0) (5, 0)
""")

self.check_tokenize("foo='bar'\r\n", """\
NAME 'foo' (1, 0) (1, 3)
OP '=' (1, 3) (1, 4)
STRING "'bar'" (1, 4) (1, 9)
NEWLINE '\\n' (1, 9) (1, 10)
self.check_tokenize("if True:\r\n # NL\r\n foo='bar'\r\n\r\n", """\
NAME 'if' (1, 0) (1, 2)
NAME 'True' (1, 3) (1, 7)
OP ':' (1, 7) (1, 8)
NEWLINE '\\r\\n' (1, 8) (1, 10)
COMMENT '# NL' (2, 4) (2, 8)
NL '\\r\\n' (2, 8) (2, 10)
INDENT ' ' (3, 0) (3, 4)
NAME 'foo' (3, 4) (3, 7)
OP '=' (3, 7) (3, 8)
STRING "\'bar\'" (3, 8) (3, 13)
NEWLINE '\\r\\n' (3, 13) (3, 15)
NL '\\r\\n' (4, 0) (4, 2)
DEDENT '' (5, 0) (5, 0)
""")

self.check_tokenize("x = 1 + \\\r\n1\r\n", """\
NAME 'x' (1, 0) (1, 1)
OP '=' (1, 2) (1, 3)
NUMBER '1' (1, 4) (1, 5)
OP '+' (1, 6) (1, 7)
NUMBER '1' (2, 0) (2, 1)
NEWLINE '\\r\\n' (2, 1) (2, 3)
""")

indent_error_file = b"""\
@@ -1784,9 +1802,9 @@ def test_random_files(self):
if support.verbose >= 2:
print('tokenize', testfile)
with open(testfile, 'rb') as f:
# with self.subTest(file=testfile):
self.check_roundtrip(f)
self.check_line_extraction(f)
with self.subTest(file=testfile):
self.check_roundtrip(f)
self.check_line_extraction(f)


def roundtrip(self, code):
@@ -2084,6 +2102,10 @@ def test_string(self):
b\
c"""', """\
STRING 'rb"\""a\\\\\\nb\\\\\\nc"\""' (1, 0) (3, 4)
""")

self.check_tokenize(r'"hola\\\r\ndfgf"', """\
STRING \'"hola\\\\\\\\\\\\r\\\\ndfgf"\' (1, 0) (1, 16)
""")

self.check_tokenize('f"abc"', """\
@@ -2120,6 +2142,12 @@ def test_string(self):
FSTRING_START 'Rf"' (1, 0) (1, 3)
FSTRING_MIDDLE 'abc\\\\\\ndef' (1, 3) (2, 3)
FSTRING_END '"' (2, 3) (2, 4)
""")

self.check_tokenize(r'f"hola\\\r\ndfgf"', """\
FSTRING_START \'f"\' (1, 0) (1, 2)
FSTRING_MIDDLE 'hola\\\\\\\\\\\\r\\\\ndfgf' (1, 2) (1, 16)
FSTRING_END \'"\' (1, 16) (1, 17)
""")

def test_function(self):
1 change: 1 addition & 0 deletions Misc/NEWS.d (new entry)
@@ -0,0 +1 @@
Show CRLF lines in the tokenize string attribute in both NL and NEWLINE tokens. Patch by Marta Gómez.
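
A quick illustration of the user-visible effect (a sketch, not part of the patch; assumes Python 3.12 with this fix, where the tokenize module is backed by the C tokenizer): NL and NEWLINE tokens now carry the literal '\r\n' they matched, and their end columns cover both characters.

    import io
    import tokenize

    # CRLF-terminated source, similar to the new test case above.
    source = "if True:\r\n    # NL\r\n    foo = 'bar'\r\n\r\n"
    for tok in tokenize.generate_tokens(io.StringIO(source).readline):
        if tok.type in (tokenize.NEWLINE, tokenize.NL):
            # With the fix, tok.string is '\r\n' (previously '\n') and
            # tok.end accounts for both characters.
            print(tokenize.tok_name[tok.type], repr(tok.string), tok.start, tok.end)
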
4 changes: 2 additions & 2 deletions Parser/pegen.c
@@ -924,9 +924,9 @@ _PyPegen_run_parser_from_string(const char *str, int start_rule, PyObject *filen

struct tok_state *tok;
if (flags != NULL && flags->cf_flags & PyCF_IGNORE_COOKIE) {
tok = _PyTokenizer_FromUTF8(str, exec_input);
tok = _PyTokenizer_FromUTF8(str, exec_input, 0);
} else {
tok = _PyTokenizer_FromString(str, exec_input);
tok = _PyTokenizer_FromString(str, exec_input, 0);
}
if (tok == NULL) {
if (PyErr_Occurred()) {
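
Worth noting (inferred from the call sites in this hunk, not spelled out elsewhere in the diff): the parser keeps passing 0 for the new preserve_crlf argument, so compilation still normalizes '\r\n' to '\n'; only the tokenize module (see Python/Python-tokenize.c below) opts in. A minimal sanity check:

    # Compiling CRLF source is unaffected by this patch.
    ns = {}
    exec(compile("x = 1\r\nif x:\r\n    x += 1\r\n", "<demo>", "exec"), ns)
    assert ns["x"] == 2
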
38 changes: 26 additions & 12 deletions Parser/tokenizer.c
@@ -772,7 +772,8 @@ translate_into_utf8(const char* str, const char* enc) {


static char *
translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
translate_newlines(const char *s, int exec_input, int preserve_crlf,
struct tok_state *tok) {
int skip_next_lf = 0;
size_t needed_length = strlen(s) + 2, final_length;
char *buf, *current;
@@ -792,7 +793,7 @@ translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
break;
}
}
if (c == '\r') {
if (!preserve_crlf && c == '\r') {
skip_next_lf = 1;
c = '\n';
}
@@ -822,14 +823,14 @@ translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
inside TOK. */

static char *
decode_str(const char *input, int single, struct tok_state *tok)
decode_str(const char *input, int single, struct tok_state *tok, int preserve_crlf)
{
PyObject* utf8 = NULL;
char *str;
const char *s;
const char *newl[2] = {NULL, NULL};
int lineno = 0;
tok->input = str = translate_newlines(input, single, tok);
tok->input = str = translate_newlines(input, single, preserve_crlf, tok);
if (str == NULL)
return NULL;
tok->enc = NULL;
@@ -881,14 +882,14 @@ decode_str(const char *input, int single, struct tok_state *tok)
/* Set up tokenizer for string */

struct tok_state *
_PyTokenizer_FromString(const char *str, int exec_input)
_PyTokenizer_FromString(const char *str, int exec_input, int preserve_crlf)
{
struct tok_state *tok = tok_new();
char *decoded;

if (tok == NULL)
return NULL;
decoded = decode_str(str, exec_input, tok);
decoded = decode_str(str, exec_input, tok, preserve_crlf);
if (decoded == NULL) {
_PyTokenizer_Free(tok);
return NULL;
@@ -902,13 +903,13 @@ _PyTokenizer_FromString(const char *str, int exec_input)
/* Set up tokenizer for UTF-8 string */

struct tok_state *
_PyTokenizer_FromUTF8(const char *str, int exec_input)
_PyTokenizer_FromUTF8(const char *str, int exec_input, int preserve_crlf)
{
struct tok_state *tok = tok_new();
char *translated;
if (tok == NULL)
return NULL;
tok->input = translated = translate_newlines(str, exec_input, tok);
tok->input = translated = translate_newlines(str, exec_input, preserve_crlf, tok);
if (translated == NULL) {
_PyTokenizer_Free(tok);
return NULL;
@@ -1050,7 +1051,7 @@ tok_underflow_interactive(struct tok_state *tok) {
}
char *newtok = PyOS_Readline(tok->fp ? tok->fp : stdin, stdout, tok->prompt);
if (newtok != NULL) {
char *translated = translate_newlines(newtok, 0, tok);
char *translated = translate_newlines(newtok, 0, 0, tok);
PyMem_Free(newtok);
if (translated == NULL) {
return 0;
@@ -1594,6 +1595,9 @@ tok_decimal_tail(struct tok_state *tok)
static inline int
tok_continuation_line(struct tok_state *tok) {
int c = tok_nextc(tok);
if (c == '\r') {
c = tok_nextc(tok);
}
if (c != '\n') {
tok->done = E_LINECONT;
return -1;
@@ -1693,7 +1697,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
}
}
tok_backup(tok, c);
if (c == '#' || c == '\n') {
if (c == '#' || c == '\n' || c == '\r') {
/* Lines with only whitespace and/or comments
shouldn't affect the indentation and are
not passed to the parser as NEWLINE tokens,
@@ -1822,7 +1826,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
const char *prefix, *type_start;
int current_starting_col_offset;

while (c != EOF && c != '\n') {
while (c != EOF && c != '\n' && c != '\r') {
c = tok_nextc(tok);
}

@@ -2002,6 +2006,10 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
return MAKE_TOKEN(NAME);
}

if (c == '\r') {
c = tok_nextc(tok);
}

/* Newline */
if (c == '\n') {
tok->atbol = 1;
@@ -2405,7 +2413,10 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
else {
end_quote_size = 0;
if (c == '\\') {
tok_nextc(tok); /* skip escaped char */
c = tok_nextc(tok); /* skip escaped char */
if (c == '\r') {
c = tok_nextc(tok);
}
}
}
}
@@ -2696,6 +2707,9 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct
return MAKE_TOKEN(FSTRING_MIDDLE);
} else if (c == '\\') {
int peek = tok_nextc(tok);
if (peek == '\r') {
peek = tok_nextc(tok);
}
// Special case when the backslash is right before a curly
// brace. We have to restore and return the control back
// to the loop for the next iteration.
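
The tok_continuation_line and string-escape changes above mean a backslash may now be followed by '\r\n' instead of a bare '\n' when CRLF is preserved. A sketch of what now tokenizes cleanly (assuming the fix is applied):

    import io
    import tokenize

    # Explicit line continuation terminated by CRLF, as in the new test above.
    cont = "x = 1 + \\\r\n1\r\n"
    print([tokenize.tok_name[t.type]
           for t in tokenize.generate_tokens(io.StringIO(cont).readline)])
    # ['NAME', 'OP', 'NUMBER', 'OP', 'NUMBER', 'NEWLINE', 'ENDMARKER']

    # A backslash escape followed by CRLF inside a string literal also
    # yields a single STRING token spanning the line break.
    s = 'y = "abc\\\r\ndef"\r\n'
    for t in tokenize.generate_tokens(io.StringIO(s).readline):
        if t.type == tokenize.STRING:
            print(repr(t.string), t.start, t.end)  # spans (1, 4) to (2, 4)
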
4 changes: 2 additions & 2 deletions Parser/tokenizer.h
@@ -135,8 +135,8 @@ struct tok_state {
#endif
};

extern struct tok_state *_PyTokenizer_FromString(const char *, int);
extern struct tok_state *_PyTokenizer_FromUTF8(const char *, int);
extern struct tok_state *_PyTokenizer_FromString(const char *, int, int);
extern struct tok_state *_PyTokenizer_FromUTF8(const char *, int, int);
extern struct tok_state *_PyTokenizer_FromFile(FILE *, const char*,
const char *, const char *);
extern void _PyTokenizer_Free(struct tok_state *);
9 changes: 7 additions & 2 deletions Python/Python-tokenize.c
@@ -55,7 +55,7 @@ tokenizeriter_new_impl(PyTypeObject *type, const char *source,
if (filename == NULL) {
return NULL;
}
self->tok = _PyTokenizer_FromUTF8(source, 1);
self->tok = _PyTokenizer_FromUTF8(source, 1, 1);
if (self->tok == NULL) {
Py_DECREF(filename);
return NULL;
@@ -240,7 +240,12 @@ tokenizeriter_next(tokenizeriterobject *it)
type = NAME;
}
else if (type == NEWLINE) {
str = PyUnicode_FromString("\n");
Py_DECREF(str);
if (it->tok->start[0] == '\r') {
str = PyUnicode_FromString("\r\n");
} else {
str = PyUnicode_FromString("\n");
}
end_col_offset++;
}
}
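
Putting it together, here is a sketch of the round-trip property that the re-enabled subTest in test_random_files exercises: because NEWLINE and NL tokens now carry their literal '\r\n', untokenize can reproduce CRLF source exactly.

    import io
    import tokenize

    source = "if True:\r\n    foo = 'bar'\r\n\r\n"
    tokens = list(tokenize.generate_tokens(io.StringIO(source).readline))
    # Full-information round trip: token strings include '\r\n' again,
    # so the original text comes back character for character.
    assert tokenize.untokenize(tokens) == source
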