Merge pull request #48 from lysnikolaou/fstring-middle-before-end

pablogsal · web-flow · commit 00e47ae63f07 · 2023-03-26T17:01:41.000+01:00
Emit FSTRING_MIDDLE for literal after last expression and before quotes
diff --git a/Include/internal/pycore_token.h b/Include/internal/pycore_token.h
@@ -91,8 +91,7 @@ extern "C" {
                                  (x) == INDENT    || \
                                  (x) == DEDENT)
 #define ISSTRINGLIT(x)          ((x) == STRING           || \
-                                 (x) == FSTRING_MIDDLE   || \
-                                 (x) == FSTRING_END)
+                                 (x) == FSTRING_MIDDLE)
 
 
 // Symbols exported for test_peg_generator
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
@@ -1938,7 +1938,8 @@ def test_string(self):
 
         self.check_tokenize('f"abc"', """\
     FSTRING_START 'f"'          (1, 0) (1, 2)
-    FSTRING_END 'abc'         (1, 2) (1, 5)
+    FSTRING_MIDDLE 'abc'         (1, 2) (1, 5)
+    FSTRING_END '"'           (1, 5) (1, 6)
     """)
 
         self.check_tokenize('fR"a{b}c"', """\
@@ -1947,24 +1948,28 @@ def test_string(self):
     LBRACE     '{'           (1, 4) (1, 5)
     NAME       'b'           (1, 5) (1, 6)
     RBRACE     '}'           (1, 6) (1, 7)
-    FSTRING_END 'c'           (1, 7) (1, 8)
+    FSTRING_MIDDLE 'c'           (1, 7) (1, 8)
+    FSTRING_END '"'           (1, 8) (1, 9)
     """)
 
         self.check_tokenize('f"""abc"""', """\
     FSTRING_START 'f\"""'        (1, 0) (1, 4)
-    FSTRING_END 'abc'         (1, 4) (1, 7)
+    FSTRING_MIDDLE 'abc'         (1, 4) (1, 7)
+    FSTRING_END '\"""'         (1, 7) (1, 10)
     """)
 
         self.check_tokenize(r'f"abc\
 def"', """\
     FSTRING_START \'f"\'          (1, 0) (1, 2)
-    FSTRING_END 'abc\\\\\\ndef'  (1, 2) (2, 3)
+    FSTRING_MIDDLE 'abc\\\\\\ndef'  (1, 2) (2, 3)
+    FSTRING_END '"'           (2, 3) (2, 4)
     """)
 
         self.check_tokenize(r'Rf"abc\
 def"', """\
     FSTRING_START 'Rf"'         (1, 0) (1, 3)
-    FSTRING_END 'abc\\\\\\ndef'  (1, 3) (2, 3)
+    FSTRING_MIDDLE 'abc\\\\\\ndef'  (1, 3) (2, 3)
+    FSTRING_END '"'           (2, 3) (2, 4)
     """)
 
     def test_function(self):
diff --git a/Parser/action_helpers.c b/Parser/action_helpers.c
@@ -1388,15 +1388,9 @@ deal_with_gstring2(Parser *p, Token* a, asdl_expr_seq* raw_expressions, Token*b)
     if (quote_str == NULL) {
         return NULL;
     }
-
     int is_raw = strpbrk(quote_str, "rR") != NULL;
-    const char* _str = PyBytes_AsString(b->bytes);
-    if (_str == NULL) {
-        return NULL;
-    }
-    int is_b_empty = strlen(_str) == 0;
 
-    asdl_expr_seq *seq = _Py_asdl_expr_seq_new(is_b_empty ? n_items : n_items + 1, p->arena);
+    asdl_expr_seq *seq = _Py_asdl_expr_seq_new(n_items, p->arena);
     if (seq == NULL) {
         return NULL;
     }
@@ -1412,32 +1406,6 @@ deal_with_gstring2(Parser *p, Token* a, asdl_expr_seq* raw_expressions, Token*b)
         asdl_seq_SET(seq, i, item);
     }
 
-    if (!is_b_empty) {
-        PyObject* str = PyUnicode_FromString(_str);
-        if (str == NULL) {
-            return NULL;
-        }
-        if (_PyArena_AddPyObject(p->arena, str) < 0) {
-            Py_DECREF(str);
-            return NULL;
-        }
-
-        expr_ty the_str = _PyAST_Constant(str, NULL, b->lineno,
-                                b->col_offset, b->end_lineno,
-                                b->end_col_offset, p->arena);
-        if (the_str == NULL) {
-            return NULL;
-        }
-
-        expr_ty decoded_str = _PyPegen_decode_fstring_part(
-            p, is_raw, the_str
-        );
-        if (decoded_str == NULL) {
-            return NULL;
-        }
-
-        asdl_seq_SET(seq, n_items, decoded_str);
-    }
     return _PyAST_JoinedStr(seq, a->lineno, a->col_offset,
                             b->end_lineno, b->end_col_offset,
                             p->arena);
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
@@ -2434,18 +2434,41 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct
     // If we start with a bracket, we defer to the normal mode as there is nothing for us to tokenize
     // before it.
     char start_char = tok_nextc(tok);
-    char peek = tok_nextc(tok);
-    tok_backup(tok, peek);
+    char peek1 = tok_nextc(tok);
+    char peek2 = tok_nextc(tok);
+    tok_backup(tok, peek2);
+    tok_backup(tok, peek1);
     tok_backup(tok, start_char);
 
-    if ((start_char == '{' && peek != '{') || (start_char == '}' && peek != '}')) {
+    if ((start_char == '{' && peek1 != '{') || (start_char == '}' && peek1 != '}')) {
         if (start_char == '{') {
             current_tok->bracket_mark[++current_tok->bracket_mark_index] = current_tok->bracket_stack;
         }
         tok->tok_mode_stack[tok->tok_mode_stack_index].kind = TOK_REGULAR_MODE;
         return tok_get_normal_mode(tok, current_tok, token);
     }
 
+    // Emit FSTRING_END in case we've reached the end of the string
+    if (start_char == current_tok->f_string_quote
+        && (current_tok->f_string_quote_size != 3 || (start_char == peek1 && start_char == peek2))) {
+        // Advance the tokenizer state again to create a token out of the end quotes
+        for (int i = 0; i < current_tok->f_string_quote_size; i++) {
+            tok_nextc(tok);
+        }
+
+        if (current_tok->last_expr_buffer != NULL) {
+            PyMem_Free(current_tok->last_expr_buffer);
+            current_tok->last_expr_buffer = NULL;
+            current_tok->last_expr_size = 0;
+            current_tok->last_expr_end = -1;
+        }
+
+        p_start = tok->start;
+        p_end = tok->cur;
+        tok->tok_mode_stack_index--;
+        return MAKE_TOKEN(FSTRING_END);
+    }
+
     int end_quote_size = 0;
     int unicode_escape = 0;
     while (end_quote_size != current_tok->f_string_quote_size) {
@@ -2549,17 +2572,14 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct
         }
     }
 
-    if (current_tok->last_expr_buffer != NULL) {
-        PyMem_Free(current_tok->last_expr_buffer);
-        current_tok->last_expr_buffer = NULL;
-        current_tok->last_expr_size = 0;
-        current_tok->last_expr_end = -1;
+    // Back up over the closing f-string quotes so that this call emits a final
+    // FSTRING_MIDDLE; the next tokenizer iteration emits the quotes as FSTRING_END.
+    for (int i = 0; i < current_tok->f_string_quote_size; i++) {
+        tok_backup(tok, current_tok->f_string_quote);
     }
-
     p_start = tok->start;
-    p_end = tok->cur - current_tok->f_string_quote_size;
-    tok->tok_mode_stack_index--;
-    return MAKE_TOKEN(FSTRING_END);
+    p_end = tok->cur;
+    return MAKE_TOKEN(FSTRING_MIDDLE);
 }
 
 
diff --git a/Tools/build/generate_token.py b/Tools/build/generate_token.py
@@ -81,8 +81,7 @@ def update_file(file, content):
                                  (x) == INDENT    || \\
                                  (x) == DEDENT)
 #define ISSTRINGLIT(x)          ((x) == STRING           || \\
-                                 (x) == FSTRING_MIDDLE   || \\
-                                 (x) == FSTRING_END)
+                                 (x) == FSTRING_MIDDLE)
 
 
 // Symbols exported for test_peg_generator