From 66f9385f692bbd75ceb35e6278b906123aa4e3ba Mon Sep 17 00:00:00 2001
From: Lysandros Nikolaou
Date: Mon, 20 May 2024 09:29:09 -0400
Subject: [PATCH 1/6] gh-119118: Fix performance regression in tokenize module

- Cache line object to avoid creating a Unicode object
  for all of the tokens in the same line.
- Speed up byte offset to column offset conversion by using the
  smallest buffer possible to measure the difference.
---
 Python/Python-tokenize.c | 48 ++++++++++++++++++++++++++++++++++++----
 1 file changed, 44 insertions(+), 4 deletions(-)

diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c
index 41e8107e205b46..6c3aeceb9be69f 100644
--- a/Python/Python-tokenize.c
+++ b/Python/Python-tokenize.c
@@ -32,6 +32,12 @@ typedef struct
 {
     PyObject_HEAD struct tok_state *tok;
     int done;
+
+    /* Needed to cache line for performance */
+    const char *last_line_start;
+    Py_ssize_t last_line_size;
+    PyObject *last_line;
+    Py_ssize_t byte_col_offset_diff;
 } tokenizeriterobject;
 
 /*[clinic input]
@@ -68,6 +74,12 @@ tokenizeriter_new_impl(PyTypeObject *type, PyObject *readline,
         self->tok->tok_extra_tokens = 1;
     }
     self->done = 0;
+
+    self->last_line_start = NULL;
+    self->last_line_size = -2;
+    self->last_line = NULL;
+    self->byte_col_offset_diff = 0;
+
     return (PyObject *)self;
 }
 
@@ -210,7 +222,20 @@ tokenizeriter_next(tokenizeriterobject *it)
         if (size >= 1 && it->tok->implicit_newline) {
             size -= 1;
         }
-        line = PyUnicode_DecodeUTF8(line_start, size, "replace");
+
+        if (line_start != it->last_line_start || size != it->last_line_size) {
+            // Line has changed since last token, so we fetch the new line and cache it
+            // in the iter object.
+            Py_XDECREF(it->last_line);
+            line = PyUnicode_DecodeUTF8(line_start, size, "replace");
+            it->last_line = line;
+            it->last_line_start = line_start;
+            it->last_line_size = size;
+            it->byte_col_offset_diff = 0;
+        } else {
+            // Line hasn't changed so we reuse the cached one.
+            line = it->last_line;
+        }
     }
     if (line == NULL) {
         Py_DECREF(str);
@@ -222,10 +247,25 @@ tokenizeriter_next(tokenizeriterobject *it)
     Py_ssize_t col_offset = -1;
     Py_ssize_t end_col_offset = -1;
     if (token.start != NULL && token.start >= line_start) {
-        col_offset = _PyPegen_byte_offset_to_character_offset(line, token.start - line_start);
+        Py_ssize_t byte_offset = token.start - line_start;
+        col_offset = byte_offset - it->byte_col_offset_diff;
     }
     if (token.end != NULL && token.end >= it->tok->line_start) {
-        end_col_offset = _PyPegen_byte_offset_to_character_offset_raw(it->tok->line_start, token.end - it->tok->line_start);
+        Py_ssize_t end_byte_offset = token.end - it->tok->line_start;
+        if (lineno == end_lineno) {
+            // If the whole token is at the same line, we can just use the token.start
+            // buffer for figuring out the new column offset, since using line is not
+            // performant for very long lines.
+            Py_ssize_t token_byte_offset = token.end - token.start;
+            Py_ssize_t token_col_offset = _PyPegen_byte_offset_to_character_offset_raw(
+                token.start, token_byte_offset
+            );
+            end_col_offset = col_offset + token_col_offset;
+            it->byte_col_offset_diff += token_byte_offset - token_col_offset;
+        } else {
+            end_col_offset = _PyPegen_byte_offset_to_character_offset_raw(it->tok->line_start, end_byte_offset);
+            it->byte_col_offset_diff += end_byte_offset - end_col_offset;
+        }
     }
 
     if (it->tok->tok_extra_tokens) {
@@ -262,7 +302,7 @@ tokenizeriter_next(tokenizeriterobject *it)
         }
     }
 
-    result = Py_BuildValue("(iN(nn)(nn)N)", type, str, lineno, col_offset, end_lineno, end_col_offset, line);
+    result = Py_BuildValue("(iN(nn)(nn)O)", type, str, lineno, col_offset, end_lineno, end_col_offset, line);
 exit:
     _PyToken_Free(&token);
     if (type == ENDMARKER) {

From 143a420dc2775219a21cf3e7b3d02ce46de48ca5 Mon Sep 17 00:00:00 2001
From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com>
Date: Tue, 28 May 2024 12:15:08 +0000
Subject: [PATCH 2/6] 📜🤖 Added by blurb_it.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../next/Library/2024-05-28-12-15-03.gh-issue-119118.FMKz1F.rst | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 Misc/NEWS.d/next/Library/2024-05-28-12-15-03.gh-issue-119118.FMKz1F.rst

diff --git a/Misc/NEWS.d/next/Library/2024-05-28-12-15-03.gh-issue-119118.FMKz1F.rst b/Misc/NEWS.d/next/Library/2024-05-28-12-15-03.gh-issue-119118.FMKz1F.rst
new file mode 100644
index 00000000000000..a7bd847a16dc03
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2024-05-28-12-15-03.gh-issue-119118.FMKz1F.rst
@@ -0,0 +1 @@
+Fix performance regression in the :mod:`tokenize` module by caching the `line` token attribute and calculating the column offset more efficiently.
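The trick PATCH 1/6 relies on is that tokenizer positions arrive as byte offsets into the current line's UTF-8 buffer, while tokenize must report character columns; the two drift apart only when the line contains multi-byte characters, so the iterator keeps that drift in byte_col_offset_diff and updates it from the bytes it has just tokenized instead of rescanning the line from column zero for every token. Below is a minimal standalone sketch of the same running-difference bookkeeping; it is an illustration rather than the CPython code, and utf8_chars, the sample line, and the hard-coded token byte ranges are all invented for the example.

/* Illustrative sketch only -- not CPython code. */
#include <stdio.h>

/* Count UTF-8 code points in buf[0..nbytes): every byte that is not a
 * continuation byte (0b10xxxxxx) starts a character. */
static size_t utf8_chars(const char *buf, size_t nbytes)
{
    size_t chars = 0;
    for (size_t i = 0; i < nbytes; i++) {
        if (((unsigned char)buf[i] & 0xc0) != 0x80) {
            chars++;
        }
    }
    return chars;
}

int main(void)
{
    const char *line = "h\xc3\xa9llo w\xc3\xb6rld";  /* "héllo wörld" */
    size_t tokens[][2] = { {0, 6}, {7, 13} };  /* byte ranges of "héllo" and "wörld" */
    size_t diff = 0;     /* running bytes-minus-characters difference */
    size_t scanned = 0;  /* how far into the line we have already looked */

    for (size_t t = 0; t < 2; t++) {
        size_t start = tokens[t][0], end = tokens[t][1];
        /* Only examine the bytes between the previous position and the
         * token boundaries, never the whole line again. */
        diff += (start - scanned) - utf8_chars(line + scanned, start - scanned);
        size_t start_col = start - diff;
        diff += (end - start) - utf8_chars(line + start, end - start);
        size_t end_col = end - diff;
        scanned = end;
        printf("token %zu: bytes [%zu, %zu) -> columns [%zu, %zu)\n",
               t, start, end, start_col, end_col);
    }
    return 0;
}

Running it prints columns [0, 5) and [6, 11) for byte ranges [0, 6) and [7, 13): each two-byte character (é, ö) scanned so far shifts later columns back by one, which is exactly the correction byte_col_offset_diff applies.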
From 8318ab80e2f6f9aadf32a915a1f2194e66528902 Mon Sep 17 00:00:00 2001
From: Lysandros Nikolaou
Date: Tue, 28 May 2024 14:17:12 +0200
Subject: [PATCH 3/6] Update Python/Python-tokenize.c

---
 Python/Python-tokenize.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c
index 6c3aeceb9be69f..5c09d50da8b992 100644
--- a/Python/Python-tokenize.c
+++ b/Python/Python-tokenize.c
@@ -76,7 +76,7 @@ tokenizeriter_new_impl(PyTypeObject *type, PyObject *readline,
     self->done = 0;
 
     self->last_line_start = NULL;
-    self->last_line_size = -2;
+    self->last_line_size = 0;
     self->last_line = NULL;
     self->byte_col_offset_diff = 0;
 

From 566adc3df02fd95b279d546518e9ad42b0e50503 Mon Sep 17 00:00:00 2001
From: Lysandros Nikolaou
Date: Tue, 28 May 2024 15:35:56 +0200
Subject: [PATCH 4/6] WIP: strcmp lines (need to fix) and use line buffer for end col offset

---
 Parser/pegen.c           | 25 +++++++++++++++++++++++++
 Parser/pegen.h           |  1 +
 Python/Python-tokenize.c | 12 +++++-------
 3 files changed, 31 insertions(+), 7 deletions(-)

diff --git a/Parser/pegen.c b/Parser/pegen.c
index 3d3e64559403b1..2955eab2dac7c4 100644
--- a/Parser/pegen.c
+++ b/Parser/pegen.c
@@ -18,6 +18,31 @@ _PyPegen_interactive_exit(Parser *p)
     return NULL;
 }
 
+Py_ssize_t
+_PyPegen_byte_offset_to_character_offset_line(PyObject *line, Py_ssize_t col_offset, Py_ssize_t end_col_offset)
+{
+    const char *data = PyUnicode_AsUTF8(line);
+
+    Py_ssize_t len = 0;
+    while (col_offset < end_col_offset) {
+        Py_UCS4 ch = data[col_offset];
+        if (ch < 0x80) {
+            col_offset += 1;
+        } else if ((ch & 0xe0) == 0xc0) {
+            col_offset += 2;
+        } else if ((ch & 0xf0) == 0xe0) {
+            col_offset += 3;
+        } else if ((ch & 0xf8) == 0xf0) {
+            col_offset += 4;
+        } else {
+            PyErr_SetString(PyExc_ValueError, "Invalid UTF-8 sequence");
+            return -1;
+        }
+        len++;
+    }
+    return len;
+}
+
 Py_ssize_t
 _PyPegen_byte_offset_to_character_offset_raw(const char* str, Py_ssize_t col_offset)
 {
diff --git a/Parser/pegen.h b/Parser/pegen.h
index 57b45a54c36c57..32c64e7774b878 100644
--- a/Parser/pegen.h
+++ b/Parser/pegen.h
@@ -148,6 +148,7 @@ int _PyPegen_fill_token(Parser *p);
 expr_ty _PyPegen_name_token(Parser *p);
 expr_ty _PyPegen_number_token(Parser *p);
 void *_PyPegen_string_token(Parser *p);
+Py_ssize_t _PyPegen_byte_offset_to_character_offset_line(PyObject *line, Py_ssize_t col_offset, Py_ssize_t end_col_offset);
 Py_ssize_t _PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset);
 Py_ssize_t _PyPegen_byte_offset_to_character_offset_raw(const char*, Py_ssize_t col_offset);
 
diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c
index 5c09d50da8b992..a97f993d41b942 100644
--- a/Python/Python-tokenize.c
+++ b/Python/Python-tokenize.c
@@ -223,7 +223,7 @@ tokenizeriter_next(tokenizeriterobject *it)
             size -= 1;
         }
 
-        if (line_start != it->last_line_start || size != it->last_line_size) {
+        if (size != it->last_line_size || strcmp(line_start, it->last_line_start) != 0) {
             // Line has changed since last token, so we fetch the new line and cache it
             // in the iter object.
             Py_XDECREF(it->last_line);
@@ -246,8 +246,9 @@ tokenizeriter_next(tokenizeriterobject *it)
     Py_ssize_t end_lineno = it->tok->lineno;
     Py_ssize_t col_offset = -1;
     Py_ssize_t end_col_offset = -1;
+    Py_ssize_t byte_offset = -1;
     if (token.start != NULL && token.start >= line_start) {
-        Py_ssize_t byte_offset = token.start - line_start;
+        byte_offset = token.start - line_start;
         col_offset = byte_offset - it->byte_col_offset_diff;
     }
     if (token.end != NULL && token.end >= it->tok->line_start) {
@@ -256,12 +257,9 @@ tokenizeriter_next(tokenizeriterobject *it)
             // If the whole token is at the same line, we can just use the token.start
             // buffer for figuring out the new column offset, since using line is not
             // performant for very long lines.
-            Py_ssize_t token_byte_offset = token.end - token.start;
-            Py_ssize_t token_col_offset = _PyPegen_byte_offset_to_character_offset_raw(
-                token.start, token_byte_offset
-            );
+            Py_ssize_t token_col_offset = _PyPegen_byte_offset_to_character_offset_line(line, byte_offset, end_byte_offset);
             end_col_offset = col_offset + token_col_offset;
-            it->byte_col_offset_diff += token_byte_offset - token_col_offset;
+            it->byte_col_offset_diff += token.end - token.start - token_col_offset;
         } else {
             end_col_offset = _PyPegen_byte_offset_to_character_offset_raw(it->tok->line_start, end_byte_offset);
             it->byte_col_offset_diff += end_byte_offset - end_col_offset;

From 858eff15bd8e698c0e6c53ee3c16948bd9a1c770 Mon Sep 17 00:00:00 2001
From: Lysandros Nikolaou
Date: Tue, 28 May 2024 16:19:22 +0200
Subject: [PATCH 5/6] Get rid of strcmp

Co-authored-by: Pablo Galindo
---
 Python/Python-tokenize.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c
index a97f993d41b942..9cc4b45de49f0b 100644
--- a/Python/Python-tokenize.c
+++ b/Python/Python-tokenize.c
@@ -34,9 +34,8 @@ typedef struct
     int done;
 
     /* Needed to cache line for performance */
-    const char *last_line_start;
-    Py_ssize_t last_line_size;
     PyObject *last_line;
+    Py_ssize_t last_lineno;
     Py_ssize_t byte_col_offset_diff;
 } tokenizeriterobject;
 
@@ -75,10 +74,9 @@ tokenizeriter_new_impl(PyTypeObject *type, PyObject *readline,
     }
     self->done = 0;
 
-    self->last_line_start = NULL;
-    self->last_line_size = 0;
     self->last_line = NULL;
     self->byte_col_offset_diff = 0;
+    self->last_lineno = 0;
 
     return (PyObject *)self;
 }
@@ -223,14 +221,12 @@ tokenizeriter_next(tokenizeriterobject *it)
         size -= 1;
     }
 
-        if (size != it->last_line_size || strcmp(line_start, it->last_line_start) != 0) {
+        if (it->tok->lineno != it->last_lineno) {
             // Line has changed since last token, so we fetch the new line and cache it
             // in the iter object.
             Py_XDECREF(it->last_line);
             line = PyUnicode_DecodeUTF8(line_start, size, "replace");
             it->last_line = line;
-            it->last_line_start = line_start;
-            it->last_line_size = size;
             it->byte_col_offset_diff = 0;
         } else {
             // Line hasn't changed so we reuse the cached one.
@@ -244,6 +240,8 @@ tokenizeriter_next(tokenizeriterobject *it)
     Py_ssize_t lineno = ISSTRINGLIT(type) ? it->tok->first_lineno : it->tok->lineno;
     Py_ssize_t end_lineno = it->tok->lineno;
+    it->last_lineno = lineno;
+
     Py_ssize_t col_offset = -1;
     Py_ssize_t end_col_offset = -1;
     Py_ssize_t byte_offset = -1;

From ab3437096a729c05cfbffd4ddd7a86baf697efb3 Mon Sep 17 00:00:00 2001
From: Lysandros Nikolaou
Date: Tue, 28 May 2024 16:21:06 +0200
Subject: [PATCH 6/6] Make NEWS linter happy

---
 .../Library/2024-05-28-12-15-03.gh-issue-119118.FMKz1F.rst | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Misc/NEWS.d/next/Library/2024-05-28-12-15-03.gh-issue-119118.FMKz1F.rst b/Misc/NEWS.d/next/Library/2024-05-28-12-15-03.gh-issue-119118.FMKz1F.rst
index a7bd847a16dc03..3cf61662fe7767 100644
--- a/Misc/NEWS.d/next/Library/2024-05-28-12-15-03.gh-issue-119118.FMKz1F.rst
+++ b/Misc/NEWS.d/next/Library/2024-05-28-12-15-03.gh-issue-119118.FMKz1F.rst
@@ -1 +1,2 @@
-Fix performance regression in the :mod:`tokenize` module by caching the `line` token attribute and calculating the column offset more efficiently.
+Fix performance regression in the :mod:`tokenize` module by caching the ``line``
+token attribute and calculating the column offset more efficiently.
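PATCH 4/6's _PyPegen_byte_offset_to_character_offset_line gets its speed from a property of UTF-8: the high bits of a sequence's first byte already encode how many bytes the character occupies, so the characters between two byte offsets can be counted without decoding any code points. Below is a standalone sketch of that lead-byte classification, which, like the pegen helper, trusts the buffer to be well formed after each lead byte; the name utf8_seq_len and the sample string are invented for the example.

/* Illustrative sketch only -- not CPython code. */
#include <stdio.h>

/* Length in bytes of the UTF-8 sequence started by `lead`, derived
 * from its high bits alone; -1 for a continuation or invalid byte. */
static int utf8_seq_len(unsigned char lead)
{
    if (lead < 0x80)            return 1;  /* 0xxxxxxx: ASCII */
    if ((lead & 0xe0) == 0xc0)  return 2;  /* 110xxxxx */
    if ((lead & 0xf0) == 0xe0)  return 3;  /* 1110xxxx */
    if ((lead & 0xf8) == 0xf0)  return 4;  /* 11110xxx */
    return -1;                             /* 10xxxxxx: continuation byte */
}

int main(void)
{
    /* One 1-, 2-, 3- and 4-byte character: "a", "é", "€", "🐍". */
    const char *s = "a\xc3\xa9\xe2\x82\xac\xf0\x9f\x90\x8d";
    const char *p = s;
    int chars = 0;

    while (*p != '\0') {
        int n = utf8_seq_len((unsigned char)*p);
        if (n < 0) {
            fprintf(stderr, "invalid UTF-8 at byte %td\n", p - s);
            return 1;
        }
        printf("byte %td: lead 0x%02x starts a %d-byte character\n",
               p - s, (unsigned char)*p, n);
        p += n;  /* skip the continuation bytes without decoding them */
        chars++;
    }
    printf("%d characters in %td bytes\n", chars, p - s);
    return 0;
}

This is also why the patch series can keep byte_col_offset_diff cheap to maintain: for a token confined to one line, only the token's own bytes are walked once, instead of converting offsets against the whole line for every token.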