Commit e45c61f

miss-islington, lysnikolaou, and pablogsal authored
[3.13] gh-120317: Lock around global state in the tokenize module (GH-120318) (#121841)
(cherry picked from commit 8549559)

Co-authored-by: Lysandros Nikolaou <[email protected]>
Co-authored-by: Pablo Galindo <[email protected]>
1 parent 93ee63a commit e45c61f
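
For context, the fix wraps each call to the tokenizer iterator's next() slot in the per-object critical section API, so the iterator's cached state (last line, line numbers, byte/column offset deltas) cannot be mutated concurrently by several threads on a free-threaded build. The snippet below is only an illustrative sketch of that pattern, not code from this commit: the counteriterobject type and its count field are invented for the example, and the internal header can only be included from inside CPython itself (it requires Py_BUILD_CORE).

#include "Python.h"
#include "internal/pycore_critical_section.h"  // Py_BEGIN_CRITICAL_SECTION

/* Hypothetical iterator object used only for this illustration. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t count;   // mutable state shared by every thread iterating this object
} counteriterobject;

static PyObject *
counteriter_next(counteriterobject *it)
{
    PyObject *result = NULL;

    // Lock the per-object mutex (effectively a no-op on GIL builds) so that
    // concurrent next() calls cannot interleave while the state is read and updated.
    Py_BEGIN_CRITICAL_SECTION(it);
    it->count++;
    result = PyLong_FromSsize_t(it->count);
    Py_END_CRITICAL_SECTION();

    return result;
}

The commit applies this same shape to tokenizeriter_next() and turns the helpers it calls into lock-asserting functions, as shown in the Python/Python-tokenize.c diff below.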

File tree

2 files changed: +129 -43 lines changed

@@ -0,0 +1,57 @@
+import io
+import time
+import unittest
+import tokenize
+from functools import partial
+from threading import Thread
+
+from test.support import threading_helper
+
+
+@threading_helper.requires_working_threading()
+class TestTokenize(unittest.TestCase):
+    def test_tokenizer_iter(self):
+        source = io.StringIO("for _ in a:\n  pass")
+        it = tokenize._tokenize.TokenizerIter(source.readline, extra_tokens=False)
+
+        tokens = []
+        def next_token(it):
+            while True:
+                try:
+                    r = next(it)
+                    tokens.append(tokenize.TokenInfo._make(r))
+                    time.sleep(0.03)
+                except StopIteration:
+                    return
+
+        threads = []
+        for _ in range(5):
+            threads.append(Thread(target=partial(next_token, it)))
+
+        for thread in threads:
+            thread.start()
+
+        for thread in threads:
+            thread.join()
+
+        expected_tokens = [
+            tokenize.TokenInfo(type=1, string='for', start=(1, 0), end=(1, 3), line='for _ in a:\n'),
+            tokenize.TokenInfo(type=1, string='_', start=(1, 4), end=(1, 5), line='for _ in a:\n'),
+            tokenize.TokenInfo(type=1, string='in', start=(1, 6), end=(1, 8), line='for _ in a:\n'),
+            tokenize.TokenInfo(type=1, string='a', start=(1, 9), end=(1, 10), line='for _ in a:\n'),
+            tokenize.TokenInfo(type=11, string=':', start=(1, 10), end=(1, 11), line='for _ in a:\n'),
+            tokenize.TokenInfo(type=4, string='', start=(1, 11), end=(1, 11), line='for _ in a:\n'),
+            tokenize.TokenInfo(type=5, string='', start=(2, -1), end=(2, -1), line='  pass'),
+            tokenize.TokenInfo(type=1, string='pass', start=(2, 2), end=(2, 6), line='  pass'),
+            tokenize.TokenInfo(type=4, string='', start=(2, 6), end=(2, 6), line='  pass'),
+            tokenize.TokenInfo(type=6, string='', start=(2, -1), end=(2, -1), line='  pass'),
+            tokenize.TokenInfo(type=0, string='', start=(2, -1), end=(2, -1), line='  pass'),
+        ]
+
+        tokens.sort()
+        expected_tokens.sort()
+        self.assertListEqual(tokens, expected_tokens)
+
+
+if __name__ == "__main__":
+    unittest.main()

Python/Python-tokenize.c (+72 -43)
@@ -1,9 +1,10 @@
 #include "Python.h"
 #include "errcode.h"
+#include "internal/pycore_critical_section.h" // Py_BEGIN_CRITICAL_SECTION
 #include "../Parser/lexer/state.h"
 #include "../Parser/lexer/lexer.h"
 #include "../Parser/tokenizer/tokenizer.h"
-#include "../Parser/pegen.h" // _PyPegen_byte_offset_to_character_offset()
+#include "../Parser/pegen.h"                  // _PyPegen_byte_offset_to_character_offset()

 static struct PyModuleDef _tokenizemodule;

@@ -84,14 +85,16 @@ tokenizeriter_new_impl(PyTypeObject *type, PyObject *readline,
 }

 static int
-_tokenizer_error(struct tok_state *tok)
+_tokenizer_error(tokenizeriterobject *it)
 {
+    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(it);
     if (PyErr_Occurred()) {
         return -1;
     }

     const char *msg = NULL;
     PyObject* errtype = PyExc_SyntaxError;
+    struct tok_state *tok = it->tok;
     switch (tok->done) {
         case E_TOKEN:
             msg = "invalid token";
@@ -177,17 +180,78 @@ _tokenizer_error(struct tok_state *tok)
     return result;
 }

+static PyObject *
+_get_current_line(tokenizeriterobject *it, const char *line_start, Py_ssize_t size,
+                  int *line_changed)
+{
+    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(it);
+    PyObject *line;
+    if (it->tok->lineno != it->last_lineno) {
+        // Line has changed since last token, so we fetch the new line and cache it
+        // in the iter object.
+        Py_XDECREF(it->last_line);
+        line = PyUnicode_DecodeUTF8(line_start, size, "replace");
+        it->last_line = line;
+        it->byte_col_offset_diff = 0;
+    }
+    else {
+        line = it->last_line;
+        *line_changed = 0;
+    }
+    return line;
+}
+
+static void
+_get_col_offsets(tokenizeriterobject *it, struct token token, const char *line_start,
+                 PyObject *line, int line_changed, Py_ssize_t lineno, Py_ssize_t end_lineno,
+                 Py_ssize_t *col_offset, Py_ssize_t *end_col_offset)
+{
+    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(it);
+    Py_ssize_t byte_offset = -1;
+    if (token.start != NULL && token.start >= line_start) {
+        byte_offset = token.start - line_start;
+        if (line_changed) {
+            *col_offset = _PyPegen_byte_offset_to_character_offset_line(line, 0, byte_offset);
+            it->byte_col_offset_diff = byte_offset - *col_offset;
+        }
+        else {
+            *col_offset = byte_offset - it->byte_col_offset_diff;
+        }
+    }
+
+    if (token.end != NULL && token.end >= it->tok->line_start) {
+        Py_ssize_t end_byte_offset = token.end - it->tok->line_start;
+        if (lineno == end_lineno) {
+            // If the whole token is at the same line, we can just use the token.start
+            // buffer for figuring out the new column offset, since using line is not
+            // performant for very long lines.
+            Py_ssize_t token_col_offset = _PyPegen_byte_offset_to_character_offset_line(line, byte_offset, end_byte_offset);
+            *end_col_offset = *col_offset + token_col_offset;
+            it->byte_col_offset_diff += token.end - token.start - token_col_offset;
+        }
+        else {
+            *end_col_offset = _PyPegen_byte_offset_to_character_offset_raw(it->tok->line_start, end_byte_offset);
+            it->byte_col_offset_diff += end_byte_offset - *end_col_offset;
+        }
+    }
+    it->last_lineno = lineno;
+    it->last_end_lineno = end_lineno;
+}
+
 static PyObject *
 tokenizeriter_next(tokenizeriterobject *it)
 {
     PyObject* result = NULL;
+
+    Py_BEGIN_CRITICAL_SECTION(it);
+
     struct token token;
     _PyToken_Init(&token);

     int type = _PyTokenizer_Get(it->tok, &token);
     if (type == ERRORTOKEN) {
         if(!PyErr_Occurred()) {
-            _tokenizer_error(it->tok);
+            _tokenizer_error(it);
             assert(PyErr_Occurred());
         }
         goto exit;
@@ -224,18 +288,7 @@ tokenizeriter_next(tokenizeriterobject *it)
             size -= 1;
         }

-        if (it->tok->lineno != it->last_lineno) {
-            // Line has changed since last token, so we fetch the new line and cache it
-            // in the iter object.
-            Py_XDECREF(it->last_line);
-            line = PyUnicode_DecodeUTF8(line_start, size, "replace");
-            it->last_line = line;
-            it->byte_col_offset_diff = 0;
-        } else {
-            // Line hasn't changed so we reuse the cached one.
-            line = it->last_line;
-            line_changed = 0;
-        }
+        line = _get_current_line(it, line_start, size, &line_changed);
     }
     if (line == NULL) {
         Py_DECREF(str);
@@ -244,36 +297,10 @@

     Py_ssize_t lineno = ISSTRINGLIT(type) ? it->tok->first_lineno : it->tok->lineno;
     Py_ssize_t end_lineno = it->tok->lineno;
-    it->last_lineno = lineno;
-    it->last_end_lineno = end_lineno;
-
     Py_ssize_t col_offset = -1;
     Py_ssize_t end_col_offset = -1;
-    Py_ssize_t byte_offset = -1;
-    if (token.start != NULL && token.start >= line_start) {
-        byte_offset = token.start - line_start;
-        if (line_changed) {
-            col_offset = _PyPegen_byte_offset_to_character_offset_line(line, 0, byte_offset);
-            it->byte_col_offset_diff = byte_offset - col_offset;
-        }
-        else {
-            col_offset = byte_offset - it->byte_col_offset_diff;
-        }
-    }
-    if (token.end != NULL && token.end >= it->tok->line_start) {
-        Py_ssize_t end_byte_offset = token.end - it->tok->line_start;
-        if (lineno == end_lineno) {
-            // If the whole token is at the same line, we can just use the token.start
-            // buffer for figuring out the new column offset, since using line is not
-            // performant for very long lines.
-            Py_ssize_t token_col_offset = _PyPegen_byte_offset_to_character_offset_line(line, byte_offset, end_byte_offset);
-            end_col_offset = col_offset + token_col_offset;
-            it->byte_col_offset_diff += token.end - token.start - token_col_offset;
-        } else {
-            end_col_offset = _PyPegen_byte_offset_to_character_offset_raw(it->tok->line_start, end_byte_offset);
-            it->byte_col_offset_diff += end_byte_offset - end_col_offset;
-        }
-    }
+    _get_col_offsets(it, token, line_start, line, line_changed,
+                     lineno, end_lineno, &col_offset, &end_col_offset);

     if (it->tok->tok_extra_tokens) {
         if (is_trailing_token) {
@@ -315,6 +342,8 @@ tokenizeriter_next(tokenizeriterobject *it)
     if (type == ENDMARKER) {
         it->done = 1;
     }
+
+    Py_END_CRITICAL_SECTION();
     return result;
 }

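Note that the helpers factored out above, _get_current_line() and _get_col_offsets(), do not take the lock themselves; they begin with _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(it) and rely on the single critical section opened in tokenizeriter_next() to cover the whole token computation. A minimal sketch of that convention follows (the helper name below is hypothetical and not part of the commit; only the assert-then-mutate pattern mirrors the diff):

// Hypothetical helper following the same convention as _get_current_line()
// and _get_col_offsets(): it never locks on its own, it only asserts that
// the caller already holds the per-object critical section before touching `it`.
static void
_update_cached_lineno(tokenizeriterobject *it, Py_ssize_t lineno)
{
    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(it);  // caller holds the lock
    it->last_lineno = lineno;                       // serialized by the caller's lock
}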