 #include "Python.h"
 #include "errcode.h"
+#include "internal/pycore_critical_section.h"  // Py_BEGIN_CRITICAL_SECTION
 #include "../Parser/lexer/state.h"
 #include "../Parser/lexer/lexer.h"
 #include "../Parser/tokenizer/tokenizer.h"
-#include "../Parser/pegen.h"  // _PyPegen_byte_offset_to_character_offset()
+#include "../Parser/pegen.h"  // _PyPegen_byte_offset_to_character_offset()
 
 static struct PyModuleDef _tokenizemodule;
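For context, `Py_BEGIN_CRITICAL_SECTION` / `Py_END_CRITICAL_SECTION` come from the internal `pycore_critical_section.h` header included above: on the free-threaded build they acquire the argument object's per-object mutex, while on the default GIL build they compile down to (nearly) nothing. A minimal sketch of the pattern this patch applies, with hypothetical names (`method_impl`, `do_work_locked`):

    #include "Python.h"
    #include "internal/pycore_critical_section.h"

    static PyObject *do_work_locked(PyObject *self);  // hypothetical helper

    static PyObject *
    method_impl(PyObject *self)
    {
        // Declared outside the section: the macros open and close a C block,
        // so anything declared between them goes out of scope at the END.
        PyObject *result = NULL;
        Py_BEGIN_CRITICAL_SECTION(self);    // lock `self`
        result = do_work_locked(self);      // mutate state while holding the lock
        Py_END_CRITICAL_SECTION();          // unlock `self`
        return result;
    }

This block-scope behavior is also why `tokenizeriter_next()` below keeps `PyObject *result = NULL;` before the `Py_BEGIN_CRITICAL_SECTION(it);` line rather than inside the section.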
@@ -84,14 +85,16 @@ tokenizeriter_new_impl(PyTypeObject *type, PyObject *readline,
 }
 
 static int
-_tokenizer_error(struct tok_state *tok)
+_tokenizer_error(tokenizeriterobject *it)
 {
+    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(it);
     if (PyErr_Occurred()) {
         return -1;
     }
 
     const char *msg = NULL;
     PyObject *errtype = PyExc_SyntaxError;
+    struct tok_state *tok = it->tok;
     switch (tok->done) {
         case E_TOKEN:
             msg = "invalid token";
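The signature change above (taking `tokenizeriterobject *it` instead of `struct tok_state *tok`) is what lets the helper verify its locking contract: the critical section is attached to the Python object, not to the raw tokenizer state, and the state stays reachable through `it->tok`. A hedged sketch of the contract (to my understanding, `_Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED` is a debug-time check that expands to nothing outside free-threaded debug builds):

    // Callers must already hold the per-object lock, e.g. via
    // Py_BEGIN_CRITICAL_SECTION(it), before calling helpers like this:
    static int
    locked_helper(tokenizeriterobject *it)   // hypothetical helper
    {
        _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(it);
        struct tok_state *tok = it->tok;     // raw state via the locked object
        return tok->done;
    }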
@@ -177,17 +180,78 @@ _tokenizer_error(struct tok_state *tok)
     return result;
 }
 
+static PyObject *
+_get_current_line(tokenizeriterobject *it, const char *line_start, Py_ssize_t size,
+                  int *line_changed)
+{
+    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(it);
+    PyObject *line;
+    if (it->tok->lineno != it->last_lineno) {
+        // Line has changed since last token, so we fetch the new line and cache it
+        // in the iter object.
+        Py_XDECREF(it->last_line);
+        line = PyUnicode_DecodeUTF8(line_start, size, "replace");
+        it->last_line = line;
+        it->byte_col_offset_diff = 0;
+    }
+    else {
+        line = it->last_line;
+        *line_changed = 0;
+    }
+    return line;
+}
+
+static void
+_get_col_offsets(tokenizeriterobject *it, struct token token, const char *line_start,
+                 PyObject *line, int line_changed, Py_ssize_t lineno, Py_ssize_t end_lineno,
+                 Py_ssize_t *col_offset, Py_ssize_t *end_col_offset)
+{
+    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(it);
+    Py_ssize_t byte_offset = -1;
+    if (token.start != NULL && token.start >= line_start) {
+        byte_offset = token.start - line_start;
+        if (line_changed) {
+            *col_offset = _PyPegen_byte_offset_to_character_offset_line(line, 0, byte_offset);
+            it->byte_col_offset_diff = byte_offset - *col_offset;
+        }
+        else {
+            *col_offset = byte_offset - it->byte_col_offset_diff;
+        }
+    }
+
+    if (token.end != NULL && token.end >= it->tok->line_start) {
+        Py_ssize_t end_byte_offset = token.end - it->tok->line_start;
+        if (lineno == end_lineno) {
+            // If the whole token is on the same line, we can just use the token.start
+            // buffer for figuring out the new column offset, since using `line` is not
+            // performant for very long lines.
+            Py_ssize_t token_col_offset = _PyPegen_byte_offset_to_character_offset_line(line, byte_offset, end_byte_offset);
+            *end_col_offset = *col_offset + token_col_offset;
+            it->byte_col_offset_diff += token.end - token.start - token_col_offset;
+        }
+        else {
+            *end_col_offset = _PyPegen_byte_offset_to_character_offset_raw(it->tok->line_start, end_byte_offset);
+            it->byte_col_offset_diff += end_byte_offset - *end_col_offset;
+        }
+    }
+    it->last_lineno = lineno;
+    it->last_end_lineno = end_lineno;
+}
+
 static PyObject *
 tokenizeriter_next(tokenizeriterobject *it)
 {
     PyObject *result = NULL;
+
+    Py_BEGIN_CRITICAL_SECTION(it);
+
     struct token token;
     _PyToken_Init(&token);
 
     int type = _PyTokenizer_Get(it->tok, &token);
     if (type == ERRORTOKEN) {
         if (!PyErr_Occurred()) {
-            _tokenizer_error(it->tok);
+            _tokenizer_error(it);
             assert(PyErr_Occurred());
         }
         goto exit;
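A note on the `byte_col_offset_diff` bookkeeping in `_get_col_offsets()`: token positions come back from the tokenizer as byte offsets into the UTF-8 line buffer, while the `tokenize` module reports column offsets in characters, and the two diverge on lines containing multibyte characters. Caching the difference once per line lets later tokens on the same line be converted with plain subtraction instead of rescanning the line. A self-contained illustration of that divergence (ordinary C, not CPython internals):

    #include <stdio.h>
    #include <string.h>

    // Count code points in the first `byte_offset` bytes of a UTF-8 string:
    // continuation bytes look like 10xxxxxx, so count only the other bytes.
    static size_t
    utf8_char_offset(const char *s, size_t byte_offset)
    {
        size_t chars = 0;
        for (size_t i = 0; i < byte_offset; i++) {
            if (((unsigned char)s[i] & 0xC0) != 0x80) {
                chars++;
            }
        }
        return chars;
    }

    int main(void)
    {
        const char *line = "x = \"h\xc3\xa9llo\"";   // 'é' is 2 bytes in UTF-8
        size_t byte_off = strlen(line);               // offset just past the string
        size_t char_off = utf8_char_offset(line, byte_off);
        // Prints: byte offset 12, character offset 11, diff 1
        printf("byte offset %zu, character offset %zu, diff %zu\n",
               byte_off, char_off, byte_off - char_off);
        return 0;
    }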
@@ -224,18 +288,7 @@ tokenizeriter_next(tokenizeriterobject *it)
             size -= 1;
         }
 
-        if (it->tok->lineno != it->last_lineno) {
-            // Line has changed since last token, so we fetch the new line and cache it
-            // in the iter object.
-            Py_XDECREF(it->last_line);
-            line = PyUnicode_DecodeUTF8(line_start, size, "replace");
-            it->last_line = line;
-            it->byte_col_offset_diff = 0;
-        } else {
-            // Line hasn't changed so we reuse the cached one.
-            line = it->last_line;
-            line_changed = 0;
-        }
+        line = _get_current_line(it, line_start, size, &line_changed);
     }
     if (line == NULL) {
         Py_DECREF(str);
@@ -244,36 +297,10 @@ tokenizeriter_next(tokenizeriterobject *it)
 
     Py_ssize_t lineno = ISSTRINGLIT(type) ? it->tok->first_lineno : it->tok->lineno;
     Py_ssize_t end_lineno = it->tok->lineno;
-    it->last_lineno = lineno;
-    it->last_end_lineno = end_lineno;
-
     Py_ssize_t col_offset = -1;
     Py_ssize_t end_col_offset = -1;
-    Py_ssize_t byte_offset = -1;
-    if (token.start != NULL && token.start >= line_start) {
-        byte_offset = token.start - line_start;
-        if (line_changed) {
-            col_offset = _PyPegen_byte_offset_to_character_offset_line(line, 0, byte_offset);
-            it->byte_col_offset_diff = byte_offset - col_offset;
-        }
-        else {
-            col_offset = byte_offset - it->byte_col_offset_diff;
-        }
-    }
-    if (token.end != NULL && token.end >= it->tok->line_start) {
-        Py_ssize_t end_byte_offset = token.end - it->tok->line_start;
-        if (lineno == end_lineno) {
-            // If the whole token is at the same line, we can just use the token.start
-            // buffer for figuring out the new column offset, since using line is not
-            // performant for very long lines.
-            Py_ssize_t token_col_offset = _PyPegen_byte_offset_to_character_offset_line(line, byte_offset, end_byte_offset);
-            end_col_offset = col_offset + token_col_offset;
-            it->byte_col_offset_diff += token.end - token.start - token_col_offset;
-        } else {
-            end_col_offset = _PyPegen_byte_offset_to_character_offset_raw(it->tok->line_start, end_byte_offset);
-            it->byte_col_offset_diff += end_byte_offset - end_col_offset;
-        }
-    }
+    _get_col_offsets(it, token, line_start, line, line_changed,
+                     lineno, end_lineno, &col_offset, &end_col_offset);
 
     if (it->tok->tok_extra_tokens) {
         if (is_trailing_token) {
@@ -315,6 +342,8 @@ tokenizeriter_next(tokenizeriterobject *it)
     if (type == ENDMARKER) {
         it->done = 1;
     }
+
+    Py_END_CRITICAL_SECTION();
     return result;
 }
 
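One closing observation: because the BEGIN/END macros bracket a C block, `Py_END_CRITICAL_SECTION()` sits on the single exit path before `return result;`, and `result` itself is declared before the section opens. Presumably the `exit:` label targeted by the earlier `goto exit;` also lives inside the section (the hunks shown here don't include it), so both the normal and error paths release the per-object lock before the function returns. The net effect of the patch is that concurrent calls to `tokenizeriter_next()` on the same iterator serialize on the iterator's lock under free-threading, while the default GIL build is effectively unchanged, since the macros are no-ops there.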