Skip to content

Commit 22cde39

Browse files
authored
[3.11] bpo-43950: handle wide unicode characters in tracebacks (GH-28150) (#111373)
1 parent 762aba7 commit 22cde39

File tree

6 files changed

+189
-15
lines changed

6 files changed

+189
-15
lines changed

Lib/test/test_traceback.py

+56-1
Original file line numberDiff line numberDiff line change
@@ -893,7 +893,62 @@ def f():
893893
f" callable()",
894894
f" File \"{__file__}\", line {f.__code__.co_firstlineno + 4}, in f",
895895
f" print(1, www(",
896-
f" ^^^^",
896+
f" ^^^^^^^",
897+
]
898+
self.assertEqual(actual, expected)
899+
900+
def test_byte_offset_with_wide_characters_term_highlight(self):
# f() divides by zero on a line whose operands are both non-ASCII
# identifiers. The expected marker row is a run of '~', a single '^',
# and another run of '~'.
# NOTE(review): 说明说明 presumably renders double-width on a terminal, so
# the marker row is expected to be measured in display columns rather
# than characters -- confirm against traceback._display_width.
# Do not add lines inside f(): the expected frame line is computed as
# f.__code__.co_firstlineno + 3.
901+
def f():
902+
说明说明 = 1
903+
şçöğıĤellö = 0 # not wide but still non-ascii
904+
return 说明说明 / şçöğıĤellö
905+
906+
actual = self.get_exception(f)
907+
expected = [
908+
f"Traceback (most recent call last):",
909+
f" File \"{__file__}\", line {self.callable_line}, in get_exception",
910+
f" callable()",
911+
f" File \"{__file__}\", line {f.__code__.co_firstlineno + 3}, in f",
912+
f" return 说明说明 / şçöğıĤellö",
913+
f" ~~~~~~~~~^~~~~~~~~~~~",
914+
]
915+
self.assertEqual(actual, expected)
916+
917+
def test_byte_offset_with_emojis_term_highlight(self):
# f() calls func_说明说明, which is not defined in the visible code, in a
# call expression that spans two source lines. The expected traceback
# shows only the first of those lines (co_firstlineno + 1) followed by a
# row of '^'.
# NOTE(review): presumably the failure is a NameError and the carets
# cover just the undefined name -- confirm.
# Do not add lines inside f(): the expected frame line depends on its
# offset from the def line.
918+
def f():
919+
return "✨🐍" + func_说明说明("📗🚛",
920+
"📗🚛") + "🐍"
921+
922+
actual = self.get_exception(f)
923+
expected = [
924+
f"Traceback (most recent call last):",
925+
f" File \"{__file__}\", line {self.callable_line}, in get_exception",
926+
f" callable()",
927+
f" File \"{__file__}\", line {f.__code__.co_firstlineno + 1}, in f",
928+
f' return "✨🐍" + func_说明说明("📗🚛",',
929+
f" ^^^^^^^^^^^^^",
930+
]
931+
self.assertEqual(actual, expected)
932+
933+
def test_byte_offset_wide_chars_subscript(self):
# my_dct["✨🚛✨"]["说明"] only contains the key "🐍🐍🐍", so the ["🐍"]
# subscript in f() is the lookup that fails. The expected marker row is
# a run of '~' followed by a run of '^'.
# Do not add lines inside f(): the expected frame line is computed as
# f.__code__.co_firstlineno + 8.
934+
def f():
935+
my_dct = {
936+
"✨🚛✨": {
937+
"说明": {
938+
"🐍🐍🐍": None
939+
}
940+
}
941+
}
942+
return my_dct["✨🚛✨"]["说明"]["🐍"]["说明"]["🐍🐍"]
943+
944+
actual = self.get_exception(f)
945+
expected = [
946+
f"Traceback (most recent call last):",
947+
f" File \"{__file__}\", line {self.callable_line}, in get_exception",
948+
f" callable()",
949+
f" File \"{__file__}\", line {f.__code__.co_firstlineno + 8}, in f",
950+
f' return my_dct["✨🚛✨"]["说明"]["🐍"]["说明"]["🐍🐍"]',
951+
f" ~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^",
897952
]
898953
self.assertEqual(actual, expected)
899954

Lib/traceback.py

+41-12
Original file line numberDiff line numberDiff line change
@@ -465,39 +465,49 @@ def format_frame_summary(self, frame_summary):
465465
stripped_line = frame_summary.line.strip()
466466
row.append(' {}\n'.format(stripped_line))
467467

468-
orig_line_len = len(frame_summary._original_line)
468+
line = frame_summary._original_line
469+
orig_line_len = len(line)
469470
frame_line_len = len(frame_summary.line.lstrip())
470471
stripped_characters = orig_line_len - frame_line_len
471472
if (
472473
frame_summary.colno is not None
473474
and frame_summary.end_colno is not None
474475
):
475476
start_offset = _byte_offset_to_character_offset(
476-
frame_summary._original_line, frame_summary.colno) + 1
477+
line, frame_summary.colno)
477478
end_offset = _byte_offset_to_character_offset(
478-
frame_summary._original_line, frame_summary.end_colno) + 1
479+
line, frame_summary.end_colno)
480+
code_segment = line[start_offset:end_offset]
479481

480482
anchors = None
481483
if frame_summary.lineno == frame_summary.end_lineno:
482484
with suppress(Exception):
483-
anchors = _extract_caret_anchors_from_line_segment(
484-
frame_summary._original_line[start_offset - 1:end_offset - 1]
485-
)
485+
anchors = _extract_caret_anchors_from_line_segment(code_segment)
486486
else:
487-
end_offset = stripped_characters + len(stripped_line)
487+
# Don't count the newline since the anchors only need to
488+
# go up until the last character of the line.
489+
end_offset = len(line.rstrip())
488490

489491
# show indicators if primary char doesn't span the frame line
490492
if end_offset - start_offset < len(stripped_line) or (
491493
anchors and anchors.right_start_offset - anchors.left_end_offset > 0):
494+
# When showing this on a terminal, some of the non-ASCII characters
495+
# might be rendered as double-width characters, so we need to take
496+
# that into account when calculating the length of the line.
497+
dp_start_offset = _display_width(line, start_offset) + 1
498+
dp_end_offset = _display_width(line, end_offset) + 1
499+
492500
row.append(' ')
493-
row.append(' ' * (start_offset - stripped_characters))
501+
row.append(' ' * (dp_start_offset - stripped_characters))
494502

495503
if anchors:
496-
row.append(anchors.primary_char * (anchors.left_end_offset))
497-
row.append(anchors.secondary_char * (anchors.right_start_offset - anchors.left_end_offset))
498-
row.append(anchors.primary_char * (end_offset - start_offset - anchors.right_start_offset))
504+
dp_left_end_offset = _display_width(code_segment, anchors.left_end_offset)
505+
dp_right_start_offset = _display_width(code_segment, anchors.right_start_offset)
506+
row.append(anchors.primary_char * dp_left_end_offset)
507+
row.append(anchors.secondary_char * (dp_right_start_offset - dp_left_end_offset))
508+
row.append(anchors.primary_char * (dp_end_offset - dp_start_offset - dp_right_start_offset))
499509
else:
500-
row.append('^' * (end_offset - start_offset))
510+
row.append('^' * (dp_end_offset - dp_start_offset))
501511

502512
row.append('\n')
503513

@@ -618,6 +628,25 @@ def _extract_caret_anchors_from_line_segment(segment):
618628

619629
return None
620630

631+
_WIDE_CHAR_SPECIFIERS = "WF"
632+
633+
def _display_width(line, offset):
634+
"""Calculate the extra amount of width space the given source
635+
code segment might take if it were to be displayed on a fixed
636+
width output device. Supports wide unicode characters and emojis."""
637+
638+
# Fast track for ASCII-only strings
639+
if line.isascii():
640+
return offset
641+
642+
import unicodedata
643+
644+
return sum(
645+
2 if unicodedata.east_asian_width(char) in _WIDE_CHAR_SPECIFIERS else 1
646+
for char in line[:offset]
647+
)
648+
649+
621650

622651
class _ExceptionPrintContext:
623652
def __init__(self):
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Traceback location ranges involving wide Unicode characters (like emoji and
2+
Asian characters) are now properly highlighted. Patch by Batuhan Taskaya and
3+
Pablo Galindo.

Parser/pegen.c

+55
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,61 @@ _PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset)
3838
return size;
3939
}
4040

41+
// Calculate the extra amount of width space the given source
42+
// code segment might take if it were to be displayed on a fixed
43+
// width output device. Supports wide unicode characters and emojis.
44+
Py_ssize_t
45+
_PyPegen_calculate_display_width(PyObject *line, Py_ssize_t character_offset)
46+
{
47+
PyObject *segment = PyUnicode_Substring(line, 0, character_offset);
48+
if (!segment) {
49+
return -1;
50+
}
51+
52+
// Fast track for ascii strings
53+
if (PyUnicode_IS_ASCII(segment)) {
54+
Py_DECREF(segment);
55+
return character_offset;
56+
}
57+
58+
PyObject *width_fn = _PyImport_GetModuleAttrString("unicodedata", "east_asian_width");
59+
if (!width_fn) {
60+
return -1;
61+
}
62+
63+
Py_ssize_t width = 0;
64+
Py_ssize_t len = PyUnicode_GET_LENGTH(segment);
65+
for (Py_ssize_t i = 0; i < len; i++) {
66+
PyObject *chr = PyUnicode_Substring(segment, i, i + 1);
67+
if (!chr) {
68+
Py_DECREF(segment);
69+
Py_DECREF(width_fn);
70+
return -1;
71+
}
72+
73+
PyObject *width_specifier = PyObject_CallOneArg(width_fn, chr);
74+
Py_DECREF(chr);
75+
if (!width_specifier) {
76+
Py_DECREF(segment);
77+
Py_DECREF(width_fn);
78+
return -1;
79+
}
80+
81+
if (_PyUnicode_EqualToASCIIString(width_specifier, "W") ||
82+
_PyUnicode_EqualToASCIIString(width_specifier, "F")) {
83+
width += 2;
84+
}
85+
else {
86+
width += 1;
87+
}
88+
Py_DECREF(width_specifier);
89+
}
90+
91+
Py_DECREF(segment);
92+
Py_DECREF(width_fn);
93+
return width;
94+
}
95+
4196
// Here, mark is the start of the node, while p->mark is the end.
4297
// If node==NULL, they should be the same.
4398
int

Parser/pegen.h

+1
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,7 @@ expr_ty _PyPegen_name_token(Parser *p);
143143
expr_ty _PyPegen_number_token(Parser *p);
144144
void *_PyPegen_string_token(Parser *p);
145145
Py_ssize_t _PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset);
146+
Py_ssize_t _PyPegen_calculate_display_width(PyObject *segment, Py_ssize_t character_offset);
146147

147148
// Error handling functions and APIs
148149
typedef enum {

Python/traceback.c

+33-2
Original file line numberDiff line numberDiff line change
@@ -907,8 +907,39 @@ tb_displayline(PyTracebackObject* tb, PyObject *f, PyObject *filename, int linen
907907
goto done;
908908
}
909909

910-
if (print_error_location_carets(f, truncation, start_offset, end_offset,
911-
right_start_offset, left_end_offset,
910+
// Convert all offsets to display offsets (e.g. the space they would take up if printed
911+
// on the screen).
912+
Py_ssize_t dp_start = _PyPegen_calculate_display_width(source_line, start_offset);
913+
if (dp_start < 0) {
914+
err = ignore_source_errors() < 0;
915+
goto done;
916+
}
917+
918+
Py_ssize_t dp_end = _PyPegen_calculate_display_width(source_line, end_offset);
919+
if (dp_end < 0) {
920+
err = ignore_source_errors() < 0;
921+
goto done;
922+
}
923+
924+
Py_ssize_t dp_left_end = -1;
925+
Py_ssize_t dp_right_start = -1;
926+
if (has_secondary_ranges) {
927+
dp_left_end = _PyPegen_calculate_display_width(source_line, left_end_offset);
928+
if (dp_left_end < 0) {
929+
err = ignore_source_errors() < 0;
930+
goto done;
931+
}
932+
933+
dp_right_start = _PyPegen_calculate_display_width(source_line, right_start_offset);
934+
if (dp_right_start < 0) {
935+
err = ignore_source_errors() < 0;
936+
goto done;
937+
}
938+
}
939+
940+
941+
if (print_error_location_carets(f, truncation, dp_start, dp_end,
942+
dp_right_start, dp_left_end,
912943
primary_error_char, secondary_error_char) < 0) {
913944
err = -1;
914945
goto done;

0 commit comments

Comments
 (0)