Skip to content

Commit c81ebf5

Browse files
authored
[3.12] bpo-43950: handle wide unicode characters in tracebacks (GH-28150) (#111346)
1 parent e25d8b4 commit c81ebf5

File tree

5 files changed

+187
-16
lines changed

5 files changed

+187
-16
lines changed

Lib/test/test_traceback.py

+57-2
Original file line numberDiff line numberDiff line change
@@ -922,8 +922,63 @@ def f():
922922
f" File \"{__file__}\", line {self.callable_line}, in get_exception",
923923
" callable()",
924924
f" File \"{__file__}\", line {f.__code__.co_firstlineno + 4}, in f",
925-
" print(1, www(",
926-
" ^^^^",
925+
f" print(1, www(",
926+
f" ^^^^^^^",
927+
]
928+
self.assertEqual(actual, expected)
929+
930+
def test_byte_offset_with_wide_characters_term_highlight(self):
931+
def f():
932+
说明说明 = 1
933+
şçöğıĤellö = 0 # not wide but still non-ascii
934+
return 说明说明 / şçöğıĤellö
935+
936+
actual = self.get_exception(f)
937+
expected = [
938+
f"Traceback (most recent call last):",
939+
f" File \"{__file__}\", line {self.callable_line}, in get_exception",
940+
f" callable()",
941+
f" File \"{__file__}\", line {f.__code__.co_firstlineno + 3}, in f",
942+
f" return 说明说明 / şçöğıĤellö",
943+
f" ~~~~~~~~~^~~~~~~~~~~~",
944+
]
945+
self.assertEqual(actual, expected)
946+
947+
def test_byte_offset_with_emojis_term_highlight(self):
948+
def f():
949+
return "✨🐍" + func_说明说明("📗🚛",
950+
"📗🚛") + "🐍"
951+
952+
actual = self.get_exception(f)
953+
expected = [
954+
f"Traceback (most recent call last):",
955+
f" File \"{__file__}\", line {self.callable_line}, in get_exception",
956+
f" callable()",
957+
f" File \"{__file__}\", line {f.__code__.co_firstlineno + 1}, in f",
958+
f' return "✨🐍" + func_说明说明("📗🚛",',
959+
f" ^^^^^^^^^^^^^",
960+
]
961+
self.assertEqual(actual, expected)
962+
963+
def test_byte_offset_wide_chars_subscript(self):
964+
def f():
965+
my_dct = {
966+
"✨🚛✨": {
967+
"说明": {
968+
"🐍🐍🐍": None
969+
}
970+
}
971+
}
972+
return my_dct["✨🚛✨"]["说明"]["🐍"]["说明"]["🐍🐍"]
973+
974+
actual = self.get_exception(f)
975+
expected = [
976+
f"Traceback (most recent call last):",
977+
f" File \"{__file__}\", line {self.callable_line}, in get_exception",
978+
f" callable()",
979+
f" File \"{__file__}\", line {f.__code__.co_firstlineno + 8}, in f",
980+
f' return my_dct["✨🚛✨"]["说明"]["🐍"]["说明"]["🐍🐍"]',
981+
f" ~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^",
927982
]
928983
self.assertEqual(actual, expected)
929984

Lib/traceback.py

+41-12
Original file line numberDiff line numberDiff line change
@@ -470,39 +470,49 @@ def format_frame_summary(self, frame_summary):
470470
stripped_line = frame_summary.line.strip()
471471
row.append(' {}\n'.format(stripped_line))
472472

473-
orig_line_len = len(frame_summary._original_line)
473+
line = frame_summary._original_line
474+
orig_line_len = len(line)
474475
frame_line_len = len(frame_summary.line.lstrip())
475476
stripped_characters = orig_line_len - frame_line_len
476477
if (
477478
frame_summary.colno is not None
478479
and frame_summary.end_colno is not None
479480
):
480481
start_offset = _byte_offset_to_character_offset(
481-
frame_summary._original_line, frame_summary.colno) + 1
482+
line, frame_summary.colno)
482483
end_offset = _byte_offset_to_character_offset(
483-
frame_summary._original_line, frame_summary.end_colno) + 1
484+
line, frame_summary.end_colno)
485+
code_segment = line[start_offset:end_offset]
484486

485487
anchors = None
486488
if frame_summary.lineno == frame_summary.end_lineno:
487489
with suppress(Exception):
488-
anchors = _extract_caret_anchors_from_line_segment(
489-
frame_summary._original_line[start_offset - 1:end_offset - 1]
490-
)
490+
anchors = _extract_caret_anchors_from_line_segment(code_segment)
491491
else:
492-
end_offset = stripped_characters + len(stripped_line)
492+
# Don't count the newline since the anchors only need to
493+
# go up until the last character of the line.
494+
end_offset = len(line.rstrip())
493495

494496
# show indicators if primary char doesn't span the frame line
495497
if end_offset - start_offset < len(stripped_line) or (
496498
anchors and anchors.right_start_offset - anchors.left_end_offset > 0):
499+
# When showing this on a terminal, some of the non-ASCII characters
500+
# might be rendered as double-width characters, so we need to take
501+
# that into account when calculating the length of the line.
502+
dp_start_offset = _display_width(line, start_offset) + 1
503+
dp_end_offset = _display_width(line, end_offset) + 1
504+
497505
row.append(' ')
498-
row.append(' ' * (start_offset - stripped_characters))
506+
row.append(' ' * (dp_start_offset - stripped_characters))
499507

500508
if anchors:
501-
row.append(anchors.primary_char * (anchors.left_end_offset))
502-
row.append(anchors.secondary_char * (anchors.right_start_offset - anchors.left_end_offset))
503-
row.append(anchors.primary_char * (end_offset - start_offset - anchors.right_start_offset))
509+
dp_left_end_offset = _display_width(code_segment, anchors.left_end_offset)
510+
dp_right_start_offset = _display_width(code_segment, anchors.right_start_offset)
511+
row.append(anchors.primary_char * dp_left_end_offset)
512+
row.append(anchors.secondary_char * (dp_right_start_offset - dp_left_end_offset))
513+
row.append(anchors.primary_char * (dp_end_offset - dp_start_offset - dp_right_start_offset))
504514
else:
505-
row.append('^' * (end_offset - start_offset))
515+
row.append('^' * (dp_end_offset - dp_start_offset))
506516

507517
row.append('\n')
508518

@@ -623,6 +633,25 @@ def _extract_caret_anchors_from_line_segment(segment):
623633

624634
return None
625635

636+
_WIDE_CHAR_SPECIFIERS = "WF"
637+
638+
def _display_width(line, offset):
639+
"""Calculate the extra amount of width space the given source
640+
code segment might take if it were to be displayed on a fixed
641+
width output device. Supports wide unicode characters and emojis."""
642+
643+
# Fast track for ASCII-only strings
644+
if line.isascii():
645+
return offset
646+
647+
import unicodedata
648+
649+
return sum(
650+
2 if unicodedata.east_asian_width(char) in _WIDE_CHAR_SPECIFIERS else 1
651+
for char in line[:offset]
652+
)
653+
654+
626655

627656
class _ExceptionPrintContext:
628657
def __init__(self):

Parser/pegen.c

+55
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,61 @@ _PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset)
3838
return size;
3939
}
4040

41+
// Calculate the extra amount of width space the given source
42+
// code segment might take if it were to be displayed on a fixed
43+
// width output device. Supports wide unicode characters and emojis.
44+
Py_ssize_t
45+
_PyPegen_calculate_display_width(PyObject *line, Py_ssize_t character_offset)
46+
{
47+
PyObject *segment = PyUnicode_Substring(line, 0, character_offset);
48+
if (!segment) {
49+
return -1;
50+
}
51+
52+
// Fast track for ascii strings
53+
if (PyUnicode_IS_ASCII(segment)) {
54+
Py_DECREF(segment);
55+
return character_offset;
56+
}
57+
58+
PyObject *width_fn = _PyImport_GetModuleAttrString("unicodedata", "east_asian_width");
59+
if (!width_fn) {
60+
return -1;
61+
}
62+
63+
Py_ssize_t width = 0;
64+
Py_ssize_t len = PyUnicode_GET_LENGTH(segment);
65+
for (Py_ssize_t i = 0; i < len; i++) {
66+
PyObject *chr = PyUnicode_Substring(segment, i, i + 1);
67+
if (!chr) {
68+
Py_DECREF(segment);
69+
Py_DECREF(width_fn);
70+
return -1;
71+
}
72+
73+
PyObject *width_specifier = PyObject_CallOneArg(width_fn, chr);
74+
Py_DECREF(chr);
75+
if (!width_specifier) {
76+
Py_DECREF(segment);
77+
Py_DECREF(width_fn);
78+
return -1;
79+
}
80+
81+
if (_PyUnicode_EqualToASCIIString(width_specifier, "W") ||
82+
_PyUnicode_EqualToASCIIString(width_specifier, "F")) {
83+
width += 2;
84+
}
85+
else {
86+
width += 1;
87+
}
88+
Py_DECREF(width_specifier);
89+
}
90+
91+
Py_DECREF(segment);
92+
Py_DECREF(width_fn);
93+
return width;
94+
}
95+
4196
// Here, mark is the start of the node, while p->mark is the end.
4297
// If node==NULL, they should be the same.
4398
int

Parser/pegen.h

+1
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,7 @@ expr_ty _PyPegen_name_token(Parser *p);
151151
expr_ty _PyPegen_number_token(Parser *p);
152152
void *_PyPegen_string_token(Parser *p);
153153
Py_ssize_t _PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset);
154+
// Display width, in columns, of the first `character_offset` characters of `line`.
Py_ssize_t _PyPegen_calculate_display_width(PyObject *line, Py_ssize_t character_offset);
154155

155156
// Error handling functions and APIs
156157
typedef enum {

Python/traceback.c

+33-2
Original file line numberDiff line numberDiff line change
@@ -900,8 +900,39 @@ tb_displayline(PyTracebackObject* tb, PyObject *f, PyObject *filename, int linen
900900
goto done;
901901
}
902902

903-
if (print_error_location_carets(f, truncation, start_offset, end_offset,
904-
right_start_offset, left_end_offset,
903+
// Convert all offsets to display offsets (e.g. the space they would take up if printed
904+
// on the screen).
905+
Py_ssize_t dp_start = _PyPegen_calculate_display_width(source_line, start_offset);
906+
if (dp_start < 0) {
907+
err = ignore_source_errors() < 0;
908+
goto done;
909+
}
910+
911+
Py_ssize_t dp_end = _PyPegen_calculate_display_width(source_line, end_offset);
912+
if (dp_end < 0) {
913+
err = ignore_source_errors() < 0;
914+
goto done;
915+
}
916+
917+
Py_ssize_t dp_left_end = -1;
918+
Py_ssize_t dp_right_start = -1;
919+
if (has_secondary_ranges) {
920+
dp_left_end = _PyPegen_calculate_display_width(source_line, left_end_offset);
921+
if (dp_left_end < 0) {
922+
err = ignore_source_errors() < 0;
923+
goto done;
924+
}
925+
926+
dp_right_start = _PyPegen_calculate_display_width(source_line, right_start_offset);
927+
if (dp_right_start < 0) {
928+
err = ignore_source_errors() < 0;
929+
goto done;
930+
}
931+
}
932+
933+
934+
if (print_error_location_carets(f, truncation, dp_start, dp_end,
935+
dp_right_start, dp_left_end,
905936
primary_error_char, secondary_error_char) < 0) {
906937
err = -1;
907938
goto done;

0 commit comments

Comments
 (0)