Skip to content

Commit 8102a9d

Browse files
committed
gh-88116: Initial work on wide unicode characters
1 parent 57be545 commit 8102a9d

File tree

5 files changed

+186
-15
lines changed

5 files changed

+186
-15
lines changed

Lib/test/test_traceback.py

+56-1
Original file line numberDiff line numberDiff line change
@@ -884,7 +884,62 @@ def f():
884884
f" callable()",
885885
f" File \"{__file__}\", line {f.__code__.co_firstlineno + 4}, in f",
886886
f" print(1, www(",
887-
f" ^^^^",
887+
f" ^^^^^^^",
888+
]
889+
self.assertEqual(actual, expected)
890+
891+
def test_byte_offset_with_wide_characters_term_highlight(self):
# The failing line in f() divides by zero and mixes a wide (CJK) identifier
# with a narrow but still non-ASCII one. The lines produced by
# self.get_exception(f) must equal `expected`, including the ~/^ marker row
# under the division.
# NOTE(review): leading whitespace inside the expected strings looks
# collapsed in this view -- confirm the marker alignment against the real file.
892+
def f():
893+
说明说明 = 1
894+
şçöğıĤellö = 0 # not wide but still non-ascii
895+
return 说明说明 / şçöğıĤellö
896+
897+
actual = self.get_exception(f)
898+
expected = [
899+
f"Traceback (most recent call last):",
900+
f" File \"{__file__}\", line {self.callable_line}, in get_exception",
901+
f" callable()",
902+
# The `return` statement is three lines below f's `def` line.
f" File \"{__file__}\", line {f.__code__.co_firstlineno + 3}, in f",
903+
f" return 说明说明 / şçöğıĤellö",
904+
f" ~~~~~~~~~^~~~~~~~~~~~",
905+
]
906+
self.assertEqual(actual, expected)
907+
908+
def test_byte_offset_with_emojis_term_highlight(self):
# The failing expression in f() spans two source lines and is full of
# emojis; it calls func_说明说明, which is not defined anywhere in this test
# (presumably raising NameError -- confirm). Only the first line of the
# expression is expected in the output, followed by a row of carets.
# NOTE(review): leading whitespace inside the expected strings looks
# collapsed in this view -- confirm the caret alignment against the real file.
909+
def f():
910+
return "✨🐍" + func_说明说明("📗🚛",
911+
"📗🚛") + "🐍"
912+
913+
actual = self.get_exception(f)
914+
expected = [
915+
f"Traceback (most recent call last):",
916+
f" File \"{__file__}\", line {self.callable_line}, in get_exception",
917+
f" callable()",
918+
# The `return` statement starts one line below f's `def` line.
f" File \"{__file__}\", line {f.__code__.co_firstlineno + 1}, in f",
919+
f' return "✨🐍" + func_说明说明("📗🚛",',
920+
f" ^^^^^^^^^^^^^",
921+
]
922+
self.assertEqual(actual, expected)
923+
924+
def test_byte_offset_wide_chars_subscript(self):
# f() walks a nested dict through a chain of subscripts whose keys are
# emojis and CJK text. The third subscript, ["🐍"], is not a key of the
# innermost dict (its only key is "🐍🐍🐍"), so the lookup fails there.
# The lines produced by self.get_exception(f) must equal `expected`,
# including the ~/^ marker row under the subscript chain.
# NOTE(review): leading whitespace inside the expected strings looks
# collapsed in this view -- confirm the marker alignment against the real file.
925+
def f():
926+
my_dct = {
927+
"✨🚛✨": {
928+
"说明": {
929+
"🐍🐍🐍": None
930+
}
931+
}
932+
}
933+
return my_dct["✨🚛✨"]["说明"]["🐍"]["说明"]["🐍🐍"]
934+
935+
actual = self.get_exception(f)
936+
expected = [
937+
f"Traceback (most recent call last):",
938+
f" File \"{__file__}\", line {self.callable_line}, in get_exception",
939+
f" callable()",
940+
# The `return` statement is eight lines below f's `def` line.
f" File \"{__file__}\", line {f.__code__.co_firstlineno + 8}, in f",
941+
f' return my_dct["✨🚛✨"]["说明"]["🐍"]["说明"]["🐍🐍"]',
942+
f" ~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^",
888943
]
889944
self.assertEqual(actual, expected)
890945

Lib/traceback.py

+41-12
Original file line numberDiff line numberDiff line change
@@ -469,39 +469,49 @@ def format_frame_summary(self, frame_summary):
469469
stripped_line = frame_summary.line.strip()
470470
row.append(' {}\n'.format(stripped_line))
471471

472-
orig_line_len = len(frame_summary._original_line)
472+
line = frame_summary._original_line
473+
orig_line_len = len(line)
473474
frame_line_len = len(frame_summary.line.lstrip())
474475
stripped_characters = orig_line_len - frame_line_len
475476
if (
476477
frame_summary.colno is not None
477478
and frame_summary.end_colno is not None
478479
):
479480
start_offset = _byte_offset_to_character_offset(
480-
frame_summary._original_line, frame_summary.colno) + 1
481+
line, frame_summary.colno)
481482
end_offset = _byte_offset_to_character_offset(
482-
frame_summary._original_line, frame_summary.end_colno) + 1
483+
line, frame_summary.end_colno)
484+
code_segment = line[start_offset:end_offset]
483485

484486
anchors = None
485487
if frame_summary.lineno == frame_summary.end_lineno:
486488
with suppress(Exception):
487-
anchors = _extract_caret_anchors_from_line_segment(
488-
frame_summary._original_line[start_offset - 1:end_offset - 1]
489-
)
489+
anchors = _extract_caret_anchors_from_line_segment(code_segment)
490490
else:
491-
end_offset = stripped_characters + len(stripped_line)
491+
# Don't count the newline since the anchors only need to
492+
# go up until the last character of the line.
493+
end_offset = len(line.rstrip())
492494

493495
# show indicators if primary char doesn't span the frame line
494496
if end_offset - start_offset < len(stripped_line) or (
495497
anchors and anchors.right_start_offset - anchors.left_end_offset > 0):
498+
# When showing this on a terminal, some of the non-ASCII characters
499+
# might be rendered as double-width characters, so we need to take
500+
# that into account when calculating the length of the line.
501+
dp_start_offset = _display_width(line, start_offset) + 1
502+
dp_end_offset = _display_width(line, end_offset) + 1
503+
496504
row.append(' ')
497-
row.append(' ' * (start_offset - stripped_characters))
505+
row.append(' ' * (dp_start_offset - stripped_characters))
498506

499507
if anchors:
500-
row.append(anchors.primary_char * (anchors.left_end_offset))
501-
row.append(anchors.secondary_char * (anchors.right_start_offset - anchors.left_end_offset))
502-
row.append(anchors.primary_char * (end_offset - start_offset - anchors.right_start_offset))
508+
dp_left_end_offset = _display_width(code_segment, anchors.left_end_offset)
509+
dp_right_start_offset = _display_width(code_segment, anchors.right_start_offset)
510+
row.append(anchors.primary_char * dp_left_end_offset)
511+
row.append(anchors.secondary_char * (dp_right_start_offset - dp_left_end_offset))
512+
row.append(anchors.primary_char * (dp_end_offset - dp_start_offset - dp_right_start_offset))
503513
else:
504-
row.append('^' * (end_offset - start_offset))
514+
row.append('^' * (dp_end_offset - dp_start_offset))
505515

506516
row.append('\n')
507517

@@ -612,6 +622,25 @@ def _extract_caret_anchors_from_line_segment(segment):
612622

613623
return None
614624

625+
_WIDE_CHAR_SPECIFIERS = "WF"
626+
627+
def _display_width(line, offset):
628+
"""Calculate the extra amount of width space the given source
629+
code segment might take if it were to be displayed on a fixed
630+
width output device. Supports wide unicode characters and emojis."""
631+
632+
# Fast track for ASCII-only strings
633+
if line.isascii():
634+
return offset
635+
636+
import unicodedata
637+
638+
return sum(
639+
2 if unicodedata.east_asian_width(char) in _WIDE_CHAR_SPECIFIERS else 1
640+
for char in line[:offset]
641+
)
642+
643+
615644

616645
class _ExceptionPrintContext:
617646
def __init__(self):

Parser/pegen.c

+55
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,61 @@ _PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset)
3838
return size;
3939
}
4040

41+
// Calculate the extra amount of width space the given source
42+
// code segment might take if it were to be displayed on a fixed
43+
// width output device. Supports wide unicode characters and emojis.
44+
Py_ssize_t
45+
_PyPegen_calculate_display_width(PyObject *line, Py_ssize_t character_offset)
46+
{
47+
PyObject *segment = PyUnicode_Substring(line, 0, character_offset);
48+
if (!segment) {
49+
return -1;
50+
}
51+
52+
// Fast track for ascii strings
53+
if (PyUnicode_IS_ASCII(segment)) {
54+
Py_DECREF(segment);
55+
return character_offset;
56+
}
57+
58+
PyObject *width_fn = _PyImport_GetModuleAttrString("unicodedata", "east_asian_width");
59+
if (!width_fn) {
60+
return -1;
61+
}
62+
63+
Py_ssize_t width = 0;
64+
Py_ssize_t len = PyUnicode_GET_LENGTH(segment);
65+
for (Py_ssize_t i = 0; i < len; i++) {
66+
PyObject *chr = PyUnicode_Substring(segment, i, i + 1);
67+
if (!chr) {
68+
Py_DECREF(segment);
69+
Py_DECREF(width_fn);
70+
return -1;
71+
}
72+
73+
PyObject *width_specifier = PyObject_CallOneArg(width_fn, chr);
74+
Py_DECREF(chr);
75+
if (!width_specifier) {
76+
Py_DECREF(segment);
77+
Py_DECREF(width_fn);
78+
return -1;
79+
}
80+
81+
if (_PyUnicode_EqualToASCIIString(width_specifier, "W") ||
82+
_PyUnicode_EqualToASCIIString(width_specifier, "F")) {
83+
width += 2;
84+
}
85+
else {
86+
width += 1;
87+
}
88+
Py_DECREF(width_specifier);
89+
}
90+
91+
Py_DECREF(segment);
92+
Py_DECREF(width_fn);
93+
return width;
94+
}
95+
4196
// Here, mark is the start of the node, while p->mark is the end.
4297
// If node==NULL, they should be the same.
4398
int

Parser/pegen.h

+1
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,7 @@ expr_ty _PyPegen_name_token(Parser *p);
144144
expr_ty _PyPegen_number_token(Parser *p);
145145
void *_PyPegen_string_token(Parser *p);
146146
Py_ssize_t _PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset);
147+
// Display width, in columns, of the first `character_offset` characters of `line`; -1 on failure.
Py_ssize_t _PyPegen_calculate_display_width(PyObject *line, Py_ssize_t character_offset);
147148

148149
// Error handling functions and APIs
149150
typedef enum {

Python/traceback.c

+33-2
Original file line numberDiff line numberDiff line change
@@ -884,8 +884,39 @@ tb_displayline(PyTracebackObject* tb, PyObject *f, PyObject *filename, int linen
884884
goto done;
885885
}
886886

887-
if (print_error_location_carets(f, truncation, start_offset, end_offset,
888-
right_start_offset, left_end_offset,
887+
// Convert all offsets to display offsets (e.g. the space they would take up if printed
888+
// on the screen).
889+
Py_ssize_t dp_start = _PyPegen_calculate_display_width(source_line, start_offset);
890+
if (dp_start < 0) {
891+
err = ignore_source_errors() < 0;
892+
goto done;
893+
}
894+
895+
Py_ssize_t dp_end = _PyPegen_calculate_display_width(source_line, end_offset);
896+
if (dp_end < 0) {
897+
err = ignore_source_errors() < 0;
898+
goto done;
899+
}
900+
901+
Py_ssize_t dp_left_end = -1;
902+
Py_ssize_t dp_right_start = -1;
903+
if (has_secondary_ranges) {
904+
dp_left_end = _PyPegen_calculate_display_width(source_line, left_end_offset);
905+
if (dp_left_end < 0) {
906+
err = ignore_source_errors() < 0;
907+
goto done;
908+
}
909+
910+
dp_right_start = _PyPegen_calculate_display_width(source_line, right_start_offset);
911+
if (dp_right_start < 0) {
912+
err = ignore_source_errors() < 0;
913+
goto done;
914+
}
915+
}
916+
917+
918+
if (print_error_location_carets(f, truncation, dp_start, dp_end,
919+
dp_right_start, dp_left_end,
889920
primary_error_char, secondary_error_char) < 0) {
890921
err = -1;
891922
goto done;

0 commit comments

Comments
 (0)