Skip to content

Commit 3cf83d9

Browse files
gh-124008: Fix calculation of the number of written bytes for the Windows console (GH-124059)
The MultiByteToWideChar()/WideCharToMultiByte() round trip is not reversible if the data contains invalid UTF-8 sequences (each invalid byte is decoded to U+FFFD, which encodes back to the 3-byte sequence \xef\xbf\xbd), so use binary search to calculate the number of written bytes from the number of written characters. Also fix writing of incomplete UTF-8 sequences and the handling of memory allocation failures.
1 parent 83926d3 commit 3cf83d9

File tree

3 files changed

+115
-28
lines changed

3 files changed

+115
-28
lines changed

Lib/test/test_winconsoleio.py

+23
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,29 @@ def test_write_empty_data(self):
142142
with ConIO('CONOUT$', 'w') as f:
143143
self.assertEqual(f.write(b''), 0)
144144

145+
@requires_resource('console')
146+
def test_write(self):
147+
testcases = []
148+
with ConIO('CONOUT$', 'w') as f:
149+
for a in [
150+
b'',
151+
b'abc',
152+
b'\xc2\xa7\xe2\x98\x83\xf0\x9f\x90\x8d',
153+
b'\xff'*10,
154+
]:
155+
for b in b'\xc2\xa7', b'\xe2\x98\x83', b'\xf0\x9f\x90\x8d':
156+
testcases.append(a + b)
157+
for i in range(1, len(b)):
158+
data = a + b[:i]
159+
testcases.append(data + b'z')
160+
testcases.append(data + b'\xff')
161+
# incomplete multibyte sequence
162+
with self.subTest(data=data):
163+
self.assertEqual(f.write(data), len(a))
164+
for data in testcases:
165+
with self.subTest(data=data):
166+
self.assertEqual(f.write(data), len(data))
167+
145168
def assertStdinRoundTrip(self, text):
146169
stdin = open('CONIN$', 'r')
147170
old_stdin = sys.stdin
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Fix a possible crash (in debug builds), incorrect output, or an incorrect
2+
return value from raw binary ``write()`` when writing to the console on Windows.

Modules/_io/winconsoleio.c

+90-28
Original file line numberDiff line numberDiff line change
@@ -135,19 +135,67 @@ char _PyIO_get_console_type(PyObject *path_or_fd) {
135135
}
136136

137137
static DWORD
138-
_find_last_utf8_boundary(const char *buf, DWORD len)
138+
_find_last_utf8_boundary(const unsigned char *buf, DWORD len)
139139
{
140-
/* This function never returns 0, returns the original len instead */
141-
DWORD count = 1;
142-
if (len == 0 || (buf[len - 1] & 0x80) == 0) {
143-
return len;
144-
}
145-
for (;; count++) {
146-
if (count > 3 || count >= len) {
140+
for (DWORD count = 1; count < 4 && count <= len; count++) {
141+
unsigned char c = buf[len - count];
142+
if (c < 0x80) {
143+
/* No starting byte found. */
147144
return len;
148145
}
149-
if ((buf[len - count] & 0xc0) != 0x80) {
150-
return len - count;
146+
if (c >= 0xc0) {
147+
if (c < 0xe0 /* 2-bytes sequence */ ? count < 2 :
148+
c < 0xf0 /* 3-bytes sequence */ ? count < 3 :
149+
c < 0xf8 /* 4-bytes sequence */)
150+
{
151+
/* Incomplete multibyte sequence. */
152+
return len - count;
153+
}
154+
/* Either complete or invalid sequence. */
155+
return len;
156+
}
157+
}
158+
/* Either complete 4-bytes sequence or invalid sequence. */
159+
return len;
160+
}
161+
162+
/* Find the number of UTF-8 bytes that corresponds to the specified number of
163+
* wchars.
164+
* I.e. find x <= len so that MultiByteToWideChar(CP_UTF8, 0, s, x, NULL, 0) == n.
165+
*
166+
* WideCharToMultiByte() cannot be used for this, because the UTF-8 -> wchar
167+
* conversion is not reversible (invalid UTF-8 byte produces \ufffd which
168+
* will be converted back to 3-bytes UTF-8 sequence \xef\xbf\xbd).
169+
* So we need to use binary search.
170+
*/
171+
static DWORD
172+
_wchar_to_utf8_count(const unsigned char *s, DWORD len, DWORD n)
173+
{
174+
DWORD start = 0;
175+
while (1) {
176+
DWORD mid = 0;
177+
for (DWORD i = len / 2; i <= len; i++) {
178+
mid = _find_last_utf8_boundary(s, i);
179+
if (mid != 0) {
180+
break;
181+
}
182+
/* The middle could split the first multibytes sequence. */
183+
}
184+
if (mid == len) {
185+
return start + len;
186+
}
187+
if (mid == 0) {
188+
mid = len > 1 ? len - 1 : 1;
189+
}
190+
DWORD wlen = MultiByteToWideChar(CP_UTF8, 0, s, mid, NULL, 0);
191+
if (wlen <= n) {
192+
s += mid;
193+
start += mid;
194+
len -= mid;
195+
n -= wlen;
196+
}
197+
else {
198+
len = mid;
151199
}
152200
}
153201
}
@@ -563,8 +611,10 @@ read_console_w(HANDLE handle, DWORD maxlen, DWORD *readlen) {
563611
int err = 0, sig = 0;
564612

565613
wchar_t *buf = (wchar_t*)PyMem_Malloc(maxlen * sizeof(wchar_t));
566-
if (!buf)
614+
if (!buf) {
615+
PyErr_NoMemory();
567616
goto error;
617+
}
568618

569619
*readlen = 0;
570620

@@ -622,6 +672,7 @@ read_console_w(HANDLE handle, DWORD maxlen, DWORD *readlen) {
622672
Py_UNBLOCK_THREADS
623673
if (!newbuf) {
624674
sig = -1;
675+
PyErr_NoMemory();
625676
break;
626677
}
627678
buf = newbuf;
@@ -645,8 +696,10 @@ read_console_w(HANDLE handle, DWORD maxlen, DWORD *readlen) {
645696
if (*readlen > 0 && buf[0] == L'\x1a') {
646697
PyMem_Free(buf);
647698
buf = (wchar_t *)PyMem_Malloc(sizeof(wchar_t));
648-
if (!buf)
699+
if (!buf) {
700+
PyErr_NoMemory();
649701
goto error;
702+
}
650703
buf[0] = L'\0';
651704
*readlen = 0;
652705
}
@@ -824,8 +877,10 @@ _io__WindowsConsoleIO_readall_impl(winconsoleio *self)
824877
bufsize = BUFSIZ;
825878

826879
buf = (wchar_t*)PyMem_Malloc((bufsize + 1) * sizeof(wchar_t));
827-
if (buf == NULL)
880+
if (buf == NULL) {
881+
PyErr_NoMemory();
828882
return NULL;
883+
}
829884

830885
while (1) {
831886
wchar_t *subbuf;
@@ -847,6 +902,7 @@ _io__WindowsConsoleIO_readall_impl(winconsoleio *self)
847902
(bufsize + 1) * sizeof(wchar_t));
848903
if (tmp == NULL) {
849904
PyMem_Free(buf);
905+
PyErr_NoMemory();
850906
return NULL;
851907
}
852908
buf = tmp;
@@ -1022,43 +1078,49 @@ _io__WindowsConsoleIO_write_impl(winconsoleio *self, PyTypeObject *cls,
10221078
len = (DWORD)b->len;
10231079

10241080
Py_BEGIN_ALLOW_THREADS
1025-
wlen = MultiByteToWideChar(CP_UTF8, 0, b->buf, len, NULL, 0);
1026-
10271081
/* issue11395 there is an unspecified upper bound on how many bytes
10281082
can be written at once. We cap at 32k - the caller will have to
10291083
handle partial writes.
10301084
Since we don't know how many input bytes are being ignored, we
10311085
have to reduce and recalculate. */
1032-
while (wlen > 32766 / sizeof(wchar_t)) {
1033-
len /= 2;
1086+
const DWORD max_wlen = 32766 / sizeof(wchar_t);
1087+
/* UTF-8 to wchar ratio is at most 3:1. */
1088+
len = Py_MIN(len, max_wlen * 3);
1089+
while (1) {
10341090
/* Fix for github issues gh-110913 and gh-82052. */
10351091
len = _find_last_utf8_boundary(b->buf, len);
10361092
wlen = MultiByteToWideChar(CP_UTF8, 0, b->buf, len, NULL, 0);
1093+
if (wlen <= max_wlen) {
1094+
break;
1095+
}
1096+
len /= 2;
10371097
}
10381098
Py_END_ALLOW_THREADS
10391099

1040-
if (!wlen)
1041-
return PyErr_SetFromWindowsErr(0);
1100+
if (!wlen) {
1101+
return PyLong_FromLong(0);
1102+
}
10421103

10431104
wbuf = (wchar_t*)PyMem_Malloc(wlen * sizeof(wchar_t));
1105+
if (!wbuf) {
1106+
PyErr_NoMemory();
1107+
return NULL;
1108+
}
10441109

10451110
Py_BEGIN_ALLOW_THREADS
10461111
wlen = MultiByteToWideChar(CP_UTF8, 0, b->buf, len, wbuf, wlen);
10471112
if (wlen) {
10481113
res = WriteConsoleW(handle, wbuf, wlen, &n, NULL);
1114+
#ifdef Py_DEBUG
1115+
if (res) {
1116+
#else
10491117
if (res && n < wlen) {
1118+
#endif
10501119
/* Wrote fewer characters than expected, which means our
10511120
* len value may be wrong. So recalculate it from the
1052-
* characters that were written. As this could potentially
1053-
* result in a different value, we also validate that value.
1121+
* characters that were written.
10541122
*/
1055-
len = WideCharToMultiByte(CP_UTF8, 0, wbuf, n,
1056-
NULL, 0, NULL, NULL);
1057-
if (len) {
1058-
wlen = MultiByteToWideChar(CP_UTF8, 0, b->buf, len,
1059-
NULL, 0);
1060-
assert(wlen == len);
1061-
}
1123+
len = _wchar_to_utf8_count(b->buf, len, n);
10621124
}
10631125
} else
10641126
res = 0;

0 commit comments

Comments
 (0)