@@ -135,19 +135,67 @@ char _PyIO_get_console_type(PyObject *path_or_fd) {
135
135
}
136
136
137
137
static DWORD
138
- _find_last_utf8_boundary (const char * buf , DWORD len )
138
+ _find_last_utf8_boundary (const unsigned char * buf , DWORD len )
139
139
{
140
- /* This function never returns 0, returns the original len instead */
141
- DWORD count = 1 ;
142
- if (len == 0 || (buf [len - 1 ] & 0x80 ) == 0 ) {
143
- return len ;
144
- }
145
- for (;; count ++ ) {
146
- if (count > 3 || count >= len ) {
140
+ for (DWORD count = 1 ; count < 4 && count <= len ; count ++ ) {
141
+ unsigned char c = buf [len - count ];
142
+ if (c < 0x80 ) {
143
+ /* No starting byte found. */
147
144
return len ;
148
145
}
149
- if ((buf [len - count ] & 0xc0 ) != 0x80 ) {
150
- return len - count ;
146
+ if (c >= 0xc0 ) {
147
+ if (c < 0xe0 /* 2-bytes sequence */ ? count < 2 :
148
+ c < 0xf0 /* 3-bytes sequence */ ? count < 3 :
149
+ c < 0xf8 /* 4-bytes sequence */ )
150
+ {
151
+ /* Incomplete multibyte sequence. */
152
+ return len - count ;
153
+ }
154
+ /* Either complete or invalid sequence. */
155
+ return len ;
156
+ }
157
+ }
158
+ /* Either complete 4-bytes sequence or invalid sequence. */
159
+ return len ;
160
+ }
161
+
162
+ /* Find the number of UTF-8 bytes that corresponds to the specified number of
163
+ * wchars.
164
+ * I.e. find x <= len so that MultiByteToWideChar(CP_UTF8, 0, s, x, NULL, 0) == n.
165
+ *
166
+ * WideCharToMultiByte() cannot be used for this, because the UTF-8 -> wchar
167
+ * conversion is not reversible (invalid UTF-8 byte produces \ufffd which
168
+ * will be converted back to 3-bytes UTF-8 sequence \xef\xbf\xbd).
169
+ * So we need to use binary search.
170
+ */
171
+ static DWORD
172
+ _wchar_to_utf8_count (const unsigned char * s , DWORD len , DWORD n )
173
+ {
174
+ DWORD start = 0 ;
175
+ while (1 ) {
176
+ DWORD mid = 0 ;
177
+ for (DWORD i = len / 2 ; i <= len ; i ++ ) {
178
+ mid = _find_last_utf8_boundary (s , i );
179
+ if (mid != 0 ) {
180
+ break ;
181
+ }
182
+ /* The middle could split the first multibytes sequence. */
183
+ }
184
+ if (mid == len ) {
185
+ return start + len ;
186
+ }
187
+ if (mid == 0 ) {
188
+ mid = len > 1 ? len - 1 : 1 ;
189
+ }
190
+ DWORD wlen = MultiByteToWideChar (CP_UTF8 , 0 , s , mid , NULL , 0 );
191
+ if (wlen <= n ) {
192
+ s += mid ;
193
+ start += mid ;
194
+ len -= mid ;
195
+ n -= wlen ;
196
+ }
197
+ else {
198
+ len = mid ;
151
199
}
152
200
}
153
201
}
@@ -563,8 +611,10 @@ read_console_w(HANDLE handle, DWORD maxlen, DWORD *readlen) {
563
611
int err = 0 , sig = 0 ;
564
612
565
613
wchar_t * buf = (wchar_t * )PyMem_Malloc (maxlen * sizeof (wchar_t ));
566
- if (!buf )
614
+ if (!buf ) {
615
+ PyErr_NoMemory ();
567
616
goto error ;
617
+ }
568
618
569
619
* readlen = 0 ;
570
620
@@ -622,6 +672,7 @@ read_console_w(HANDLE handle, DWORD maxlen, DWORD *readlen) {
622
672
Py_UNBLOCK_THREADS
623
673
if (!newbuf ) {
624
674
sig = -1 ;
675
+ PyErr_NoMemory ();
625
676
break ;
626
677
}
627
678
buf = newbuf ;
@@ -645,8 +696,10 @@ read_console_w(HANDLE handle, DWORD maxlen, DWORD *readlen) {
645
696
if (* readlen > 0 && buf [0 ] == L'\x1a' ) {
646
697
PyMem_Free (buf );
647
698
buf = (wchar_t * )PyMem_Malloc (sizeof (wchar_t ));
648
- if (!buf )
699
+ if (!buf ) {
700
+ PyErr_NoMemory ();
649
701
goto error ;
702
+ }
650
703
buf [0 ] = L'\0' ;
651
704
* readlen = 0 ;
652
705
}
@@ -824,8 +877,10 @@ _io__WindowsConsoleIO_readall_impl(winconsoleio *self)
824
877
bufsize = BUFSIZ ;
825
878
826
879
buf = (wchar_t * )PyMem_Malloc ((bufsize + 1 ) * sizeof (wchar_t ));
827
- if (buf == NULL )
880
+ if (buf == NULL ) {
881
+ PyErr_NoMemory ();
828
882
return NULL ;
883
+ }
829
884
830
885
while (1 ) {
831
886
wchar_t * subbuf ;
@@ -847,6 +902,7 @@ _io__WindowsConsoleIO_readall_impl(winconsoleio *self)
847
902
(bufsize + 1 ) * sizeof (wchar_t ));
848
903
if (tmp == NULL ) {
849
904
PyMem_Free (buf );
905
+ PyErr_NoMemory ();
850
906
return NULL ;
851
907
}
852
908
buf = tmp ;
@@ -1022,43 +1078,49 @@ _io__WindowsConsoleIO_write_impl(winconsoleio *self, PyTypeObject *cls,
1022
1078
len = (DWORD )b -> len ;
1023
1079
1024
1080
Py_BEGIN_ALLOW_THREADS
1025
- wlen = MultiByteToWideChar (CP_UTF8 , 0 , b -> buf , len , NULL , 0 );
1026
-
1027
1081
/* issue11395 there is an unspecified upper bound on how many bytes
1028
1082
can be written at once. We cap at 32k - the caller will have to
1029
1083
handle partial writes.
1030
1084
Since we don't know how many input bytes are being ignored, we
1031
1085
have to reduce and recalculate. */
1032
- while (wlen > 32766 / sizeof (wchar_t )) {
1033
- len /= 2 ;
1086
+ const DWORD max_wlen = 32766 / sizeof (wchar_t );
1087
+ /* UTF-8 to wchar ratio is at most 3:1. */
1088
+ len = Py_MIN (len , max_wlen * 3 );
1089
+ while (1 ) {
1034
1090
/* Fix for github issues gh-110913 and gh-82052. */
1035
1091
len = _find_last_utf8_boundary (b -> buf , len );
1036
1092
wlen = MultiByteToWideChar (CP_UTF8 , 0 , b -> buf , len , NULL , 0 );
1093
+ if (wlen <= max_wlen ) {
1094
+ break ;
1095
+ }
1096
+ len /= 2 ;
1037
1097
}
1038
1098
Py_END_ALLOW_THREADS
1039
1099
1040
- if (!wlen )
1041
- return PyErr_SetFromWindowsErr (0 );
1100
+ if (!wlen ) {
1101
+ return PyLong_FromLong (0 );
1102
+ }
1042
1103
1043
1104
wbuf = (wchar_t * )PyMem_Malloc (wlen * sizeof (wchar_t ));
1105
+ if (!wbuf ) {
1106
+ PyErr_NoMemory ();
1107
+ return NULL ;
1108
+ }
1044
1109
1045
1110
Py_BEGIN_ALLOW_THREADS
1046
1111
wlen = MultiByteToWideChar (CP_UTF8 , 0 , b -> buf , len , wbuf , wlen );
1047
1112
if (wlen ) {
1048
1113
res = WriteConsoleW (handle , wbuf , wlen , & n , NULL );
1114
+ #ifdef Py_DEBUG
1115
+ if (res ) {
1116
+ #else
1049
1117
if (res && n < wlen ) {
1118
+ #endif
1050
1119
/* Wrote fewer characters than expected, which means our
1051
1120
* len value may be wrong. So recalculate it from the
1052
- * characters that were written. As this could potentially
1053
- * result in a different value, we also validate that value.
1121
+ * characters that were written.
1054
1122
*/
1055
- len = WideCharToMultiByte (CP_UTF8 , 0 , wbuf , n ,
1056
- NULL , 0 , NULL , NULL );
1057
- if (len ) {
1058
- wlen = MultiByteToWideChar (CP_UTF8 , 0 , b -> buf , len ,
1059
- NULL , 0 );
1060
- assert (wlen == len );
1061
- }
1123
+ len = _wchar_to_utf8_count (b -> buf , len , n );
1062
1124
}
1063
1125
} else
1064
1126
res = 0 ;
0 commit comments