Skip to content

Commit 4ff8419

Browse files
committed
std.os.windows: UTF-16 to UTF-8 conversion for Windows console input (cooked mode only)
1 parent 5ac960f commit 4ff8419

File tree

3 files changed

+195
-31
lines changed

3 files changed

+195
-31
lines changed

lib/std/os.zig

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3185,8 +3185,7 @@ pub fn isatty(handle: fd_t) bool {
31853185
if (isCygwinPty(handle))
31863186
return true;
31873187

3188-
var out: windows.DWORD = undefined;
3189-
return windows.kernel32.GetConsoleMode(handle, &out) != 0;
3188+
return windows.IsConsoleHandle(handle);
31903189
}
31913190
if (builtin.link_libc) {
31923191
return system.isatty(handle) != 0;

lib/std/os/windows.zig

Lines changed: 193 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -446,7 +446,7 @@ pub fn GetQueuedCompletionStatusEx(
446446
/// Closes `hObject` and releases the console bookkeeping slot associated with it, if any.
///
/// Asserts that NtClose reports SUCCESS.
pub fn CloseHandle(hObject: HANDLE) void {
    // Classify the handle while it is still open: IsConsoleHandle relies on
    // GetConsoleMode, which cannot answer for a handle that has already been closed.
    // Releasing the slot first also means the slot never refers to a closed handle.
    if (IsConsoleHandle(hObject)) {
        // Best effort: a console handle that was never read from or written to through
        // the conversion paths has no slot, which is not an error here.
        if (removeConsoleHandleData(hObject)) |_| {} else |_| {}
    }
    assert(ntdll.NtClose(hObject) == .SUCCESS);
}
452452

@@ -459,6 +459,7 @@ pub const ReadFileError = error{
459459
NetNameDeleted,
460460
OperationAborted,
461461
Unexpected,
462+
NotOpenForReading,
462463
};
463464

464465
/// If buffer's length exceeds what a Windows DWORD integer can hold, it will be broken into
@@ -526,21 +527,164 @@ pub fn ReadFile(in_hFile: HANDLE, buffer: []u8, offset: ?u64, io_mode: std.io.Mo
526527
};
527528
break :blk &overlapped_data;
528529
} else null;
529-
if (kernel32.ReadFile(in_hFile, buffer.ptr, want_read_count, &amt_read, overlapped) == 0) {
530-
switch (kernel32.GetLastError()) {
531-
.IO_PENDING => unreachable,
532-
.OPERATION_ABORTED => continue,
533-
.BROKEN_PIPE => return 0,
534-
.HANDLE_EOF => return 0,
535-
.NETNAME_DELETED => return error.NetNameDeleted,
536-
else => |err| return unexpectedError(err),
530+
var console_mode: DWORD = undefined;
531+
const is_console_handle: bool = kernel32.GetConsoleMode(in_hFile, &console_mode) != FALSE;
532+
const is_cooked_mode: bool = (console_mode & ENABLE_LINE_INPUT) != 0;
533+
// Implementation issue:
534+
// There is no reliable way to implement perfectly platform-agnostic UTF-16 to UTF-8
535+
// conversion for raw mode, because it is impossible to know the number of pending
536+
// code units stored in console input buffer, while in cooked mode we can rely on the
537+
// terminating LF character. Without knowing that, ReadConsoleW() may accidentally pop
538+
// out characters without blocking, or prompt for user input at unexpected timing.
539+
// In the case of raw mode, redirect to kernel32.ReadFile() without conversion for now,
540+
// just don't make things worse.
541+
if (is_console_handle and is_cooked_mode) {
542+
assert(offset == null);
543+
amt_read = ReadConsoleWithUtf16ToUtf8Conversion(in_hFile, buffer) catch |err| return err;
544+
} else {
545+
if (kernel32.ReadFile(in_hFile, buffer.ptr, want_read_count, &amt_read, overlapped) == 0) {
546+
switch (kernel32.GetLastError()) {
547+
.IO_PENDING => unreachable,
548+
.OPERATION_ABORTED => continue,
549+
.BROKEN_PIPE => return 0,
550+
.HANDLE_EOF => return 0,
551+
.NETNAME_DELETED => return error.NetNameDeleted,
552+
.INVALID_HANDLE => return error.NotOpenForReading,
553+
else => |err| return unexpectedError(err),
554+
}
537555
}
538556
}
539557
return amt_read;
540558
}
541559
}
542560
}
543561

562+
/// Reads from a console input handle in cooked (line input) mode and returns the text as UTF-8.
///
/// Code units are fetched one at a time with ReadConsoleW and converted to UTF-8. Reading
/// stops when `buffer` is full, once an LF has been delivered, or when the console reports
/// that no code unit was read. CR is discarded. If a converted code point does not fit
/// entirely into `buffer`, its remaining bytes stay in the handle's `utf8_buffer` and are
/// handed out first by the next call. Output produced by the read loop is cut off right
/// after the first SUB (0x1A, Ctrl+Z) byte.
///
/// Unpaired surrogates cannot be expressed in UTF-8 and are skipped.
///
/// Returns the number of bytes stored in `buffer`. Fails with `error.NotOpenForReading`
/// when ReadConsoleW reports INVALID_HANDLE, and with whatever `unexpectedError` yields for
/// any other failure. Panics when the console handle table is full or when the temporary
/// buffer cannot be allocated.
fn ReadConsoleWithUtf16ToUtf8Conversion(hConsoleInput: HANDLE, buffer: []u8) ReadFileError!DWORD {
    const handle_data: *ConsoleHandleData = getConsoleHandleData(hConsoleInput) catch |err| switch (err) {
        error.ConsoleHandleLimitReached => @panic("Reached maximum number of 64 console handles."),
        else => return error.Unexpected,
    };
    // The result is reported as a DWORD, so never produce more bytes than a DWORD can count.
    const dest: []u8 = buffer[0..@min(buffer.len, std.math.maxInt(DWORD))];
    // The temporary buffer can be huge, so keep it away from stack
    var heap_allocator: std.heap.HeapAllocator = std.heap.HeapAllocator.init();
    defer heap_allocator.deinit();
    const allocator: std.mem.Allocator = heap_allocator.allocator();
    const temp_buffer: []u8 = allocator.alloc(u8, dest.len) catch @panic("Out of memory.");
    defer allocator.free(temp_buffer);

    var bytes_read: DWORD = 0;

    // Try flushing leftover UTF-8 bytes first (one code point at most)
    if (handle_data.utf8_buffer.bytes_used != 0) {
        // An LF is a single byte, so it can only be the sole pending byte.
        const leftover_is_line_feed = handle_data.utf8_buffer.bytes_used == 1 and
            handle_data.utf8_buffer.data[handle_data.utf8_buffer.front_index] == 0x0A;
        bytes_read += drainConsoleUtf8Buffer(handle_data, temp_buffer);
        // Stop if the caller's buffer is already full, or if the leftover was the LF that
        // ends a console read.
        if (handle_data.utf8_buffer.bytes_used != 0 or leftover_is_line_feed) {
            return ReadConsoleProcessUtf8Buffer(dest, temp_buffer, bytes_read, false);
        }
    }
    assert(handle_data.utf8_buffer.front_index == 0);

    while (bytes_read < dest.len) {
        // Read only one code unit each loop
        var utf16_code_unit: u16 = undefined;
        var utf16_code_units_read: DWORD = 0;
        if (kernel32.ReadConsoleW(hConsoleInput, &utf16_code_unit, 1, &utf16_code_units_read, null) == FALSE) {
            switch (kernel32.GetLastError()) {
                .INVALID_HANDLE => return error.NotOpenForReading,
                else => |err| return unexpectedError(err),
            }
        }
        // A successful call that delivered nothing leaves utf16_code_unit undefined, so it
        // must not be inspected. NOTE(review): this presumably happens when the read is
        // interrupted (e.g. Ctrl+C) -- confirm against the ReadConsole documentation.
        if (utf16_code_units_read == 0) break;

        const is_high_surrogate = utf16_code_unit >= 0xD800 and utf16_code_unit <= 0xDBFF;
        const is_low_surrogate = utf16_code_unit >= 0xDC00 and utf16_code_unit <= 0xDFFF;
        const has_pending_high_surrogate = handle_data.utf16_buffer.code_units_used == 1;
        var reached_end_of_line: bool = false;

        if (is_high_surrogate) {
            // Remember the high surrogate until its partner arrives. A previously pending,
            // never completed high surrogate is overwritten, i.e. skipped.
            handle_data.utf16_buffer.data[0] = utf16_code_unit;
            handle_data.utf16_buffer.code_units_used = 1;
            continue;
        }
        if (is_low_surrogate) {
            // A low surrogate without a pending high surrogate is unpaired: skip it.
            if (!has_pending_high_surrogate) continue;
            // Assemble the surrogate pair and convert it to UTF-8
            handle_data.utf16_buffer.data[1] = utf16_code_unit;
            handle_data.utf16_buffer.code_units_used = 0;
            const utf8_bytes: usize = std.unicode.utf16leToUtf8(&handle_data.utf8_buffer.data, &handle_data.utf16_buffer.data) catch return error.Unexpected;
            assert(utf8_bytes == 4);
            handle_data.utf8_buffer.bytes_used = 4;
        } else {
            // An ordinary code unit ends any pending pair; the lone high surrogate is skipped.
            handle_data.utf16_buffer.code_units_used = 0;
            if (utf16_code_unit == 0x000D) {
                // CR should always be followed by an LF, so just discard it
                continue;
            }
            const utf8_bytes: usize = std.unicode.utf16leToUtf8(&handle_data.utf8_buffer.data, @as(*[1]u16, &utf16_code_unit)) catch return error.Unexpected;
            handle_data.utf8_buffer.bytes_used = @truncate(utf8_bytes);
            // LF ends a console read immediately
            reached_end_of_line = handle_data.utf8_buffer.bytes_used == 1 and handle_data.utf8_buffer.data[0] == 0x0A;
        }

        bytes_read += drainConsoleUtf8Buffer(handle_data, temp_buffer[bytes_read..]);
        // Bytes still pending mean the caller's buffer is full; they are kept for the next call.
        if (handle_data.utf8_buffer.bytes_used != 0 or reached_end_of_line) break;
    }
    return ReadConsoleProcessUtf8Buffer(dest, temp_buffer, bytes_read, true);
}

/// Moves as many pending bytes as fit from the handle's `utf8_buffer` into `dest` and
/// returns how many were moved. `front_index` is reset to zero once the buffer is empty.
fn drainConsoleUtf8Buffer(handle_data: *ConsoleHandleData, dest: []u8) DWORD {
    const pending = &handle_data.utf8_buffer;
    const count: usize = @min(dest.len, pending.bytes_used);
    for (dest[0..count]) |*out| {
        out.* = pending.data[pending.front_index];
        // Front index wraps around in the case of 4-byte sequence (non-BMP code point)
        pending.front_index +%= 1;
    }
    pending.bytes_used -= @intCast(count);
    if (pending.bytes_used == 0) {
        // UTF-8 buffer is now empty, we can safely reset front_index to zero
        pending.front_index = 0;
    }
    return @intCast(count);
}
668+
669+
/// Copies console bytes collected in `temp_buffer` into `buffer` and returns how many bytes
/// of `buffer` are valid afterwards.
///
/// Only the first `bytes_read` bytes of `temp_buffer` are meaningful, and nothing beyond the
/// returned count is written to `buffer`. When `truncate_after_SUB` is set, copying stops
/// right after the first SUB (0x1A, Ctrl+Z) byte, which is itself part of the result.
/// `buffer` must hold at least `bytes_read` bytes and must not overlap `temp_buffer`.
fn ReadConsoleProcessUtf8Buffer(buffer: []u8, temp_buffer: []u8, bytes_read: DWORD, comptime truncate_after_SUB: bool) DWORD {
    const collected: []const u8 = temp_buffer[0..bytes_read];
    const keep: usize = if (truncate_after_SUB)
        // Keep everything up to and including the first SUB, or all of it if there is none.
        if (std.mem.indexOfScalar(u8, collected, 0x1A)) |sub_index| sub_index + 1 else collected.len
    else
        collected.len;
    @memcpy(buffer[0..keep], collected[0..keep]);
    return @intCast(keep);
}
687+
544688
pub const WriteFileError = error{
545689
SystemResources,
546690
OperationAborted,
@@ -646,7 +790,7 @@ pub fn WriteFile(
646790
}
647791

648792
fn WriteConsoleWithUtf8ToUtf16Conversion(handle: HANDLE, bytes: []const u8) WriteFileError!DWORD {
649-
const handle_data: *ConsoleHandleData = GetConsoleHandleData(handle) catch |err| switch (err) {
793+
const handle_data: *ConsoleHandleData = getConsoleHandleData(handle) catch |err| switch (err) {
650794
error.ConsoleHandleLimitReached => @panic("Reached maximum number of 64 console handles."),
651795
else => return error.Unexpected,
652796
};
@@ -688,6 +832,7 @@ fn WriteConsoleWithUtf8ToUtf16Conversion(handle: HANDLE, bytes: []const u8) Writ
688832
byte_index += bytes_needed;
689833
}
690834
}
835+
// Handle LF to CRLF conversion
691836
switch (utf16_buffer[0]) {
692837
0x000D => {
693838
handle_data.last_character_written_is_CR = true;
@@ -5167,52 +5312,71 @@ pub fn ProcessBaseAddress(handle: HANDLE) ProcessBaseAddressError!HMODULE {
51675312
return ppeb.ImageBaseAddress;
51685313
}
51695314

5315+
// Console mode flag bits. ReadFile masks the mode reported by GetConsoleMode with
// ENABLE_LINE_INPUT to decide whether the console is in cooked (line input) mode.
// NOTE(review): the remaining values presumably mirror the Win32 console input mode
// flags of the same names -- verify against the Windows SDK headers.
pub const ENABLE_PROCESSED_INPUT = 0x0001;
5316+
// Set while the console is in cooked (line input) mode, per its use in ReadFile.
pub const ENABLE_LINE_INPUT = 0x0002;
5317+
pub const ENABLE_ECHO_INPUT = 0x0004;
5318+
pub const ENABLE_WINDOW_INPUT = 0x0008;
5319+
pub const ENABLE_MOUSE_INPUT = 0x0010;
5320+
pub const ENABLE_INSERT_MODE = 0x0020;
5321+
pub const ENABLE_QUICK_EDIT_MODE = 0x0040;
5322+
pub const ENABLE_EXTENDED_FLAGS = 0x0080;
5323+
pub const ENABLE_AUTO_POSITION = 0x0100;
5324+
pub const ENABLE_VIRTUAL_TERMINAL_INPUT = 0x0200;
5325+
5326+
// C-ABI struct of four ULONG fields.
// NOTE(review): presumably the Win32 CONSOLE_READCONSOLE_CONTROL structure meant for the
// pInputControl argument of ReadConsoleW; the read path in this file passes null there,
// so nothing here constructs it -- confirm field semantics against the Windows SDK.
pub const CONSOLE_READCONSOLE_CONTROL = extern struct {
5327+
nLength: ULONG,
5328+
nInitialChars: ULONG,
5329+
dwCtrlWakeupMask: ULONG,
5330+
dwControlKeyState: ULONG,
5331+
};
5332+
5333+
// Pointer alias for CONSOLE_READCONSOLE_CONTROL.
pub const PCONSOLE_READCONSOLE_CONTROL = *CONSOLE_READCONSOLE_CONTROL;
5334+
51705335
/// Tells whether `handle` is a console handle, judged by whether GetConsoleMode succeeds
/// on it. The reported mode itself is discarded.
pub fn IsConsoleHandle(handle: HANDLE) bool {
    var ignored_mode: DWORD = undefined;
    const query_succeeded = kernel32.GetConsoleMode(handle, &ignored_mode) != FALSE;
    return query_succeeded;
}
51745339

5340+
// Non-public extra data associated with console handle, and its helper functions
51755341
/// Extra state kept per console handle so that UTF-8 <-> UTF-16 conversion can span
/// several read or write calls.
const ConsoleHandleData = struct {
    /// True while this slot of the console handle table is in use.
    is_assigned: bool = false,
    /// The console handle this slot belongs to.
    handle: ?HANDLE = null,
    /// UTF-8 bytes that could not be consumed within a single call.
    ///
    /// On the write path, UTF-8 encoded strings are converted to UTF-16 before being written
    /// to the native console. Since write() might be called with a string fragment or even a
    /// single byte, residual UTF-8 byte(s) are stored here without returning an error until
    /// enough bytes arrive to complete a code point.
    ///
    /// On the read path, these are the bytes of a converted code point that did not fit
    /// into the caller's buffer; they are handed out first by the next read.
    utf8_buffer: Utf8Buffer = .{},
    /// On the read path, holds a high surrogate until the matching low surrogate is read.
    utf16_buffer: Utf16Buffer = .{},
    /// Used for LF to CRLF conversion before writing to the native console.
    last_character_written_is_CR: bool = false,

    const Utf8Buffer = struct {
        /// Storage for one UTF-8 encoded code point (at most 4 bytes).
        data: [4]u8 = .{ 0x00, 0x00, 0x00, 0x00 },
        /// Number of bytes in `data` that are still pending.
        bytes_used: u3 = 0,
        /// Index of the next byte the read path hands out; wraps around after index 3.
        front_index: u2 = 0,
    };

    const Utf16Buffer = struct {
        /// Storage for one surrogate pair: high surrogate first, low surrogate second.
        data: [2]u16 = .{ 0x0000, 0x0000 },
        /// Number of code units currently waiting in `data`.
        code_units_used: u2 = 0,
    };
};
51945359

5195-
const console_handle_data_limit = 64;
5360+
// Number of slots in the console handle table; callers panic with "Reached maximum
// number of 64 console handles." when getConsoleHandleData reports the limit.
const max_console_handle_data = 64;
51965361

51975362
// Table of per-handle console state, searched linearly by getConsoleHandleData and
// removeConsoleHandleData. On Windows it is an array of default-initialized
// ConsoleHandleData; on every other OS it has type void.
var console_handle_data_array: switch (builtin.os.tag) {
5198-
.windows => [console_handle_data_limit]ConsoleHandleData,
5363+
.windows => [max_console_handle_data]ConsoleHandleData,
51995364
else => void,
52005365
} = switch (builtin.os.tag) {
5201-
.windows => [_]ConsoleHandleData{.{}} ** console_handle_data_limit,
5366+
.windows => [_]ConsoleHandleData{.{}} ** max_console_handle_data,
52025367
else => void{},
52035368
};
52045369

52055370
// Errors reported by the console handle table helpers.
const ConsoleHandleDataError = error{
5206-
OsUnsupported,
52075371
// Returned by removeConsoleHandleData when no assigned slot refers to the handle.
DataNotFound,
52085372
// Returned by getConsoleHandleData; the read and write paths treat it as the table
// being full and panic.
ConsoleHandleLimitReached,
52095373
};
52105374

5211-
fn GetConsoleHandleData(handle: HANDLE) ConsoleHandleDataError!*ConsoleHandleData {
5375+
fn getConsoleHandleData(handle: HANDLE) ConsoleHandleDataError!*ConsoleHandleData {
52125376
if (builtin.os.tag == .windows) {
52135377
var found_unassigned: bool = false;
52145378
var first_unassigned_index: usize = undefined;
5215-
for (0..console_handle_data_limit) |index| {
5379+
for (0..max_console_handle_data) |index| {
52165380
if (console_handle_data_array[index].is_assigned) {
52175381
if (console_handle_data_array[index].handle == handle) {
52185382
return &console_handle_data_array[index];
@@ -5232,20 +5396,20 @@ fn GetConsoleHandleData(handle: HANDLE) ConsoleHandleDataError!*ConsoleHandleDat
52325396
return error.ConsoleHandleLimitReached;
52335397
}
52345398
} else {
5235-
return error.OsUnsupported;
5399+
@compileError("Unsupported OS");
52365400
}
52375401
}
52385402

5239-
fn RemoveConsoleHandleData(handle: HANDLE) ConsoleHandleDataError!usize {
5403+
/// Releases the console handle table slot assigned to `handle`.
///
/// Returns the index of the released slot, or `error.DataNotFound` when no assigned slot
/// refers to `handle`. Compiling this function for a non-Windows target is a compile error.
fn removeConsoleHandleData(handle: HANDLE) ConsoleHandleDataError!usize {
    if (builtin.os.tag == .windows) {
        for (&console_handle_data_array, 0..) |*slot, index| {
            if (slot.is_assigned and slot.handle == handle) {
                // Reset the whole slot rather than only the `is_assigned` flag, so that
                // pending UTF-8/UTF-16 fragments and the CR flag of the released handle
                // can never be observed by whichever handle takes over this slot.
                slot.* = .{};
                return index;
            }
        }
        return error.DataNotFound;
    } else {
        @compileError("Unsupported OS");
    }
}

lib/std/os/windows/kernel32.zig

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,7 @@ pub extern "kernel32" fn FillConsoleOutputCharacterW(hConsoleOutput: HANDLE, cCh
174174
pub extern "kernel32" fn FillConsoleOutputAttribute(hConsoleOutput: HANDLE, wAttribute: WORD, nLength: DWORD, dwWriteCoord: COORD, lpNumberOfAttrsWritten: *DWORD) callconv(WINAPI) BOOL;
175175
pub extern "kernel32" fn SetConsoleCursorPosition(hConsoleOutput: HANDLE, dwCursorPosition: COORD) callconv(WINAPI) BOOL;
176176

177+
/// Binding for ReadConsoleW in kernel32. std.os.windows calls it to fetch one UTF-16 code
/// unit at a time from a console input handle, passing null for `pInputControl`; a FALSE
/// return signals failure and the caller then consults GetLastError.
pub extern "kernel32" fn ReadConsoleW(hConsoleInput: HANDLE, lpBuffer: LPVOID, nNumberOfCharsToRead: DWORD, lpNumberOfCharsRead: *DWORD, pInputControl: ?LPVOID) callconv(WINAPI) BOOL;
177178
pub extern "kernel32" fn WriteConsoleW(hConsoleOutput: HANDLE, lpBuffer: *const anyopaque, nNumberOfCharsToWrite: DWORD, lpNumberOfCharsWritten: ?*DWORD, lpReserved: ?LPVOID) callconv(WINAPI) BOOL;
178179

179180
pub extern "kernel32" fn GetCurrentDirectoryW(nBufferLength: DWORD, lpBuffer: ?[*]WCHAR) callconv(WINAPI) DWORD;

0 commit comments

Comments
 (0)