Skip to content

Commit d4fe650

Browse files
committed
std.os.windows: Rewrite ReadConsoleWithUtf16ToUtf8Conversion() using fixed-size temporary buffer
1 parent fffbb92 commit d4fe650

File tree

1 file changed

+132
-108
lines changed

1 file changed

+132
-108
lines changed

lib/std/os/windows.zig

Lines changed: 132 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -564,125 +564,149 @@ fn ReadConsoleWithUtf16ToUtf8Conversion(hConsoleInput: HANDLE, buffer: []u8) Rea
564564
error.ConsoleHandleLimitReached => @panic("Reached maximum number of 64 console handles."),
565565
else => return error.Unexpected,
566566
};
567-
// The temporary buffer can be huge, so keep it away from stack
568-
var heap_allocator: std.heap.HeapAllocator = std.heap.HeapAllocator.init();
569-
defer heap_allocator.deinit();
570-
const allocator: std.mem.Allocator = heap_allocator.allocator();
571-
var temp_buffer: []u8 = allocator.alloc(u8, buffer.len) catch @panic("Out of memory.");
572-
defer allocator.free(temp_buffer);
573-
567+
var temp_buffer: [1024]u8 = undefined;
574568
var bytes_read: DWORD = 0;
575569
var reached_end_of_line: bool = false;
576-
577-
// Try flushing leftover UTF-8 bytes first (one codepoint at most)
578-
if (handle_data.utf8_buffer.bytes_used != 0) {
579-
// LF will only appear at the first byte and there will be only one byte in the buffer
580-
if (handle_data.utf8_buffer.data[0] == 0x0A) {
581-
assert(handle_data.utf8_buffer.bytes_used == 1);
582-
reached_end_of_line = true;
583-
}
584-
// Is there enough space for all bytes in UTF-8 buffer?
585-
const has_enough_space: bool = buffer.len >= handle_data.utf8_buffer.bytes_used;
586-
const max_bytes_to_read: usize = if (has_enough_space) handle_data.utf8_buffer.bytes_used else buffer.len;
587-
for (0..max_bytes_to_read) |index| {
588-
temp_buffer[index] = handle_data.utf8_buffer.data[handle_data.utf8_buffer.front_index];
589-
// Front index wraps around in the case of 4-byte sequence (non-BMP code point)
590-
handle_data.utf8_buffer.front_index +%= 1;
591-
}
592-
bytes_read += @truncate(max_bytes_to_read);
593-
handle_data.utf8_buffer.bytes_used -= @truncate(max_bytes_to_read);
594-
if (has_enough_space) {
595-
// UTF-8 buffer is now empty, we can safely reset front_index to zero
596-
handle_data.utf8_buffer.front_index = 0;
597-
} else {
598-
return ReadConsoleProcessUtf8Buffer(buffer, temp_buffer, bytes_read, false);
599-
}
600-
// LF ends a console read immediately
601-
if (reached_end_of_line) {
602-
return ReadConsoleProcessUtf8Buffer(buffer, temp_buffer, bytes_read, false);
603-
}
604-
}
605-
assert(handle_data.utf8_buffer.front_index == 0);
570+
const TruncateState = enum {
571+
do_not_truncate,
572+
truncate_after_SUB,
573+
truncate_all,
574+
};
575+
var truncate_state: TruncateState = .do_not_truncate;
606576
while (bytes_read < buffer.len) {
607-
// Read only one code unit each loop
608-
var utf16_code_unit: u16 = undefined;
609-
var utf16_code_units_read: DWORD = undefined;
610-
if (kernel32.ReadConsoleW(hConsoleInput, &utf16_code_unit, 1, &utf16_code_units_read, null) == FALSE) {
611-
switch (kernel32.GetLastError()) {
612-
.INVALID_HANDLE => return error.NotOpenForReading,
613-
else => |err| return unexpectedError(err),
614-
}
577+
const remaining_buffer: []u8 = buffer[bytes_read..buffer.len];
578+
var has_enough_space_in_remaining_buffer: bool = undefined;
579+
var bytes_read_into_temp_buffer: DWORD = 0;
580+
var truncate_index: DWORD = undefined;
581+
// If a SUB character was encountered in a previous iteration, truncate everything read in this iteration
582+
if (truncate_state == .truncate_after_SUB) {
583+
truncate_state = .truncate_all;
615584
}
616-
if (utf16_code_unit == 0x000D) {
617-
// CR should always be followed by an LF, so just discard it
618-
continue;
619-
} else if (utf16_code_unit >= 0xD800 and utf16_code_unit <= 0xDBFF) {
620-
// When a high surrogate is encountered, store it into the UTF-16 buffer
621-
assert(handle_data.utf16_buffer.code_units_used == 0);
622-
handle_data.utf16_buffer.data[0] = utf16_code_unit;
623-
handle_data.utf16_buffer.code_units_used = 1;
624-
continue;
625-
} else if (utf16_code_unit >= 0xDC00 and utf16_code_unit <= 0xDFFF) {
626-
// When a low surrogate is encountered, assemble surrogate pair and convert to UTF-8
627-
if (!(utf16_code_units_read == 1 and
628-
handle_data.utf16_buffer.data[0] >= 0xD800 and handle_data.utf16_buffer.data[0] <= 0xDBFF)) {
629-
unreachable;
585+
// Try flushing leftover UTF-8 bytes first (one codepoint at most)
586+
if (handle_data.utf8_buffer.bytes_used != 0) {
587+
if (handle_data.utf8_buffer.data[0] == 0x0A) {
588+
assert(handle_data.utf8_buffer.bytes_used == 1);
589+
reached_end_of_line = true;
590+
} else if (handle_data.utf8_buffer.data[0] == 0x1A) {
591+
assert(handle_data.utf8_buffer.bytes_used == 1);
592+
// Truncate after SUB character in this loop if we never truncated in previous loops
593+
if (truncate_state == .do_not_truncate) {
594+
truncate_state = .truncate_after_SUB;
595+
truncate_index = 1;
596+
}
597+
}
598+
// Is there enough space for all bytes in UTF-8 buffer?
599+
const has_enough_space: bool = remaining_buffer.len >= handle_data.utf8_buffer.bytes_used;
600+
const max_bytes_to_read: usize = if (has_enough_space) handle_data.utf8_buffer.bytes_used else remaining_buffer.len;
601+
for (0..max_bytes_to_read) |index| {
602+
temp_buffer[index] = handle_data.utf8_buffer.data[handle_data.utf8_buffer.front_index];
603+
// Front index wraps around in the case of 4-byte sequence (non-BMP code point)
604+
handle_data.utf8_buffer.front_index +%= 1;
605+
}
606+
bytes_read_into_temp_buffer += @truncate(max_bytes_to_read);
607+
handle_data.utf8_buffer.bytes_used -= @truncate(max_bytes_to_read);
608+
if (has_enough_space) {
609+
// UTF-8 buffer is now empty, we can safely reset front_index to zero
610+
handle_data.utf8_buffer.front_index = 0;
611+
} else {
612+
switch (truncate_state) {
613+
.truncate_all => {},
614+
else => @memcpy(remaining_buffer[0..bytes_read_into_temp_buffer], temp_buffer[0..bytes_read_into_temp_buffer]),
615+
}
616+
bytes_read += bytes_read_into_temp_buffer;
617+
break;
630618
}
631-
handle_data.utf16_buffer.data[1] = utf16_code_unit;
632-
handle_data.utf16_buffer.code_units_used = 0;
633-
const utf8_bytes: usize = std.unicode.utf16leToUtf8(&handle_data.utf8_buffer.data, &handle_data.utf16_buffer.data) catch return error.Unexpected;
634-
assert(utf8_bytes == 4);
635-
handle_data.utf8_buffer.bytes_used = 4;
636-
} else {
637-
assert(handle_data.utf16_buffer.code_units_used == 0);
638-
const utf8_bytes: usize = std.unicode.utf16leToUtf8(&handle_data.utf8_buffer.data, @as(*[1]u16, &utf16_code_unit)) catch return error.Unexpected;
639-
handle_data.utf8_buffer.bytes_used = @truncate(utf8_bytes);
640619
// LF ends a console read immediately
641-
if (handle_data.utf8_buffer.bytes_used == 1 and handle_data.utf8_buffer.data[0] == 0x0A) {
642-
reached_end_of_line = true;
620+
if (reached_end_of_line) {
621+
switch (truncate_state) {
622+
.truncate_all => {},
623+
else => @memcpy(remaining_buffer[0..bytes_read_into_temp_buffer], temp_buffer[0..bytes_read_into_temp_buffer]),
624+
}
625+
bytes_read += bytes_read_into_temp_buffer;
626+
break;
643627
}
644628
}
645-
// Is there enough space for all bytes in UTF-8 buffer?
646-
const has_enough_space: bool = buffer.len >= bytes_read + handle_data.utf8_buffer.bytes_used;
647-
const max_bytes_to_read: usize = if (has_enough_space) handle_data.utf8_buffer.bytes_used else buffer.len - bytes_read;
648-
for (0..max_bytes_to_read) |index| {
649-
temp_buffer[bytes_read + index] = handle_data.utf8_buffer.data[handle_data.utf8_buffer.front_index];
650-
// Front index wraps around in the case of 4-byte sequence (non-BMP code point)
651-
handle_data.utf8_buffer.front_index +%= 1;
652-
}
653-
bytes_read += @truncate(max_bytes_to_read);
654-
handle_data.utf8_buffer.bytes_used -= @truncate(max_bytes_to_read);
655-
if (has_enough_space) {
656-
// UTF-8 buffer is now empty, we can safely reset front_index to zero
657-
handle_data.utf8_buffer.front_index = 0;
658-
} else {
659-
break;
629+
assert(handle_data.utf8_buffer.front_index == 0);
630+
while (bytes_read_into_temp_buffer < temp_buffer.len) {
631+
// Read only one code unit each loop
632+
var utf16_code_unit: u16 = undefined;
633+
var utf16_code_units_read: DWORD = undefined;
634+
if (kernel32.ReadConsoleW(hConsoleInput, &utf16_code_unit, 1, &utf16_code_units_read, null) == FALSE) {
635+
switch (kernel32.GetLastError()) {
636+
.INVALID_HANDLE => return error.NotOpenForReading,
637+
else => |err| return unexpectedError(err),
638+
}
639+
}
640+
if (utf16_code_unit == 0x000D) {
641+
// CR should always be followed by an LF, so just discard it
642+
continue;
643+
} else if (utf16_code_unit >= 0xD800 and utf16_code_unit <= 0xDBFF) {
644+
// When a high surrogate is encountered, store it into the UTF-16 buffer
645+
assert(handle_data.utf16_buffer.code_units_used == 0);
646+
handle_data.utf16_buffer.data[0] = utf16_code_unit;
647+
handle_data.utf16_buffer.code_units_used = 1;
648+
continue;
649+
} else if (utf16_code_unit >= 0xDC00 and utf16_code_unit <= 0xDFFF) {
650+
// When a low surrogate is encountered, assemble surrogate pair and convert to UTF-8
651+
if (!(utf16_code_units_read == 1 and handle_data.utf16_buffer.data[0] >= 0xD800 and handle_data.utf16_buffer.data[0] <= 0xDBFF)) {
652+
unreachable;
653+
}
654+
handle_data.utf16_buffer.data[1] = utf16_code_unit;
655+
handle_data.utf16_buffer.code_units_used = 0;
656+
const utf8_bytes: usize = std.unicode.utf16leToUtf8(&handle_data.utf8_buffer.data, &handle_data.utf16_buffer.data) catch return error.Unexpected;
657+
assert(utf8_bytes == 4);
658+
handle_data.utf8_buffer.bytes_used = 4;
659+
} else {
660+
assert(handle_data.utf16_buffer.code_units_used == 0);
661+
const utf8_bytes: usize = std.unicode.utf16leToUtf8(&handle_data.utf8_buffer.data, @as(*[1]u16, &utf16_code_unit)) catch return error.Unexpected;
662+
handle_data.utf8_buffer.bytes_used = @truncate(utf8_bytes);
663+
if (handle_data.utf8_buffer.bytes_used == 1) {
664+
if (handle_data.utf8_buffer.data[0] == 0x0A) {
665+
reached_end_of_line = true;
666+
} else if (handle_data.utf8_buffer.data[0] == 0x1A) {
667+
if (truncate_state == .do_not_truncate) {
668+
truncate_state = .truncate_after_SUB;
669+
truncate_index = bytes_read_into_temp_buffer + 1;
670+
}
671+
}
672+
}
673+
}
674+
// Is there enough space for all bytes in UTF-8 buffer?
675+
has_enough_space_in_remaining_buffer = remaining_buffer.len >= bytes_read_into_temp_buffer + handle_data.utf8_buffer.bytes_used;
676+
const has_enough_space: bool = has_enough_space_in_remaining_buffer and temp_buffer.len >= bytes_read_into_temp_buffer + handle_data.utf8_buffer.bytes_used;
677+
const max_bytes_to_read: usize = if (has_enough_space) handle_data.utf8_buffer.bytes_used else remaining_buffer.len - bytes_read_into_temp_buffer;
678+
for (0..max_bytes_to_read) |index| {
679+
temp_buffer[bytes_read_into_temp_buffer + index] = handle_data.utf8_buffer.data[handle_data.utf8_buffer.front_index];
680+
// Front index wraps around in the case of 4-byte sequence (non-BMP code point)
681+
handle_data.utf8_buffer.front_index +%= 1;
682+
}
683+
bytes_read_into_temp_buffer += @truncate(max_bytes_to_read);
684+
handle_data.utf8_buffer.bytes_used -= @truncate(max_bytes_to_read);
685+
if (has_enough_space) {
686+
// UTF-8 buffer is now empty, we can safely reset front_index to zero
687+
handle_data.utf8_buffer.front_index = 0;
688+
} else {
689+
break;
690+
}
691+
// LF ends a console read immediately
692+
if (reached_end_of_line) {
693+
break;
694+
}
660695
}
661-
// LF ends a console read immediately
662-
if (reached_end_of_line) {
696+
// Copy to user-provided buffer
697+
const bytes_copied: DWORD = switch (truncate_state) {
698+
.do_not_truncate => bytes_read_into_temp_buffer,
699+
.truncate_after_SUB => truncate_index,
700+
.truncate_all => 0,
701+
};
702+
@memcpy(remaining_buffer[0..bytes_copied], temp_buffer[0..bytes_copied]);
703+
bytes_read += bytes_copied;
704+
// Early return conditions
705+
if (!has_enough_space_in_remaining_buffer or reached_end_of_line) {
663706
break;
664707
}
665708
}
666-
return ReadConsoleProcessUtf8Buffer(buffer, temp_buffer, bytes_read, true);
667-
}
668-
669-
fn ReadConsoleProcessUtf8Buffer(buffer: []u8, temp_buffer: []u8, bytes_read: DWORD, comptime truncate_after_SUB: bool) DWORD {
670-
if (truncate_after_SUB) {
671-
// Truncate everything after the SUB (Ctrl+Z) character
672-
var index: DWORD = 0;
673-
var reached_end_of_file: bool = false;
674-
while (index < bytes_read and !reached_end_of_file) {
675-
if (temp_buffer[index] == 0x1A) {
676-
reached_end_of_file = true;
677-
}
678-
buffer[index] = temp_buffer[index];
679-
index += 1;
680-
}
681-
return index;
682-
} else {
683-
std.mem.copy(u8, buffer, temp_buffer);
684-
return bytes_read;
685-
}
709+
return bytes_read;
686710
}
687711

688712
pub const WriteFileError = error{
@@ -809,7 +833,7 @@ fn WriteConsoleWithUtf8ToUtf16Conversion(handle: HANDLE, bytes: []const u8) Writ
809833
bytes_written += @truncate(bytes_available);
810834
return bytes_written;
811835
} else {
812-
utf16_code_units = std.unicode.utf8ToUtf16Le(&utf16_buffer, bytes[byte_index..byte_index + utf8_byte_sequence_length]) catch return error.InvalidUtf8;
836+
utf16_code_units = std.unicode.utf8ToUtf16Le(&utf16_buffer, bytes[byte_index .. byte_index + utf8_byte_sequence_length]) catch return error.InvalidUtf8;
813837
byte_index += utf8_byte_sequence_length;
814838
}
815839
} else {

0 commit comments

Comments
 (0)