diff --git a/lib/std/process.zig b/lib/std/process.zig index b083126b315b..e12bc28c0c3a 100644 --- a/lib/std/process.zig +++ b/lib/std/process.zig @@ -285,27 +285,35 @@ pub const ArgIteratorWasi = struct { pub const ArgIteratorWindows = struct { index: usize, - cmd_line: [*]const u8, + cmd_line: [*]const u16, - pub const NextError = error{OutOfMemory}; + pub const NextError = error{ OutOfMemory, InvalidCmdLine }; pub fn init() ArgIteratorWindows { - return initWithCmdLine(os.windows.kernel32.GetCommandLineA()); + return initWithCmdLine(os.windows.kernel32.GetCommandLineW()); } - pub fn initWithCmdLine(cmd_line: [*]const u8) ArgIteratorWindows { + pub fn initWithCmdLine(cmd_line: [*]const u16) ArgIteratorWindows { return ArgIteratorWindows{ .index = 0, .cmd_line = cmd_line, }; } + fn getPointAtIndex(self: *ArgIteratorWindows) u16 { + // According to + // https://docs.microsoft.com/en-us/windows/win32/intl/using-byte-order-marks + // Microsoft uses UTF16-LE. So we just read assuming it's little + // endian. + return std.mem.littleToNative(u16, self.cmd_line[self.index]); + } + /// You must free the returned memory when done. 
pub fn next(self: *ArgIteratorWindows, allocator: *Allocator) ?(NextError![:0]u8) { // march forward over whitespace while (true) : (self.index += 1) { - const byte = self.cmd_line[self.index]; - switch (byte) { + const character = self.getPointAtIndex(); + switch (character) { 0 => return null, ' ', '\t' => continue, else => break, @@ -318,8 +326,8 @@ pub const ArgIteratorWindows = struct { pub fn skip(self: *ArgIteratorWindows) bool { // march forward over whitespace while (true) : (self.index += 1) { - const byte = self.cmd_line[self.index]; - switch (byte) { + const character = self.getPointAtIndex(); + switch (character) { 0 => return false, ' ', '\t' => continue, else => break, @@ -329,8 +337,8 @@ pub const ArgIteratorWindows = struct { var backslash_count: usize = 0; var in_quote = false; while (true) : (self.index += 1) { - const byte = self.cmd_line[self.index]; - switch (byte) { + const character = self.getPointAtIndex(); + switch (character) { 0 => return true, '"' => { const quote_is_real = backslash_count % 2 == 0; @@ -356,15 +364,17 @@ pub const ArgIteratorWindows = struct { } fn internalNext(self: *ArgIteratorWindows, allocator: *Allocator) NextError![:0]u8 { - var buf = try std.ArrayListSentineled(u8, 0).init(allocator, ""); + var buf = std.ArrayList(u16).init(allocator); defer buf.deinit(); var backslash_count: usize = 0; var in_quote = false; while (true) : (self.index += 1) { - const byte = self.cmd_line[self.index]; - switch (byte) { - 0 => return buf.toOwnedSlice(), + const character = self.getPointAtIndex(); + switch (character) { + 0 => { + return convertFromWindowsCmdLineToUTF8(allocator, buf.items); + }, '"' => { const quote_is_real = backslash_count % 2 == 0; try self.emitBackslashes(&buf, backslash_count / 2); @@ -373,7 +383,7 @@ pub const ArgIteratorWindows = struct { if (quote_is_real) { in_quote = !in_quote; } else { - try buf.append('"'); + try buf.append(std.mem.nativeToLittle(u16, '"')); } }, '\\' => { @@ -383,24 +393,34 @@ pub 
const ArgIteratorWindows = struct { try self.emitBackslashes(&buf, backslash_count); backslash_count = 0; if (in_quote) { - try buf.append(byte); + try buf.append(std.mem.nativeToLittle(u16, character)); } else { - return buf.toOwnedSlice(); + return convertFromWindowsCmdLineToUTF8(allocator, buf.items); } }, else => { try self.emitBackslashes(&buf, backslash_count); backslash_count = 0; - try buf.append(byte); + try buf.append(std.mem.nativeToLittle(u16, character)); }, } } } - fn emitBackslashes(self: *ArgIteratorWindows, buf: *std.ArrayListSentineled(u8, 0), emit_count: usize) !void { + fn convertFromWindowsCmdLineToUTF8(allocator: *Allocator, buf: []u16) NextError![:0]u8 { + return std.unicode.utf16leToUtf8AllocZ(allocator, buf) catch |err| switch (err) { + error.ExpectedSecondSurrogateHalf, + error.DanglingSurrogateHalf, + error.UnexpectedSecondSurrogateHalf, + => return error.InvalidCmdLine, + + error.OutOfMemory => return error.OutOfMemory, + }; + } + fn emitBackslashes(self: *ArgIteratorWindows, buf: *std.ArrayList(u16), emit_count: usize) !void { var i: usize = 0; while (i < emit_count) : (i += 1) { - try buf.append('\\'); + try buf.append(std.mem.nativeToLittle(u16, '\\')); } } }; @@ -552,14 +572,15 @@ pub fn argsFree(allocator: *mem.Allocator, args_alloc: []const [:0]u8) void { } test "windows arg parsing" { - testWindowsCmdLine("a b\tc d", &[_][]const u8{ "a", "b", "c", "d" }); - testWindowsCmdLine("\"abc\" d e", &[_][]const u8{ "abc", "d", "e" }); - testWindowsCmdLine("a\\\\\\b d\"e f\"g h", &[_][]const u8{ "a\\\\\\b", "de fg", "h" }); - testWindowsCmdLine("a\\\\\\\"b c d", &[_][]const u8{ "a\\\"b", "c", "d" }); - testWindowsCmdLine("a\\\\\\\\\"b c\" d e", &[_][]const u8{ "a\\\\b c", "d", "e" }); - testWindowsCmdLine("a b\tc \"d f", &[_][]const u8{ "a", "b", "c", "d f" }); - - testWindowsCmdLine("\".\\..\\zig-cache\\build\" \"bin\\zig.exe\" \".\\..\" \".\\..\\zig-cache\" \"--help\"", &[_][]const u8{ + const utf16Literal = 
std.unicode.utf8ToUtf16LeStringLiteral; + testWindowsCmdLine(utf16Literal("a b\tc d"), &[_][]const u8{ "a", "b", "c", "d" }); + testWindowsCmdLine(utf16Literal("\"abc\" d e"), &[_][]const u8{ "abc", "d", "e" }); + testWindowsCmdLine(utf16Literal("a\\\\\\b d\"e f\"g h"), &[_][]const u8{ "a\\\\\\b", "de fg", "h" }); + testWindowsCmdLine(utf16Literal("a\\\\\\\"b c d"), &[_][]const u8{ "a\\\"b", "c", "d" }); + testWindowsCmdLine(utf16Literal("a\\\\\\\\\"b c\" d e"), &[_][]const u8{ "a\\\\b c", "d", "e" }); + testWindowsCmdLine(utf16Literal("a b\tc \"d f"), &[_][]const u8{ "a", "b", "c", "d f" }); + + testWindowsCmdLine(utf16Literal("\".\\..\\zig-cache\\build\" \"bin\\zig.exe\" \".\\..\" \".\\..\\zig-cache\" \"--help\""), &[_][]const u8{ ".\\..\\zig-cache\\build", "bin\\zig.exe", ".\\..", @@ -568,7 +589,7 @@ test "windows arg parsing" { }); } -fn testWindowsCmdLine(input_cmd_line: [*]const u8, expected_args: []const []const u8) void { +fn testWindowsCmdLine(input_cmd_line: [*]const u16, expected_args: []const []const u8) void { var it = ArgIteratorWindows.initWithCmdLine(input_cmd_line); for (expected_args) |expected_arg| { const arg = it.next(std.testing.allocator).? catch unreachable; diff --git a/lib/std/unicode.zig b/lib/std/unicode.zig index 2d4d4b40d96f..c791e07f780d 100644 --- a/lib/std/unicode.zig +++ b/lib/std/unicode.zig @@ -25,10 +25,10 @@ pub fn utf8CodepointSequenceLength(c: u21) !u3 { pub fn utf8ByteSequenceLength(first_byte: u8) !u3 { // The switch is optimized much better than a "smart" approach using @clz return switch (first_byte) { - 0b0000_0000 ... 0b0111_1111 => 1, - 0b1100_0000 ... 0b1101_1111 => 2, - 0b1110_0000 ... 0b1110_1111 => 3, - 0b1111_0000 ... 
0b1111_0111 => 4, + 0b0000_0000...0b0111_1111 => 1, + 0b1100_0000...0b1101_1111 => 2, + 0b1110_0000...0b1110_1111 => 3, + 0b1111_0000...0b1111_0111 => 4, else => error.Utf8InvalidStartByte, }; } @@ -157,8 +157,8 @@ pub fn utf8Decode4(bytes: []const u8) Utf8Decode4Error!u21 { /// Returns true if the given unicode codepoint can be encoded in UTF-8. pub fn utf8ValidCodepoint(value: u21) bool { return switch (value) { - 0xD800 ... 0xDFFF => false, // Surrogates range - 0x110000 ... 0x1FFFFF => false, // Above the maximum codepoint value + 0xD800...0xDFFF => false, // Surrogates range + 0x110000...0x1FFFFF => false, // Above the maximum codepoint value else => true, }; } @@ -574,6 +574,27 @@ pub fn utf16leToUtf8Alloc(allocator: *mem.Allocator, utf16le: []const u16) ![]u8 return result.toOwnedSlice(); } +/// Converts UTF-16LE code units into a newly allocated, null-terminated UTF-8 string. +/// Fails on malformed surrogate pairs or when allocation fails. +/// Caller must free returned memory. +pub fn utf16leToUtf8AllocZ(allocator: *mem.Allocator, utf16le: []const u16) ![:0]u8 { + // optimistically guess that it will all be ascii. + var result = try std.ArrayList(u8).initCapacity(allocator, utf16le.len); + // release the buffer if decoding or a later allocation fails. + errdefer result.deinit(); + var out_index: usize = 0; + var it = Utf16LeIterator.init(utf16le); + while (try it.nextCodepoint()) |codepoint| { + const utf8_len = utf8CodepointSequenceLength(codepoint) catch unreachable; + try result.resize(result.items.len + utf8_len); + assert((utf8Encode(codepoint, result.items[out_index..]) catch unreachable) == utf8_len); + out_index += utf8_len; + } + const len = result.items.len; + try result.append(0); // the terminator is allocated but excluded from the returned slice length + return result.toOwnedSlice()[0..len :0]; +} + /// Asserts that the output buffer is big enough. /// Returns end byte index into utf8. pub fn utf16leToUtf8(utf8: []u8, utf16le: []const u16) !usize {