diff --git a/CMakeLists.txt b/CMakeLists.txt index 021fd43cf014..d907a6dbfbe2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -452,6 +452,10 @@ set(ZIG_STD_FILES "index.zig" "io.zig" "linked_list.zig" + "string/string_utils.zig" + "string/utf8.zig" + "string/index.zig" + "string/ascii.zig" "macho.zig" "math/acos.zig" "math/acosh.zig" @@ -543,7 +547,6 @@ set(ZIG_STD_FILES "special/compiler_rt/umodti3.zig" "special/panic.zig" "special/test_runner.zig" - "unicode.zig" "zig/ast.zig" "zig/index.zig" "zig/parser.zig" diff --git a/build.zig b/build.zig index b72641a2efa4..7584e4cc37f2 100644 --- a/build.zig +++ b/build.zig @@ -9,6 +9,7 @@ const mem = std.mem; const ArrayList = std.ArrayList; const Buffer = std.Buffer; const io = std.io; +const string = std.string; pub fn build(b: &Builder) !void { const mode = b.standardReleaseOptions(); @@ -55,8 +56,8 @@ pub fn build(b: &Builder) !void { addCppLib(b, exe, cmake_binary_dir, "zig_cpp"); if (lld_include_dir.len != 0) { exe.addIncludeDir(lld_include_dir); - var it = mem.split(lld_libraries, ";"); - while (it.next()) |lib| { + var it = try string.utf8Split(lld_libraries, ";"); + while (it.nextBytes()) |lib| { exe.addObjectFile(lib); } } else { @@ -68,7 +69,7 @@ pub fn build(b: &Builder) !void { if (exe.target.getOs() == builtin.Os.linux) { const libstdcxx_path_padded = try b.exec([][]const u8{cxx_compiler, "-print-file-name=libstdc++.a"}); - const libstdcxx_path = ??mem.split(libstdcxx_path_padded, "\r\n").next(); + const libstdcxx_path = ??(try string.utf8Split(libstdcxx_path_padded, "\r\n")).nextBytes(); if (mem.eql(u8, libstdcxx_path, "libstdc++.a")) { warn( \\Unable to determine path to libstdc++.a @@ -172,8 +173,8 @@ fn findLLVM(b: &Builder, llvm_config_exe: []const u8) !LibraryDep { .libdirs = ArrayList([]const u8).init(b.allocator), }; { - var it = mem.split(libs_output, " \r\n"); - while (it.next()) |lib_arg| { + var it = try string.utf8Split(libs_output, " \r\n"); + while (it.nextBytes()) |lib_arg| { if (mem.startsWith(u8, lib_arg, "-l")) { try result.system_libs.append(lib_arg[2..]); } else { @@ -186,8 +187,8 @@ fn findLLVM(b: &Builder, llvm_config_exe: []const u8) !LibraryDep { } } { - var it = mem.split(includes_output, " \r\n"); - while (it.next()) |include_arg| { + var it = try string.utf8Split(includes_output, " \r\n"); + while (it.nextBytes()) |include_arg| { if (mem.startsWith(u8, include_arg, "-I")) { try result.includes.append(include_arg[2..]); } else { @@ -196,8 +197,8 @@ fn findLLVM(b: &Builder, llvm_config_exe: []const u8) !LibraryDep { } } { - var it = mem.split(libdir_output, " \r\n"); - while (it.next()) |libdir| { + var it = try string.utf8Split(libdir_output, " \r\n"); + while (it.nextBytes()) |libdir| { if (mem.startsWith(u8, libdir, "-L")) { try result.libdirs.append(libdir[2..]); } else { @@ -209,8 +210,8 @@ fn findLLVM(b: &Builder, llvm_config_exe: []const u8) !LibraryDep { } pub fn installStdLib(b: &Builder, stdlib_files: []const u8) void { - var it = mem.split(stdlib_files, ";"); - while (it.next()) |stdlib_file| { + var it = string.utf8Split(stdlib_files, ";") catch unreachable; + while (it.nextBytes()) |stdlib_file| { const src_path = os.path.join(b.allocator, "std", stdlib_file) catch unreachable; const dest_path = os.path.join(b.allocator, "lib", "zig", "std", stdlib_file) catch unreachable; b.installFile(src_path, dest_path); @@ -218,8 +219,8 @@ pub fn installStdLib(b: &Builder, stdlib_files: []const u8) void { } pub fn installCHeaders(b: &Builder, c_header_files: []const u8) void { - var it = 
mem.split(c_header_files, ";"); - while (it.next()) |c_header_file| { + var it = string.utf8Split(c_header_files, ";") catch unreachable; + while (it.nextBytes()) |c_header_file| { const src_path = os.path.join(b.allocator, "c_headers", c_header_file) catch unreachable; const dest_path = os.path.join(b.allocator, "lib", "zig", "include", c_header_file) catch unreachable; b.installFile(src_path, dest_path); diff --git a/doc/docgen.zig b/doc/docgen.zig index 56d9a0441260..10a561fb353b 100644 --- a/doc/docgen.zig +++ b/doc/docgen.zig @@ -5,6 +5,7 @@ const os = std.os; const warn = std.debug.warn; const mem = std.mem; const assert = std.debug.assert; +const string = std.string; const max_doc_file_size = 10 * 1024 * 1024; @@ -309,7 +310,7 @@ const Node = union(enum) { const Toc = struct { nodes: []Node, toc: []u8, - urls: std.HashMap([]const u8, Token, mem.hash_slice_u8, mem.eql_slice_u8), + urls: std.HashMap([]const u8, Token, string.hashStr, string.strEql), }; const Action = enum { @@ -318,7 +319,7 @@ const Action = enum { }; fn genToc(allocator: &mem.Allocator, tokenizer: &Tokenizer) !Toc { - var urls = std.HashMap([]const u8, Token, mem.hash_slice_u8, mem.eql_slice_u8).init(allocator); + var urls = std.HashMap([]const u8, Token, string.hashStr, string.strEql).init(allocator); errdefer urls.deinit(); var header_stack_size: usize = 0; @@ -600,7 +601,7 @@ const TermState = enum { test "term color" { const input_bytes = "A\x1b[32;1mgreen\x1b[0mB"; const result = try termColor(std.debug.global_allocator, input_bytes); - assert(mem.eql(u8, result, "AgreenB")); + assert(mem.eql(u8, result, "AgreenB")); } fn termColor(allocator: &mem.Allocator, input: []const u8) ![]u8 { @@ -718,7 +719,7 @@ fn genHtml(allocator: &mem.Allocator, tokenizer: &Tokenizer, toc: &Toc, out: var warn("docgen example code {}/{}...", code_progress_index, tokenizer.code_node_count); const raw_source = tokenizer.buffer[code.source_token.start..code.source_token.end]; - const trimmed_raw_source = mem.trim(u8, raw_source, " \n"); + const trimmed_raw_source = mem.trim(u8, raw_source, " \n", mem.Side.BOTH); const escaped_source = try escapeHtml(allocator, trimmed_raw_source); if (!code.is_inline) { try out.print("
{}.zig
", code.name); diff --git a/src-self-hosted/arg.zig b/src-self-hosted/arg.zig index 707f20828715..c27c613c3cb5 100644 --- a/src-self-hosted/arg.zig +++ b/src-self-hosted/arg.zig @@ -74,7 +74,7 @@ fn readFlagArguments(allocator: &Allocator, args: []const []const u8, required: } } -const HashMapFlags = HashMap([]const u8, FlagArg, std.hash.Fnv1a_32.hash, mem.eql_slice_u8); +const HashMapFlags = HashMap([]const u8, FlagArg, std.hash.Fnv1a_32.hash, std.string.strEql); // A store for querying found flags and positional arguments. pub const Args = struct { diff --git a/std/buf_map.zig b/std/buf_map.zig index 3e12d9a7d93a..c8703ac2a8c8 100644 --- a/std/buf_map.zig +++ b/std/buf_map.zig @@ -2,6 +2,7 @@ const std = @import("index.zig"); const HashMap = std.HashMap; const mem = std.mem; const Allocator = mem.Allocator; +const string = std.string; const assert = std.debug.assert; /// BufMap copies keys and values before they go into the map, and @@ -9,7 +10,7 @@ const assert = std.debug.assert; pub const BufMap = struct { hash_map: BufMapHashMap, - const BufMapHashMap = HashMap([]const u8, []const u8, mem.hash_slice_u8, mem.eql_slice_u8); + const BufMapHashMap = HashMap([]const u8, []const u8, string.hashStr, string.strEql); pub fn init(allocator: &Allocator) BufMap { var self = BufMap { diff --git a/std/buf_set.zig b/std/buf_set.zig index 618b985c41fe..e58b38376b54 100644 --- a/std/buf_set.zig +++ b/std/buf_set.zig @@ -1,11 +1,13 @@ -const HashMap = @import("hash_map.zig").HashMap; -const mem = @import("mem.zig"); +const std = @import("index.zig"); +const HashMap = std.HashMap; +const mem = std.mem; const Allocator = mem.Allocator; +const string = std.string; pub const BufSet = struct { hash_map: BufSetHashMap, - const BufSetHashMap = HashMap([]const u8, void, mem.hash_slice_u8, mem.eql_slice_u8); + const BufSetHashMap = HashMap([]const u8, void, string.hashStr, string.strEql); pub fn init(a: &Allocator) BufSet { var self = BufSet { diff --git a/std/build.zig b/std/build.zig index a4d745e4503b..b8bbf85082ac 100644 --- a/std/build.zig +++ b/std/build.zig @@ -14,6 +14,7 @@ const Term = os.ChildProcess.Term; const BufSet = std.BufSet; const BufMap = std.BufMap; const fmt_lib = std.fmt; +const string = std.string; pub const Builder = struct { uninstall_tls: TopLevelStep, @@ -48,8 +49,8 @@ pub const Builder = struct { cache_root: []const u8, release_mode: ?builtin.Mode, - const UserInputOptionsMap = HashMap([]const u8, UserInputOption, mem.hash_slice_u8, mem.eql_slice_u8); - const AvailableOptionsMap = HashMap([]const u8, AvailableOption, mem.hash_slice_u8, mem.eql_slice_u8); + const UserInputOptionsMap = HashMap([]const u8, UserInputOption, string.hashStr, string.strEql); + const AvailableOptionsMap = HashMap([]const u8, AvailableOption, string.hashStr, string.strEql); const AvailableOption = struct { name: []const u8, @@ -318,11 +319,11 @@ pub const Builder = struct { fn processNixOSEnvVars(self: &Builder) void { if (os.getEnvVarOwned(self.allocator, "NIX_CFLAGS_COMPILE")) |nix_cflags_compile| { - var it = mem.split(nix_cflags_compile, " "); + var it = string.utf8Split(nix_cflags_compile, " ") catch unreachable; while (true) { - const word = it.next() ?? break; + const word = it.nextBytes() ?? break; if (mem.eql(u8, word, "-isystem")) { - const include_path = it.next() ?? { + const include_path = it.nextBytes() ?? 
{ warn("Expected argument after -isystem in NIX_CFLAGS_COMPILE\n"); break; }; @@ -336,11 +337,11 @@ pub const Builder = struct { assert(err == error.EnvironmentVariableNotFound); } if (os.getEnvVarOwned(self.allocator, "NIX_LDFLAGS")) |nix_ldflags| { - var it = mem.split(nix_ldflags, " "); + var it = string.utf8Split(nix_ldflags, " ") catch unreachable; while (true) { - const word = it.next() ?? break; + const word = it.nextBytes() ?? break; if (mem.eql(u8, word, "-rpath")) { - const rpath = it.next() ?? { + const rpath = it.nextBytes() ?? { warn("Expected argument after -rpath in NIX_LDFLAGS\n"); break; }; @@ -687,8 +688,8 @@ pub const Builder = struct { if (os.path.isAbsolute(name)) { return name; } - var it = mem.split(PATH, []u8{os.path.delimiter}); - while (it.next()) |path| { + var it = try string.utf8Split(PATH, []u8{os.path.delimiter}); + while (it.nextBytes()) |path| { const full_path = try os.path.join(self.allocator, path, self.fmt("{}{}", name, exe_extension)); if (os.path.real(self.allocator, full_path)) |real_path| { return real_path; diff --git a/std/index.zig b/std/index.zig index 07c4360aabb0..75e0caed5752 100644 --- a/std/index.zig +++ b/std/index.zig @@ -29,7 +29,7 @@ pub const net = @import("net.zig"); pub const os = @import("os/index.zig"); pub const rand = @import("rand/index.zig"); pub const sort = @import("sort.zig"); -pub const unicode = @import("unicode.zig"); +pub const string = @import("string/index.zig"); pub const zig = @import("zig/index.zig"); test "std" { @@ -62,6 +62,6 @@ test "std" { _ = @import("os/index.zig"); _ = @import("rand/index.zig"); _ = @import("sort.zig"); - _ = @import("unicode.zig"); + _ = @import("string/index.zig"); _ = @import("zig/index.zig"); } diff --git a/std/macho.zig b/std/macho.zig index 70e2c09788e0..70547d8c815b 100644 --- a/std/macho.zig +++ b/std/macho.zig @@ -129,7 +129,7 @@ pub fn loadSymbols(allocator: &mem.Allocator, in: &io.FileInStream) !SymbolTable for (syms) |sym| { if (!isSymbol(sym)) continue; const start = sym.n_strx; - const end = ??mem.indexOfScalarPos(u8, strings, start, 0); + const end = ??mem.indexOfScalarPos(u8, strings, start, 0, false); const name = strings[start..end]; const address = sym.n_value; symbols[nsym] = Symbol { .name = name, .address = address }; diff --git a/std/mem.zig b/std/mem.zig index 8a59d6251baa..fc6b9677c10c 100644 --- a/std/mem.zig +++ b/std/mem.zig @@ -170,65 +170,90 @@ pub fn dupe(allocator: &Allocator, comptime T: type, m: []const T) ![]T { return new_buf; } +pub const Side = enum { LEFT = 1, RIGHT = 2, BOTH = 3, }; + /// Remove values from the beginning and end of a slice. -pub fn trim(comptime T: type, slice: []const T, values_to_strip: []const T) []const T { +pub fn trim(comptime T: type, slice: []const T, values_to_strip: []const T, side: Side) []const T { var begin: usize = 0; var end: usize = slice.len; - while (begin < end and indexOfScalar(T, values_to_strip, slice[begin]) != null) : (begin += 1) {} - while (end > begin and indexOfScalar(T, values_to_strip, slice[end - 1]) != null) : (end -= 1) {} + + // Replace later with bitwise, Zig seems to require cast for enum bitwise? 
+ if (side == Side.LEFT or side == Side.BOTH) { + while (begin < end and indexOfScalar(T, values_to_strip, slice[begin]) != null) : (begin += 1) {} + } + + if (side == Side.RIGHT or side == Side.BOTH) { + while (end > begin and indexOfScalar(T, values_to_strip, slice[end - 1]) != null) : (end -= 1) {} + } + return slice[begin..end]; } test "mem.trim" { - assert(eql(u8, trim(u8, " foo\n ", " \n"), "foo")); - assert(eql(u8, trim(u8, "foo", " \n"), "foo")); + assert(eql(u8, trim(u8, " foo\n ", " \n", Side.BOTH), "foo")); + assert(eql(u8, trim(u8, "foo", " \n", Side.BOTH), "foo")); + assert(eql(u8, trim(u8, " foo ", " ", Side.LEFT), "foo ")); } /// Linear search for the index of a scalar value inside a slice. pub fn indexOfScalar(comptime T: type, slice: []const T, value: T) ?usize { - return indexOfScalarPos(T, slice, 0, value); + return indexOfScalarPos(T, slice, 0, value, false); } -pub fn indexOfScalarPos(comptime T: type, slice: []const T, start_index: usize, value: T) ?usize { +pub fn indexOfScalarPos(comptime T: type, slice: []const T, start_index: usize, value: T, highest: bool) ?usize { var i: usize = start_index; + var out: ?usize = null; + while (i < slice.len) : (i += 1) { - if (slice[i] == value) - return i; + if (slice[i] == value) { + out = i; + if (!highest) break; + } } - return null; + + return out; } pub fn indexOfAny(comptime T: type, slice: []const T, values: []const T) ?usize { - return indexOfAnyPos(T, slice, 0, values); + return indexOfAnyPos(T, slice, 0, values, false); } -pub fn indexOfAnyPos(comptime T: type, slice: []const T, start_index: usize, values: []const T) ?usize { +pub fn indexOfAnyPos(comptime T: type, slice: []const T, start_index: usize, values: []const T, highest: bool) ?usize { var i: usize = start_index; + var out: ?usize = null; + while (i < slice.len) : (i += 1) { for (values) |value| { - if (slice[i] == value) - return i; + if (slice[i] == value) { + out = i; + if (!highest) break; + } } } - return null; + + return out; } pub fn indexOf(comptime T: type, haystack: []const T, needle: []const T) ?usize { - return indexOfPos(T, haystack, 0, needle); + return indexOfPos(T, haystack, 0, needle, false); } // TODO boyer-moore algorithm -pub fn indexOfPos(comptime T: type, haystack: []const T, start_index: usize, needle: []const T) ?usize { +pub fn indexOfPos(comptime T: type, haystack: []const T, start_index: usize, needle: []const T, highest: bool) ?usize { if (needle.len > haystack.len) return null; + var out : ?usize = null; var i: usize = start_index; const end = haystack.len - needle.len; while (i <= end) : (i += 1) { - if (eql(T, haystack[i .. i + needle.len], needle)) - return i; + if (eql(T, haystack[i .. i + needle.len], needle)) { + out = i; + if (!highest) break; + } } - return null; + + return out; } test "mem.indexOf" { @@ -236,6 +261,8 @@ test "mem.indexOf" { assert(indexOf(u8, "one two three four", "gour") == null); assert(??indexOf(u8, "foo", "foo") == 0); assert(indexOf(u8, "foo", "fool") == null); + assert(??indexOf(u8, "foo foo", "foo") == 0); + assert(??indexOfPos(u8, "foo foo", 0, "foo", true) == 4); } /// Reads an integer from memory with size equal to bytes.len. 
@@ -317,24 +344,11 @@ pub fn writeInt(buf: []u8, value: var, endian: builtin.Endian) void { assert(bits == 0); } - -pub fn hash_slice_u8(k: []const u8) u32 { - // FNV 32-bit hash - var h: u32 = 2166136261; - for (k) |b| { - h = (h ^ b) *% 16777619; - } - return h; -} - -pub fn eql_slice_u8(a: []const u8, b: []const u8) bool { - return eql(u8, a, b); -} - /// Returns an iterator that iterates over the slices of `buffer` that are not /// any of the bytes in `split_bytes`. /// split(" abc def ghi ", " ") /// Will return slices for "abc", "def", "ghi", null, in that order. +/// DON'T USE THIS FOR UNICODE OR CODE POINT STRINGS, USE `std.string.utils.Split` instead! pub fn split(buffer: []const u8, split_bytes: []const u8) SplitIterator { return SplitIterator { .index = 0, @@ -351,10 +365,6 @@ test "mem.split" { assert(it.next() == null); } -pub fn startsWith(comptime T: type, haystack: []const T, needle: []const T) bool { - return if (needle.len > haystack.len) false else eql(T, haystack[0 .. needle.len], needle); -} - pub const SplitIterator = struct { buffer: []const u8, split_bytes: []const u8, @@ -393,42 +403,12 @@ pub const SplitIterator = struct { } }; -/// Naively combines a series of strings with a separator. -/// Allocates memory for the result, which must be freed by the caller. -pub fn join(allocator: &Allocator, sep: u8, strings: ...) ![]u8 { - comptime assert(strings.len >= 1); - var total_strings_len: usize = strings.len; // 1 sep per string - { - comptime var string_i = 0; - inline while (string_i < strings.len) : (string_i += 1) { - const arg = ([]const u8)(strings[string_i]); - total_strings_len += arg.len; - } - } - - const buf = try allocator.alloc(u8, total_strings_len); - errdefer allocator.free(buf); - - var buf_index: usize = 0; - comptime var string_i = 0; - inline while (true) { - const arg = ([]const u8)(strings[string_i]); - string_i += 1; - copy(u8, buf[buf_index..], arg); - buf_index += arg.len; - if (string_i >= strings.len) break; - if (buf[buf_index - 1] != sep) { - buf[buf_index] = sep; - buf_index += 1; - } - } - - return buf[0..buf_index]; +pub fn startsWith(comptime T: type, haystack: []const T, needle: []const T) bool { + return if (needle.len > haystack.len) false else eql(T, haystack[0 .. 
needle.len], needle); } -test "mem.join" { - assert(eql(u8, try join(debug.global_allocator, ',', "a", "b", "c"), "a,b,c")); - assert(eql(u8, try join(debug.global_allocator, ',', "a"), "a")); +pub fn endsWith(comptime T: type, haystack: []const T, needle: []const T) bool { + return if (needle.len > haystack.len) false else eql(T, haystack[haystack.len - needle.len - 1..], needle); } test "testStringEquality" { @@ -441,6 +421,7 @@ test "testReadInt" { testReadIntImpl(); comptime testReadIntImpl(); } + fn testReadIntImpl() void { { const bytes = []u8{ 0x12, 0x34, 0x56, 0x78 }; diff --git a/std/os/child_process.zig b/std/os/child_process.zig index 8bb8b2d7e757..8cd0e791c8df 100644 --- a/std/os/child_process.zig +++ b/std/os/child_process.zig @@ -12,6 +12,7 @@ const Buffer = std.Buffer; const builtin = @import("builtin"); const Os = builtin.Os; const LinkedList = std.LinkedList; +const string = std.string; const is_windows = builtin.os == Os.windows; @@ -375,7 +376,6 @@ pub const ChildProcess = struct { } if (pid_result == 0) { // we are the child - setUpChildIo(self.stdin_behavior, stdin_pipe[0], posix.STDIN_FILENO, dev_null_fd) catch |err| forkChildErrReport(err_pipe[1], err); setUpChildIo(self.stdout_behavior, stdout_pipe[1], posix.STDOUT_FILENO, dev_null_fd) catch @@ -572,8 +572,8 @@ pub const ChildProcess = struct { const PATH = try os.getEnvVarOwned(self.allocator, "PATH"); defer self.allocator.free(PATH); - var it = mem.split(PATH, ";"); - while (it.next()) |search_path| { + var it = try string.utf8Split(PATH, ";"); + while (it.nextBytes()) |search_path| { const joined_path = try os.path.join(self.allocator, search_path, app_name); defer self.allocator.free(joined_path); @@ -622,7 +622,6 @@ pub const ChildProcess = struct { StdIo.Ignore => try os.posixDup2(dev_null_fd, std_fileno), } } - }; fn windowsCreateProcess(app_name: &u8, cmd_line: &u8, envp_ptr: ?&u8, cwd_ptr: ?&u8, diff --git a/std/os/index.zig b/std/os/index.zig index 15b54f2e98f6..9a8338d5e407 100644 --- a/std/os/index.zig +++ b/std/os/index.zig @@ -3,6 +3,7 @@ const builtin = @import("builtin"); const Os = builtin.Os; const is_windows = builtin.os == Os.windows; const os = this; +const string = std.string; test "std.os" { _ = @import("child_process.zig"); @@ -428,10 +429,10 @@ pub fn posixExecve(argv: []const []const u8, env_map: &const BufMap, // +1 for the null terminating byte const path_buf = try allocator.alloc(u8, PATH.len + exe_path.len + 2); defer allocator.free(path_buf); - var it = mem.split(PATH, ":"); + var it = string.utf8Split(PATH, ":") catch return error.NotDir; var seen_eacces = false; var err: usize = undefined; - while (it.next()) |search_path| { + while (it.nextBytes()) |search_path| { mem.copy(u8, path_buf, search_path); path_buf[search_path.len] = '/'; mem.copy(u8, path_buf[search_path.len + 1 ..], exe_path); diff --git a/std/os/path.zig b/std/os/path.zig index 0ea5d5a753a4..6d99f14fcbf9 100644 --- a/std/os/path.zig +++ b/std/os/path.zig @@ -11,6 +11,7 @@ const math = std.math; const posix = os.posix; const windows = os.windows; const cstr = std.cstr; +const string = std.string; pub const sep_windows = '\\'; pub const sep_posix = '/'; @@ -30,42 +31,32 @@ pub fn isSep(byte: u8) bool { } } -/// Naively combines a series of paths with the native path seperator. -/// Allocates memory for the result, which must be freed by the caller. pub fn join(allocator: &Allocator, paths: ...) 
![]u8 { - if (is_windows) { - return joinWindows(allocator, paths); - } else { - return joinPosix(allocator, paths); - } -} - -pub fn joinWindows(allocator: &Allocator, paths: ...) ![]u8 { - return mem.join(allocator, sep_windows, paths); + return string.joinCharSep(u8, allocator, sep, paths); } -pub fn joinPosix(allocator: &Allocator, paths: ...) ![]u8 { - return mem.join(allocator, sep_posix, paths); +fn testJoin(allocator: &Allocator, comptime win: bool, paths: ...) ![]u8 { + return string.joinCharSep(u8, allocator, if (win) sep_windows else sep_posix, paths); } test "os.path.join" { - assert(mem.eql(u8, try joinWindows(debug.global_allocator, "c:\\a\\b", "c"), "c:\\a\\b\\c")); - assert(mem.eql(u8, try joinWindows(debug.global_allocator, "c:\\a\\b\\", "c"), "c:\\a\\b\\c")); + assert(mem.eql(u8, try testJoin(debug.global_allocator, true, "c:\\a\\b", "c"), "c:\\a\\b\\c")); + assert(mem.eql(u8, try testJoin(debug.global_allocator, true, "c:\\a\\b\\", "c"), "c:\\a\\b\\c")); - assert(mem.eql(u8, try joinWindows(debug.global_allocator, "c:\\", "a", "b\\", "c"), "c:\\a\\b\\c")); - assert(mem.eql(u8, try joinWindows(debug.global_allocator, "c:\\a\\", "b\\", "c"), "c:\\a\\b\\c")); + assert(mem.eql(u8, try testJoin(debug.global_allocator, true, "c:\\", "a", "b\\", "c"), "c:\\a\\b\\c")); + assert(mem.eql(u8, try testJoin(debug.global_allocator, true, "c:\\a\\", "b\\", "c"), "c:\\a\\b\\c")); - assert(mem.eql(u8, try joinWindows(debug.global_allocator, + assert(mem.eql(u8, try testJoin(debug.global_allocator, true, "c:\\home\\andy\\dev\\zig\\build\\lib\\zig\\std", "io.zig"), "c:\\home\\andy\\dev\\zig\\build\\lib\\zig\\std\\io.zig")); - assert(mem.eql(u8, try joinPosix(debug.global_allocator, "/a/b", "c"), "/a/b/c")); - assert(mem.eql(u8, try joinPosix(debug.global_allocator, "/a/b/", "c"), "/a/b/c")); + assert(mem.eql(u8, try testJoin(debug.global_allocator, false, "/a/b", "c"), "/a/b/c")); + assert(mem.eql(u8, try testJoin(debug.global_allocator, false, "/a/b/", "c"), "/a/b/c")); - assert(mem.eql(u8, try joinPosix(debug.global_allocator, "/", "a", "b/", "c"), "/a/b/c")); - assert(mem.eql(u8, try joinPosix(debug.global_allocator, "/a/", "b/", "c"), "/a/b/c")); + assert(mem.eql(u8, try testJoin(debug.global_allocator, false, "/", "a", "b/", "c"), "/a/b/c")); + assert(mem.eql(u8, try testJoin(debug.global_allocator, false, "/a/", "b/", "c"), "/a/b/c")); - assert(mem.eql(u8, try joinPosix(debug.global_allocator, "/home/andy/dev/zig/build/lib/zig/std", "io.zig"), + assert(mem.eql(u8, try testJoin(debug.global_allocator, false, "/home/andy/dev/zig/build/lib/zig/std", "io.zig"), "/home/andy/dev/zig/build/lib/zig/std/io.zig")); } @@ -149,7 +140,7 @@ pub const WindowsPath = struct { }; }; -pub fn windowsParsePath(path: []const u8) WindowsPath { +pub fn windowsParsePath(path: []const u8) !WindowsPath { if (path.len >= 2 and path[1] == ':') { return WindowsPath { .is_abs = isAbsoluteWindows(path), @@ -184,13 +175,13 @@ pub fn windowsParsePath(path: []const u8) WindowsPath { return relative_path; } - var it = mem.split(path, []u8{this_sep}); - _ = (it.next() ?? return relative_path); - _ = (it.next() ?? return relative_path); + var it = try string.utf8Split(path, []u8{this_sep}); + _ = ((it.nextBytes()) ?? return relative_path); + _ = ((it.nextBytes()) ?? 
return relative_path); return WindowsPath { .is_abs = isAbsoluteWindows(path), .kind = WindowsPath.Kind.NetworkShare, - .disk_designator = path[0..it.index], + .disk_designator = path[0..it.bufferIt.index], }; } } @@ -202,13 +193,13 @@ pub fn windowsParsePath(path: []const u8) WindowsPath { return relative_path; } - var it = mem.split(path, []u8{this_sep}); - _ = (it.next() ?? return relative_path); - _ = (it.next() ?? return relative_path); + var it = try string.utf8Split(path, []u8{this_sep}); + _ = ((it.nextBytes()) ?? return relative_path); + _ = ((it.nextBytes()) ?? return relative_path); return WindowsPath { .is_abs = isAbsoluteWindows(path), .kind = WindowsPath.Kind.NetworkShare, - .disk_designator = path[0..it.index], + .disk_designator = path[0..it.bufferIt.index], }; } } @@ -217,61 +208,60 @@ pub fn windowsParsePath(path: []const u8) WindowsPath { test "os.path.windowsParsePath" { { - const parsed = windowsParsePath("//a/b"); + const parsed = try windowsParsePath("//a/b"); assert(parsed.is_abs); assert(parsed.kind == WindowsPath.Kind.NetworkShare); assert(mem.eql(u8, parsed.disk_designator, "//a/b")); } { - const parsed = windowsParsePath("\\\\a\\b"); + const parsed = try windowsParsePath("\\\\a\\b"); assert(parsed.is_abs); assert(parsed.kind == WindowsPath.Kind.NetworkShare); assert(mem.eql(u8, parsed.disk_designator, "\\\\a\\b")); } { - const parsed = windowsParsePath("\\\\a\\"); + const parsed = try windowsParsePath("\\\\a\\"); assert(!parsed.is_abs); assert(parsed.kind == WindowsPath.Kind.None); assert(mem.eql(u8, parsed.disk_designator, "")); } { - const parsed = windowsParsePath("/usr/local"); + const parsed = try windowsParsePath("/usr/local"); assert(parsed.is_abs); assert(parsed.kind == WindowsPath.Kind.None); assert(mem.eql(u8, parsed.disk_designator, "")); } { - const parsed = windowsParsePath("c:../"); + const parsed = try windowsParsePath("c:../"); assert(!parsed.is_abs); assert(parsed.kind == WindowsPath.Kind.Drive); assert(mem.eql(u8, parsed.disk_designator, "c:")); } } -pub fn diskDesignator(path: []const u8) []const u8 { +pub fn diskDesignator(path: []const u8) ![]const u8 { if (is_windows) { - return diskDesignatorWindows(path); + return try diskDesignatorWindows(path); } else { return ""; } } -pub fn diskDesignatorWindows(path: []const u8) []const u8 { - return windowsParsePath(path).disk_designator; +pub fn diskDesignatorWindows(path: []const u8) ![]const u8 { + return (try windowsParsePath(path)).disk_designator; } -fn networkShareServersEql(ns1: []const u8, ns2: []const u8) bool { +fn networkShareServersEql(ns1: []const u8, ns2: []const u8) !bool { const sep1 = ns1[0]; const sep2 = ns2[0]; - var it1 = mem.split(ns1, []u8{sep1}); - var it2 = mem.split(ns2, []u8{sep2}); + var it1 = try string.utf8Split(ns1, []u8{sep1}); + var it2 = try string.utf8Split(ns2, []u8{sep2}); - // TODO ASCII is wrong, we actually need full unicode support to compare paths. - return asciiEqlIgnoreCase(??it1.next(), ??it2.next()); + return asciiEqlIgnoreCase(?? it1.nextBytes(), ?? 
it2.nextBytes()); } -fn compareDiskDesignators(kind: WindowsPath.Kind, p1: []const u8, p2: []const u8) bool { +fn compareDiskDesignators(kind: WindowsPath.Kind, p1: []const u8, p2: []const u8) !bool { switch (kind) { WindowsPath.Kind.None => { assert(p1.len == 0); @@ -285,22 +275,27 @@ fn compareDiskDesignators(kind: WindowsPath.Kind, p1: []const u8, p2: []const u8 const sep1 = p1[0]; const sep2 = p2[0]; - var it1 = mem.split(p1, []u8{sep1}); - var it2 = mem.split(p2, []u8{sep2}); + var it1 = try string.utf8Split(p1, []const u8{sep1}); + var it2 = try string.utf8Split(p2, []const u8{sep2}); - // TODO ASCII is wrong, we actually need full unicode support to compare paths. - return asciiEqlIgnoreCase(??it1.next(), ??it2.next()) and asciiEqlIgnoreCase(??it1.next(), ??it2.next()); + return asciiEqlIgnoreCase(?? it1.nextBytes(), ?? it2.nextBytes()) and asciiEqlIgnoreCase(?? it1.nextBytes(), ?? it2.nextBytes()); }, } } +// NOTE: When toUpper becomes a real thing (locales) +// this should be replaced with a unicode to upper +// so that it supports other languages fn asciiUpper(byte: u8) u8 { + // Replace later with proper localise checks see asciiEqlIgnoreCase for explanation return switch (byte) { - 'a' ... 'z' => 'A' + (byte - 'a'), + 'a' ... 'z' => byte - ('a' - 'A'), else => byte, }; } +// NOTE: When toUpper becomes a real thing (locales) +// this should use it rather than ascii upper fn asciiEqlIgnoreCase(s1: []const u8, s2: []const u8) bool { if (s1.len != s2.len) return false; @@ -351,7 +346,7 @@ pub fn resolveWindows(allocator: &Allocator, paths: []const []const u8) ![]u8 { var first_index: usize = 0; var max_size: usize = 0; for (paths) |p, i| { - const parsed = windowsParsePath(p); + const parsed = try windowsParsePath(p); if (parsed.is_abs) { have_abs_path = true; first_index = i; @@ -372,7 +367,6 @@ pub fn resolveWindows(allocator: &Allocator, paths: []const []const u8) ![]u8 { max_size += p.len + 1; } - // if we will result with a disk designator, loop again to determine // which is the last time the disk designator is absolutely specified, if any // and count up the max bytes for paths related to this disk designator @@ -383,15 +377,16 @@ pub fn resolveWindows(allocator: &Allocator, paths: []const []const u8) ![]u8 { var correct_disk_designator = false; for (paths) |p, i| { - const parsed = windowsParsePath(p); + const parsed = try windowsParsePath(p); if (parsed.kind != WindowsPath.Kind.None) { if (parsed.kind == have_drive_kind) { - correct_disk_designator = compareDiskDesignators(have_drive_kind, + correct_disk_designator = try compareDiskDesignators(have_drive_kind, result_disk_designator, parsed.disk_designator); } else { continue; } } + if (!correct_disk_designator) { continue; } @@ -404,7 +399,6 @@ pub fn resolveWindows(allocator: &Allocator, paths: []const []const u8) ![]u8 { } } - // Allocate result and fill in the disk designator, calling getCwd if we have to. var result: []u8 = undefined; var result_index: usize = 0; @@ -419,9 +413,9 @@ pub fn resolveWindows(allocator: &Allocator, paths: []const []const u8) ![]u8 { }, WindowsPath.Kind.NetworkShare => { result = try allocator.alloc(u8, max_size); - var it = mem.split(paths[first_index], "/\\"); - const server_name = ??it.next(); - const other_name = ??it.next(); + var it = try string.utf8Split(paths[first_index], "/\\"); + const server_name = ?? it.nextBytes(); + const other_name = ?? 
it.nextBytes(); result[result_index] = '\\'; result_index += 1; @@ -440,7 +434,7 @@ pub fn resolveWindows(allocator: &Allocator, paths: []const []const u8) ![]u8 { assert(is_windows); // resolveWindows called on non windows can't use getCwd const cwd = try os.getCwd(allocator); defer allocator.free(cwd); - const parsed_cwd = windowsParsePath(cwd); + const parsed_cwd = try windowsParsePath(cwd); result = try allocator.alloc(u8, max_size + parsed_cwd.disk_designator.len + 1); mem.copy(u8, result, parsed_cwd.disk_designator); result_index += parsed_cwd.disk_designator.len; @@ -461,7 +455,7 @@ pub fn resolveWindows(allocator: &Allocator, paths: []const []const u8) ![]u8 { mem.copy(u8, result, cwd); result_index += cwd.len; - const parsed_cwd = windowsParsePath(result[0..result_index]); + const parsed_cwd = try windowsParsePath(result[0..result_index]); result_disk_designator = parsed_cwd.disk_designator; if (parsed_cwd.kind == WindowsPath.Kind.Drive) { result[0] = asciiUpper(result[0]); @@ -474,11 +468,11 @@ pub fn resolveWindows(allocator: &Allocator, paths: []const []const u8) ![]u8 { // is big enough to append all the paths to. var correct_disk_designator = true; for (paths[first_index..]) |p, i| { - const parsed = windowsParsePath(p); + const parsed = try windowsParsePath(p); if (parsed.kind != WindowsPath.Kind.None) { if (parsed.kind == have_drive_kind) { - correct_disk_designator = compareDiskDesignators(have_drive_kind, + correct_disk_designator = try compareDiskDesignators(have_drive_kind, result_disk_designator, parsed.disk_designator); } else { continue; @@ -487,8 +481,9 @@ pub fn resolveWindows(allocator: &Allocator, paths: []const []const u8) ![]u8 { if (!correct_disk_designator) { continue; } - var it = mem.split(p[parsed.disk_designator.len..], "/\\"); - while (it.next()) |component| { + + var it = try string.utf8Split(p[parsed.disk_designator.len..], "/\\"); + while (it.nextBytes()) |component| { if (mem.eql(u8, component, ".")) { continue; } else if (mem.eql(u8, component, "..")) { @@ -496,6 +491,7 @@ pub fn resolveWindows(allocator: &Allocator, paths: []const []const u8) ![]u8 { if (result_index == 0 or result_index == result_disk_designator.len) break; result_index -= 1; + if (result[result_index] == '\\' or result[result_index] == '/') break; } @@ -554,8 +550,8 @@ pub fn resolvePosix(allocator: &Allocator, paths: []const []const u8) ![]u8 { errdefer allocator.free(result); for (paths[first_index..]) |p, i| { - var it = mem.split(p, "/"); - while (it.next()) |component| { + var it = try string.utf8Split(p, "/"); + while (it.nextBytes()) |component| { if (mem.eql(u8, component, ".")) { continue; } else if (mem.eql(u8, component, "..")) { @@ -586,7 +582,7 @@ pub fn resolvePosix(allocator: &Allocator, paths: []const []const u8) ![]u8 { test "os.path.resolve" { const cwd = try os.getCwd(debug.global_allocator); if (is_windows) { - if (windowsParsePath(cwd).kind == WindowsPath.Kind.Drive) { + if ((try windowsParsePath(cwd)).kind == WindowsPath.Kind.Drive) { cwd[0] = asciiUpper(cwd[0]); } assert(mem.eql(u8, testResolveWindows([][]const u8{"."}), cwd)); @@ -599,7 +595,7 @@ test "os.path.resolve" { test "os.path.resolveWindows" { if (is_windows) { const cwd = try os.getCwd(debug.global_allocator); - const parsed_cwd = windowsParsePath(cwd); + const parsed_cwd = try windowsParsePath(cwd); { const result = testResolveWindows([][]const u8{"/usr/local", "lib\\zig\\std\\array_list.zig"}); const expected = try join(debug.global_allocator, @@ -668,7 +664,7 @@ pub fn dirnameWindows(path: 
[]const u8) []const u8 { if (path.len == 0) return path[0..0]; - const root_slice = diskDesignatorWindows(path); + const root_slice = diskDesignatorWindows(path) catch unreachable; if (path.len == root_slice.len) return path; @@ -906,14 +902,14 @@ pub fn relativeWindows(allocator: &Allocator, from: []const u8, to: []const u8) const resolved_to = try resolveWindows(allocator, [][]const u8{to}); defer if (clean_up_resolved_to) allocator.free(resolved_to); - const parsed_from = windowsParsePath(resolved_from); - const parsed_to = windowsParsePath(resolved_to); + const parsed_from = try windowsParsePath(resolved_from); + const parsed_to = try windowsParsePath(resolved_to); const result_is_to = x: { if (parsed_from.kind != parsed_to.kind) { break :x true; } else switch (parsed_from.kind) { WindowsPath.Kind.NetworkShare => { - break :x !networkShareServersEql(parsed_to.disk_designator, parsed_from.disk_designator); + break :x !(try networkShareServersEql(parsed_to.disk_designator, parsed_from.disk_designator)); }, WindowsPath.Kind.Drive => { break :x asciiUpper(parsed_from.disk_designator[0]) != asciiUpper(parsed_to.disk_designator[0]); @@ -927,20 +923,23 @@ pub fn relativeWindows(allocator: &Allocator, from: []const u8, to: []const u8) return resolved_to; } - var from_it = mem.split(resolved_from, "/\\"); - var to_it = mem.split(resolved_to, "/\\"); + var from_it = try string.utf8Split(resolved_from, "/\\"); + var to_it = try string.utf8Split(resolved_to, "/\\"); while (true) { - const from_component = from_it.next() ?? return mem.dupe(allocator, u8, to_it.rest()); - const to_rest = to_it.rest(); - if (to_it.next()) |to_component| { + const from_component = from_it.nextBytes() ?? return mem.dupe(allocator, u8, to_it.restBytes() ?? ""); + const to_rest = to_it.restBytes() ?? ""; + + if (to_it.nextBytes()) |to_component| { // TODO ASCII is wrong, we actually need full unicode support to compare paths. if (asciiEqlIgnoreCase(from_component, to_component)) continue; } + var up_count: usize = 1; - while (from_it.next()) |_| { + while (from_it.nextBytes()) |_| { up_count += 1; } + const up_index_end = up_count * "..\\".len; const result = try allocator.alloc(u8, up_index_end + to_rest.len); errdefer allocator.free(result); @@ -957,14 +956,13 @@ pub fn relativeWindows(allocator: &Allocator, from: []const u8, to: []const u8) // shave off the trailing slash result_index -= 1; - var rest_it = mem.split(to_rest, "/\\"); - while (rest_it.next()) |to_component| { + var rest_it = try string.utf8Split(to_rest, "/\\"); + while (rest_it.nextBytes()) |to_component| { result[result_index] = '\\'; result_index += 1; mem.copy(u8, result[result_index..], to_component); result_index += to_component.len; } - return result[0..result_index]; } @@ -978,17 +976,17 @@ pub fn relativePosix(allocator: &Allocator, from: []const u8, to: []const u8) ![ const resolved_to = try resolvePosix(allocator, [][]const u8{to}); defer allocator.free(resolved_to); - var from_it = mem.split(resolved_from, "/"); - var to_it = mem.split(resolved_to, "/"); + var from_it = try string.utf8Split(resolved_from, "/"); + var to_it = try string.utf8Split(resolved_to, "/"); while (true) { - const from_component = from_it.next() ?? return mem.dupe(allocator, u8, to_it.rest()); - const to_rest = to_it.rest(); - if (to_it.next()) |to_component| { + const from_component = from_it.nextBytes() ?? return mem.dupe(allocator, u8, to_it.restBytes() ?? ""); + const to_rest = to_it.restBytes() ?? 
""; + if (to_it.nextBytes()) |to_component| { if (mem.eql(u8, from_component, to_component)) continue; } var up_count: usize = 1; - while (from_it.next()) |_| { + while (from_it.nextBytes()) |_| { up_count += 1; } const up_index_end = up_count * "../".len; diff --git a/std/string/ascii.zig b/std/string/ascii.zig new file mode 100644 index 000000000000..ab4cdbfb95e7 --- /dev/null +++ b/std/string/ascii.zig @@ -0,0 +1,117 @@ +const std = @import("../index.zig"); +const mem = std.mem; +const math = std.math; +const Set = std.BufSet; +const assert = std.debug.assert; +const warn = std.debug.warn; +const DebugAllocator = std.debug.global_allocator; + +pub const Errors = error { + InvalidCharacter, + OutOfMemory, +}; + +pub const MemoryErrors = error { + OutOfMemory, +}; + +pub const Iterator = struct { + raw: []const u8, + index: usize, + + pub fn reset(it: &Iterator) void { + it.index = 0; + } + + pub fn nextBytes(it: &Iterator)?[]const u8 { + if (it.index >= it.raw.len) { + return null; + } + + // It wants an array not a singular character + var x = it.raw[it.index..it.index+1]; + it.index += 1; + return x; + } + + pub fn nextCodePoint(it: &Iterator)?u8 { + var x = it.nextBytes(); + return if (x) |y| y[0] else null; + } +}; + +pub const View = struct { + characters: []const u8, + + pub fn init(s: []const u8) !View { + for (s) |char| { + if (char > 127) return error.InvalidCharacter; + } + + return initUnchecked(s); + } + + pub fn initComptime(comptime s: []const u8) View { + if (comptime init(s)) |view| { + return view; + } else |err| { + // @Refactor: add on more information when converting enums to strings + // become a thing in the language + @compileError("Invalid bytes"); + } + } + + pub fn eql(self: &const View, other: &const View) bool { + return mem.eql(u8, self.characters, other.characters); + } + + pub fn sliceCodepoint(self: &const View, start: usize, end: usize) []const u8 { + return self.characters[start..end]; + } + + pub fn sliceCodepointToEndFrom(self: &const View, start: usize) []const u8 { + return self.characters[start..]; + } + + pub fn byteLen(self: &const View) usize { + return self.characters.len; + } + + pub fn getBytes(self: &const View) []const u8 { + return self.characters; + } + + pub fn sliceBytes(self: &const View, start: usize, end: usize) []const u8 { + return self.characters[start..end]; + } + + pub fn sliceBytesToEndFrom(self: &const View, start: usize) []const u8 { + return self.characters[start..]; + } + + pub fn byteAt(self: &const View, index: usize) u8 { + return self.characters[index]; + } + + pub fn byteFromEndAt(self: &const View, index: usize) u8 { + return self.characters[self.characters.len - 1 - index]; + } + + pub fn codePointAt(self: &const View, index: usize) u8 { + return self.characters[index]; + } + + pub fn codePointFromEndAt(self: &const View, index: usize) u8 { + return self.characters[self.characters.len - 1 - index]; + } + + pub fn initUnchecked(s: []const u8) View { + return View { + .characters = s + }; + } + + pub fn iterator(self: &const View) Iterator { + return Iterator { .index = 0, .raw = self.characters }; + } +}; diff --git a/std/string/index.zig b/std/string/index.zig new file mode 100644 index 000000000000..eeabe51a102e --- /dev/null +++ b/std/string/index.zig @@ -0,0 +1,137 @@ +const std = @import("../index.zig"); +const debug = std.debug; +const mem = std.mem; +const assert = debug.assert; + +pub const utf8 = @import("utf8.zig"); +pub const ascii = @import("ascii.zig"); +pub const utils = @import("string_utils.zig"); + 
+/// Returns a hash for a string +pub fn hashStr(k: []const u8) u32 { + // FNV 32-bit hash + var h: u32 = 2166136261; + for (k) |b| { + h = (h ^ b) *% 16777619; + } + return h; +} + +/// Returns if two strings are equal. +/// Note: just maps to mem.eql, this is mainly +/// for use in structures like in buf_map. +pub fn strEql(a: []const u8, b: []const u8)bool { + return mem.eql(u8, a, b); +} + +const AsciiSplitIt = utils.SplitIt(ascii.View, ascii.Iterator, u8, u8); + +/// Splits a string (ascii set). +/// It will split it at ANY of the split bytes. +/// i.e. splitting at "\n " means '\n' AND/OR ' '. +pub fn asciiSplit(a: []const u8, splitBytes: []const u8) !AsciiSplitIt { + return try AsciiSplitIt.init(a, splitBytes); +} + +const Utf8SplitIt = utils.SplitIt(utf8.View, utf8.Iterator, u8, u32); + +/// Splits a string (utf8 set). +/// It will split it at ANY of the split bytes. +/// i.e. splitting at "\n " means '\n' AND/OR ' '. +pub fn utf8Split(a: []const u8, splitBytes: []const u8) !Utf8SplitIt { + return try Utf8SplitIt.init(a, splitBytes); +} + +fn calculateLength(comptime BaseType: type, sep: []const BaseType, views: [][]const BaseType, strings: ...) usize { + var totalLength: usize = 0; + comptime var string_i = 0; + inline while (string_i < strings.len) : (string_i += 1) { + const arg = ([]const BaseType)(strings[string_i]); + totalLength += arg.len; + if (string_i < strings.len - 1 and (arg.len < sep.len or !mem.eql(BaseType, arg[arg.len - sep.len..], sep))) { + totalLength += sep.len; + } + views[string_i] = arg; + } + return totalLength; +} + +/// Joins strings together with a seperator. +/// Error: The allocator could fail. +pub fn join(comptime BaseType: type, allocator: &mem.Allocator, sep: []const BaseType, strings: ...) ![]BaseType { + var views: [strings.len][]const u8 = undefined; + const totalLength = calculateLength(BaseType, sep, views[0..], strings); + const buf = try allocator.alloc(BaseType, totalLength); + return utils.joinViewsBuffer(BaseType, sep, views[0..], totalLength, buf); +} + +/// Similar version as join but uses a buffer instead of an allocator. +pub fn joinBuffer(comptime BaseType: type, buffer: []BaseType, sep: []const BaseType, strings: ...) []BaseType { + var views: [strings.len][]const u8 = undefined; + const totalLength = calculateLength(BaseType, sep, views[0..], strings); + return utils.joinViewsBuffer(BaseType, sep, views[0..], totalLength, buffer); +} + +pub fn joinCharSep(comptime BaseType: type, allocator: &mem.Allocator, sep: BaseType, strings: ...) ![]BaseType { + return join(BaseType, allocator, []BaseType{ sep }, strings); +} + +pub fn joinBufferCharSep(comptime BaseType: type, buffer: []BaseType, sep: BaseType, strings: ...) ![]BaseType { + return joinBuffer(BaseType, buffer, []BaseType{ sep }, strings); +} + +/// To choose what sides. +pub const Side = enum { LEFT = 1, RIGHT = 2, BOTH = 3, }; + +/// Trim an ascii string from either/both sides. +pub fn asciiTrim(string: []const u8, trimChars: []const u8, side: Side)[]const u8 { + return utils.trim(ascii.View, u8, &ascii.View.initUnchecked(string), &ascii.View.initUnchecked(trimChars), side); +} + +/// Trim an utf8 string from either/both sides. 
+pub fn utf8Trim(string: []const u8, trimChars: []const u8, side: Side)[]const u8 { + return utils.trim(utf8.View, u8, &utf8.View.initUnchecked(string), &utf8.View.initUnchecked(trimChars), side); +} + +test "string.ascii.joinBuffer" { + var buf: [100]u8 = undefined; + assert(mem.eql(u8, joinBuffer(u8, buf[0..], ", ", "a", "߶", "۩", "°"), "a, ߶, ۩, °")); + assert(mem.eql(u8, joinBuffer(u8, buf[0..], ",", "۩"), "۩")); +} + +test "string.utf8.joinBuffer" { + var buf: [100]u8 = undefined; + assert(mem.eql(u8, joinBuffer(u8, buf[0..], ", ", "a", "b", "c"), "a, b, c")); + assert(mem.eql(u8, joinBuffer(u8, buf[0..], ",", "a"), "a")); +} + +test "string.ascii.trim" { + // Copied from mem.trim + assert(mem.eql(u8, asciiTrim(" foo\n ", " \n", Side.BOTH), "foo")); + assert(mem.eql(u8, asciiTrim("foo", " \n", Side.BOTH), "foo")); + assert(mem.eql(u8, asciiTrim(" foo ", " ", Side.LEFT), "foo ")); +} + +test "string.split.ascii" { + var it = try asciiSplit(" abc def ghi k ", " "); + assert(mem.eql(u8, ?? it.nextBytes(), "abc")); + assert(mem.eql(u8, ?? it.nextBytes(), "def")); + assert(mem.eql(u8, ?? it.restBytes(), "ghi k ")); + assert(mem.eql(u8, ?? it.nextBytes(), "ghi")); + assert(mem.eql(u8, ?? it.nextBytes(), "k")); + assert(it.nextBytes() == null); +} + +test "string.split.unicode" { + var it = try utf8Split(" abc ۩ g߶hi ", " "); + assert(mem.eql(u8, ?? it.nextBytes(), "abc")); + assert(mem.eql(u8, ?? it.nextBytes(), "۩")); + assert(mem.eql(u8, ?? it.restBytes(), "g߶hi ")); + assert(mem.eql(u8, ?? it.nextBytes(), "g߶hi")); + assert(it.nextBytes() == null); +} + +test "Strings" { + _ = @import("utf8.zig"); + _ = @import("ascii.zig"); +} \ No newline at end of file diff --git a/std/string/string_utils.zig b/std/string/string_utils.zig new file mode 100644 index 000000000000..8deb276ecef7 --- /dev/null +++ b/std/string/string_utils.zig @@ -0,0 +1,168 @@ +const std = @import("../index.zig"); +const mem = std.mem; +const math = std.math; +const Set = std.BufSet; +const debug = std.debug; +const assert = debug.assert; +const ascii = std.string.ascii; +const utf8 = std.string.utf8; + +/// Returns an iterator that iterates over the slices of `buffer` that are not +/// any of the code points in `split_bytes`. +/// split(" abc def ghi ", " ") +/// Will return slices for "abc", "def", "ghi", null, in that order. +/// This one is intended for use with strings +pub fn SplitIt(comptime ViewType: type, comptime IteratorType: type, comptime BaseType: type, comptime CodepointType: type) type { + return struct { + buffer: ViewType, + bufferIt: IteratorType, + splitBytesIt: IteratorType, + + const Self = this; + + /// Returns the next set of bytes + pub fn nextBytes(self: &Self) ?[]const BaseType { + // move to beginning of token + var nextSlice = self.bufferIt.nextBytes(); + + while (nextSlice) |curSlice| { + if (!self.isSplitByte(curSlice)) break; + nextSlice = self.bufferIt.nextBytes(); + } + + if (nextSlice) |next| { + // Go till we find another split + const start = self.bufferIt.index - next.len; + nextSlice = self.bufferIt.nextBytes(); + + while (nextSlice) |cSlice| { + if (self.isSplitByte(cSlice)) break; + nextSlice = self.bufferIt.nextBytes(); + } + + if (nextSlice) |slice| self.bufferIt.index -= slice.len; + + const end = self.bufferIt.index; + return self.buffer.sliceBytes(start, end); + } else { + return null; + } + } + + /// Decodes the next set of bytes. + pub fn nextCodepoint(self: &Self) ?[]const CodepointType { + return utf8.decode(self.nextBytes()); + } + + /// Returns the rest of the bytes. 
+ pub fn restBytes(self: &Self) ?[]const BaseType { + // move to beginning of token + var index = self.bufferIt.index; + defer self.bufferIt.index = index; + var nextSlice = self.bufferIt.nextBytes(); + + while (nextSlice) |curSlice| { + if (!self.isSplitByte(curSlice)) break; + nextSlice = self.bufferIt.nextBytes(); + } + + if (nextSlice) |slice| { + const iterator = self.bufferIt.index - slice.len; + return self.buffer.sliceBytesToEndFrom(iterator); + } else { + return null; + } + } + + /// Returns if a split byte matches the bytes given. + fn isSplitByte(self: &Self, toCheck: []const BaseType) bool { + self.splitBytesIt.reset(); + var byte = self.splitBytesIt.nextBytes(); + + while (byte) |splitByte| { + if (mem.eql(BaseType, splitByte, toCheck)) { + return true; + } + byte = self.splitBytesIt.nextBytes(); + } + return false; + } + + /// Initialises the string split iterator. + fn init(view: []const BaseType, splitBytes: []const BaseType) !Self { + return Self { .buffer = try ViewType.init(view), .splitBytesIt = (try ViewType.init(splitBytes)).iterator(), .bufferIt = (try ViewType.init(view)).iterator() }; + } + }; +} + +pub fn joinViewsBuffer(comptime BaseType: type, sep: []const BaseType, strings: [][]const BaseType, totalLength: usize, buffer: []BaseType) []BaseType { + assert(totalLength <= buffer.len); + var buffer_i: usize = 0; + for (strings) |string| { + // Write to buffer + mem.copy(BaseType, buffer[buffer_i..], string); + buffer_i += string.len; + // As to not print the last one + if (buffer_i >= totalLength) break; + if (buffer_i < sep.len or !mem.eql(BaseType, buffer[buffer_i - sep.len..buffer_i], sep)) { + mem.copy(BaseType, buffer[buffer_i..], sep); + buffer_i += sep.len; + } + } + return buffer[0..buffer_i]; +} + +const Side = std.string.Side; + +/// Trim a provided string. +/// Note: you have to provide both a View and a BaseType +/// but don't have to supply an iterator, however `View.iterator` has to exist. 
+pub fn trim(comptime View: type, comptime BaseType: type, string: &View, trimCharacters: &View, side: Side) []const BaseType { + var initialIndex : usize = 0; + var endIndex : usize = string.byteLen(); + var it = string.iterator(); + + if (side == Side.LEFT or side == Side.BOTH) { + while (it.nextBytes()) |bytes| { + var trimIt = trimCharacters.iterator(); + var found = false; + while (trimIt.nextBytes()) |trimBytes| { + if (mem.eql(BaseType, trimBytes, bytes)) { + found = true; + break; + } + } + + if (!found) { + initialIndex = it.index - bytes.len; + break; + } + } + } + + if (side == Side.RIGHT or side == Side.BOTH) { + // Continue from where it started off but keep going till we hit the end keeping in track + // The length of the code points + var codePointLength : usize = 0; + while (it.nextBytes()) |bytes| { + var trimIt = trimCharacters.iterator(); + var found = false; + while (trimIt.nextBytes()) |trimBytes| { + if (mem.eql(BaseType, trimBytes, bytes)) { + found = true; + break; + } + } + + if (found) { + codePointLength += bytes.len; + } else { + codePointLength = 0; + } + } + + endIndex -= codePointLength; + } + + return string.sliceBytes(initialIndex, endIndex); +} diff --git a/std/string/utf8.zig b/std/string/utf8.zig new file mode 100644 index 000000000000..9327743482aa --- /dev/null +++ b/std/string/utf8.zig @@ -0,0 +1,530 @@ +const std = @import("../index.zig"); +const debug = std.debug; +const mem = std.mem; +const assert = debug.assert; + +pub const Errors = error { + InvalidStartByte, + CodepointTooLarge, + InvalidCodepoint, + ExpectedContinuation, + OverlongEncoding, + IndexOutOfBounds, + InvalidBytes, + OutOfMemory, +}; + +pub const MemoryErrors = error { + OutOfMemory, +}; + +/// Given the first byte of a UTF-8 codepoint, +/// returns a number 1-4 indicating the total length of the codepoint in bytes. +/// If this byte does not match the form of a UTF-8 start byte, returns Utf8InvalidStartByte. +pub fn ByteSequenceLength(first_byte: u8) !u3 { + if (first_byte < 0b10000000) return u3(1); + if (first_byte & 0b11100000 == 0b11000000) return u3(2); + if (first_byte & 0b11110000 == 0b11100000) return u3(3); + if (first_byte & 0b11111000 == 0b11110000) return u3(4); + return error.InvalidStartByte; +} + +pub fn CodepointSequenceLength(c: u32) !u3 { + if (c < 0x80) return u3(1); + if (c < 0x800) return u3(2); + if (c -% 0xd800 < 0x800) return error.InvalidCodepoint; + if (c < 0x10000) return u3(3); + if (c < 0x110000) return u3(4); + return error.CodepointTooLarge; +} + +/// Encodes a code point back into utf8 +/// c: the code point +/// out: the out buffer to write to +/// Notes: out has to have a len big enough for the code point +/// however this limit is dependent on the code point +/// but giving it a minimum of 4 will ensure it will work +/// for all code points :). +/// Errors: Will return an error if the code point is invalid. 
+pub fn Encode(c: u32, out: []u8) !usize { + if (CodepointSequenceLength(c)) |length| { + debug.assert(out.len >= length); + switch (length) { + 1 => out[0] = u8(c), // Can just add a '0' and code point, thus output 'c' + 2 => { + // 64 to convert the characters into their segments + out[0] = u8(0b11000000 + c / 64); + out[1] = u8(0b10000000 + c % 64); + }, + 3 => { + // Again using 64 as a conversion into their segments + // But using C / 4096 (64 * 64) as the first, (C/64) % 64 as the second, and just C % 64 as the last + out[0] = u8(0b11100000 + c / 4096); + out[1] = u8(0b10000000 + (c / 64) % 64); + out[2] = u8(0b10000000 + c % 64); + }, + 4 => { + // Same as previously but now its C / 64^3 (262144), (C / 4096) % 64, (C / 64) % 64 and C % 64 + out[0] = u8(0b11110000 + c / 262144); + out[1] = u8(0b10000000 + (c / 4096) % 64); + out[2] = u8(0b10000000 + (c / 64) % 64); + out[3] = u8(0b10000000 + c % 64); + }, + else => unreachable, + } + return usize(length); + } else |err| { + return err; + } +} + +/// Decodes the UTF-8 codepoint encoded in the given slice of bytes. +/// bytes.len must be equal to utf8ByteSequenceLength(bytes[0]) catch unreachable. +/// If you already know the length at comptime, you can call one of +/// utf8Decode2,utf8Decode3,utf8Decode4 directly instead of this function. +pub fn Decode(bytes: []const u8) !u32 { + return switch (bytes.len) { + 1 => u32(bytes[0]), + 2 => Decode2(bytes), + 3 => Decode3(bytes), + 4 => Decode4(bytes), + else => unreachable, + }; +} + +pub fn Decode2(bytes: []const u8) !u32 { + debug.assert(bytes.len == 2); + debug.assert(bytes[0] & 0b11100000 == 0b11000000); + var value: u32 = bytes[0] & 0b00011111; + + if (bytes[1] & 0b11000000 != 0b10000000) return error.ExpectedContinuation; + value <<= 6; + value |= bytes[1] & 0b00111111; + + if (value < 0x80) return error.OverlongEncoding; + + return value; +} + +pub fn Decode3(bytes: []const u8) !u32 { + debug.assert(bytes.len == 3); + debug.assert(bytes[0] & 0b11110000 == 0b11100000); + var value: u32 = bytes[0] & 0b00001111; + + if (bytes[1] & 0b11000000 != 0b10000000) return error.ExpectedContinuation; + value <<= 6; + value |= bytes[1] & 0b00111111; + + if (bytes[2] & 0b11000000 != 0b10000000) return error.ExpectedContinuation; + value <<= 6; + value |= bytes[2] & 0b00111111; + + if (value < 0x800) return error.OverlongEncoding; + if (0xd800 <= value and value <= 0xdfff) return error.EncodesSurrogateHalf; + + return value; +} + +pub fn Decode4(bytes: []const u8) !u32 { + debug.assert(bytes.len == 4); + debug.assert(bytes[0] & 0b11111000 == 0b11110000); + var value: u32 = bytes[0] & 0b00000111; + + if (bytes[1] & 0b11000000 != 0b10000000) return error.ExpectedContinuation; + value <<= 6; + value |= bytes[1] & 0b00111111; + + if (bytes[2] & 0b11000000 != 0b10000000) return error.ExpectedContinuation; + value <<= 6; + value |= bytes[2] & 0b00111111; + + if (bytes[3] & 0b11000000 != 0b10000000) return error.ExpectedContinuation; + value <<= 6; + value |= bytes[3] & 0b00111111; + + if (value < 0x10000) return error.OverlongEncoding; + if (value > 0x10FFFF) return error.CodepointTooLarge; + + return value; +} + +pub fn ValidateSlice(s: []const u8) bool { + var i: usize = 0; + while (i < s.len) { + if (ByteSequenceLength(s[i])) |cp_len| { + if (i + cp_len > s.len) { + return false; + } + + if (Decode(s[i..i+cp_len])) |_| {} else |_| { return false; } + i += cp_len; + } else |err| { + return false; + } + } + return true; +} + +/// Utf8View iterates the code points of a utf-8 encoded string. 
+/// +/// ``` +/// var utf8 = (try std.utf8.View.init("hi there")).iterator(); +/// while (utf8.nextCodepoint()) |codepoint| { +/// std.debug.warn("got codepoint {}\n", codepoint); +/// } +/// ``` +pub const View = struct { + bytes: []const u8, + + pub fn init(s: []const u8) !View { + if (!ValidateSlice(s)) { + return error.InvalidBytes; + } + + return initUnchecked(s); + } + + pub fn eql(self: &const View, other: &const View) bool { + return mem.eql(u8, self.bytes, other.bytes); + } + + pub fn sliceBytes(self: &const View, start: usize, end: usize) []const u8 { + return self.bytes[start..end]; + } + + pub fn sliceBytesToEndFrom(self: &const View, start: usize) []const u8 { + return self.bytes[start..]; + } + + pub fn getBytes(self: &const View) []const u8 { + return self.bytes; + } + + pub fn byteLen(self: &const View) usize { + return self.bytes.len; + } + + fn convertCodepointIndexToRaw(self: &const View, initialPoint: usize, codepointIndex: usize) !usize { + var i: usize = initialPoint; + var rawIndex: usize = i; + + while (i != codepointIndex) { + if (rawIndex >= self.bytes.len) return error.IndexOutOfBounds; + rawIndex += ByteSequenceLength(self.bytes[rawIndex]); + i += 1; + } + return rawIndex; + } + + // Slices using two code point indexes + // Will return an error if out of bounds. + pub fn sliceCodepoint(self: &const View, start: usize, end: usize) ![]const u8 { + // Grab first code point length and keep going till i == start + const rawStart: usize = try self.convertCodepointIndexToRaw(start, true); + const rawEnd: usize = try self.convertCodepointIndexToRaw(end, false); + + return self.sliceBytes(rawStart, rawEnd); + } + + pub fn sliceCodepointToEndFrom(self: &const View, start: usize) ![]const u8 { + return self.sliceBytesToEndFrom(try self.convertCodepointIndexToRaw(start, true)); + } + + pub fn byteAt(self: &const View, index: usize) u8 { + return self.bytes[index]; + } + + pub fn byteFromEndAt(self: &const View, index: usize) u8 { + return self.bytes[self.bytes.len - 1 - index]; + } + + /// Returns the code point at the position asked for + /// Note: the index refers to code point indexes not raw indexes. + /// also that this is meant for when code point is in lower half + /// use codePointFromEndAt for when you know it is in upper half. + pub fn codePointAt(self: &const View, index: usize) !u32 { + const rawIndex: usize = try self.convertCodepointIndexToRaw(index, true); + const length: usize = ByteSequenceLength(self.bytes[rawIndex]); + + switch (length) { + 1 => return u32(self.bytes[rawIndex]), + 2 => return try Decode2(self.bytes[rawIndex..rawIndex+2]), + 3 => return try Decode3(self.bytes[rawIndex..rawIndex+3]), + 4 => return try Decode4(self.bytes[rawIndex..rawIndex+4]), + else => unreachable, + } + } + + /// Returns the code point at the position asked for FROM the end + /// i.e. codePointFromEndAt(2) returns two code points from end code point + /// codePointFromEndAt(0) returns the last code point + /// Note: the index refers to code point indexes not raw indexes. + /// also that this is meant for when code point is in lower half + /// use codePointFromEndAt for when you know it is in upper half. 
+pub const Iterator = struct {
+    raw: []const u8,
+    index: usize,
+
+    pub fn reset(it: &Iterator) void {
+        it.index = 0;
+    }
+
+    pub fn nextBytes(it: &Iterator) ?[]const u8 {
+        if (it.index >= it.raw.len) {
+            return null;
+        }
+
+        // The View was validated on construction, so the length lookup cannot fail.
+        const cp_len = ByteSequenceLength(it.raw[it.index]) catch unreachable;
+
+        it.index += cp_len;
+        return it.raw[it.index - cp_len..it.index];
+    }
+
+    pub fn nextCodepoint(it: &Iterator) ?u32 {
+        const slice = it.nextBytes() ?? return null;
+
+        const r = switch (slice.len) {
+            1 => u32(slice[0]),
+            2 => Decode2(slice),
+            3 => Decode3(slice),
+            4 => Decode4(slice),
+            else => unreachable,
+        };
+
+        return r catch unreachable;
+    }
+};
+
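+// Another usage sketch added for illustration; the test name and the sample
+// string are arbitrary. It shows the common pattern of counting code points by
+// walking an Iterator over a validated View ("héllo" is five code points in six
+// bytes).
+test "utf8 iterator counting sketch" {
+    var it = (try View.init("héllo")).iterator();
+    var count: usize = 0;
+    while (it.nextCodepoint()) |_| {
+        count += 1;
+    }
+    debug.assert(count == 5);
+}
+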
+test "utf8 encode" {
+    // A few examples taken from Wikipedia, a few taken elsewhere.
+    var array: [4]u8 = undefined;
+    debug.assert((try Encode(try Decode("€"), array[0..])) == 3);
+    debug.assert(array[0] == 0b11100010);
+    debug.assert(array[1] == 0b10000010);
+    debug.assert(array[2] == 0b10101100);
+
+    debug.assert((try Encode(try Decode("$"), array[0..])) == 1);
+    debug.assert(array[0] == 0b00100100);
+
+    debug.assert((try Encode(try Decode("¢"), array[0..])) == 2);
+    debug.assert(array[0] == 0b11000010);
+    debug.assert(array[1] == 0b10100010);
+
+    debug.assert((try Encode(try Decode("𐍈"), array[0..])) == 4);
+    debug.assert(array[0] == 0b11110000);
+    debug.assert(array[1] == 0b10010000);
+    debug.assert(array[2] == 0b10001101);
+    debug.assert(array[3] == 0b10001000);
+}
+
+test "utf8 iterator" {
+    var view = View.init("$¢") catch unreachable;
+    var it = view.iterator();
+    var buf = ?? it.nextBytes();
+    debug.assert(buf[0] == 0b00100100);
+    buf = ?? it.nextBytes();
+    debug.assert(buf[0] == 0b11000010);
+    debug.assert(buf[1] == 0b10100010);
+}
+
+test "utf8 encode error" {
+    var array: [4]u8 = undefined;
+    testErrorEncode(0xFFFFFF, array[0..], error.CodepointTooLarge);
+    testErrorEncode(0xd900, array[0..], error.InvalidCodepoint);
+}
+
+fn testErrorEncode(codePoint: u32, array: []u8, expectedErr: error) void {
+    if (Encode(codePoint, array)) |_| {
+        unreachable;
+    } else |err| {
+        debug.assert(err == expectedErr);
+    }
+}
+
+test "utf8 iterator on ascii" {
+    const s = View.initComptime("abc");
+
+    var it1 = s.iterator();
+    debug.assert(std.mem.eql(u8, "a", ??it1.nextBytes()));
+    debug.assert(std.mem.eql(u8, "b", ??it1.nextBytes()));
+    debug.assert(std.mem.eql(u8, "c", ??it1.nextBytes()));
+    debug.assert(it1.nextBytes() == null);
+
+    var it2 = s.iterator();
+    debug.assert(??it2.nextCodepoint() == 'a');
+    debug.assert(??it2.nextCodepoint() == 'b');
+    debug.assert(??it2.nextCodepoint() == 'c');
+    debug.assert(it2.nextCodepoint() == null);
+}
+
+test "utf8 view bad" {
+    // Compile-time error.
+    // const s3 = View.initComptime("\xfe\xf2");
+
+    const s = View.init("hel\xadlo");
+    if (s) |_| { unreachable; } else |err| { debug.assert(err == error.InvalidBytes); }
+}
+
+test "utf8 view ok" {
+    const s = View.initComptime("東京市");
+
+    var it1 = s.iterator();
+    debug.assert(std.mem.eql(u8, "東", ??it1.nextBytes()));
+    debug.assert(std.mem.eql(u8, "京", ??it1.nextBytes()));
+    debug.assert(std.mem.eql(u8, "市", ??it1.nextBytes()));
+    debug.assert(it1.nextBytes() == null);
+
+    var it2 = s.iterator();
+    debug.assert(??it2.nextCodepoint() == 0x6771);
+    debug.assert(??it2.nextCodepoint() == 0x4eac);
+    debug.assert(??it2.nextCodepoint() == 0x5e02);
+    debug.assert(it2.nextCodepoint() == null);
+}
+
+test "bad utf8 slice" {
+    debug.assert(ValidateSlice("abc"));
+    debug.assert(!ValidateSlice("abc\xc0"));
+    debug.assert(!ValidateSlice("abc\xc0abc"));
+    debug.assert(ValidateSlice("abc\xdf\xbf"));
+}
+
+test "valid utf8" {
+    testValid("\x00", 0x0);
+    testValid("\x20", 0x20);
+    testValid("\x7f", 0x7f);
+    testValid("\xc2\x80", 0x80);
+    testValid("\xdf\xbf", 0x7ff);
+    testValid("\xe0\xa0\x80", 0x800);
+    testValid("\xe1\x80\x80", 0x1000);
+    testValid("\xef\xbf\xbf", 0xffff);
+    testValid("\xf0\x90\x80\x80", 0x10000);
+    testValid("\xf1\x80\x80\x80", 0x40000);
+    testValid("\xf3\xbf\xbf\xbf", 0xfffff);
+    testValid("\xf4\x8f\xbf\xbf", 0x10ffff);
+}
+
+test "invalid utf8 continuation bytes" {
+    // unexpected continuation
+    testError("\x80", error.InvalidStartByte);
+    testError("\xbf", error.InvalidStartByte);
+    // too many leading 1's
+    testError("\xf8", error.InvalidStartByte);
+    testError("\xff", error.InvalidStartByte);
+    // expected continuation for 2 byte sequences
+    testError("\xc2", error.UnexpectedEof);
+    testError("\xc2\x00", error.ExpectedContinuation);
+    testError("\xc2\xc0", error.ExpectedContinuation);
+    // expected continuation for 3 byte sequences
+    testError("\xe0", error.UnexpectedEof);
+    testError("\xe0\x00", error.UnexpectedEof);
+    testError("\xe0\xc0", error.UnexpectedEof);
+    testError("\xe0\xa0", error.UnexpectedEof);
+    testError("\xe0\xa0\x00", error.ExpectedContinuation);
+    testError("\xe0\xa0\xc0", error.ExpectedContinuation);
+    // expected continuation for 4 byte sequences
+    testError("\xf0", error.UnexpectedEof);
+    testError("\xf0\x00", error.UnexpectedEof);
+    testError("\xf0\xc0", error.UnexpectedEof);
+    testError("\xf0\x90\x00", error.UnexpectedEof);
+    testError("\xf0\x90\xc0", error.UnexpectedEof);
+    testError("\xf0\x90\x80\x00", error.ExpectedContinuation);
+ testError("\xf0\x90\x80\xc0", error.ExpectedContinuation); +} + +test "overlong utf8 codepoint" { + testError("\xc0\x80", error.OverlongEncoding); + testError("\xc1\xbf", error.OverlongEncoding); + testError("\xe0\x80\x80", error.OverlongEncoding); + testError("\xe0\x9f\xbf", error.OverlongEncoding); + testError("\xf0\x80\x80\x80", error.OverlongEncoding); + testError("\xf0\x8f\xbf\xbf", error.OverlongEncoding); +} + +test "misc invalid utf8" { + // codepoint out of bounds + testError("\xf4\x90\x80\x80", error.CodepointTooLarge); + testError("\xf7\xbf\xbf\xbf", error.CodepointTooLarge); + // surrogate halves + testValid("\xed\x9f\xbf", 0xd7ff); + testError("\xed\xa0\x80", error.EncodesSurrogateHalf); + testError("\xed\xbf\xbf", error.EncodesSurrogateHalf); + testValid("\xee\x80\x80", 0xe000); +} + +fn testError(bytes: []const u8, expected_err: error) void { + if (testDecode(bytes)) |_| { + unreachable; + } else |err| { + debug.assert(err == expected_err); + } +} + +fn testValid(bytes: []const u8, expected_codepoint: u32) void { + debug.assert((testDecode(bytes) catch unreachable) == expected_codepoint); +} + +fn testDecode(bytes: []const u8) !u32 { + const length = try ByteSequenceLength(bytes[0]); + if (bytes.len < length) return error.UnexpectedEof; + debug.assert(bytes.len == length); + return Decode(bytes); +} diff --git a/std/unicode.zig b/std/unicode.zig deleted file mode 100644 index 356df824f091..000000000000 --- a/std/unicode.zig +++ /dev/null @@ -1,300 +0,0 @@ -const std = @import("./index.zig"); -const debug = std.debug; - -/// Given the first byte of a UTF-8 codepoint, -/// returns a number 1-4 indicating the total length of the codepoint in bytes. -/// If this byte does not match the form of a UTF-8 start byte, returns Utf8InvalidStartByte. -pub fn utf8ByteSequenceLength(first_byte: u8) !u3 { - if (first_byte < 0b10000000) return u3(1); - if (first_byte & 0b11100000 == 0b11000000) return u3(2); - if (first_byte & 0b11110000 == 0b11100000) return u3(3); - if (first_byte & 0b11111000 == 0b11110000) return u3(4); - return error.Utf8InvalidStartByte; -} - -/// Decodes the UTF-8 codepoint encoded in the given slice of bytes. -/// bytes.len must be equal to utf8ByteSequenceLength(bytes[0]) catch unreachable. -/// If you already know the length at comptime, you can call one of -/// utf8Decode2,utf8Decode3,utf8Decode4 directly instead of this function. 
-pub fn utf8Decode(bytes: []const u8) !u32 { - return switch (bytes.len) { - 1 => u32(bytes[0]), - 2 => utf8Decode2(bytes), - 3 => utf8Decode3(bytes), - 4 => utf8Decode4(bytes), - else => unreachable, - }; -} -pub fn utf8Decode2(bytes: []const u8) !u32 { - debug.assert(bytes.len == 2); - debug.assert(bytes[0] & 0b11100000 == 0b11000000); - var value: u32 = bytes[0] & 0b00011111; - - if (bytes[1] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation; - value <<= 6; - value |= bytes[1] & 0b00111111; - - if (value < 0x80) return error.Utf8OverlongEncoding; - - return value; -} -pub fn utf8Decode3(bytes: []const u8) !u32 { - debug.assert(bytes.len == 3); - debug.assert(bytes[0] & 0b11110000 == 0b11100000); - var value: u32 = bytes[0] & 0b00001111; - - if (bytes[1] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation; - value <<= 6; - value |= bytes[1] & 0b00111111; - - if (bytes[2] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation; - value <<= 6; - value |= bytes[2] & 0b00111111; - - if (value < 0x800) return error.Utf8OverlongEncoding; - if (0xd800 <= value and value <= 0xdfff) return error.Utf8EncodesSurrogateHalf; - - return value; -} -pub fn utf8Decode4(bytes: []const u8) !u32 { - debug.assert(bytes.len == 4); - debug.assert(bytes[0] & 0b11111000 == 0b11110000); - var value: u32 = bytes[0] & 0b00000111; - - if (bytes[1] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation; - value <<= 6; - value |= bytes[1] & 0b00111111; - - if (bytes[2] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation; - value <<= 6; - value |= bytes[2] & 0b00111111; - - if (bytes[3] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation; - value <<= 6; - value |= bytes[3] & 0b00111111; - - if (value < 0x10000) return error.Utf8OverlongEncoding; - if (value > 0x10FFFF) return error.Utf8CodepointTooLarge; - - return value; -} - -pub fn utf8ValidateSlice(s: []const u8) bool { - var i: usize = 0; - while (i < s.len) { - if (utf8ByteSequenceLength(s[i])) |cp_len| { - if (i + cp_len > s.len) { - return false; - } - - if (utf8Decode(s[i..i+cp_len])) |_| {} else |_| { return false; } - i += cp_len; - } else |err| { - return false; - } - } - return true; -} - -/// Utf8View iterates the code points of a utf-8 encoded string. -/// -/// ``` -/// var utf8 = (try std.unicode.Utf8View.init("hi there")).iterator(); -/// while (utf8.nextCodepointSlice()) |codepoint| { -/// std.debug.warn("got codepoint {}\n", codepoint); -/// } -/// ``` -pub const Utf8View = struct { - bytes: []const u8, - - pub fn init(s: []const u8) !Utf8View { - if (!utf8ValidateSlice(s)) { - return error.InvalidUtf8; - } - - return initUnchecked(s); - } - - pub fn initUnchecked(s: []const u8) Utf8View { - return Utf8View { - .bytes = s, - }; - } - - pub fn initComptime(comptime s: []const u8) Utf8View { - if (comptime init(s)) |r| { - return r; - } else |err| switch (err) { - error.InvalidUtf8 => { - @compileError("invalid utf8"); - unreachable; - } - } - } - - pub fn iterator(s: &const Utf8View) Utf8Iterator { - return Utf8Iterator { - .bytes = s.bytes, - .i = 0, - }; - } -}; - -const Utf8Iterator = struct { - bytes: []const u8, - i: usize, - - pub fn nextCodepointSlice(it: &Utf8Iterator) ?[]const u8 { - if (it.i >= it.bytes.len) { - return null; - } - - const cp_len = utf8ByteSequenceLength(it.bytes[it.i]) catch unreachable; - - it.i += cp_len; - return it.bytes[it.i-cp_len..it.i]; - } - - pub fn nextCodepoint(it: &Utf8Iterator) ?u32 { - const slice = it.nextCodepointSlice() ?? 
return null; - - const r = switch (slice.len) { - 1 => u32(slice[0]), - 2 => utf8Decode2(slice), - 3 => utf8Decode3(slice), - 4 => utf8Decode4(slice), - else => unreachable, - }; - - return r catch unreachable; - } -}; - -test "utf8 iterator on ascii" { - const s = Utf8View.initComptime("abc"); - - var it1 = s.iterator(); - debug.assert(std.mem.eql(u8, "a", ??it1.nextCodepointSlice())); - debug.assert(std.mem.eql(u8, "b", ??it1.nextCodepointSlice())); - debug.assert(std.mem.eql(u8, "c", ??it1.nextCodepointSlice())); - debug.assert(it1.nextCodepointSlice() == null); - - var it2 = s.iterator(); - debug.assert(??it2.nextCodepoint() == 'a'); - debug.assert(??it2.nextCodepoint() == 'b'); - debug.assert(??it2.nextCodepoint() == 'c'); - debug.assert(it2.nextCodepoint() == null); -} - -test "utf8 view bad" { - // Compile-time error. - // const s3 = Utf8View.initComptime("\xfe\xf2"); - - const s = Utf8View.init("hel\xadlo"); - if (s) |_| { unreachable; } else |err| { debug.assert(err == error.InvalidUtf8); } -} - -test "utf8 view ok" { - const s = Utf8View.initComptime("東京市"); - - var it1 = s.iterator(); - debug.assert(std.mem.eql(u8, "東", ??it1.nextCodepointSlice())); - debug.assert(std.mem.eql(u8, "京", ??it1.nextCodepointSlice())); - debug.assert(std.mem.eql(u8, "市", ??it1.nextCodepointSlice())); - debug.assert(it1.nextCodepointSlice() == null); - - var it2 = s.iterator(); - debug.assert(??it2.nextCodepoint() == 0x6771); - debug.assert(??it2.nextCodepoint() == 0x4eac); - debug.assert(??it2.nextCodepoint() == 0x5e02); - debug.assert(it2.nextCodepoint() == null); -} - -test "bad utf8 slice" { - debug.assert(utf8ValidateSlice("abc")); - debug.assert(!utf8ValidateSlice("abc\xc0")); - debug.assert(!utf8ValidateSlice("abc\xc0abc")); - debug.assert(utf8ValidateSlice("abc\xdf\xbf")); -} - -test "valid utf8" { - testValid("\x00", 0x0); - testValid("\x20", 0x20); - testValid("\x7f", 0x7f); - testValid("\xc2\x80", 0x80); - testValid("\xdf\xbf", 0x7ff); - testValid("\xe0\xa0\x80", 0x800); - testValid("\xe1\x80\x80", 0x1000); - testValid("\xef\xbf\xbf", 0xffff); - testValid("\xf0\x90\x80\x80", 0x10000); - testValid("\xf1\x80\x80\x80", 0x40000); - testValid("\xf3\xbf\xbf\xbf", 0xfffff); - testValid("\xf4\x8f\xbf\xbf", 0x10ffff); -} - -test "invalid utf8 continuation bytes" { - // unexpected continuation - testError("\x80", error.Utf8InvalidStartByte); - testError("\xbf", error.Utf8InvalidStartByte); - // too many leading 1's - testError("\xf8", error.Utf8InvalidStartByte); - testError("\xff", error.Utf8InvalidStartByte); - // expected continuation for 2 byte sequences - testError("\xc2", error.UnexpectedEof); - testError("\xc2\x00", error.Utf8ExpectedContinuation); - testError("\xc2\xc0", error.Utf8ExpectedContinuation); - // expected continuation for 3 byte sequences - testError("\xe0", error.UnexpectedEof); - testError("\xe0\x00", error.UnexpectedEof); - testError("\xe0\xc0", error.UnexpectedEof); - testError("\xe0\xa0", error.UnexpectedEof); - testError("\xe0\xa0\x00", error.Utf8ExpectedContinuation); - testError("\xe0\xa0\xc0", error.Utf8ExpectedContinuation); - // expected continuation for 4 byte sequences - testError("\xf0", error.UnexpectedEof); - testError("\xf0\x00", error.UnexpectedEof); - testError("\xf0\xc0", error.UnexpectedEof); - testError("\xf0\x90\x00", error.UnexpectedEof); - testError("\xf0\x90\xc0", error.UnexpectedEof); - testError("\xf0\x90\x80\x00", error.Utf8ExpectedContinuation); - testError("\xf0\x90\x80\xc0", error.Utf8ExpectedContinuation); -} - -test "overlong utf8 codepoint" { - 
testError("\xc0\x80", error.Utf8OverlongEncoding); - testError("\xc1\xbf", error.Utf8OverlongEncoding); - testError("\xe0\x80\x80", error.Utf8OverlongEncoding); - testError("\xe0\x9f\xbf", error.Utf8OverlongEncoding); - testError("\xf0\x80\x80\x80", error.Utf8OverlongEncoding); - testError("\xf0\x8f\xbf\xbf", error.Utf8OverlongEncoding); -} - -test "misc invalid utf8" { - // codepoint out of bounds - testError("\xf4\x90\x80\x80", error.Utf8CodepointTooLarge); - testError("\xf7\xbf\xbf\xbf", error.Utf8CodepointTooLarge); - // surrogate halves - testValid("\xed\x9f\xbf", 0xd7ff); - testError("\xed\xa0\x80", error.Utf8EncodesSurrogateHalf); - testError("\xed\xbf\xbf", error.Utf8EncodesSurrogateHalf); - testValid("\xee\x80\x80", 0xe000); -} - -fn testError(bytes: []const u8, expected_err: error) void { - if (testDecode(bytes)) |_| { - unreachable; - } else |err| { - debug.assert(err == expected_err); - } -} - -fn testValid(bytes: []const u8, expected_codepoint: u32) void { - debug.assert((testDecode(bytes) catch unreachable) == expected_codepoint); -} - -fn testDecode(bytes: []const u8) !u32 { - const length = try utf8ByteSequenceLength(bytes[0]); - if (bytes.len < length) return error.UnexpectedEof; - debug.assert(bytes.len == length); - return utf8Decode(bytes); -} diff --git a/std/zig/tokenizer.zig b/std/zig/tokenizer.zig index a2c4def9e0df..3cb728b6a152 100644 --- a/std/zig/tokenizer.zig +++ b/std/zig/tokenizer.zig @@ -1036,27 +1036,28 @@ pub const Tokenizer = struct { // (note that \n was checked before we got here) return 1; } + // looks fine to me. return 0; } else { // check utf8-encoded character. - const length = std.unicode.utf8ByteSequenceLength(c0) catch return 1; + const length = std.string.utf8.ByteSequenceLength(c0) catch return 1; if (self.index + length > self.buffer.len) { return u3(self.buffer.len - self.index); } const bytes = self.buffer[self.index..self.index + length]; switch (length) { 2 => { - const value = std.unicode.utf8Decode2(bytes) catch return length; + const value = std.string.utf8.Decode2(bytes) catch return length; if (value == 0x85) return length; // U+0085 (NEL) }, 3 => { - const value = std.unicode.utf8Decode3(bytes) catch return length; + const value = std.string.utf8.Decode3(bytes) catch return length; if (value == 0x2028) return length; // U+2028 (LS) if (value == 0x2029) return length; // U+2029 (PS) }, 4 => { - _ = std.unicode.utf8Decode4(bytes) catch return length; + _ = std.string.utf8.Decode4(bytes) catch return length; }, else => unreachable, }