Skip to content

std: add ascii with C ASCII character classes #2095

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 22, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -444,6 +444,7 @@ set(ZIG_CPP_SOURCES

set(ZIG_STD_FILES
"array_list.zig"
"ascii.zig"
"atomic.zig"
"atomic/int.zig"
"atomic/queue.zig"
Expand Down
232 changes: 232 additions & 0 deletions std/ascii.zig
Original file line number Diff line number Diff line change
@@ -0,0 +1,232 @@
// Does NOT look at the locale the way C89's toupper(3), isspace() et cetera does.
// I could have taken only a u7 to make this clear, but it would be slower
// It is my opinion that encodings other than UTF-8 should not be supported.
//
// (and 128 bytes is not much to pay).
// Also does not handle Unicode character classes.
//
// https://upload.wikimedia.org/wikipedia/commons/thumb/c/cf/USASCII_code_chart.png/1200px-USASCII_code_chart.png

const tIndex = enum(u3) {
Alpha,
Hex,
Space,
Digit,
Lower,
Upper,
// Ctrl, < 0x20 || == DEL
// Print, = Graph || == ' '. NOT '\t' et cetera
Punct,
Graph,
//ASCII, | ~0b01111111
//isBlank, == ' ' || == '\x09'
};

const combinedTable = init: {
comptime var table: [256]u8 = undefined;

const std = @import("std");
const mem = std.mem;

const alpha = []u1{
// 0, 1, 2, 3, 4, 5, 6, 7 ,8, 9,10,11,12,13,14,15
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
};
const lower = []u1{
// 0, 1, 2, 3, 4, 5, 6, 7 ,8, 9,10,11,12,13,14,15
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
};
const upper = []u1{
// 0, 1, 2, 3, 4, 5, 6, 7 ,8, 9,10,11,12,13,14,15
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
const digit = []u1{
// 0, 1, 2, 3, 4, 5, 6, 7 ,8, 9,10,11,12,13,14,15
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,

0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
const hex = []u1{
// 0, 1, 2, 3, 4, 5, 6, 7 ,8, 9,10,11,12,13,14,15
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,

0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
const space = []u1{
// 0, 1, 2, 3, 4, 5, 6, 7 ,8, 9,10,11,12,13,14,15
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
const punct = []u1{
// 0, 1, 2, 3, 4, 5, 6, 7 ,8, 9,10,11,12,13,14,15
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,

1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
};
const graph = []u1{
// 0, 1, 2, 3, 4, 5, 6, 7 ,8, 9,10,11,12,13,14,15
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
};

comptime var i = 0;
inline while (i < 128) : (i += 1) {
table[i] =
u8(alpha[i]) << @enumToInt(tIndex.Alpha) |
u8(hex[i]) << @enumToInt(tIndex.Hex) |
u8(space[i]) << @enumToInt(tIndex.Space) |
u8(digit[i]) << @enumToInt(tIndex.Digit) |
u8(lower[i]) << @enumToInt(tIndex.Lower) |
u8(upper[i]) << @enumToInt(tIndex.Upper) |
u8(punct[i]) << @enumToInt(tIndex.Punct) |
u8(graph[i]) << @enumToInt(tIndex.Graph);
}
mem.set(u8, table[128..256], 0);
break :init table;
};

fn inTable(c: u8, t: tIndex) bool {
return (combinedTable[c] & (u8(1) << @enumToInt(t))) != 0;
}

pub fn isAlNum(c: u8) bool {
return (combinedTable[c] & ((u8(1) << @enumToInt(tIndex.Alpha)) |
u8(1) << @enumToInt(tIndex.Digit))) != 0;
}

pub fn isAlpha(c: u8) bool {
return inTable(c, tIndex.Alpha);
}

pub fn isCtrl(c: u8) bool {
return c < 0x20 or c == 127; //DEL
}

pub fn isCntrl(c: u8) bool {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why have this alias?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because it matches the C function, iscntrl(). While "ctrl" is the more accepted abbreviation of control, I thought it would make things easier to just support both.

Copy link
Member

@andrewrk andrewrk Mar 23, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should just pick one. The zig way in situations like this is to pick a side and make everybody conform.

return isCtrl(c);
}

pub fn isDigit(c: u8) bool {
return inTable(c, tIndex.Digit);
}

pub fn isGraph(c: u8) bool {
return inTable(c, tIndex.Graph);
}

pub fn isLower(c: u8) bool {
return inTable(c, tIndex.Lower);
}

pub fn isPrint(c: u8) bool {
return inTable(c, tIndex.Graph) or c == ' ';
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Worth rewriting as isGraph(c) or c == ' '?

}

pub fn isPunct(c: u8) bool {
return inTable(c, tIndex.Punct);
}

pub fn isSpace(c: u8) bool {
return inTable(c, tIndex.Space);
}

pub fn isUpper(c: u8) bool {
return inTable(c, tIndex.Upper);
}

pub fn isXDigit(c: u8) bool {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do these function names/capitalisation make sense?
isPunct is cutting off letters
isASCII is mostly caps
isXDigit looks weird

Related to #1884 I guess?

return inTable(c, tIndex.Hex);
}

pub fn isASCII(c: u8) bool {
return c < 128;
Copy link
Contributor

@daurnimator daurnimator Mar 23, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe use c & 0x80 == 0 instead?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just a feeling; I've always thought of all the ascii sets tests as bitmasking operations. It's odd for me to see them as normal comparisons....

}

pub fn isBlank(c: u8) bool {
return (c == ' ') or (c == '\x09');
}

pub fn toUpper(c: u8) u8 {
if (isLower(c)) {
return c - 0x20;
} else {
return c;
}
}

pub fn toLower(c: u8) u8 {
if (isUpper(c)) {
return c + 0x20;
} else {
return c;
}
}

test "ascii character classes" {
const std = @import("std");
const testing = std.testing;

testing.expect('C' == toUpper('c'));
testing.expect(':' == toUpper(':'));
testing.expect('\xab' == toUpper('\xab'));
testing.expect('c' == toLower('C'));
testing.expect(isAlpha('c'));
testing.expect(!isAlpha('5'));
testing.expect(isSpace(' '));
}
1 change: 1 addition & 0 deletions std/std.zig
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ pub const pdb = @import("pdb.zig");
pub const rand = @import("rand.zig");
pub const rb = @import("rb.zig");
pub const sort = @import("sort.zig");
pub const ascii = @import("ascii.zig");
pub const testing = @import("testing.zig");
pub const unicode = @import("unicode.zig");
pub const valgrind = @import("valgrind.zig");
Expand Down