-
-
Notifications
You must be signed in to change notification settings - Fork 2.8k
std: add ascii with C ASCII character classes #2095
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,232 @@ | ||
// Does NOT look at the locale the way C89's toupper(3), isspace() et cetera do. | ||
// I could have taken only a u7 to make this clear, but it would be slower | ||
// (and 128 bytes is not much to pay). | ||
// | ||
// It is my opinion that encodings other than UTF-8 should not be supported. | ||
// Also does not handle Unicode character classes. | ||
// | ||
// https://upload.wikimedia.org/wikipedia/commons/thumb/c/cf/USASCII_code_chart.png/1200px-USASCII_code_chart.png | ||
|
||
// Bit position of each character class inside one `combinedTable` entry.
// The backing type is u3, so exactly eight classes fit in a u8 entry.
//
// Classes that are NOT stored in the table because they are cheap to compute:
//   Ctrl:  c < 0x20 or c == DEL (127)
//   Print: Graph or c == ' ' (does NOT include '\t' et cetera)
//   ASCII: c < 128
//   Blank: c == ' ' or c == '\x09'
const tIndex = enum(u3) {
    Alpha = 0,
    Hex = 1,
    Space = 2,
    Digit = 3,
    Lower = 4,
    Upper = 5,
    Punct = 6,
    Graph = 7,
};
|
||
// Packed classification table, built at compile time. Entry `c` holds one
// flag bit per `tIndex` member (bit position = enum value). The table has
// 256 entries so that any u8 can index it directly; entries 128..255 are
// zero, so bytes outside ASCII belong to no class.
const combinedTable = init: {
    comptime var table = []u8{0} ** 256;

    comptime var c: u8 = 0;
    inline while (c < 128) : (c += 1) {
        const is_upper = c >= 'A' and c <= 'Z';
        const is_lower = c >= 'a' and c <= 'z';
        const is_alpha = is_upper or is_lower;
        const is_digit = c >= '0' and c <= '9';
        const is_hex = is_digit or (c >= 'A' and c <= 'F') or (c >= 'a' and c <= 'f');
        // 0x09..0x0D (tab, line feed, vertical tab, form feed, carriage return) and ' '.
        const is_space = c == ' ' or (c >= 0x09 and c <= 0x0D);
        // Every visible character: 0x21 ('!') through 0x7E ('~').
        const is_graph = c >= 0x21 and c <= 0x7E;
        // Visible characters that are neither letters nor digits.
        const is_punct = is_graph and !is_alpha and !is_digit;

        table[c] =
            u8(@boolToInt(is_alpha)) << @enumToInt(tIndex.Alpha) |
            u8(@boolToInt(is_hex)) << @enumToInt(tIndex.Hex) |
            u8(@boolToInt(is_space)) << @enumToInt(tIndex.Space) |
            u8(@boolToInt(is_digit)) << @enumToInt(tIndex.Digit) |
            u8(@boolToInt(is_lower)) << @enumToInt(tIndex.Lower) |
            u8(@boolToInt(is_upper)) << @enumToInt(tIndex.Upper) |
            u8(@boolToInt(is_punct)) << @enumToInt(tIndex.Punct) |
            u8(@boolToInt(is_graph)) << @enumToInt(tIndex.Graph);
    }
    break :init table;
};
|
||
// Reports whether byte `c` has the flag for class `t` set in `combinedTable`.
fn inTable(c: u8, t: tIndex) bool {
    const mask = u8(1) << @enumToInt(t);
    return (combinedTable[c] & mask) != 0;
}
|
||
/// Returns true if `c` is an ASCII letter or an ASCII decimal digit.
pub fn isAlNum(c: u8) bool {
    const alpha_bit = u8(1) << @enumToInt(tIndex.Alpha);
    const digit_bit = u8(1) << @enumToInt(tIndex.Digit);
    // One table lookup answers both questions: either flag being set is enough.
    return (combinedTable[c] & (alpha_bit | digit_bit)) != 0;
}
|
||
/// Returns true if `c` is an ASCII letter (`A`-`Z` or `a`-`z`).
pub fn isAlpha(c: u8) bool {
    const alpha_bit = u8(1) << @enumToInt(tIndex.Alpha);
    return (combinedTable[c] & alpha_bit) != 0;
}
|
||
/// Returns true if `c` is a control character: anything below 0x20, or DEL.
pub fn isCtrl(c: u8) bool {
    const del = 127;
    return c == del or c < 0x20;
}
|
||
/// Alias of `isCtrl`, spelled like the C function `iscntrl()`.
pub fn isCntrl(c: u8) bool {
    const is_control = isCtrl(c);
    return is_control;
}
|
||
/// Returns true if `c` is an ASCII decimal digit (`0`-`9`).
pub fn isDigit(c: u8) bool {
    const digit_bit = u8(1) << @enumToInt(tIndex.Digit);
    return (combinedTable[c] & digit_bit) != 0;
}
|
||
/// Returns true if `c` is a visible ASCII character (0x21 through 0x7E).
/// The space character is not included.
pub fn isGraph(c: u8) bool {
    const graph_bit = u8(1) << @enumToInt(tIndex.Graph);
    return (combinedTable[c] & graph_bit) != 0;
}
|
||
/// Returns true if `c` is an ASCII lowercase letter (`a`-`z`).
pub fn isLower(c: u8) bool {
    const lower_bit = u8(1) << @enumToInt(tIndex.Lower);
    return (combinedTable[c] & lower_bit) != 0;
}
|
||
/// Returns true if `c` is printable: a visible ASCII character or the space
/// character. Tab and other whitespace controls are not printable.
pub fn isPrint(c: u8) bool {
    // Review note left on the return statement (the suggested replacement
    // expression was lost when this page was captured): "Worth rewriting as ..."
    return c == ' ' or inTable(c, tIndex.Graph);
}
|
||
/// Returns true if `c` is ASCII punctuation: a visible character that is
/// neither a letter nor a digit.
pub fn isPunct(c: u8) bool {
    const punct_bit = u8(1) << @enumToInt(tIndex.Punct);
    return (combinedTable[c] & punct_bit) != 0;
}
|
||
/// Returns true if `c` is ASCII whitespace: the space character or one of
/// the bytes 0x09 through 0x0D.
pub fn isSpace(c: u8) bool {
    const space_bit = u8(1) << @enumToInt(tIndex.Space);
    return (combinedTable[c] & space_bit) != 0;
}
|
||
/// Returns true if `c` is an ASCII uppercase letter (`A`-`Z`).
pub fn isUpper(c: u8) bool {
    const upper_bit = u8(1) << @enumToInt(tIndex.Upper);
    return (combinedTable[c] & upper_bit) != 0;
}
|
||
/// Returns true if `c` is an ASCII hexadecimal digit (`0`-`9`, `A`-`F`, `a`-`f`).
pub fn isXDigit(c: u8) bool {
    // Review note left on this function: "Do these function names/capitalisation
    // make sense? Related to #1884 I guess?"
    const hex_bit = u8(1) << @enumToInt(tIndex.Hex);
    return (combinedTable[c] & hex_bit) != 0;
}
|
||
/// Returns true if `c` is in the 7-bit ASCII range, i.e. less than 128.
pub fn isASCII(c: u8) bool {
    // Review thread on this function: a reviewer wrote "Maybe use ..." (the
    // suggested expression was lost when this page was captured), the author
    // asked "Why?", and the reviewer answered: "Just a feeling; I've always
    // thought of all the ascii sets tests as bitmasking operations. It's odd
    // for me to see them as normal comparisons...."
    //
    // For a u8, the top bit being clear is the same condition as c < 128.
    return (c & 0x80) == 0;
}
|
||
/// Returns true if `c` is a space or a horizontal tab (0x09).
pub fn isBlank(c: u8) bool {
    return switch (c) {
        ' ', '\x09' => true,
        else => false,
    };
}
|
||
/// Converts an ASCII lowercase letter to its uppercase form.
/// Every other byte, including bytes above 127, is returned unchanged.
pub fn toUpper(c: u8) u8 {
    // Upper- and lowercase ASCII letters are exactly 0x20 apart.
    return if (isLower(c)) c - 0x20 else c;
}
|
||
/// Converts an ASCII uppercase letter to its lowercase form.
/// Every other byte, including bytes above 127, is returned unchanged.
pub fn toLower(c: u8) u8 {
    // Upper- and lowercase ASCII letters are exactly 0x20 apart.
    return if (isUpper(c)) c + 0x20 else c;
}
|
||
test "ascii character classes" {
    const expect = @import("std").testing.expect;

    // Case conversion: letters change, everything else passes through.
    expect(toUpper('c') == 'C');
    expect(toUpper(':') == ':');
    expect(toUpper('\xab') == '\xab');
    expect(toLower('C') == 'c');

    // Classification.
    expect(isAlpha('c'));
    expect(!isAlpha('5'));
    expect(isSpace(' '));
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why have this alias?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Because it matches the C function, `iscntrl()`. While "ctrl" is the more accepted abbreviation of control, I thought it would make things easier to just support both.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we should just pick one. The zig way in situations like this is to pick a side and make everybody conform.