Skip to content

Commit 5eaff67

Browse files
cuviper authored and BurntSushi committed
syntax: regenerate tables for Unicode 11
This adds `scripts/generate.py`, and uses it to regenerate all tables with data from Unicode 11.0.0. This also restores the character tests that were first added in #400, with a new one for 11.
1 parent eeffc7f commit 5eaff67

12 files changed

+3723 additions and -3328 deletions (lines changed)

regex-syntax/src/lib.rs

+9-2
Original file line number | Diff line number | Diff line change
@@ -213,8 +213,15 @@ mod tests {
213213
assert!(is_word_byte(b'a'));
214214
assert!(!is_word_byte(b'-'));
215215

216-
assert!(is_word_character('a'));
217-
assert!(is_word_character('β'));
216+
assert!(is_word_character('a'), "ASCII");
217+
assert!(is_word_character('à'), "Latin-1");
218+
assert!(is_word_character('β'), "Greek");
219+
assert!(is_word_character('\u{11011}'), "Brahmi (Unicode 6.0)");
220+
assert!(is_word_character('\u{11611}'), "Modi (Unicode 7.0)");
221+
assert!(is_word_character('\u{11711}'), "Ahom (Unicode 8.0)");
222+
assert!(is_word_character('\u{17828}'), "Tangut (Unicode 9.0)");
223+
assert!(is_word_character('\u{1B1B1}'), "Nushu (Unicode 10.0)");
224+
assert!(is_word_character('\u{16E40}'), "Medefaidrin (Unicode 11.0)");
218225
assert!(!is_word_character('-'));
219226
assert!(!is_word_character('☃'));
220227
}

regex-syntax/src/unicode.rs

+1
Original file line number | Diff line number | Diff line change
@@ -346,6 +346,7 @@ fn ages(canonical_age: &str) -> Result<AgeIter> {
346346
("V8_0", age::V8_0),
347347
("V9_0", age::V9_0),
348348
("V10_0", age::V10_0),
349+
("V11_0", age::V11_0),
349350
];
350351
assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync");
351352

regex-syntax/src/unicode_tables/age.rs

+37-6
Original file line number | Diff line number | Diff line change
@@ -1,15 +1,15 @@
11
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
22
//
3-
// ucd-generate age tmp/ucd-10.0.0/ --chars
3+
// ucd-generate age tmp/ucd-11.0.0/ --chars
44
//
55
// ucd-generate is available on crates.io.
66

77
pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[
8-
("V10_0", V10_0), ("V1_1", V1_1), ("V2_0", V2_0), ("V2_1", V2_1),
9-
("V3_0", V3_0), ("V3_1", V3_1), ("V3_2", V3_2), ("V4_0", V4_0),
10-
("V4_1", V4_1), ("V5_0", V5_0), ("V5_1", V5_1), ("V5_2", V5_2),
11-
("V6_0", V6_0), ("V6_1", V6_1), ("V6_2", V6_2), ("V6_3", V6_3),
12-
("V7_0", V7_0), ("V8_0", V8_0), ("V9_0", V9_0),
8+
("V10_0", V10_0), ("V11_0", V11_0), ("V1_1", V1_1), ("V2_0", V2_0),
9+
("V2_1", V2_1), ("V3_0", V3_0), ("V3_1", V3_1), ("V3_2", V3_2),
10+
("V4_0", V4_0), ("V4_1", V4_1), ("V5_0", V5_0), ("V5_1", V5_1),
11+
("V5_2", V5_2), ("V6_0", V6_0), ("V6_1", V6_1), ("V6_2", V6_2),
12+
("V6_3", V6_3), ("V7_0", V7_0), ("V8_0", V8_0), ("V9_0", V9_0),
1313
];
1414

1515
pub const V10_0: &'static [(char, char)] = &[
@@ -25,6 +25,37 @@ pub const V10_0: &'static [(char, char)] = &[
2525
('🥟', '🥫'), ('🦒', '🦗'), ('🧐', '🧦'), ('𬺰', '𮯠'),
2626
];
2727

28+
pub const V11_0: &'static [(char, char)] = &[
29+
('\u{560}', '\u{560}'), ('\u{588}', '\u{588}'), ('\u{5ef}', '\u{5ef}'),
30+
('\u{7fd}', '\u{7ff}'), ('\u{8d3}', '\u{8d3}'), ('\u{9fe}', '\u{9fe}'),
31+
('\u{a76}', '\u{a76}'), ('\u{c04}', '\u{c04}'), ('\u{c84}', '\u{c84}'),
32+
('\u{1878}', '\u{1878}'), ('\u{1c90}', '\u{1cba}'),
33+
('\u{1cbd}', '\u{1cbf}'), ('\u{2bba}', '\u{2bbc}'),
34+
('\u{2bd3}', '\u{2beb}'), ('\u{2bf0}', '\u{2bfe}'),
35+
('\u{2e4a}', '\u{2e4e}'), ('\u{312f}', '\u{312f}'),
36+
('\u{9feb}', '\u{9fef}'), ('\u{a7af}', '\u{a7af}'),
37+
('\u{a7b8}', '\u{a7b9}'), ('\u{a8fe}', '\u{a8ff}'),
38+
('\u{10a34}', '\u{10a35}'), ('\u{10a48}', '\u{10a48}'),
39+
('\u{10d00}', '\u{10d27}'), ('\u{10d30}', '\u{10d39}'),
40+
('\u{10f00}', '\u{10f27}'), ('\u{10f30}', '\u{10f59}'),
41+
('\u{110cd}', '\u{110cd}'), ('\u{11144}', '\u{11146}'),
42+
('\u{1133b}', '\u{1133b}'), ('\u{1145e}', '\u{1145e}'),
43+
('\u{1171a}', '\u{1171a}'), ('\u{11800}', '\u{1183b}'),
44+
('\u{11a9d}', '\u{11a9d}'), ('\u{11d60}', '\u{11d65}'),
45+
('\u{11d67}', '\u{11d68}'), ('\u{11d6a}', '\u{11d8e}'),
46+
('\u{11d90}', '\u{11d91}'), ('\u{11d93}', '\u{11d98}'),
47+
('\u{11da0}', '\u{11da9}'), ('\u{11ee0}', '\u{11ef8}'),
48+
('\u{16e40}', '\u{16e9a}'), ('\u{187ed}', '\u{187f1}'),
49+
('\u{1d2e0}', '\u{1d2f3}'), ('\u{1d372}', '\u{1d378}'),
50+
('\u{1ec71}', '\u{1ecb4}'), ('\u{1f12f}', '\u{1f12f}'),
51+
('\u{1f6f9}', '\u{1f6f9}'), ('\u{1f7d5}', '\u{1f7d8}'),
52+
('\u{1f94d}', '\u{1f94f}'), ('\u{1f96c}', '\u{1f970}'),
53+
('\u{1f973}', '\u{1f976}'), ('\u{1f97a}', '\u{1f97a}'),
54+
('\u{1f97c}', '\u{1f97f}'), ('\u{1f998}', '\u{1f9a2}'),
55+
('\u{1f9b0}', '\u{1f9b9}'), ('\u{1f9c1}', '\u{1f9c2}'),
56+
('\u{1f9e7}', '\u{1f9ff}'), ('\u{1fa60}', '\u{1fa6d}'),
57+
];
58+
2859
pub const V1_1: &'static [(char, char)] = &[
2960
('\u{0}', 'ǵ'), ('Ǻ', 'ȗ'), ('ɐ', 'ʨ'), ('ʰ', '˞'), ('ˠ', '˩'),
3061
('̀', 'ͅ'), ('͠', '͡'), ('ʹ', '͵'), ('ͺ', 'ͺ'), (';', ';'),

regex-syntax/src/unicode_tables/case_folding_simple.rs

+487-424
Large diffs are not rendered by default.

regex-syntax/src/unicode_tables/general_category.rs

+1,141-1,078
Large diffs are not rendered by default.

regex-syntax/src/unicode_tables/perl_word.rs

+150-141
Large diffs are not rendered by default.

regex-syntax/src/unicode_tables/property_bool.rs

+1,509-1,427
Large diffs are not rendered by default.

regex-syntax/src/unicode_tables/property_names.rs

+8-6
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
22
//
3-
// ucd-generate property-names tmp/ucd-10.0.0/
3+
// ucd-generate property-names tmp/ucd-11.0.0/
44
//
55
// ucd-generate is available on crates.io.
66

@@ -47,11 +47,13 @@ pub const PROPERTY_NAMES: &'static [(&'static str, &'static str)] = &[
4747
("di", "Default_Ignorable_Code_Point"), ("dia", "Diacritic"),
4848
("diacritic", "Diacritic"), ("dm", "Decomposition_Mapping"),
4949
("dt", "Decomposition_Type"), ("ea", "East_Asian_Width"),
50-
("eastasianwidth", "East_Asian_Width"), ("expandsonnfc", "Expands_On_NFC"),
51-
("expandsonnfd", "Expands_On_NFD"), ("expandsonnfkc", "Expands_On_NFKC"),
52-
("expandsonnfkd", "Expands_On_NFKD"), ("ext", "Extender"),
53-
("extender", "Extender"), ("fcnfkc", "FC_NFKC_Closure"),
54-
("fcnfkcclosure", "FC_NFKC_Closure"),
50+
("eastasianwidth", "East_Asian_Width"),
51+
("equideo", "Equivalent_Unified_Ideograph"),
52+
("equivalentunifiedideograph", "Equivalent_Unified_Ideograph"),
53+
("expandsonnfc", "Expands_On_NFC"), ("expandsonnfd", "Expands_On_NFD"),
54+
("expandsonnfkc", "Expands_On_NFKC"), ("expandsonnfkd", "Expands_On_NFKD"),
55+
("ext", "Extender"), ("extender", "Extender"),
56+
("fcnfkc", "FC_NFKC_Closure"), ("fcnfkcclosure", "FC_NFKC_Closure"),
5557
("fullcompositionexclusion", "Full_Composition_Exclusion"),
5658
("gc", "General_Category"), ("gcb", "Grapheme_Cluster_Break"),
5759
("generalcategory", "General_Category"), ("graphemebase", "Grapheme_Base"),

0 commit comments

Comments
 (0)