Skip to content

Update to Unicode 13.0 and implement confusable detection. #11

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Apr 27, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ exclude = [ "target/*", "Cargo.lock" ]

[dependencies]
unicode-script = { version = "0.4.0", default-features = false }
unicode-normalization = { version = "0.1.12", default-features = false }
std = { version = "1.0", package = "rustc-std-workspace-std", optional = true }
core = { version = "1.0", package = "rustc-std-workspace-core", optional = true }
compiler_builtins = { version = "0.1", optional = true }
Expand Down
81 changes: 79 additions & 2 deletions scripts/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
'''

UNICODE_VERSION = (12, 1, 0)
UNICODE_VERSION = (13, 0, 0)

UNICODE_VERSION_NUMBER = "%s.%s.%s" %UNICODE_VERSION

Expand All @@ -54,7 +54,7 @@ def load_properties(f, interestingprops = None):
re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")

for line in fileinput.input(os.path.basename(f)):
for line in fileinput.input(os.path.basename(f), openhook=fileinput.hook_encoded("utf-8")):
prop = None
d_lo = 0
d_hi = 0
Expand All @@ -81,6 +81,28 @@ def load_properties(f, interestingprops = None):

return props

def load_confusables(f):
    """Parse a confusables data file into a list of (source, prototype) pairs.

    `f` is first passed to `fetch` (defined elsewhere in this script;
    presumably it makes the file available locally -- confirm there), and
    the data is then read, as UTF-8, from the file named
    `os.path.basename(f)` in the current directory.

    Lines that do not match the data-line pattern (comments, blank lines,
    headers) are skipped.

    Returns a list of `(d_input, d_outputs)` tuples where `d_input` is the
    source code point as an int and `d_outputs` is the list of code points
    (ints) it maps to, in file order.

    Raises `Exception` if the first column of a data line holds more than
    one code point.
    """
    fetch(f)
    confusables = []
    # Two ';'-terminated columns of space-separated upper-case hex code
    # points, each followed by a tab, then a word-character type tag.
    re1 = re.compile(r"^((?:[0-9A-F]+ )+);\t((?:[0-9A-F]+ )+);\t\w*")

    for line in fileinput.input(os.path.basename(f), openhook=fileinput.hook_encoded("utf-8")):
        m = re1.match(line)
        if not m:
            continue
        d_inputs = m.group(1).split()
        if len(d_inputs) != 1:
            raise Exception('More than one code point in first column')
        # split() already removed surrounding whitespace, so no strip() needed.
        d_input = int(d_inputs[0], 16)
        d_outputs = [int(d_output, 16) for d_output in m.group(2).split()]
        confusables.append((d_input, d_outputs))

    return confusables

def format_table_content(f, content, indent):
line = " "*indent
first = True
Expand All @@ -99,6 +121,18 @@ def format_table_content(f, content, indent):
def escape_char(c):
    r"""Format the integer code point `c` as Rust `char` literal source text.

    The result is a single-quoted `\u{...}` escape with the code point in
    lower-case hexadecimal, e.g. 0x41 becomes '\u{41}' (quotes included).
    """
    return "'\\u{%x}'" % c

def escape_char_list(l):
    r"""Format an iterable of integer code points as Rust array-literal text.

    Each element is rendered with `escape_char`; the items are separated by
    ", " and wrapped in square brackets. An empty input yields "[]".
    """
    return "[" + ", ".join(escape_char(c) for c in l) + "]"

def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])), is_const=True):
pub_string = "const"
Expand Down Expand Up @@ -173,10 +207,51 @@ def emit_identifier_module(f):
pfun=lambda x: "(%s,%s, IdentifierType::%s)" % (escape_char(x[0]), escape_char(x[1]), x[2]))
f.write("}\n\n")

def emit_confusable_detection_module(f):
    """Write the Rust `confusable_detection` module to the writable object `f`.

    The emitted module contains a `char_confusable_prototype` lookup
    function and a private `CONFUSABLES` table built from
    "confusables.txt" (loaded via `load_confusables`), sorted by source
    code point so the generated code can binary-search it.

    Raises `Exception` if two table entries share the same source code
    point, since a lookup by key would then be ambiguous.
    """
    f.write("pub mod confusable_detection {")
    f.write("""

#[inline]
pub fn char_confusable_prototype(c: char) -> Option<&'static [char]> {
// FIXME: do we want to special case ASCII here?
match c as usize {
_ => super::util::bsearch_value_table(c, CONFUSABLES)
}
}

""")

    f.write(" // Confusable table:\n")
    confusable_table = load_confusables("confusables.txt")
    confusable_table.sort(key=lambda w: w[0])

    # After sorting, duplicate keys are adjacent, so comparing each key with
    # its predecessor is enough to detect them.
    last_key = None
    for (k, _) in confusable_table:
        if k == last_key:
            raise Exception("duplicate keys in confusables table: %s" % k)
        last_key = k

    emit_table(f, "CONFUSABLES", confusable_table, "&'static [(char, &'static [char])]", is_pub=False,
            pfun=lambda x: "(%s, &%s)" % (escape_char(x[0]), escape_char_list(x[1])))
    f.write("}\n\n")


def emit_util_mod(f):
f.write("""
pub mod util {
use core::result::Result::{Ok, Err};

#[inline]
pub fn bsearch_value_table<T: Copy>(c: char, r: &'static [(char, T)]) -> Option<T> {
match r.binary_search_by_key(&c, |&(k, _)| k) {
Ok(idx) => {
let (_, v) = r[idx];
Some(v)
}
Err(_) => None
}
}

#[inline]
pub fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
use core::cmp::Ordering::{Equal, Less, Greater};
Expand Down Expand Up @@ -224,3 +299,5 @@ def emit_util_mod(f):
emit_util_mod(rf)
### identifier module
emit_identifier_module(rf)
### confusable_detection module
emit_confusable_detection_module(rf)
39 changes: 39 additions & 0 deletions src/confusable_detection.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
//! [Confusable detection](https://www.unicode.org/reports/tr39/#Confusable_Detection)

use core::iter;

/// An iterator that is either a single value or a delegate to another iterator.
///
/// This lets a function return one concrete iterator type whether it has
/// exactly one item to yield (`Once`) or a whole sequence (`More`).
enum OnceOrMore<T, I> {
    /// Yields exactly one item.
    Once(iter::Once<T>),
    /// Yields whatever the wrapped iterator yields.
    More(I),
}

impl<T, I> Iterator for OnceOrMore<T, I>
where
    I: Iterator<Item = T>,
{
    type Item = T;

    fn next(&mut self) -> Option<T> {
        match self {
            Self::Once(single) => single.next(),
            Self::More(rest) => rest.next(),
        }
    }

    // Forward the hint of the active variant so consumers see the remaining
    // length the inner iterator reports instead of the default `(0, None)`.
    fn size_hint(&self) -> (usize, Option<usize>) {
        match self {
            Self::Once(single) => single.size_hint(),
            Self::More(rest) => rest.size_hint(),
        }
    }
}

/// By-value iterator over a `'static` slice of `char`s.
type StaticSliceIterCloned = iter::Cloned<core::slice::Iter<'static, char>>;

/// Returns the prototype sequence for `c`.
///
/// If the confusables table has an entry for `c`, the characters of that
/// entry are yielded; otherwise `c` itself is yielded unchanged.
fn char_prototype(c: char) -> OnceOrMore<char, StaticSliceIterCloned> {
    use crate::tables::confusable_detection::char_confusable_prototype;
    if let Some(prototype) = char_confusable_prototype(c) {
        OnceOrMore::More(prototype.iter().cloned())
    } else {
        OnceOrMore::Once(iter::once(c))
    }
}

/// Computes the skeleton of `s`, as defined by UTS 39.
///
/// The input is decomposed with NFD, every resulting character is replaced
/// by its confusable prototype sequence, and the result is decomposed with
/// NFD once more. The returned iterator borrows `s` and produces the
/// skeleton's characters lazily.
pub fn skeleton(s: &str) -> impl Iterator<Item = char> + '_ {
    use unicode_normalization::UnicodeNormalization;
    let decomposed = s.chars().nfd();
    let prototypes = decomposed.flat_map(char_prototype);
    prototypes.nfd()
}
2 changes: 2 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,10 +58,12 @@ extern crate test;

pub use tables::UNICODE_VERSION;

pub mod confusable_detection;
pub mod general_security_profile;
pub mod mixed_script;
pub mod restriction_level;

pub use confusable_detection::skeleton;
pub use general_security_profile::GeneralSecurityProfile;
pub use mixed_script::MixedScript;
pub use restriction_level::{RestrictionLevel, RestrictionLevelDetection};
Expand Down
Loading