Skip to content

Add unicode escaping in resolver #84

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Feb 1, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion fluent-bundle/benches/resolver.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ fn get_ids(res: &FluentResource) -> Vec<String> {
}

fn resolver_bench(c: &mut Criterion) {
let tests = &["simple", "menubar"];
let tests = &["simple", "menubar", "unescape"];
let ftl_strings = get_strings(tests);

c.bench_function_over_inputs(
Expand Down
9 changes: 9 additions & 0 deletions fluent-bundle/benches/unescape.ftl
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
face-with-tears-of-joy = 😂
tetragram-for-centre = 𝌆

surrogates-in-text = \uD83D\uDE02
surrogates-in-string = {"\uD83D\uDE02"}
surrogates-in-adjacent-strings = {"\uD83D"}{"\uDE02"}

emoji-in-text = A face 😂 with tears of joy.
emoji-in-string = {"A face 😂 with tears of joy."}
4 changes: 2 additions & 2 deletions fluent-bundle/src/resolve.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ use super::bundle::FluentBundle;
use super::entry::GetEntry;
use super::types::FluentValue;
use fluent_syntax::ast;
use fluent_syntax::unicode::unescape_unicode;

#[derive(Debug, PartialEq)]
pub enum ResolverError {
Expand Down Expand Up @@ -176,8 +177,7 @@ impl<'source> ResolveValue for ast::InlineExpression<'source> {
fn to_value(&self, env: &Env) -> Result<FluentValue, ResolverError> {
match self {
ast::InlineExpression::StringLiteral { raw } => {
// XXX: We need to decode the raw into unicode here.
Ok(FluentValue::from(*raw))
Ok(FluentValue::from(unescape_unicode(raw).into_owned()))
}
ast::InlineExpression::NumberLiteral { value } => {
Ok(FluentValue::as_number(*value).unwrap())
Expand Down
30 changes: 29 additions & 1 deletion fluent-syntax/benches/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ use std::io;
use std::io::Read;

use fluent_syntax::parser::parse;
use fluent_syntax::unicode::unescape_unicode;

fn read_file(path: &str) -> Result<String, io::Error> {
let mut f = File::open(path)?;
Expand Down Expand Up @@ -38,5 +39,32 @@ fn parser_bench(c: &mut Criterion) {
);
}

criterion_group!(benches, parser_bench);
/// Benchmarks `unescape_unicode` over a fixed set of sample strings: plain
/// text without escapes, text with a few `\uHHHH` escapes, fully escaped
/// sentences, and text containing escaped quotes and backslashes.
fn unicode_unescape_bench(c: &mut Criterion) {
    let strings = &[
        "foo",
        "This is an example value",
        "Hello \\u00e3\\u00e9 World",
        "\\u004c\\u006f\\u0072\\u0065\\u006d \\u0069\\u0070\\u0073\\u0075\\u006d \\u0064\\u006f\\u006c\\u006f\\u0072 \\u0073\\u0069\\u0074 \\u0061\\u006d\\u0065\\u0074",
        "Let me introduce \\\"The\\\" Fluent",
        "And here's an example of \\\\ a character to be escaped",
        "But this message is completely unescape free",
        "And so is this one",
        "Maybe this one is as well completely escape free",
        "Welcome to Mozilla Firefox",
        "\\u0054\\u0068\\u0065\\u0073\\u0065 \\u0073\\u0065\\u0074\\u0074\\u0069\\u006e\\u0067\\u0073 \\u0061\\u0072\\u0065 \\u0074\\u0061\\u0069\\u006c\\u006f\\u0072\\u0065\\u0064 \\u0074\\u006f \\u0079\\u006f\\u0075\\u0072 \\u0063\\u006f\\u006d\\u0070\\u0075\\u0074\\u0065\\u0072\\u2019\\u0073 \\u0068\\u0061\\u0072\\u0064\\u0077\\u0061\\u0072\\u0065 \\u0061\\u006e\\u0064 \\u006f\\u0070\\u0065\\u0072\\u0061\\u0074\\u0069\\u006e\\u0067 \\u0073\\u0079\\u0073\\u0074\\u0065\\u006d\\u002e",
        "These settings are tailored to your computer’s hardware and operating system",
        "Use recommended performance settings",
        "\\u0041\\u0064\\u0064\\u0069\\u0074\\u0069\\u006f\\u006e\\u0061\\u006c \\u0063\\u006f\\u006e\\u0074\\u0065\\u006e\\u0074 \\u0070\\u0072\\u006f\\u0063\\u0065\\u0073\\u0073\\u0065\\u0073 \\u0063\\u0061\\u006e \\u0069\\u006d\\u0070\\u0072\\u006f\\u0076\\u0065 \\u0070\\u0065\\u0072\\u0066\\u006f\\u0072\\u006d\\u0061\\u006e\\u0063\\u0065 \\u0077\\u0068\\u0065\\u006e \\u0075\\u0073\\u0069\\u006e\\u0067 \\u006d\\u0075\\u006c\\u0074\\u0069\\u0070\\u006c\\u0065 \\u0074\\u0061\\u0062\\u0073\\u002c \\u0062\\u0075\\u0074 \\u0077\\u0069\\u006c\\u006c \\u0061\\u006c\\u0073\\u006f \\u0075\\u0073\\u0065 \\u006d\\u006f\\u0072\\u0065 \\u006d\\u0065\\u006d\\u006f\\u0072\\u0079\\u002e",
        "Additional content processes can improve performance when using multiple tabs, but will also use more memory.",
    ];
    c.bench_function("unicode", move |b| {
        b.iter(|| {
            // Return a value derived from every result instead of dropping
            // the results on the floor: work whose output is never observed
            // may be optimized away, which would make the timing meaningless.
            // The value returned from the closure is handed back to Criterion.
            strings
                .iter()
                .map(|s| unescape_unicode(s).len())
                .sum::<usize>()
        })
    });
}

criterion_group!(benches, parser_bench, unicode_unescape_bench);
criterion_main!(benches);
1 change: 1 addition & 0 deletions fluent-syntax/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
pub mod ast;
pub mod parser;
pub mod unicode;
50 changes: 50 additions & 0 deletions fluent-syntax/src/unicode.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
use std::borrow::Cow;
use std::char;

/// Character emitted in place of any escape sequence that cannot be decoded
/// (U+FFFD REPLACEMENT CHARACTER).
const UNKNOWN_CHAR: char = '\u{FFFD}';

/// Converts a run of hexadecimal digits into the `char` with that code point.
///
/// Returns U+FFFD when `s` contains anything other than ASCII hex digits, is
/// empty, or names a value that is not a Unicode scalar value (for example a
/// surrogate such as `D83D`).
fn encode_unicode(s: &str) -> char {
    // `from_str_radix` tolerates a leading `+`, which is not a valid part of
    // an escape sequence, so the digits are validated explicitly first.
    if !s.bytes().all(|b| b.is_ascii_hexdigit()) {
        return UNKNOWN_CHAR;
    }
    u32::from_str_radix(s, 16)
        .ok()
        .and_then(char::from_u32)
        .unwrap_or(UNKNOWN_CHAR)
}

/// Decodes a single escape sequence.
///
/// `after` is the text that immediately follows a backslash. Returns the
/// decoded character together with the number of bytes of `after` that belong
/// to the escape. The byte count always ends on a character boundary.
fn decode_escape(after: &str) -> (char, usize) {
    match after.chars().next() {
        Some('\\') => ('\\', 1),
        Some('"') => ('"', 1),
        Some(marker @ 'u') | Some(marker @ 'U') => {
            // `\uHHHH` carries four hex digits, `\UHHHHHH` carries six.
            let len = if marker == 'u' { 4 } else { 6 };
            let end = 1 + len;
            let decoded = after
                .get(1..end)
                .map(encode_unicode)
                .unwrap_or(UNKNOWN_CHAR);
            // A malformed escape still swallows the positions its digits
            // would have occupied. Clamp to the available text and never stop
            // in the middle of a multi-byte character.
            let mut consumed = end.min(after.len());
            while !after.is_char_boundary(consumed) {
                consumed += 1;
            }
            (decoded, consumed)
        }
        // Unknown escape: replace it and skip the whole character that
        // followed the backslash, however many bytes it occupies.
        Some(other) => (UNKNOWN_CHAR, other.len_utf8()),
        // A lone backslash at the very end of the input.
        None => (UNKNOWN_CHAR, 0),
    }
}

/// Replaces the escape sequences `\\`, `\"`, `\uHHHH` and `\UHHHHHH` in
/// `input` with the characters they denote.
///
/// Input that contains no backslash is returned as `Cow::Borrowed` without
/// allocating; otherwise a new `String` is built and returned as `Cow::Owned`.
/// Escapes that are unknown, truncated, contain non-hex digits, or name a
/// value that is not a Unicode scalar value are replaced with U+FFFD.
///
/// Text between escapes is copied as whole string slices, so non-ASCII
/// characters pass through unchanged.
pub fn unescape_unicode<'u>(input: &'u str) -> Cow<'u, str> {
    let first = match input.find('\\') {
        Some(pos) => pos,
        None => return Cow::Borrowed(input),
    };

    // Only a starting hint: a replaced escape can be longer than its source.
    let mut result = String::with_capacity(input.len());
    result.push_str(&input[..first]);

    let mut rest = &input[first..];
    while let Some(pos) = rest.find('\\') {
        result.push_str(&rest[..pos]);
        // The backslash is a single byte, so `pos + 1` is a char boundary.
        let after = &rest[pos + 1..];
        let (decoded, consumed) = decode_escape(after);
        result.push(decoded);
        rest = &after[consumed..];
    }
    result.push_str(rest);

    Cow::Owned(result)
}
67 changes: 0 additions & 67 deletions fluent-syntax/tests/ast/helper.rs

This file was deleted.

5 changes: 2 additions & 3 deletions fluent-syntax/tests/ast/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
mod helper;

use fluent_syntax::ast;
use fluent_syntax::unicode::unescape_unicode;
use serde::ser::SerializeMap;
use serde::ser::SerializeSeq;
use serde::{Serialize, Serializer};
Expand Down Expand Up @@ -360,7 +359,7 @@ where
let mut map = serializer.serialize_map(Some(3))?;
map.serialize_entry("type", "StringLiteral")?;
map.serialize_entry("raw", raw)?;
map.serialize_entry("value", &helper::unescape_unicode(&raw))?;
map.serialize_entry("value", &unescape_unicode(&raw))?;
map.end()
}

Expand Down
32 changes: 32 additions & 0 deletions fluent-syntax/tests/unicode.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
use fluent_syntax::unicode::unescape_unicode;
use std::borrow::Cow;

/// Returns `true` when `input` is the `Cow::Borrowed` variant and `false`
/// when it is `Cow::Owned`.
fn is_cow_borrowed<'a>(input: Cow<'a, str>) -> bool {
    match input {
        Cow::Borrowed(_) => true,
        Cow::Owned(_) => false,
    }
}

#[test]
fn unescape_unicode_test() {
    // Input without any escape sequence must come back as a borrow.
    assert!(is_cow_borrowed(unescape_unicode("foo")));

    // Pairs of (raw input, expected unescaped output).
    let cases = [
        ("foo", "foo"),
        ("foo \\\\", "foo \\"),
        ("foo \\\"", "foo \""),
        ("foo \\\\ faa", "foo \\ faa"),
        ("foo \\\\ faa \\\\ fii", "foo \\ faa \\ fii"),
        ("foo \\\\\\\" faa \\\"\\\\ fii", "foo \\\" faa \"\\ fii"),
        ("\\u0041\\u004F", "AO"),
        // Truncated or malformed escapes collapse to the replacement character.
        ("\\uA", "�"),
        ("\\uA0Pl", "�"),
        ("\\d Foo", "� Foo"),
    ];

    for (raw, expected) in cases.iter() {
        assert_eq!(unescape_unicode(raw), *expected);
    }
}