
Improve performance of the uts46 crate #453


Merged
merged 13 commits on Jun 27, 2018
3 changes: 3 additions & 0 deletions .travis.yml
@@ -8,6 +8,9 @@ jobs:
          - cargo update
          # getopts is only used in tests. Its versions 0.2.16+ don’t build on 1.17.0
          - cargo update -p getopts --precise 0.2.15

          - cargo update -p unicode-normalization --precise 0.1.5

        # data-url uses pub(crate) which is unstable in 1.17
        script: cargo test --all-features -p url -p idna -p percent-encoding -p url_serde

6 changes: 6 additions & 0 deletions Cargo.toml
@@ -35,6 +35,8 @@ rustc-test = "0.3"
rustc-serialize = "0.3"
serde_json = ">=0.6.1, <0.9"

bencher = "0.1"

[features]
query_encoding = ["encoding"]
heap_size = ["heapsize"]
@@ -48,5 +50,9 @@ percent-encoding = { version = "1.0.0", path = "./percent_encoding" }
rustc-serialize = {version = "0.3", optional = true}
serde = {version = ">=0.6.1, <0.9", optional = true}

[[bench]]
name = "parse_url"
harness = false

[package.metadata.docs.rs]
features = ["query_encoding"]
18 changes: 18 additions & 0 deletions benches/parse_url.rs
@@ -0,0 +1,18 @@
#[macro_use]
extern crate bencher;

extern crate url;

use bencher::{black_box, Bencher};

use url::Url;

fn short(bench: &mut Bencher) {
    let url = "https://example.com/bench";

    bench.bytes = url.len() as u64;
    bench.iter(|| black_box(url).parse::<Url>().unwrap());
}

benchmark_group!(benches, short);
benchmark_main!(benches);
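
For context: `harness = false` in Cargo.toml tells Cargo to skip the built-in libtest harness, so `benchmark_main!` provides `main` and the benchmark runs with `cargo bench` from the repository root. As a purely illustrative sketch (not part of this diff), a second case covering a longer URL would follow the same pattern; the function name and URL below are made up:

#[macro_use]
extern crate bencher;
extern crate url;

use bencher::{black_box, Bencher};
use url::Url;

// Hypothetical extra case: a longer URL with query and fragment, to see how
// parsing cost scales with input length. Throughput is reported in bytes/sec,
// as in the `short` benchmark above.
fn long(bench: &mut Bencher) {
    let url = "https://sub.example.com/some/deeply/nested/path?key=value&other=1#fragment";

    bench.bytes = url.len() as u64;
    bench.iter(|| black_box(url).parse::<Url>().unwrap());
}

benchmark_group!(benches, long);
benchmark_main!(benches);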
69 changes: 61 additions & 8 deletions idna/src/make_uts46_mapping_table.py
@@ -10,6 +10,7 @@
# You can get the latest idna table from
# http://www.unicode.org/Public/idna/latest/IdnaMappingTable.txt

from __future__ import print_function
import collections
import itertools

@@ -23,8 +24,6 @@
// except according to those terms.

// Generated by make_idna_table.py

static TABLE: &'static [Range] = &[
''')

txt = open("IdnaMappingTable.txt")
@@ -84,6 +83,7 @@ def rust_slice(s):

def mergeable_key(r):
    mapping = r[2]

    # These types have associated data, so we should not merge them.
    if mapping in ('Mapped', 'Deviation', 'DisallowedStd3Mapped'):
        return r
@@ -123,12 +123,65 @@ def mergeable_key(r):
    unicode_str = group[0][3]
    optimized_ranges.append((first, last, mapping, unicode_str))

for (first, last, mapping, unicode_str) in optimized_ranges:
    if unicode_str is not None:
        mapping += rust_slice(strtab_slice(unicode_str))
    print(" Range { from: '%s', to: '%s', mapping: %s }," % (escape_char(char(first)),
                                                             escape_char(char(last)),
                                                             mapping))
def is_single_char_range(r):
    (first, last, _, _) = r
    return first == last

# We can reduce the size of the character range table and the index table to about 1/4
# by merging runs of single character ranges and using character offsets from the start
# of that range to retrieve the correct `Mapping` value
def merge_single_char_ranges(ranges):
    current = []
    for r in ranges:
        if not current or is_single_char_range(current[-1]) and is_single_char_range(r):
            current.append(r)
            continue
        if len(current) != 0:
            ret = current
            current = [r]
            yield ret
            continue
        current.append(r)
        ret = current
        current = []
        yield ret
    yield current

optimized_ranges = list(merge_single_char_ranges(optimized_ranges))


print("static TABLE: &'static [Range] = &[")

for ranges in optimized_ranges:
    first = ranges[0][0]
    last = ranges[-1][1]
    print(" Range { from: '%s', to: '%s', }," % (escape_char(char(first)),
                                                 escape_char(char(last))))

print("];\n")

print("static INDEX_TABLE: &'static [u16] = &[")

SINGLE_MARKER = 1 << 15

offset = 0
for ranges in optimized_ranges:
    assert offset < SINGLE_MARKER

    block_len = len(ranges)
    single = SINGLE_MARKER if block_len == 1 else 0
    print(" %s," % (offset | single))
    offset += block_len

print("];\n")

print("static MAPPING_TABLE: &'static [Mapping] = &[")

for ranges in optimized_ranges:
    for (first, last, mapping, unicode_str) in ranges:
        if unicode_str is not None:
            mapping += rust_slice(strtab_slice(unicode_str))
        print(" %s," % mapping)

print("];\n")

1 change: 1 addition & 0 deletions idna/src/punycode.rs
@@ -15,6 +15,7 @@

use std::u32;
use std::char;
#[allow(unused_imports, deprecated)]
use std::ascii::AsciiExt;

// Bootstring parameters for Punycode
38 changes: 28 additions & 10 deletions idna/src/uts46.rs
@@ -11,6 +11,7 @@

use self::Mapping::*;
use punycode;
#[allow(unused_imports, deprecated)]
use std::ascii::AsciiExt;
use std::cmp::Ordering::{Equal, Less, Greater};
use unicode_bidi::{BidiClass, bidi_class};
@@ -55,7 +56,6 @@ enum Mapping {
struct Range {
    from: char,
    to: char,
    mapping: Mapping,
}

fn find_char(codepoint: char) -> &'static Mapping {
@@ -68,7 +68,19 @@ fn find_char(codepoint: char) -> &'static Mapping {
            Equal
        }
    });
    r.ok().map(|i| &TABLE[i].mapping).unwrap()
    r.ok().map(|i| {
        const SINGLE_MARKER: u16 = 1 << 15;

        let x = INDEX_TABLE[i];
        let single = (x & SINGLE_MARKER) != 0;
        let offset = !SINGLE_MARKER & x;

        if single {
            &MAPPING_TABLE[offset as usize]
        } else {
            &MAPPING_TABLE[(offset + (codepoint as u16 - TABLE[i].from as u16)) as usize]
        }
    }).unwrap()
}

fn map_char(codepoint: char, flags: Flags, output: &mut String, errors: &mut Vec<Error>) {
@@ -221,17 +233,21 @@ fn passes_bidi(label: &str, is_bidi_domain: bool) -> bool {
}

/// http://www.unicode.org/reports/tr46/#Validity_Criteria
fn validate_full(label: &str, is_bidi_domain: bool, flags: Flags, errors: &mut Vec<Error>) {
    // V1: Must be in NFC form.
    if label.nfc().ne(label.chars()) {
        errors.push(Error::ValidityCriteria);
    } else {
        validate(label, is_bidi_domain, flags, errors);
    }
}

fn validate(label: &str, is_bidi_domain: bool, flags: Flags, errors: &mut Vec<Error>) {
    let first_char = label.chars().next();
    if first_char == None {
        // Empty string, pass
    }

    // V1: Must be in NFC form.
    else if label.nfc().ne(label.chars()) {
        errors.push(Error::ValidityCriteria);
    }

    // V2: No U+002D HYPHEN-MINUS in both third and fourth positions.
    //
    // NOTE: Spec says that the label must not contain a HYPHEN-MINUS character in both the
@@ -279,11 +295,12 @@ fn validate(label: &str, is_bidi_domain: bool, flags: Flags, errors: &mut Vec<Error>) {

/// http://www.unicode.org/reports/tr46/#Processing
fn processing(domain: &str, flags: Flags, errors: &mut Vec<Error>) -> String {
    let mut mapped = String::new();
    let mut mapped = String::with_capacity(domain.len());
    for c in domain.chars() {
        map_char(c, flags, &mut mapped, errors)
    }
    let normalized: String = mapped.nfc().collect();
    let mut normalized = String::with_capacity(mapped.len());
    normalized.extend(mapped.nfc());

    // Find out if it's a Bidi Domain Name
    //
@@ -322,12 +339,13 @@ fn processing(domain: &str, flags: Flags, errors: &mut Vec<Error>) -> String {
            match punycode::decode_to_string(&label[PUNYCODE_PREFIX.len()..]) {
                Some(decoded_label) => {
                    let flags = Flags { transitional_processing: false, ..flags };
                    validate(&decoded_label, is_bidi_domain, flags, errors);
                    validate_full(&decoded_label, is_bidi_domain, flags, errors);
                    validated.push_str(&decoded_label)
                }
                None => errors.push(Error::PunycodeError)
            }
        } else {
            // `normalized` is already `NFC` so we can skip that check
            validate(label, is_bidi_domain, flags, errors);
            validated.push_str(label)
        }
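
Since the patch only reworks the internal table layout and preallocates a few buffers, the crate's observable behavior should be unchanged. A minimal end-to-end check of that expectation against the public API (hypothetical, not part of this diff) might look like:

extern crate idna;

// UTS #46 processing (case mapping, punycode encoding) should give the same
// results before and after the table layout change.
fn main() {
    assert_eq!(idna::domain_to_ascii("EXAMPLE.com").unwrap(), "example.com");
    assert_eq!(idna::domain_to_ascii("bücher.de").unwrap(), "xn--bcher-kva.de");
}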