
Improve performance of the uts46 crate #453


Merged
merged 13 commits on Jun 27, 2018
3 changes: 3 additions & 0 deletions .travis.yml
@@ -8,6 +8,9 @@ jobs:
          - cargo update
          # getopts is only used in tests. Its versions 0.2.16+ don’t build on 1.17.0
          - cargo update -p getopts --precise 0.2.15

          - cargo update -p unicode-normalization --precise 0.1.5

        # data-url uses pub(crate) which is unstable in 1.17
        script: cargo test --all-features -p url -p idna -p percent-encoding -p url_serde

6 changes: 6 additions & 0 deletions Cargo.toml
@@ -35,6 +35,8 @@ rustc-test = "0.3"
rustc-serialize = "0.3"
serde_json = ">=0.6.1, <0.9"

bencher = "0.1"

[features]
query_encoding = ["encoding"]
heap_size = ["heapsize"]
@@ -48,5 +50,9 @@ percent-encoding = { version = "1.0.0", path = "./percent_encoding" }
rustc-serialize = {version = "0.3", optional = true}
serde = {version = ">=0.6.1, <0.9", optional = true}

[[bench]]
name = "parse_url"
harness = false

[package.metadata.docs.rs]
features = ["query_encoding"]
18 changes: 18 additions & 0 deletions benches/parse_url.rs
@@ -0,0 +1,18 @@
#[macro_use]
extern crate bencher;

extern crate url;

use bencher::{black_box, Bencher};

use url::Url;

fn short(bench: &mut Bencher) {
    let url = "https://example.com/bench";

    bench.bytes = url.len() as u64;
    bench.iter(|| black_box(url).parse::<Url>().unwrap());
}

benchmark_group!(benches, short);
benchmark_main!(benches);
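
For context: `harness = false` in Cargo.toml tells Cargo to skip the built-in libtest harness, so `benchmark_main!` provides `main` and the benchmark runs with `cargo bench` from the repository root. As a purely illustrative sketch (not part of this diff), a second case covering a longer URL would follow the same pattern; the function name and URL below are made up:

#[macro_use]
extern crate bencher;
extern crate url;

use bencher::{black_box, Bencher};
use url::Url;

// Hypothetical extra case: a longer URL with query and fragment, to see how
// parsing cost scales with input length. Throughput is reported in bytes/sec,
// as in the `short` benchmark above.
fn long(bench: &mut Bencher) {
    let url = "https://sub.example.com/some/deeply/nested/path?key=value&other=1#fragment";

    bench.bytes = url.len() as u64;
    bench.iter(|| black_box(url).parse::<Url>().unwrap());
}

benchmark_group!(benches, long);
benchmark_main!(benches);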
69 changes: 61 additions & 8 deletions idna/src/make_uts46_mapping_table.py
@@ -10,6 +10,7 @@
# You can get the latest idna table from
# http://www.unicode.org/Public/idna/latest/IdnaMappingTable.txt

from __future__ import print_function
import collections
import itertools

@@ -23,8 +24,6 @@
// except according to those terms.

// Generated by make_idna_table.py

static TABLE: &'static [Range] = &[
''')

txt = open("IdnaMappingTable.txt")
@@ -84,6 +83,7 @@ def rust_slice(s):

def mergeable_key(r):
    mapping = r[2]

    # These types have associated data, so we should not merge them.
    if mapping in ('Mapped', 'Deviation', 'DisallowedStd3Mapped'):
        return r
@@ -123,12 +123,65 @@ def mergeable_key(r):
    unicode_str = group[0][3]
    optimized_ranges.append((first, last, mapping, unicode_str))

for (first, last, mapping, unicode_str) in optimized_ranges:
    if unicode_str is not None:
        mapping += rust_slice(strtab_slice(unicode_str))
    print(" Range { from: '%s', to: '%s', mapping: %s }," % (escape_char(char(first)),
                                                             escape_char(char(last)),
                                                             mapping))
def is_single_char_range(r):
    (first, last, _, _) = r
    return first == last

# We can reduce the size of the character range table and the index table to about 1/4
# by merging runs of single character ranges and using character offsets from the start
# of that range to retrieve the correct `Mapping` value
def merge_single_char_ranges(ranges):
    current = []
    for r in ranges:
        if not current or is_single_char_range(current[-1]) and is_single_char_range(r):
            current.append(r)
            continue
        if len(current) != 0:
            ret = current
            current = [r]
            yield ret
            continue
        current.append(r)
        ret = current
        current = []
        yield ret
    yield current

optimized_ranges = list(merge_single_char_ranges(optimized_ranges))


print("static TABLE: &'static [Range] = &[")

for ranges in optimized_ranges:
    first = ranges[0][0]
    last = ranges[-1][1]
    print(" Range { from: '%s', to: '%s', }," % (escape_char(char(first)),
                                                 escape_char(char(last))))

print("];\n")

print("static INDEX_TABLE: &'static [u16] = &[")

SINGLE_MARKER = 1 << 15

offset = 0
for ranges in optimized_ranges:
    assert offset < SINGLE_MARKER

    block_len = len(ranges)
    single = SINGLE_MARKER if block_len == 1 else 0
    print(" %s," % (offset | single))
    offset += block_len

print("];\n")

print("static MAPPING_TABLE: &'static [Mapping] = &[")

for ranges in optimized_ranges:
    for (first, last, mapping, unicode_str) in ranges:
        if unicode_str is not None:
            mapping += rust_slice(strtab_slice(unicode_str))
        print(" %s," % mapping)

print("];\n")

1 change: 1 addition & 0 deletions idna/src/punycode.rs
@@ -15,6 +15,7 @@

use std::u32;
use std::char;
#[allow(unused_imports, deprecated)]
use std::ascii::AsciiExt;

// Bootstring parameters for Punycode
38 changes: 28 additions & 10 deletions idna/src/uts46.rs
@@ -11,6 +11,7 @@

use self::Mapping::*;
use punycode;
#[allow(unused_imports, deprecated)]
use std::ascii::AsciiExt;
use std::cmp::Ordering::{Equal, Less, Greater};
use unicode_bidi::{BidiClass, bidi_class};
@@ -55,7 +56,6 @@ enum Mapping {
struct Range {
    from: char,
    to: char,
    mapping: Mapping,
}

fn find_char(codepoint: char) -> &'static Mapping {
@@ -68,7 +68,19 @@ fn find_char(codepoint: char) -> &'static Mapping {
            Equal
        }
    });
    r.ok().map(|i| &TABLE[i].mapping).unwrap()
    r.ok().map(|i| {
        const SINGLE_MARKER: u16 = 1 << 15;

        let x = INDEX_TABLE[i];
        let single = (x & SINGLE_MARKER) != 0;
        let offset = !SINGLE_MARKER & x;

        if single {
            &MAPPING_TABLE[offset as usize]
        } else {
            &MAPPING_TABLE[(offset + (codepoint as u16 - TABLE[i].from as u16)) as usize]
        }
    }).unwrap()
}

fn map_char(codepoint: char, flags: Flags, output: &mut String, errors: &mut Vec<Error>) {
@@ -221,17 +233,21 @@ fn passes_bidi(label: &str, is_bidi_domain: bool) -> bool {
}

/// http://www.unicode.org/reports/tr46/#Validity_Criteria
fn validate_full(label: &str, is_bidi_domain: bool, flags: Flags, errors: &mut Vec<Error>) {
    // V1: Must be in NFC form.
    if label.nfc().ne(label.chars()) {
        errors.push(Error::ValidityCriteria);
    } else {
        validate(label, is_bidi_domain, flags, errors);
    }
}

fn validate(label: &str, is_bidi_domain: bool, flags: Flags, errors: &mut Vec<Error>) {
    let first_char = label.chars().next();
    if first_char == None {
        // Empty string, pass
    }

    // V1: Must be in NFC form.
    else if label.nfc().ne(label.chars()) {
        errors.push(Error::ValidityCriteria);
    }

    // V2: No U+002D HYPHEN-MINUS in both third and fourth positions.
    //
    // NOTE: Spec says that the label must not contain a HYPHEN-MINUS character in both the
@@ -279,11 +295,12 @@ fn validate(label: &str, is_bidi_domain: bool, flags: Flags, errors: &mut Vec<Error>) {

/// http://www.unicode.org/reports/tr46/#Processing
fn processing(domain: &str, flags: Flags, errors: &mut Vec<Error>) -> String {
    let mut mapped = String::new();
    let mut mapped = String::with_capacity(domain.len());
    for c in domain.chars() {
        map_char(c, flags, &mut mapped, errors)
    }
    let normalized: String = mapped.nfc().collect();
    let mut normalized = String::with_capacity(mapped.len());
    normalized.extend(mapped.nfc());

    // Find out if it's a Bidi Domain Name
    //
@@ -322,12 +339,13 @@ fn processing(domain: &str, flags: Flags, errors: &mut Vec<Error>) -> String {
            match punycode::decode_to_string(&label[PUNYCODE_PREFIX.len()..]) {
                Some(decoded_label) => {
                    let flags = Flags { transitional_processing: false, ..flags };
                    validate(&decoded_label, is_bidi_domain, flags, errors);
                    validate_full(&decoded_label, is_bidi_domain, flags, errors);
                    validated.push_str(&decoded_label)
                }
                None => errors.push(Error::PunycodeError)
            }
        } else {
            // `normalized` is already `NFC` so we can skip that check
            validate(label, is_bidi_domain, flags, errors);
            validated.push_str(label)
        }
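
Since the patch only reworks the internal table layout and preallocates a few buffers, the crate's observable behavior should be unchanged. A minimal end-to-end check of that expectation against the public API (hypothetical, not part of this diff) might look like:

extern crate idna;

// UTS #46 processing (case mapping, punycode encoding) should give the same
// results before and after the table layout change.
fn main() {
    assert_eq!(idna::domain_to_ascii("EXAMPLE.com").unwrap(), "example.com");
    assert_eq!(idna::domain_to_ascii("bücher.de").unwrap(), "xn--bcher-kva.de");
}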