
Commit 0b14212

tokenizer: Do the same thing as the last commits in a bunch more places.

If this is still too slow, the other option would be to use a table lookup to scan the properties of the given byte in all these hot loops.
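For illustration, a minimal sketch of that table-lookup idea. All names here (BYTE_CLASS, NAME, WHITESPACE, is_name_byte) are hypothetical, not part of this commit, and the const-block initialization assumes a much newer Rust toolchain than this commit targeted:

// Hypothetical 256-entry property table: classify every byte once, up
// front, so each hot loop tests membership with a single indexed load
// and a bit mask instead of a multi-arm match.
const NAME: u8 = 1 << 0;       // a-z, A-Z, 0-9, '_', '-'
const WHITESPACE: u8 = 1 << 1; // space, \t, \n, \r, \x0C

static BYTE_CLASS: [u8; 256] = {
    let mut table = [0u8; 256];
    let mut i = 0;
    while i < 256 {
        let b = i as u8;
        if b.is_ascii_alphanumeric() || b == b'_' || b == b'-' {
            table[i] |= NAME;
        }
        if matches!(b, b' ' | b'\t' | b'\n' | b'\r' | b'\x0C') {
            table[i] |= WHITESPACE;
        }
        i += 1;
    }
    table
};

#[inline]
fn is_name_byte(b: u8) -> bool {
    BYTE_CLASS[b as usize] & NAME != 0
}

A loop like the one in consume_name below could then advance while is_name_byte(tokenizer.next_byte_unchecked()) holds, handling the rare cases (backslash, NUL, non-ASCII) out of line.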
1 parent fd86875 commit 0b14212

1 file changed: +96 −55 lines

src/tokenizer.rs (+96 −55)
@@ -575,7 +575,7 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
                           -> Result<Cow<'a, str>, ()> {
     tokenizer.advance(1); // Skip the initial quote
     let start_pos = tokenizer.position();
-    let mut string;
+    let mut string_bytes;
     loop {
         if tokenizer.is_eof() {
             return Ok(Borrowed(tokenizer.slice_from(start_pos)))
@@ -592,7 +592,7 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
                 return Ok(Borrowed(value))
             }
             b'\\' | b'\0' => {
-                string = tokenizer.slice_from(start_pos).to_owned();
+                string_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                 break
             }
             b'\n' | b'\r' | b'\x0C' => return Err(()),
@@ -606,10 +606,10 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
         if matches!(tokenizer.next_byte_unchecked(), b'\n' | b'\r' | b'\x0C') {
             return Err(());
         }
-        match tokenizer.consume_char() {
-            '"' if !single_quote => break,
-            '\'' if single_quote => break,
-            '\\' => {
+        match tokenizer.consume_byte() {
+            b'"' if !single_quote => break,
+            b'\'' if single_quote => break,
+            b'\\' => {
                 if !tokenizer.is_eof() {
                     match tokenizer.next_byte_unchecked() {
                         // Escaped newline
@@ -620,16 +620,22 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
                                 tokenizer.advance(1);
                             }
                         }
-                        _ => string.push(consume_escape(tokenizer))
+                        _ => consume_escape_and_write(tokenizer, &mut string_bytes)
                     }
                 }
                 // else: escaped EOF, do nothing.
             }
-            '\0' => string.push('\u{FFFD}'),
-            c => string.push(c),
+            b'\0' => {
+                // string.push('\u{FFFD}'),
+                string_bytes.push(0xef);
+                string_bytes.push(0xbf);
+                string_bytes.push(0xbd);
+            }
+            c => string_bytes.push(c),
         }
     }
-    Ok(Owned(string))
+
+    Ok(Owned(to_utf8(string_bytes)))
 }


@@ -650,7 +656,7 @@ fn is_ident_start(tokenizer: &mut Tokenizer) -> bool {

 fn consume_ident_like<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
     let value = consume_name(tokenizer);
-    if !tokenizer.is_eof() && tokenizer.next_char() == '(' {
+    if !tokenizer.is_eof() && tokenizer.next_byte_unchecked() == b'(' {
         tokenizer.advance(1);
         if value.eq_ignore_ascii_case("url") {
             consume_unquoted_url(tokenizer).unwrap_or(Function(value))
@@ -668,42 +674,51 @@ fn consume_ident_like<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {

 fn consume_name<'a>(tokenizer: &mut Tokenizer<'a>) -> Cow<'a, str> {
     let start_pos = tokenizer.position();
-    let mut value;
+    let mut value_bytes;
     loop {
         if tokenizer.is_eof() {
             return Borrowed(tokenizer.slice_from(start_pos))
         }
-        match tokenizer.next_char() {
-            'a'...'z' | 'A'...'Z' | '0'...'9' | '_' | '-' => tokenizer.advance(1),
-            '\\' | '\0' => {
-                value = tokenizer.slice_from(start_pos).to_owned();
+        match tokenizer.next_byte_unchecked() {
+            b'a'...b'z' | b'A'...b'Z' | b'0'...b'9' | b'_' | b'-' => tokenizer.advance(1),
+            b'\\' | b'\0' => {
+                value_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                 break
             }
             c if c.is_ascii() => return Borrowed(tokenizer.slice_from(start_pos)),
             _ => {
-                tokenizer.consume_char();
+                tokenizer.advance(1);
             }
         }
     }

     while !tokenizer.is_eof() {
-        let c = tokenizer.next_char();
-        value.push(match c {
-            'a'...'z' | 'A'...'Z' | '0'...'9' | '_' | '-' => {
+        let c = tokenizer.next_byte_unchecked();
+        match c {
+            b'a'...b'z' | b'A'...b'Z' | b'0'...b'9' | b'_' | b'-' => {
                 tokenizer.advance(1);
-                c
+                value_bytes.push(c)
             }
-            '\\' => {
+            b'\\' => {
                 if tokenizer.has_newline_at(1) { break }
                 tokenizer.advance(1);
-                consume_escape(tokenizer)
+                consume_escape_and_write(tokenizer, &mut value_bytes)
             }
-            '\0' => { tokenizer.advance(1); '\u{FFFD}' },
+            b'\0' => {
+                tokenizer.advance(1);
+                // value.push('\u{FFFD}')
+                value_bytes.push(0xef);
+                value_bytes.push(0xbf);
+                value_bytes.push(0xbd);
+            },
             c if c.is_ascii() => break,
-            _ => tokenizer.consume_char(),
-        })
+            other => {
+                tokenizer.advance(1);
+                value_bytes.push(other)
+            }
+        }
     }
-    Owned(value)
+    Owned(to_utf8(value_bytes))
 }


@@ -825,7 +840,19 @@ fn consume_numeric<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
 }


+#[inline]
+fn to_utf8(string_bytes: Vec<u8>) -> String {
+    if cfg!(debug_assertions) {
+        String::from_utf8(string_bytes).unwrap()
+    } else {
+        unsafe {
+            String::from_utf8_unchecked(string_bytes)
+        }
+    }
+}
+
 fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
+
     for (offset, c) in tokenizer.input[tokenizer.position..].as_bytes().iter().cloned().enumerate() {
         match c {
             b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {},
@@ -845,7 +872,7 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,

     fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
         let start_pos = tokenizer.position();
-        let mut string;
+        let mut string_bytes: Vec<u8>;
         loop {
             if tokenizer.is_eof() {
                 return UnquotedUrl(Borrowed(tokenizer.slice_from(start_pos)))
@@ -867,7 +894,7 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
                     return consume_bad_url(tokenizer)
                 },
                 b'\\' | b'\0' => {
-                    string = tokenizer.slice_from(start_pos).to_owned();
+                    string_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                     break
                 }
                 _ => {
@@ -876,32 +903,37 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
             }
         }
         while !tokenizer.is_eof() {
-            let next_char = match tokenizer.consume_char() {
-                ' ' | '\t' | '\n' | '\r' | '\x0C' => {
-                    return consume_url_end(tokenizer, Owned(string))
+            match tokenizer.consume_byte() {
+                b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
+                    return consume_url_end(tokenizer, Owned(to_utf8(string_bytes)));
                 }
-                ')' => break,
-                '\x01'...'\x08' | '\x0B' | '\x0E'...'\x1F' | '\x7F' // non-printable
-                | '"' | '\'' | '(' => return consume_bad_url(tokenizer),
-                '\\' => {
+                b')' => break,
+                b'\x01'...b'\x08' | b'\x0B' | b'\x0E'...b'\x1F' | b'\x7F' // non-printable
+                | b'"' | b'\'' | b'(' => return consume_bad_url(tokenizer),
+                b'\\' => {
                     if tokenizer.has_newline_at(0) {
                         return consume_bad_url(tokenizer)
                     }
-                    consume_escape(tokenizer)
+
+                    consume_escape_and_write(tokenizer, &mut string_bytes)
                 },
-                '\0' => '\u{FFFD}',
-                c => c
-            };
-            string.push(next_char)
+                b'\0' => {
+                    // string.push('\u{FFFD}');
+                    string_bytes.push(0xef);
+                    string_bytes.push(0xbf);
+                    string_bytes.push(0xbd);
+                }
+                c => string_bytes.push(c)
+            }
         }
-        UnquotedUrl(Owned(string))
+        UnquotedUrl(Owned(to_utf8(string_bytes)))
     }

     fn consume_url_end<'a>(tokenizer: &mut Tokenizer<'a>, string: Cow<'a, str>) -> Token<'a> {
         while !tokenizer.is_eof() {
-            match tokenizer.consume_char() {
-                ' ' | '\t' | '\n' | '\r' | '\x0C' => (),
-                ')' => break,
+            match tokenizer.consume_byte() {
+                b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => (),
+                b')' => break,
                 _ => return consume_bad_url(tokenizer)
             }
         }
@@ -911,9 +943,9 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
     fn consume_bad_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
         // Consume up to the closing )
        while !tokenizer.is_eof() {
-            match tokenizer.consume_char() {
-                ')' => break,
-                '\\' => tokenizer.advance(1), // Skip an escaped ')' or '\'
+            match tokenizer.consume_byte() {
+                b')' => break,
+                b'\\' => tokenizer.advance(1), // Skip an escaped ')' or '\'
                 _ => ()
             }
         }
@@ -972,20 +1004,29 @@ fn consume_hex_digits<'a>(tokenizer: &mut Tokenizer<'a>) -> (u32, u32) {
 }


+// Same constraints as consume_escape except it writes into `bytes` the result
+// instead of returning it.
+//
+// TODO: This could be made more efficient with char::encode_utf8, I guess.
+fn consume_escape_and_write(tokenizer: &mut Tokenizer, bytes: &mut Vec<u8>) {
+    use std::io::Write;
+    write!(bytes, "{}", consume_escape(tokenizer)).unwrap();
+}
+
 // Assumes that the U+005C REVERSE SOLIDUS (\) has already been consumed
 // and that the next input character has already been verified
 // to not be a newline.
 fn consume_escape(tokenizer: &mut Tokenizer) -> char {
     if tokenizer.is_eof() { return '\u{FFFD}' } // Escaped EOF
-    match tokenizer.next_char() {
-        '0'...'9' | 'A'...'F' | 'a'...'f' => {
+    match tokenizer.next_byte_unchecked() {
+        b'0'...b'9' | b'A'...b'F' | b'a'...b'f' => {
             let (c, _) = consume_hex_digits(tokenizer);
             if !tokenizer.is_eof() {
-                match tokenizer.next_char() {
-                    ' ' | '\t' | '\n' | '\x0C' => tokenizer.advance(1),
-                    '\r' => {
+                match tokenizer.next_byte_unchecked() {
+                    b' ' | b'\t' | b'\n' | b'\x0C' => tokenizer.advance(1),
+                    b'\r' => {
                         tokenizer.advance(1);
-                        if !tokenizer.is_eof() && tokenizer.next_char() == '\n' {
+                        if !tokenizer.is_eof() && tokenizer.next_byte_unchecked() == b'\n' {
                             tokenizer.advance(1);
                         }
                     }
@@ -1000,7 +1041,7 @@ fn consume_escape(tokenizer: &mut Tokenizer) -> char {
                 REPLACEMENT_CHAR
             }
         },
-        '\0' => {
+        b'\0' => {
             tokenizer.advance(1);
             '\u{FFFD}'
         }
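Two notes on the diff above. The three pushes of 0xef, 0xbf, 0xbd are the UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER, matching the commented-out string.push('\u{FFFD}') calls they replace. And for the TODO on consume_escape_and_write, a possible sketch of the char::encode_utf8 variant it mentions (stable since Rust 1.15, so it assumes a newer toolchain than the write!-based version needed) that skips the std::fmt machinery:

fn consume_escape_and_write(tokenizer: &mut Tokenizer, bytes: &mut Vec<u8>) {
    // Encode the escaped char into a 4-byte stack buffer, then append
    // its UTF-8 bytes directly; no fmt::Formatter or io::Write involved.
    let mut buf = [0u8; 4];
    let encoded = consume_escape(tokenizer).encode_utf8(&mut buf);
    bytes.extend_from_slice(encoded.as_bytes());
}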
