Skip to content

Commit 360f1dc

Browse files
committed
tokenizer: don't use utf8-logic to parse unquoted urls without escapes.
261,823 ns/iter (+/- 11,536)
1 parent 4972950 commit 360f1dc

File tree

1 file changed

+25
-11
lines changed

1 file changed

+25
-11
lines changed

src/tokenizer.rs

+25-11
Original file line number | Diff line number | Diff line change
@@ -344,6 +344,20 @@ impl<'a> Tokenizer<'a> {
344344
pub fn advance(&mut self, n: usize) { self.position += n }
345345

346346
// Assumes non-EOF
347+
#[inline]
348+
fn next_byte_unchecked(&self) -> u8 { self.byte_at(0) }
349+
350+
#[inline]
351+
fn byte_at(&self, offset: usize) -> u8 {
352+
self.input.as_bytes()[self.position + offset]
353+
}
354+
355+
#[inline]
356+
fn consume_byte(&mut self) -> u8 {
357+
self.position += 1;
358+
self.input.as_bytes()[self.position - 1]
359+
}
360+
347361
#[inline]
348362
fn next_char(&self) -> char { self.char_at(0) }
349363

@@ -812,11 +826,11 @@ fn consume_numeric<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
812826

813827

814828
fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
815-
for (offset, c) in tokenizer.input[tokenizer.position..].char_indices() {
829+
for (offset, c) in tokenizer.input[tokenizer.position..].as_bytes().iter().cloned().enumerate() {
816830
match c {
817-
' ' | '\t' | '\n' | '\r' | '\x0C' => {},
818-
'"' | '\'' => return Err(()), // Do not advance
819-
')' => {
831+
b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {},
832+
b'"' | b'\'' => return Err(()), // Do not advance
833+
b')' => {
820834
tokenizer.advance(offset + 1);
821835
return Ok(UnquotedUrl(Borrowed("")));
822836
}
@@ -836,28 +850,28 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
836850
if tokenizer.is_eof() {
837851
return UnquotedUrl(Borrowed(tokenizer.slice_from(start_pos)))
838852
}
839-
match tokenizer.next_char() {
840-
' ' | '\t' | '\n' | '\r' | '\x0C' => {
853+
match tokenizer.next_byte_unchecked() {
854+
b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
841855
let value = tokenizer.slice_from(start_pos);
842856
tokenizer.advance(1);
843857
return consume_url_end(tokenizer, Borrowed(value))
844858
}
845-
')' => {
859+
b')' => {
846860
let value = tokenizer.slice_from(start_pos);
847861
tokenizer.advance(1);
848862
return UnquotedUrl(Borrowed(value))
849863
}
850-
'\x01'...'\x08' | '\x0B' | '\x0E'...'\x1F' | '\x7F' // non-printable
851-
| '"' | '\'' | '(' => {
864+
b'\x01'...b'\x08' | b'\x0B' | b'\x0E'...b'\x1F' | b'\x7F' // non-printable
865+
| b'"' | b'\'' | b'(' => {
852866
tokenizer.advance(1);
853867
return consume_bad_url(tokenizer)
854868
},
855-
'\\' | '\0' => {
869+
b'\\' | b'\0' => {
856870
string = tokenizer.slice_from(start_pos).to_owned();
857871
break
858872
}
859873
_ => {
860-
tokenizer.consume_char();
874+
tokenizer.consume_byte();
861875
}
862876
}
863877
}

0 commit comments

Comments (0)