@@ -575,7 +575,7 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
575
575
-> Result < Cow < ' a , str > , ( ) > {
576
576
tokenizer. advance ( 1 ) ; // Skip the initial quote
577
577
let start_pos = tokenizer. position ( ) ;
578
- let mut string ;
578
+ let mut string_bytes ;
579
579
loop {
580
580
if tokenizer. is_eof ( ) {
581
581
return Ok ( Borrowed ( tokenizer. slice_from ( start_pos) ) )
@@ -592,7 +592,7 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
592
592
return Ok ( Borrowed ( value) )
593
593
}
594
594
b'\\' | b'\0' => {
595
- string = tokenizer. slice_from ( start_pos) . to_owned ( ) ;
595
+ string_bytes = tokenizer. slice_from ( start_pos) . as_bytes ( ) . to_owned ( ) ;
596
596
break
597
597
}
598
598
b'\n' | b'\r' | b'\x0C' => return Err ( ( ) ) ,
@@ -606,10 +606,10 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
606
606
if matches ! ( tokenizer. next_byte_unchecked( ) , b'\n' | b'\r' | b'\x0C' ) {
607
607
return Err ( ( ) ) ;
608
608
}
609
- match tokenizer. consume_char ( ) {
610
- '"' if !single_quote => break ,
611
- '\'' if single_quote => break ,
612
- '\\' => {
609
+ match tokenizer. consume_byte ( ) {
610
+ b '"' if !single_quote => break ,
611
+ b '\'' if single_quote => break ,
612
+ b '\\' => {
613
613
if !tokenizer. is_eof ( ) {
614
614
match tokenizer. next_byte_unchecked ( ) {
615
615
// Escaped newline
@@ -620,16 +620,22 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
620
620
tokenizer. advance ( 1 ) ;
621
621
}
622
622
}
623
- _ => string . push ( consume_escape ( tokenizer) )
623
+ _ => consume_escape_and_write ( tokenizer, & mut string_bytes )
624
624
}
625
625
}
626
626
// else: escaped EOF, do nothing.
627
627
}
628
- '\0' => string. push ( '\u{FFFD}' ) ,
629
- c => string. push ( c) ,
628
+ b'\0' => {
629
+ // string.push('\u{FFFD}'),
630
+ string_bytes. push ( 0xef ) ;
631
+ string_bytes. push ( 0xbf ) ;
632
+ string_bytes. push ( 0xbd ) ;
633
+ }
634
+ c => string_bytes. push ( c) ,
630
635
}
631
636
}
632
- Ok ( Owned ( string) )
637
+
638
+ Ok ( Owned ( to_utf8 ( string_bytes) ) )
633
639
}
634
640
635
641
@@ -650,7 +656,7 @@ fn is_ident_start(tokenizer: &mut Tokenizer) -> bool {
650
656
651
657
fn consume_ident_like < ' a > ( tokenizer : & mut Tokenizer < ' a > ) -> Token < ' a > {
652
658
let value = consume_name ( tokenizer) ;
653
- if !tokenizer. is_eof ( ) && tokenizer. next_char ( ) == '(' {
659
+ if !tokenizer. is_eof ( ) && tokenizer. next_byte_unchecked ( ) == b '(' {
654
660
tokenizer. advance ( 1 ) ;
655
661
if value. eq_ignore_ascii_case ( "url" ) {
656
662
consume_unquoted_url ( tokenizer) . unwrap_or ( Function ( value) )
@@ -668,42 +674,51 @@ fn consume_ident_like<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
668
674
669
675
fn consume_name < ' a > ( tokenizer : & mut Tokenizer < ' a > ) -> Cow < ' a , str > {
670
676
let start_pos = tokenizer. position ( ) ;
671
- let mut value ;
677
+ let mut value_bytes ;
672
678
loop {
673
679
if tokenizer. is_eof ( ) {
674
680
return Borrowed ( tokenizer. slice_from ( start_pos) )
675
681
}
676
- match tokenizer. next_char ( ) {
677
- 'a' ...'z' | 'A' ...'Z' | '0' ...'9' | '_' | '-' => tokenizer. advance ( 1 ) ,
678
- '\\' | '\0' => {
679
- value = tokenizer. slice_from ( start_pos) . to_owned ( ) ;
682
+ match tokenizer. next_byte_unchecked ( ) {
683
+ b 'a' ...b 'z' | b 'A' ...b 'Z' | b '0' ...b '9' | b '_' | b '-' => tokenizer. advance ( 1 ) ,
684
+ b '\\' | b '\0' => {
685
+ value_bytes = tokenizer. slice_from ( start_pos) . as_bytes ( ) . to_owned ( ) ;
680
686
break
681
687
}
682
688
c if c. is_ascii ( ) => return Borrowed ( tokenizer. slice_from ( start_pos) ) ,
683
689
_ => {
684
- tokenizer. consume_char ( ) ;
690
+ tokenizer. advance ( 1 ) ;
685
691
}
686
692
}
687
693
}
688
694
689
695
while !tokenizer. is_eof ( ) {
690
- let c = tokenizer. next_char ( ) ;
691
- value . push ( match c {
692
- 'a' ...'z' | 'A' ...'Z' | '0' ...'9' | '_' | '-' => {
696
+ let c = tokenizer. next_byte_unchecked ( ) ;
697
+ match c {
698
+ b 'a' ...b 'z' | b 'A' ...b 'Z' | b '0' ...b '9' | b '_' | b '-' => {
693
699
tokenizer. advance ( 1 ) ;
694
- c
700
+ value_bytes . push ( c )
695
701
}
696
- '\\' => {
702
+ b '\\' => {
697
703
if tokenizer. has_newline_at ( 1 ) { break }
698
704
tokenizer. advance ( 1 ) ;
699
- consume_escape ( tokenizer)
705
+ consume_escape_and_write ( tokenizer, & mut value_bytes )
700
706
}
701
- '\0' => { tokenizer. advance ( 1 ) ; '\u{FFFD}' } ,
707
+ b'\0' => {
708
+ tokenizer. advance ( 1 ) ;
709
+ // value.push('\u{FFFD}')
710
+ value_bytes. push ( 0xef ) ;
711
+ value_bytes. push ( 0xbf ) ;
712
+ value_bytes. push ( 0xbd ) ;
713
+ } ,
702
714
c if c. is_ascii ( ) => break ,
703
- _ => tokenizer. consume_char ( ) ,
704
- } )
715
+ other => {
716
+ tokenizer. advance ( 1 ) ;
717
+ value_bytes. push ( other)
718
+ }
719
+ }
705
720
}
706
- Owned ( value )
721
+ Owned ( to_utf8 ( value_bytes ) )
707
722
}
708
723
709
724
@@ -825,7 +840,19 @@ fn consume_numeric<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
825
840
}
826
841
827
842
843
// Converts a byte buffer built by the tokenizer into a `String`.
//
// When `debug_assertions` is enabled the bytes are validated and invalid
// UTF-8 panics. Otherwise validation is skipped, so the caller is responsible
// for passing valid UTF-8.
#[inline]
fn to_utf8(string_bytes: Vec<u8>) -> String {
    if cfg!(debug_assertions) {
        String::from_utf8(string_bytes)
            .expect("tokenizer byte buffers must contain only valid UTF-8")
    } else {
        // SAFETY: sound only if `string_bytes` is valid UTF-8.
        // NOTE(review): callers are expected to push only ASCII bytes, whole
        // multi-byte sequences copied from the `&str` input, and encoded
        // `char`s -- confirm this for any new caller.
        unsafe { String::from_utf8_unchecked(string_bytes) }
    }
}
853
+
828
854
fn consume_unquoted_url < ' a > ( tokenizer : & mut Tokenizer < ' a > ) -> Result < Token < ' a > , ( ) > {
855
+
829
856
for ( offset, c) in tokenizer. input [ tokenizer. position ..] . as_bytes ( ) . iter ( ) . cloned ( ) . enumerate ( ) {
830
857
match c {
831
858
b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => { } ,
@@ -845,7 +872,7 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
845
872
846
873
fn consume_unquoted_url < ' a > ( tokenizer : & mut Tokenizer < ' a > ) -> Token < ' a > {
847
874
let start_pos = tokenizer. position ( ) ;
848
- let mut string ;
875
+ let mut string_bytes : Vec < u8 > ;
849
876
loop {
850
877
if tokenizer. is_eof ( ) {
851
878
return UnquotedUrl ( Borrowed ( tokenizer. slice_from ( start_pos) ) )
@@ -867,7 +894,7 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
867
894
return consume_bad_url ( tokenizer)
868
895
} ,
869
896
b'\\' | b'\0' => {
870
- string = tokenizer. slice_from ( start_pos) . to_owned ( ) ;
897
+ string_bytes = tokenizer. slice_from ( start_pos) . as_bytes ( ) . to_owned ( ) ;
871
898
break
872
899
}
873
900
_ => {
@@ -876,32 +903,37 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
876
903
}
877
904
}
878
905
while !tokenizer. is_eof ( ) {
879
- let next_char = match tokenizer. consume_char ( ) {
880
- ' ' | '\t' | '\n' | '\r' | '\x0C' => {
881
- return consume_url_end ( tokenizer, Owned ( string ) )
906
+ match tokenizer. consume_byte ( ) {
907
+ b ' ' | b '\t' | b '\n' | b '\r' | b '\x0C' => {
908
+ return consume_url_end ( tokenizer, Owned ( to_utf8 ( string_bytes ) ) ) ;
882
909
}
883
- ')' => break ,
884
- '\x01' ...'\x08' | '\x0B' | '\x0E' ...'\x1F' | '\x7F' // non-printable
885
- | '"' | '\'' | '(' => return consume_bad_url ( tokenizer) ,
886
- '\\' => {
910
+ b ')' => break ,
911
+ b '\x01' ...b '\x08' | b '\x0B' | b '\x0E' ...b '\x1F' | b '\x7F' // non-printable
912
+ | b '"' | b '\'' | b '(' => return consume_bad_url ( tokenizer) ,
913
+ b '\\' => {
887
914
if tokenizer. has_newline_at ( 0 ) {
888
915
return consume_bad_url ( tokenizer)
889
916
}
890
- consume_escape ( tokenizer)
917
+
918
+ consume_escape_and_write ( tokenizer, & mut string_bytes)
891
919
} ,
892
- '\0' => '\u{FFFD}' ,
893
- c => c
894
- } ;
895
- string. push ( next_char)
920
+ b'\0' => {
921
+ // string.push('\u{FFFD}');
922
+ string_bytes. push ( 0xef ) ;
923
+ string_bytes. push ( 0xbf ) ;
924
+ string_bytes. push ( 0xbd ) ;
925
+ }
926
+ c => string_bytes. push ( c)
927
+ }
896
928
}
897
- UnquotedUrl ( Owned ( string ) )
929
+ UnquotedUrl ( Owned ( to_utf8 ( string_bytes ) ) )
898
930
}
899
931
900
932
fn consume_url_end < ' a > ( tokenizer : & mut Tokenizer < ' a > , string : Cow < ' a , str > ) -> Token < ' a > {
901
933
while !tokenizer. is_eof ( ) {
902
- match tokenizer. consume_char ( ) {
903
- ' ' | '\t' | '\n' | '\r' | '\x0C' => ( ) ,
904
- ')' => break ,
934
+ match tokenizer. consume_byte ( ) {
935
+ b ' ' | b '\t' | b '\n' | b '\r' | b '\x0C' => ( ) ,
936
+ b ')' => break ,
905
937
_ => return consume_bad_url ( tokenizer)
906
938
}
907
939
}
@@ -911,9 +943,9 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
911
943
fn consume_bad_url < ' a > ( tokenizer : & mut Tokenizer < ' a > ) -> Token < ' a > {
912
944
// Consume up to the closing )
913
945
while !tokenizer. is_eof ( ) {
914
- match tokenizer. consume_char ( ) {
915
- ')' => break ,
916
- '\\' => tokenizer. advance ( 1 ) , // Skip an escaped ')' or '\'
946
+ match tokenizer. consume_byte ( ) {
947
+ b ')' => break ,
948
+ b '\\' => tokenizer. advance ( 1 ) , // Skip an escaped ')' or '\'
917
949
_ => ( )
918
950
}
919
951
}
@@ -972,20 +1004,29 @@ fn consume_hex_digits<'a>(tokenizer: &mut Tokenizer<'a>) -> (u32, u32) {
972
1004
}
973
1005
974
1006
1007
+ // Same constraints as consume_escape except it writes into `bytes` the result
1008
+ // instead of returning it.
1009
+ //
1010
+ // TODO: This could be made more efficient with char::encode_utf8, I guess.
1011
+ fn consume_escape_and_write ( tokenizer : & mut Tokenizer , bytes : & mut Vec < u8 > ) {
1012
+ use std:: io:: Write ;
1013
+ write ! ( bytes, "{}" , consume_escape( tokenizer) ) . unwrap ( ) ;
1014
+ }
1015
+
975
1016
// Assumes that the U+005C REVERSE SOLIDUS (\) has already been consumed
976
1017
// and that the next input character has already been verified
977
1018
// to not be a newline.
978
1019
fn consume_escape ( tokenizer : & mut Tokenizer ) -> char {
979
1020
if tokenizer. is_eof ( ) { return '\u{FFFD}' } // Escaped EOF
980
- match tokenizer. next_char ( ) {
981
- '0' ...'9' | 'A' ...'F' | 'a' ...'f' => {
1021
+ match tokenizer. next_byte_unchecked ( ) {
1022
+ b '0' ...b '9' | b 'A' ...b 'F' | b 'a' ...b 'f' => {
982
1023
let ( c, _) = consume_hex_digits ( tokenizer) ;
983
1024
if !tokenizer. is_eof ( ) {
984
- match tokenizer. next_char ( ) {
985
- ' ' | '\t' | '\n' | '\x0C' => tokenizer. advance ( 1 ) ,
986
- '\r' => {
1025
+ match tokenizer. next_byte_unchecked ( ) {
1026
+ b ' ' | b '\t' | b '\n' | b '\x0C' => tokenizer. advance ( 1 ) ,
1027
+ b '\r' => {
987
1028
tokenizer. advance ( 1 ) ;
988
- if !tokenizer. is_eof ( ) && tokenizer. next_char ( ) == '\n' {
1029
+ if !tokenizer. is_eof ( ) && tokenizer. next_byte_unchecked ( ) == b '\n' {
989
1030
tokenizer. advance ( 1 ) ;
990
1031
}
991
1032
}
@@ -1000,7 +1041,7 @@ fn consume_escape(tokenizer: &mut Tokenizer) -> char {
1000
1041
REPLACEMENT_CHAR
1001
1042
}
1002
1043
} ,
1003
- '\0' => {
1044
+ b '\0' => {
1004
1045
tokenizer. advance ( 1 ) ;
1005
1046
'\u{FFFD}'
1006
1047
}
0 commit comments