@@ -55,6 +55,8 @@ pub enum Token {
55
55
EscapedStringLiteral ( String ) ,
56
56
/// Hexadecimal string literal: i.e.: X'deadbeef'
57
57
HexStringLiteral ( String ) ,
58
+ /// Unicode escaped string: U&'d\0061t\+000061' (data)
59
+ UnicodeEscapedStringLiteral ( String ) ,
58
60
/// Comma
59
61
Comma ,
60
62
/// Whitespace (space, tab, etc)
@@ -164,6 +166,7 @@ impl fmt::Display for Token {
164
166
Token :: NationalStringLiteral ( ref s) => write ! ( f, "N'{}'" , s) ,
165
167
Token :: EscapedStringLiteral ( ref s) => write ! ( f, "E'{}'" , s) ,
166
168
Token :: HexStringLiteral ( ref s) => write ! ( f, "X'{}'" , s) ,
169
+ Token :: UnicodeEscapedStringLiteral ( ref s) => write ! ( f, "U&'{}'" , s) ,
167
170
Token :: Comma => f. write_str ( "," ) ,
168
171
Token :: Whitespace ( ws) => write ! ( f, "{}" , ws) ,
169
172
Token :: DoubleEq => f. write_str ( "==" ) ,
@@ -427,6 +430,28 @@ impl<'a> Tokenizer<'a> {
427
430
}
428
431
}
429
432
}
433
+ x @ 'u' | x @ 'U' => {
434
+ chars. next ( ) ; // consume, to check the next char
435
+ let mut look_ahead_chars = chars. clone ( ) ;
436
+ if look_ahead_chars. next_if_eq ( & '&' ) . is_some ( ) {
437
+ match look_ahead_chars. peek ( ) {
438
+ Some ( '\'' ) => {
439
+ //Move chars to the position of look_ahead_chars
440
+ chars. next ( ) ;
441
+ // U&'...' - a <Unicode character string literal> (the binary string literal is X'...')
442
+ let s = self . tokenize_single_quoted_string ( chars) ?;
443
+ Ok ( Some ( Token :: UnicodeEscapedStringLiteral ( s) ) )
444
+ }
445
+ _ => {
446
+ let s = self . tokenize_word ( x, chars) ;
447
+ Ok ( Some ( Token :: make_word ( & s, None ) ) )
448
+ }
449
+ }
450
+ } else {
451
+ let s = self . tokenize_word ( x, chars) ;
452
+ Ok ( Some ( Token :: make_word ( & s, None ) ) )
453
+ }
454
+ }
430
455
// identifier or keyword
431
456
ch if self . dialect . is_identifier_start ( ch) => {
432
457
chars. next ( ) ; // consume the first char
@@ -1454,4 +1479,36 @@ mod tests {
1454
1479
//println!("------------------------------");
1455
1480
assert_eq ! ( expected, actual) ;
1456
1481
}
1482
+ #[ test]
1483
+ fn tokenize_unicode_escaped_literal ( ) {
1484
+ let sql = r#"U&'aaa'"# ;
1485
+ let dialect = GenericDialect { } ;
1486
+ let mut tokenizer = Tokenizer :: new ( & dialect, sql) ;
1487
+ let tokens = tokenizer. tokenize ( ) . unwrap ( ) ;
1488
+ let expected = vec ! [ Token :: UnicodeEscapedStringLiteral ( "aaa" . to_string( ) ) ] ;
1489
+ compare ( expected, tokens) ;
1490
+
1491
+ let sql = r#"U&a"# ;
1492
+ let dialect = GenericDialect { } ;
1493
+ let mut tokenizer = Tokenizer :: new ( & dialect, sql) ;
1494
+ let tokens = tokenizer. tokenize ( ) . unwrap ( ) ;
1495
+ let expected = vec ! [
1496
+ Token :: make_word( "U" , None ) ,
1497
+ Token :: Ampersand ,
1498
+ Token :: make_word( "a" , None ) ,
1499
+ ] ;
1500
+ compare ( expected, tokens) ;
1501
+ let sql = r#"U & 'aaa'"# ;
1502
+ let dialect = GenericDialect { } ;
1503
+ let mut tokenizer = Tokenizer :: new ( & dialect, sql) ;
1504
+ let tokens = tokenizer. tokenize ( ) . unwrap ( ) ;
1505
+ let expected = vec ! [
1506
+ Token :: make_word( "U" , None ) ,
1507
+ Token :: Whitespace ( Whitespace :: Space ) ,
1508
+ Token :: Ampersand ,
1509
+ Token :: Whitespace ( Whitespace :: Space ) ,
1510
+ Token :: SingleQuotedString ( "aaa" . to_string( ) ) ,
1511
+ ] ;
1512
+ compare ( expected, tokens) ;
1513
+ }
1457
1514
}
0 commit comments