@@ -55,6 +55,8 @@ pub enum Token {
55
55
EscapedStringLiteral ( String ) ,
56
56
/// Hexadecimal string literal: i.e.: X'deadbeef'
57
57
HexStringLiteral ( String ) ,
58
+ /// Unicode escaped string: U&'d\0061t\+000061' (data)
59
+ UnicodeEscapedStringLiteral ( String ) ,
58
60
/// Comma
59
61
Comma ,
60
62
/// Whitespace (space, tab, etc)
@@ -164,6 +166,7 @@ impl fmt::Display for Token {
164
166
Token :: NationalStringLiteral ( ref s) => write ! ( f, "N'{}'" , s) ,
165
167
Token :: EscapedStringLiteral ( ref s) => write ! ( f, "E'{}'" , s) ,
166
168
Token :: HexStringLiteral ( ref s) => write ! ( f, "X'{}'" , s) ,
169
+ Token :: UnicodeEscapedStringLiteral ( ref s) => write ! ( f, "U&'{}'" , s) ,
167
170
Token :: Comma => f. write_str ( "," ) ,
168
171
Token :: Whitespace ( ws) => write ! ( f, "{}" , ws) ,
169
172
Token :: DoubleEq => f. write_str ( "==" ) ,
@@ -427,6 +430,28 @@ impl<'a> Tokenizer<'a> {
427
430
}
428
431
}
429
432
}
433
+ x @ 'u' | x @ 'U' => {
434
+ chars. next ( ) ; // consume, to check the next char
435
+ let mut look_ahead_chars = chars. clone ( ) ;
436
+ if look_ahead_chars. next_if_eq ( & '&' ) . is_some ( ) {
437
+ match look_ahead_chars. peek ( ) {
438
+ Some ( '\'' ) => {
439
+ //Move chars to the position of look_ahead_chars
440
+ chars. next ( ) ;
441
+ // U&'...' - a <Unicode character string literal> (the binary string literal is X'...')
442
+ let s = self . tokenize_single_quoted_string ( chars) ?;
443
+ Ok ( Some ( Token :: UnicodeEscapedStringLiteral ( s) ) )
444
+ }
445
+ _ => {
446
+ let s = self . tokenize_word ( x, chars) ;
447
+ Ok ( Some ( Token :: make_word ( & s, None ) ) )
448
+ }
449
+ }
450
+ } else {
451
+ let s = self . tokenize_word ( x, chars) ;
452
+ Ok ( Some ( Token :: make_word ( & s, None ) ) )
453
+ }
454
+ }
430
455
// identifier or keyword
431
456
ch if self . dialect . is_identifier_start ( ch) => {
432
457
chars. next ( ) ; // consume the first char
@@ -1454,4 +1479,36 @@ mod tests {
1454
1479
//println!("------------------------------");
1455
1480
assert_eq ! ( expected, actual) ;
1456
1481
}
1482
+ #[ test]
1483
+ fn tokenize_unicode_escaped_literal ( ) {
1484
+ let sql = r#"U&'aaa'"# ;
1485
+ let dialect = GenericDialect { } ;
1486
+ let mut tokenizer = Tokenizer :: new ( & dialect, sql) ;
1487
+ let tokens = tokenizer. tokenize ( ) . unwrap ( ) ;
1488
+ let expected = vec ! [ Token :: UnicodeEscapedStringLiteral ( "aaa" . to_string( ) ) ] ;
1489
+ compare ( expected, tokens) ;
1490
+
1491
+ let sql = r#"U&a"# ;
1492
+ let dialect = GenericDialect { } ;
1493
+ let mut tokenizer = Tokenizer :: new ( & dialect, sql) ;
1494
+ let tokens = tokenizer. tokenize ( ) . unwrap ( ) ;
1495
+ let expected = vec ! [
1496
+ Token :: make_word( "U" , None ) ,
1497
+ Token :: Ampersand ,
1498
+ Token :: make_word( "a" , None ) ,
1499
+ ] ;
1500
+ compare ( expected, tokens) ;
1501
+ let sql = r#"U & 'aaa'"# ;
1502
+ let dialect = GenericDialect { } ;
1503
+ let mut tokenizer = Tokenizer :: new ( & dialect, sql) ;
1504
+ let tokens = tokenizer. tokenize ( ) . unwrap ( ) ;
1505
+ let expected = vec ! [
1506
+ Token :: make_word( "U" , None ) ,
1507
+ Token :: Whitespace ( Whitespace :: Space ) ,
1508
+ Token :: Ampersand ,
1509
+ Token :: Whitespace ( Whitespace :: Space ) ,
1510
+ Token :: SingleQuotedString ( "aaa" . to_string( ) ) ,
1511
+ ] ;
1512
+ compare ( expected, tokens) ;
1513
+ }
1457
1514
}
0 commit comments