Support for postgres String Constants with Unicode Escapes (apache#1355)

lovasoa · ayman-sigma · commit 66549fc17c0a · 2024-11-18T18:04:18.000-08:00
diff --git a/src/ast/value.rs b/src/ast/value.rs
@@ -52,6 +52,10 @@ pub enum Value {
     /// See [Postgres docs](https://www.postgresql.org/docs/8.3/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS)
     /// for more details.
     EscapedStringLiteral(String),
+    /// U&'string value' (Unicode escapes; supported by PostgreSQL, among others)
+    /// See [Postgres docs](https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-UESCAPE)
+    /// for more details.
+    UnicodeStringLiteral(String),
     /// B'string value'
     SingleQuotedByteStringLiteral(String),
     /// B"string value"
@@ -102,6 +106,7 @@ impl fmt::Display for Value {
             }
             Value::DollarQuotedString(v) => write!(f, "{v}"),
             Value::EscapedStringLiteral(v) => write!(f, "E'{}'", escape_escaped_string(v)),
+            Value::UnicodeStringLiteral(v) => write!(f, "U&'{}'", escape_unicode_string(v)),
             Value::NationalStringLiteral(v) => write!(f, "N'{v}'"),
             Value::HexStringLiteral(v) => write!(f, "X'{v}'"),
             Value::Boolean(v) => write!(f, "{v}"),
@@ -347,6 +352,41 @@ pub fn escape_escaped_string(s: &str) -> EscapeEscapedStringLiteral<'_> {
     EscapeEscapedStringLiteral(s)
 }
 
+pub struct EscapeUnicodeStringLiteral<'a>(&'a str); // wraps the unescaped text; escaping happens in `Display::fmt`
+
+impl<'a> fmt::Display for EscapeUnicodeStringLiteral<'a> {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        for c in self.0.chars() {
+            match c {
+                '\'' => {
+                    write!(f, "''")?; // a single quote is written doubled
+                }
+                '\\' => {
+                    write!(f, r#"\\"#)?; // a backslash is written doubled
+                }
+                x if x.is_ascii() => {
+                    write!(f, "{}", c)?; // any other ASCII character is written verbatim
+                }
+                _ => {
+                    let codepoint = c as u32;
+                    // if the code point fits in 16 bits (<= U+FFFF), we can use the 4-digit \XXXX format
+                    // otherwise, we need to use the 6-digit \+XXXXXX format
+                    if codepoint <= 0xFFFF {
+                        write!(f, "\\{:04X}", codepoint)?;
+                    } else {
+                        write!(f, "\\+{:06X}", codepoint)?;
+                    }
+                }
+            }
+        }
+        Ok(())
+    }
+}
+
+pub fn escape_unicode_string(s: &str) -> EscapeUnicodeStringLiteral<'_> {
+    EscapeUnicodeStringLiteral(s) // only wraps the borrow; the escaping itself runs when the wrapper is formatted
+}
+
 #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
 #[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
diff --git a/src/dialect/generic.rs b/src/dialect/generic.rs
@@ -35,6 +35,10 @@ impl Dialect for GenericDialect {
             || ch == '_'
     }
 
+    fn supports_unicode_string_literal(&self) -> bool {
+        true // the generic dialect opts in to `U&'...'` literals
+    }
+
     fn supports_group_by_expr(&self) -> bool {
         true
     }
diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs
@@ -145,6 +145,21 @@ pub trait Dialect: Debug + Any {
     fn supports_string_literal_backslash_escape(&self) -> bool {
         false
     }
+
+    /// Determine if the dialect supports string literals with `U&` prefix.
+    /// This is used to specify Unicode code points in string literals.
+    /// For example, in PostgreSQL, the following is a valid string literal:
+    /// ```sql
+    /// SELECT U&'\0061\0062\0063';
+    /// ```
+    /// This is equivalent to the string literal `'abc'`.
+    /// See
+    ///  - [Postgres docs](https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-UESCAPE)
+    ///  - [H2 docs](http://www.h2database.com/html/grammar.html#string)
+    fn supports_unicode_string_literal(&self) -> bool {
+        false // off by default; a dialect enables the syntax by overriding this method
+    }
+
     /// Does the dialect support `FILTER (WHERE expr)` for aggregate queries?
     fn supports_filter_during_aggregation(&self) -> bool {
         false
diff --git a/src/dialect/postgresql.rs b/src/dialect/postgresql.rs
@@ -40,6 +40,10 @@ impl Dialect for PostgreSqlDialect {
         ch.is_alphabetic() || ch.is_ascii_digit() || ch == '$' || ch == '_'
     }
 
+    fn supports_unicode_string_literal(&self) -> bool {
+        true // the PostgreSQL dialect opts in to `U&'...'` literals
+    }
+
     /// See <https://www.postgresql.org/docs/current/sql-createoperator.html>
     fn is_custom_operator_part(&self, ch: char) -> bool {
         matches!(
diff --git a/src/parser/mod.rs b/src/parser/mod.rs
@@ -1195,6 +1195,10 @@ impl<'a> Parser<'a> {
                 self.prev_token();
                 Ok(Expr::Value(self.parse_value()?))
             }
+            Token::UnicodeStringLiteral(_) => {
+                self.prev_token();
+                Ok(Expr::Value(self.parse_value()?))
+            }
             Token::Number(_, _)
             | Token::SingleQuotedString(_)
             | Token::DoubleQuotedString(_)
@@ -1872,6 +1876,7 @@ impl<'a> Parser<'a> {
                     }
                     Token::SingleQuotedString(_)
                     | Token::EscapedStringLiteral(_)
+                    | Token::UnicodeStringLiteral(_)
                     | Token::NationalStringLiteral(_)
                     | Token::HexStringLiteral(_) => Some(Box::new(self.parse_expr()?)),
                     _ => self.expected(
@@ -6979,6 +6984,7 @@ impl<'a> Parser<'a> {
             }
             Token::NationalStringLiteral(ref s) => Ok(Value::NationalStringLiteral(s.to_string())),
             Token::EscapedStringLiteral(ref s) => Ok(Value::EscapedStringLiteral(s.to_string())),
+            Token::UnicodeStringLiteral(ref s) => Ok(Value::UnicodeStringLiteral(s.to_string())),
             Token::HexStringLiteral(ref s) => Ok(Value::HexStringLiteral(s.to_string())),
             Token::Placeholder(ref s) => Ok(Value::Placeholder(s.to_string())),
             tok @ Token::Colon | tok @ Token::AtSign => {
@@ -7070,6 +7076,7 @@ impl<'a> Parser<'a> {
             Token::EscapedStringLiteral(s) if dialect_of!(self is PostgreSqlDialect | GenericDialect) => {
                 Ok(s)
             }
+            Token::UnicodeStringLiteral(s) => Ok(s),
             _ => self.expected("literal string", next_token),
         }
     }
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
@@ -94,6 +94,8 @@ pub enum Token {
     NationalStringLiteral(String),
     /// "escaped" string literal, which are an extension to the SQL standard: i.e: e'first \n second' or E 'first \n second'
     EscapedStringLiteral(String),
+    /// Unicode string literal: i.e: U&'first \000A second'
+    UnicodeStringLiteral(String),
     /// Hexadecimal string literal: i.e.: X'deadbeef'
     HexStringLiteral(String),
     /// Comma
@@ -251,6 +253,7 @@ impl fmt::Display for Token {
             Token::DollarQuotedString(ref s) => write!(f, "{s}"),
             Token::NationalStringLiteral(ref s) => write!(f, "N'{s}'"),
             Token::EscapedStringLiteral(ref s) => write!(f, "E'{s}'"),
+            Token::UnicodeStringLiteral(ref s) => write!(f, "U&'{s}'"),
             Token::HexStringLiteral(ref s) => write!(f, "X'{s}'"),
             Token::SingleQuotedByteStringLiteral(ref s) => write!(f, "B'{s}'"),
             Token::TripleSingleQuotedByteStringLiteral(ref s) => write!(f, "B'''{s}'''"),
@@ -794,6 +797,23 @@ impl<'a> Tokenizer<'a> {
                         }
                     }
                 }
+                // Unicode string literals like U&'first \000A second' are supported in some dialects, including PostgreSQL
+                x @ 'u' | x @ 'U' if self.dialect.supports_unicode_string_literal() => {
+                    chars.next(); // consume, to check the next char
+                    if chars.peek() == Some(&'&') {
+                        // we cannot advance the iterator here, as we need to consume the '&' later if the 'u' was an identifier
+                        let mut chars_clone = chars.peekable.clone();
+                        chars_clone.next(); // consume the '&' in the clone
+                        if chars_clone.peek() == Some(&'\'') {
+                            chars.next(); // consume the '&' in the original iterator
+                            let s = unescape_unicode_single_quoted_string(chars)?;
+                            return Ok(Some(Token::UnicodeStringLiteral(s)));
+                        }
+                    }
+                    // regular identifier starting with a "U" or "u"
+                    let s = self.tokenize_word(x, chars);
+                    Ok(Some(Token::make_word(&s, None)))
+                }
                 // The spec only allows an uppercase 'X' to introduce a hex
                 // string, but PostgreSQL, at least, allows a lowercase 'x' too.
                 x @ 'x' | x @ 'X' => {
@@ -1797,6 +1817,64 @@ impl<'a: 'b, 'b> Unescape<'a, 'b> {
     }
 }
 
+fn unescape_unicode_single_quoted_string(chars: &mut State<'_>) -> Result<String, TokenizerError> {
+    let mut unescaped = String::new(); // decoded contents of the literal, built up character by character
+    chars.next(); // consume the opening quote
+    while let Some(c) = chars.next() {
+        match c {
+            '\'' => {
+                if chars.peek() == Some(&'\'') {
+                    chars.next(); // a doubled quote ('') stands for one literal quote
+                    unescaped.push('\'');
+                } else {
+                    return Ok(unescaped); // a lone quote terminates the literal
+                }
+            }
+            '\\' => match chars.peek() {
+                Some('\\') => {
+                    chars.next(); // `\\` stands for one literal backslash
+                    unescaped.push('\\');
+                }
+                Some('+') => {
+                    chars.next(); // `\+XXXXXX` form: skip the '+', six hex digits follow
+                    unescaped.push(take_char_from_hex_digits(chars, 6)?);
+                }
+                _ => unescaped.push(take_char_from_hex_digits(chars, 4)?), // `\XXXX` form: four hex digits
+            },
+            _ => {
+                unescaped.push(c); // every other character is copied through unchanged
+            }
+        }
+    }
+    Err(TokenizerError { // the input ran out before a closing quote was found
+        message: "Unterminated unicode encoded string literal".to_string(),
+        location: chars.location(),
+    })
+}
+
+fn take_char_from_hex_digits(
+    chars: &mut State<'_>,
+    max_digits: usize, // despite the name, exactly this many hex digits are required
+) -> Result<char, TokenizerError> {
+    let mut result = 0u32; // code point value accumulated so far
+    for _ in 0..max_digits {
+        let next_char = chars.next().ok_or_else(|| TokenizerError {
+            message: "Unexpected EOF while parsing hex digit in escaped unicode string."
+                .to_string(),
+            location: chars.location(),
+        })?;
+        let digit = next_char.to_digit(16).ok_or_else(|| TokenizerError {
+            message: format!("Invalid hex digit in escaped unicode string: {}", next_char),
+            location: chars.location(),
+        })?;
+        result = result * 16 + digit; // digits arrive most-significant first
+    }
+    char::from_u32(result).ok_or_else(|| TokenizerError { // fails for values that are not a valid `char`, e.g. surrogates
+        message: format!("Invalid unicode character: {:x}", result),
+        location: chars.location(),
+    })
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/tests/sqlparser_postgres.rs b/tests/sqlparser_postgres.rs
@@ -4441,3 +4441,35 @@ fn test_table_unnest_with_ordinality() {
         _ => panic!("Expecting TableFactor::UNNEST with ordinality"),
     }
 }
+
+#[test]
+fn test_escaped_string_literal() {
+    match pg().verified_expr(r#"E'\n'"#) {
+        Expr::Value(Value::EscapedStringLiteral(s)) => {
+            assert_eq!("\n", s); // the value holds a real newline, not the two-character escape
+        }
+        _ => unreachable!(),
+    }
+}
+
+#[test]
+fn test_unicode_string_literal() {
+    let pairs = [ // (SQL source text, expected decoded value)
+        // Example from the postgres docs
+        (r#"U&'\0441\043B\043E\043D'"#, "слон"),
+        // High unicode code point (> 0xFFFF)
+        (r#"U&'\+01F418'"#, "🐘"),
+        // Escaped backslash
+        (r#"U&'\\'"#, r#"\"#),
+        // Escaped single quote
+        (r#"U&''''"#, "'"),
+    ];
+    for (input, expected) in pairs {
+        match pg_and_generic().verified_expr(input) {
+            Expr::Value(Value::UnicodeStringLiteral(s)) => {
+                assert_eq!(expected, s); // the value holds the decoded text, not the escape sequences
+            }
+            _ => unreachable!(),
+        }
+    }
+}

Original file line number	Diff line number	Diff line change
`@@ -35,6 +35,10 @@ impl Dialect for GenericDialect {`
`35`	`35`	`\|\| ch == '_'`
`36`	`36`	`}`
`37`	`37`
	`38`	`+ fn supports_unicode_string_literal(&self) -> bool {`
	`39`	`+ true`
	`40`	`+ }`
	`41`	`+`
`38`	`42`	`fn supports_group_by_expr(&self) -> bool {`
`39`	`43`	`true`
`40`	`44`	`}`
Original file line number	Diff line number	Diff line change
`@@ -1195,6 +1195,10 @@ impl<'a> Parser<'a> {`
`1195`	`1195`	`self.prev_token();`
`1196`	`1196`	`Ok(Expr::Value(self.parse_value()?))`
`1197`	`1197`	`}`
	`1198`	`+ Token::UnicodeStringLiteral(_) => {`
	`1199`	`+ self.prev_token();`
	`1200`	`+ Ok(Expr::Value(self.parse_value()?))`
	`1201`	`+ }`
`1198`	`1202`	`Token::Number(_, _)`
`1199`	`1203`	`\| Token::SingleQuotedString(_)`
`1200`	`1204`	`\| Token::DoubleQuotedString(_)`
`@@ -1872,6 +1876,7 @@ impl<'a> Parser<'a> {`
`1872`	`1876`	`}`
`1873`	`1877`	`Token::SingleQuotedString(_)`
`1874`	`1878`	`\| Token::EscapedStringLiteral(_)`
	`1879`	`+ \| Token::UnicodeStringLiteral(_)`
`1875`	`1880`	`\| Token::NationalStringLiteral(_)`
`1876`	`1881`	`\| Token::HexStringLiteral(_) => Some(Box::new(self.parse_expr()?)),`
`1877`	`1882`	`_ => self.expected(`
`@@ -6979,6 +6984,7 @@ impl<'a> Parser<'a> {`
`6979`	`6984`	`}`
`6980`	`6985`	`Token::NationalStringLiteral(ref s) => Ok(Value::NationalStringLiteral(s.to_string())),`
`6981`	`6986`	`Token::EscapedStringLiteral(ref s) => Ok(Value::EscapedStringLiteral(s.to_string())),`
	`6987`	`+ Token::UnicodeStringLiteral(ref s) => Ok(Value::UnicodeStringLiteral(s.to_string())),`
`6982`	`6988`	`Token::HexStringLiteral(ref s) => Ok(Value::HexStringLiteral(s.to_string())),`
`6983`	`6989`	`Token::Placeholder(ref s) => Ok(Value::Placeholder(s.to_string())),`
`6984`	`6990`	`tok @ Token::Colon \| tok @ Token::AtSign => {`
`@@ -7070,6 +7076,7 @@ impl<'a> Parser<'a> {`
`7070`	`7076`	`Token::EscapedStringLiteral(s) if dialect_of!(self is PostgreSqlDialect \| GenericDialect) => {`
`7071`	`7077`	`Ok(s)`
`7072`	`7078`	`}`
	`7079`	`+ Token::UnicodeStringLiteral(s) => Ok(s),`
`7073`	`7080`	`_ => self.expected("literal string", next_token),`
`7074`	`7081`	`}`
`7075`	`7082`	`}`