Skip to content

Commit 66549fc

Browse files
lovasoaayman-sigma
authored and committed
Support for postgres String Constants with Unicode Escapes (apache#1355)
1 parent 47f6ab6 commit 66549fc

File tree

7 files changed

+180
-0
lines changed

7 files changed

+180
-0
lines changed

src/ast/value.rs

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,10 @@ pub enum Value {
5252
/// See [Postgres docs](https://www.postgresql.org/docs/8.3/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS)
5353
/// for more details.
5454
EscapedStringLiteral(String),
55+
/// u&'string value' (postgres extension)
56+
/// See [Postgres docs](https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-UESCAPE)
57+
/// for more details.
58+
UnicodeStringLiteral(String),
5559
/// B'string value'
5660
SingleQuotedByteStringLiteral(String),
5761
/// B"string value"
@@ -102,6 +106,7 @@ impl fmt::Display for Value {
102106
}
103107
Value::DollarQuotedString(v) => write!(f, "{v}"),
104108
Value::EscapedStringLiteral(v) => write!(f, "E'{}'", escape_escaped_string(v)),
109+
Value::UnicodeStringLiteral(v) => write!(f, "U&'{}'", escape_unicode_string(v)),
105110
Value::NationalStringLiteral(v) => write!(f, "N'{v}'"),
106111
Value::HexStringLiteral(v) => write!(f, "X'{v}'"),
107112
Value::Boolean(v) => write!(f, "{v}"),
@@ -347,6 +352,41 @@ pub fn escape_escaped_string(s: &str) -> EscapeEscapedStringLiteral<'_> {
347352
EscapeEscapedStringLiteral(s)
348353
}
349354

355+
/// Display adapter that renders a string as the body of a `U&'...'` literal.
///
/// Single quotes and backslashes are doubled, every other ASCII character is
/// written unchanged, and each non-ASCII character is written as a hexadecimal
/// code point escape (`\XXXX` or `\+XXXXXX`).
pub struct EscapeUnicodeStringLiteral<'a>(&'a str);

impl<'a> fmt::Display for EscapeUnicodeStringLiteral<'a> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        for c in self.0.chars() {
            match c {
                // A quote inside the literal is represented by two quotes.
                '\'' => f.write_str("''")?,
                // Backslash introduces an escape, so a literal one is doubled.
                '\\' => f.write_str(r"\\")?,
                _ if c.is_ascii() => write!(f, "{c}")?,
                _ => {
                    let codepoint = u32::from(c);
                    // Code points that fit in 16 bits (up to U+FFFF) use the
                    // four-digit `\XXXX` form; anything larger needs the
                    // six-digit `\+XXXXXX` form.
                    if codepoint <= 0xFFFF {
                        write!(f, "\\{codepoint:04X}")?;
                    } else {
                        write!(f, "\\+{codepoint:06X}")?;
                    }
                }
            }
        }
        Ok(())
    }
}

/// Wraps `s` so that formatting it yields the escaped body of a `U&'...'`
/// literal (without the `U&` prefix or the surrounding quotes).
pub fn escape_unicode_string(s: &str) -> EscapeUnicodeStringLiteral<'_> {
    EscapeUnicodeStringLiteral(s)
}
389+
350390
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
351391
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
352392
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]

src/dialect/generic.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,10 @@ impl Dialect for GenericDialect {
3535
|| ch == '_'
3636
}
3737

38+
/// Reports that the generic dialect accepts `U&'...'` Unicode string literals.
fn supports_unicode_string_literal(&self) -> bool {
39+
true
40+
}
41+
3842
fn supports_group_by_expr(&self) -> bool {
3943
true
4044
}

src/dialect/mod.rs

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,21 @@ pub trait Dialect: Debug + Any {
145145
fn supports_string_literal_backslash_escape(&self) -> bool {
146146
false
147147
}
148+
149+
/// Determine if the dialect supports string literals with `U&` prefix.
150+
/// This is used to specify Unicode code points in string literals.
151+
/// For example, in PostgreSQL, the following is a valid string literal:
152+
/// ```sql
153+
/// SELECT U&'\0061\0062\0063';
154+
/// ```
155+
/// This is equivalent to the string literal `'abc'`.
156+
/// See
157+
/// - [Postgres docs](https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-UESCAPE)
158+
/// - [H2 docs](http://www.h2database.com/html/grammar.html#string)
///
/// The default implementation returns `false`.
159+
fn supports_unicode_string_literal(&self) -> bool {
160+
false
161+
}
162+
148163
/// Does the dialect support `FILTER (WHERE expr)` for aggregate queries?
149164
fn supports_filter_during_aggregation(&self) -> bool {
150165
false

src/dialect/postgresql.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,10 @@ impl Dialect for PostgreSqlDialect {
4040
ch.is_alphabetic() || ch.is_ascii_digit() || ch == '$' || ch == '_'
4141
}
4242

43+
/// Reports that the PostgreSQL dialect accepts `U&'...'` Unicode string literals.
fn supports_unicode_string_literal(&self) -> bool {
44+
true
45+
}
46+
4347
/// See <https://www.postgresql.org/docs/current/sql-createoperator.html>
4448
fn is_custom_operator_part(&self, ch: char) -> bool {
4549
matches!(

src/parser/mod.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1195,6 +1195,10 @@ impl<'a> Parser<'a> {
11951195
self.prev_token();
11961196
Ok(Expr::Value(self.parse_value()?))
11971197
}
1198+
Token::UnicodeStringLiteral(_) => {
1199+
self.prev_token();
1200+
Ok(Expr::Value(self.parse_value()?))
1201+
}
11981202
Token::Number(_, _)
11991203
| Token::SingleQuotedString(_)
12001204
| Token::DoubleQuotedString(_)
@@ -1872,6 +1876,7 @@ impl<'a> Parser<'a> {
18721876
}
18731877
Token::SingleQuotedString(_)
18741878
| Token::EscapedStringLiteral(_)
1879+
| Token::UnicodeStringLiteral(_)
18751880
| Token::NationalStringLiteral(_)
18761881
| Token::HexStringLiteral(_) => Some(Box::new(self.parse_expr()?)),
18771882
_ => self.expected(
@@ -6979,6 +6984,7 @@ impl<'a> Parser<'a> {
69796984
}
69806985
Token::NationalStringLiteral(ref s) => Ok(Value::NationalStringLiteral(s.to_string())),
69816986
Token::EscapedStringLiteral(ref s) => Ok(Value::EscapedStringLiteral(s.to_string())),
6987+
Token::UnicodeStringLiteral(ref s) => Ok(Value::UnicodeStringLiteral(s.to_string())),
69826988
Token::HexStringLiteral(ref s) => Ok(Value::HexStringLiteral(s.to_string())),
69836989
Token::Placeholder(ref s) => Ok(Value::Placeholder(s.to_string())),
69846990
tok @ Token::Colon | tok @ Token::AtSign => {
@@ -7070,6 +7076,7 @@ impl<'a> Parser<'a> {
70707076
Token::EscapedStringLiteral(s) if dialect_of!(self is PostgreSqlDialect | GenericDialect) => {
70717077
Ok(s)
70727078
}
7079+
Token::UnicodeStringLiteral(s) => Ok(s),
70737080
_ => self.expected("literal string", next_token),
70747081
}
70757082
}

src/tokenizer.rs

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,8 @@ pub enum Token {
9494
NationalStringLiteral(String),
9595
/// "escaped" string literal, which are an extension to the SQL standard: i.e: e'first \n second' or E 'first \n second'
9696
EscapedStringLiteral(String),
97+
/// Unicode string literal, e.g. U&'first \000A second'
98+
UnicodeStringLiteral(String),
9799
/// Hexadecimal string literal: i.e.: X'deadbeef'
98100
HexStringLiteral(String),
99101
/// Comma
@@ -251,6 +253,7 @@ impl fmt::Display for Token {
251253
Token::DollarQuotedString(ref s) => write!(f, "{s}"),
252254
Token::NationalStringLiteral(ref s) => write!(f, "N'{s}'"),
253255
Token::EscapedStringLiteral(ref s) => write!(f, "E'{s}'"),
256+
Token::UnicodeStringLiteral(ref s) => write!(f, "U&'{s}'"),
254257
Token::HexStringLiteral(ref s) => write!(f, "X'{s}'"),
255258
Token::SingleQuotedByteStringLiteral(ref s) => write!(f, "B'{s}'"),
256259
Token::TripleSingleQuotedByteStringLiteral(ref s) => write!(f, "B'''{s}'''"),
@@ -794,6 +797,23 @@ impl<'a> Tokenizer<'a> {
794797
}
795798
}
796799
}
800+
// Unicode string literals like U&'first \000A second' are supported in some dialects, including PostgreSQL
801+
x @ 'u' | x @ 'U' if self.dialect.supports_unicode_string_literal() => {
802+
chars.next(); // consume, to check the next char
803+
if chars.peek() == Some(&'&') {
804+
// we cannot advance the iterator here, as we need to consume the '&' later if the 'u' was an identifier
805+
let mut chars_clone = chars.peekable.clone();
806+
chars_clone.next(); // consume the '&' in the clone
807+
if chars_clone.peek() == Some(&'\'') {
808+
chars.next(); // consume the '&' in the original iterator
809+
let s = unescape_unicode_single_quoted_string(chars)?;
810+
return Ok(Some(Token::UnicodeStringLiteral(s)));
811+
}
812+
}
813+
// regular identifier starting with a "U" or "u"
814+
let s = self.tokenize_word(x, chars);
815+
Ok(Some(Token::make_word(&s, None)))
816+
}
797817
// The spec only allows an uppercase 'X' to introduce a hex
798818
// string, but PostgreSQL, at least, allows a lowercase 'x' too.
799819
x @ 'x' | x @ 'X' => {
@@ -1797,6 +1817,64 @@ impl<'a: 'b, 'b> Unescape<'a, 'b> {
17971817
}
17981818
}
17991819

1820+
fn unescape_unicode_single_quoted_string(chars: &mut State<'_>) -> Result<String, TokenizerError> {
1821+
let mut unescaped = String::new();
1822+
chars.next(); // consume the opening quote
1823+
while let Some(c) = chars.next() {
1824+
match c {
1825+
'\'' => {
1826+
if chars.peek() == Some(&'\'') {
1827+
chars.next();
1828+
unescaped.push('\'');
1829+
} else {
1830+
return Ok(unescaped);
1831+
}
1832+
}
1833+
'\\' => match chars.peek() {
1834+
Some('\\') => {
1835+
chars.next();
1836+
unescaped.push('\\');
1837+
}
1838+
Some('+') => {
1839+
chars.next();
1840+
unescaped.push(take_char_from_hex_digits(chars, 6)?);
1841+
}
1842+
_ => unescaped.push(take_char_from_hex_digits(chars, 4)?),
1843+
},
1844+
_ => {
1845+
unescaped.push(c);
1846+
}
1847+
}
1848+
}
1849+
Err(TokenizerError {
1850+
message: "Unterminated unicode encoded string literal".to_string(),
1851+
location: chars.location(),
1852+
})
1853+
}
1854+
1855+
fn take_char_from_hex_digits(
1856+
chars: &mut State<'_>,
1857+
max_digits: usize,
1858+
) -> Result<char, TokenizerError> {
1859+
let mut result = 0u32;
1860+
for _ in 0..max_digits {
1861+
let next_char = chars.next().ok_or_else(|| TokenizerError {
1862+
message: "Unexpected EOF while parsing hex digit in escaped unicode string."
1863+
.to_string(),
1864+
location: chars.location(),
1865+
})?;
1866+
let digit = next_char.to_digit(16).ok_or_else(|| TokenizerError {
1867+
message: format!("Invalid hex digit in escaped unicode string: {}", next_char),
1868+
location: chars.location(),
1869+
})?;
1870+
result = result * 16 + digit;
1871+
}
1872+
char::from_u32(result).ok_or_else(|| TokenizerError {
1873+
message: format!("Invalid unicode character: {:x}", result),
1874+
location: chars.location(),
1875+
})
1876+
}
1877+
18001878
#[cfg(test)]
18011879
mod tests {
18021880
use super::*;

tests/sqlparser_postgres.rs

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4441,3 +4441,35 @@ fn test_table_unnest_with_ordinality() {
44414441
_ => panic!("Expecting TableFactor::UNNEST with ordinality"),
44424442
}
44434443
}
4444+
4445+
/// An `E'\n'` literal must parse to an escaped string literal holding a newline.
#[test]
fn test_escaped_string_literal() {
    let parsed = pg().verified_expr(r#"E'\n'"#);
    if let Expr::Value(Value::EscapedStringLiteral(contents)) = parsed {
        assert_eq!("\n", contents);
    } else {
        unreachable!();
    }
}
4454+
4455+
/// Each `U&'...'` input must parse to a unicode string literal holding the
/// decoded text.
#[test]
fn test_unicode_string_literal() {
    // (SQL text, decoded value)
    let cases = [
        // Example taken from the Postgres documentation
        (r#"U&'\0441\043B\043E\043D'"#, "слон"),
        // Code point above 0xFFFF, written in the six-digit form
        (r#"U&'\+01F418'"#, "🐘"),
        // Doubled backslash
        (r#"U&'\\'"#, r#"\"#),
        // Doubled single quote
        (r#"U&''''"#, "'"),
    ];
    for (sql, decoded) in cases {
        let parsed = pg_and_generic().verified_expr(sql);
        if let Expr::Value(Value::UnicodeStringLiteral(actual)) = parsed {
            assert_eq!(decoded, actual);
        } else {
            unreachable!();
        }
    }
}

0 commit comments

Comments
 (0)