diff --git a/src/ast/operator.rs b/src/ast/operator.rs index 3c4f192e3..e70df344a 100644 --- a/src/ast/operator.rs +++ b/src/ast/operator.rs @@ -111,7 +111,7 @@ pub enum BinaryOperator { DuckIntegerDivide, /// MySQL [`DIV`](https://dev.mysql.com/doc/refman/8.0/en/arithmetic-functions.html) integer division MyIntegerDivide, - /// Support for custom operators (built by parsers outside this crate) + /// Support for custom operators (such as Postgres custom operators) Custom(String), /// Bitwise XOR, e.g. `a # b` (PostgreSQL-specific) PGBitwiseXor, diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs index da5c8c5ac..c79257456 100644 --- a/src/dialect/mod.rs +++ b/src/dialect/mod.rs @@ -122,6 +122,12 @@ pub trait Dialect: Debug + Any { fn is_identifier_start(&self, ch: char) -> bool; /// Determine if a character is a valid unquoted identifier character fn is_identifier_part(&self, ch: char) -> bool; + + /// Most dialects do not have custom operators. Override this method to provide custom operators. + fn is_custom_operator_part(&self, _ch: char) -> bool { + false + } + /// Determine if the dialect supports escaping characters via '\' in string literals. 
/// /// Some dialects like BigQuery and Snowflake support this while others like diff --git a/src/dialect/postgresql.rs b/src/dialect/postgresql.rs index f179111e0..0e04bfa27 100644 --- a/src/dialect/postgresql.rs +++ b/src/dialect/postgresql.rs @@ -25,6 +25,10 @@ impl Dialect for PostgreSqlDialect { Some('"') } + fn is_delimited_identifier_start(&self, ch: char) -> bool { + ch == '"' // Postgres does not support backticks to quote identifiers + } + fn is_identifier_start(&self, ch: char) -> bool { // See https://www.postgresql.org/docs/11/sql-syntax-lexical.html#SQL-SYNTAX-IDENTIFIERS // We don't yet support identifiers beginning with "letters with @@ -36,6 +40,29 @@ impl Dialect for PostgreSqlDialect { ch.is_alphabetic() || ch.is_ascii_digit() || ch == '$' || ch == '_' } + /// See <https://www.postgresql.org/docs/current/sql-createoperator.html> + fn is_custom_operator_part(&self, ch: char) -> bool { + matches!( + ch, + '+' | '-' + | '*' + | '/' + | '<' + | '>' + | '=' + | '~' + | '!' + | '@' + | '#' + | '%' + | '^' + | '&' + | '|' + | '`' + | '?' + ) + } + fn parse_statement(&self, parser: &mut Parser) -> Option<Result<Statement, ParserError>> { if parser.parse_keyword(Keyword::COMMENT) { Some(parse_comment(parser)) diff --git a/src/parser/mod.rs b/src/parser/mod.rs index c6750644c..83e2bed39 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -2300,9 +2300,8 @@ impl<'a> Parser<'a> { return infix; } - let tok = self.next_token(); - - let regular_binary_operator = match &tok.token { + let mut tok = self.next_token(); + let regular_binary_operator = match &mut tok.token { Token::Spaceship => Some(BinaryOperator::Spaceship), Token::DoubleEq => Some(BinaryOperator::Eq), Token::Eq => Some(BinaryOperator::Eq), @@ -2366,6 +2365,7 @@ impl<'a> Parser<'a> { Token::Question => Some(BinaryOperator::Question), Token::QuestionAnd => Some(BinaryOperator::QuestionAnd), Token::QuestionPipe => Some(BinaryOperator::QuestionPipe), + Token::CustomBinaryOperator(s) => Some(BinaryOperator::Custom(core::mem::take(s))), Token::Word(w) => match w.keyword { Keyword::AND => 
Some(BinaryOperator::And), @@ -2920,7 +2920,8 @@ impl<'a> Parser<'a> { | Token::AtAt | Token::Question | Token::QuestionAnd - | Token::QuestionPipe => Ok(Self::PG_OTHER_PREC), + | Token::QuestionPipe + | Token::CustomBinaryOperator(_) => Ok(Self::PG_OTHER_PREC), _ => Ok(0), } } diff --git a/src/tokenizer.rs b/src/tokenizer.rs index b6fed354d..bcc5478bc 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -231,6 +231,10 @@ pub enum Token { /// jsonb ?| text[] -> boolean: Check whether any member of the text array exists as top-level /// keys within the jsonb object QuestionPipe, + /// Custom binary operator + /// This is used to represent any custom binary operator that is not part of the SQL standard. + /// PostgreSQL allows defining custom binary operators using CREATE OPERATOR. + CustomBinaryOperator(String), } impl fmt::Display for Token { @@ -320,6 +324,7 @@ impl fmt::Display for Token { Token::Question => write!(f, "?"), Token::QuestionAnd => write!(f, "?&"), Token::QuestionPipe => write!(f, "?|"), + Token::CustomBinaryOperator(s) => f.write_str(s), } } } @@ -961,15 +966,12 @@ impl<'a> Tokenizer<'a> { Some('>') => { chars.next(); match chars.peek() { - Some('>') => { - chars.next(); - Ok(Some(Token::LongArrow)) - } - _ => Ok(Some(Token::Arrow)), + Some('>') => self.consume_for_binop(chars, "->>", Token::LongArrow), + _ => self.start_binop(chars, "->", Token::Arrow), } } // a regular '-' operator - _ => Ok(Some(Token::Minus)), + _ => self.start_binop(chars, "-", Token::Minus), } } '/' => { @@ -999,26 +1001,28 @@ impl<'a> Tokenizer<'a> { '%' => { chars.next(); // advance past '%' match chars.peek() { - Some(' ') => Ok(Some(Token::Mod)), + Some(s) if s.is_whitespace() => Ok(Some(Token::Mod)), Some(sch) if self.dialect.is_identifier_start('%') => { self.tokenize_identifier_or_keyword([ch, *sch], chars) } - _ => Ok(Some(Token::Mod)), + _ => self.start_binop(chars, "%", Token::Mod), } } '|' => { chars.next(); // consume the '|' match chars.peek() { - Some('/') => 
self.consume_and_return(chars, Token::PGSquareRoot), + Some('/') => self.consume_for_binop(chars, "|/", Token::PGSquareRoot), Some('|') => { chars.next(); // consume the second '|' match chars.peek() { - Some('/') => self.consume_and_return(chars, Token::PGCubeRoot), - _ => Ok(Some(Token::StringConcat)), + Some('/') => { + self.consume_for_binop(chars, "||/", Token::PGCubeRoot) + } + _ => self.start_binop(chars, "||", Token::StringConcat), } } // Bitshift '|' operator - _ => Ok(Some(Token::Pipe)), + _ => self.start_binop(chars, "|", Token::Pipe), } } '=' => { @@ -1061,22 +1065,22 @@ impl<'a> Tokenizer<'a> { Some('=') => { chars.next(); match chars.peek() { - Some('>') => self.consume_and_return(chars, Token::Spaceship), - _ => Ok(Some(Token::LtEq)), + Some('>') => self.consume_for_binop(chars, "<=>", Token::Spaceship), + _ => self.start_binop(chars, "<=", Token::LtEq), } } - Some('>') => self.consume_and_return(chars, Token::Neq), - Some('<') => self.consume_and_return(chars, Token::ShiftLeft), - Some('@') => self.consume_and_return(chars, Token::ArrowAt), - _ => Ok(Some(Token::Lt)), + Some('>') => self.consume_for_binop(chars, "<>", Token::Neq), + Some('<') => self.consume_for_binop(chars, "<<", Token::ShiftLeft), + Some('@') => self.consume_for_binop(chars, "<@", Token::ArrowAt), + _ => self.start_binop(chars, "<", Token::Lt), } } '>' => { chars.next(); // consume match chars.peek() { - Some('=') => self.consume_and_return(chars, Token::GtEq), - Some('>') => self.consume_and_return(chars, Token::ShiftRight), - _ => Ok(Some(Token::Gt)), + Some('=') => self.consume_for_binop(chars, ">=", Token::GtEq), + Some('>') => self.consume_for_binop(chars, ">>", Token::ShiftRight), + _ => self.start_binop(chars, ">", Token::Gt), } } ':' => { @@ -1094,9 +1098,12 @@ impl<'a> Tokenizer<'a> { '&' => { chars.next(); // consume the '&' match chars.peek() { - Some('&') => self.consume_and_return(chars, Token::Overlap), + Some('&') => { + chars.next(); // consume the second '&' + 
self.start_binop(chars, "&&", Token::Overlap) + } // Bitshift '&' operator - _ => Ok(Some(Token::Ampersand)), + _ => self.start_binop(chars, "&", Token::Ampersand), } } '^' => { @@ -1119,38 +1126,37 @@ impl<'a> Tokenizer<'a> { '~' => { chars.next(); // consume match chars.peek() { - Some('*') => self.consume_and_return(chars, Token::TildeAsterisk), + Some('*') => self.consume_for_binop(chars, "~*", Token::TildeAsterisk), Some('~') => { chars.next(); match chars.peek() { Some('*') => { - self.consume_and_return(chars, Token::DoubleTildeAsterisk) + self.consume_for_binop(chars, "~~*", Token::DoubleTildeAsterisk) } - _ => Ok(Some(Token::DoubleTilde)), + _ => self.start_binop(chars, "~~", Token::DoubleTilde), } } - _ => Ok(Some(Token::Tilde)), + _ => self.start_binop(chars, "~", Token::Tilde), } } '#' => { chars.next(); match chars.peek() { - Some('-') => self.consume_and_return(chars, Token::HashMinus), + Some('-') => self.consume_for_binop(chars, "#-", Token::HashMinus), Some('>') => { chars.next(); match chars.peek() { Some('>') => { - chars.next(); - Ok(Some(Token::HashLongArrow)) + self.consume_for_binop(chars, "#>>", Token::HashLongArrow) } - _ => Ok(Some(Token::HashArrow)), + _ => self.start_binop(chars, "#>", Token::HashArrow), } } Some(' ') => Ok(Some(Token::Sharp)), Some(sch) if self.dialect.is_identifier_start('#') => { self.tokenize_identifier_or_keyword([ch, *sch], chars) } - _ => Ok(Some(Token::Sharp)), + _ => self.start_binop(chars, "#", Token::Sharp), } } '@' => { @@ -1206,6 +1212,39 @@ impl<'a> Tokenizer<'a> { } } + /// Consume the next character, then parse a custom binary operator. 
The next character should be included in the prefix + fn consume_for_binop( + &self, + chars: &mut State, + prefix: &str, + default: Token, + ) -> Result<Option<Token>, TokenizerError> { + chars.next(); // consume the first char + self.start_binop(chars, prefix, default) + } + + /// parse a custom binary operator + fn start_binop( + &self, + chars: &mut State, + prefix: &str, + default: Token, + ) -> Result<Option<Token>, TokenizerError> { + let mut custom = None; + while let Some(&ch) = chars.peek() { + if !self.dialect.is_custom_operator_part(ch) { + break; + } + + custom.get_or_insert_with(|| prefix.to_string()).push(ch); + chars.next(); + } + + Ok(Some( + custom.map(Token::CustomBinaryOperator).unwrap_or(default), + )) + } + /// Tokenize dollar preceded value (i.e: a string/placeholder) fn tokenize_dollar_preceded_value(&self, chars: &mut State) -> Result<Token, TokenizerError> { let mut s = String::new(); diff --git a/tests/sqlparser_mssql.rs b/tests/sqlparser_mssql.rs index 5d61c6ab9..86d3990f6 100644 --- a/tests/sqlparser_mssql.rs +++ b/tests/sqlparser_mssql.rs @@ -437,6 +437,12 @@ fn parse_for_json_expect_ast() { ); } +#[test] +fn parse_ampersand_arobase() { + // In SQL Server, a&@b means (a) & (@b), in PostgreSQL it means (a) &@ (b) + ms().expr_parses_to("a&@b", "a & @b"); +} + #[test] fn parse_cast_varchar_max() { ms_and_generic().verified_expr("CAST('foo' AS VARCHAR(MAX))"); diff --git a/tests/sqlparser_postgres.rs b/tests/sqlparser_postgres.rs index 677246a51..d1218f283 100644 --- a/tests/sqlparser_postgres.rs +++ b/tests/sqlparser_postgres.rs @@ -1757,6 +1757,29 @@ fn parse_pg_returning() { }; } +fn test_operator(operator: &str, dialect: &TestedDialects, expected: BinaryOperator) { + let operator_tokens = + sqlparser::tokenizer::Tokenizer::new(&PostgreSqlDialect {}, &format!("a{operator}b")) + .tokenize() + .unwrap(); + assert_eq!( + operator_tokens.len(), + 3, + "binary op should be 3 tokens, not {operator_tokens:?}" + ); + let expected_expr = Expr::BinaryOp { + left: 
Box::new(Expr::Identifier(Ident::new("a"))), + op: expected, + right: Box::new(Expr::Identifier(Ident::new("b"))), + }; + let str_expr_canonical = format!("a {operator} b"); + assert_eq!(expected_expr, dialect.verified_expr(&str_expr_canonical)); + assert_eq!( + expected_expr, + dialect.expr_parses_to(&format!("a{operator}b"), &str_expr_canonical) + ); +} + #[test] fn parse_pg_binary_ops() { let binary_ops = &[ @@ -1770,18 +1793,73 @@ fn parse_pg_binary_ops() { ]; for (str_op, op, dialects) in binary_ops { - let select = dialects.verified_only_select(&format!("SELECT a {} b", &str_op)); - assert_eq!( - SelectItem::UnnamedExpr(Expr::BinaryOp { - left: Box::new(Expr::Identifier(Ident::new("a"))), - op: op.clone(), - right: Box::new(Expr::Identifier(Ident::new("b"))), - }), - select.projection[0] - ); + test_operator(str_op, dialects, op.clone()); + } +} + +#[test] +fn parse_pg_custom_binary_ops() { + // Postgres supports declaring custom binary operators, using any character in the following set: + // + - * / < > = ~ ! @ # % ^ & | ` ? + + // Here, we test the ones used by common extensions + let operators = [ + // PostGIS + "&&&", // n-D bounding boxes intersect + "&<", // (is strictly to the left of) + "&>", // (is strictly to the right of) + "|=|", // distance between A and B trajectories at their closest point of approach + "<<#>>", // n-D distance between A and B bounding boxes + "|>>", // A's bounding box is strictly above B's. 
+ "~=", // bounding box is the same + // PGroonga + "&@", // Full text search by a keyword + "&@~", // Full text search by easy to use query language + "&@*", // Similar search + "&`", // Advanced search by ECMAScript like query language + "&@|", // Full text search by an array of keywords + "&@~|", // Full text search by an array of queries in easy to use query language + // pgtrgm + "<<%", // second argument has a continuous extent of an ordered trigram set that matches word boundaries + "%>>", // commutator of <<% + "<<<->", // distance between arguments + // hstore + "#=", // Replace fields with matching values from hstore + // ranges + "-|-", // Is adjacent to + // pg_similarity + "~++", // L1 distance + "~##", // Cosine Distance + "~-~", // Dice Coefficient + "~!!", // Euclidean Distance + "~@~", // Hamming Distance + "~??", // Jaccard Coefficient + "~%%", // Jaro Distance + "~@@", // Jaro-Winkler Distance + "~==", // Levenshtein Distance + "~^^", // Matching Coefficient + "~||", // Monge-Elkan Coefficient + "~#~", // Needleman-Wunsch Coefficient + "~**", // Overlap Coefficient + "~~~", // Q-Gram Distance + "~=~", // Smith-Waterman Coefficient + "~!~", // Smith-Waterman-Gotoh Coefficient + "~*~", // Soundex Distance + // soundex_operator + ">@@<", // Soundex matches + "<@@>", // Soundex doesn't match + ]; + for op in &operators { + test_operator(op, &pg(), BinaryOperator::Custom(op.to_string())); } } +#[test] +fn parse_ampersand_arobase() { + // In SQL Server, a&@b means (a) & (@b), in PostgreSQL it means (a) &@ (b) + pg().expr_parses_to("a&@b", "a &@ b"); +} + #[test] fn parse_pg_unary_ops() { let pg_unary_ops = &[