From 1d872fd25bb1880a6c1a95d2aa13482e74b87552 Mon Sep 17 00:00:00 2001 From: lovasoa Date: Thu, 6 Jun 2024 22:45:37 +0200 Subject: [PATCH 1/8] add support for custom operators in postgres Fixes https://github.com/sqlparser-rs/sqlparser-rs/issues/1298 Fixes https://github.com/lovasoa/SQLpage/issues/372 Closes https://github.com/sqlparser-rs/sqlparser-rs/pull/1299 --- src/ast/operator.rs | 2 +- src/dialect/mod.rs | 6 ++ src/dialect/postgresql.rs | 27 +++++++++ src/parser/mod.rs | 9 +-- src/tokenizer.rs | 108 ++++++++++++++++++++++++++++-------- tests/sqlparser_mssql.rs | 6 ++ tests/sqlparser_postgres.rs | 73 +++++++++++++++++++++--- 7 files changed, 195 insertions(+), 36 deletions(-) diff --git a/src/ast/operator.rs b/src/ast/operator.rs index 3c4f192e3..e70df344a 100644 --- a/src/ast/operator.rs +++ b/src/ast/operator.rs @@ -111,7 +111,7 @@ pub enum BinaryOperator { DuckIntegerDivide, /// MySQL [`DIV`](https://dev.mysql.com/doc/refman/8.0/en/arithmetic-functions.html) integer division MyIntegerDivide, - /// Support for custom operators (built by parsers outside this crate) + /// Support for custom operators (such as Postgres custom operators) Custom(String), /// Bitwise XOR, e.g. `a # b` (PostgreSQL-specific) PGBitwiseXor, diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs index da5c8c5ac..c79257456 100644 --- a/src/dialect/mod.rs +++ b/src/dialect/mod.rs @@ -122,6 +122,12 @@ pub trait Dialect: Debug + Any { fn is_identifier_start(&self, ch: char) -> bool; /// Determine if a character is a valid unquoted identifier character fn is_identifier_part(&self, ch: char) -> bool; + + /// Most dialects do not have custom operators. Override this method to provide custom operators. + fn is_custom_operator_part(&self, _ch: char) -> bool { + false + } + /// Determine if the dialect supports escaping characters via '\' in string literals. 
/// /// Some dialects like BigQuery and Snowflake support this while others like diff --git a/src/dialect/postgresql.rs b/src/dialect/postgresql.rs index f179111e0..8ca64bb48 100644 --- a/src/dialect/postgresql.rs +++ b/src/dialect/postgresql.rs @@ -25,6 +25,10 @@ impl Dialect for PostgreSqlDialect { Some('"') } + fn is_delimited_identifier_start(&self, ch: char) -> bool { + ch == '"' // Postgres does not support backticks to quote identifiers + } + fn is_identifier_start(&self, ch: char) -> bool { // See https://www.postgresql.org/docs/11/sql-syntax-lexical.html#SQL-SYNTAX-IDENTIFIERS // We don't yet support identifiers beginning with "letters with @@ -36,6 +40,29 @@ impl Dialect for PostgreSqlDialect { ch.is_alphabetic() || ch.is_ascii_digit() || ch == '$' || ch == '_' } + /// See https://www.postgresql.org/docs/current/sql-createoperator.html + fn is_custom_operator_part(&self, ch: char) -> bool { + matches!( + ch, + '+' | '-' + | '*' + | '/' + | '<' + | '>' + | '=' + | '~' + | '!' + | '@' + | '#' + | '%' + | '^' + | '&' + | '|' + | '`' + | '?' 
+ ) + } + fn parse_statement(&self, parser: &mut Parser) -> Option<Result<Statement, ParserError>> { if parser.parse_keyword(Keyword::COMMENT) { Some(parse_comment(parser)) diff --git a/src/parser/mod.rs b/src/parser/mod.rs index c6750644c..5cb12a354 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -2300,9 +2300,8 @@ impl<'a> Parser<'a> { return infix; } - let tok = self.next_token(); - - let regular_binary_operator = match &tok.token { + let mut tok = self.next_token(); + let regular_binary_operator = match &mut tok.token { Token::Spaceship => Some(BinaryOperator::Spaceship), Token::DoubleEq => Some(BinaryOperator::Eq), Token::Eq => Some(BinaryOperator::Eq), @@ -2366,6 +2365,7 @@ impl<'a> Parser<'a> { Token::Question => Some(BinaryOperator::Question), Token::QuestionAnd => Some(BinaryOperator::QuestionAnd), Token::QuestionPipe => Some(BinaryOperator::QuestionPipe), + Token::CustomBinaryOperator(s) => Some(BinaryOperator::Custom(std::mem::take(s))), Token::Word(w) => match w.keyword { Keyword::AND => Some(BinaryOperator::And), @@ -2920,7 +2920,8 @@ impl<'a> Parser<'a> { | Token::AtAt | Token::Question | Token::QuestionAnd - | Token::QuestionPipe => Ok(Self::PG_OTHER_PREC), + | Token::QuestionPipe + | Token::CustomBinaryOperator(_) => Ok(Self::PG_OTHER_PREC), _ => Ok(0), } } diff --git a/src/tokenizer.rs b/src/tokenizer.rs index b6fed354d..f5aca8c35 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -231,6 +231,10 @@ pub enum Token { /// jsonb ?| text[] -> boolean: Check whether any member of the text array exists as top-level /// keys within the jsonb object QuestionPipe, + /// Custom binary operator + /// This is used to represent any custom binary operator that is not part of the SQL standard. + /// PostgreSQL allows defining custom binary operators using CREATE OPERATOR. 
+ CustomBinaryOperator(String), } impl fmt::Display for Token { @@ -320,6 +324,7 @@ impl fmt::Display for Token { Token::Question => write!(f, "?"), Token::QuestionAnd => write!(f, "?&"), Token::QuestionPipe => write!(f, "?|"), + Token::CustomBinaryOperator(s) => f.write_str(s), } } } @@ -999,26 +1004,32 @@ impl<'a> Tokenizer<'a> { '%' => { chars.next(); // advance past '%' match chars.peek() { - Some(' ') => Ok(Some(Token::Mod)), + Some(s) if s.is_whitespace() => Ok(Some(Token::Mod)), Some(sch) if self.dialect.is_identifier_start('%') => { self.tokenize_identifier_or_keyword([ch, *sch], chars) } - _ => Ok(Some(Token::Mod)), + _ => self.parse_custom_operator_or(chars, "%", Token::Mod), } } '|' => { chars.next(); // consume the '|' match chars.peek() { - Some('/') => self.consume_and_return(chars, Token::PGSquareRoot), + Some('/') => { + chars.next(); // consume the '/' + self.parse_custom_operator_or(chars, "|/", Token::PGSquareRoot) + } Some('|') => { chars.next(); // consume the second '|' match chars.peek() { - Some('/') => self.consume_and_return(chars, Token::PGCubeRoot), - _ => Ok(Some(Token::StringConcat)), + Some('/') => { + chars.next(); + self.parse_custom_operator_or(chars, "|", Token::PGCubeRoot) + } + _ => self.parse_custom_operator_or(chars, "|", Token::StringConcat), } } // Bitshift '|' operator - _ => Ok(Some(Token::Pipe)), + _ => self.parse_custom_operator_or(chars, "|", Token::Pipe), } } '=' => { @@ -1061,14 +1072,26 @@ impl<'a> Tokenizer<'a> { Some('=') => { chars.next(); match chars.peek() { - Some('>') => self.consume_and_return(chars, Token::Spaceship), - _ => Ok(Some(Token::LtEq)), + Some('>') => { + chars.next(); + self.parse_custom_operator_or(chars, "<=>", Token::Spaceship) + } + _ => self.parse_custom_operator_or(chars, "<=", Token::LtEq), } } - Some('>') => self.consume_and_return(chars, Token::Neq), - Some('<') => self.consume_and_return(chars, Token::ShiftLeft), - Some('@') => self.consume_and_return(chars, Token::ArrowAt), - _ => 
Ok(Some(Token::Lt)), + Some('>') => { + chars.next(); + self.parse_custom_operator_or(chars, "<>", Token::Neq) + } + Some('<') => { + chars.next(); + self.parse_custom_operator_or(chars, "<<", Token::ShiftLeft) + } + Some('@') => { + chars.next(); + self.parse_custom_operator_or(chars, "<@", Token::ArrowAt) + } + _ => self.parse_custom_operator_or(chars, "<", Token::Lt), } } '>' => { @@ -1094,9 +1117,12 @@ impl<'a> Tokenizer<'a> { '&' => { chars.next(); // consume the '&' match chars.peek() { - Some('&') => self.consume_and_return(chars, Token::Overlap), + Some('&') => { + chars.next(); // consume the second '&' + self.parse_custom_operator_or(chars, "&&", Token::Overlap) + } // Bitshift '&' operator - _ => Ok(Some(Token::Ampersand)), + _ => self.parse_custom_operator_or(chars, "&", Token::Ampersand), } } '^' => { @@ -1119,38 +1145,53 @@ impl<'a> Tokenizer<'a> { '~' => { chars.next(); // consume match chars.peek() { - Some('*') => self.consume_and_return(chars, Token::TildeAsterisk), + Some('*') => { + chars.next(); + self.parse_custom_operator_or(chars, "~*", Token::TildeAsterisk) + } Some('~') => { chars.next(); match chars.peek() { Some('*') => { - self.consume_and_return(chars, Token::DoubleTildeAsterisk) + chars.next(); + self.parse_custom_operator_or( + chars, + "~~*", + Token::DoubleTildeAsterisk, + ) } - _ => Ok(Some(Token::DoubleTilde)), + _ => self.parse_custom_operator_or(chars, "~~", Token::DoubleTilde), } } - _ => Ok(Some(Token::Tilde)), + _ => self.parse_custom_operator_or(chars, "~", Token::Tilde), } } '#' => { chars.next(); match chars.peek() { - Some('-') => self.consume_and_return(chars, Token::HashMinus), + Some('-') => { + chars.next(); + self.parse_custom_operator_or(chars, "#-", Token::HashMinus) + } Some('>') => { chars.next(); match chars.peek() { Some('>') => { chars.next(); - Ok(Some(Token::HashLongArrow)) + self.parse_custom_operator_or( + chars, + "#>>", + Token::HashLongArrow, + ) } - _ => Ok(Some(Token::HashArrow)), + _ => 
self.parse_custom_operator_or(chars, "#>", Token::HashArrow), } } Some(' ') => Ok(Some(Token::Sharp)), Some(sch) if self.dialect.is_identifier_start('#') => { self.tokenize_identifier_or_keyword([ch, *sch], chars) } - _ => Ok(Some(Token::Sharp)), + _ => self.parse_custom_operator_or(chars, "#", Token::Sharp), } } '@' => { @@ -1206,6 +1247,29 @@ impl<'a> Tokenizer<'a> { } } + fn parse_custom_operator_or( + &self, + chars: &mut State, + operator_start: &str, + default: Token, + ) -> Result<Option<Token>, TokenizerError> { + let mut s = operator_start.to_string(); + let mut is_custom_operator = false; + while let Some(&ch) = chars.peek() { + if !self.dialect.is_custom_operator_part(ch) { + break; + } + s.push(ch); + is_custom_operator = true; + chars.next(); + } + if is_custom_operator { + Ok(Some(Token::CustomBinaryOperator(s))) + } else { + Ok(Some(default)) + } + } + /// Tokenize dollar preceded value (i.e: a string/placeholder) fn tokenize_dollar_preceded_value(&self, chars: &mut State) -> Result<Token, TokenizerError> { let mut s = String::new(); diff --git a/tests/sqlparser_mssql.rs b/tests/sqlparser_mssql.rs index 5d61c6ab9..86d3990f6 100644 --- a/tests/sqlparser_mssql.rs +++ b/tests/sqlparser_mssql.rs @@ -437,6 +437,12 @@ fn parse_for_json_expect_ast() { ); } +#[test] +fn parse_ampersand_arobase() { + // In SQL Server, a&@b means (a) & (@b), in PostgreSQL it means (a) &@ (b) + ms().expr_parses_to("a&@b", "a & @b"); +} + #[test] fn parse_cast_varchar_max() { ms_and_generic().verified_expr("CAST('foo' AS VARCHAR(MAX))"); diff --git a/tests/sqlparser_postgres.rs b/tests/sqlparser_postgres.rs index 677246a51..aee8bdef0 100644 --- a/tests/sqlparser_postgres.rs +++ b/tests/sqlparser_postgres.rs @@ -1757,6 +1757,29 @@ fn parse_pg_returning() { }; } +fn test_operator(operator: &str, dialect: &TestedDialects, expected: BinaryOperator) { + let operator_tokens = + sqlparser::tokenizer::Tokenizer::new(&PostgreSqlDialect {}, &format!("a{operator}b")) + .tokenize() + .unwrap(); + assert_eq!( + 
operator_tokens.len(), + 3, + "binary op should be 3 tokens, not {operator_tokens:?}" + ); + let expected_expr = Expr::BinaryOp { + left: Box::new(Expr::Identifier(Ident::new("a"))), + op: expected, + right: Box::new(Expr::Identifier(Ident::new("b"))), + }; + let str_expr_canonical = format!("a {operator} b"); + assert_eq!(expected_expr, dialect.verified_expr(&str_expr_canonical)); + assert_eq!( + expected_expr, + dialect.expr_parses_to(&format!("a{operator}b"), &str_expr_canonical) + ); +} + #[test] fn parse_pg_binary_ops() { let binary_ops = &[ @@ -1770,18 +1793,50 @@ fn parse_pg_binary_ops() { ]; for (str_op, op, dialects) in binary_ops { - let select = dialects.verified_only_select(&format!("SELECT a {} b", &str_op)); - assert_eq!( - SelectItem::UnnamedExpr(Expr::BinaryOp { - left: Box::new(Expr::Identifier(Ident::new("a"))), - op: op.clone(), - right: Box::new(Expr::Identifier(Ident::new("b"))), - }), - select.projection[0] - ); + test_operator(str_op, dialects, op.clone()); + } +} + +#[test] +fn parse_pg_custom_binary_ops() { + // Postgres supports declaring custom binary operators, using any character in the following set: + // + - * / < > = ~ ! @ # % ^ & | ` ? + + // Here, we test the ones used by common extensions + let operators = [ + // PostGIS + "&&&", // n-D bounding boxes intersect + "&<", // (is strictly to the left of) + "&>", // (is strictly to the right of) + "|=|", // distance between A and B trajectories at their closest point of approach + "<<#>>", // n-D distance between A and B bounding boxes + "|>>", // A's bounding box is strictly above B's. 
+ "~=", // bounding box is the same + // PGroonga + "&@", // Full text search by a keyword + "&@~", // Full text search by easy to use query language + "&@*", // Similar search + "&`", // Advanced search by ECMAScript like query language + "&@|", // Full text search by an array of keywords + "&@~|", // Full text search by an array of queries in easy to use query language + // pgtrgm + "<<%", // second argument has a continuous extent of an ordered trigram set that matches word boundaries + "%>>", // commutator of <<% + "<<<->", // distance between arguments + // hstore + "#=", // Replace fields with matching values from hstore + ]; + for op in &operators { + test_operator(op, &pg(), BinaryOperator::Custom(op.to_string())); } } +#[test] +fn parse_ampersand_arobase() { + // In SQL Server, a&@b means (a) & (@b), in PostgreSQL it means (a) &@ (b) + pg().expr_parses_to("a&@b", "a &@ b"); +} + #[test] fn parse_pg_unary_ops() { let pg_unary_ops = &[ From 28a8c461569f7320523df390c532497fe74f77e9 Mon Sep 17 00:00:00 2001 From: lovasoa Date: Thu, 6 Jun 2024 23:06:40 +0200 Subject: [PATCH 2/8] more concise method names --- src/tokenizer.rs | 92 +++++++++++++++++++----------------------------- 1 file changed, 36 insertions(+), 56 deletions(-) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index f5aca8c35..ce1ebd18d 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1008,28 +1008,24 @@ impl<'a> Tokenizer<'a> { Some(sch) if self.dialect.is_identifier_start('%') => { self.tokenize_identifier_or_keyword([ch, *sch], chars) } - _ => self.parse_custom_operator_or(chars, "%", Token::Mod), + _ => self.start_binop(chars, "%", Token::Mod), } } '|' => { chars.next(); // consume the '|' match chars.peek() { - Some('/') => { - chars.next(); // consume the '/' - self.parse_custom_operator_or(chars, "|/", Token::PGSquareRoot) - } + Some('/') => self.consume_for_binop(chars, "|/", Token::PGSquareRoot), Some('|') => { chars.next(); // consume the second '|' match chars.peek() { 
Some('/') => { - chars.next(); - self.parse_custom_operator_or(chars, "|", Token::PGCubeRoot) + self.consume_for_binop(chars, "||/", Token::PGCubeRoot) } - _ => self.parse_custom_operator_or(chars, "|", Token::StringConcat), + _ => self.start_binop(chars, "||", Token::StringConcat), } } // Bitshift '|' operator - _ => self.parse_custom_operator_or(chars, "|", Token::Pipe), + _ => self.start_binop(chars, "|", Token::Pipe), } } '=' => { @@ -1072,26 +1068,14 @@ impl<'a> Tokenizer<'a> { Some('=') => { chars.next(); match chars.peek() { - Some('>') => { - chars.next(); - self.parse_custom_operator_or(chars, "<=>", Token::Spaceship) - } - _ => self.parse_custom_operator_or(chars, "<=", Token::LtEq), + Some('>') => self.consume_for_binop(chars, "<=>", Token::Spaceship), + _ => self.start_binop(chars, "<=", Token::LtEq), } } - Some('>') => { - chars.next(); - self.parse_custom_operator_or(chars, "<>", Token::Neq) - } - Some('<') => { - chars.next(); - self.parse_custom_operator_or(chars, "<<", Token::ShiftLeft) - } - Some('@') => { - chars.next(); - self.parse_custom_operator_or(chars, "<@", Token::ArrowAt) - } - _ => self.parse_custom_operator_or(chars, "<", Token::Lt), + Some('>') => self.consume_for_binop(chars, "<>", Token::Neq), + Some('<') => self.consume_for_binop(chars, "<<", Token::ShiftLeft), + Some('@') => self.consume_for_binop(chars, "<@", Token::ArrowAt), + _ => self.start_binop(chars, "<", Token::Lt), } } '>' => { @@ -1119,10 +1103,10 @@ impl<'a> Tokenizer<'a> { match chars.peek() { Some('&') => { chars.next(); // consume the second '&' - self.parse_custom_operator_or(chars, "&&", Token::Overlap) + self.start_binop(chars, "&&", Token::Overlap) } // Bitshift '&' operator - _ => self.parse_custom_operator_or(chars, "&", Token::Ampersand), + _ => self.start_binop(chars, "&", Token::Ampersand), } } '^' => { @@ -1145,53 +1129,37 @@ impl<'a> Tokenizer<'a> { '~' => { chars.next(); // consume match chars.peek() { - Some('*') => { - chars.next(); - 
self.parse_custom_operator_or(chars, "~*", Token::TildeAsterisk) - } + Some('*') => self.consume_for_binop(chars, "~*", Token::TildeAsterisk), Some('~') => { chars.next(); match chars.peek() { Some('*') => { - chars.next(); - self.parse_custom_operator_or( - chars, - "~~*", - Token::DoubleTildeAsterisk, - ) + self.consume_for_binop(chars, "~~*", Token::DoubleTildeAsterisk) } - _ => self.parse_custom_operator_or(chars, "~~", Token::DoubleTilde), + _ => self.start_binop(chars, "~~", Token::DoubleTilde), } } - _ => self.parse_custom_operator_or(chars, "~", Token::Tilde), + _ => self.start_binop(chars, "~", Token::Tilde), } } '#' => { chars.next(); match chars.peek() { - Some('-') => { - chars.next(); - self.parse_custom_operator_or(chars, "#-", Token::HashMinus) - } + Some('-') => self.consume_for_binop(chars, "#-", Token::HashMinus), Some('>') => { chars.next(); match chars.peek() { Some('>') => { - chars.next(); - self.parse_custom_operator_or( - chars, - "#>>", - Token::HashLongArrow, - ) + self.consume_for_binop(chars, "#>>", Token::HashLongArrow) } - _ => self.parse_custom_operator_or(chars, "#>", Token::HashArrow), + _ => self.start_binop(chars, "#>", Token::HashArrow), } } Some(' ') => Ok(Some(Token::Sharp)), Some(sch) if self.dialect.is_identifier_start('#') => { self.tokenize_identifier_or_keyword([ch, *sch], chars) } - _ => self.parse_custom_operator_or(chars, "#", Token::Sharp), + _ => self.start_binop(chars, "#", Token::Sharp), } } '@' => { @@ -1247,13 +1215,25 @@ impl<'a> Tokenizer<'a> { } } - fn parse_custom_operator_or( + /// Consume the next character, then parse a custom binary operator. 
The next character should be included in the prefix + fn consume_for_binop( + &self, + chars: &mut State, + prefix: &str, + default: Token, + ) -> Result<Option<Token>, TokenizerError> { + chars.next(); // consume the first char + self.start_binop(chars, prefix, default) + } + + /// parse a custom binary operator + fn start_binop( &self, chars: &mut State, - operator_start: &str, + prefix: &str, default: Token, ) -> Result<Option<Token>, TokenizerError> { - let mut s = operator_start.to_string(); + let mut s = prefix.to_string(); let mut is_custom_operator = false; while let Some(&ch) = chars.peek() { if !self.dialect.is_custom_operator_part(ch) { From 9c9404f9878594ca8e3666c3c7bfee1989a4420c Mon Sep 17 00:00:00 2001 From: lovasoa Date: Thu, 6 Jun 2024 23:08:20 +0200 Subject: [PATCH 3/8] fix for nostd --- src/parser/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 5cb12a354..83e2bed39 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -2365,7 +2365,7 @@ impl<'a> Parser<'a> { Token::Question => Some(BinaryOperator::Question), Token::QuestionAnd => Some(BinaryOperator::QuestionAnd), Token::QuestionPipe => Some(BinaryOperator::QuestionPipe), - Token::CustomBinaryOperator(s) => Some(BinaryOperator::Custom(std::mem::take(s))), + Token::CustomBinaryOperator(s) => Some(BinaryOperator::Custom(core::mem::take(s))), Token::Word(w) => match w.keyword { Keyword::AND => Some(BinaryOperator::And), From 8e590296b5df0eb35f70c442c354ee707c1d751f Mon Sep 17 00:00:00 2001 From: lovasoa Date: Thu, 6 Jun 2024 23:09:50 +0200 Subject: [PATCH 4/8] fix doc --- src/dialect/postgresql.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dialect/postgresql.rs b/src/dialect/postgresql.rs index 8ca64bb48..0e04bfa27 100644 --- a/src/dialect/postgresql.rs +++ b/src/dialect/postgresql.rs @@ -40,7 +40,7 @@ impl Dialect for PostgreSqlDialect { ch.is_alphabetic() || ch.is_ascii_digit() || ch == '$' || ch == '_' } - /// See 
https://www.postgresql.org/docs/current/sql-createoperator.html + /// See <https://www.postgresql.org/docs/current/sql-createoperator.html> fn is_custom_operator_part(&self, ch: char) -> bool { matches!( ch, From b99ee76e4e4141e841271c82802d49c9ee17e317 Mon Sep 17 00:00:00 2001 From: lovasoa Date: Thu, 6 Jun 2024 23:17:30 +0200 Subject: [PATCH 5/8] add support for custom operators starting with a minus sign --- src/tokenizer.rs | 9 +++------ tests/sqlparser_postgres.rs | 2 ++ 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index ce1ebd18d..82fe9ec14 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -966,15 +966,12 @@ impl<'a> Tokenizer<'a> { Some('>') => { chars.next(); match chars.peek() { - Some('>') => { - chars.next(); - Ok(Some(Token::LongArrow)) - } - _ => Ok(Some(Token::Arrow)), + Some('>') => self.consume_for_binop(chars, "->>", Token::LongArrow), + _ => self.start_binop(chars, "->", Token::Arrow), } } // a regular '-' operator - _ => Ok(Some(Token::Minus)), + _ => self.start_binop(chars, "-", Token::Minus), } } '/' => { diff --git a/tests/sqlparser_postgres.rs b/tests/sqlparser_postgres.rs index aee8bdef0..76e1831fb 100644 --- a/tests/sqlparser_postgres.rs +++ b/tests/sqlparser_postgres.rs @@ -1825,6 +1825,8 @@ fn parse_pg_custom_binary_ops() { "<<<->", // distance between arguments // hstore "#=", // Replace fields with matching values from hstore + // ranges + "-|-", // Is adjacent to ]; for op in &operators { test_operator(op, &pg(), BinaryOperator::Custom(op.to_string())); From d11a24e41cec0e3b79c5288ef791af4f0f73accc Mon Sep 17 00:00:00 2001 From: lovasoa Date: Thu, 6 Jun 2024 23:35:00 +0200 Subject: [PATCH 6/8] add support for operators starting with '>' --- src/tokenizer.rs | 6 +++--- tests/sqlparser_postgres.rs | 21 +++++++++++++++++++++ 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 82fe9ec14..3dd7f8789 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1078,9 +1078,9 @@ impl<'a> Tokenizer<'a> 
{ '>' => { chars.next(); // consume match chars.peek() { - Some('=') => self.consume_and_return(chars, Token::GtEq), - Some('>') => self.consume_and_return(chars, Token::ShiftRight), - _ => Ok(Some(Token::Gt)), + Some('=') => self.consume_for_binop(chars, ">=", Token::GtEq), + Some('>') => self.consume_for_binop(chars, ">>", Token::ShiftRight), + _ => self.start_binop(chars, ">", Token::Gt), } } ':' => { diff --git a/tests/sqlparser_postgres.rs b/tests/sqlparser_postgres.rs index 76e1831fb..d1218f283 100644 --- a/tests/sqlparser_postgres.rs +++ b/tests/sqlparser_postgres.rs @@ -1827,6 +1827,27 @@ fn parse_pg_custom_binary_ops() { "#=", // Replace fields with matching values from hstore // ranges "-|-", // Is adjacent to + // pg_similarity + "~++", // L1 distance + "~##", // Cosine Distance + "~-~", // Dice Coefficient + "~!!", // Euclidean Distance + "~@~", // Hamming Distance + "~??", // Jaccard Coefficient + "~%%", // Jaro Distance + "~@@", // Jaro-Winkler Distance + "~==", // Levenshtein Distance + "~^^", // Matching Coefficient + "~||", // Monge-Elkan Coefficient + "~#~", // Needleman-Wunsch Coefficient + "~**", // Overlap Coefficient + "~~~", // Q-Gram Distance + "~=~", // Smith-Waterman Coefficient + "~!~", // Smith-Waterman-Gotoh Coefficient + "~*~", // Soundex Distance + // soundex_operator + ">@@<", // Soundex matches + "<@@>", // Soundex doesn't match ]; for op in &operators { test_operator(op, &pg(), BinaryOperator::Custom(op.to_string())); From 487df8cf3a9f474aeb1a50966a34c445cfebbfc0 Mon Sep 17 00:00:00 2001 From: Ophir LOJKINE Date: Fri, 7 Jun 2024 06:27:53 +0200 Subject: [PATCH 7/8] performance Co-authored-by: Joey Hain --- src/tokenizer.rs | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 3dd7f8789..43d9dd23a 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1230,21 +1230,17 @@ impl<'a> Tokenizer<'a> { prefix: &str, default: Token, ) -> Result<Option<Token>, TokenizerError> { - let 
mut s = prefix.to_string(); - let mut is_custom_operator = false; + let mut custom = None; while let Some(&ch) = chars.peek() { if !self.dialect.is_custom_operator_part(ch) { break; } - s.push(ch); - is_custom_operator = true; + + custom.get_or_insert_with(|| prefix.to_string()).push(ch); chars.next(); } - if is_custom_operator { - Ok(Some(Token::CustomBinaryOperator(s))) - } else { - Ok(Some(default)) - } + + Ok(Some(custom.map(Token::CustomBinaryOperator).unwrap_or(default))) } /// Tokenize dollar preceded value (i.e: a string/placeholder) From d1231c1af2506da21d1f8d07bd87c46d50abfbaf Mon Sep 17 00:00:00 2001 From: lovasoa Date: Fri, 7 Jun 2024 06:31:28 +0200 Subject: [PATCH 8/8] cargo fmt --- src/tokenizer.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 43d9dd23a..bcc5478bc 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1240,7 +1240,9 @@ impl<'a> Tokenizer<'a> { chars.next(); } - Ok(Some(custom.map(Token::CustomBinaryOperator).unwrap_or(default))) + Ok(Some( + custom.map(Token::CustomBinaryOperator).unwrap_or(default), + )) } /// Tokenize dollar preceded value (i.e: a string/placeholder)