Skip to content

Commit b61c35c

Browse files
committed
add split_constant_prefix
1 parent 2ec4dc3 commit b61c35c

File tree

1 file changed

+30
-46
lines changed

1 file changed

+30
-46
lines changed

datafusion/physical-optimizer/src/pruning.rs

Lines changed: 30 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1711,11 +1711,11 @@ fn build_like_match(
17111711
Some(combined)
17121712
}
17131713

1714-
// For predicate `col NOT LIKE 'foo%'`, we rewrite it as `(col_min NOT LIKE 'foo%' OR col_max NOT LIKE 'foo%')`. If both col_min and col_max have the prefix foo, we skip the entire row group (as we can be certain that all data in this row group has the prefix foo).
1714+
// For predicate `col NOT LIKE 'const_prefix%'`, we rewrite it as `(col_min NOT LIKE 'const_prefix%' OR col_max NOT LIKE 'const_prefix%')`. If both col_min and col_max have the prefix const_prefix, we skip the entire row group (as we can be certain that all data in this row group has the prefix const_prefix).
17151715
fn build_not_like_match(
17161716
expr_builder: &mut PruningExpressionBuilder<'_>,
17171717
) -> Result<Arc<dyn PhysicalExpr>> {
1718-
// col NOT LIKE 'prefix%' -> !(col_min LIKE 'prefix%' && col_max LIKE 'prefix%') -> (col_min NOT LIKE 'prefix%' || col_max NOT LIKE 'prefix%')
1718+
// col NOT LIKE 'const_prefix%' -> !(col_min LIKE 'const_prefix%' && col_max LIKE 'const_prefix%') -> (col_min NOT LIKE 'const_prefix%' || col_max NOT LIKE 'const_prefix%')
17191719

17201720
let min_column_expr = expr_builder.min_column_expr()?;
17211721
let max_column_expr = expr_builder.max_column_expr()?;
@@ -1726,27 +1726,21 @@ fn build_not_like_match(
17261726
plan_datafusion_err!("cannot extract literal from NOT LIKE expression")
17271727
})?;
17281728

1729-
let chars: Vec<char> = pattern.chars().collect();
1730-
for i in 0..chars.len() - 1 {
1731-
// Check if current char is a wildcard and is not escaped with backslash
1732-
if (chars[i] == '%' || chars[i] == '_') && (i == 0 || chars[i - 1] != '\\') {
1733-
// Example: For pattern "foo%bar", the row group might include values like
1734-
// ["foobar", "food", "foodbar"], making it unsafe to prune.
1735-
// Even if the min/max values in the group (e.g., "foobar" and "foodbar")
1736-
// match the pattern, intermediate values like "food" may not
1737-
// match the full pattern "foo%bar", making pruning unsafe.
1738-
// (truncate foo%bar to foo% have same problem)
1739-
return Err(plan_datafusion_err!(
1740-
"NOT LIKE expressions with unescaped wildcards ('%' or '_') at the beginning or middle of the pattern are not supported"
1741-
));
1742-
}
1743-
}
1744-
1745-
if chars.last() == Some(&'_') && (chars.len() > 1 && chars[chars.len() - 2] != '\\') {
1729+
let (const_prefix, remaining) = split_constant_prefix(pattern);
1730+
if const_prefix.is_empty() || remaining != "%" {
1731+
// we cannot handle `%` in the middle or at the beginning of the pattern
1732+
// Example: For pattern "foo%bar", the row group might include values like
1733+
// ["foobar", "food", "foodbar"], making it unsafe to prune.
1734+
// Even if the min/max values in the group (e.g., "foobar" and "foodbar")
1735+
// match the pattern, intermediate values like "food" may not
1736+
// match the full pattern "foo%bar", making pruning unsafe.
1737+
// (truncating foo%bar to foo% has the same problem)
1738+
1739+
// we can not handle pattern with `_`
17461740
// Example: For pattern "foo_", row groups might contain ["fooa", "fooaa", "foob"],
17471741
// which means not every row is guaranteed to match the pattern.
17481742
return Err(plan_datafusion_err!(
1749-
"NOT LIKE expressions with unescaped '_' at the end of the pattern are not supported"
1743+
"NOT LIKE expressions only support constant_prefix+wildcard`%`"
17501744
));
17511745
}
17521746

@@ -1771,6 +1765,22 @@ fn build_not_like_match(
17711765
)))
17721766
}
17731767

1768+
/// Splits a LIKE pattern at its first unescaped wildcard (`%` or `_`).
///
/// Returns `(constant_prefix, remaining)`, where `constant_prefix` is the part
/// of `pattern` before the first unescaped wildcard and `remaining` starts at
/// that wildcard. Either part may be empty; if the pattern contains no
/// unescaped wildcard the result is `(pattern, "")`.
///
/// The prefix is returned verbatim: escape backslashes are NOT removed, so
/// `foo\%bar%` yields the prefix `foo\%bar` and the remainder `%`.
///
/// A backslash escapes the single character that follows it, including
/// another backslash. Consequently the `%` in `a\\%` is an unescaped wildcard
/// (the two backslashes form one escaped backslash), while the first `%` in
/// `a\\\%%` is escaped.
/// NOTE(review): this assumes the LIKE evaluator treats `\\` as an escaped
/// backslash; confirm against the LIKE kernel in use. If the evaluator only
/// recognises `\%` and `\_`, this split is still at or before the evaluator's
/// first wildcard, so callers can only become more conservative.
fn split_constant_prefix(pattern: &str) -> (&str, &str) {
    // True when the previous character was an unescaped `\`, meaning the
    // current character is consumed by that escape.
    let mut escaped = false;
    for (idx, ch) in pattern.char_indices() {
        if escaped {
            escaped = false;
        } else if ch == '\\' {
            escaped = true;
        } else if ch == '%' || ch == '_' {
            // `idx` comes from `char_indices`, so it is a valid char boundary.
            return pattern.split_at(idx);
        }
    }
    (pattern, "")
}
1783+
17741784
/// Increment a UTF8 string by one, returning `None` if it can't be incremented.
17751785
/// This makes it so that the returned string will always compare greater than the input string
17761786
/// or any other string with the same prefix.
@@ -4210,32 +4220,6 @@ mod tests {
42104220
];
42114221
prune_with_expr(expr, &schema, &statistics, expected_ret);
42124222

4213-
let expr = col("s1").not_like(lit("M"));
4214-
#[rustfmt::skip]
4215-
let expected_ret = &[
4216-
// s1 ["A", "Z"] ==> some rows could pass (must keep)
4217-
true,
4218-
// s1 ["A", "L"] ==> some rows could pass (must keep)
4219-
true,
4220-
// s1 ["N", "Z"] ==> some rows could pass (must keep)
4221-
true,
4222-
// s1 ["M", "M"] ==> no row match
4223-
false,
4224-
// s1 [NULL, NULL] ==> unknown (must keep)
4225-
true,
4226-
// s1 ["A", NULL] ==> some rows could pass (must keep)
4227-
true,
4228-
// s1 ["", "A"] ==> some rows could pass (must keep)
4229-
true,
4230-
// s1 ["", ""] ==> some rows could pass (must keep)
4231-
true,
4232-
// s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
4233-
true,
4234-
// s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
4235-
true,
4236-
];
4237-
prune_with_expr(expr, &schema, &statistics, expected_ret);
4238-
42394223
let expr = col("s1").not_like(lit("A\\%%"));
42404224
let statistics = TestStatistics::new().with(
42414225
"s1",

0 commit comments

Comments
 (0)