From a373693dd001270297356f0649f980a7cb23651e Mon Sep 17 00:00:00 2001 From: Jagdish Parihar Date: Tue, 14 Jan 2025 11:32:56 +0530 Subject: [PATCH 1/5] Implemented `simplify` for the `starts_with` function to convert it into a LIKE expression, enabling predicate pruning optimization. --- .../functions/src/string/starts_with.rs | 26 +++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/datafusion/functions/src/string/starts_with.rs b/datafusion/functions/src/string/starts_with.rs index 229982a9616a..0d3c20774201 100644 --- a/datafusion/functions/src/string/starts_with.rs +++ b/datafusion/functions/src/string/starts_with.rs @@ -20,10 +20,11 @@ use std::sync::Arc; use arrow::array::ArrayRef; use arrow::datatypes::DataType; +use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo}; use crate::utils::make_scalar_function; -use datafusion_common::{internal_err, Result}; -use datafusion_expr::{ColumnarValue, Documentation}; +use datafusion_common::{internal_err, Result, ScalarValue}; +use datafusion_expr::{ColumnarValue, Documentation, Expr, Like}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use datafusion_macros::user_doc; @@ -98,6 +99,27 @@ impl ScalarUDFImpl for StartsWithFunc { } } + fn simplify( + &self, + args: Vec, + _info: &dyn SimplifyInfo, + ) -> Result { + if let Expr::Literal(ScalarValue::Utf8(Some(pattern))) = &args[1] { + // Convert starts_with (col, 'prefix') to col LIKE 'prefix%' + let like_pattern = format!("{}%", pattern); + + return Ok(ExprSimplifyResult::Simplified(Expr::Like(Like { + negated: false, + expr: Box::new(args[0].clone()), + pattern: Box::new(Expr::Literal(ScalarValue::Utf8(Some(like_pattern)))), + escape_char: None, + case_insensitive: false, + }))); + } + + Ok(ExprSimplifyResult::Original(args)) + } + fn documentation(&self) -> Option<&Documentation> { self.doc() } From b4c1f3702247f0e3983425ce06f2a8c4c4f00775 Mon Sep 17 00:00:00 2001 From: Jagdish Parihar Date: Sun, 19 Jan 2025 09:35:38 +0530 Subject: [PATCH 2/5] fix: escape special characters in starts_with to LIKE conversion --- datafusion/functions/src/string/starts_with.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/datafusion/functions/src/string/starts_with.rs b/datafusion/functions/src/string/starts_with.rs index 0d3c20774201..2cab5c3ab88d 100644 --- a/datafusion/functions/src/string/starts_with.rs +++ b/datafusion/functions/src/string/starts_with.rs @@ -105,8 +105,13 @@ impl ScalarUDFImpl for StartsWithFunc { _info: &dyn SimplifyInfo, ) -> Result { if let Expr::Literal(ScalarValue::Utf8(Some(pattern))) = &args[1] { - // Convert starts_with (col, 'prefix') to col LIKE 'prefix%' - let like_pattern = format!("{}%", pattern); + // Convert starts_with(col, 'prefix') to col LIKE 'prefix%' with proper escaping + // Example: starts_with(col, 'ja%') -> col LIKE 'ja\%%' + // 1. 'ja%' (input pattern) + // 2. 'ja\%' (escape special char '%') + // 3. 'ja\%%' (add suffix for starts_with) + let escaped_pattern = pattern.replace("%", "\\%"); + let like_pattern = format!("{}%", escaped_pattern); return Ok(ExprSimplifyResult::Simplified(Expr::Like(Like { negated: false, From 90b0cadf3e308f9e90b6e15cd180ac8d7a152acd Mon Sep 17 00:00:00 2001 From: Jagdish Parihar Date: Sun, 19 Jan 2025 20:54:49 +0530 Subject: [PATCH 3/5] updated simply function to handle utf8, largeutf8 and utf8view data type. and updated the coresponding test --- .../functions/src/string/starts_with.rs | 24 +++++++++++++++---- .../test_files/string/string_view.slt | 4 ++-- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/datafusion/functions/src/string/starts_with.rs b/datafusion/functions/src/string/starts_with.rs index 2cab5c3ab88d..74d0fbdc4033 100644 --- a/datafusion/functions/src/string/starts_with.rs +++ b/datafusion/functions/src/string/starts_with.rs @@ -104,19 +104,35 @@ impl ScalarUDFImpl for StartsWithFunc { args: Vec, _info: &dyn SimplifyInfo, ) -> Result { - if let Expr::Literal(ScalarValue::Utf8(Some(pattern))) = &args[1] { + if let Expr::Literal(scalar_value) = &args[1] { // Convert starts_with(col, 'prefix') to col LIKE 'prefix%' with proper escaping // Example: starts_with(col, 'ja%') -> col LIKE 'ja\%%' // 1. 'ja%' (input pattern) // 2. 'ja\%' (escape special char '%') // 3. 'ja\%%' (add suffix for starts_with) - let escaped_pattern = pattern.replace("%", "\\%"); - let like_pattern = format!("{}%", escaped_pattern); + let like_expr = match scalar_value { + ScalarValue::Utf8(Some(pattern)) => { + let escaped_pattern = pattern.replace("%", "\\%"); + let like_pattern = format!("{}%", escaped_pattern); + Expr::Literal(ScalarValue::Utf8(Some(like_pattern))) + } + ScalarValue::LargeUtf8(Some(pattern)) => { + let escaped_pattern = pattern.replace("%", "\\%"); + let like_pattern = format!("{}%", escaped_pattern); + Expr::Literal(ScalarValue::LargeUtf8(Some(like_pattern))) + } + ScalarValue::Utf8View(Some(pattern)) => { + let escaped_pattern = pattern.replace("%", "\\%"); + let like_pattern = format!("{}%", escaped_pattern); + Expr::Literal(ScalarValue::Utf8View(Some(like_pattern))) + } + _ => return Ok(ExprSimplifyResult::Original(args)), + }; return Ok(ExprSimplifyResult::Simplified(Expr::Like(Like { negated: false, expr: Box::new(args[0].clone()), - pattern: Box::new(Expr::Literal(ScalarValue::Utf8(Some(like_pattern)))), + pattern: Box::new(like_expr), escape_char: None, case_insensitive: false, }))); diff --git a/datafusion/sqllogictest/test_files/string/string_view.slt b/datafusion/sqllogictest/test_files/string/string_view.slt index c37dd1ed3b4f..7a8fdc704c94 100644 --- a/datafusion/sqllogictest/test_files/string/string_view.slt +++ b/datafusion/sqllogictest/test_files/string/string_view.slt @@ -344,7 +344,7 @@ EXPLAIN SELECT FROM test; ---- logical_plan -01)Projection: starts_with(test.column1_utf8view, Utf8View("äöüß")) AS c1, starts_with(test.column1_utf8view, Utf8View("")) AS c2, starts_with(test.column1_utf8view, Utf8View(NULL)) AS c3, starts_with(Utf8View(NULL), test.column1_utf8view) AS c4 +01)Projection: test.column1_utf8view LIKE Utf8View("äöüß%") AS c1, CASE test.column1_utf8view IS NOT NULL WHEN Boolean(true) THEN Boolean(true) END AS c2, starts_with(test.column1_utf8view, Utf8View(NULL)) AS c3, starts_with(Utf8View(NULL), test.column1_utf8view) AS c4 02)--TableScan: test projection=[column1_utf8view] query TT @@ -887,7 +887,7 @@ EXPLAIN SELECT FROM test; ---- logical_plan -01)Projection: starts_with(test.column1_utf8view, Utf8View("foo")) AS c, starts_with(test.column1_utf8view, test.column2_utf8view) AS c2 +01)Projection: test.column1_utf8view LIKE Utf8View("foo%") AS c, starts_with(test.column1_utf8view, test.column2_utf8view) AS c2 02)--TableScan: test projection=[column1_utf8view, column2_utf8view] ## Ensure no casts for TRANSLATE From 66dfac3dca2502e8e22eec68353c09fa4a5e31dd Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 22 Jan 2025 16:19:53 -0500 Subject: [PATCH 4/5] Add some more tests --- .../test_files/string/string_view.slt | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/datafusion/sqllogictest/test_files/string/string_view.slt b/datafusion/sqllogictest/test_files/string/string_view.slt index 7a8fdc704c94..435b4bc3c5a8 100644 --- a/datafusion/sqllogictest/test_files/string/string_view.slt +++ b/datafusion/sqllogictest/test_files/string/string_view.slt @@ -347,6 +347,48 @@ logical_plan 01)Projection: test.column1_utf8view LIKE Utf8View("äöüß%") AS c1, CASE test.column1_utf8view IS NOT NULL WHEN Boolean(true) THEN Boolean(true) END AS c2, starts_with(test.column1_utf8view, Utf8View(NULL)) AS c3, starts_with(Utf8View(NULL), test.column1_utf8view) AS c4 02)--TableScan: test projection=[column1_utf8view] +## Test STARTS_WITH is rewitten to LIKE when the pattern is a constant +query TT +EXPLAIN SELECT + STARTS_WITH(column1_utf8, 'foo%') as c1, + STARTS_WITH(column1_large_utf8, 'foo%') as c2, + STARTS_WITH(column1_utf8view, 'foo%') as c3, + STARTS_WITH(column1_utf8, 'f_o') as c4, + STARTS_WITH(column1_large_utf8, 'f_o') as c5, + STARTS_WITH(column1_utf8view, 'f_o') as c6 +FROM test; +---- +logical_plan +01)Projection: test.column1_utf8 LIKE Utf8("foo\%%") AS c1, test.column1_large_utf8 LIKE LargeUtf8("foo\%%") AS c2, test.column1_utf8view LIKE Utf8View("foo\%%") AS c3, test.column1_utf8 LIKE Utf8("f_o%") AS c4, test.column1_large_utf8 LIKE LargeUtf8("f_o%") AS c5, test.column1_utf8view LIKE Utf8View("f_o%") AS c6 +02)--TableScan: test projection=[column1_utf8, column1_large_utf8, column1_utf8view] + +## Test STARTS_WITH works with column arguments +query TT +EXPLAIN SELECT + STARTS_WITH(column1_utf8, substr(column1_utf8, 1, 2)) as c1, + STARTS_WITH(column1_large_utf8, substr(column1_large_utf8, 1, 2)) as c2, + STARTS_WITH(column1_utf8view, substr(column1_utf8view, 1, 2)) as c3 +FROM test; +---- +logical_plan +01)Projection: starts_with(test.column1_utf8, substr(test.column1_utf8, Int64(1), Int64(2))) AS c1, starts_with(test.column1_large_utf8, substr(test.column1_large_utf8, Int64(1), Int64(2))) AS c2, starts_with(test.column1_utf8view, substr(test.column1_utf8view, Int64(1), Int64(2))) AS c3 +02)--TableScan: test projection=[column1_utf8, column1_large_utf8, column1_utf8view] + +query BBB +SELECT + STARTS_WITH(column1_utf8, substr(column1_utf8, 1, 2)) as c1, + STARTS_WITH(column1_large_utf8, substr(column1_large_utf8, 1, 2)) as c2, + STARTS_WITH(column1_utf8view, substr(column1_utf8view, 1, 2)) as c3 +FROM test; +---- +true true true +true true true +true true true +true true true +NULL NULL NULL + + +# Ensure that INIT cap works with utf8view query TT EXPLAIN SELECT INITCAP(column1_utf8view) as c From 5829d46a0f99e735619b009ff811cc654da4f281 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 22 Jan 2025 16:34:51 -0500 Subject: [PATCH 5/5] Add pruning test --- .../sqllogictest/test_files/parquet.slt | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/datafusion/sqllogictest/test_files/parquet.slt b/datafusion/sqllogictest/test_files/parquet.slt index 0faa7bd6b0bf..28f1a6a3780d 100644 --- a/datafusion/sqllogictest/test_files/parquet.slt +++ b/datafusion/sqllogictest/test_files/parquet.slt @@ -598,3 +598,34 @@ drop table cpu; statement ok drop table cpu_parquet; + +# Test for parquet predicate pruning with `starts_with` function +query I +copy (values ('foo'), ('bar'), ('baz')) TO 'test_files/scratch/parquet/foo.parquet' +---- +3 + +statement ok +create external table foo +stored as parquet +location 'test_files/scratch/parquet/foo.parquet'; + + +# Expect that the pruning predicate contain a comparison on the min/max value of `column1): +# column1_min@0 <= g AND f <= column1_max@1` +# (the starts_with function is not supported in the parquet predicate pruning but DataFusion rewrites +# it to a like which is then handled by the PruningPredicate) +query TT +explain select * from foo where starts_with(column1, 'f'); +---- +logical_plan +01)Filter: foo.column1 LIKE Utf8View("f%") +02)--TableScan: foo projection=[column1], partial_filters=[foo.column1 LIKE Utf8View("f%")] +physical_plan +01)CoalesceBatchesExec: target_batch_size=8192 +02)--FilterExec: column1@0 LIKE f% +03)----RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +04)------ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/foo.parquet]]}, projection=[column1], predicate=column1@0 LIKE f%, pruning_predicate=column1_null_count@2 != column1_row_count@3 AND column1_min@0 <= g AND f <= column1_max@1, required_guarantees=[] + +statement ok +drop table foo