From 619688bceccf52c58cfb86b87260a3dd0d10de8f Mon Sep 17 00:00:00 2001 From: Leonardo Yvens Date: Tue, 15 Apr 2025 11:42:29 -0300 Subject: [PATCH 1/5] coerce FixedSizeBinary to Binary --- .../expr-common/src/type_coercion/binary.rs | 4 ++++ datafusion/sqllogictest/test_files/binary.slt | 17 ++++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/datafusion/expr-common/src/type_coercion/binary.rs b/datafusion/expr-common/src/type_coercion/binary.rs index fdc61cd665efd..fdee00f81b1e6 100644 --- a/datafusion/expr-common/src/type_coercion/binary.rs +++ b/datafusion/expr-common/src/type_coercion/binary.rs @@ -1297,6 +1297,10 @@ fn binary_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option Some(LargeBinary) } (Binary, Utf8) | (Utf8, Binary) => Some(Binary), + + // Cast FixedSizeBinary to Binary + (FixedSizeBinary(_), Binary) | (Binary, FixedSizeBinary(_)) => Some(Binary), + _ => None, } } diff --git a/datafusion/sqllogictest/test_files/binary.slt b/datafusion/sqllogictest/test_files/binary.slt index 5c5f9d510e554..30403d579a014 100644 --- a/datafusion/sqllogictest/test_files/binary.slt +++ b/datafusion/sqllogictest/test_files/binary.slt @@ -147,8 +147,23 @@ query error DataFusion error: Error during planning: Cannot infer common argumen SELECT column1, column1 = arrow_cast(X'0102', 'FixedSizeBinary(2)') FROM t # Comparison to different sized Binary -query error DataFusion error: Error during planning: Cannot infer common argument type for comparison operation FixedSizeBinary\(3\) = Binary +query ?B SELECT column1, column1 = X'0102' FROM t +---- +000102 false +003102 false +NULL NULL +ff0102 false +000102 false + +query ?B +SELECT column1, column1 = X'000102' FROM t +---- +000102 true +003102 false +NULL NULL +ff0102 false +000102 true statement ok drop table t_source From c72a47877be59a791908c50f8ecb219d80e6fb2e Mon Sep 17 00:00:00 2001 From: Leonardo Yvens Date: Tue, 15 Apr 2025 17:22:33 -0300 Subject: [PATCH 2/5] simplify FixedSizeBytes equality to literal --- .../simplify_expressions/expr_simplifier.rs | 109 ++++++++++++++++++ .../src/simplify_expressions/unwrap_cast.rs | 29 +++++ 2 files changed, 138 insertions(+) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 8e25bb7534365..7eedaa75e60cb 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -779,6 +779,65 @@ impl TreeNodeRewriter for Simplifier<'_, S> { }) } + // Optimize `col = x'deadbeef'` when `col` is a `FixedSizeBinary`. It might look already + // simple, but type coercion will cast the LHS to `Binary` instead of casting the RHS + // literal to `FixedSizeBinary`. So we fix that here. + // + // Simplies the equality `CAST( AS BINARY) = ` when `N == + // length(literal)`, by converting the literal to fixed length and removing the LHS cast. + // + // Assumes canonical form with the literal on the right. + Expr::BinaryExpr(BinaryExpr { + ref left, + op: Eq, + ref right, + }) if is_fixed_binary_cast_to_binary(left, info)? + && matches!(right.as_ref(), &Expr::Literal(ScalarValue::Binary(_))) => + { + // Extract the inner LHS expr from the cast and type length. + let (lhs_expr, lhs_len) = { + let lhs_expr = match left.as_ref() { + Expr::Cast(Cast { expr, .. }) => expr, + _ => { + return internal_err!("Expected cast to binary, got {left:?}") + } + }; + + match info.get_data_type(lhs_expr)? { + DataType::FixedSizeBinary(len) => (lhs_expr, len), + _ => return internal_err!("Expected FixedSizeBinary: {left:?}"), + } + }; + + let (binary_lit, lit_len) = match right.as_ref() { + Expr::Literal(l @ ScalarValue::Binary(v)) => { + (l, v.as_ref().and_then(|l| i32::try_from(l.len()).ok())) + } + _ => return internal_err!("Expected binary literal, got {expr:?}"), + }; + + // If the lengths don't match, don't simplify. + if Some(lhs_len) != lit_len { + return Ok(Transformed::no(expr)); + } + + // If the lengths match, cast the literal to remove the column cast. + let new_lit = match try_cast_literal_to_type( + binary_lit, + &DataType::FixedSizeBinary(lhs_len), + ) { + Some(lit) => lit, + None => return internal_err!("failed to cast binary literal"), + }; + + // The LHS cast is removed, and the RHS literal has been converted to fixed size. + return Ok(Transformed::yes(Expr::BinaryExpr(BinaryExpr { + left: lhs_expr.clone(), + op: Eq, + right: Box::new(Expr::Literal(new_lit)), + }))); + } + // Rules for NotEq // @@ -2107,6 +2166,18 @@ fn simplify_null_div_other_case( } } +fn is_fixed_binary_cast_to_binary( + expr: &Expr, + info: &impl SimplifyInfo, +) -> Result { + if let Expr::Cast(Cast { expr, data_type }) = expr { + if let DataType::FixedSizeBinary(_) = info.get_data_type(expr)? { + return Ok(data_type.equals_datatype(&DataType::Binary)); + } + } + Ok(false) +} + #[cfg(test)] mod tests { use crate::simplify_expressions::SimplifyContext; @@ -3310,6 +3381,15 @@ mod tests { simplifier.simplify(expr) } + fn coerce(expr: Expr) -> Expr { + let schema = expr_test_schema(); + let execution_props = ExecutionProps::new(); + let simplifier = ExprSimplifier::new( + SimplifyContext::new(&execution_props).with_schema(schema.clone()), + ); + simplifier.coerce(expr, schema.as_ref()).unwrap() + } + fn simplify(expr: Expr) -> Expr { try_simplify(expr).unwrap() } @@ -3352,6 +3432,7 @@ mod tests { Field::new("c2_non_null", DataType::Boolean, false), Field::new("c3_non_null", DataType::Int64, false), Field::new("c4_non_null", DataType::UInt32, false), + Field::new("c5", DataType::FixedSizeBinary(3), true), ] .into(), HashMap::new(), @@ -4475,6 +4556,34 @@ mod tests { } } + #[test] + fn simplify_fixed_size_binary_eq_lit() { + let bytes = [1u8, 2, 3].as_slice(); + + // The expression starts simple. + let expr = col("c5").eq(lit(bytes)); + + // The type coercer introduces a cast. + let coerced = coerce(expr.clone()); + let schema = expr_test_schema(); + assert_eq!( + coerced, + col("c5") + .cast_to(&DataType::Binary, schema.as_ref()) + .unwrap() + .eq(lit(bytes)) + ); + + // The simplifier removes the cast. + assert_eq!( + simplify(coerced), + col("c5").eq(Expr::Literal(ScalarValue::FixedSizeBinary( + 3, + Some(bytes.to_vec()), + ))) + ); + } + fn if_not_null(expr: Expr, then: bool) -> Expr { Expr::Case(Case { expr: Some(expr.is_not_null().into()), diff --git a/datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs b/datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs index be71a8cd19b00..37116018cdca5 100644 --- a/datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs +++ b/datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs @@ -197,6 +197,7 @@ fn is_supported_type(data_type: &DataType) -> bool { is_supported_numeric_type(data_type) || is_supported_string_type(data_type) || is_supported_dictionary_type(data_type) + || is_supported_binary_type(data_type) } /// Returns true if unwrap_cast_in_comparison support this numeric type @@ -230,6 +231,10 @@ fn is_supported_dictionary_type(data_type: &DataType) -> bool { DataType::Dictionary(_, inner) if is_supported_type(inner)) } +fn is_supported_binary_type(data_type: &DataType) -> bool { + matches!(data_type, DataType::Binary | DataType::FixedSizeBinary(_)) +} + ///// Tries to move a cast from an expression (such as column) to the literal other side of a comparison operator./ /// /// Specifically, rewrites @@ -292,6 +297,7 @@ pub(super) fn try_cast_literal_to_type( try_cast_numeric_literal(lit_value, target_type) .or_else(|| try_cast_string_literal(lit_value, target_type)) .or_else(|| try_cast_dictionary(lit_value, target_type)) + .or_else(|| try_cast_binary(lit_value, target_type)) } /// Convert a numeric value from one numeric data type to another @@ -501,6 +507,20 @@ fn cast_between_timestamp(from: &DataType, to: &DataType, value: i128) -> Option } } +fn try_cast_binary( + lit_value: &ScalarValue, + target_type: &DataType, +) -> Option { + match (lit_value, target_type) { + (ScalarValue::Binary(Some(v)), DataType::FixedSizeBinary(n)) + if v.len() == *n as usize => + { + Some(ScalarValue::FixedSizeBinary(*n, Some(v.clone()))) + } + _ => None, + } +} + #[cfg(test)] mod tests { use super::*; @@ -1450,4 +1470,13 @@ mod tests { ) } } + + #[test] + fn try_cast_to_fixed_size_binary() { + expect_cast( + ScalarValue::Binary(Some(vec![1, 2, 3])), + DataType::FixedSizeBinary(3), + ExpectedCast::Value(ScalarValue::FixedSizeBinary(3, Some(vec![1, 2, 3]))), + ) + } } From 103658770af7acb825981ccf4b7dbfa5ebf0ab98 Mon Sep 17 00:00:00 2001 From: Leonardo Yvens Date: Wed, 16 Apr 2025 10:00:34 -0300 Subject: [PATCH 3/5] fix clippy --- .../optimizer/src/simplify_expressions/expr_simplifier.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 7eedaa75e60cb..ee0e8e8bdc322 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -3385,7 +3385,7 @@ mod tests { let schema = expr_test_schema(); let execution_props = ExecutionProps::new(); let simplifier = ExprSimplifier::new( - SimplifyContext::new(&execution_props).with_schema(schema.clone()), + SimplifyContext::new(&execution_props).with_schema(Arc::clone(&schema)), ); simplifier.coerce(expr, schema.as_ref()).unwrap() } From f238eb16f27572c07f41025541361e73106196cc Mon Sep 17 00:00:00 2001 From: Leonardo Yvens Date: Wed, 16 Apr 2025 10:12:09 -0300 Subject: [PATCH 4/5] remove redundant ExprSimplifier case --- .../simplify_expressions/expr_simplifier.rs | 71 ------------------- 1 file changed, 71 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index ee0e8e8bdc322..867363735c55b 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -779,65 +779,6 @@ impl TreeNodeRewriter for Simplifier<'_, S> { }) } - // Optimize `col = x'deadbeef'` when `col` is a `FixedSizeBinary`. It might look already - // simple, but type coercion will cast the LHS to `Binary` instead of casting the RHS - // literal to `FixedSizeBinary`. So we fix that here. - // - // Simplies the equality `CAST( AS BINARY) = ` when `N == - // length(literal)`, by converting the literal to fixed length and removing the LHS cast. - // - // Assumes canonical form with the literal on the right. - Expr::BinaryExpr(BinaryExpr { - ref left, - op: Eq, - ref right, - }) if is_fixed_binary_cast_to_binary(left, info)? - && matches!(right.as_ref(), &Expr::Literal(ScalarValue::Binary(_))) => - { - // Extract the inner LHS expr from the cast and type length. - let (lhs_expr, lhs_len) = { - let lhs_expr = match left.as_ref() { - Expr::Cast(Cast { expr, .. }) => expr, - _ => { - return internal_err!("Expected cast to binary, got {left:?}") - } - }; - - match info.get_data_type(lhs_expr)? { - DataType::FixedSizeBinary(len) => (lhs_expr, len), - _ => return internal_err!("Expected FixedSizeBinary: {left:?}"), - } - }; - - let (binary_lit, lit_len) = match right.as_ref() { - Expr::Literal(l @ ScalarValue::Binary(v)) => { - (l, v.as_ref().and_then(|l| i32::try_from(l.len()).ok())) - } - _ => return internal_err!("Expected binary literal, got {expr:?}"), - }; - - // If the lengths don't match, don't simplify. - if Some(lhs_len) != lit_len { - return Ok(Transformed::no(expr)); - } - - // If the lengths match, cast the literal to remove the column cast. - let new_lit = match try_cast_literal_to_type( - binary_lit, - &DataType::FixedSizeBinary(lhs_len), - ) { - Some(lit) => lit, - None => return internal_err!("failed to cast binary literal"), - }; - - // The LHS cast is removed, and the RHS literal has been converted to fixed size. - return Ok(Transformed::yes(Expr::BinaryExpr(BinaryExpr { - left: lhs_expr.clone(), - op: Eq, - right: Box::new(Expr::Literal(new_lit)), - }))); - } - // Rules for NotEq // @@ -2166,18 +2107,6 @@ fn simplify_null_div_other_case( } } -fn is_fixed_binary_cast_to_binary( - expr: &Expr, - info: &impl SimplifyInfo, -) -> Result { - if let Expr::Cast(Cast { expr, data_type }) = expr { - if let DataType::FixedSizeBinary(_) = info.get_data_type(expr)? { - return Ok(data_type.equals_datatype(&DataType::Binary)); - } - } - Ok(false) -} - #[cfg(test)] mod tests { use crate::simplify_expressions::SimplifyContext; From 73007f34714ba5ebba949ec64963c2eceb83ac61 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 17 Apr 2025 10:14:13 -0400 Subject: [PATCH 5/5] Add explain test to make sure unwrapping is working correctly --- datafusion/sqllogictest/test_files/binary.slt | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/datafusion/sqllogictest/test_files/binary.slt b/datafusion/sqllogictest/test_files/binary.slt index 30403d579a014..5ac13779acd74 100644 --- a/datafusion/sqllogictest/test_files/binary.slt +++ b/datafusion/sqllogictest/test_files/binary.slt @@ -165,6 +165,19 @@ NULL NULL ff0102 false 000102 true +# Plan should not have a cast of the column (should have casted the literal +# to FixedSizeBinary as that is much faster) + +query TT +explain SELECT column1, column1 = X'000102' FROM t +---- +logical_plan +01)Projection: t.column1, t.column1 = FixedSizeBinary(3, "0,1,2") AS t.column1 = Binary("0,1,2") +02)--TableScan: t projection=[column1] +physical_plan +01)ProjectionExec: expr=[column1@0 as column1, column1@0 = 000102 as t.column1 = Binary("0,1,2")] +02)--DataSourceExec: partitions=1, partition_sizes=[1] + statement ok drop table t_source