From 2d08f61486d918896637ed97cb924964531b88ae Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sat, 20 Apr 2024 13:40:41 +0800 Subject: [PATCH 01/16] add literal Signed-off-by: jayzhan211 --- .../src/expressions/literal.rs | 2 +- .../src/expressions/mod.rs | 1 + .../physical-expr-common/src/physical_expr.rs | 363 +++++++++++++++++- .../physical-expr/src/expressions/mod.rs | 3 +- 4 files changed, 364 insertions(+), 5 deletions(-) rename datafusion/{physical-expr => physical-expr-common}/src/expressions/literal.rs (99%) diff --git a/datafusion/physical-expr/src/expressions/literal.rs b/datafusion/physical-expr-common/src/expressions/literal.rs similarity index 99% rename from datafusion/physical-expr/src/expressions/literal.rs rename to datafusion/physical-expr-common/src/expressions/literal.rs index cd3b51f09105..ec955c24e374 100644 --- a/datafusion/physical-expr/src/expressions/literal.rs +++ b/datafusion/physical-expr-common/src/expressions/literal.rs @@ -23,7 +23,7 @@ use std::sync::Arc; use crate::physical_expr::down_cast_any_ref; use crate::sort_properties::SortProperties; -use crate::PhysicalExpr; +use crate::physical_expr::PhysicalExpr; use arrow::{ datatypes::{DataType, Schema}, diff --git a/datafusion/physical-expr-common/src/expressions/mod.rs b/datafusion/physical-expr-common/src/expressions/mod.rs index d102422081dc..f9ccb87398e3 100644 --- a/datafusion/physical-expr-common/src/expressions/mod.rs +++ b/datafusion/physical-expr-common/src/expressions/mod.rs @@ -16,3 +16,4 @@ // under the License. 
pub mod column; +pub mod literal; diff --git a/datafusion/physical-expr-common/src/physical_expr.rs b/datafusion/physical-expr-common/src/physical_expr.rs index be6358e73c99..3a605abd2f88 100644 --- a/datafusion/physical-expr-common/src/physical_expr.rs +++ b/datafusion/physical-expr-common/src/physical_expr.rs @@ -25,10 +25,14 @@ use arrow::compute::filter_record_batch; use arrow::datatypes::{DataType, Schema}; use arrow::record_batch::RecordBatch; use datafusion_common::utils::DataPtr; -use datafusion_common::{internal_err, not_impl_err, Result}; +use datafusion_common::{internal_err, not_impl_err, DFSchema, Result}; +use datafusion_expr::execution_props::ExecutionProps; +use datafusion_expr::expr::Alias; use datafusion_expr::interval_arithmetic::Interval; -use datafusion_expr::ColumnarValue; +use datafusion_expr::{ColumnarValue, Expr}; +use crate::expressions::column::Column; +use crate::expressions::literal::Literal; use crate::sort_properties::SortProperties; use crate::utils::scatter; @@ -209,3 +213,358 @@ pub fn down_cast_any_ref(any: &dyn Any) -> &dyn Any { any } } + +/// [PhysicalExpr] evaluate DataFusion expressions such as `A + 1`, or `CAST(c1 +/// AS int)`. +/// +/// [PhysicalExpr] are the physical counterpart to [Expr] used in logical +/// planning, and can be evaluated directly on a [RecordBatch]. They are +/// normally created from [Expr] by a [PhysicalPlanner] and can be created +/// directly using [create_physical_expr]. +/// +/// A Physical expression knows its type, nullability and how to evaluate itself. 
+/// +/// [PhysicalPlanner]: https://docs.rs/datafusion/latest/datafusion/physical_planner/trait.PhysicalPlanner.html +/// [RecordBatch]: https://docs.rs/arrow/latest/arrow/record_batch/struct.RecordBatch.html +/// +/// # Example: Create `PhysicalExpr` from `Expr` +/// ``` +/// # use arrow::datatypes::{DataType, Field, Schema}; +/// # use datafusion_common::DFSchema; +/// # use datafusion_expr::{Expr, col, lit}; +/// # use datafusion_physical_expr::create_physical_expr; +/// # use datafusion_expr::execution_props::ExecutionProps; +/// // For a logical expression `a = 1`, we can create a physical expression +/// let expr = col("a").eq(lit(1)); +/// // To create a PhysicalExpr we need 1. a schema +/// let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]); +/// let df_schema = DFSchema::try_from(schema).unwrap(); +/// // 2. ExecutionProps +/// let props = ExecutionProps::new(); +/// // We can now create a PhysicalExpr: +/// let physical_expr = create_physical_expr(&expr, &df_schema, &props).unwrap(); +/// ``` +/// +/// # Example: Executing a PhysicalExpr to obtain [ColumnarValue] +/// ``` +/// # use std::sync::Arc; +/// # use arrow::array::{cast::AsArray, BooleanArray, Int32Array, RecordBatch}; +/// # use arrow::datatypes::{DataType, Field, Schema}; +/// # use datafusion_common::{assert_batches_eq, DFSchema}; +/// # use datafusion_expr::{Expr, col, lit, ColumnarValue}; +/// # use datafusion_physical_expr::create_physical_expr; +/// # use datafusion_expr::execution_props::ExecutionProps; +/// # let expr = col("a").eq(lit(1)); +/// # let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]); +/// # let df_schema = DFSchema::try_from(schema.clone()).unwrap(); +/// # let props = ExecutionProps::new(); +/// // Given a PhysicalExpr, for `a = 1` we can evaluate it against a RecordBatch like this: +/// let physical_expr = create_physical_expr(&expr, &df_schema, &props).unwrap(); +/// // Input of [1,2,3] +/// let input_batch = 
RecordBatch::try_from_iter(vec![ +/// ("a", Arc::new(Int32Array::from(vec![1, 2, 3])) as _) +/// ]).unwrap(); +/// // The result is a ColumnarValue (either an Array or a Scalar) +/// let result = physical_expr.evaluate(&input_batch).unwrap(); +/// // In this case, a BooleanArray with the result of the comparison +/// let ColumnarValue::Array(arr) = result else { +/// panic!("Expected an array") +/// }; +/// assert_eq!(arr.as_boolean(), &BooleanArray::from(vec![true, false, false])); +/// ``` +/// +/// [ColumnarValue]: datafusion_expr::ColumnarValue +/// +/// Create a physical expression from a logical expression ([Expr]). +/// +/// # Arguments +/// +/// * `e` - The logical expression +/// * `input_dfschema` - The DataFusion schema for the input, used to resolve `Column` references +/// to qualified or unqualified fields by name. +pub fn create_physical_expr( + e: &Expr, + input_dfschema: &DFSchema, + execution_props: &ExecutionProps, +) -> Result> { + let _input_schema: &Schema = &input_dfschema.into(); + + match e { + Expr::Alias(Alias { expr, .. }) => { + Ok(create_physical_expr(expr, input_dfschema, execution_props)?) 
+ } + Expr::Column(c) => { + let idx = input_dfschema.index_of_column(c)?; + Ok(Arc::new(Column::new(&c.name, idx))) + } + Expr::Literal(value) => Ok(Arc::new(Literal::new(value.clone()))), + // Expr::ScalarVariable(_, variable_names) => { + // if is_system_variables(variable_names) { + // match execution_props.get_var_provider(VarType::System) { + // Some(provider) => { + // let scalar_value = provider.get_value(variable_names.clone())?; + // Ok(Arc::new(Literal::new(scalar_value))) + // } + // _ => plan_err!("No system variable provider found"), + // } + // } else { + // match execution_props.get_var_provider(VarType::UserDefined) { + // Some(provider) => { + // let scalar_value = provider.get_value(variable_names.clone())?; + // Ok(Arc::new(Literal::new(scalar_value))) + // } + // _ => plan_err!("No user defined variable provider found"), + // } + // } + // } + // Expr::IsTrue(expr) => { + // let binary_op = binary_expr( + // expr.as_ref().clone(), + // Operator::IsNotDistinctFrom, + // Expr::Literal(ScalarValue::Boolean(Some(true))), + // ); + // create_physical_expr(&binary_op, input_dfschema, execution_props) + // } + // Expr::IsNotTrue(expr) => { + // let binary_op = binary_expr( + // expr.as_ref().clone(), + // Operator::IsDistinctFrom, + // Expr::Literal(ScalarValue::Boolean(Some(true))), + // ); + // create_physical_expr(&binary_op, input_dfschema, execution_props) + // } + // Expr::IsFalse(expr) => { + // let binary_op = binary_expr( + // expr.as_ref().clone(), + // Operator::IsNotDistinctFrom, + // Expr::Literal(ScalarValue::Boolean(Some(false))), + // ); + // create_physical_expr(&binary_op, input_dfschema, execution_props) + // } + // Expr::IsNotFalse(expr) => { + // let binary_op = binary_expr( + // expr.as_ref().clone(), + // Operator::IsDistinctFrom, + // Expr::Literal(ScalarValue::Boolean(Some(false))), + // ); + // create_physical_expr(&binary_op, input_dfschema, execution_props) + // } + // Expr::IsUnknown(expr) => { + // let binary_op = 
binary_expr( + // expr.as_ref().clone(), + // Operator::IsNotDistinctFrom, + // Expr::Literal(ScalarValue::Boolean(None)), + // ); + // create_physical_expr(&binary_op, input_dfschema, execution_props) + // } + // Expr::IsNotUnknown(expr) => { + // let binary_op = binary_expr( + // expr.as_ref().clone(), + // Operator::IsDistinctFrom, + // Expr::Literal(ScalarValue::Boolean(None)), + // ); + // create_physical_expr(&binary_op, input_dfschema, execution_props) + // } + // Expr::BinaryExpr(BinaryExpr { left, op, right }) => { + // // Create physical expressions for left and right operands + // let lhs = create_physical_expr(left, input_dfschema, execution_props)?; + // let rhs = create_physical_expr(right, input_dfschema, execution_props)?; + // // Note that the logical planner is responsible + // // for type coercion on the arguments (e.g. if one + // // argument was originally Int32 and one was + // // Int64 they will both be coerced to Int64). + // // + // // There should be no coercion during physical + // // planning. + // binary(lhs, *op, rhs, input_schema) + // } + // Expr::Like(Like { + // negated, + // expr, + // pattern, + // escape_char, + // case_insensitive, + // }) => { + // if escape_char.is_some() { + // return exec_err!("LIKE does not support escape_char"); + // } + // let physical_expr = + // create_physical_expr(expr, input_dfschema, execution_props)?; + // let physical_pattern = + // create_physical_expr(pattern, input_dfschema, execution_props)?; + // like( + // *negated, + // *case_insensitive, + // physical_expr, + // physical_pattern, + // input_schema, + // ) + // } + // Expr::Case(case) => { + // let expr: Option> = if let Some(e) = &case.expr { + // Some(create_physical_expr( + // e.as_ref(), + // input_dfschema, + // execution_props, + // )?) 
+ // } else { + // None + // }; + // let (when_expr, then_expr): (Vec<&Expr>, Vec<&Expr>) = case + // .when_then_expr + // .iter() + // .map(|(w, t)| (w.as_ref(), t.as_ref())) + // .unzip(); + // let when_expr = + // create_physical_exprs(when_expr, input_dfschema, execution_props)?; + // let then_expr = + // create_physical_exprs(then_expr, input_dfschema, execution_props)?; + // let when_then_expr: Vec<(Arc, Arc)> = + // when_expr + // .iter() + // .zip(then_expr.iter()) + // .map(|(w, t)| (w.clone(), t.clone())) + // .collect(); + // let else_expr: Option> = + // if let Some(e) = &case.else_expr { + // Some(create_physical_expr( + // e.as_ref(), + // input_dfschema, + // execution_props, + // )?) + // } else { + // None + // }; + // Ok(expressions::case(expr, when_then_expr, else_expr)?) + // } + // Expr::Cast(Cast { expr, data_type }) => expressions::cast( + // create_physical_expr(expr, input_dfschema, execution_props)?, + // input_schema, + // data_type.clone(), + // ), + // Expr::TryCast(TryCast { expr, data_type }) => expressions::try_cast( + // create_physical_expr(expr, input_dfschema, execution_props)?, + // input_schema, + // data_type.clone(), + // ), + // Expr::Not(expr) => { + // expressions::not(create_physical_expr(expr, input_dfschema, execution_props)?) 
+ // } + // Expr::Negative(expr) => expressions::negative( + // create_physical_expr(expr, input_dfschema, execution_props)?, + // input_schema, + // ), + // Expr::IsNull(expr) => expressions::is_null(create_physical_expr( + // expr, + // input_dfschema, + // execution_props, + // )?), + // Expr::IsNotNull(expr) => expressions::is_not_null(create_physical_expr( + // expr, + // input_dfschema, + // execution_props, + // )?), + // Expr::GetIndexedField(GetIndexedField { expr: _, field }) => match field { + // GetFieldAccess::NamedStructField { name: _ } => { + // internal_err!( + // "NamedStructField should be rewritten in OperatorToFunction" + // ) + // } + // GetFieldAccess::ListIndex { key: _ } => { + // internal_err!("ListIndex should be rewritten in OperatorToFunction") + // } + // GetFieldAccess::ListRange { + // start: _, + // stop: _, + // stride: _, + // } => { + // internal_err!("ListRange should be rewritten in OperatorToFunction") + // } + // }, + + // Expr::ScalarFunction(ScalarFunction { func_def, args }) => { + // let physical_args = + // create_physical_exprs(args, input_dfschema, execution_props)?; + + // match func_def { + // ScalarFunctionDefinition::BuiltIn(fun) => { + // functions::create_builtin_physical_expr( + // fun, + // &physical_args, + // input_schema, + // execution_props, + // ) + // } + // ScalarFunctionDefinition::UDF(fun) => udf::create_physical_expr( + // fun.clone().as_ref(), + // &physical_args, + // input_schema, + // args, + // input_dfschema, + // ), + // ScalarFunctionDefinition::Name(_) => { + // internal_err!("Function `Expr` with name should be resolved.") + // } + // } + // } + // Expr::Between(Between { + // expr, + // negated, + // low, + // high, + // }) => { + // let value_expr = create_physical_expr(expr, input_dfschema, execution_props)?; + // let low_expr = create_physical_expr(low, input_dfschema, execution_props)?; + // let high_expr = create_physical_expr(high, input_dfschema, execution_props)?; + + // // rewrite 
the between into the two binary operators + // let binary_expr = binary( + // binary(value_expr.clone(), Operator::GtEq, low_expr, input_schema)?, + // Operator::And, + // binary(value_expr.clone(), Operator::LtEq, high_expr, input_schema)?, + // input_schema, + // ); + + // if *negated { + // expressions::not(binary_expr?) + // } else { + // binary_expr + // } + // } + // Expr::InList(InList { + // expr, + // list, + // negated, + // }) => match expr.as_ref() { + // Expr::Literal(ScalarValue::Utf8(None)) => { + // Ok(expressions::lit(ScalarValue::Boolean(None))) + // } + // _ => { + // let value_expr = + // create_physical_expr(expr, input_dfschema, execution_props)?; + + // let list_exprs = + // create_physical_exprs(list, input_dfschema, execution_props)?; + // expressions::in_list(value_expr, list_exprs, negated, input_schema) + // } + // }, + other => { + not_impl_err!("Physical plan does not support logical expression {other:?}") + } + } +} + +/// Create vector of Physical Expression from a vector of logical expression +pub fn create_physical_exprs<'a, I>( + exprs: I, + input_dfschema: &DFSchema, + execution_props: &ExecutionProps, +) -> Result>> +where + I: IntoIterator, +{ + exprs + .into_iter() + .map(|expr| create_physical_expr(expr, input_dfschema, execution_props)) + .collect::>>() +} \ No newline at end of file diff --git a/datafusion/physical-expr/src/expressions/mod.rs b/datafusion/physical-expr/src/expressions/mod.rs index 688d5ce6eabf..3ac402fffc41 100644 --- a/datafusion/physical-expr/src/expressions/mod.rs +++ b/datafusion/physical-expr/src/expressions/mod.rs @@ -27,7 +27,6 @@ mod in_list; mod is_not_null; mod is_null; mod like; -mod literal; mod negative; mod no_op; mod not; @@ -89,7 +88,7 @@ pub use in_list::{in_list, InListExpr}; pub use is_not_null::{is_not_null, IsNotNullExpr}; pub use is_null::{is_null, IsNullExpr}; pub use like::{like, LikeExpr}; -pub use literal::{lit, Literal}; +pub use 
datafusion_physical_expr_common::expressions::literal::{lit, Literal}; pub use negative::{negative, NegativeExpr}; pub use no_op::NoOp; pub use not::{not, NotExpr}; From 3d0fb917249a490d8bddce065147a4ba63b47f13 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sat, 20 Apr 2024 13:47:03 +0800 Subject: [PATCH 02/16] move utils to own package Signed-off-by: jayzhan211 --- datafusion/physical-expr-common/src/expressions/literal.rs | 2 +- datafusion/physical-expr-common/src/physical_expr.rs | 2 +- datafusion/physical-expr-common/src/{utils.rs => utils/mod.rs} | 0 datafusion/physical-expr/src/expressions/mod.rs | 2 +- 4 files changed, 3 insertions(+), 3 deletions(-) rename datafusion/physical-expr-common/src/{utils.rs => utils/mod.rs} (100%) diff --git a/datafusion/physical-expr-common/src/expressions/literal.rs b/datafusion/physical-expr-common/src/expressions/literal.rs index ec955c24e374..6c94aef3599f 100644 --- a/datafusion/physical-expr-common/src/expressions/literal.rs +++ b/datafusion/physical-expr-common/src/expressions/literal.rs @@ -22,8 +22,8 @@ use std::hash::{Hash, Hasher}; use std::sync::Arc; use crate::physical_expr::down_cast_any_ref; -use crate::sort_properties::SortProperties; use crate::physical_expr::PhysicalExpr; +use crate::sort_properties::SortProperties; use arrow::{ datatypes::{DataType, Schema}, diff --git a/datafusion/physical-expr-common/src/physical_expr.rs b/datafusion/physical-expr-common/src/physical_expr.rs index 3a605abd2f88..48dec08b80bb 100644 --- a/datafusion/physical-expr-common/src/physical_expr.rs +++ b/datafusion/physical-expr-common/src/physical_expr.rs @@ -567,4 +567,4 @@ where .into_iter() .map(|expr| create_physical_expr(expr, input_dfschema, execution_props)) .collect::>>() -} \ No newline at end of file +} diff --git a/datafusion/physical-expr-common/src/utils.rs b/datafusion/physical-expr-common/src/utils/mod.rs similarity index 100% rename from datafusion/physical-expr-common/src/utils.rs rename to 
datafusion/physical-expr-common/src/utils/mod.rs diff --git a/datafusion/physical-expr/src/expressions/mod.rs b/datafusion/physical-expr/src/expressions/mod.rs index 3ac402fffc41..32a2a92b994f 100644 --- a/datafusion/physical-expr/src/expressions/mod.rs +++ b/datafusion/physical-expr/src/expressions/mod.rs @@ -84,11 +84,11 @@ pub use cast::{cast, cast_with_options, CastExpr}; pub use column::UnKnownColumn; pub use datafusion_expr::utils::format_state_name; pub use datafusion_physical_expr_common::expressions::column::{col, Column}; +pub use datafusion_physical_expr_common::expressions::literal::{lit, Literal}; pub use in_list::{in_list, InListExpr}; pub use is_not_null::{is_not_null, IsNotNullExpr}; pub use is_null::{is_null, IsNullExpr}; pub use like::{like, LikeExpr}; -pub use datafusion_physical_expr_common::expressions::literal::{lit, Literal}; pub use negative::{negative, NegativeExpr}; pub use no_op::NoOp; pub use not::{not, NotExpr}; From 436b5c5a6d39c34f6ddec7dcacb0b1c17fd9cc50 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sat, 20 Apr 2024 14:26:01 +0800 Subject: [PATCH 03/16] move parts of cp_solver Signed-off-by: jayzhan211 --- .../src/expressions/intervals/cp_solver.rs | 362 ++++++++++++++++++ .../src/expressions/intervals/mod.rs | 21 + .../src/expressions/intervals/utils.rs | 129 +++++++ .../src/expressions/mod.rs | 1 + .../physical-expr-common/src/physical_expr.rs | 1 + .../physical-expr/src/intervals/cp_solver.rs | 341 +---------------- .../physical-expr/src/intervals/utils.rs | 110 ------ 7 files changed, 521 insertions(+), 444 deletions(-) create mode 100644 datafusion/physical-expr-common/src/expressions/intervals/cp_solver.rs create mode 100644 datafusion/physical-expr-common/src/expressions/intervals/mod.rs create mode 100644 datafusion/physical-expr-common/src/expressions/intervals/utils.rs diff --git a/datafusion/physical-expr-common/src/expressions/intervals/cp_solver.rs 
b/datafusion/physical-expr-common/src/expressions/intervals/cp_solver.rs new file mode 100644 index 000000000000..b382ee77ac03 --- /dev/null +++ b/datafusion/physical-expr-common/src/expressions/intervals/cp_solver.rs @@ -0,0 +1,362 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Constraint propagator/solver for custom PhysicalExpr graphs. + +use arrow::datatypes::DataType; +use datafusion_common::{internal_err, Result}; +use datafusion_expr::interval_arithmetic::{apply_operator, satisfy_greater, Interval}; +use datafusion_expr::Operator; + +use super::utils::{ + convert_duration_type_to_interval, convert_interval_type_to_duration, get_inverse_op, +}; + +/// This function refines intervals `left_child` and `right_child` by applying +/// constraint propagation through `parent` via operation. The main idea is +/// that we can shrink ranges of variables x and y using parent interval p. +/// +/// Assuming that x,y and p has ranges [xL, xU], [yL, yU], and [pL, pU], we +/// apply the following operations: +/// - For plus operation, specifically, we would first do +/// - [xL, xU] <- ([pL, pU] - [yL, yU]) ∩ [xL, xU], and then +/// - [yL, yU] <- ([pL, pU] - [xL, xU]) ∩ [yL, yU]. 
+/// - For minus operation, specifically, we would first do +/// - [xL, xU] <- ([yL, yU] + [pL, pU]) ∩ [xL, xU], and then +/// - [yL, yU] <- ([xL, xU] - [pL, pU]) ∩ [yL, yU]. +/// - For multiplication operation, specifically, we would first do +/// - [xL, xU] <- ([pL, pU] / [yL, yU]) ∩ [xL, xU], and then +/// - [yL, yU] <- ([pL, pU] / [xL, xU]) ∩ [yL, yU]. +/// - For division operation, specifically, we would first do +/// - [xL, xU] <- ([yL, yU] * [pL, pU]) ∩ [xL, xU], and then +/// - [yL, yU] <- ([xL, xU] / [pL, pU]) ∩ [yL, yU]. +pub fn propagate_arithmetic( + op: &Operator, + parent: &Interval, + left_child: &Interval, + right_child: &Interval, +) -> Result> { + let inverse_op = get_inverse_op(*op)?; + match (left_child.data_type(), right_child.data_type()) { + // If we have a child whose type is a time interval (i.e. DataType::Interval), + // we need special handling since timestamp differencing results in a + // Duration type. + (DataType::Timestamp(..), DataType::Interval(_)) => { + propagate_time_interval_at_right( + left_child, + right_child, + parent, + op, + &inverse_op, + ) + } + (DataType::Interval(_), DataType::Timestamp(..)) => { + propagate_time_interval_at_left( + left_child, + right_child, + parent, + op, + &inverse_op, + ) + } + _ => { + // First, propagate to the left: + match apply_operator(&inverse_op, parent, right_child)? + .intersect(left_child)? + { + // Left is feasible: + Some(value) => Ok( + // Propagate to the right using the new left. + propagate_right(&value, parent, right_child, op, &inverse_op)? + .map(|right| (value, right)), + ), + // If the left child is infeasible, short-circuit. + None => Ok(None), + } + } + } +} + +/// This function refines intervals `left_child` and `right_child` by applying +/// comparison propagation through `parent` via operation. The main idea is +/// that we can shrink ranges of variables x and y using parent interval p. 
+/// Two intervals can be ordered in 6 ways for a Gt `>` operator: +/// ```text +/// (1): Infeasible, short-circuit +/// left: | ================ | +/// right: | ======================== | +/// +/// (2): Update both interval +/// left: | ====================== | +/// right: | ====================== | +/// | +/// V +/// left: | ======= | +/// right: | ======= | +/// +/// (3): Update left interval +/// left: | ============================== | +/// right: | ========== | +/// | +/// V +/// left: | ===================== | +/// right: | ========== | +/// +/// (4): Update right interval +/// left: | ========== | +/// right: | =========================== | +/// | +/// V +/// left: | ========== | +/// right | ================== | +/// +/// (5): No change +/// left: | ============================ | +/// right: | =================== | +/// +/// (6): No change +/// left: | ==================== | +/// right: | =============== | +/// +/// -inf --------------------------------------------------------------- +inf +/// ``` +pub fn propagate_comparison( + op: &Operator, + parent: &Interval, + left_child: &Interval, + right_child: &Interval, +) -> Result> { + if parent == &Interval::CERTAINLY_TRUE { + match op { + Operator::Eq => left_child.intersect(right_child).map(|result| { + result.map(|intersection| (intersection.clone(), intersection)) + }), + Operator::Gt => satisfy_greater(left_child, right_child, true), + Operator::GtEq => satisfy_greater(left_child, right_child, false), + Operator::Lt => satisfy_greater(right_child, left_child, true) + .map(|t| t.map(reverse_tuple)), + Operator::LtEq => satisfy_greater(right_child, left_child, false) + .map(|t| t.map(reverse_tuple)), + _ => internal_err!( + "The operator must be a comparison operator to propagate intervals" + ), + } + } else if parent == &Interval::CERTAINLY_FALSE { + match op { + Operator::Eq => { + // TODO: Propagation is not possible until we support interval sets. 
+ Ok(None) + } + Operator::Gt => satisfy_greater(right_child, left_child, false), + Operator::GtEq => satisfy_greater(right_child, left_child, true), + Operator::Lt => satisfy_greater(left_child, right_child, false) + .map(|t| t.map(reverse_tuple)), + Operator::LtEq => satisfy_greater(left_child, right_child, true) + .map(|t| t.map(reverse_tuple)), + _ => internal_err!( + "The operator must be a comparison operator to propagate intervals" + ), + } + } else { + // Uncertainty cannot change any end-point of the intervals. + Ok(None) + } +} + +/// During the propagation of [`Interval`] values on an [`ExprIntervalGraph`], +/// if there exists a `timestamp - timestamp` operation, the result would be +/// of type `Duration`. However, we may encounter a situation where a time interval +/// is involved in an arithmetic operation with a `Duration` type. This function +/// offers special handling for such cases, where the time interval resides on +/// the right side of the operation. +fn propagate_time_interval_at_right( + left_child: &Interval, + right_child: &Interval, + parent: &Interval, + op: &Operator, + inverse_op: &Operator, +) -> Result> { + // We check if the child's time interval(s) has a non-zero month or day field(s). + // If so, we return it as is without propagating. Otherwise, we first convert + // the time intervals to the `Duration` type, then propagate, and then convert + // the bounds to time intervals again. + let result = if let Some(duration) = convert_interval_type_to_duration(right_child) { + match apply_operator(inverse_op, parent, &duration)?.intersect(left_child)? { + Some(value) => { + propagate_right(left_child, parent, &duration, op, inverse_op)? + .and_then(|right| convert_duration_type_to_interval(&right)) + .map(|right| (value, right)) + } + None => None, + } + } else { + apply_operator(inverse_op, parent, right_child)? + .intersect(left_child)? 
+ .map(|value| (value, right_child.clone())) + }; + Ok(result) +} + +/// This is a subfunction of the `propagate_arithmetic` function that propagates to the right child. +fn propagate_right( + left: &Interval, + parent: &Interval, + right: &Interval, + op: &Operator, + inverse_op: &Operator, +) -> Result> { + match op { + Operator::Minus => apply_operator(op, left, parent), + Operator::Plus => apply_operator(inverse_op, parent, left), + Operator::Divide => apply_operator(op, left, parent), + Operator::Multiply => apply_operator(inverse_op, parent, left), + _ => internal_err!("Interval arithmetic does not support the operator {}", op), + }? + .intersect(right) +} + +/// During the propagation of [`Interval`] values on an [`ExprIntervalGraph`], +/// if there exists a `timestamp - timestamp` operation, the result would be +/// of type `Duration`. However, we may encounter a situation where a time interval +/// is involved in an arithmetic operation with a `Duration` type. This function +/// offers special handling for such cases, where the time interval resides on +/// the left side of the operation. +fn propagate_time_interval_at_left( + left_child: &Interval, + right_child: &Interval, + parent: &Interval, + op: &Operator, + inverse_op: &Operator, +) -> Result> { + // We check if the child's time interval(s) has a non-zero month or day field(s). + // If so, we return it as is without propagating. Otherwise, we first convert + // the time intervals to the `Duration` type, then propagate, and then convert + // the bounds to time intervals again. + let result = if let Some(duration) = convert_interval_type_to_duration(left_child) { + match apply_operator(inverse_op, parent, right_child)?.intersect(duration)? 
{ + Some(value) => { + let left = convert_duration_type_to_interval(&value); + let right = propagate_right(&value, parent, right_child, op, inverse_op)?; + match (left, right) { + (Some(left), Some(right)) => Some((left, right)), + _ => None, + } + } + None => None, + } + } else { + propagate_right(left_child, parent, right_child, op, inverse_op)? + .map(|right| (left_child.clone(), right)) + }; + Ok(result) +} + +fn reverse_tuple((first, second): (T, U)) -> (U, T) { + (second, first) +} + +#[cfg(test)] +mod tests { + use arrow::datatypes::TimeUnit; + use datafusion_common::ScalarValue; + + use super::*; + + #[test] + fn test_propagate_comparison() -> Result<()> { + // In the examples below: + // `left` is unbounded: [?, ?], + // `right` is known to be [1000,1000] + // so `left` < `right` results in no new knowledge of `right` but knowing that `left` is now < 1000:` [?, 999] + let left = Interval::make_unbounded(&DataType::Int64)?; + let right = Interval::make(Some(1000_i64), Some(1000_i64))?; + assert_eq!( + (Some(( + Interval::make(None, Some(999_i64))?, + Interval::make(Some(1000_i64), Some(1000_i64))?, + ))), + propagate_comparison( + &Operator::Lt, + &Interval::CERTAINLY_TRUE, + &left, + &right + )? + ); + + let left = + Interval::make_unbounded(&DataType::Timestamp(TimeUnit::Nanosecond, None))?; + let right = Interval::try_new( + ScalarValue::TimestampNanosecond(Some(1000), None), + ScalarValue::TimestampNanosecond(Some(1000), None), + )?; + assert_eq!( + (Some(( + Interval::try_new( + ScalarValue::try_from(&DataType::Timestamp( + TimeUnit::Nanosecond, + None + )) + .unwrap(), + ScalarValue::TimestampNanosecond(Some(999), None), + )?, + Interval::try_new( + ScalarValue::TimestampNanosecond(Some(1000), None), + ScalarValue::TimestampNanosecond(Some(1000), None), + )? + ))), + propagate_comparison( + &Operator::Lt, + &Interval::CERTAINLY_TRUE, + &left, + &right + )? 
+ ); + + let left = Interval::make_unbounded(&DataType::Timestamp( + TimeUnit::Nanosecond, + Some("+05:00".into()), + ))?; + let right = Interval::try_new( + ScalarValue::TimestampNanosecond(Some(1000), Some("+05:00".into())), + ScalarValue::TimestampNanosecond(Some(1000), Some("+05:00".into())), + )?; + assert_eq!( + (Some(( + Interval::try_new( + ScalarValue::try_from(&DataType::Timestamp( + TimeUnit::Nanosecond, + Some("+05:00".into()), + )) + .unwrap(), + ScalarValue::TimestampNanosecond(Some(999), Some("+05:00".into())), + )?, + Interval::try_new( + ScalarValue::TimestampNanosecond(Some(1000), Some("+05:00".into())), + ScalarValue::TimestampNanosecond(Some(1000), Some("+05:00".into())), + )? + ))), + propagate_comparison( + &Operator::Lt, + &Interval::CERTAINLY_TRUE, + &left, + &right + )? + ); + + Ok(()) + } +} diff --git a/datafusion/physical-expr-common/src/expressions/intervals/mod.rs b/datafusion/physical-expr-common/src/expressions/intervals/mod.rs new file mode 100644 index 000000000000..7022bf2c42b9 --- /dev/null +++ b/datafusion/physical-expr-common/src/expressions/intervals/mod.rs @@ -0,0 +1,21 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
Interval arithmetic and constraint propagation library + +pub mod cp_solver; +pub mod utils; diff --git a/datafusion/physical-expr-common/src/expressions/intervals/utils.rs b/datafusion/physical-expr-common/src/expressions/intervals/utils.rs new file mode 100644 index 000000000000..7e8fad259a96 --- /dev/null +++ b/datafusion/physical-expr-common/src/expressions/intervals/utils.rs @@ -0,0 +1,129 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +//http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Utility functions for the interval arithmetic library + +use datafusion_common::{internal_datafusion_err, internal_err, Result, ScalarValue}; +use datafusion_expr::{interval_arithmetic::Interval, Operator}; + +const MDN_DAY_MASK: i128 = 0xFFFF_FFFF_0000_0000_0000_0000; +const MDN_NS_MASK: i128 = 0xFFFF_FFFF_FFFF_FFFF; +const DT_MS_MASK: i64 = 0xFFFF_FFFF; + +// This function returns the inverse operator of the given operator. 
+pub fn get_inverse_op(op: Operator) -> Result { + match op { + Operator::Plus => Ok(Operator::Minus), + Operator::Minus => Ok(Operator::Plus), + Operator::Multiply => Ok(Operator::Divide), + Operator::Divide => Ok(Operator::Multiply), + _ => internal_err!("Interval arithmetic does not support the operator {}", op), + } +} + +/// Converts an [`Interval`] of time intervals to one of `Duration`s, if applicable. Otherwise, returns [`None`]. +pub fn convert_interval_type_to_duration(interval: &Interval) -> Option { + if let (Some(lower), Some(upper)) = ( + convert_interval_bound_to_duration(interval.lower()), + convert_interval_bound_to_duration(interval.upper()), + ) { + Interval::try_new(lower, upper).ok() + } else { + None + } +} + +/// Converts an [`ScalarValue`] containing a time interval to one containing a `Duration`, if applicable. Otherwise, returns [`None`]. +fn convert_interval_bound_to_duration( + interval_bound: &ScalarValue, +) -> Option { + match interval_bound { + ScalarValue::IntervalMonthDayNano(Some(mdn)) => interval_mdn_to_duration_ns(mdn) + .ok() + .map(|duration| ScalarValue::DurationNanosecond(Some(duration))), + ScalarValue::IntervalDayTime(Some(dt)) => interval_dt_to_duration_ms(dt) + .ok() + .map(|duration| ScalarValue::DurationMillisecond(Some(duration))), + _ => None, + } +} + +/// If both the month and day fields of [`ScalarValue::IntervalMonthDayNano`] are zero, this function returns the nanoseconds part. +/// Otherwise, it returns an error. 
+fn interval_mdn_to_duration_ns(mdn: &i128) -> Result { + let months = mdn >> 96; + let days = (mdn & MDN_DAY_MASK) >> 64; + let nanoseconds = mdn & MDN_NS_MASK; + + if months == 0 && days == 0 { + nanoseconds + .try_into() + .map_err(|_| internal_datafusion_err!("Resulting duration exceeds i64::MAX")) + } else { + internal_err!( + "The interval cannot have a non-zero month or day value for duration convertibility" + ) + } +} + +/// If the day field of the [`ScalarValue::IntervalDayTime`] is zero, this function returns the milliseconds part. +/// Otherwise, it returns an error. +fn interval_dt_to_duration_ms(dt: &i64) -> Result { + let days = dt >> 32; + let milliseconds = dt & DT_MS_MASK; + + if days == 0 { + Ok(milliseconds) + } else { + internal_err!( + "The interval cannot have a non-zero day value for duration convertibility" + ) + } +} + +/// Converts an [`Interval`] of `Duration`s to one of time intervals, if applicable. Otherwise, returns [`None`]. +pub fn convert_duration_type_to_interval(interval: &Interval) -> Option { + if let (Some(lower), Some(upper)) = ( + convert_duration_bound_to_interval(interval.lower()), + convert_duration_bound_to_interval(interval.upper()), + ) { + Interval::try_new(lower, upper).ok() + } else { + None + } +} + +/// Converts a [`ScalarValue`] containing a `Duration` to one containing a time interval, if applicable. Otherwise, returns [`None`]. 
+fn convert_duration_bound_to_interval( + interval_bound: &ScalarValue, +) -> Option { + match interval_bound { + ScalarValue::DurationNanosecond(Some(duration)) => { + Some(ScalarValue::new_interval_mdn(0, 0, *duration)) + } + ScalarValue::DurationMicrosecond(Some(duration)) => { + Some(ScalarValue::new_interval_mdn(0, 0, *duration * 1000)) + } + ScalarValue::DurationMillisecond(Some(duration)) => { + Some(ScalarValue::new_interval_dt(0, *duration as i32)) + } + ScalarValue::DurationSecond(Some(duration)) => { + Some(ScalarValue::new_interval_dt(0, *duration as i32 * 1000)) + } + _ => None, + } +} diff --git a/datafusion/physical-expr-common/src/expressions/mod.rs b/datafusion/physical-expr-common/src/expressions/mod.rs index f9ccb87398e3..ec9ac82a2147 100644 --- a/datafusion/physical-expr-common/src/expressions/mod.rs +++ b/datafusion/physical-expr-common/src/expressions/mod.rs @@ -16,4 +16,5 @@ // under the License. pub mod column; +pub mod intervals; pub mod literal; diff --git a/datafusion/physical-expr-common/src/physical_expr.rs b/datafusion/physical-expr-common/src/physical_expr.rs index 48dec08b80bb..d3fc27f961cb 100644 --- a/datafusion/physical-expr-common/src/physical_expr.rs +++ b/datafusion/physical-expr-common/src/physical_expr.rs @@ -282,6 +282,7 @@ pub fn down_cast_any_ref(any: &dyn Any) -> &dyn Any { /// * `e` - The logical expression /// * `input_dfschema` - The DataFusion schema for the input, used to resolve `Column` references /// to qualified or unqualified fields by name. 
+#[allow(clippy::only_used_in_recursion)] pub fn create_physical_expr( e: &Expr, input_dfschema: &DFSchema, diff --git a/datafusion/physical-expr/src/intervals/cp_solver.rs b/datafusion/physical-expr/src/intervals/cp_solver.rs index 3bd059afa6be..203ab45316f6 100644 --- a/datafusion/physical-expr/src/intervals/cp_solver.rs +++ b/datafusion/physical-expr/src/intervals/cp_solver.rs @@ -21,23 +21,23 @@ use std::collections::HashSet; use std::fmt::{Display, Formatter}; use std::sync::Arc; -use super::utils::{ - convert_duration_type_to_interval, convert_interval_type_to_duration, get_inverse_op, -}; use crate::expressions::Literal; use crate::utils::{build_dag, ExprTreeNode}; use crate::PhysicalExpr; use arrow_schema::{DataType, Schema}; -use datafusion_common::{internal_err, Result}; -use datafusion_expr::interval_arithmetic::{apply_operator, satisfy_greater, Interval}; -use datafusion_expr::Operator; +use datafusion_common::Result; +use datafusion_expr::interval_arithmetic::Interval; use petgraph::graph::NodeIndex; use petgraph::stable_graph::{DefaultIx, StableGraph}; use petgraph::visit::{Bfs, Dfs, DfsPostOrder, EdgeRef}; use petgraph::Outgoing; +pub use datafusion_physical_expr_common::expressions::intervals::cp_solver::{ + propagate_arithmetic, propagate_comparison, +}; + // Interval arithmetic provides a way to perform mathematical operations on // intervals, which represent a range of possible values rather than a single // point value. This allows for the propagation of ranges through mathematical @@ -198,157 +198,6 @@ impl PartialEq for ExprIntervalGraphNode { } } -/// This function refines intervals `left_child` and `right_child` by applying -/// constraint propagation through `parent` via operation. The main idea is -/// that we can shrink ranges of variables x and y using parent interval p. 
-/// -/// Assuming that x,y and p has ranges [xL, xU], [yL, yU], and [pL, pU], we -/// apply the following operations: -/// - For plus operation, specifically, we would first do -/// - [xL, xU] <- ([pL, pU] - [yL, yU]) ∩ [xL, xU], and then -/// - [yL, yU] <- ([pL, pU] - [xL, xU]) ∩ [yL, yU]. -/// - For minus operation, specifically, we would first do -/// - [xL, xU] <- ([yL, yU] + [pL, pU]) ∩ [xL, xU], and then -/// - [yL, yU] <- ([xL, xU] - [pL, pU]) ∩ [yL, yU]. -/// - For multiplication operation, specifically, we would first do -/// - [xL, xU] <- ([pL, pU] / [yL, yU]) ∩ [xL, xU], and then -/// - [yL, yU] <- ([pL, pU] / [xL, xU]) ∩ [yL, yU]. -/// - For division operation, specifically, we would first do -/// - [xL, xU] <- ([yL, yU] * [pL, pU]) ∩ [xL, xU], and then -/// - [yL, yU] <- ([xL, xU] / [pL, pU]) ∩ [yL, yU]. -pub fn propagate_arithmetic( - op: &Operator, - parent: &Interval, - left_child: &Interval, - right_child: &Interval, -) -> Result> { - let inverse_op = get_inverse_op(*op)?; - match (left_child.data_type(), right_child.data_type()) { - // If we have a child whose type is a time interval (i.e. DataType::Interval), - // we need special handling since timestamp differencing results in a - // Duration type. - (DataType::Timestamp(..), DataType::Interval(_)) => { - propagate_time_interval_at_right( - left_child, - right_child, - parent, - op, - &inverse_op, - ) - } - (DataType::Interval(_), DataType::Timestamp(..)) => { - propagate_time_interval_at_left( - left_child, - right_child, - parent, - op, - &inverse_op, - ) - } - _ => { - // First, propagate to the left: - match apply_operator(&inverse_op, parent, right_child)? - .intersect(left_child)? - { - // Left is feasible: - Some(value) => Ok( - // Propagate to the right using the new left. - propagate_right(&value, parent, right_child, op, &inverse_op)? - .map(|right| (value, right)), - ), - // If the left child is infeasible, short-circuit. 
- None => Ok(None), - } - } - } -} - -/// This function refines intervals `left_child` and `right_child` by applying -/// comparison propagation through `parent` via operation. The main idea is -/// that we can shrink ranges of variables x and y using parent interval p. -/// Two intervals can be ordered in 6 ways for a Gt `>` operator: -/// ```text -/// (1): Infeasible, short-circuit -/// left: | ================ | -/// right: | ======================== | -/// -/// (2): Update both interval -/// left: | ====================== | -/// right: | ====================== | -/// | -/// V -/// left: | ======= | -/// right: | ======= | -/// -/// (3): Update left interval -/// left: | ============================== | -/// right: | ========== | -/// | -/// V -/// left: | ===================== | -/// right: | ========== | -/// -/// (4): Update right interval -/// left: | ========== | -/// right: | =========================== | -/// | -/// V -/// left: | ========== | -/// right | ================== | -/// -/// (5): No change -/// left: | ============================ | -/// right: | =================== | -/// -/// (6): No change -/// left: | ==================== | -/// right: | =============== | -/// -/// -inf --------------------------------------------------------------- +inf -/// ``` -pub fn propagate_comparison( - op: &Operator, - parent: &Interval, - left_child: &Interval, - right_child: &Interval, -) -> Result> { - if parent == &Interval::CERTAINLY_TRUE { - match op { - Operator::Eq => left_child.intersect(right_child).map(|result| { - result.map(|intersection| (intersection.clone(), intersection)) - }), - Operator::Gt => satisfy_greater(left_child, right_child, true), - Operator::GtEq => satisfy_greater(left_child, right_child, false), - Operator::Lt => satisfy_greater(right_child, left_child, true) - .map(|t| t.map(reverse_tuple)), - Operator::LtEq => satisfy_greater(right_child, left_child, false) - .map(|t| t.map(reverse_tuple)), - _ => internal_err!( - "The operator 
must be a comparison operator to propagate intervals" - ), - } - } else if parent == &Interval::CERTAINLY_FALSE { - match op { - Operator::Eq => { - // TODO: Propagation is not possible until we support interval sets. - Ok(None) - } - Operator::Gt => satisfy_greater(right_child, left_child, false), - Operator::GtEq => satisfy_greater(right_child, left_child, true), - Operator::Lt => satisfy_greater(left_child, right_child, false) - .map(|t| t.map(reverse_tuple)), - Operator::LtEq => satisfy_greater(left_child, right_child, true) - .map(|t| t.map(reverse_tuple)), - _ => internal_err!( - "The operator must be a comparison operator to propagate intervals" - ), - } - } else { - // Uncertainty cannot change any end-point of the intervals. - Ok(None) - } -} - impl ExprIntervalGraph { pub fn try_new(expr: Arc, schema: &Schema) -> Result { // Build the full graph: @@ -624,107 +473,15 @@ impl ExprIntervalGraph { } } -/// This is a subfunction of the `propagate_arithmetic` function that propagates to the right child. -fn propagate_right( - left: &Interval, - parent: &Interval, - right: &Interval, - op: &Operator, - inverse_op: &Operator, -) -> Result> { - match op { - Operator::Minus => apply_operator(op, left, parent), - Operator::Plus => apply_operator(inverse_op, parent, left), - Operator::Divide => apply_operator(op, left, parent), - Operator::Multiply => apply_operator(inverse_op, parent, left), - _ => internal_err!("Interval arithmetic does not support the operator {}", op), - }? - .intersect(right) -} - -/// During the propagation of [`Interval`] values on an [`ExprIntervalGraph`], -/// if there exists a `timestamp - timestamp` operation, the result would be -/// of type `Duration`. However, we may encounter a situation where a time interval -/// is involved in an arithmetic operation with a `Duration` type. This function -/// offers special handling for such cases, where the time interval resides on -/// the left side of the operation. 
-fn propagate_time_interval_at_left( - left_child: &Interval, - right_child: &Interval, - parent: &Interval, - op: &Operator, - inverse_op: &Operator, -) -> Result> { - // We check if the child's time interval(s) has a non-zero month or day field(s). - // If so, we return it as is without propagating. Otherwise, we first convert - // the time intervals to the `Duration` type, then propagate, and then convert - // the bounds to time intervals again. - let result = if let Some(duration) = convert_interval_type_to_duration(left_child) { - match apply_operator(inverse_op, parent, right_child)?.intersect(duration)? { - Some(value) => { - let left = convert_duration_type_to_interval(&value); - let right = propagate_right(&value, parent, right_child, op, inverse_op)?; - match (left, right) { - (Some(left), Some(right)) => Some((left, right)), - _ => None, - } - } - None => None, - } - } else { - propagate_right(left_child, parent, right_child, op, inverse_op)? - .map(|right| (left_child.clone(), right)) - }; - Ok(result) -} - -/// During the propagation of [`Interval`] values on an [`ExprIntervalGraph`], -/// if there exists a `timestamp - timestamp` operation, the result would be -/// of type `Duration`. However, we may encounter a situation where a time interval -/// is involved in an arithmetic operation with a `Duration` type. This function -/// offers special handling for such cases, where the time interval resides on -/// the right side of the operation. -fn propagate_time_interval_at_right( - left_child: &Interval, - right_child: &Interval, - parent: &Interval, - op: &Operator, - inverse_op: &Operator, -) -> Result> { - // We check if the child's time interval(s) has a non-zero month or day field(s). - // If so, we return it as is without propagating. Otherwise, we first convert - // the time intervals to the `Duration` type, then propagate, and then convert - // the bounds to time intervals again. 
- let result = if let Some(duration) = convert_interval_type_to_duration(right_child) { - match apply_operator(inverse_op, parent, &duration)?.intersect(left_child)? { - Some(value) => { - propagate_right(left_child, parent, &duration, op, inverse_op)? - .and_then(|right| convert_duration_type_to_interval(&right)) - .map(|right| (value, right)) - } - None => None, - } - } else { - apply_operator(inverse_op, parent, right_child)? - .intersect(left_child)? - .map(|value| (value, right_child.clone())) - }; - Ok(result) -} - -fn reverse_tuple((first, second): (T, U)) -> (U, T) { - (second, first) -} - #[cfg(test)] mod tests { use super::*; use crate::expressions::{BinaryExpr, Column}; use crate::intervals::test_utils::gen_conjunctive_numerical_expr; - use arrow::datatypes::TimeUnit; use arrow_schema::{DataType, Field}; use datafusion_common::ScalarValue; + use datafusion_expr::Operator; use itertools::Itertools; use rand::rngs::StdRng; @@ -1477,90 +1234,6 @@ mod tests { Ok(()) } - #[test] - fn test_propagate_comparison() -> Result<()> { - // In the examples below: - // `left` is unbounded: [?, ?], - // `right` is known to be [1000,1000] - // so `left` < `right` results in no new knowledge of `right` but knowing that `left` is now < 1000:` [?, 999] - let left = Interval::make_unbounded(&DataType::Int64)?; - let right = Interval::make(Some(1000_i64), Some(1000_i64))?; - assert_eq!( - (Some(( - Interval::make(None, Some(999_i64))?, - Interval::make(Some(1000_i64), Some(1000_i64))?, - ))), - propagate_comparison( - &Operator::Lt, - &Interval::CERTAINLY_TRUE, - &left, - &right - )? 
- ); - - let left = - Interval::make_unbounded(&DataType::Timestamp(TimeUnit::Nanosecond, None))?; - let right = Interval::try_new( - ScalarValue::TimestampNanosecond(Some(1000), None), - ScalarValue::TimestampNanosecond(Some(1000), None), - )?; - assert_eq!( - (Some(( - Interval::try_new( - ScalarValue::try_from(&DataType::Timestamp( - TimeUnit::Nanosecond, - None - )) - .unwrap(), - ScalarValue::TimestampNanosecond(Some(999), None), - )?, - Interval::try_new( - ScalarValue::TimestampNanosecond(Some(1000), None), - ScalarValue::TimestampNanosecond(Some(1000), None), - )? - ))), - propagate_comparison( - &Operator::Lt, - &Interval::CERTAINLY_TRUE, - &left, - &right - )? - ); - - let left = Interval::make_unbounded(&DataType::Timestamp( - TimeUnit::Nanosecond, - Some("+05:00".into()), - ))?; - let right = Interval::try_new( - ScalarValue::TimestampNanosecond(Some(1000), Some("+05:00".into())), - ScalarValue::TimestampNanosecond(Some(1000), Some("+05:00".into())), - )?; - assert_eq!( - (Some(( - Interval::try_new( - ScalarValue::try_from(&DataType::Timestamp( - TimeUnit::Nanosecond, - Some("+05:00".into()), - )) - .unwrap(), - ScalarValue::TimestampNanosecond(Some(999), Some("+05:00".into())), - )?, - Interval::try_new( - ScalarValue::TimestampNanosecond(Some(1000), Some("+05:00".into())), - ScalarValue::TimestampNanosecond(Some(1000), Some("+05:00".into())), - )? - ))), - propagate_comparison( - &Operator::Lt, - &Interval::CERTAINLY_TRUE, - &left, - &right - )? 
- ); - - Ok(()) - } - #[test] fn test_propagate_or() -> Result<()> { let expr = Arc::new(BinaryExpr::new( diff --git a/datafusion/physical-expr/src/intervals/utils.rs b/datafusion/physical-expr/src/intervals/utils.rs index e188b2d56bae..ff7fd63126a6 100644 --- a/datafusion/physical-expr/src/intervals/utils.rs +++ b/datafusion/physical-expr/src/intervals/utils.rs @@ -25,14 +25,8 @@ use crate::{ }; use arrow_schema::{DataType, SchemaRef}; -use datafusion_common::{internal_datafusion_err, internal_err, Result, ScalarValue}; -use datafusion_expr::interval_arithmetic::Interval; use datafusion_expr::Operator; -const MDN_DAY_MASK: i128 = 0xFFFF_FFFF_0000_0000_0000_0000; -const MDN_NS_MASK: i128 = 0xFFFF_FFFF_FFFF_FFFF; -const DT_MS_MASK: i64 = 0xFFFF_FFFF; - /// Indicates whether interval arithmetic is supported for the given expression. /// Currently, we do not support all [`PhysicalExpr`]s for interval calculations. /// We do not support every type of [`Operator`]s either. Over time, this check @@ -65,17 +59,6 @@ pub fn check_support(expr: &Arc, schema: &SchemaRef) -> bool { } } -// This function returns the inverse operator of the given operator. -pub fn get_inverse_op(op: Operator) -> Result { - match op { - Operator::Plus => Ok(Operator::Minus), - Operator::Minus => Ok(Operator::Plus), - Operator::Multiply => Ok(Operator::Divide), - Operator::Divide => Ok(Operator::Multiply), - _ => internal_err!("Interval arithmetic does not support the operator {}", op), - } -} - /// Indicates whether interval arithmetic is supported for the given operator. pub fn is_operator_supported(op: &Operator) -> bool { matches!( @@ -109,96 +92,3 @@ pub fn is_datatype_supported(data_type: &DataType) -> bool { | &DataType::Float32 ) } - -/// Converts an [`Interval`] of time intervals to one of `Duration`s, if applicable. Otherwise, returns [`None`]. 
-pub fn convert_interval_type_to_duration(interval: &Interval) -> Option { - if let (Some(lower), Some(upper)) = ( - convert_interval_bound_to_duration(interval.lower()), - convert_interval_bound_to_duration(interval.upper()), - ) { - Interval::try_new(lower, upper).ok() - } else { - None - } -} - -/// Converts an [`ScalarValue`] containing a time interval to one containing a `Duration`, if applicable. Otherwise, returns [`None`]. -fn convert_interval_bound_to_duration( - interval_bound: &ScalarValue, -) -> Option { - match interval_bound { - ScalarValue::IntervalMonthDayNano(Some(mdn)) => interval_mdn_to_duration_ns(mdn) - .ok() - .map(|duration| ScalarValue::DurationNanosecond(Some(duration))), - ScalarValue::IntervalDayTime(Some(dt)) => interval_dt_to_duration_ms(dt) - .ok() - .map(|duration| ScalarValue::DurationMillisecond(Some(duration))), - _ => None, - } -} - -/// Converts an [`Interval`] of `Duration`s to one of time intervals, if applicable. Otherwise, returns [`None`]. -pub fn convert_duration_type_to_interval(interval: &Interval) -> Option { - if let (Some(lower), Some(upper)) = ( - convert_duration_bound_to_interval(interval.lower()), - convert_duration_bound_to_interval(interval.upper()), - ) { - Interval::try_new(lower, upper).ok() - } else { - None - } -} - -/// Converts a [`ScalarValue`] containing a `Duration` to one containing a time interval, if applicable. Otherwise, returns [`None`]. 
-fn convert_duration_bound_to_interval( - interval_bound: &ScalarValue, -) -> Option { - match interval_bound { - ScalarValue::DurationNanosecond(Some(duration)) => { - Some(ScalarValue::new_interval_mdn(0, 0, *duration)) - } - ScalarValue::DurationMicrosecond(Some(duration)) => { - Some(ScalarValue::new_interval_mdn(0, 0, *duration * 1000)) - } - ScalarValue::DurationMillisecond(Some(duration)) => { - Some(ScalarValue::new_interval_dt(0, *duration as i32)) - } - ScalarValue::DurationSecond(Some(duration)) => { - Some(ScalarValue::new_interval_dt(0, *duration as i32 * 1000)) - } - _ => None, - } -} - -/// If both the month and day fields of [`ScalarValue::IntervalMonthDayNano`] are zero, this function returns the nanoseconds part. -/// Otherwise, it returns an error. -fn interval_mdn_to_duration_ns(mdn: &i128) -> Result { - let months = mdn >> 96; - let days = (mdn & MDN_DAY_MASK) >> 64; - let nanoseconds = mdn & MDN_NS_MASK; - - if months == 0 && days == 0 { - nanoseconds - .try_into() - .map_err(|_| internal_datafusion_err!("Resulting duration exceeds i64::MAX")) - } else { - internal_err!( - "The interval cannot have a non-zero month or day value for duration convertibility" - ) - } -} - -/// If the day field of the [`ScalarValue::IntervalDayTime`] is zero, this function returns the milliseconds part. -/// Otherwise, it returns an error. 
-fn interval_dt_to_duration_ms(dt: &i64) -> Result { - let days = dt >> 32; - let milliseconds = dt & DT_MS_MASK; - - if days == 0 { - Ok(milliseconds) - } else { - internal_err!( - "The interval cannot have a non-zero day value for duration convertibility" - ) - } -} From 79731b9c66ee637a07e5a1e20f976a65fb7f361b Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sat, 20 Apr 2024 14:28:41 +0800 Subject: [PATCH 04/16] fix doc Signed-off-by: jayzhan211 --- .../src/expressions/intervals/cp_solver.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/physical-expr-common/src/expressions/intervals/cp_solver.rs b/datafusion/physical-expr-common/src/expressions/intervals/cp_solver.rs index b382ee77ac03..28cf7c7b3974 100644 --- a/datafusion/physical-expr-common/src/expressions/intervals/cp_solver.rs +++ b/datafusion/physical-expr-common/src/expressions/intervals/cp_solver.rs @@ -177,7 +177,7 @@ pub fn propagate_comparison( } } -/// During the propagation of [`Interval`] values on an [`ExprIntervalGraph`], +/// During the propagation of [`Interval`] values on an ExprIntervalGraph, /// if there exists a `timestamp - timestamp` operation, the result would be /// of type `Duration`. However, we may encounter a situation where a time interval /// is involved in an arithmetic operation with a `Duration` type. This function @@ -229,7 +229,7 @@ fn propagate_right( .intersect(right) } -/// During the propagation of [`Interval`] values on an [`ExprIntervalGraph`], +/// During the propagation of [`Interval`] values on an ExprIntervalGraph, /// if there exists a `timestamp - timestamp` operation, the result would be /// of type `Duration`. However, we may encounter a situation where a time interval /// is involved in an arithmetic operation with a `Duration` type. 
This function From 446bc8239cf5f31876b56f6cce8365ac90bb6930 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sat, 20 Apr 2024 14:51:45 +0800 Subject: [PATCH 05/16] move epxr::binary Signed-off-by: jayzhan211 --- datafusion/physical-expr-common/Cargo.toml | 1 + .../src/expressions/binary.rs | 6 ++++-- .../src/expressions/binary/kernels.rs | 0 .../src/expressions/datum.rs | 6 +++--- datafusion/physical-expr-common/src/expressions/mod.rs | 7 ++++++- .../src/expressions/try_cast.rs | 4 ++-- .../src/{expressions => }/intervals/cp_solver.rs | 0 .../src/{expressions => }/intervals/mod.rs | 1 + .../src/{expressions => }/intervals/utils.rs | 0 datafusion/physical-expr-common/src/lib.rs | 1 + datafusion/physical-expr-common/src/physical_expr.rs | 4 ++-- datafusion/physical-expr/src/expressions/like.rs | 2 +- datafusion/physical-expr/src/expressions/mod.rs | 8 ++------ datafusion/physical-expr/src/intervals/cp_solver.rs | 2 +- 14 files changed, 24 insertions(+), 18 deletions(-) rename datafusion/{physical-expr => physical-expr-common}/src/expressions/binary.rs (99%) rename datafusion/{physical-expr => physical-expr-common}/src/expressions/binary/kernels.rs (100%) rename datafusion/{physical-expr => physical-expr-common}/src/expressions/datum.rs (96%) rename datafusion/{physical-expr => physical-expr-common}/src/expressions/try_cast.rs (99%) rename datafusion/physical-expr-common/src/{expressions => }/intervals/cp_solver.rs (100%) rename datafusion/physical-expr-common/src/{expressions => }/intervals/mod.rs (99%) rename datafusion/physical-expr-common/src/{expressions => }/intervals/utils.rs (100%) diff --git a/datafusion/physical-expr-common/Cargo.toml b/datafusion/physical-expr-common/Cargo.toml index d1202c83d526..8fce83a79268 100644 --- a/datafusion/physical-expr-common/Cargo.toml +++ b/datafusion/physical-expr-common/Cargo.toml @@ -39,3 +39,4 @@ path = "src/lib.rs" arrow = { workspace = true } datafusion-common = { workspace = true, default-features = true } 
datafusion-expr = { workspace = true } +paste = "^1.0" diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr-common/src/expressions/binary.rs similarity index 99% rename from datafusion/physical-expr/src/expressions/binary.rs rename to datafusion/physical-expr-common/src/expressions/binary.rs index bc107e169de4..3be09a4b49a6 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr-common/src/expressions/binary.rs @@ -24,7 +24,7 @@ use crate::expressions::datum::{apply, apply_cmp}; use crate::intervals::cp_solver::{propagate_arithmetic, propagate_comparison}; use crate::physical_expr::down_cast_any_ref; use crate::sort_properties::SortProperties; -use crate::PhysicalExpr; +use crate::physical_expr::PhysicalExpr; use arrow::array::*; use arrow::compute::kernels::boolean::{and_kleene, not, or_kleene}; @@ -623,7 +623,9 @@ pub fn binary( #[cfg(test)] mod tests { use super::*; - use crate::expressions::{col, lit, try_cast, Literal}; + use crate::expressions::column::col; + use crate::expressions::literal::{lit, Literal}; + use crate::expressions::try_cast::try_cast; use arrow::datatypes::{ ArrowNumericType, Decimal128Type, Field, Int32Type, SchemaRef, }; diff --git a/datafusion/physical-expr/src/expressions/binary/kernels.rs b/datafusion/physical-expr-common/src/expressions/binary/kernels.rs similarity index 100% rename from datafusion/physical-expr/src/expressions/binary/kernels.rs rename to datafusion/physical-expr-common/src/expressions/binary/kernels.rs diff --git a/datafusion/physical-expr/src/expressions/datum.rs b/datafusion/physical-expr-common/src/expressions/datum.rs similarity index 96% rename from datafusion/physical-expr/src/expressions/datum.rs rename to datafusion/physical-expr-common/src/expressions/datum.rs index 2bb79922cfec..f9f13cdf5fdf 100644 --- a/datafusion/physical-expr/src/expressions/datum.rs +++ b/datafusion/physical-expr-common/src/expressions/datum.rs @@ -17,7 +17,7 @@ 
use arrow::array::{ArrayRef, Datum}; use arrow::error::ArrowError; -use arrow_array::BooleanArray; +use arrow::array::BooleanArray; use datafusion_common::{Result, ScalarValue}; use datafusion_expr::ColumnarValue; use std::sync::Arc; @@ -25,7 +25,7 @@ use std::sync::Arc; /// Applies a binary [`Datum`] kernel `f` to `lhs` and `rhs` /// /// This maps arrow-rs' [`Datum`] kernels to DataFusion's [`ColumnarValue`] abstraction -pub(crate) fn apply( +pub fn apply( lhs: &ColumnarValue, rhs: &ColumnarValue, f: impl Fn(&dyn Datum, &dyn Datum) -> Result, @@ -49,7 +49,7 @@ pub(crate) fn apply( } /// Applies a binary [`Datum`] comparison kernel `f` to `lhs` and `rhs` -pub(crate) fn apply_cmp( +pub fn apply_cmp( lhs: &ColumnarValue, rhs: &ColumnarValue, f: impl Fn(&dyn Datum, &dyn Datum) -> Result, diff --git a/datafusion/physical-expr-common/src/expressions/mod.rs b/datafusion/physical-expr-common/src/expressions/mod.rs index ec9ac82a2147..74e36670fb3d 100644 --- a/datafusion/physical-expr-common/src/expressions/mod.rs +++ b/datafusion/physical-expr-common/src/expressions/mod.rs @@ -15,6 +15,11 @@ // specific language governing permissions and limitations // under the License. +//! 
Defines physical expressions that can be evaluated at runtime during query execution + +#[macro_use] +pub mod binary; pub mod column; -pub mod intervals; +pub mod datum; pub mod literal; +pub mod try_cast; diff --git a/datafusion/physical-expr/src/expressions/try_cast.rs b/datafusion/physical-expr-common/src/expressions/try_cast.rs similarity index 99% rename from datafusion/physical-expr/src/expressions/try_cast.rs rename to datafusion/physical-expr-common/src/expressions/try_cast.rs index ddfe49dda7a3..8a43d95ab6cc 100644 --- a/datafusion/physical-expr/src/expressions/try_cast.rs +++ b/datafusion/physical-expr-common/src/expressions/try_cast.rs @@ -21,7 +21,7 @@ use std::hash::{Hash, Hasher}; use std::sync::Arc; use crate::physical_expr::down_cast_any_ref; -use crate::PhysicalExpr; +use crate::physical_expr::PhysicalExpr; use arrow::compute; use arrow::compute::{cast_with_options, CastOptions}; use arrow::datatypes::{DataType, Schema}; @@ -148,7 +148,7 @@ pub fn try_cast( #[cfg(test)] mod tests { use super::*; - use crate::expressions::col; + use crate::expressions::column::col; use arrow::array::{ Decimal128Array, Decimal128Builder, StringArray, Time64NanosecondArray, }; diff --git a/datafusion/physical-expr-common/src/expressions/intervals/cp_solver.rs b/datafusion/physical-expr-common/src/intervals/cp_solver.rs similarity index 100% rename from datafusion/physical-expr-common/src/expressions/intervals/cp_solver.rs rename to datafusion/physical-expr-common/src/intervals/cp_solver.rs diff --git a/datafusion/physical-expr-common/src/expressions/intervals/mod.rs b/datafusion/physical-expr-common/src/intervals/mod.rs similarity index 99% rename from datafusion/physical-expr-common/src/expressions/intervals/mod.rs rename to datafusion/physical-expr-common/src/intervals/mod.rs index 7022bf2c42b9..db011afeb5ef 100644 --- a/datafusion/physical-expr-common/src/expressions/intervals/mod.rs +++ b/datafusion/physical-expr-common/src/intervals/mod.rs @@ -19,3 +19,4 @@ pub mod
cp_solver; pub mod utils; + diff --git a/datafusion/physical-expr-common/src/expressions/intervals/utils.rs b/datafusion/physical-expr-common/src/intervals/utils.rs similarity index 100% rename from datafusion/physical-expr-common/src/expressions/intervals/utils.rs rename to datafusion/physical-expr-common/src/intervals/utils.rs diff --git a/datafusion/physical-expr-common/src/lib.rs b/datafusion/physical-expr-common/src/lib.rs index 53e3134a1b05..85ed7b41282f 100644 --- a/datafusion/physical-expr-common/src/lib.rs +++ b/datafusion/physical-expr-common/src/lib.rs @@ -18,6 +18,7 @@ pub mod aggregate; pub mod expressions; pub mod physical_expr; +pub mod intervals; pub mod sort_expr; pub mod sort_properties; pub mod tree_node; diff --git a/datafusion/physical-expr-common/src/physical_expr.rs b/datafusion/physical-expr-common/src/physical_expr.rs index d3fc27f961cb..7c1ee7ceecc4 100644 --- a/datafusion/physical-expr-common/src/physical_expr.rs +++ b/datafusion/physical-expr-common/src/physical_expr.rs @@ -232,7 +232,7 @@ pub fn down_cast_any_ref(any: &dyn Any) -> &dyn Any { /// # use arrow::datatypes::{DataType, Field, Schema}; /// # use datafusion_common::DFSchema; /// # use datafusion_expr::{Expr, col, lit}; -/// # use datafusion_physical_expr::create_physical_expr; +/// # use datafusion_physical_expr_common::create_physical_expr; /// # use datafusion_expr::execution_props::ExecutionProps; /// // For a logical expression `a = 1`, we can create a physical expression /// let expr = col("a").eq(lit(1)); @@ -252,7 +252,7 @@ pub fn down_cast_any_ref(any: &dyn Any) -> &dyn Any { /// # use arrow::datatypes::{DataType, Field, Schema}; /// # use datafusion_common::{assert_batches_eq, DFSchema}; /// # use datafusion_expr::{Expr, col, lit, ColumnarValue}; -/// # use datafusion_physical_expr::create_physical_expr; +/// # use datafusion_physical_expr_common::create_physical_expr; /// # use datafusion_expr::execution_props::ExecutionProps; /// # let expr = col("a").eq(lit(1)); /// 
# let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]); diff --git a/datafusion/physical-expr/src/expressions/like.rs b/datafusion/physical-expr/src/expressions/like.rs index 6e0beeb0beea..9d7267fa78e7 100644 --- a/datafusion/physical-expr/src/expressions/like.rs +++ b/datafusion/physical-expr/src/expressions/like.rs @@ -20,7 +20,7 @@ use std::{any::Any, sync::Arc}; use crate::{physical_expr::down_cast_any_ref, PhysicalExpr}; -use crate::expressions::datum::apply_cmp; +use datafusion_physical_expr_common::expressions::datum::apply_cmp; use arrow::record_batch::RecordBatch; use arrow_schema::{DataType, Schema}; use datafusion_common::{internal_err, Result}; diff --git a/datafusion/physical-expr/src/expressions/mod.rs b/datafusion/physical-expr/src/expressions/mod.rs index 32a2a92b994f..4b5adfbb86ba 100644 --- a/datafusion/physical-expr/src/expressions/mod.rs +++ b/datafusion/physical-expr/src/expressions/mod.rs @@ -17,12 +17,9 @@ //! Defines physical expressions that can evaluated at runtime during query execution -#[macro_use] -mod binary; mod case; mod cast; mod column; -mod datum; mod in_list; mod is_not_null; mod is_null; @@ -30,7 +27,6 @@ mod like; mod negative; mod no_op; mod not; -mod try_cast; /// Module with some convenient methods used in expression building pub mod helpers { @@ -78,7 +74,7 @@ pub use datafusion_functions_aggregate::first_last::{ FirstValuePhysicalExpr as FirstValue, LastValuePhysicalExpr as LastValue, }; -pub use binary::{binary, BinaryExpr}; +pub use datafusion_physical_expr_common::expressions::binary::{binary, BinaryExpr}; pub use case::{case, CaseExpr}; pub use cast::{cast, cast_with_options, CastExpr}; pub use column::UnKnownColumn; @@ -92,7 +88,7 @@ pub use like::{like, LikeExpr}; pub use negative::{negative, NegativeExpr}; pub use no_op::NoOp; pub use not::{not, NotExpr}; -pub use try_cast::{try_cast, TryCastExpr}; +pub use datafusion_physical_expr_common::expressions::try_cast::{try_cast, TryCastExpr}; 
#[cfg(test)] pub(crate) mod tests { diff --git a/datafusion/physical-expr/src/intervals/cp_solver.rs b/datafusion/physical-expr/src/intervals/cp_solver.rs index 203ab45316f6..49981036d3ff 100644 --- a/datafusion/physical-expr/src/intervals/cp_solver.rs +++ b/datafusion/physical-expr/src/intervals/cp_solver.rs @@ -34,7 +34,7 @@ use petgraph::stable_graph::{DefaultIx, StableGraph}; use petgraph::visit::{Bfs, Dfs, DfsPostOrder, EdgeRef}; use petgraph::Outgoing; -pub use datafusion_physical_expr_common::expressions::intervals::cp_solver::{ +pub use datafusion_physical_expr_common::intervals::cp_solver::{ propagate_arithmetic, propagate_comparison, }; From c5dc3e2635b4e8292cd9ae1e81d59b08b84b2c97 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sat, 20 Apr 2024 15:07:12 +0800 Subject: [PATCH 06/16] binary done Signed-off-by: jayzhan211 --- datafusion-cli/Cargo.lock | 1 + .../src/expressions/binary.rs | 2 +- .../src/expressions/datum.rs | 2 +- .../physical-expr-common/src/intervals/mod.rs | 1 - datafusion/physical-expr-common/src/lib.rs | 2 +- .../physical-expr-common/src/physical_expr.rs | 31 ++++++++++--------- .../physical-expr/src/expressions/like.rs | 2 +- .../physical-expr/src/expressions/mod.rs | 4 +-- datafusion/physical-expr/src/planner.rs | 8 +++++ 9 files changed, 31 insertions(+), 22 deletions(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 5ce5beab4d70..747a9c4309bb 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1363,6 +1363,7 @@ dependencies = [ "arrow", "datafusion-common", "datafusion-expr", + "paste", ] [[package]] diff --git a/datafusion/physical-expr-common/src/expressions/binary.rs b/datafusion/physical-expr-common/src/expressions/binary.rs index 3be09a4b49a6..93ac640266ea 100644 --- a/datafusion/physical-expr-common/src/expressions/binary.rs +++ b/datafusion/physical-expr-common/src/expressions/binary.rs @@ -23,8 +23,8 @@ use std::{any::Any, sync::Arc}; use crate::expressions::datum::{apply, 
apply_cmp}; use crate::intervals::cp_solver::{propagate_arithmetic, propagate_comparison}; use crate::physical_expr::down_cast_any_ref; -use crate::sort_properties::SortProperties; use crate::physical_expr::PhysicalExpr; +use crate::sort_properties::SortProperties; use arrow::array::*; use arrow::compute::kernels::boolean::{and_kleene, not, or_kleene}; diff --git a/datafusion/physical-expr-common/src/expressions/datum.rs b/datafusion/physical-expr-common/src/expressions/datum.rs index f9f13cdf5fdf..201bf104f20e 100644 --- a/datafusion/physical-expr-common/src/expressions/datum.rs +++ b/datafusion/physical-expr-common/src/expressions/datum.rs @@ -15,9 +15,9 @@ // specific language governing permissions and limitations // under the License. +use arrow::array::BooleanArray; use arrow::array::{ArrayRef, Datum}; use arrow::error::ArrowError; -use arrow::array::BooleanArray; use datafusion_common::{Result, ScalarValue}; use datafusion_expr::ColumnarValue; use std::sync::Arc; diff --git a/datafusion/physical-expr-common/src/intervals/mod.rs b/datafusion/physical-expr-common/src/intervals/mod.rs index db011afeb5ef..7022bf2c42b9 100644 --- a/datafusion/physical-expr-common/src/intervals/mod.rs +++ b/datafusion/physical-expr-common/src/intervals/mod.rs @@ -19,4 +19,3 @@ pub mod cp_solver; pub mod utils; - diff --git a/datafusion/physical-expr-common/src/lib.rs b/datafusion/physical-expr-common/src/lib.rs index 85ed7b41282f..de3b61c035ad 100644 --- a/datafusion/physical-expr-common/src/lib.rs +++ b/datafusion/physical-expr-common/src/lib.rs @@ -17,8 +17,8 @@ pub mod aggregate; pub mod expressions; -pub mod physical_expr; pub mod intervals; +pub mod physical_expr; pub mod sort_expr; pub mod sort_properties; pub mod tree_node; diff --git a/datafusion/physical-expr-common/src/physical_expr.rs b/datafusion/physical-expr-common/src/physical_expr.rs index 7c1ee7ceecc4..f01d6896eab3 100644 --- a/datafusion/physical-expr-common/src/physical_expr.rs +++ 
b/datafusion/physical-expr-common/src/physical_expr.rs @@ -29,8 +29,9 @@ use datafusion_common::{internal_err, not_impl_err, DFSchema, Result}; use datafusion_expr::execution_props::ExecutionProps; use datafusion_expr::expr::Alias; use datafusion_expr::interval_arithmetic::Interval; -use datafusion_expr::{ColumnarValue, Expr}; +use datafusion_expr::{BinaryExpr, ColumnarValue, Expr}; +use crate::expressions::binary::binary; use crate::expressions::column::Column; use crate::expressions::literal::Literal; use crate::sort_properties::SortProperties; @@ -288,7 +289,7 @@ pub fn create_physical_expr( input_dfschema: &DFSchema, execution_props: &ExecutionProps, ) -> Result> { - let _input_schema: &Schema = &input_dfschema.into(); + let input_schema: &Schema = &input_dfschema.into(); match e { Expr::Alias(Alias { expr, .. }) => { @@ -366,19 +367,19 @@ pub fn create_physical_expr( // ); // create_physical_expr(&binary_op, input_dfschema, execution_props) // } - // Expr::BinaryExpr(BinaryExpr { left, op, right }) => { - // // Create physical expressions for left and right operands - // let lhs = create_physical_expr(left, input_dfschema, execution_props)?; - // let rhs = create_physical_expr(right, input_dfschema, execution_props)?; - // // Note that the logical planner is responsible - // // for type coercion on the arguments (e.g. if one - // // argument was originally Int32 and one was - // // Int64 they will both be coerced to Int64). - // // - // // There should be no coercion during physical - // // planning. - // binary(lhs, *op, rhs, input_schema) - // } + Expr::BinaryExpr(BinaryExpr { left, op, right }) => { + // Create physical expressions for left and right operands + let lhs = create_physical_expr(left, input_dfschema, execution_props)?; + let rhs = create_physical_expr(right, input_dfschema, execution_props)?; + // Note that the logical planner is responsible + // for type coercion on the arguments (e.g. 
if one + // argument was originally Int32 and one was + // Int64 they will both be coerced to Int64). + // + // There should be no coercion during physical + // planning. + binary(lhs, *op, rhs, input_schema) + } // Expr::Like(Like { // negated, // expr, diff --git a/datafusion/physical-expr/src/expressions/like.rs b/datafusion/physical-expr/src/expressions/like.rs index 9d7267fa78e7..c219dea83617 100644 --- a/datafusion/physical-expr/src/expressions/like.rs +++ b/datafusion/physical-expr/src/expressions/like.rs @@ -20,11 +20,11 @@ use std::{any::Any, sync::Arc}; use crate::{physical_expr::down_cast_any_ref, PhysicalExpr}; -use datafusion_physical_expr_common::expressions::datum::apply_cmp; use arrow::record_batch::RecordBatch; use arrow_schema::{DataType, Schema}; use datafusion_common::{internal_err, Result}; use datafusion_expr::ColumnarValue; +use datafusion_physical_expr_common::expressions::datum::apply_cmp; // Like expression #[derive(Debug, Hash)] diff --git a/datafusion/physical-expr/src/expressions/mod.rs b/datafusion/physical-expr/src/expressions/mod.rs index 4b5adfbb86ba..1d26c9bd806a 100644 --- a/datafusion/physical-expr/src/expressions/mod.rs +++ b/datafusion/physical-expr/src/expressions/mod.rs @@ -74,13 +74,14 @@ pub use datafusion_functions_aggregate::first_last::{ FirstValuePhysicalExpr as FirstValue, LastValuePhysicalExpr as LastValue, }; -pub use datafusion_physical_expr_common::expressions::binary::{binary, BinaryExpr}; pub use case::{case, CaseExpr}; pub use cast::{cast, cast_with_options, CastExpr}; pub use column::UnKnownColumn; pub use datafusion_expr::utils::format_state_name; +pub use datafusion_physical_expr_common::expressions::binary::{binary, BinaryExpr}; pub use datafusion_physical_expr_common::expressions::column::{col, Column}; pub use datafusion_physical_expr_common::expressions::literal::{lit, Literal}; +pub use datafusion_physical_expr_common::expressions::try_cast::{try_cast, TryCastExpr}; pub use in_list::{in_list, 
InListExpr}; pub use is_not_null::{is_not_null, IsNotNullExpr}; pub use is_null::{is_null, IsNullExpr}; @@ -88,7 +89,6 @@ pub use like::{like, LikeExpr}; pub use negative::{negative, NegativeExpr}; pub use no_op::NoOp; pub use not::{not, NotExpr}; -pub use datafusion_physical_expr_common::expressions::try_cast::{try_cast, TryCastExpr}; #[cfg(test)] pub(crate) mod tests { diff --git a/datafusion/physical-expr/src/planner.rs b/datafusion/physical-expr/src/planner.rs index 20626818c83b..fea73ed5da29 100644 --- a/datafusion/physical-expr/src/planner.rs +++ b/datafusion/physical-expr/src/planner.rs @@ -106,6 +106,14 @@ pub fn create_physical_expr( input_dfschema: &DFSchema, execution_props: &ExecutionProps, ) -> Result> { + use datafusion_physical_expr_common::physical_expr::create_physical_expr as create_physical_expr_common; + + // Temporary solution, after all the logic is moved to common, we can remove this function + let res = create_physical_expr_common(e, input_dfschema, execution_props); + if res.is_ok() { + return res; + } + let input_schema: &Schema = &input_dfschema.into(); match e { From 5a6dacf590f2172f88d20821ce1c8f5d1e6d09eb Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sat, 20 Apr 2024 15:32:57 +0800 Subject: [PATCH 07/16] fix doc Signed-off-by: jayzhan211 --- datafusion/physical-expr-common/src/physical_expr.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/physical-expr-common/src/physical_expr.rs b/datafusion/physical-expr-common/src/physical_expr.rs index f01d6896eab3..184f78c29a0b 100644 --- a/datafusion/physical-expr-common/src/physical_expr.rs +++ b/datafusion/physical-expr-common/src/physical_expr.rs @@ -233,7 +233,7 @@ pub fn down_cast_any_ref(any: &dyn Any) -> &dyn Any { /// # use arrow::datatypes::{DataType, Field, Schema}; /// # use datafusion_common::DFSchema; /// # use datafusion_expr::{Expr, col, lit}; -/// # use datafusion_physical_expr_common::create_physical_expr; +/// # use 
datafusion_physical_expr_common::physical_expr::create_physical_expr; /// # use datafusion_expr::execution_props::ExecutionProps; /// // For a logical expression `a = 1`, we can create a physical expression /// let expr = col("a").eq(lit(1)); @@ -253,7 +253,7 @@ pub fn down_cast_any_ref(any: &dyn Any) -> &dyn Any { /// # use arrow::datatypes::{DataType, Field, Schema}; /// # use datafusion_common::{assert_batches_eq, DFSchema}; /// # use datafusion_expr::{Expr, col, lit, ColumnarValue}; -/// # use datafusion_physical_expr_common::create_physical_expr; +/// # use datafusion_physical_expr_common::physical_expr::create_physical_expr; /// # use datafusion_expr::execution_props::ExecutionProps; /// # let expr = col("a").eq(lit(1)); /// # let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]); From d79775e41151006a781b82c43ce074aa429f314e Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Mon, 22 Apr 2024 09:24:49 +0800 Subject: [PATCH 08/16] add link Signed-off-by: jayzhan211 --- datafusion/physical-expr/src/planner.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/physical-expr/src/planner.rs b/datafusion/physical-expr/src/planner.rs index fea73ed5da29..c658d568ec52 100644 --- a/datafusion/physical-expr/src/planner.rs +++ b/datafusion/physical-expr/src/planner.rs @@ -108,7 +108,7 @@ pub fn create_physical_expr( ) -> Result> { use datafusion_physical_expr_common::physical_expr::create_physical_expr as create_physical_expr_common; - // Temporary solution, after all the logic is moved to common, we can remove this function + // PR #10074: Temporary solution, after all the logic is moved to common, we can remove this function let res = create_physical_expr_common(e, input_dfschema, execution_props); if res.is_ok() { return res; From 55d91d8c39117b4f40d8e1b5352227b2be17a589 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Mon, 22 Apr 2024 20:56:46 +0800 Subject: [PATCH 09/16] cast Signed-off-by: jayzhan211 --- 
.../src/expressions/cast.rs | 4 +- .../src/expressions/in_list.rs | 1445 +++++++++++++++++ .../src/expressions/mod.rs | 1 + .../physical-expr/src/expressions/mod.rs | 3 +- 4 files changed, 1449 insertions(+), 4 deletions(-) rename datafusion/{physical-expr => physical-expr-common}/src/expressions/cast.rs (99%) create mode 100644 datafusion/physical-expr-common/src/expressions/in_list.rs diff --git a/datafusion/physical-expr/src/expressions/cast.rs b/datafusion/physical-expr-common/src/expressions/cast.rs similarity index 99% rename from datafusion/physical-expr/src/expressions/cast.rs rename to datafusion/physical-expr-common/src/expressions/cast.rs index 0d94642f14e7..70d380419e62 100644 --- a/datafusion/physical-expr/src/expressions/cast.rs +++ b/datafusion/physical-expr-common/src/expressions/cast.rs @@ -17,7 +17,7 @@ use crate::physical_expr::down_cast_any_ref; use crate::sort_properties::SortProperties; -use crate::PhysicalExpr; +use crate::physical_expr::PhysicalExpr; use std::any::Any; use std::fmt; use std::hash::{Hash, Hasher}; @@ -217,7 +217,7 @@ pub fn cast( #[cfg(test)] mod tests { use super::*; - use crate::expressions::col; + use crate::expressions::column::col; use arrow::{ array::{ diff --git a/datafusion/physical-expr-common/src/expressions/in_list.rs b/datafusion/physical-expr-common/src/expressions/in_list.rs new file mode 100644 index 000000000000..296301501714 --- /dev/null +++ b/datafusion/physical-expr-common/src/expressions/in_list.rs @@ -0,0 +1,1445 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Implementation of `InList` expressions: [`InListExpr`] + +use std::any::Any; +use std::fmt::Debug; +use std::hash::{Hash, Hasher}; +use std::sync::Arc; + +use crate::physical_expr::{down_cast_any_ref, physical_exprs_bag_equal}; +use crate::physical_expr::PhysicalExpr; + +// use arrow::array::*; +use arrow::buffer::BooleanBuffer; +use arrow::compute::kernels::boolean::{not, or_kleene}; +use arrow::compute::kernels::cmp::eq; +use arrow::compute::take; +use arrow::datatypes::{i256, DataType, Schema, TimeUnit}; +use arrow::record_batch::RecordBatch; +use arrow::util::bit_iterator::BitIndexIterator; +use arrow::array::{as_largestring_array, downcast_array, downcast_dictionary_array, Array, ArrayAccessor, ArrayData, ArrayIter, ArrayRef, BooleanArray}; +use arrow::array::downcast_primitive_array; +use datafusion_common::cast::{ + as_boolean_array, as_generic_binary_array, as_string_array, +}; +use datafusion_common::hash_utils::HashValue; +use datafusion_common::{exec_err, internal_err, not_impl_err, Result, ScalarValue}; +use datafusion_expr::ColumnarValue; + +use ahash::RandomState; +use hashbrown::hash_map::RawEntryMut; +use hashbrown::HashMap; + +/// InList +pub struct InListExpr { + expr: Arc, + list: Vec>, + negated: bool, + static_filter: Option>, +} + +impl Debug for InListExpr { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + f.debug_struct("InListExpr") + .field("expr", &self.expr) + .field("list", &self.list) + .field("negated", &self.negated) + .finish() + } +} + +/// A type-erased container of array elements +pub 
trait Set: Send + Sync { + fn contains(&self, v: &dyn Array, negated: bool) -> Result; + fn has_nulls(&self) -> bool; +} + +struct ArrayHashSet { + state: RandomState, + /// Used to provide a lookup from value to in list index + /// + /// Note: usize::hash is not used, instead the raw entry + /// API is used to store entries w.r.t their value + map: HashMap, +} + +struct ArraySet { + array: T, + hash_set: ArrayHashSet, +} + +impl ArraySet +where + T: Array + From, +{ + fn new(array: &T, hash_set: ArrayHashSet) -> Self { + Self { + array: downcast_array(array), + hash_set, + } + } +} + +impl Set for ArraySet +where + T: Array + 'static, + for<'a> &'a T: ArrayAccessor, + for<'a> <&'a T as ArrayAccessor>::Item: IsEqual, +{ + fn contains(&self, v: &dyn Array, negated: bool) -> Result { + downcast_dictionary_array! { + v => { + let values_contains = self.contains(v.values().as_ref(), negated)?; + let result = take(&values_contains, v.keys(), None)?; + return Ok(downcast_array(result.as_ref())) + } + _ => {} + } + + let v = v.as_any().downcast_ref::().unwrap(); + let in_array = &self.array; + let has_nulls = in_array.null_count() != 0; + + Ok(ArrayIter::new(v) + .map(|v| { + v.and_then(|v| { + let hash = v.hash_one(&self.hash_set.state); + let contains = self + .hash_set + .map + .raw_entry() + .from_hash(hash, |idx| in_array.value(*idx).is_equal(&v)) + .is_some(); + + match contains { + true => Some(!negated), + false if has_nulls => None, + false => Some(negated), + } + }) + }) + .collect()) + } + + fn has_nulls(&self) -> bool { + self.array.null_count() != 0 + } +} + +/// Computes an [`ArrayHashSet`] for the provided [`Array`] if there +/// are nulls present or there are more than the configured number of +/// elements. 
+/// +/// Note: This is split into a separate function as higher-rank trait bounds currently +/// cause type inference to misbehave +fn make_hash_set(array: T) -> ArrayHashSet +where + T: ArrayAccessor, + T::Item: IsEqual, +{ + let state = RandomState::new(); + let mut map: HashMap = + HashMap::with_capacity_and_hasher(array.len(), ()); + + let insert_value = |idx| { + let value = array.value(idx); + let hash = value.hash_one(&state); + if let RawEntryMut::Vacant(v) = map + .raw_entry_mut() + .from_hash(hash, |x| array.value(*x).is_equal(&value)) + { + v.insert_with_hasher(hash, idx, (), |x| array.value(*x).hash_one(&state)); + } + }; + + match array.nulls() { + Some(nulls) => { + BitIndexIterator::new(nulls.validity(), nulls.offset(), nulls.len()) + .for_each(insert_value) + } + None => (0..array.len()).for_each(insert_value), + } + + ArrayHashSet { state, map } +} + +/// Creates a type-erased [`Set`] from the already-evaluated array of `IN` list values +fn make_set(array: &dyn Array) -> Result> { + Ok(downcast_primitive_array!
{ + array => Arc::new(ArraySet::new(array, make_hash_set(array))), + DataType::Boolean => { + let array = as_boolean_array(array)?; + Arc::new(ArraySet::new(array, make_hash_set(array))) + }, + DataType::Utf8 => { + let array = as_string_array(array)?; + Arc::new(ArraySet::new(array, make_hash_set(array))) + } + DataType::LargeUtf8 => { + let array = as_largestring_array(array); + Arc::new(ArraySet::new(array, make_hash_set(array))) + } + DataType::Binary => { + let array = as_generic_binary_array::(array)?; + Arc::new(ArraySet::new(array, make_hash_set(array))) + } + DataType::LargeBinary => { + let array = as_generic_binary_array::(array)?; + Arc::new(ArraySet::new(array, make_hash_set(array))) + } + DataType::Dictionary(_, _) => unreachable!("dictionary should have been flattened"), + d => return not_impl_err!("DataType::{d} not supported in InList") + }) +} + +/// Evaluates the list of expressions into an array, flattening any dictionaries +fn evaluate_list( + list: &[Arc], + batch: &RecordBatch, +) -> Result { + let scalars = list + .iter() + .map(|expr| { + expr.evaluate(batch).and_then(|r| match r { + ColumnarValue::Array(_) => { + exec_err!("InList expression must evaluate to a scalar") + } + // Flatten dictionary values + ColumnarValue::Scalar(ScalarValue::Dictionary(_, v)) => Ok(*v), + ColumnarValue::Scalar(s) => Ok(s), + }) + }) + .collect::>>()?; + + ScalarValue::iter_to_array(scalars) +} + +fn try_cast_static_filter_to_set( + list: &[Arc], + schema: &Schema, +) -> Result> { + let batch = RecordBatch::new_empty(Arc::new(schema.clone())); + make_set(evaluate_list(list, &batch)?.as_ref()) +} + +/// Custom equality check function which is used with [`ArrayHashSet`] for existence check. +trait IsEqual: HashValue { + fn is_equal(&self, other: &Self) -> bool; +} + +impl<'a, T: IsEqual + ?Sized> IsEqual for &'a T { + fn is_equal(&self, other: &Self) -> bool { + T::is_equal(self, other) + } +} + +macro_rules! 
is_equal { + ($($t:ty),+) => { + $(impl IsEqual for $t { + fn is_equal(&self, other: &Self) -> bool { + self == other + } + })* + }; +} +is_equal!(i8, i16, i32, i64, i128, i256, u8, u16, u32, u64); +is_equal!(bool, str, [u8]); + +macro_rules! is_equal_float { + ($($t:ty),+) => { + $(impl IsEqual for $t { + fn is_equal(&self, other: &Self) -> bool { + self.to_bits() == other.to_bits() + } + })* + }; +} +is_equal_float!(half::f16, f32, f64); + +impl InListExpr { + /// Create a new InList expression + pub fn new( + expr: Arc, + list: Vec>, + negated: bool, + static_filter: Option>, + ) -> Self { + Self { + expr, + list, + negated, + static_filter, + } + } + + /// Input expression + pub fn expr(&self) -> &Arc { + &self.expr + } + + /// List to search in + pub fn list(&self) -> &[Arc] { + &self.list + } + + /// Is this negated e.g. NOT IN LIST + pub fn negated(&self) -> bool { + self.negated + } +} + +impl std::fmt::Display for InListExpr { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + if self.negated { + if self.static_filter.is_some() { + write!(f, "{} NOT IN (SET) ({:?})", self.expr, self.list) + } else { + write!(f, "{} NOT IN ({:?})", self.expr, self.list) + } + } else if self.static_filter.is_some() { + write!(f, "Use {} IN (SET) ({:?})", self.expr, self.list) + } else { + write!(f, "{} IN ({:?})", self.expr, self.list) + } + } +} + +impl PhysicalExpr for InListExpr { + /// Return a reference to Any that can be used for downcasting + fn as_any(&self) -> &dyn Any { + self + } + + fn data_type(&self, _input_schema: &Schema) -> Result { + Ok(DataType::Boolean) + } + + fn nullable(&self, input_schema: &Schema) -> Result { + if self.expr.nullable(input_schema)? { + return Ok(true); + } + + if let Some(static_filter) = &self.static_filter { + Ok(static_filter.has_nulls()) + } else { + for expr in &self.list { + if expr.nullable(input_schema)? 
{ + return Ok(true); + } + } + Ok(false) + } + } + + fn evaluate(&self, batch: &RecordBatch) -> Result { + let num_rows = batch.num_rows(); + let value = self.expr.evaluate(batch)?; + let r = match &self.static_filter { + Some(f) => f.contains(value.into_array(num_rows)?.as_ref(), self.negated)?, + None => { + let value = value.into_array(num_rows)?; + let found = self.list.iter().map(|expr| expr.evaluate(batch)).try_fold( + BooleanArray::new(BooleanBuffer::new_unset(num_rows), None), + |result, expr| -> Result { + Ok(or_kleene( + &result, + &eq(&value, &expr?.into_array(num_rows)?)?, + )?) + }, + )?; + + if self.negated { + not(&found)? + } else { + found + } + } + }; + Ok(ColumnarValue::Array(Arc::new(r))) + } + + fn children(&self) -> Vec> { + let mut children = vec![]; + children.push(self.expr.clone()); + children.extend(self.list.clone()); + children + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> Result> { + // assume the static_filter will not change during the rewrite process + Ok(Arc::new(InListExpr::new( + children[0].clone(), + children[1..].to_vec(), + self.negated, + self.static_filter.clone(), + ))) + } + + fn dyn_hash(&self, state: &mut dyn Hasher) { + let mut s = state; + self.expr.hash(&mut s); + self.negated.hash(&mut s); + self.list.hash(&mut s); + // Add `self.static_filter` when hash is available + } +} + +impl PartialEq for InListExpr { + fn eq(&self, other: &dyn Any) -> bool { + down_cast_any_ref(other) + .downcast_ref::() + .map(|x| { + self.expr.eq(&x.expr) + && physical_exprs_bag_equal(&self.list, &x.list) + && self.negated == x.negated + }) + .unwrap_or(false) + } +} + +/// Checks if two types are logically equal, dictionary types are compared by their value types. 
+fn is_logically_eq(lhs: &DataType, rhs: &DataType) -> bool { + match (lhs, rhs) { + (DataType::Dictionary(_, v1), DataType::Dictionary(_, v2)) => { + v1.as_ref().eq(v2.as_ref()) + } + (DataType::Dictionary(_, l), _) => l.as_ref().eq(rhs), + (_, DataType::Dictionary(_, r)) => lhs.eq(r.as_ref()), + _ => lhs.eq(rhs), + } +} + +/// Creates a unary expression InList +pub fn in_list( + expr: Arc, + list: Vec>, + negated: &bool, + schema: &Schema, +) -> Result> { + // check the data type + let expr_data_type = expr.data_type(schema)?; + for list_expr in list.iter() { + let list_expr_data_type = list_expr.data_type(schema)?; + if !is_logically_eq(&expr_data_type, &list_expr_data_type) { + return internal_err!( + "The data type inlist should be same, the value type is {expr_data_type}, one of list expr type is {list_expr_data_type}" + ); + } + } + let static_filter = try_cast_static_filter_to_set(&list, schema).ok(); + Ok(Arc::new(InListExpr::new( + expr, + list, + *negated, + static_filter, + ))) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::expressions; + use crate::expressions::column::col; + use crate::expressions::literal::lit; + use crate::expressions::try_cast::try_cast; + use arrow::array::BinaryArray; + use arrow::array::Date32Array; + use arrow::array::Date64Array; + use arrow::array::Decimal128Array; + use arrow::array::Float64Array; + use arrow::array::Int64Array; + use arrow::array::StringArray; + use datafusion_common::plan_err; + use datafusion_common::Result; + use datafusion_expr::type_coercion::binary::comparison_coercion; + + use arrow::datatypes::Field; + + type InListCastResult = (Arc, Vec>); + + // Try to do the type coercion for list physical expr. 
+ // It's just used in the test + fn in_list_cast( + expr: Arc, + list: Vec>, + input_schema: &Schema, + ) -> Result { + let expr_type = &expr.data_type(input_schema)?; + let list_types: Vec = list + .iter() + .map(|list_expr| list_expr.data_type(input_schema).unwrap()) + .collect(); + let result_type = get_coerce_type(expr_type, &list_types); + match result_type { + None => plan_err!( + "Can not find compatible types to compare {expr_type:?} with {list_types:?}" + ), + Some(data_type) => { + // find the coerced type + let cast_expr = try_cast(expr, input_schema, data_type.clone())?; + let cast_list_expr = list + .into_iter() + .map(|list_expr| { + try_cast(list_expr, input_schema, data_type.clone()).unwrap() + }) + .collect(); + Ok((cast_expr, cast_list_expr)) + } + } + } + + // Attempts to coerce the types of `list_type` to be comparable with the + // `expr_type` + fn get_coerce_type(expr_type: &DataType, list_type: &[DataType]) -> Option { + list_type + .iter() + .try_fold(expr_type.clone(), |left_type, right_type| { + comparison_coercion(&left_type, right_type) + }) + } + + // applies the in_list expr to an input batch and list + macro_rules! in_list { + ($BATCH:expr, $LIST:expr, $NEGATED:expr, $EXPECTED:expr, $COL:expr, $SCHEMA:expr) => {{ + let (cast_expr, cast_list_exprs) = in_list_cast($COL, $LIST, $SCHEMA)?; + in_list_raw!( + $BATCH, + cast_list_exprs, + $NEGATED, + $EXPECTED, + cast_expr, + $SCHEMA + ); + }}; + } + + // applies the in_list expr to an input batch and list without cast + macro_rules! in_list_raw { + ($BATCH:expr, $LIST:expr, $NEGATED:expr, $EXPECTED:expr, $COL:expr, $SCHEMA:expr) => {{ + let expr = in_list($COL, $LIST, $NEGATED, $SCHEMA).unwrap(); + let result = expr + .evaluate(&$BATCH)? 
+ .into_array($BATCH.num_rows()) + .expect("Failed to convert to array"); + let result = + as_boolean_array(&result).expect("failed to downcast to BooleanArray"); + let expected = &BooleanArray::from($EXPECTED); + assert_eq!(expected, result); + }}; + } + + #[test] + fn in_list_utf8() -> Result<()> { + let schema = Schema::new(vec![Field::new("a", DataType::Utf8, true)]); + let a = StringArray::from(vec![Some("a"), Some("d"), None]); + let col_a = col("a", &schema)?; + let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; + + // expression: "a in ("a", "b")" + let list = vec![lit("a"), lit("b")]; + in_list!( + batch, + list, + &false, + vec![Some(true), Some(false), None], + col_a.clone(), + &schema + ); + + // expression: "a not in ("a", "b")" + let list = vec![lit("a"), lit("b")]; + in_list!( + batch, + list, + &true, + vec![Some(false), Some(true), None], + col_a.clone(), + &schema + ); + + // expression: "a in ("a", "b", null)" + let list = vec![lit("a"), lit("b"), lit(ScalarValue::Utf8(None))]; + in_list!( + batch, + list, + &false, + vec![Some(true), None, None], + col_a.clone(), + &schema + ); + + // expression: "a not in ("a", "b", null)" + let list = vec![lit("a"), lit("b"), lit(ScalarValue::Utf8(None))]; + in_list!( + batch, + list, + &true, + vec![Some(false), None, None], + col_a.clone(), + &schema + ); + + Ok(()) + } + + #[test] + fn in_list_binary() -> Result<()> { + let schema = Schema::new(vec![Field::new("a", DataType::Binary, true)]); + let a = BinaryArray::from(vec![ + Some([1, 2, 3].as_slice()), + Some([1, 2, 2].as_slice()), + None, + ]); + let col_a = col("a", &schema)?; + let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; + + // expression: "a in ([1, 2, 3], [4, 5, 6])" + let list = vec![lit([1, 2, 3].as_slice()), lit([4, 5, 6].as_slice())]; + in_list!( + batch, + list.clone(), + &false, + vec![Some(true), Some(false), None], + col_a.clone(), + &schema + ); + + // expression: "a not in 
([1, 2, 3], [4, 5, 6])" + in_list!( + batch, + list, + &true, + vec![Some(false), Some(true), None], + col_a.clone(), + &schema + ); + + // expression: "a in ([1, 2, 3], [4, 5, 6], null)" + let list = vec![ + lit([1, 2, 3].as_slice()), + lit([4, 5, 6].as_slice()), + lit(ScalarValue::Binary(None)), + ]; + in_list!( + batch, + list.clone(), + &false, + vec![Some(true), None, None], + col_a.clone(), + &schema + ); + + // expression: "a in ([1, 2, 3], [4, 5, 6], null)" + in_list!( + batch, + list, + &true, + vec![Some(false), None, None], + col_a.clone(), + &schema + ); + + Ok(()) + } + + #[test] + fn in_list_int64() -> Result<()> { + let schema = Schema::new(vec![Field::new("a", DataType::Int64, true)]); + let a = Int64Array::from(vec![Some(0), Some(2), None]); + let col_a = col("a", &schema)?; + let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; + + // expression: "a in (0, 1)" + let list = vec![lit(0i64), lit(1i64)]; + in_list!( + batch, + list, + &false, + vec![Some(true), Some(false), None], + col_a.clone(), + &schema + ); + + // expression: "a not in (0, 1)" + let list = vec![lit(0i64), lit(1i64)]; + in_list!( + batch, + list, + &true, + vec![Some(false), Some(true), None], + col_a.clone(), + &schema + ); + + // expression: "a in (0, 1, NULL)" + let list = vec![lit(0i64), lit(1i64), lit(ScalarValue::Null)]; + in_list!( + batch, + list, + &false, + vec![Some(true), None, None], + col_a.clone(), + &schema + ); + + // expression: "a not in (0, 1, NULL)" + let list = vec![lit(0i64), lit(1i64), lit(ScalarValue::Null)]; + in_list!( + batch, + list, + &true, + vec![Some(false), None, None], + col_a.clone(), + &schema + ); + + Ok(()) + } + + #[test] + fn in_list_float64() -> Result<()> { + let schema = Schema::new(vec![Field::new("a", DataType::Float64, true)]); + let a = Float64Array::from(vec![ + Some(0.0), + Some(0.2), + None, + Some(f64::NAN), + Some(-f64::NAN), + ]); + let col_a = col("a", &schema)?; + let batch = 
RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; + + // expression: "a in (0.0, 0.1)" + let list = vec![lit(0.0f64), lit(0.1f64)]; + in_list!( + batch, + list, + &false, + vec![Some(true), Some(false), None, Some(false), Some(false)], + col_a.clone(), + &schema + ); + + // expression: "a not in (0.0, 0.1)" + let list = vec![lit(0.0f64), lit(0.1f64)]; + in_list!( + batch, + list, + &true, + vec![Some(false), Some(true), None, Some(true), Some(true)], + col_a.clone(), + &schema + ); + + // expression: "a in (0.0, 0.1, NULL)" + let list = vec![lit(0.0f64), lit(0.1f64), lit(ScalarValue::Null)]; + in_list!( + batch, + list, + &false, + vec![Some(true), None, None, None, None], + col_a.clone(), + &schema + ); + + // expression: "a not in (0.0, 0.1, NULL)" + let list = vec![lit(0.0f64), lit(0.1f64), lit(ScalarValue::Null)]; + in_list!( + batch, + list, + &true, + vec![Some(false), None, None, None, None], + col_a.clone(), + &schema + ); + + // expression: "a in (0.0, 0.1, NaN)" + let list = vec![lit(0.0f64), lit(0.1f64), lit(f64::NAN)]; + in_list!( + batch, + list, + &false, + vec![Some(true), Some(false), None, Some(true), Some(false)], + col_a.clone(), + &schema + ); + + // expression: "a not in (0.0, 0.1, NaN)" + let list = vec![lit(0.0f64), lit(0.1f64), lit(f64::NAN)]; + in_list!( + batch, + list, + &true, + vec![Some(false), Some(true), None, Some(false), Some(true)], + col_a.clone(), + &schema + ); + + // expression: "a in (0.0, 0.1, -NaN)" + let list = vec![lit(0.0f64), lit(0.1f64), lit(-f64::NAN)]; + in_list!( + batch, + list, + &false, + vec![Some(true), Some(false), None, Some(false), Some(true)], + col_a.clone(), + &schema + ); + + // expression: "a not in (0.0, 0.1, -NaN)" + let list = vec![lit(0.0f64), lit(0.1f64), lit(-f64::NAN)]; + in_list!( + batch, + list, + &true, + vec![Some(false), Some(true), None, Some(true), Some(false)], + col_a.clone(), + &schema + ); + + Ok(()) + } + + #[test] + fn in_list_bool() -> Result<()> { + let schema = 
Schema::new(vec![Field::new("a", DataType::Boolean, true)]); + let a = BooleanArray::from(vec![Some(true), None]); + let col_a = col("a", &schema)?; + let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; + + // expression: "a in (true)" + let list = vec![lit(true)]; + in_list!( + batch, + list, + &false, + vec![Some(true), None], + col_a.clone(), + &schema + ); + + // expression: "a not in (true)" + let list = vec![lit(true)]; + in_list!( + batch, + list, + &true, + vec![Some(false), None], + col_a.clone(), + &schema + ); + + // expression: "a in (true, NULL)" + let list = vec![lit(true), lit(ScalarValue::Null)]; + in_list!( + batch, + list, + &false, + vec![Some(true), None], + col_a.clone(), + &schema + ); + + // expression: "a not in (true, NULL)" + let list = vec![lit(true), lit(ScalarValue::Null)]; + in_list!( + batch, + list, + &true, + vec![Some(false), None], + col_a.clone(), + &schema + ); + + Ok(()) + } + + #[test] + fn in_list_date64() -> Result<()> { + let schema = Schema::new(vec![Field::new("a", DataType::Date64, true)]); + let a = Date64Array::from(vec![Some(0), Some(2), None]); + let col_a = col("a", &schema)?; + let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; + + // expression: "a in (0, 1)" + let list = vec![ + lit(ScalarValue::Date64(Some(0))), + lit(ScalarValue::Date64(Some(1))), + ]; + in_list!( + batch, + list, + &false, + vec![Some(true), Some(false), None], + col_a.clone(), + &schema + ); + + // expression: "a not in (0, 1)" + let list = vec![ + lit(ScalarValue::Date64(Some(0))), + lit(ScalarValue::Date64(Some(1))), + ]; + in_list!( + batch, + list, + &true, + vec![Some(false), Some(true), None], + col_a.clone(), + &schema + ); + + // expression: "a in (0, 1, NULL)" + let list = vec![ + lit(ScalarValue::Date64(Some(0))), + lit(ScalarValue::Date64(Some(1))), + lit(ScalarValue::Null), + ]; + in_list!( + batch, + list, + &false, + vec![Some(true), None, None], + col_a.clone(), + &schema 
+ ); + + // expression: "a not in (0, 1, NULL)" + let list = vec![ + lit(ScalarValue::Date64(Some(0))), + lit(ScalarValue::Date64(Some(1))), + lit(ScalarValue::Null), + ]; + in_list!( + batch, + list, + &true, + vec![Some(false), None, None], + col_a.clone(), + &schema + ); + + Ok(()) + } + + #[test] + fn in_list_date32() -> Result<()> { + let schema = Schema::new(vec![Field::new("a", DataType::Date32, true)]); + let a = Date32Array::from(vec![Some(0), Some(2), None]); + let col_a = col("a", &schema)?; + let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; + + // expression: "a in (0, 1)" + let list = vec![ + lit(ScalarValue::Date32(Some(0))), + lit(ScalarValue::Date32(Some(1))), + ]; + in_list!( + batch, + list, + &false, + vec![Some(true), Some(false), None], + col_a.clone(), + &schema + ); + + // expression: "a not in (0, 1)" + let list = vec![ + lit(ScalarValue::Date32(Some(0))), + lit(ScalarValue::Date32(Some(1))), + ]; + in_list!( + batch, + list, + &true, + vec![Some(false), Some(true), None], + col_a.clone(), + &schema + ); + + // expression: "a in (0, 1, NULL)" + let list = vec![ + lit(ScalarValue::Date32(Some(0))), + lit(ScalarValue::Date32(Some(1))), + lit(ScalarValue::Null), + ]; + in_list!( + batch, + list, + &false, + vec![Some(true), None, None], + col_a.clone(), + &schema + ); + + // expression: "a not in (0, 1, NULL)" + let list = vec![ + lit(ScalarValue::Date32(Some(0))), + lit(ScalarValue::Date32(Some(1))), + lit(ScalarValue::Null), + ]; + in_list!( + batch, + list, + &true, + vec![Some(false), None, None], + col_a.clone(), + &schema + ); + + Ok(()) + } + + #[test] + fn in_list_decimal() -> Result<()> { + // Now, we can check the NULL type + let schema = + Schema::new(vec![Field::new("a", DataType::Decimal128(13, 4), true)]); + let array = vec![Some(100_0000_i128), None, Some(200_5000_i128)] + .into_iter() + .collect::(); + let array = array.with_precision_and_scale(13, 4).unwrap(); + let col_a = col("a", &schema)?; + 
let batch = + RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(array)])?; + + // expression: "a in (100,200), the data type of list is INT32 + let list = vec![lit(100i32), lit(200i32)]; + in_list!( + batch, + list, + &false, + vec![Some(true), None, Some(false)], + col_a.clone(), + &schema + ); + // expression: "a not in (100,200) + let list = vec![lit(100i32), lit(200i32)]; + in_list!( + batch, + list, + &true, + vec![Some(false), None, Some(true)], + col_a.clone(), + &schema + ); + + // expression: "a in (200,NULL), the data type of list is INT32 AND NULL + let list = vec![lit(ScalarValue::Int32(Some(100))), lit(ScalarValue::Null)]; + in_list!( + batch, + list.clone(), + &false, + vec![Some(true), None, None], + col_a.clone(), + &schema + ); + // expression: "a not in (200,NULL), the data type of list is INT32 AND NULL + in_list!( + batch, + list, + &true, + vec![Some(false), None, None], + col_a.clone(), + &schema + ); + + // expression: "a in (200.5, 100), the data type of list is FLOAT32 and INT32 + let list = vec![lit(200.50f32), lit(100i32)]; + in_list!( + batch, + list, + &false, + vec![Some(true), None, Some(true)], + col_a.clone(), + &schema + ); + + // expression: "a not in (200.5, 100), the data type of list is FLOAT32 and INT32 + let list = vec![lit(200.50f32), lit(101i32)]; + in_list!( + batch, + list, + &true, + vec![Some(true), None, Some(false)], + col_a.clone(), + &schema + ); + + // test the optimization: set + // expression: "a in (99..300), the data type of list is INT32 + let list = (99i32..300).map(lit).collect::>(); + + in_list!( + batch, + list.clone(), + &false, + vec![Some(true), None, Some(false)], + col_a.clone(), + &schema + ); + + in_list!( + batch, + list, + &true, + vec![Some(false), None, Some(true)], + col_a.clone(), + &schema + ); + + Ok(()) + } + + #[test] + fn test_cast_static_filter_to_set() -> Result<()> { + // random schema + let schema = + Schema::new(vec![Field::new("a", DataType::Decimal128(13, 4), true)]); + 
+ // list of phy expr + let mut phy_exprs = vec![ + lit(1i64), + expressions::cast(lit(2i32), &schema, DataType::Int64)?, + expressions::try_cast(lit(3.13f32), &schema, DataType::Int64)?, + ]; + let result = try_cast_static_filter_to_set(&phy_exprs, &schema).unwrap(); + + let array = Int64Array::from(vec![1, 2, 3, 4]); + let r = result.contains(&array, false).unwrap(); + assert_eq!(r, BooleanArray::from(vec![true, true, true, false])); + + try_cast_static_filter_to_set(&phy_exprs, &schema).unwrap(); + // cast(cast(lit())), but the cast to the same data type, one case will be ignored + phy_exprs.push(expressions::cast( + expressions::cast(lit(2i32), &schema, DataType::Int64)?, + &schema, + DataType::Int64, + )?); + try_cast_static_filter_to_set(&phy_exprs, &schema).unwrap(); + + phy_exprs.clear(); + + // case(cast(lit())), the cast to the diff data type + phy_exprs.push(expressions::cast( + expressions::cast(lit(2i32), &schema, DataType::Int64)?, + &schema, + DataType::Int32, + )?); + try_cast_static_filter_to_set(&phy_exprs, &schema).unwrap(); + + // column + phy_exprs.push(expressions::col("a", &schema)?); + assert!(try_cast_static_filter_to_set(&phy_exprs, &schema).is_err()); + + Ok(()) + } + + #[test] + fn in_list_timestamp() -> Result<()> { + let schema = Schema::new(vec![Field::new( + "a", + DataType::Timestamp(TimeUnit::Microsecond, None), + true, + )]); + let a = TimestampMicrosecondArray::from(vec![ + Some(1388588401000000000), + Some(1288588501000000000), + None, + ]); + let col_a = col("a", &schema)?; + let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; + + let list = vec![ + lit(ScalarValue::TimestampMicrosecond( + Some(1388588401000000000), + None, + )), + lit(ScalarValue::TimestampMicrosecond( + Some(1388588401000000001), + None, + )), + lit(ScalarValue::TimestampMicrosecond( + Some(1388588401000000002), + None, + )), + ]; + + in_list!( + batch, + list.clone(), + &false, + vec![Some(true), Some(false), None], + 
col_a.clone(), + &schema + ); + + in_list!( + batch, + list.clone(), + &true, + vec![Some(false), Some(true), None], + col_a.clone(), + &schema + ); + Ok(()) + } + + #[test] + fn in_expr_with_multiple_element_in_list() -> Result<()> { + let schema = Schema::new(vec![ + Field::new("a", DataType::Float64, true), + Field::new("b", DataType::Float64, true), + Field::new("c", DataType::Float64, true), + ]); + let a = Float64Array::from(vec![ + Some(0.0), + Some(1.0), + Some(2.0), + Some(f64::NAN), + Some(-f64::NAN), + ]); + let b = Float64Array::from(vec![ + Some(8.0), + Some(1.0), + Some(5.0), + Some(f64::NAN), + Some(3.0), + ]); + let c = Float64Array::from(vec![ + Some(6.0), + Some(7.0), + None, + Some(5.0), + Some(-f64::NAN), + ]); + let col_a = col("a", &schema)?; + let col_b = col("b", &schema)?; + let col_c = col("c", &schema)?; + let batch = RecordBatch::try_new( + Arc::new(schema.clone()), + vec![Arc::new(a), Arc::new(b), Arc::new(c)], + )?; + + let list = vec![col_b.clone(), col_c.clone()]; + in_list!( + batch, + list.clone(), + &false, + vec![Some(false), Some(true), None, Some(true), Some(true)], + col_a.clone(), + &schema + ); + + in_list!( + batch, + list, + &true, + vec![Some(true), Some(false), None, Some(false), Some(false)], + col_a.clone(), + &schema + ); + + Ok(()) + } + + macro_rules! 
test_nullable { + ($COL:expr, $LIST:expr, $SCHEMA:expr, $EXPECTED:expr) => {{ + let (cast_expr, cast_list_exprs) = in_list_cast($COL, $LIST, $SCHEMA)?; + let expr = in_list(cast_expr, cast_list_exprs, &false, $SCHEMA).unwrap(); + let result = expr.nullable($SCHEMA)?; + assert_eq!($EXPECTED, result); + }}; + } + + #[test] + fn in_list_nullable() -> Result<()> { + let schema = Schema::new(vec![ + Field::new("c1_nullable", DataType::Int64, true), + Field::new("c2_non_nullable", DataType::Int64, false), + ]); + + let c1_nullable = col("c1_nullable", &schema)?; + let c2_non_nullable = col("c2_non_nullable", &schema)?; + + // static_filter has no nulls + let list = vec![lit(1_i64), lit(2_i64)]; + test_nullable!(c1_nullable.clone(), list.clone(), &schema, true); + test_nullable!(c2_non_nullable.clone(), list.clone(), &schema, false); + + // static_filter has nulls + let list = vec![lit(1_i64), lit(2_i64), lit(ScalarValue::Null)]; + test_nullable!(c1_nullable.clone(), list.clone(), &schema, true); + test_nullable!(c2_non_nullable.clone(), list.clone(), &schema, true); + + let list = vec![c1_nullable.clone()]; + test_nullable!(c2_non_nullable.clone(), list.clone(), &schema, true); + + let list = vec![c2_non_nullable.clone()]; + test_nullable!(c1_nullable.clone(), list.clone(), &schema, true); + + let list = vec![c2_non_nullable.clone(), c2_non_nullable.clone()]; + test_nullable!(c2_non_nullable.clone(), list.clone(), &schema, false); + + Ok(()) + } + + #[test] + fn in_list_no_cols() -> Result<()> { + // test logic when the in_list expression doesn't have any columns + let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]); + let a = Int32Array::from(vec![Some(1), Some(2), None]); + let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; + + let list = vec![lit(ScalarValue::from(1i32)), lit(ScalarValue::from(6i32))]; + + // 1 IN (1, 6) + let expr = lit(ScalarValue::Int32(Some(1))); + in_list!( + batch, + list.clone(), + &false, + // 
should have three outputs, as the input batch has three rows + vec![Some(true), Some(true), Some(true)], + expr, + &schema + ); + + // 2 IN (1, 6) + let expr = lit(ScalarValue::Int32(Some(2))); + in_list!( + batch, + list.clone(), + &false, + // should have three outputs, as the input batch has three rows + vec![Some(false), Some(false), Some(false)], + expr, + &schema + ); + + // NULL IN (1, 6) + let expr = lit(ScalarValue::Int32(None)); + in_list!( + batch, + list.clone(), + &false, + // should have three outputs, as the input batch has three rows + vec![None, None, None], + expr, + &schema + ); + + Ok(()) + } + + #[test] + fn in_list_utf8_with_dict_types() -> Result<()> { + fn dict_lit(key_type: DataType, value: &str) -> Arc { + lit(ScalarValue::Dictionary( + Box::new(key_type), + Box::new(ScalarValue::new_utf8(value.to_string())), + )) + } + + fn null_dict_lit(key_type: DataType) -> Arc { + lit(ScalarValue::Dictionary( + Box::new(key_type), + Box::new(ScalarValue::Utf8(None)), + )) + } + + let schema = Schema::new(vec![Field::new( + "a", + DataType::Dictionary(Box::new(DataType::UInt16), Box::new(DataType::Utf8)), + true, + )]); + let a: UInt16DictionaryArray = + vec![Some("a"), Some("d"), None].into_iter().collect(); + let col_a = col("a", &schema)?; + let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; + + // expression: "a in ("a", "b")" + let lists = [ + vec![lit("a"), lit("b")], + vec![ + dict_lit(DataType::Int8, "a"), + dict_lit(DataType::UInt16, "b"), + ], + ]; + for list in lists.iter() { + in_list_raw!( + batch, + list.clone(), + &false, + vec![Some(true), Some(false), None], + col_a.clone(), + &schema + ); + } + + // expression: "a not in ("a", "b")" + for list in lists.iter() { + in_list_raw!( + batch, + list.clone(), + &true, + vec![Some(false), Some(true), None], + col_a.clone(), + &schema + ); + } + + // expression: "a in ("a", "b", null)" + let lists = [ + vec![lit("a"), lit("b"), lit(ScalarValue::Utf8(None))], + vec![ 
+ dict_lit(DataType::Int8, "a"), + dict_lit(DataType::UInt16, "b"), + null_dict_lit(DataType::UInt16), + ], + ]; + for list in lists.iter() { + in_list_raw!( + batch, + list.clone(), + &false, + vec![Some(true), None, None], + col_a.clone(), + &schema + ); + } + + // expression: "a not in ("a", "b", null)" + for list in lists.iter() { + in_list_raw!( + batch, + list.clone(), + &true, + vec![Some(false), None, None], + col_a.clone(), + &schema + ); + } + + Ok(()) + } +} diff --git a/datafusion/physical-expr-common/src/expressions/mod.rs b/datafusion/physical-expr-common/src/expressions/mod.rs index 74e36670fb3d..397c83148719 100644 --- a/datafusion/physical-expr-common/src/expressions/mod.rs +++ b/datafusion/physical-expr-common/src/expressions/mod.rs @@ -19,6 +19,7 @@ #[macro_use] pub mod binary; +pub mod cast; pub mod column; pub mod datum; pub mod literal; diff --git a/datafusion/physical-expr/src/expressions/mod.rs b/datafusion/physical-expr/src/expressions/mod.rs index 1d26c9bd806a..a84cc898ef0e 100644 --- a/datafusion/physical-expr/src/expressions/mod.rs +++ b/datafusion/physical-expr/src/expressions/mod.rs @@ -18,7 +18,6 @@ //! 
Defines physical expressions that can evaluated at runtime during query execution mod case; -mod cast; mod column; mod in_list; mod is_not_null; @@ -75,7 +74,7 @@ pub use datafusion_functions_aggregate::first_last::{ }; pub use case::{case, CaseExpr}; -pub use cast::{cast, cast_with_options, CastExpr}; +pub use datafusion_physical_expr_common::expressions::cast::{cast, cast_with_options, CastExpr}; pub use column::UnKnownColumn; pub use datafusion_expr::utils::format_state_name; pub use datafusion_physical_expr_common::expressions::binary::{binary, BinaryExpr}; From 409dde04aab74db8862c9f596682c6df6186487a Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Mon, 22 Apr 2024 21:01:30 +0800 Subject: [PATCH 10/16] inlist Signed-off-by: jayzhan211 --- datafusion/physical-expr-common/Cargo.toml | 8 +- .../src/expressions/in_list.rs | 24 +- .../src/expressions/mod.rs | 1 + .../physical-expr-common/src/physical_expr.rs | 62 +- .../physical-expr/src/expressions/in_list.rs | 1435 ----------------- .../physical-expr/src/expressions/mod.rs | 3 +- datafusion/physical-expr/src/physical_expr.rs | 23 +- 7 files changed, 67 insertions(+), 1489 deletions(-) delete mode 100644 datafusion/physical-expr/src/expressions/in_list.rs diff --git a/datafusion/physical-expr-common/Cargo.toml b/datafusion/physical-expr-common/Cargo.toml index 8fce83a79268..b34eca32c21c 100644 --- a/datafusion/physical-expr-common/Cargo.toml +++ b/datafusion/physical-expr-common/Cargo.toml @@ -36,7 +36,13 @@ name = "datafusion_physical_expr_common" path = "src/lib.rs" [dependencies] +ahash = { version = "0.8", default-features = false, features = [ + "runtime-rng", +] } arrow = { workspace = true } +arrow-schema = { workspace = true } datafusion-common = { workspace = true, default-features = true } datafusion-expr = { workspace = true } -paste = "^1.0" +half = { workspace = true } +hashbrown = { version = "0.14", features = ["raw"] } +paste = "^1.0" \ No newline at end of file diff --git 
a/datafusion/physical-expr-common/src/expressions/in_list.rs b/datafusion/physical-expr-common/src/expressions/in_list.rs index 296301501714..752ca4ef80d6 100644 --- a/datafusion/physical-expr-common/src/expressions/in_list.rs +++ b/datafusion/physical-expr-common/src/expressions/in_list.rs @@ -25,12 +25,11 @@ use std::sync::Arc; use crate::physical_expr::{down_cast_any_ref, physical_exprs_bag_equal}; use crate::physical_expr::PhysicalExpr; -// use arrow::array::*; use arrow::buffer::BooleanBuffer; use arrow::compute::kernels::boolean::{not, or_kleene}; use arrow::compute::kernels::cmp::eq; use arrow::compute::take; -use arrow::datatypes::{i256, DataType, Schema, TimeUnit}; +use arrow::datatypes::{i256, DataType, Schema}; use arrow::record_batch::RecordBatch; use arrow::util::bit_iterator::BitIndexIterator; use arrow::array::{as_largestring_array, downcast_array, downcast_dictionary_array, Array, ArrayAccessor, ArrayData, ArrayIter, ArrayRef, BooleanArray}; @@ -457,7 +456,8 @@ pub fn in_list( #[cfg(test)] mod tests { use super::*; - use crate::expressions; + + use crate::expressions::cast::cast; use crate::expressions::column::col; use crate::expressions::literal::lit; use crate::expressions::try_cast::try_cast; @@ -466,13 +466,17 @@ mod tests { use arrow::array::Date64Array; use arrow::array::Decimal128Array; use arrow::array::Float64Array; + use arrow::array::Int32Array; use arrow::array::Int64Array; use arrow::array::StringArray; + use arrow::array::TimestampMicrosecondArray; + use arrow::array::UInt16DictionaryArray; use datafusion_common::plan_err; use datafusion_common::Result; use datafusion_expr::type_coercion::binary::comparison_coercion; use arrow::datatypes::Field; + use arrow::datatypes::TimeUnit; type InListCastResult = (Arc, Vec>); @@ -1119,8 +1123,8 @@ mod tests { // list of phy expr let mut phy_exprs = vec![ lit(1i64), - expressions::cast(lit(2i32), &schema, DataType::Int64)?, - expressions::try_cast(lit(3.13f32), &schema, DataType::Int64)?, + 
cast(lit(2i32), &schema, DataType::Int64)?, + try_cast(lit(3.13f32), &schema, DataType::Int64)?, ]; let result = try_cast_static_filter_to_set(&phy_exprs, &schema).unwrap(); @@ -1130,8 +1134,8 @@ mod tests { try_cast_static_filter_to_set(&phy_exprs, &schema).unwrap(); // cast(cast(lit())), but the cast to the same data type, one case will be ignored - phy_exprs.push(expressions::cast( - expressions::cast(lit(2i32), &schema, DataType::Int64)?, + phy_exprs.push(cast( + cast(lit(2i32), &schema, DataType::Int64)?, &schema, DataType::Int64, )?); @@ -1140,15 +1144,15 @@ mod tests { phy_exprs.clear(); // case(cast(lit())), the cast to the diff data type - phy_exprs.push(expressions::cast( - expressions::cast(lit(2i32), &schema, DataType::Int64)?, + phy_exprs.push(cast( + cast(lit(2i32), &schema, DataType::Int64)?, &schema, DataType::Int32, )?); try_cast_static_filter_to_set(&phy_exprs, &schema).unwrap(); // column - phy_exprs.push(expressions::col("a", &schema)?); + phy_exprs.push(col("a", &schema)?); assert!(try_cast_static_filter_to_set(&phy_exprs, &schema).is_err()); Ok(()) diff --git a/datafusion/physical-expr-common/src/expressions/mod.rs b/datafusion/physical-expr-common/src/expressions/mod.rs index 397c83148719..0abc3223390e 100644 --- a/datafusion/physical-expr-common/src/expressions/mod.rs +++ b/datafusion/physical-expr-common/src/expressions/mod.rs @@ -22,5 +22,6 @@ pub mod binary; pub mod cast; pub mod column; pub mod datum; +pub mod in_list; pub mod literal; pub mod try_cast; diff --git a/datafusion/physical-expr-common/src/physical_expr.rs b/datafusion/physical-expr-common/src/physical_expr.rs index 184f78c29a0b..4b67a6bd7fb9 100644 --- a/datafusion/physical-expr-common/src/physical_expr.rs +++ b/datafusion/physical-expr-common/src/physical_expr.rs @@ -25,15 +25,16 @@ use arrow::compute::filter_record_batch; use arrow::datatypes::{DataType, Schema}; use arrow::record_batch::RecordBatch; use datafusion_common::utils::DataPtr; -use 
datafusion_common::{internal_err, not_impl_err, DFSchema, Result}; +use datafusion_common::{internal_err, not_impl_err, DFSchema, Result, ScalarValue}; use datafusion_expr::execution_props::ExecutionProps; -use datafusion_expr::expr::Alias; +use datafusion_expr::expr::{Alias, InList}; use datafusion_expr::interval_arithmetic::Interval; use datafusion_expr::{BinaryExpr, ColumnarValue, Expr}; use crate::expressions::binary::binary; use crate::expressions::column::Column; -use crate::expressions::literal::Literal; +use crate::expressions::in_list::in_list; +use crate::expressions::literal::{lit, Literal}; use crate::sort_properties::SortProperties; use crate::utils::scatter; @@ -215,6 +216,29 @@ pub fn down_cast_any_ref(any: &dyn Any) -> &dyn Any { } } +/// Checks whether the given physical expression slices are equal in the sense +/// of bags (multi-sets), disregarding their orderings. +pub fn physical_exprs_bag_equal( + lhs: &[Arc], + rhs: &[Arc], +) -> bool { + // TODO: Once we can use `HashMap`s with `Arc`, this + // function should use a `HashMap` to reduce computational complexity. + if lhs.len() == rhs.len() { + let mut rhs_vec = rhs.to_vec(); + for expr in lhs { + if let Some(idx) = rhs_vec.iter().position(|e| expr.eq(e)) { + rhs_vec.swap_remove(idx); + } else { + return false; + } + } + true + } else { + false + } +} + /// [PhysicalExpr] evaluate DataFusion expressions such as `A + 1`, or `CAST(c1 /// AS int)`. 
/// @@ -533,23 +557,23 @@ pub fn create_physical_expr( // binary_expr // } // } - // Expr::InList(InList { - // expr, - // list, - // negated, - // }) => match expr.as_ref() { - // Expr::Literal(ScalarValue::Utf8(None)) => { - // Ok(expressions::lit(ScalarValue::Boolean(None))) - // } - // _ => { - // let value_expr = - // create_physical_expr(expr, input_dfschema, execution_props)?; + Expr::InList(InList { + expr, + list, + negated, + }) => match expr.as_ref() { + Expr::Literal(ScalarValue::Utf8(None)) => { + Ok(lit(ScalarValue::Boolean(None))) + } + _ => { + let value_expr = + create_physical_expr(expr, input_dfschema, execution_props)?; - // let list_exprs = - // create_physical_exprs(list, input_dfschema, execution_props)?; - // expressions::in_list(value_expr, list_exprs, negated, input_schema) - // } - // }, + let list_exprs = + create_physical_exprs(list, input_dfschema, execution_props)?; + in_list(value_expr, list_exprs, negated, input_schema) + } + }, other => { not_impl_err!("Physical plan does not support logical expression {other:?}") } diff --git a/datafusion/physical-expr/src/expressions/in_list.rs b/datafusion/physical-expr/src/expressions/in_list.rs deleted file mode 100644 index 07185b4d6527..000000000000 --- a/datafusion/physical-expr/src/expressions/in_list.rs +++ /dev/null @@ -1,1435 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Implementation of `InList` expressions: [`InListExpr`] - -use std::any::Any; -use std::fmt::Debug; -use std::hash::{Hash, Hasher}; -use std::sync::Arc; - -use crate::physical_expr::{down_cast_any_ref, physical_exprs_bag_equal}; -use crate::PhysicalExpr; - -use arrow::array::*; -use arrow::buffer::BooleanBuffer; -use arrow::compute::kernels::boolean::{not, or_kleene}; -use arrow::compute::kernels::cmp::eq; -use arrow::compute::take; -use arrow::datatypes::*; -use arrow::record_batch::RecordBatch; -use arrow::util::bit_iterator::BitIndexIterator; -use arrow::{downcast_dictionary_array, downcast_primitive_array}; -use datafusion_common::cast::{ - as_boolean_array, as_generic_binary_array, as_string_array, -}; -use datafusion_common::hash_utils::HashValue; -use datafusion_common::{exec_err, internal_err, not_impl_err, Result, ScalarValue}; -use datafusion_expr::ColumnarValue; - -use ahash::RandomState; -use hashbrown::hash_map::RawEntryMut; -use hashbrown::HashMap; - -/// InList -pub struct InListExpr { - expr: Arc, - list: Vec>, - negated: bool, - static_filter: Option>, -} - -impl Debug for InListExpr { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - f.debug_struct("InListExpr") - .field("expr", &self.expr) - .field("list", &self.list) - .field("negated", &self.negated) - .finish() - } -} - -/// A type-erased container of array elements -pub trait Set: Send + Sync { - fn contains(&self, v: &dyn Array, negated: bool) -> Result; - fn has_nulls(&self) -> bool; -} - -struct ArrayHashSet { - state: RandomState, - /// Used to 
provide a lookup from value to in list index - /// - /// Note: usize::hash is not used, instead the raw entry - /// API is used to store entries w.r.t their value - map: HashMap, -} - -struct ArraySet { - array: T, - hash_set: ArrayHashSet, -} - -impl ArraySet -where - T: Array + From, -{ - fn new(array: &T, hash_set: ArrayHashSet) -> Self { - Self { - array: downcast_array(array), - hash_set, - } - } -} - -impl Set for ArraySet -where - T: Array + 'static, - for<'a> &'a T: ArrayAccessor, - for<'a> <&'a T as ArrayAccessor>::Item: IsEqual, -{ - fn contains(&self, v: &dyn Array, negated: bool) -> Result { - downcast_dictionary_array! { - v => { - let values_contains = self.contains(v.values().as_ref(), negated)?; - let result = take(&values_contains, v.keys(), None)?; - return Ok(downcast_array(result.as_ref())) - } - _ => {} - } - - let v = v.as_any().downcast_ref::().unwrap(); - let in_array = &self.array; - let has_nulls = in_array.null_count() != 0; - - Ok(ArrayIter::new(v) - .map(|v| { - v.and_then(|v| { - let hash = v.hash_one(&self.hash_set.state); - let contains = self - .hash_set - .map - .raw_entry() - .from_hash(hash, |idx| in_array.value(*idx).is_equal(&v)) - .is_some(); - - match contains { - true => Some(!negated), - false if has_nulls => None, - false => Some(negated), - } - }) - }) - .collect()) - } - - fn has_nulls(&self) -> bool { - self.array.null_count() != 0 - } -} - -/// Computes an [`ArrayHashSet`] for the provided [`Array`] if there -/// are nulls present or there are more than the configured number of -/// elements. 
-/// -/// Note: This is split into a separate function as higher-rank trait bounds currently -/// cause type inference to misbehave -fn make_hash_set(array: T) -> ArrayHashSet -where - T: ArrayAccessor, - T::Item: IsEqual, -{ - let state = RandomState::new(); - let mut map: HashMap = - HashMap::with_capacity_and_hasher(array.len(), ()); - - let insert_value = |idx| { - let value = array.value(idx); - let hash = value.hash_one(&state); - if let RawEntryMut::Vacant(v) = map - .raw_entry_mut() - .from_hash(hash, |x| array.value(*x).is_equal(&value)) - { - v.insert_with_hasher(hash, idx, (), |x| array.value(*x).hash_one(&state)); - } - }; - - match array.nulls() { - Some(nulls) => { - BitIndexIterator::new(nulls.validity(), nulls.offset(), nulls.len()) - .for_each(insert_value) - } - None => (0..array.len()).for_each(insert_value), - } - - ArrayHashSet { state, map } -} - -/// Creates a `Box` for the given list of `IN` expressions and `batch` -fn make_set(array: &dyn Array) -> Result> { - Ok(downcast_primitive_array! 
{ - array => Arc::new(ArraySet::new(array, make_hash_set(array))), - DataType::Boolean => { - let array = as_boolean_array(array)?; - Arc::new(ArraySet::new(array, make_hash_set(array))) - }, - DataType::Utf8 => { - let array = as_string_array(array)?; - Arc::new(ArraySet::new(array, make_hash_set(array))) - } - DataType::LargeUtf8 => { - let array = as_largestring_array(array); - Arc::new(ArraySet::new(array, make_hash_set(array))) - } - DataType::Binary => { - let array = as_generic_binary_array::(array)?; - Arc::new(ArraySet::new(array, make_hash_set(array))) - } - DataType::LargeBinary => { - let array = as_generic_binary_array::(array)?; - Arc::new(ArraySet::new(array, make_hash_set(array))) - } - DataType::Dictionary(_, _) => unreachable!("dictionary should have been flattened"), - d => return not_impl_err!("DataType::{d} not supported in InList") - }) -} - -/// Evaluates the list of expressions into an array, flattening any dictionaries -fn evaluate_list( - list: &[Arc], - batch: &RecordBatch, -) -> Result { - let scalars = list - .iter() - .map(|expr| { - expr.evaluate(batch).and_then(|r| match r { - ColumnarValue::Array(_) => { - exec_err!("InList expression must evaluate to a scalar") - } - // Flatten dictionary values - ColumnarValue::Scalar(ScalarValue::Dictionary(_, v)) => Ok(*v), - ColumnarValue::Scalar(s) => Ok(s), - }) - }) - .collect::>>()?; - - ScalarValue::iter_to_array(scalars) -} - -fn try_cast_static_filter_to_set( - list: &[Arc], - schema: &Schema, -) -> Result> { - let batch = RecordBatch::new_empty(Arc::new(schema.clone())); - make_set(evaluate_list(list, &batch)?.as_ref()) -} - -/// Custom equality check function which is used with [`ArrayHashSet`] for existence check. -trait IsEqual: HashValue { - fn is_equal(&self, other: &Self) -> bool; -} - -impl<'a, T: IsEqual + ?Sized> IsEqual for &'a T { - fn is_equal(&self, other: &Self) -> bool { - T::is_equal(self, other) - } -} - -macro_rules! 
is_equal { - ($($t:ty),+) => { - $(impl IsEqual for $t { - fn is_equal(&self, other: &Self) -> bool { - self == other - } - })* - }; -} -is_equal!(i8, i16, i32, i64, i128, i256, u8, u16, u32, u64); -is_equal!(bool, str, [u8]); - -macro_rules! is_equal_float { - ($($t:ty),+) => { - $(impl IsEqual for $t { - fn is_equal(&self, other: &Self) -> bool { - self.to_bits() == other.to_bits() - } - })* - }; -} -is_equal_float!(half::f16, f32, f64); - -impl InListExpr { - /// Create a new InList expression - pub fn new( - expr: Arc, - list: Vec>, - negated: bool, - static_filter: Option>, - ) -> Self { - Self { - expr, - list, - negated, - static_filter, - } - } - - /// Input expression - pub fn expr(&self) -> &Arc { - &self.expr - } - - /// List to search in - pub fn list(&self) -> &[Arc] { - &self.list - } - - /// Is this negated e.g. NOT IN LIST - pub fn negated(&self) -> bool { - self.negated - } -} - -impl std::fmt::Display for InListExpr { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - if self.negated { - if self.static_filter.is_some() { - write!(f, "{} NOT IN (SET) ({:?})", self.expr, self.list) - } else { - write!(f, "{} NOT IN ({:?})", self.expr, self.list) - } - } else if self.static_filter.is_some() { - write!(f, "Use {} IN (SET) ({:?})", self.expr, self.list) - } else { - write!(f, "{} IN ({:?})", self.expr, self.list) - } - } -} - -impl PhysicalExpr for InListExpr { - /// Return a reference to Any that can be used for downcasting - fn as_any(&self) -> &dyn Any { - self - } - - fn data_type(&self, _input_schema: &Schema) -> Result { - Ok(DataType::Boolean) - } - - fn nullable(&self, input_schema: &Schema) -> Result { - if self.expr.nullable(input_schema)? { - return Ok(true); - } - - if let Some(static_filter) = &self.static_filter { - Ok(static_filter.has_nulls()) - } else { - for expr in &self.list { - if expr.nullable(input_schema)? 
{ - return Ok(true); - } - } - Ok(false) - } - } - - fn evaluate(&self, batch: &RecordBatch) -> Result { - let num_rows = batch.num_rows(); - let value = self.expr.evaluate(batch)?; - let r = match &self.static_filter { - Some(f) => f.contains(value.into_array(num_rows)?.as_ref(), self.negated)?, - None => { - let value = value.into_array(num_rows)?; - let found = self.list.iter().map(|expr| expr.evaluate(batch)).try_fold( - BooleanArray::new(BooleanBuffer::new_unset(num_rows), None), - |result, expr| -> Result { - Ok(or_kleene( - &result, - &eq(&value, &expr?.into_array(num_rows)?)?, - )?) - }, - )?; - - if self.negated { - not(&found)? - } else { - found - } - } - }; - Ok(ColumnarValue::Array(Arc::new(r))) - } - - fn children(&self) -> Vec> { - let mut children = vec![]; - children.push(self.expr.clone()); - children.extend(self.list.clone()); - children - } - - fn with_new_children( - self: Arc, - children: Vec>, - ) -> Result> { - // assume the static_filter will not change during the rewrite process - Ok(Arc::new(InListExpr::new( - children[0].clone(), - children[1..].to_vec(), - self.negated, - self.static_filter.clone(), - ))) - } - - fn dyn_hash(&self, state: &mut dyn Hasher) { - let mut s = state; - self.expr.hash(&mut s); - self.negated.hash(&mut s); - self.list.hash(&mut s); - // Add `self.static_filter` when hash is available - } -} - -impl PartialEq for InListExpr { - fn eq(&self, other: &dyn Any) -> bool { - down_cast_any_ref(other) - .downcast_ref::() - .map(|x| { - self.expr.eq(&x.expr) - && physical_exprs_bag_equal(&self.list, &x.list) - && self.negated == x.negated - }) - .unwrap_or(false) - } -} - -/// Checks if two types are logically equal, dictionary types are compared by their value types. 
-fn is_logically_eq(lhs: &DataType, rhs: &DataType) -> bool { - match (lhs, rhs) { - (DataType::Dictionary(_, v1), DataType::Dictionary(_, v2)) => { - v1.as_ref().eq(v2.as_ref()) - } - (DataType::Dictionary(_, l), _) => l.as_ref().eq(rhs), - (_, DataType::Dictionary(_, r)) => lhs.eq(r.as_ref()), - _ => lhs.eq(rhs), - } -} - -/// Creates a unary expression InList -pub fn in_list( - expr: Arc, - list: Vec>, - negated: &bool, - schema: &Schema, -) -> Result> { - // check the data type - let expr_data_type = expr.data_type(schema)?; - for list_expr in list.iter() { - let list_expr_data_type = list_expr.data_type(schema)?; - if !is_logically_eq(&expr_data_type, &list_expr_data_type) { - return internal_err!( - "The data type inlist should be same, the value type is {expr_data_type}, one of list expr type is {list_expr_data_type}" - ); - } - } - let static_filter = try_cast_static_filter_to_set(&list, schema).ok(); - Ok(Arc::new(InListExpr::new( - expr, - list, - *negated, - static_filter, - ))) -} - -#[cfg(test)] -mod tests { - use arrow::{array::StringArray, datatypes::Field}; - - use super::*; - use crate::expressions; - use crate::expressions::{col, lit, try_cast}; - use datafusion_common::plan_err; - use datafusion_common::Result; - use datafusion_expr::type_coercion::binary::comparison_coercion; - - type InListCastResult = (Arc, Vec>); - - // Try to do the type coercion for list physical expr. 
- // It's just used in the test - fn in_list_cast( - expr: Arc, - list: Vec>, - input_schema: &Schema, - ) -> Result { - let expr_type = &expr.data_type(input_schema)?; - let list_types: Vec = list - .iter() - .map(|list_expr| list_expr.data_type(input_schema).unwrap()) - .collect(); - let result_type = get_coerce_type(expr_type, &list_types); - match result_type { - None => plan_err!( - "Can not find compatible types to compare {expr_type:?} with {list_types:?}" - ), - Some(data_type) => { - // find the coerced type - let cast_expr = try_cast(expr, input_schema, data_type.clone())?; - let cast_list_expr = list - .into_iter() - .map(|list_expr| { - try_cast(list_expr, input_schema, data_type.clone()).unwrap() - }) - .collect(); - Ok((cast_expr, cast_list_expr)) - } - } - } - - // Attempts to coerce the types of `list_type` to be comparable with the - // `expr_type` - fn get_coerce_type(expr_type: &DataType, list_type: &[DataType]) -> Option { - list_type - .iter() - .try_fold(expr_type.clone(), |left_type, right_type| { - comparison_coercion(&left_type, right_type) - }) - } - - // applies the in_list expr to an input batch and list - macro_rules! in_list { - ($BATCH:expr, $LIST:expr, $NEGATED:expr, $EXPECTED:expr, $COL:expr, $SCHEMA:expr) => {{ - let (cast_expr, cast_list_exprs) = in_list_cast($COL, $LIST, $SCHEMA)?; - in_list_raw!( - $BATCH, - cast_list_exprs, - $NEGATED, - $EXPECTED, - cast_expr, - $SCHEMA - ); - }}; - } - - // applies the in_list expr to an input batch and list without cast - macro_rules! in_list_raw { - ($BATCH:expr, $LIST:expr, $NEGATED:expr, $EXPECTED:expr, $COL:expr, $SCHEMA:expr) => {{ - let expr = in_list($COL, $LIST, $NEGATED, $SCHEMA).unwrap(); - let result = expr - .evaluate(&$BATCH)? 
- .into_array($BATCH.num_rows()) - .expect("Failed to convert to array"); - let result = - as_boolean_array(&result).expect("failed to downcast to BooleanArray"); - let expected = &BooleanArray::from($EXPECTED); - assert_eq!(expected, result); - }}; - } - - #[test] - fn in_list_utf8() -> Result<()> { - let schema = Schema::new(vec![Field::new("a", DataType::Utf8, true)]); - let a = StringArray::from(vec![Some("a"), Some("d"), None]); - let col_a = col("a", &schema)?; - let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; - - // expression: "a in ("a", "b")" - let list = vec![lit("a"), lit("b")]; - in_list!( - batch, - list, - &false, - vec![Some(true), Some(false), None], - col_a.clone(), - &schema - ); - - // expression: "a not in ("a", "b")" - let list = vec![lit("a"), lit("b")]; - in_list!( - batch, - list, - &true, - vec![Some(false), Some(true), None], - col_a.clone(), - &schema - ); - - // expression: "a in ("a", "b", null)" - let list = vec![lit("a"), lit("b"), lit(ScalarValue::Utf8(None))]; - in_list!( - batch, - list, - &false, - vec![Some(true), None, None], - col_a.clone(), - &schema - ); - - // expression: "a not in ("a", "b", null)" - let list = vec![lit("a"), lit("b"), lit(ScalarValue::Utf8(None))]; - in_list!( - batch, - list, - &true, - vec![Some(false), None, None], - col_a.clone(), - &schema - ); - - Ok(()) - } - - #[test] - fn in_list_binary() -> Result<()> { - let schema = Schema::new(vec![Field::new("a", DataType::Binary, true)]); - let a = BinaryArray::from(vec![ - Some([1, 2, 3].as_slice()), - Some([1, 2, 2].as_slice()), - None, - ]); - let col_a = col("a", &schema)?; - let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; - - // expression: "a in ([1, 2, 3], [4, 5, 6])" - let list = vec![lit([1, 2, 3].as_slice()), lit([4, 5, 6].as_slice())]; - in_list!( - batch, - list.clone(), - &false, - vec![Some(true), Some(false), None], - col_a.clone(), - &schema - ); - - // expression: "a not in 
([1, 2, 3], [4, 5, 6])" - in_list!( - batch, - list, - &true, - vec![Some(false), Some(true), None], - col_a.clone(), - &schema - ); - - // expression: "a in ([1, 2, 3], [4, 5, 6], null)" - let list = vec![ - lit([1, 2, 3].as_slice()), - lit([4, 5, 6].as_slice()), - lit(ScalarValue::Binary(None)), - ]; - in_list!( - batch, - list.clone(), - &false, - vec![Some(true), None, None], - col_a.clone(), - &schema - ); - - // expression: "a in ([1, 2, 3], [4, 5, 6], null)" - in_list!( - batch, - list, - &true, - vec![Some(false), None, None], - col_a.clone(), - &schema - ); - - Ok(()) - } - - #[test] - fn in_list_int64() -> Result<()> { - let schema = Schema::new(vec![Field::new("a", DataType::Int64, true)]); - let a = Int64Array::from(vec![Some(0), Some(2), None]); - let col_a = col("a", &schema)?; - let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; - - // expression: "a in (0, 1)" - let list = vec![lit(0i64), lit(1i64)]; - in_list!( - batch, - list, - &false, - vec![Some(true), Some(false), None], - col_a.clone(), - &schema - ); - - // expression: "a not in (0, 1)" - let list = vec![lit(0i64), lit(1i64)]; - in_list!( - batch, - list, - &true, - vec![Some(false), Some(true), None], - col_a.clone(), - &schema - ); - - // expression: "a in (0, 1, NULL)" - let list = vec![lit(0i64), lit(1i64), lit(ScalarValue::Null)]; - in_list!( - batch, - list, - &false, - vec![Some(true), None, None], - col_a.clone(), - &schema - ); - - // expression: "a not in (0, 1, NULL)" - let list = vec![lit(0i64), lit(1i64), lit(ScalarValue::Null)]; - in_list!( - batch, - list, - &true, - vec![Some(false), None, None], - col_a.clone(), - &schema - ); - - Ok(()) - } - - #[test] - fn in_list_float64() -> Result<()> { - let schema = Schema::new(vec![Field::new("a", DataType::Float64, true)]); - let a = Float64Array::from(vec![ - Some(0.0), - Some(0.2), - None, - Some(f64::NAN), - Some(-f64::NAN), - ]); - let col_a = col("a", &schema)?; - let batch = 
RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; - - // expression: "a in (0.0, 0.1)" - let list = vec![lit(0.0f64), lit(0.1f64)]; - in_list!( - batch, - list, - &false, - vec![Some(true), Some(false), None, Some(false), Some(false)], - col_a.clone(), - &schema - ); - - // expression: "a not in (0.0, 0.1)" - let list = vec![lit(0.0f64), lit(0.1f64)]; - in_list!( - batch, - list, - &true, - vec![Some(false), Some(true), None, Some(true), Some(true)], - col_a.clone(), - &schema - ); - - // expression: "a in (0.0, 0.1, NULL)" - let list = vec![lit(0.0f64), lit(0.1f64), lit(ScalarValue::Null)]; - in_list!( - batch, - list, - &false, - vec![Some(true), None, None, None, None], - col_a.clone(), - &schema - ); - - // expression: "a not in (0.0, 0.1, NULL)" - let list = vec![lit(0.0f64), lit(0.1f64), lit(ScalarValue::Null)]; - in_list!( - batch, - list, - &true, - vec![Some(false), None, None, None, None], - col_a.clone(), - &schema - ); - - // expression: "a in (0.0, 0.1, NaN)" - let list = vec![lit(0.0f64), lit(0.1f64), lit(f64::NAN)]; - in_list!( - batch, - list, - &false, - vec![Some(true), Some(false), None, Some(true), Some(false)], - col_a.clone(), - &schema - ); - - // expression: "a not in (0.0, 0.1, NaN)" - let list = vec![lit(0.0f64), lit(0.1f64), lit(f64::NAN)]; - in_list!( - batch, - list, - &true, - vec![Some(false), Some(true), None, Some(false), Some(true)], - col_a.clone(), - &schema - ); - - // expression: "a in (0.0, 0.1, -NaN)" - let list = vec![lit(0.0f64), lit(0.1f64), lit(-f64::NAN)]; - in_list!( - batch, - list, - &false, - vec![Some(true), Some(false), None, Some(false), Some(true)], - col_a.clone(), - &schema - ); - - // expression: "a not in (0.0, 0.1, -NaN)" - let list = vec![lit(0.0f64), lit(0.1f64), lit(-f64::NAN)]; - in_list!( - batch, - list, - &true, - vec![Some(false), Some(true), None, Some(true), Some(false)], - col_a.clone(), - &schema - ); - - Ok(()) - } - - #[test] - fn in_list_bool() -> Result<()> { - let schema = 
Schema::new(vec![Field::new("a", DataType::Boolean, true)]); - let a = BooleanArray::from(vec![Some(true), None]); - let col_a = col("a", &schema)?; - let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; - - // expression: "a in (true)" - let list = vec![lit(true)]; - in_list!( - batch, - list, - &false, - vec![Some(true), None], - col_a.clone(), - &schema - ); - - // expression: "a not in (true)" - let list = vec![lit(true)]; - in_list!( - batch, - list, - &true, - vec![Some(false), None], - col_a.clone(), - &schema - ); - - // expression: "a in (true, NULL)" - let list = vec![lit(true), lit(ScalarValue::Null)]; - in_list!( - batch, - list, - &false, - vec![Some(true), None], - col_a.clone(), - &schema - ); - - // expression: "a not in (true, NULL)" - let list = vec![lit(true), lit(ScalarValue::Null)]; - in_list!( - batch, - list, - &true, - vec![Some(false), None], - col_a.clone(), - &schema - ); - - Ok(()) - } - - #[test] - fn in_list_date64() -> Result<()> { - let schema = Schema::new(vec![Field::new("a", DataType::Date64, true)]); - let a = Date64Array::from(vec![Some(0), Some(2), None]); - let col_a = col("a", &schema)?; - let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; - - // expression: "a in (0, 1)" - let list = vec![ - lit(ScalarValue::Date64(Some(0))), - lit(ScalarValue::Date64(Some(1))), - ]; - in_list!( - batch, - list, - &false, - vec![Some(true), Some(false), None], - col_a.clone(), - &schema - ); - - // expression: "a not in (0, 1)" - let list = vec![ - lit(ScalarValue::Date64(Some(0))), - lit(ScalarValue::Date64(Some(1))), - ]; - in_list!( - batch, - list, - &true, - vec![Some(false), Some(true), None], - col_a.clone(), - &schema - ); - - // expression: "a in (0, 1, NULL)" - let list = vec![ - lit(ScalarValue::Date64(Some(0))), - lit(ScalarValue::Date64(Some(1))), - lit(ScalarValue::Null), - ]; - in_list!( - batch, - list, - &false, - vec![Some(true), None, None], - col_a.clone(), - &schema 
- ); - - // expression: "a not in (0, 1, NULL)" - let list = vec![ - lit(ScalarValue::Date64(Some(0))), - lit(ScalarValue::Date64(Some(1))), - lit(ScalarValue::Null), - ]; - in_list!( - batch, - list, - &true, - vec![Some(false), None, None], - col_a.clone(), - &schema - ); - - Ok(()) - } - - #[test] - fn in_list_date32() -> Result<()> { - let schema = Schema::new(vec![Field::new("a", DataType::Date32, true)]); - let a = Date32Array::from(vec![Some(0), Some(2), None]); - let col_a = col("a", &schema)?; - let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; - - // expression: "a in (0, 1)" - let list = vec![ - lit(ScalarValue::Date32(Some(0))), - lit(ScalarValue::Date32(Some(1))), - ]; - in_list!( - batch, - list, - &false, - vec![Some(true), Some(false), None], - col_a.clone(), - &schema - ); - - // expression: "a not in (0, 1)" - let list = vec![ - lit(ScalarValue::Date32(Some(0))), - lit(ScalarValue::Date32(Some(1))), - ]; - in_list!( - batch, - list, - &true, - vec![Some(false), Some(true), None], - col_a.clone(), - &schema - ); - - // expression: "a in (0, 1, NULL)" - let list = vec![ - lit(ScalarValue::Date32(Some(0))), - lit(ScalarValue::Date32(Some(1))), - lit(ScalarValue::Null), - ]; - in_list!( - batch, - list, - &false, - vec![Some(true), None, None], - col_a.clone(), - &schema - ); - - // expression: "a not in (0, 1, NULL)" - let list = vec![ - lit(ScalarValue::Date32(Some(0))), - lit(ScalarValue::Date32(Some(1))), - lit(ScalarValue::Null), - ]; - in_list!( - batch, - list, - &true, - vec![Some(false), None, None], - col_a.clone(), - &schema - ); - - Ok(()) - } - - #[test] - fn in_list_decimal() -> Result<()> { - // Now, we can check the NULL type - let schema = - Schema::new(vec![Field::new("a", DataType::Decimal128(13, 4), true)]); - let array = vec![Some(100_0000_i128), None, Some(200_5000_i128)] - .into_iter() - .collect::(); - let array = array.with_precision_and_scale(13, 4).unwrap(); - let col_a = col("a", &schema)?; - 
let batch = - RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(array)])?; - - // expression: "a in (100,200), the data type of list is INT32 - let list = vec![lit(100i32), lit(200i32)]; - in_list!( - batch, - list, - &false, - vec![Some(true), None, Some(false)], - col_a.clone(), - &schema - ); - // expression: "a not in (100,200) - let list = vec![lit(100i32), lit(200i32)]; - in_list!( - batch, - list, - &true, - vec![Some(false), None, Some(true)], - col_a.clone(), - &schema - ); - - // expression: "a in (200,NULL), the data type of list is INT32 AND NULL - let list = vec![lit(ScalarValue::Int32(Some(100))), lit(ScalarValue::Null)]; - in_list!( - batch, - list.clone(), - &false, - vec![Some(true), None, None], - col_a.clone(), - &schema - ); - // expression: "a not in (200,NULL), the data type of list is INT32 AND NULL - in_list!( - batch, - list, - &true, - vec![Some(false), None, None], - col_a.clone(), - &schema - ); - - // expression: "a in (200.5, 100), the data type of list is FLOAT32 and INT32 - let list = vec![lit(200.50f32), lit(100i32)]; - in_list!( - batch, - list, - &false, - vec![Some(true), None, Some(true)], - col_a.clone(), - &schema - ); - - // expression: "a not in (200.5, 100), the data type of list is FLOAT32 and INT32 - let list = vec![lit(200.50f32), lit(101i32)]; - in_list!( - batch, - list, - &true, - vec![Some(true), None, Some(false)], - col_a.clone(), - &schema - ); - - // test the optimization: set - // expression: "a in (99..300), the data type of list is INT32 - let list = (99i32..300).map(lit).collect::>(); - - in_list!( - batch, - list.clone(), - &false, - vec![Some(true), None, Some(false)], - col_a.clone(), - &schema - ); - - in_list!( - batch, - list, - &true, - vec![Some(false), None, Some(true)], - col_a.clone(), - &schema - ); - - Ok(()) - } - - #[test] - fn test_cast_static_filter_to_set() -> Result<()> { - // random schema - let schema = - Schema::new(vec![Field::new("a", DataType::Decimal128(13, 4), true)]); - 
- // list of phy expr - let mut phy_exprs = vec![ - lit(1i64), - expressions::cast(lit(2i32), &schema, DataType::Int64)?, - expressions::try_cast(lit(3.13f32), &schema, DataType::Int64)?, - ]; - let result = try_cast_static_filter_to_set(&phy_exprs, &schema).unwrap(); - - let array = Int64Array::from(vec![1, 2, 3, 4]); - let r = result.contains(&array, false).unwrap(); - assert_eq!(r, BooleanArray::from(vec![true, true, true, false])); - - try_cast_static_filter_to_set(&phy_exprs, &schema).unwrap(); - // cast(cast(lit())), but the cast to the same data type, one case will be ignored - phy_exprs.push(expressions::cast( - expressions::cast(lit(2i32), &schema, DataType::Int64)?, - &schema, - DataType::Int64, - )?); - try_cast_static_filter_to_set(&phy_exprs, &schema).unwrap(); - - phy_exprs.clear(); - - // case(cast(lit())), the cast to the diff data type - phy_exprs.push(expressions::cast( - expressions::cast(lit(2i32), &schema, DataType::Int64)?, - &schema, - DataType::Int32, - )?); - try_cast_static_filter_to_set(&phy_exprs, &schema).unwrap(); - - // column - phy_exprs.push(expressions::col("a", &schema)?); - assert!(try_cast_static_filter_to_set(&phy_exprs, &schema).is_err()); - - Ok(()) - } - - #[test] - fn in_list_timestamp() -> Result<()> { - let schema = Schema::new(vec![Field::new( - "a", - DataType::Timestamp(TimeUnit::Microsecond, None), - true, - )]); - let a = TimestampMicrosecondArray::from(vec![ - Some(1388588401000000000), - Some(1288588501000000000), - None, - ]); - let col_a = col("a", &schema)?; - let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; - - let list = vec![ - lit(ScalarValue::TimestampMicrosecond( - Some(1388588401000000000), - None, - )), - lit(ScalarValue::TimestampMicrosecond( - Some(1388588401000000001), - None, - )), - lit(ScalarValue::TimestampMicrosecond( - Some(1388588401000000002), - None, - )), - ]; - - in_list!( - batch, - list.clone(), - &false, - vec![Some(true), Some(false), None], - 
col_a.clone(), - &schema - ); - - in_list!( - batch, - list.clone(), - &true, - vec![Some(false), Some(true), None], - col_a.clone(), - &schema - ); - Ok(()) - } - - #[test] - fn in_expr_with_multiple_element_in_list() -> Result<()> { - let schema = Schema::new(vec![ - Field::new("a", DataType::Float64, true), - Field::new("b", DataType::Float64, true), - Field::new("c", DataType::Float64, true), - ]); - let a = Float64Array::from(vec![ - Some(0.0), - Some(1.0), - Some(2.0), - Some(f64::NAN), - Some(-f64::NAN), - ]); - let b = Float64Array::from(vec![ - Some(8.0), - Some(1.0), - Some(5.0), - Some(f64::NAN), - Some(3.0), - ]); - let c = Float64Array::from(vec![ - Some(6.0), - Some(7.0), - None, - Some(5.0), - Some(-f64::NAN), - ]); - let col_a = col("a", &schema)?; - let col_b = col("b", &schema)?; - let col_c = col("c", &schema)?; - let batch = RecordBatch::try_new( - Arc::new(schema.clone()), - vec![Arc::new(a), Arc::new(b), Arc::new(c)], - )?; - - let list = vec![col_b.clone(), col_c.clone()]; - in_list!( - batch, - list.clone(), - &false, - vec![Some(false), Some(true), None, Some(true), Some(true)], - col_a.clone(), - &schema - ); - - in_list!( - batch, - list, - &true, - vec![Some(true), Some(false), None, Some(false), Some(false)], - col_a.clone(), - &schema - ); - - Ok(()) - } - - macro_rules! 
test_nullable { - ($COL:expr, $LIST:expr, $SCHEMA:expr, $EXPECTED:expr) => {{ - let (cast_expr, cast_list_exprs) = in_list_cast($COL, $LIST, $SCHEMA)?; - let expr = in_list(cast_expr, cast_list_exprs, &false, $SCHEMA).unwrap(); - let result = expr.nullable($SCHEMA)?; - assert_eq!($EXPECTED, result); - }}; - } - - #[test] - fn in_list_nullable() -> Result<()> { - let schema = Schema::new(vec![ - Field::new("c1_nullable", DataType::Int64, true), - Field::new("c2_non_nullable", DataType::Int64, false), - ]); - - let c1_nullable = col("c1_nullable", &schema)?; - let c2_non_nullable = col("c2_non_nullable", &schema)?; - - // static_filter has no nulls - let list = vec![lit(1_i64), lit(2_i64)]; - test_nullable!(c1_nullable.clone(), list.clone(), &schema, true); - test_nullable!(c2_non_nullable.clone(), list.clone(), &schema, false); - - // static_filter has nulls - let list = vec![lit(1_i64), lit(2_i64), lit(ScalarValue::Null)]; - test_nullable!(c1_nullable.clone(), list.clone(), &schema, true); - test_nullable!(c2_non_nullable.clone(), list.clone(), &schema, true); - - let list = vec![c1_nullable.clone()]; - test_nullable!(c2_non_nullable.clone(), list.clone(), &schema, true); - - let list = vec![c2_non_nullable.clone()]; - test_nullable!(c1_nullable.clone(), list.clone(), &schema, true); - - let list = vec![c2_non_nullable.clone(), c2_non_nullable.clone()]; - test_nullable!(c2_non_nullable.clone(), list.clone(), &schema, false); - - Ok(()) - } - - #[test] - fn in_list_no_cols() -> Result<()> { - // test logic when the in_list expression doesn't have any columns - let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]); - let a = Int32Array::from(vec![Some(1), Some(2), None]); - let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; - - let list = vec![lit(ScalarValue::from(1i32)), lit(ScalarValue::from(6i32))]; - - // 1 IN (1, 6) - let expr = lit(ScalarValue::Int32(Some(1))); - in_list!( - batch, - list.clone(), - &false, - // 
should have three outputs, as the input batch has three rows - vec![Some(true), Some(true), Some(true)], - expr, - &schema - ); - - // 2 IN (1, 6) - let expr = lit(ScalarValue::Int32(Some(2))); - in_list!( - batch, - list.clone(), - &false, - // should have three outputs, as the input batch has three rows - vec![Some(false), Some(false), Some(false)], - expr, - &schema - ); - - // NULL IN (1, 6) - let expr = lit(ScalarValue::Int32(None)); - in_list!( - batch, - list.clone(), - &false, - // should have three outputs, as the input batch has three rows - vec![None, None, None], - expr, - &schema - ); - - Ok(()) - } - - #[test] - fn in_list_utf8_with_dict_types() -> Result<()> { - fn dict_lit(key_type: DataType, value: &str) -> Arc { - lit(ScalarValue::Dictionary( - Box::new(key_type), - Box::new(ScalarValue::new_utf8(value.to_string())), - )) - } - - fn null_dict_lit(key_type: DataType) -> Arc { - lit(ScalarValue::Dictionary( - Box::new(key_type), - Box::new(ScalarValue::Utf8(None)), - )) - } - - let schema = Schema::new(vec![Field::new( - "a", - DataType::Dictionary(Box::new(DataType::UInt16), Box::new(DataType::Utf8)), - true, - )]); - let a: UInt16DictionaryArray = - vec![Some("a"), Some("d"), None].into_iter().collect(); - let col_a = col("a", &schema)?; - let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; - - // expression: "a in ("a", "b")" - let lists = [ - vec![lit("a"), lit("b")], - vec![ - dict_lit(DataType::Int8, "a"), - dict_lit(DataType::UInt16, "b"), - ], - ]; - for list in lists.iter() { - in_list_raw!( - batch, - list.clone(), - &false, - vec![Some(true), Some(false), None], - col_a.clone(), - &schema - ); - } - - // expression: "a not in ("a", "b")" - for list in lists.iter() { - in_list_raw!( - batch, - list.clone(), - &true, - vec![Some(false), Some(true), None], - col_a.clone(), - &schema - ); - } - - // expression: "a in ("a", "b", null)" - let lists = [ - vec![lit("a"), lit("b"), lit(ScalarValue::Utf8(None))], - vec![ 
- dict_lit(DataType::Int8, "a"), - dict_lit(DataType::UInt16, "b"), - null_dict_lit(DataType::UInt16), - ], - ]; - for list in lists.iter() { - in_list_raw!( - batch, - list.clone(), - &false, - vec![Some(true), None, None], - col_a.clone(), - &schema - ); - } - - // expression: "a not in ("a", "b", null)" - for list in lists.iter() { - in_list_raw!( - batch, - list.clone(), - &true, - vec![Some(false), None, None], - col_a.clone(), - &schema - ); - } - - Ok(()) - } -} diff --git a/datafusion/physical-expr/src/expressions/mod.rs b/datafusion/physical-expr/src/expressions/mod.rs index a84cc898ef0e..c6e4920c692f 100644 --- a/datafusion/physical-expr/src/expressions/mod.rs +++ b/datafusion/physical-expr/src/expressions/mod.rs @@ -19,7 +19,6 @@ mod case; mod column; -mod in_list; mod is_not_null; mod is_null; mod like; @@ -81,7 +80,7 @@ pub use datafusion_physical_expr_common::expressions::binary::{binary, BinaryExp pub use datafusion_physical_expr_common::expressions::column::{col, Column}; pub use datafusion_physical_expr_common::expressions::literal::{lit, Literal}; pub use datafusion_physical_expr_common::expressions::try_cast::{try_cast, TryCastExpr}; -pub use in_list::{in_list, InListExpr}; +pub use datafusion_physical_expr_common::expressions::in_list::{in_list, InListExpr}; pub use is_not_null::{is_not_null, IsNotNullExpr}; pub use is_null::{is_null, IsNullExpr}; pub use like::{like, LikeExpr}; diff --git a/datafusion/physical-expr/src/physical_expr.rs b/datafusion/physical-expr/src/physical_expr.rs index bc265d3819a5..7e47bc5ba66b 100644 --- a/datafusion/physical-expr/src/physical_expr.rs +++ b/datafusion/physical-expr/src/physical_expr.rs @@ -21,6 +21,7 @@ use datafusion_physical_expr_common::physical_expr::PhysicalExpr; use itertools::izip; pub use datafusion_physical_expr_common::physical_expr::down_cast_any_ref; +pub use datafusion_physical_expr_common::physical_expr::physical_exprs_bag_equal; /// Shared [`PhysicalExpr`]. 
pub type PhysicalExprRef = Arc; @@ -44,28 +45,6 @@ pub fn physical_exprs_equal( lhs.len() == rhs.len() && izip!(lhs, rhs).all(|(lhs, rhs)| lhs.eq(rhs)) } -/// Checks whether the given physical expression slices are equal in the sense -/// of bags (multi-sets), disregarding their orderings. -pub fn physical_exprs_bag_equal( - lhs: &[Arc], - rhs: &[Arc], -) -> bool { - // TODO: Once we can use `HashMap`s with `Arc`, this - // function should use a `HashMap` to reduce computational complexity. - if lhs.len() == rhs.len() { - let mut rhs_vec = rhs.to_vec(); - for expr in lhs { - if let Some(idx) = rhs_vec.iter().position(|e| expr.eq(e)) { - rhs_vec.swap_remove(idx); - } else { - return false; - } - } - true - } else { - false - } -} /// This utility function removes duplicates from the given `exprs` vector. /// Note that this function does not necessarily preserve its input ordering. From de415b60b27952e26c9a85875eab11f52e0121c0 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Mon, 22 Apr 2024 21:07:50 +0800 Subject: [PATCH 11/16] like Signed-off-by: jayzhan211 --- .../src/expressions/like.rs | 6 +-- .../src/expressions/mod.rs | 1 + .../physical-expr-common/src/physical_expr.rs | 49 ++++++++++--------- .../physical-expr/src/expressions/mod.rs | 3 +- 4 files changed, 30 insertions(+), 29 deletions(-) rename datafusion/{physical-expr => physical-expr-common}/src/expressions/like.rs (97%) diff --git a/datafusion/physical-expr/src/expressions/like.rs b/datafusion/physical-expr-common/src/expressions/like.rs similarity index 97% rename from datafusion/physical-expr/src/expressions/like.rs rename to datafusion/physical-expr-common/src/expressions/like.rs index c219dea83617..48114227b61c 100644 --- a/datafusion/physical-expr/src/expressions/like.rs +++ b/datafusion/physical-expr-common/src/expressions/like.rs @@ -18,13 +18,13 @@ use std::hash::{Hash, Hasher}; use std::{any::Any, sync::Arc}; -use crate::{physical_expr::down_cast_any_ref, PhysicalExpr}; +use 
crate::physical_expr::{down_cast_any_ref, PhysicalExpr}; use arrow::record_batch::RecordBatch; use arrow_schema::{DataType, Schema}; use datafusion_common::{internal_err, Result}; use datafusion_expr::ColumnarValue; -use datafusion_physical_expr_common::expressions::datum::apply_cmp; +use crate::expressions::datum::apply_cmp; // Like expression #[derive(Debug, Hash)] @@ -174,7 +174,7 @@ pub fn like( #[cfg(test)] mod test { use super::*; - use crate::expressions::col; + use crate::expressions::column::col; use arrow::array::*; use arrow_schema::Field; use datafusion_common::cast::as_boolean_array; diff --git a/datafusion/physical-expr-common/src/expressions/mod.rs b/datafusion/physical-expr-common/src/expressions/mod.rs index 0abc3223390e..d1beb5eb4a4f 100644 --- a/datafusion/physical-expr-common/src/expressions/mod.rs +++ b/datafusion/physical-expr-common/src/expressions/mod.rs @@ -23,5 +23,6 @@ pub mod cast; pub mod column; pub mod datum; pub mod in_list; +pub mod like; pub mod literal; pub mod try_cast; diff --git a/datafusion/physical-expr-common/src/physical_expr.rs b/datafusion/physical-expr-common/src/physical_expr.rs index 4b67a6bd7fb9..f3f5bb6a69af 100644 --- a/datafusion/physical-expr-common/src/physical_expr.rs +++ b/datafusion/physical-expr-common/src/physical_expr.rs @@ -25,15 +25,16 @@ use arrow::compute::filter_record_batch; use arrow::datatypes::{DataType, Schema}; use arrow::record_batch::RecordBatch; use datafusion_common::utils::DataPtr; -use datafusion_common::{internal_err, not_impl_err, DFSchema, Result, ScalarValue}; +use datafusion_common::{exec_err, internal_err, not_impl_err, DFSchema, Result, ScalarValue}; use datafusion_expr::execution_props::ExecutionProps; use datafusion_expr::expr::{Alias, InList}; use datafusion_expr::interval_arithmetic::Interval; -use datafusion_expr::{BinaryExpr, ColumnarValue, Expr}; +use datafusion_expr::{BinaryExpr, ColumnarValue, Expr, Like}; use crate::expressions::binary::binary; use 
crate::expressions::column::Column; use crate::expressions::in_list::in_list; +use crate::expressions::like::like; use crate::expressions::literal::{lit, Literal}; use crate::sort_properties::SortProperties; use crate::utils::scatter; @@ -404,28 +405,28 @@ pub fn create_physical_expr( // planning. binary(lhs, *op, rhs, input_schema) } - // Expr::Like(Like { - // negated, - // expr, - // pattern, - // escape_char, - // case_insensitive, - // }) => { - // if escape_char.is_some() { - // return exec_err!("LIKE does not support escape_char"); - // } - // let physical_expr = - // create_physical_expr(expr, input_dfschema, execution_props)?; - // let physical_pattern = - // create_physical_expr(pattern, input_dfschema, execution_props)?; - // like( - // *negated, - // *case_insensitive, - // physical_expr, - // physical_pattern, - // input_schema, - // ) - // } + Expr::Like(Like { + negated, + expr, + pattern, + escape_char, + case_insensitive, + }) => { + if escape_char.is_some() { + return exec_err!("LIKE does not support escape_char"); + } + let physical_expr = + create_physical_expr(expr, input_dfschema, execution_props)?; + let physical_pattern = + create_physical_expr(pattern, input_dfschema, execution_props)?; + like( + *negated, + *case_insensitive, + physical_expr, + physical_pattern, + input_schema, + ) + } // Expr::Case(case) => { // let expr: Option> = if let Some(e) = &case.expr { // Some(create_physical_expr( diff --git a/datafusion/physical-expr/src/expressions/mod.rs b/datafusion/physical-expr/src/expressions/mod.rs index c6e4920c692f..22f0b71647ab 100644 --- a/datafusion/physical-expr/src/expressions/mod.rs +++ b/datafusion/physical-expr/src/expressions/mod.rs @@ -21,7 +21,6 @@ mod case; mod column; mod is_not_null; mod is_null; -mod like; mod negative; mod no_op; mod not; @@ -83,7 +82,7 @@ pub use datafusion_physical_expr_common::expressions::try_cast::{try_cast, TryCa pub use datafusion_physical_expr_common::expressions::in_list::{in_list, InListExpr}; 
pub use is_not_null::{is_not_null, IsNotNullExpr}; pub use is_null::{is_null, IsNullExpr}; -pub use like::{like, LikeExpr}; +pub use datafusion_physical_expr_common::expressions::like::{like, LikeExpr}; pub use negative::{negative, NegativeExpr}; pub use no_op::NoOp; pub use not::{not, NotExpr}; From 9eb460bc65ffc1165af98f8647781e2eee71525a Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Mon, 22 Apr 2024 21:33:05 +0800 Subject: [PATCH 12/16] logic ops Signed-off-by: jayzhan211 --- .../physical-expr-common/src/physical_expr.rs | 120 +++++++++--------- 1 file changed, 61 insertions(+), 59 deletions(-) diff --git a/datafusion/physical-expr-common/src/physical_expr.rs b/datafusion/physical-expr-common/src/physical_expr.rs index f3f5bb6a69af..27abdd8f461f 100644 --- a/datafusion/physical-expr-common/src/physical_expr.rs +++ b/datafusion/physical-expr-common/src/physical_expr.rs @@ -29,13 +29,15 @@ use datafusion_common::{exec_err, internal_err, not_impl_err, DFSchema, Result, use datafusion_expr::execution_props::ExecutionProps; use datafusion_expr::expr::{Alias, InList}; use datafusion_expr::interval_arithmetic::Interval; -use datafusion_expr::{BinaryExpr, ColumnarValue, Expr, Like}; +use datafusion_expr::{binary_expr, BinaryExpr, Cast, ColumnarValue, Expr, Like, Operator, TryCast}; use crate::expressions::binary::binary; +use crate::expressions::cast::cast; use crate::expressions::column::Column; use crate::expressions::in_list::in_list; use crate::expressions::like::like; use crate::expressions::literal::{lit, Literal}; +use crate::expressions::try_cast::try_cast; use crate::sort_properties::SortProperties; use crate::utils::scatter; @@ -344,54 +346,54 @@ pub fn create_physical_expr( // } // } // } - // Expr::IsTrue(expr) => { - // let binary_op = binary_expr( - // expr.as_ref().clone(), - // Operator::IsNotDistinctFrom, - // Expr::Literal(ScalarValue::Boolean(Some(true))), - // ); - // create_physical_expr(&binary_op, input_dfschema, execution_props) - // } - // 
Expr::IsNotTrue(expr) => { - // let binary_op = binary_expr( - // expr.as_ref().clone(), - // Operator::IsDistinctFrom, - // Expr::Literal(ScalarValue::Boolean(Some(true))), - // ); - // create_physical_expr(&binary_op, input_dfschema, execution_props) - // } - // Expr::IsFalse(expr) => { - // let binary_op = binary_expr( - // expr.as_ref().clone(), - // Operator::IsNotDistinctFrom, - // Expr::Literal(ScalarValue::Boolean(Some(false))), - // ); - // create_physical_expr(&binary_op, input_dfschema, execution_props) - // } - // Expr::IsNotFalse(expr) => { - // let binary_op = binary_expr( - // expr.as_ref().clone(), - // Operator::IsDistinctFrom, - // Expr::Literal(ScalarValue::Boolean(Some(false))), - // ); - // create_physical_expr(&binary_op, input_dfschema, execution_props) - // } - // Expr::IsUnknown(expr) => { - // let binary_op = binary_expr( - // expr.as_ref().clone(), - // Operator::IsNotDistinctFrom, - // Expr::Literal(ScalarValue::Boolean(None)), - // ); - // create_physical_expr(&binary_op, input_dfschema, execution_props) - // } - // Expr::IsNotUnknown(expr) => { - // let binary_op = binary_expr( - // expr.as_ref().clone(), - // Operator::IsDistinctFrom, - // Expr::Literal(ScalarValue::Boolean(None)), - // ); - // create_physical_expr(&binary_op, input_dfschema, execution_props) - // } + Expr::IsTrue(expr) => { + let binary_op = binary_expr( + expr.as_ref().clone(), + Operator::IsNotDistinctFrom, + Expr::Literal(ScalarValue::Boolean(Some(true))), + ); + create_physical_expr(&binary_op, input_dfschema, execution_props) + } + Expr::IsNotTrue(expr) => { + let binary_op = binary_expr( + expr.as_ref().clone(), + Operator::IsDistinctFrom, + Expr::Literal(ScalarValue::Boolean(Some(true))), + ); + create_physical_expr(&binary_op, input_dfschema, execution_props) + } + Expr::IsFalse(expr) => { + let binary_op = binary_expr( + expr.as_ref().clone(), + Operator::IsNotDistinctFrom, + Expr::Literal(ScalarValue::Boolean(Some(false))), + ); + 
create_physical_expr(&binary_op, input_dfschema, execution_props) + } + Expr::IsNotFalse(expr) => { + let binary_op = binary_expr( + expr.as_ref().clone(), + Operator::IsDistinctFrom, + Expr::Literal(ScalarValue::Boolean(Some(false))), + ); + create_physical_expr(&binary_op, input_dfschema, execution_props) + } + Expr::IsUnknown(expr) => { + let binary_op = binary_expr( + expr.as_ref().clone(), + Operator::IsNotDistinctFrom, + Expr::Literal(ScalarValue::Boolean(None)), + ); + create_physical_expr(&binary_op, input_dfschema, execution_props) + } + Expr::IsNotUnknown(expr) => { + let binary_op = binary_expr( + expr.as_ref().clone(), + Operator::IsDistinctFrom, + Expr::Literal(ScalarValue::Boolean(None)), + ); + create_physical_expr(&binary_op, input_dfschema, execution_props) + } Expr::BinaryExpr(BinaryExpr { left, op, right }) => { // Create physical expressions for left and right operands let lhs = create_physical_expr(left, input_dfschema, execution_props)?; @@ -464,16 +466,16 @@ pub fn create_physical_expr( // }; // Ok(expressions::case(expr, when_then_expr, else_expr)?) // } - // Expr::Cast(Cast { expr, data_type }) => expressions::cast( - // create_physical_expr(expr, input_dfschema, execution_props)?, - // input_schema, - // data_type.clone(), - // ), - // Expr::TryCast(TryCast { expr, data_type }) => expressions::try_cast( - // create_physical_expr(expr, input_dfschema, execution_props)?, - // input_schema, - // data_type.clone(), - // ), + Expr::Cast(Cast { expr, data_type }) => cast( + create_physical_expr(expr, input_dfschema, execution_props)?, + input_schema, + data_type.clone(), + ), + Expr::TryCast(TryCast { expr, data_type }) => try_cast( + create_physical_expr(expr, input_dfschema, execution_props)?, + input_schema, + data_type.clone(), + ), // Expr::Not(expr) => { // expressions::not(create_physical_expr(expr, input_dfschema, execution_props)?) 
// } From 1d4705819fd80a0be3f2dee47c3ad97444344801 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Mon, 22 Apr 2024 21:41:00 +0800 Subject: [PATCH 13/16] rename imported func Signed-off-by: jayzhan211 --- .../src/expressions/mod.rs | 1 + .../src/expressions/not.rs | 4 ++-- .../physical-expr-common/src/physical_expr.rs | 22 ++++++++++--------- .../physical-expr/src/expressions/mod.rs | 3 +-- 4 files changed, 16 insertions(+), 14 deletions(-) rename datafusion/{physical-expr => physical-expr-common}/src/expressions/not.rs (98%) diff --git a/datafusion/physical-expr-common/src/expressions/mod.rs b/datafusion/physical-expr-common/src/expressions/mod.rs index d1beb5eb4a4f..01c136258748 100644 --- a/datafusion/physical-expr-common/src/expressions/mod.rs +++ b/datafusion/physical-expr-common/src/expressions/mod.rs @@ -25,4 +25,5 @@ pub mod datum; pub mod in_list; pub mod like; pub mod literal; +pub mod not; pub mod try_cast; diff --git a/datafusion/physical-expr/src/expressions/not.rs b/datafusion/physical-expr-common/src/expressions/not.rs similarity index 98% rename from datafusion/physical-expr/src/expressions/not.rs rename to datafusion/physical-expr-common/src/expressions/not.rs index f17df73e3070..5bc5550df36a 100644 --- a/datafusion/physical-expr/src/expressions/not.rs +++ b/datafusion/physical-expr-common/src/expressions/not.rs @@ -23,7 +23,7 @@ use std::hash::{Hash, Hasher}; use std::sync::Arc; use crate::physical_expr::down_cast_any_ref; -use crate::PhysicalExpr; +use crate::physical_expr::PhysicalExpr; use arrow::datatypes::{DataType, Schema}; use arrow::record_batch::RecordBatch; use datafusion_common::{cast::as_boolean_array, Result, ScalarValue}; @@ -123,7 +123,7 @@ pub fn not(arg: Arc) -> Result> { #[cfg(test)] mod tests { use super::*; - use crate::expressions::col; + use crate::expressions::column::col; use arrow::{array::BooleanArray, datatypes::*}; use datafusion_common::Result; diff --git a/datafusion/physical-expr-common/src/physical_expr.rs 
b/datafusion/physical-expr-common/src/physical_expr.rs index 27abdd8f461f..55e511c38202 100644 --- a/datafusion/physical-expr-common/src/physical_expr.rs +++ b/datafusion/physical-expr-common/src/physical_expr.rs @@ -24,12 +24,13 @@ use arrow::array::BooleanArray; use arrow::compute::filter_record_batch; use arrow::datatypes::{DataType, Schema}; use arrow::record_batch::RecordBatch; + use datafusion_common::utils::DataPtr; use datafusion_common::{exec_err, internal_err, not_impl_err, DFSchema, Result, ScalarValue}; use datafusion_expr::execution_props::ExecutionProps; use datafusion_expr::expr::{Alias, InList}; use datafusion_expr::interval_arithmetic::Interval; -use datafusion_expr::{binary_expr, BinaryExpr, Cast, ColumnarValue, Expr, Like, Operator, TryCast}; +use datafusion_expr::{BinaryExpr, Cast, ColumnarValue, Expr, Like, Operator, TryCast}; use crate::expressions::binary::binary; use crate::expressions::cast::cast; @@ -37,6 +38,7 @@ use crate::expressions::column::Column; use crate::expressions::in_list::in_list; use crate::expressions::like::like; use crate::expressions::literal::{lit, Literal}; +use crate::expressions::not::not; use crate::expressions::try_cast::try_cast; use crate::sort_properties::SortProperties; use crate::utils::scatter; @@ -347,7 +349,7 @@ pub fn create_physical_expr( // } // } Expr::IsTrue(expr) => { - let binary_op = binary_expr( + let binary_op = datafusion_expr::binary_expr( expr.as_ref().clone(), Operator::IsNotDistinctFrom, Expr::Literal(ScalarValue::Boolean(Some(true))), @@ -355,7 +357,7 @@ pub fn create_physical_expr( create_physical_expr(&binary_op, input_dfschema, execution_props) } Expr::IsNotTrue(expr) => { - let binary_op = binary_expr( + let binary_op = datafusion_expr::binary_expr( expr.as_ref().clone(), Operator::IsDistinctFrom, Expr::Literal(ScalarValue::Boolean(Some(true))), @@ -363,7 +365,7 @@ pub fn create_physical_expr( create_physical_expr(&binary_op, input_dfschema, execution_props) } Expr::IsFalse(expr) => { - 
let binary_op = binary_expr( + let binary_op = datafusion_expr::binary_expr( expr.as_ref().clone(), Operator::IsNotDistinctFrom, Expr::Literal(ScalarValue::Boolean(Some(false))), @@ -371,7 +373,7 @@ pub fn create_physical_expr( create_physical_expr(&binary_op, input_dfschema, execution_props) } Expr::IsNotFalse(expr) => { - let binary_op = binary_expr( + let binary_op = datafusion_expr::binary_expr( expr.as_ref().clone(), Operator::IsDistinctFrom, Expr::Literal(ScalarValue::Boolean(Some(false))), @@ -379,7 +381,7 @@ pub fn create_physical_expr( create_physical_expr(&binary_op, input_dfschema, execution_props) } Expr::IsUnknown(expr) => { - let binary_op = binary_expr( + let binary_op = datafusion_expr::binary_expr( expr.as_ref().clone(), Operator::IsNotDistinctFrom, Expr::Literal(ScalarValue::Boolean(None)), @@ -387,7 +389,7 @@ pub fn create_physical_expr( create_physical_expr(&binary_op, input_dfschema, execution_props) } Expr::IsNotUnknown(expr) => { - let binary_op = binary_expr( + let binary_op = datafusion_expr::binary_expr( expr.as_ref().clone(), Operator::IsDistinctFrom, Expr::Literal(ScalarValue::Boolean(None)), @@ -476,9 +478,9 @@ pub fn create_physical_expr( input_schema, data_type.clone(), ), - // Expr::Not(expr) => { - // expressions::not(create_physical_expr(expr, input_dfschema, execution_props)?) - // } + Expr::Not(expr) => { + not(create_physical_expr(expr, input_dfschema, execution_props)?) 
+ } // Expr::Negative(expr) => expressions::negative( // create_physical_expr(expr, input_dfschema, execution_props)?, // input_schema, diff --git a/datafusion/physical-expr/src/expressions/mod.rs b/datafusion/physical-expr/src/expressions/mod.rs index 22f0b71647ab..bb7c7e2ae457 100644 --- a/datafusion/physical-expr/src/expressions/mod.rs +++ b/datafusion/physical-expr/src/expressions/mod.rs @@ -23,7 +23,6 @@ mod is_not_null; mod is_null; mod negative; mod no_op; -mod not; /// Module with some convenient methods used in expression building pub mod helpers { @@ -85,7 +84,7 @@ pub use is_null::{is_null, IsNullExpr}; pub use datafusion_physical_expr_common::expressions::like::{like, LikeExpr}; pub use negative::{negative, NegativeExpr}; pub use no_op::NoOp; -pub use not::{not, NotExpr}; +pub use datafusion_physical_expr_common::expressions::not::{not, NotExpr}; #[cfg(test)] pub(crate) mod tests { From e4601f5d1aafac6d5008a368c0b3fa5cee534be2 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Mon, 22 Apr 2024 21:42:53 +0800 Subject: [PATCH 14/16] between Signed-off-by: jayzhan211 --- .../physical-expr-common/src/physical_expr.rs | 46 +++++++++---------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/datafusion/physical-expr-common/src/physical_expr.rs b/datafusion/physical-expr-common/src/physical_expr.rs index 55e511c38202..5ddc61fc3065 100644 --- a/datafusion/physical-expr-common/src/physical_expr.rs +++ b/datafusion/physical-expr-common/src/physical_expr.rs @@ -30,7 +30,7 @@ use datafusion_common::{exec_err, internal_err, not_impl_err, DFSchema, Result, use datafusion_expr::execution_props::ExecutionProps; use datafusion_expr::expr::{Alias, InList}; use datafusion_expr::interval_arithmetic::Interval; -use datafusion_expr::{BinaryExpr, Cast, ColumnarValue, Expr, Like, Operator, TryCast}; +use datafusion_expr::{Between, BinaryExpr, Cast, ColumnarValue, Expr, Like, Operator, TryCast}; use crate::expressions::binary::binary; use 
crate::expressions::cast::cast; @@ -538,30 +538,30 @@ pub fn create_physical_expr( // } // } // } - // Expr::Between(Between { - // expr, - // negated, - // low, - // high, - // }) => { - // let value_expr = create_physical_expr(expr, input_dfschema, execution_props)?; - // let low_expr = create_physical_expr(low, input_dfschema, execution_props)?; - // let high_expr = create_physical_expr(high, input_dfschema, execution_props)?; + Expr::Between(Between { + expr, + negated, + low, + high, + }) => { + let value_expr = create_physical_expr(expr, input_dfschema, execution_props)?; + let low_expr = create_physical_expr(low, input_dfschema, execution_props)?; + let high_expr = create_physical_expr(high, input_dfschema, execution_props)?; - // // rewrite the between into the two binary operators - // let binary_expr = binary( - // binary(value_expr.clone(), Operator::GtEq, low_expr, input_schema)?, - // Operator::And, - // binary(value_expr.clone(), Operator::LtEq, high_expr, input_schema)?, - // input_schema, - // ); + // rewrite the between into the two binary operators + let binary_expr = binary( + binary(value_expr.clone(), Operator::GtEq, low_expr, input_schema)?, + Operator::And, + binary(value_expr.clone(), Operator::LtEq, high_expr, input_schema)?, + input_schema, + ); - // if *negated { - // expressions::not(binary_expr?) - // } else { - // binary_expr - // } - // } + if *negated { + not(binary_expr?) 
+ } else { + binary_expr + } + } Expr::InList(InList { expr, list, From 9bc57104957d6e59f10f3d0104a4375fa7fc1a2f Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Mon, 22 Apr 2024 23:14:35 +0800 Subject: [PATCH 15/16] negative Signed-off-by: jayzhan211 --- datafusion/physical-expr-common/src/expressions/mod.rs | 1 + .../src/expressions/negative.rs | 4 ++-- datafusion/physical-expr-common/src/physical_expr.rs | 9 +++++---- datafusion/physical-expr/src/expressions/mod.rs | 3 +-- 4 files changed, 9 insertions(+), 8 deletions(-) rename datafusion/{physical-expr => physical-expr-common}/src/expressions/negative.rs (99%) diff --git a/datafusion/physical-expr-common/src/expressions/mod.rs b/datafusion/physical-expr-common/src/expressions/mod.rs index 01c136258748..329440e7ed62 100644 --- a/datafusion/physical-expr-common/src/expressions/mod.rs +++ b/datafusion/physical-expr-common/src/expressions/mod.rs @@ -25,5 +25,6 @@ pub mod datum; pub mod in_list; pub mod like; pub mod literal; +pub mod negative; pub mod not; pub mod try_cast; diff --git a/datafusion/physical-expr/src/expressions/negative.rs b/datafusion/physical-expr-common/src/expressions/negative.rs similarity index 99% rename from datafusion/physical-expr/src/expressions/negative.rs rename to datafusion/physical-expr-common/src/expressions/negative.rs index d6dd3ddbea5e..223fed728364 100644 --- a/datafusion/physical-expr/src/expressions/negative.rs +++ b/datafusion/physical-expr-common/src/expressions/negative.rs @@ -23,7 +23,7 @@ use std::sync::Arc; use crate::physical_expr::down_cast_any_ref; use crate::sort_properties::SortProperties; -use crate::PhysicalExpr; +use crate::physical_expr::PhysicalExpr; use arrow::{ compute::kernels::numeric::neg_wrapping, @@ -173,7 +173,7 @@ pub fn negative( #[cfg(test)] mod tests { use super::*; - use crate::expressions::{col, Column}; + use crate::expressions::column::{col, Column}; use arrow::array::*; use arrow::datatypes::*; diff --git 
a/datafusion/physical-expr-common/src/physical_expr.rs b/datafusion/physical-expr-common/src/physical_expr.rs index 5ddc61fc3065..3422f92ab92e 100644 --- a/datafusion/physical-expr-common/src/physical_expr.rs +++ b/datafusion/physical-expr-common/src/physical_expr.rs @@ -38,6 +38,7 @@ use crate::expressions::column::Column; use crate::expressions::in_list::in_list; use crate::expressions::like::like; use crate::expressions::literal::{lit, Literal}; +use crate::expressions::negative::negative; use crate::expressions::not::not; use crate::expressions::try_cast::try_cast; use crate::sort_properties::SortProperties; @@ -481,10 +482,10 @@ pub fn create_physical_expr( Expr::Not(expr) => { not(create_physical_expr(expr, input_dfschema, execution_props)?) } - // Expr::Negative(expr) => expressions::negative( - // create_physical_expr(expr, input_dfschema, execution_props)?, - // input_schema, - // ), + Expr::Negative(expr) => negative( + create_physical_expr(expr, input_dfschema, execution_props)?, + input_schema, + ), // Expr::IsNull(expr) => expressions::is_null(create_physical_expr( // expr, // input_dfschema, diff --git a/datafusion/physical-expr/src/expressions/mod.rs b/datafusion/physical-expr/src/expressions/mod.rs index bb7c7e2ae457..4f7bdc99424a 100644 --- a/datafusion/physical-expr/src/expressions/mod.rs +++ b/datafusion/physical-expr/src/expressions/mod.rs @@ -21,7 +21,6 @@ mod case; mod column; mod is_not_null; mod is_null; -mod negative; mod no_op; /// Module with some convenient methods used in expression building @@ -82,7 +81,7 @@ pub use datafusion_physical_expr_common::expressions::in_list::{in_list, InListE pub use is_not_null::{is_not_null, IsNotNullExpr}; pub use is_null::{is_null, IsNullExpr}; pub use datafusion_physical_expr_common::expressions::like::{like, LikeExpr}; -pub use negative::{negative, NegativeExpr}; +pub use datafusion_physical_expr_common::expressions::negative::{negative, NegativeExpr}; pub use no_op::NoOp; pub use 
datafusion_physical_expr_common::expressions::not::{not, NotExpr}; From b43f25302950afab2743a0fc2af9047b534b6486 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Mon, 22 Apr 2024 23:21:35 +0800 Subject: [PATCH 16/16] is null Signed-off-by: jayzhan211 --- datafusion-cli/Cargo.lock | 4 + datafusion/physical-expr-common/Cargo.toml | 2 +- .../src/expressions/cast.rs | 2 +- .../src/expressions/in_list.rs | 11 +- .../src/expressions/is_not_null.rs | 4 +- .../src/expressions/is_null.rs | 4 +- .../src/expressions/like.rs | 2 +- .../src/expressions/mod.rs | 2 + .../src/expressions/negative.rs | 2 +- .../physical-expr-common/src/physical_expr.rs | 104 +++++++++--------- .../physical-expr/src/expressions/mod.rs | 22 ++-- datafusion/physical-expr/src/physical_expr.rs | 1 - 12 files changed, 87 insertions(+), 73 deletions(-) rename datafusion/{physical-expr => physical-expr-common}/src/expressions/is_not_null.rs (98%) rename datafusion/{physical-expr => physical-expr-common}/src/expressions/is_null.rs (98%) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index ea3644183355..cd6faf4f7884 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1361,9 +1361,13 @@ dependencies = [ name = "datafusion-physical-expr-common" version = "37.1.0" dependencies = [ + "ahash", "arrow", + "arrow-schema", "datafusion-common", "datafusion-expr", + "half", + "hashbrown 0.14.3", "paste", ] diff --git a/datafusion/physical-expr-common/Cargo.toml b/datafusion/physical-expr-common/Cargo.toml index b34eca32c21c..6ef738380531 100644 --- a/datafusion/physical-expr-common/Cargo.toml +++ b/datafusion/physical-expr-common/Cargo.toml @@ -45,4 +45,4 @@ datafusion-common = { workspace = true, default-features = true } datafusion-expr = { workspace = true } half = { workspace = true } hashbrown = { version = "0.14", features = ["raw"] } -paste = "^1.0" \ No newline at end of file +paste = "^1.0" diff --git a/datafusion/physical-expr-common/src/expressions/cast.rs 
b/datafusion/physical-expr-common/src/expressions/cast.rs index 70d380419e62..b67187353f0c 100644 --- a/datafusion/physical-expr-common/src/expressions/cast.rs +++ b/datafusion/physical-expr-common/src/expressions/cast.rs @@ -16,8 +16,8 @@ // under the License. use crate::physical_expr::down_cast_any_ref; -use crate::sort_properties::SortProperties; use crate::physical_expr::PhysicalExpr; +use crate::sort_properties::SortProperties; use std::any::Any; use std::fmt; use std::hash::{Hash, Hasher}; diff --git a/datafusion/physical-expr-common/src/expressions/in_list.rs b/datafusion/physical-expr-common/src/expressions/in_list.rs index 752ca4ef80d6..e1f2e1d5ff68 100644 --- a/datafusion/physical-expr-common/src/expressions/in_list.rs +++ b/datafusion/physical-expr-common/src/expressions/in_list.rs @@ -22,9 +22,14 @@ use std::fmt::Debug; use std::hash::{Hash, Hasher}; use std::sync::Arc; -use crate::physical_expr::{down_cast_any_ref, physical_exprs_bag_equal}; use crate::physical_expr::PhysicalExpr; +use crate::physical_expr::{down_cast_any_ref, physical_exprs_bag_equal}; +use arrow::array::downcast_primitive_array; +use arrow::array::{ + as_largestring_array, downcast_array, downcast_dictionary_array, Array, + ArrayAccessor, ArrayData, ArrayIter, ArrayRef, BooleanArray, +}; use arrow::buffer::BooleanBuffer; use arrow::compute::kernels::boolean::{not, or_kleene}; use arrow::compute::kernels::cmp::eq; @@ -32,8 +37,6 @@ use arrow::compute::take; use arrow::datatypes::{i256, DataType, Schema}; use arrow::record_batch::RecordBatch; use arrow::util::bit_iterator::BitIndexIterator; -use arrow::array::{as_largestring_array, downcast_array, downcast_dictionary_array, Array, ArrayAccessor, ArrayData, ArrayIter, ArrayRef, BooleanArray}; -use arrow::array::downcast_primitive_array; use datafusion_common::cast::{ as_boolean_array, as_generic_binary_array, as_string_array, }; @@ -474,7 +477,7 @@ mod tests { use datafusion_common::plan_err; use datafusion_common::Result; use 
datafusion_expr::type_coercion::binary::comparison_coercion; - + use arrow::datatypes::Field; use arrow::datatypes::TimeUnit; diff --git a/datafusion/physical-expr/src/expressions/is_not_null.rs b/datafusion/physical-expr-common/src/expressions/is_not_null.rs similarity index 98% rename from datafusion/physical-expr/src/expressions/is_not_null.rs rename to datafusion/physical-expr-common/src/expressions/is_not_null.rs index 2e6a2bec9cab..c4d3e27b1fb5 100644 --- a/datafusion/physical-expr/src/expressions/is_not_null.rs +++ b/datafusion/physical-expr-common/src/expressions/is_not_null.rs @@ -21,7 +21,7 @@ use std::hash::{Hash, Hasher}; use std::{any::Any, sync::Arc}; use crate::physical_expr::down_cast_any_ref; -use crate::PhysicalExpr; +use crate::physical_expr::PhysicalExpr; use arrow::compute; use arrow::{ datatypes::{DataType, Schema}, @@ -115,7 +115,7 @@ pub fn is_not_null(arg: Arc) -> Result> #[cfg(test)] mod tests { use super::*; - use crate::expressions::col; + use crate::expressions::column::col; use arrow::{ array::{BooleanArray, StringArray}, datatypes::*, diff --git a/datafusion/physical-expr/src/expressions/is_null.rs b/datafusion/physical-expr-common/src/expressions/is_null.rs similarity index 98% rename from datafusion/physical-expr/src/expressions/is_null.rs rename to datafusion/physical-expr-common/src/expressions/is_null.rs index 3ad4058dd649..fa14f364354b 100644 --- a/datafusion/physical-expr/src/expressions/is_null.rs +++ b/datafusion/physical-expr-common/src/expressions/is_null.rs @@ -27,7 +27,7 @@ use arrow::{ }; use crate::physical_expr::down_cast_any_ref; -use crate::PhysicalExpr; +use crate::physical_expr::PhysicalExpr; use datafusion_common::Result; use datafusion_common::ScalarValue; use datafusion_expr::ColumnarValue; @@ -116,7 +116,7 @@ pub fn is_null(arg: Arc) -> Result> { #[cfg(test)] mod tests { use super::*; - use crate::expressions::col; + use crate::expressions::column::col; use arrow::{ array::{BooleanArray, StringArray}, 
datatypes::*, diff --git a/datafusion/physical-expr-common/src/expressions/like.rs b/datafusion/physical-expr-common/src/expressions/like.rs index 48114227b61c..e76e0b9f01ef 100644 --- a/datafusion/physical-expr-common/src/expressions/like.rs +++ b/datafusion/physical-expr-common/src/expressions/like.rs @@ -20,11 +20,11 @@ use std::{any::Any, sync::Arc}; use crate::physical_expr::{down_cast_any_ref, PhysicalExpr}; +use crate::expressions::datum::apply_cmp; use arrow::record_batch::RecordBatch; use arrow_schema::{DataType, Schema}; use datafusion_common::{internal_err, Result}; use datafusion_expr::ColumnarValue; -use crate::expressions::datum::apply_cmp; // Like expression #[derive(Debug, Hash)] diff --git a/datafusion/physical-expr-common/src/expressions/mod.rs b/datafusion/physical-expr-common/src/expressions/mod.rs index 329440e7ed62..48fd2a1a07a3 100644 --- a/datafusion/physical-expr-common/src/expressions/mod.rs +++ b/datafusion/physical-expr-common/src/expressions/mod.rs @@ -23,6 +23,8 @@ pub mod cast; pub mod column; pub mod datum; pub mod in_list; +pub mod is_not_null; +pub mod is_null; pub mod like; pub mod literal; pub mod negative; diff --git a/datafusion/physical-expr-common/src/expressions/negative.rs b/datafusion/physical-expr-common/src/expressions/negative.rs index 223fed728364..16044cf3abdc 100644 --- a/datafusion/physical-expr-common/src/expressions/negative.rs +++ b/datafusion/physical-expr-common/src/expressions/negative.rs @@ -22,8 +22,8 @@ use std::hash::{Hash, Hasher}; use std::sync::Arc; use crate::physical_expr::down_cast_any_ref; -use crate::sort_properties::SortProperties; use crate::physical_expr::PhysicalExpr; +use crate::sort_properties::SortProperties; use arrow::{ compute::kernels::numeric::neg_wrapping, diff --git a/datafusion/physical-expr-common/src/physical_expr.rs b/datafusion/physical-expr-common/src/physical_expr.rs index 3422f92ab92e..66a57b485af8 100644 --- a/datafusion/physical-expr-common/src/physical_expr.rs +++ 
b/datafusion/physical-expr-common/src/physical_expr.rs @@ -26,16 +26,24 @@ use arrow::datatypes::{DataType, Schema}; use arrow::record_batch::RecordBatch; use datafusion_common::utils::DataPtr; -use datafusion_common::{exec_err, internal_err, not_impl_err, DFSchema, Result, ScalarValue}; +use datafusion_common::{ + exec_err, internal_err, not_impl_err, plan_err, DFSchema, Result, ScalarValue, +}; use datafusion_expr::execution_props::ExecutionProps; use datafusion_expr::expr::{Alias, InList}; use datafusion_expr::interval_arithmetic::Interval; -use datafusion_expr::{Between, BinaryExpr, Cast, ColumnarValue, Expr, Like, Operator, TryCast}; +use datafusion_expr::var_provider::{is_system_variables, VarType}; +use datafusion_expr::{ + Between, BinaryExpr, Cast, ColumnarValue, Expr, GetFieldAccess, GetIndexedField, + Like, Operator, TryCast, +}; use crate::expressions::binary::binary; use crate::expressions::cast::cast; use crate::expressions::column::Column; use crate::expressions::in_list::in_list; +use crate::expressions::is_not_null::is_not_null; +use crate::expressions::is_null::is_null; use crate::expressions::like::like; use crate::expressions::literal::{lit, Literal}; use crate::expressions::negative::negative; @@ -330,25 +338,25 @@ pub fn create_physical_expr( Ok(Arc::new(Column::new(&c.name, idx))) } Expr::Literal(value) => Ok(Arc::new(Literal::new(value.clone()))), - // Expr::ScalarVariable(_, variable_names) => { - // if is_system_variables(variable_names) { - // match execution_props.get_var_provider(VarType::System) { - // Some(provider) => { - // let scalar_value = provider.get_value(variable_names.clone())?; - // Ok(Arc::new(Literal::new(scalar_value))) - // } - // _ => plan_err!("No system variable provider found"), - // } - // } else { - // match execution_props.get_var_provider(VarType::UserDefined) { - // Some(provider) => { - // let scalar_value = provider.get_value(variable_names.clone())?; - // Ok(Arc::new(Literal::new(scalar_value))) - // } - // 
_ => plan_err!("No user defined variable provider found"), - // } - // } - // } + Expr::ScalarVariable(_, variable_names) => { + if is_system_variables(variable_names) { + match execution_props.get_var_provider(VarType::System) { + Some(provider) => { + let scalar_value = provider.get_value(variable_names.clone())?; + Ok(Arc::new(Literal::new(scalar_value))) + } + _ => plan_err!("No system variable provider found"), + } + } else { + match execution_props.get_var_provider(VarType::UserDefined) { + Some(provider) => { + let scalar_value = provider.get_value(variable_names.clone())?; + Ok(Arc::new(Literal::new(scalar_value))) + } + _ => plan_err!("No user defined variable provider found"), + } + } + } Expr::IsTrue(expr) => { let binary_op = datafusion_expr::binary_expr( expr.as_ref().clone(), @@ -486,33 +494,29 @@ pub fn create_physical_expr( create_physical_expr(expr, input_dfschema, execution_props)?, input_schema, ), - // Expr::IsNull(expr) => expressions::is_null(create_physical_expr( - // expr, - // input_dfschema, - // execution_props, - // )?), - // Expr::IsNotNull(expr) => expressions::is_not_null(create_physical_expr( - // expr, - // input_dfschema, - // execution_props, - // )?), - // Expr::GetIndexedField(GetIndexedField { expr: _, field }) => match field { - // GetFieldAccess::NamedStructField { name: _ } => { - // internal_err!( - // "NamedStructField should be rewritten in OperatorToFunction" - // ) - // } - // GetFieldAccess::ListIndex { key: _ } => { - // internal_err!("ListIndex should be rewritten in OperatorToFunction") - // } - // GetFieldAccess::ListRange { - // start: _, - // stop: _, - // stride: _, - // } => { - // internal_err!("ListRange should be rewritten in OperatorToFunction") - // } - // }, + Expr::IsNull(expr) => { + is_null(create_physical_expr(expr, input_dfschema, execution_props)?) + } + Expr::IsNotNull(expr) => { + is_not_null(create_physical_expr(expr, input_dfschema, execution_props)?) 
+ } + Expr::GetIndexedField(GetIndexedField { expr: _, field }) => match field { + GetFieldAccess::NamedStructField { name: _ } => { + internal_err!( + "NamedStructField should be rewritten in OperatorToFunction" + ) + } + GetFieldAccess::ListIndex { key: _ } => { + internal_err!("ListIndex should be rewritten in OperatorToFunction") + } + GetFieldAccess::ListRange { + start: _, + stop: _, + stride: _, + } => { + internal_err!("ListRange should be rewritten in OperatorToFunction") + } + }, // Expr::ScalarFunction(ScalarFunction { func_def, args }) => { // let physical_args = @@ -568,9 +572,7 @@ pub fn create_physical_expr( list, negated, }) => match expr.as_ref() { - Expr::Literal(ScalarValue::Utf8(None)) => { - Ok(lit(ScalarValue::Boolean(None))) - } + Expr::Literal(ScalarValue::Utf8(None)) => Ok(lit(ScalarValue::Boolean(None))), _ => { let value_expr = create_physical_expr(expr, input_dfschema, execution_props)?; diff --git a/datafusion/physical-expr/src/expressions/mod.rs b/datafusion/physical-expr/src/expressions/mod.rs index 4f7bdc99424a..50cad65b829d 100644 --- a/datafusion/physical-expr/src/expressions/mod.rs +++ b/datafusion/physical-expr/src/expressions/mod.rs @@ -19,8 +19,6 @@ mod case; mod column; -mod is_not_null; -mod is_null; mod no_op; /// Module with some convenient methods used in expression building @@ -70,20 +68,26 @@ pub use datafusion_functions_aggregate::first_last::{ }; pub use case::{case, CaseExpr}; -pub use datafusion_physical_expr_common::expressions::cast::{cast, cast_with_options, CastExpr}; pub use column::UnKnownColumn; pub use datafusion_expr::utils::format_state_name; pub use datafusion_physical_expr_common::expressions::binary::{binary, BinaryExpr}; +pub use datafusion_physical_expr_common::expressions::cast::{ + cast, cast_with_options, CastExpr, +}; pub use datafusion_physical_expr_common::expressions::column::{col, Column}; -pub use datafusion_physical_expr_common::expressions::literal::{lit, Literal}; -pub use 
datafusion_physical_expr_common::expressions::try_cast::{try_cast, TryCastExpr}; pub use datafusion_physical_expr_common::expressions::in_list::{in_list, InListExpr}; -pub use is_not_null::{is_not_null, IsNotNullExpr}; -pub use is_null::{is_null, IsNullExpr}; +pub use datafusion_physical_expr_common::expressions::is_not_null::{ + is_not_null, IsNotNullExpr, +}; +pub use datafusion_physical_expr_common::expressions::is_null::{is_null, IsNullExpr}; pub use datafusion_physical_expr_common::expressions::like::{like, LikeExpr}; -pub use datafusion_physical_expr_common::expressions::negative::{negative, NegativeExpr}; -pub use no_op::NoOp; +pub use datafusion_physical_expr_common::expressions::literal::{lit, Literal}; +pub use datafusion_physical_expr_common::expressions::negative::{ + negative, NegativeExpr, +}; pub use datafusion_physical_expr_common::expressions::not::{not, NotExpr}; +pub use datafusion_physical_expr_common::expressions::try_cast::{try_cast, TryCastExpr}; +pub use no_op::NoOp; #[cfg(test)] pub(crate) mod tests { diff --git a/datafusion/physical-expr/src/physical_expr.rs b/datafusion/physical-expr/src/physical_expr.rs index 7e47bc5ba66b..9856e955c6c8 100644 --- a/datafusion/physical-expr/src/physical_expr.rs +++ b/datafusion/physical-expr/src/physical_expr.rs @@ -45,7 +45,6 @@ pub fn physical_exprs_equal( lhs.len() == rhs.len() && izip!(lhs, rhs).all(|(lhs, rhs)| lhs.eq(rhs)) } - /// This utility function removes duplicates from the given `exprs` vector. /// Note that this function does not necessarily preserve its input ordering. pub fn deduplicate_physical_exprs(exprs: &mut Vec>) {