From 1cbbf62640b09157229c5e3789d77b4056956506 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Wed, 19 Nov 2025 07:36:50 +0800 Subject: [PATCH 1/6] Refactor InListExpr to support structs by re-using existing hashing infrastructure (#18449) This PR is part of an EPIC to push down hash table references from HashJoinExec into scans. The EPIC is tracked in https://github.com/apache/datafusion/issues/17171. A "target state" is tracked in https://github.com/apache/datafusion/pull/18393. There is a series of PRs to get us to this target state in smaller more reviewable changes that are still valuable on their own: - https://github.com/apache/datafusion/pull/18448 - (This PR): https://github.com/apache/datafusion/pull/18449 (depends on https://github.com/apache/datafusion/pull/18448) - https://github.com/apache/datafusion/pull/18451 - Enhance InListExpr to efficiently store homogeneous lists as arrays and avoid a conversion to Vec by adding an internal InListStorage enum with Array and Exprs variants - Re-use existing hashing and comparison utilities to support Struct arrays and other complex types - Add public function `in_list_from_array(expr, list_array, negated)` for creating InList from arrays Although the diff looks large most of it is actually tests and docs. I think the actual code change is a negative LOC change, or at least negative complexity (eliminates a trait, a macro, matching on data types). --------- Co-authored-by: David Hewitt Co-authored-by: Andrew Lamb (cherry picked from commit 486c5d860b02fba8aca53a10fca918d3e23e3d26) --- datafusion/common/src/hash_utils.rs | 178 +- .../physical-expr/src/expressions/in_list.rs | 1667 +++++++++++++++-- datafusion/physical-plan/src/joins/utils.rs | 2 +- datafusion/sqllogictest/test_files/array.slt | 35 +- datafusion/sqllogictest/test_files/expr.slt | 207 ++ .../test_files/tpch/plans/q19.slt.part | 18 +- .../test_files/tpch/plans/q22.slt.part | 26 +- 7 files changed, 1902 insertions(+), 231 deletions(-) diff --git a/datafusion/common/src/hash_utils.rs b/datafusion/common/src/hash_utils.rs index d60189fb6fa3f..0fa47671d303a 100644 --- a/datafusion/common/src/hash_utils.rs +++ b/datafusion/common/src/hash_utils.rs @@ -31,8 +31,8 @@ use crate::cast::{ as_string_array, as_string_view_array, as_struct_array, }; use crate::error::Result; -#[cfg(not(feature = "force_hash_collisions"))] -use crate::error::_internal_err; +use crate::error::{_internal_datafusion_err, _internal_err}; +use std::cell::RefCell; // Combines two hashes into one hash #[inline] @@ -41,6 +41,94 @@ pub fn combine_hashes(l: u64, r: u64) -> u64 { hash.wrapping_mul(37).wrapping_add(r) } +/// Maximum size for the thread-local hash buffer before truncation (4MB = 524,288 u64 elements). +/// The goal of this is to avoid unbounded memory growth that would appear as a memory leak. +/// We allow temporary allocations beyond this size, but after use the buffer is truncated +/// to this size. +const MAX_BUFFER_SIZE: usize = 524_288; + +thread_local! { + /// Thread-local buffer for hash computations to avoid repeated allocations. + /// The buffer is reused across calls and truncated if it exceeds MAX_BUFFER_SIZE. + /// Defaults to a capacity of 8192 u64 elements which is the default batch size. + /// This corresponds to 64KB of memory. 
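For context while reading the diff, a minimal usage sketch of the array-based constructor this patch adds (illustrative only, not part of the patch hunks; the `datafusion_physical_expr` import paths are assumed and may differ from the crate's actual re-exports):

    use std::sync::Arc;

    use arrow::array::{Array, ArrayRef, Int32Array};
    use arrow::datatypes::{DataType, Field, Schema};
    use arrow::record_batch::RecordBatch;
    use datafusion_common::Result;
    use datafusion_physical_expr::expressions::{col, InListExpr};
    use datafusion_physical_expr::PhysicalExpr;

    fn in_list_from_array_sketch() -> Result<()> {
        let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);
        let batch = RecordBatch::try_new(
            Arc::new(schema.clone()),
            vec![Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef],
        )?;

        // Build the IN list directly from an ArrayRef instead of a Vec of literal
        // expressions; the array is stored directly in the static filter.
        let list: ArrayRef = Arc::new(Int32Array::from(vec![1, 3]));
        let expr = InListExpr::try_new_from_array(col("a", &schema)?, list, false)?;

        // For the batch above this evaluates to [true, false, true].
        let result = expr.evaluate(&batch)?.into_array(batch.num_rows())?;
        assert_eq!(result.len(), 3);
        Ok(())
    }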
+ static HASH_BUFFER: RefCell> = const { RefCell::new(Vec::new()) }; +} + +/// Creates hashes for the given arrays using a thread-local buffer, then calls the provided callback +/// with an immutable reference to the computed hashes. +/// +/// This function manages a thread-local buffer to avoid repeated allocations. The buffer is automatically +/// truncated if it exceeds `MAX_BUFFER_SIZE` after use. +/// +/// # Arguments +/// * `arrays` - The arrays to hash (must contain at least one array) +/// * `random_state` - The random state for hashing +/// * `callback` - A function that receives an immutable reference to the hash slice and returns a result +/// +/// # Errors +/// Returns an error if: +/// - No arrays are provided +/// - The function is called reentrantly (i.e., the callback invokes `with_hashes` again on the same thread) +/// - The function is called during or after thread destruction +/// +/// # Example +/// ```ignore +/// use datafusion_common::hash_utils::{with_hashes, RandomState}; +/// use arrow::array::{Int32Array, ArrayRef}; +/// use std::sync::Arc; +/// +/// let array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3])); +/// let random_state = RandomState::new(); +/// +/// let result = with_hashes([&array], &random_state, |hashes| { +/// // Use the hashes here +/// Ok(hashes.len()) +/// })?; +/// ``` +pub fn with_hashes( + arrays: I, + random_state: &RandomState, + callback: F, +) -> Result +where + I: IntoIterator, + T: AsDynArray, + F: FnOnce(&[u64]) -> Result, +{ + // Peek at the first array to determine buffer size without fully collecting + let mut iter = arrays.into_iter().peekable(); + + // Get the required size from the first array + let required_size = match iter.peek() { + Some(arr) => arr.as_dyn_array().len(), + None => return _internal_err!("with_hashes requires at least one array"), + }; + + HASH_BUFFER.try_with(|cell| { + let mut buffer = cell.try_borrow_mut() + .map_err(|_| _internal_datafusion_err!("with_hashes cannot be called reentrantly on the same thread"))?; + + // Ensure buffer has sufficient length, clearing old values + buffer.clear(); + buffer.resize(required_size, 0); + + // Create hashes in the buffer - this consumes the iterator + create_hashes(iter, random_state, &mut buffer[..required_size])?; + + // Execute the callback with an immutable slice + let result = callback(&buffer[..required_size])?; + + // Cleanup: truncate if buffer grew too large + if buffer.capacity() > MAX_BUFFER_SIZE { + buffer.truncate(MAX_BUFFER_SIZE); + buffer.shrink_to_fit(); + } + + Ok(result) + }).map_err(|_| _internal_datafusion_err!("with_hashes cannot access thread-local storage during or after thread destruction"))? 
+} + #[cfg(not(feature = "force_hash_collisions"))] fn hash_null(random_state: &RandomState, hashes_buffer: &'_ mut [u64], mul_col: bool) { if mul_col { @@ -478,8 +566,8 @@ impl AsDynArray for &ArrayRef { pub fn create_hashes<'a, I, T>( arrays: I, random_state: &RandomState, - hashes_buffer: &'a mut Vec, -) -> Result<&'a mut Vec> + hashes_buffer: &'a mut [u64], +) -> Result<&'a mut [u64]> where I: IntoIterator, T: AsDynArray, @@ -522,7 +610,7 @@ mod tests { fn create_hashes_for_empty_fixed_size_lit() -> Result<()> { let empty_array = FixedSizeListBuilder::new(StringBuilder::new(), 1).finish(); let random_state = RandomState::with_seeds(0, 0, 0, 0); - let hashes_buff = &mut vec![0; 0]; + let hashes_buff = &mut [0; 0]; let hashes = create_hashes( &[Arc::new(empty_array) as ArrayRef], &random_state, @@ -1000,4 +1088,84 @@ mod tests { assert_eq!(hashes1, hashes2); } + + #[test] + fn test_with_hashes() { + let array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4])); + let random_state = RandomState::with_seeds(0, 0, 0, 0); + + // Test that with_hashes produces the same results as create_hashes + let mut expected_hashes = vec![0; array.len()]; + create_hashes([&array], &random_state, &mut expected_hashes).unwrap(); + + let result = with_hashes([&array], &random_state, |hashes| { + assert_eq!(hashes.len(), 4); + // Verify hashes match expected values + assert_eq!(hashes, &expected_hashes[..]); + // Return a copy of the hashes + Ok(hashes.to_vec()) + }) + .unwrap(); + + // Verify callback result is returned correctly + assert_eq!(result, expected_hashes); + } + + #[test] + fn test_with_hashes_multi_column() { + let int_array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3])); + let str_array: ArrayRef = Arc::new(StringArray::from(vec!["a", "b", "c"])); + let random_state = RandomState::with_seeds(0, 0, 0, 0); + + // Test multi-column hashing + let mut expected_hashes = vec![0; int_array.len()]; + create_hashes( + [&int_array, &str_array], + &random_state, + &mut expected_hashes, + ) + .unwrap(); + + with_hashes([&int_array, &str_array], &random_state, |hashes| { + assert_eq!(hashes.len(), 3); + assert_eq!(hashes, &expected_hashes[..]); + Ok(()) + }) + .unwrap(); + } + + #[test] + fn test_with_hashes_empty_arrays() { + let random_state = RandomState::with_seeds(0, 0, 0, 0); + + // Test that passing no arrays returns an error + let empty: [&ArrayRef; 0] = []; + let result = with_hashes(empty, &random_state, |_hashes| Ok(())); + + assert!(result.is_err()); + assert!(result + .unwrap_err() + .to_string() + .contains("requires at least one array")); + } + + #[test] + fn test_with_hashes_reentrancy() { + let array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3])); + let array2: ArrayRef = Arc::new(Int32Array::from(vec![4, 5, 6])); + let random_state = RandomState::with_seeds(0, 0, 0, 0); + + // Test that reentrant calls return an error instead of panicking + let result = with_hashes([&array], &random_state, |_hashes| { + // Try to call with_hashes again inside the callback + with_hashes([&array2], &random_state, |_inner_hashes| Ok(())) + }); + + assert!(result.is_err()); + let err_msg = result.unwrap_err().to_string(); + assert!( + err_msg.contains("reentrantly") || err_msg.contains("cannot be called"), + "Error message should mention reentrancy: {err_msg}", + ); + } } diff --git a/datafusion/physical-expr/src/expressions/in_list.rs b/datafusion/physical-expr/src/expressions/in_list.rs index fa91635d9bfd9..10197f1e97b28 100644 --- a/datafusion/physical-expr/src/expressions/in_list.rs +++ 
b/datafusion/physical-expr/src/expressions/in_list.rs @@ -25,34 +25,36 @@ use std::sync::Arc; use crate::physical_expr::physical_exprs_bag_equal; use crate::PhysicalExpr; -use arrow::array::types::{IntervalDayTime, IntervalMonthDayNano}; use arrow::array::*; use arrow::buffer::BooleanBuffer; use arrow::compute::kernels::boolean::{not, or_kleene}; -use arrow::compute::take; +use arrow::compute::{take, SortOptions}; use arrow::datatypes::*; use arrow::util::bit_iterator::BitIndexIterator; -use arrow::{downcast_dictionary_array, downcast_primitive_array}; -use datafusion_common::cast::{ - as_boolean_array, as_generic_binary_array, as_string_array, -}; -use datafusion_common::hash_utils::HashValue; +use datafusion_common::hash_utils::with_hashes; use datafusion_common::{ - exec_err, internal_err, not_impl_err, DFSchema, Result, ScalarValue, + DFSchema, HashSet, Result, ScalarValue, exec_datafusion_err, exec_err, internal_err }; -use datafusion_expr::ColumnarValue; -use datafusion_physical_expr_common::datum::compare_with_eq; +use datafusion_expr::{expr_vec_fmt, ColumnarValue}; use ahash::RandomState; use datafusion_common::HashMap; use hashbrown::hash_map::RawEntryMut; +/// Trait for InList static filters +trait StaticFilter { + fn null_count(&self) -> usize; + + /// Checks if values in `v` are contained in the filter + fn contains(&self, v: &dyn Array, negated: bool) -> Result; +} + /// InList pub struct InListExpr { expr: Arc, list: Vec>, negated: bool, - static_filter: Option>, + static_filter: Option>, } impl Debug for InListExpr { @@ -65,13 +67,10 @@ impl Debug for InListExpr { } } -/// A type-erased container of array elements -pub trait Set: Send + Sync { - fn contains(&self, v: &dyn Array, negated: bool) -> Result; - fn has_nulls(&self) -> bool; -} - -struct ArrayHashSet { +/// Static filter for InList that stores the array and hash set for O(1) lookups +#[derive(Debug, Clone)] +struct ArrayStaticFilter { + in_array: ArrayRef, state: RandomState, /// Used to provide a lookup from value to in list index /// @@ -80,30 +79,20 @@ struct ArrayHashSet { map: HashMap, } -struct ArraySet { - array: T, - hash_set: ArrayHashSet, -} - -impl ArraySet -where - T: Array + From, -{ - fn new(array: &T, hash_set: ArrayHashSet) -> Self { - Self { - array: downcast_array(array), - hash_set, - } +impl StaticFilter for ArrayStaticFilter { + fn null_count(&self) -> usize { + self.in_array.null_count() } -} -impl Set for ArraySet -where - T: Array + 'static, - for<'a> &'a T: ArrayAccessor, - for<'a> <&'a T as ArrayAccessor>::Item: IsEqual, -{ + /// Checks if values in `v` are contained in the `in_array` using this hash set for lookup. fn contains(&self, v: &dyn Array, negated: bool) -> Result { + // Null type comparisons always return null (SQL three-valued logic) + if v.data_type() == &DataType::Null + || self.in_array.data_type() == &DataType::Null + { + return Ok(BooleanArray::from(vec![None; v.len()])); + } + downcast_dictionary_array! { v => { let values_contains = self.contains(v.values().as_ref(), negated)?; @@ -113,100 +102,161 @@ where _ => {} } - let v = v.as_any().downcast_ref::().unwrap(); - let in_array = &self.array; - let has_nulls = in_array.null_count() != 0; + let needle_nulls = v.logical_nulls(); + let needle_nulls = needle_nulls.as_ref(); + let haystack_has_nulls = self.in_array.null_count() != 0; + + with_hashes([v], &self.state, |hashes| { + let cmp = make_comparator(v, &self.in_array, SortOptions::default())?; + Ok((0..v.len()) + .map(|i| { + // SQL three-valued logic: null IN (...) 
is always null + if needle_nulls.is_some_and(|nulls| nulls.is_null(i)) { + return None; + } - Ok(ArrayIter::new(v) - .map(|v| { - v.and_then(|v| { - let hash = v.hash_one(&self.hash_set.state); + let hash = hashes[i]; let contains = self - .hash_set .map .raw_entry() - .from_hash(hash, |idx| in_array.value(*idx).is_equal(&v)) + .from_hash(hash, |idx| cmp(i, *idx).is_eq()) .is_some(); match contains { true => Some(!negated), - false if has_nulls => None, + false if haystack_has_nulls => None, false => Some(negated), } }) - }) - .collect()) + .collect()) + }) } +} - fn has_nulls(&self) -> bool { - self.array.null_count() != 0 +fn instantiate_static_filter( + in_array: ArrayRef, +) -> Result> { + match in_array.data_type() { + DataType::Int32 => Ok(Arc::new(Int32StaticFilter::try_new(&in_array)?)), + _ => { + /* fall through to generic implementation */ + Ok(Arc::new(ArrayStaticFilter::try_new(in_array)?)) + } } } -/// Computes an [`ArrayHashSet`] for the provided [`Array`] if there -/// are nulls present or there are more than the configured number of -/// elements. -/// -/// Note: This is split into a separate function as higher-rank trait bounds currently -/// cause type inference to misbehave -fn make_hash_set(array: T) -> ArrayHashSet -where - T: ArrayAccessor, - T::Item: IsEqual, -{ - let state = RandomState::new(); - let mut map: HashMap = - HashMap::with_capacity_and_hasher(array.len(), ()); - - let insert_value = |idx| { - let value = array.value(idx); - let hash = value.hash_one(&state); - if let RawEntryMut::Vacant(v) = map - .raw_entry_mut() - .from_hash(hash, |x| array.value(*x).is_equal(&value)) - { - v.insert_with_hasher(hash, idx, (), |x| array.value(*x).hash_one(&state)); +impl ArrayStaticFilter { + /// Computes a [`StaticFilter`] for the provided [`Array`] if there + /// are nulls present or there are more than the configured number of + /// elements. + /// + /// Note: This is split into a separate function as higher-rank trait bounds currently + /// cause type inference to misbehave + fn try_new(in_array: ArrayRef) -> Result { + // Null type has no natural order - return empty hash set + if in_array.data_type() == &DataType::Null { + return Ok(ArrayStaticFilter { + in_array, + state: RandomState::new(), + map: HashMap::with_hasher(()), + }); } - }; - match array.nulls() { - Some(nulls) => { - BitIndexIterator::new(nulls.validity(), nulls.offset(), nulls.len()) - .for_each(insert_value) - } - None => (0..array.len()).for_each(insert_value), + let state = RandomState::new(); + let mut map: HashMap = HashMap::with_hasher(()); + + with_hashes([&in_array], &state, |hashes| -> Result<()> { + let cmp = make_comparator(&in_array, &in_array, SortOptions::default())?; + + let insert_value = |idx| { + let hash = hashes[idx]; + if let RawEntryMut::Vacant(v) = map + .raw_entry_mut() + .from_hash(hash, |x| cmp(*x, idx).is_eq()) + { + v.insert_with_hasher(hash, idx, (), |x| hashes[*x]); + } + }; + + match in_array.nulls() { + Some(nulls) => { + BitIndexIterator::new(nulls.validity(), nulls.offset(), nulls.len()) + .for_each(insert_value) + } + None => (0..in_array.len()).for_each(insert_value), + } + + Ok(()) + })?; + + Ok(Self { + in_array, + state, + map, + }) } +} - ArrayHashSet { state, map } +struct Int32StaticFilter { + null_count: usize, + values: HashSet, } -/// Creates a `Box` for the given list of `IN` expressions and `batch` -fn make_set(array: &dyn Array) -> Result> { - Ok(downcast_primitive_array! 
{ - array => Arc::new(ArraySet::new(array, make_hash_set(array))), - DataType::Boolean => { - let array = as_boolean_array(array)?; - Arc::new(ArraySet::new(array, make_hash_set(array))) - }, - DataType::Utf8 => { - let array = as_string_array(array)?; - Arc::new(ArraySet::new(array, make_hash_set(array))) - } - DataType::LargeUtf8 => { - let array = as_largestring_array(array); - Arc::new(ArraySet::new(array, make_hash_set(array))) - } - DataType::Binary => { - let array = as_generic_binary_array::(array)?; - Arc::new(ArraySet::new(array, make_hash_set(array))) - } - DataType::LargeBinary => { - let array = as_generic_binary_array::(array)?; - Arc::new(ArraySet::new(array, make_hash_set(array))) +impl Int32StaticFilter { + fn try_new(in_array: &ArrayRef) -> Result { + let in_array = in_array + .as_primitive_opt::() + .ok_or_else(|| exec_datafusion_err!("Failed to downcast array"))?; + + let mut values = HashSet::with_capacity(in_array.len()); + let null_count = in_array.null_count(); + + for v in in_array.iter().flatten() { + values.insert(v); } - DataType::Dictionary(_, _) => unreachable!("dictionary should have been flattened"), - d => return not_impl_err!("DataType::{d} not supported in InList") - }) + + Ok(Self { null_count, values }) + } +} + +impl StaticFilter for Int32StaticFilter { + fn null_count(&self) -> usize { + self.null_count + } + + fn contains(&self, v: &dyn Array, negated: bool) -> Result { + let v = v + .as_primitive_opt::() + .ok_or_else(|| exec_datafusion_err!("Failed to downcast array"))?; + + let result = match (v.null_count() > 0, negated) { + (true, false) => { + // has nulls, not negated" + BooleanArray::from_iter( + v.iter().map(|value| Some(self.values.contains(&value?))), + ) + } + (true, true) => { + // has nulls, negated + BooleanArray::from_iter( + v.iter().map(|value| Some(!self.values.contains(&value?))), + ) + } + (false, false) => { + //no null, not negated + BooleanArray::from_iter( + v.values().iter().map(|value| self.values.contains(value)), + ) + } + (false, true) => { + // no null, negated + BooleanArray::from_iter( + v.values().iter().map(|value| !self.values.contains(value)), + ) + } + }; + Ok(result) + } } /// Evaluates the list of expressions into an array, flattening any dictionaries @@ -231,56 +281,26 @@ fn evaluate_list( ScalarValue::iter_to_array(scalars) } -fn try_cast_static_filter_to_set( +/// Try to evaluate a list of expressions as constants. +/// +/// Returns an ArrayRef if all expressions are constants (can be evaluated on an +/// empty RecordBatch), otherwise returns an error. This is used to detect when +/// a list contains only literals, casts of literals, or other constant expressions. +fn try_evaluate_constant_list( list: &[Arc], schema: &Schema, -) -> Result> { +) -> Result { let batch = RecordBatch::new_empty(Arc::new(schema.clone())); - make_set(evaluate_list(list, &batch)?.as_ref()) -} - -/// Custom equality check function which is used with [`ArrayHashSet`] for existence check. -trait IsEqual: HashValue { - fn is_equal(&self, other: &Self) -> bool; -} - -impl IsEqual for &T { - fn is_equal(&self, other: &Self) -> bool { - T::is_equal(self, other) - } -} - -macro_rules! is_equal { - ($($t:ty),+) => { - $(impl IsEqual for $t { - fn is_equal(&self, other: &Self) -> bool { - self == other - } - })* - }; -} -is_equal!(i8, i16, i32, i64, i128, i256, u8, u16, u32, u64); -is_equal!(bool, str, [u8]); -is_equal!(IntervalDayTime, IntervalMonthDayNano); - -macro_rules! 
is_equal_float { - ($($t:ty),+) => { - $(impl IsEqual for $t { - fn is_equal(&self, other: &Self) -> bool { - self.to_bits() == other.to_bits() - } - })* - }; + evaluate_list(list, &batch) } -is_equal_float!(half::f16, f32, f64); impl InListExpr { /// Create a new InList expression - pub fn new( + fn new( expr: Arc, list: Vec>, negated: bool, - static_filter: Option>, + static_filter: Option>, ) -> Self { Self { expr, @@ -304,19 +324,37 @@ impl InListExpr { pub fn negated(&self) -> bool { self.negated } -} -#[macro_export] -macro_rules! expr_vec_fmt { - ( $ARRAY:expr ) => {{ - $ARRAY - .iter() - .map(|e| format!("{e}")) - .collect::>() - .join(", ") - }}; + /// Create a new InList expression directly from an array, bypassing expression evaluation. + /// + /// This is more efficient than `in_list()` when you already have the list as an array, + /// as it avoids the conversion: `ArrayRef -> Vec -> ArrayRef -> StaticFilter`. + /// Instead it goes directly: `ArrayRef -> StaticFilter`. + /// + /// The `list` field will be empty when using this constructor, as the array is stored + /// directly in the static filter. + /// + /// This does not make the expression any more performant at runtime, but it does make it slightly + /// cheaper to build. + pub fn try_new_from_array( + expr: Arc, + array: ArrayRef, + negated: bool, + ) -> Result { + let list = (0..array.len()) + .map(|i| { + let scalar = ScalarValue::try_from_array(array.as_ref(), i)?; + Ok(crate::expressions::lit(scalar) as Arc) + }) + .collect::>>()?; + Ok(Self::new( + expr, + list, + negated, + Some(instantiate_static_filter(array)?), + )) + } } - impl std::fmt::Display for InListExpr { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { let list = expr_vec_fmt!(self.list); @@ -351,7 +389,7 @@ impl PhysicalExpr for InListExpr { } if let Some(static_filter) = &self.static_filter { - Ok(static_filter.has_nulls()) + Ok(static_filter.null_count() > 0) } else { for expr in &self.list { if expr.nullable(input_schema)? { @@ -366,18 +404,85 @@ impl PhysicalExpr for InListExpr { let num_rows = batch.num_rows(); let value = self.expr.evaluate(batch)?; let r = match &self.static_filter { - Some(f) => f.contains(value.into_array(num_rows)?.as_ref(), self.negated)?, + Some(filter) => { + match value { + ColumnarValue::Array(array) => { + filter.contains(&array, self.negated)? + } + ColumnarValue::Scalar(scalar) => { + if scalar.is_null() { + // SQL three-valued logic: null IN (...) 
is always null + // The code below would handle this correctly but this is a faster path + return Ok(ColumnarValue::Array(Arc::new( + BooleanArray::from(vec![None; num_rows]), + ))); + } + // Use a 1 row array to avoid code duplication/branching + // Since all we do is compute hash and lookup this should be efficient enough + let array = scalar.to_array()?; + let result_array = + filter.contains(array.as_ref(), self.negated)?; + // Broadcast the single result to all rows + // Must check is_null() to preserve NULL values (SQL three-valued logic) + if result_array.is_null(0) { + BooleanArray::from(vec![None; num_rows]) + } else { + BooleanArray::from_iter(std::iter::repeat_n( + result_array.value(0), + num_rows, + )) + } + } + } + } None => { + // No static filter: iterate through each expression, compare, and OR results let value = value.into_array(num_rows)?; - let is_nested = value.data_type().is_nested(); let found = self.list.iter().map(|expr| expr.evaluate(batch)).try_fold( BooleanArray::new(BooleanBuffer::new_unset(num_rows), None), |result, expr| -> Result { - let rhs = compare_with_eq( - &value, - &expr?.into_array(num_rows)?, - is_nested, - )?; + let rhs = match expr? { + ColumnarValue::Array(array) => { + let cmp = make_comparator( + value.as_ref(), + array.as_ref(), + SortOptions::default(), + )?; + (0..num_rows) + .map(|i| { + if value.is_null(i) || array.is_null(i) { + return None; + } + Some(cmp(i, i).is_eq()) + }) + .collect::() + } + ColumnarValue::Scalar(scalar) => { + // Check if scalar is null once, before the loop + if scalar.is_null() { + // If scalar is null, all comparisons return null + BooleanArray::from(vec![None; num_rows]) + } else { + // Convert scalar to 1-element array + let array = scalar.to_array()?; + let cmp = make_comparator( + value.as_ref(), + array.as_ref(), + SortOptions::default(), + )?; + // Compare each row of value with the single scalar element + (0..num_rows) + .map(|i| { + if value.is_null(i) { + None + } else { + Some(cmp(i, 0).is_eq()) + } + }) + .collect::() + } + } + }; Ok(or_kleene(&result, &rhs)?) 
}, )?; @@ -393,8 +498,7 @@ impl PhysicalExpr for InListExpr { } fn children(&self) -> Vec<&Arc> { - let mut children = vec![]; - children.push(&self.expr); + let mut children = vec![&self.expr]; children.extend(&self.list); children } @@ -408,7 +512,7 @@ impl PhysicalExpr for InListExpr { Arc::clone(&children[0]), children[1..].to_vec(), self.negated, - self.static_filter.clone(), + self.static_filter.as_ref().map(Arc::clone), ))) } @@ -443,8 +547,8 @@ impl Hash for InListExpr { fn hash(&self, state: &mut H) { self.expr.hash(state); self.negated.hash(state); - self.list.hash(state); // Add `self.static_filter` when hash is available + self.list.hash(state); } } @@ -465,7 +569,15 @@ pub fn in_list( ); } } - let static_filter = try_cast_static_filter_to_set(&list, schema).ok(); + + // Try to create a static filter for constant expressions + let static_filter = try_evaluate_constant_list(&list, schema) + .and_then(ArrayStaticFilter::try_new) + .ok() + .map(|static_filter| { + Arc::new(static_filter) as Arc + }); + Ok(Arc::new(InListExpr::new( expr, list, @@ -479,11 +591,12 @@ mod tests { use super::*; use crate::expressions; use crate::expressions::{col, lit, try_cast}; + use arrow::buffer::NullBuffer; use datafusion_common::plan_err; use datafusion_expr::type_coercion::binary::comparison_coercion; use datafusion_physical_expr_common::physical_expr::fmt_sql; use insta::assert_snapshot; - use itertools::Itertools as _; + use itertools::Itertools; type InListCastResult = (Arc, Vec>); @@ -519,6 +632,14 @@ mod tests { } } + fn try_cast_static_filter_to_set( + list: &[Arc], + schema: &Schema, + ) -> Result { + let array = try_evaluate_constant_list(list, schema)?; + ArrayStaticFilter::try_new(array) + } + // Attempts to coerce the types of `list_type` to be comparable with the // `expr_type` fn get_coerce_type(expr_type: &DataType, list_type: &[DataType]) -> Option { @@ -529,7 +650,18 @@ mod tests { }) } - // applies the in_list expr to an input batch and list + /// Test helper macro that evaluates an IN LIST expression with automatic type casting. + /// + /// # Parameters + /// - `$BATCH`: The `RecordBatch` containing the input data to evaluate against + /// - `$LIST`: A `Vec>` of literal expressions representing the IN list values + /// - `$NEGATED`: A `&bool` indicating whether this is a NOT IN operation (true) or IN operation (false) + /// - `$EXPECTED`: A `Vec>` representing the expected boolean results for each row + /// - `$COL`: An `Arc` representing the column expression to evaluate + /// - `$SCHEMA`: A `&Schema` reference for the input batch + /// + /// This macro first applies type casting to the column and list expressions to ensure + /// type compatibility, then delegates to `in_list_raw!` to perform the evaluation and assertion. macro_rules! in_list { ($BATCH:expr, $LIST:expr, $NEGATED:expr, $EXPECTED:expr, $COL:expr, $SCHEMA:expr) => {{ let (cast_expr, cast_list_exprs) = in_list_cast($COL, $LIST, $SCHEMA)?; @@ -544,7 +676,19 @@ mod tests { }}; } - // applies the in_list expr to an input batch and list without cast + /// Test helper macro that evaluates an IN LIST expression without automatic type casting. 
+ /// + /// # Parameters + /// - `$BATCH`: The `RecordBatch` containing the input data to evaluate against + /// - `$LIST`: A `Vec>` of literal expressions representing the IN list values + /// - `$NEGATED`: A `&bool` indicating whether this is a NOT IN operation (true) or IN operation (false) + /// - `$EXPECTED`: A `Vec>` representing the expected boolean results for each row + /// - `$COL`: An `Arc` representing the column expression to evaluate + /// - `$SCHEMA`: A `&Schema` reference for the input batch + /// + /// This macro creates an IN LIST expression, evaluates it against the batch, converts the result + /// to a `BooleanArray`, and asserts that it matches the expected output. Use this when the column + /// and list expressions are already the correct types and don't require casting. macro_rules! in_list_raw { ($BATCH:expr, $LIST:expr, $NEGATED:expr, $EXPECTED:expr, $COL:expr, $SCHEMA:expr) => {{ let expr = in_list($COL, $LIST, $NEGATED, $SCHEMA).unwrap(); @@ -552,8 +696,7 @@ mod tests { .evaluate(&$BATCH)? .into_array($BATCH.num_rows()) .expect("Failed to convert to array"); - let result = - as_boolean_array(&result).expect("failed to downcast to BooleanArray"); + let result = as_boolean_array(&result); let expected = &BooleanArray::from($EXPECTED); assert_eq!(expected, result); }}; @@ -1134,10 +1277,10 @@ mod tests { expressions::cast(lit(2i32), &schema, DataType::Int64)?, try_cast(lit(3.13f32), &schema, DataType::Int64)?, ]; - let result = try_cast_static_filter_to_set(&phy_exprs, &schema).unwrap(); + let static_filter = try_cast_static_filter_to_set(&phy_exprs, &schema).unwrap(); let array = Int64Array::from(vec![1, 2, 3, 4]); - let r = result.contains(&array, false).unwrap(); + let r = static_filter.contains(&array, false).unwrap(); assert_eq!(r, BooleanArray::from(vec![true, true, true, false])); try_cast_static_filter_to_set(&phy_exprs, &schema).unwrap(); @@ -1514,4 +1657,1166 @@ mod tests { assert_snapshot!(display_string, @"a@0 NOT IN (SET) ([a, b, NULL])"); Ok(()) } + + #[test] + fn in_list_struct() -> Result<()> { + // Create schema with a struct column + let struct_fields = Fields::from(vec![ + Field::new("x", DataType::Int32, false), + Field::new("y", DataType::Utf8, false), + ]); + let schema = Schema::new(vec![Field::new( + "a", + DataType::Struct(struct_fields.clone()), + true, + )]); + + // Create test data: array of structs + let x_array = Arc::new(Int32Array::from(vec![1, 2, 3])); + let y_array = Arc::new(StringArray::from(vec!["a", "b", "c"])); + let struct_array = + StructArray::new(struct_fields.clone(), vec![x_array, y_array], None); + + let col_a = col("a", &schema)?; + let batch = + RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(struct_array)])?; + + // Create literal structs for the IN list + // Struct {x: 1, y: "a"} + let struct1 = ScalarValue::Struct(Arc::new(StructArray::new( + struct_fields.clone(), + vec![ + Arc::new(Int32Array::from(vec![1])), + Arc::new(StringArray::from(vec!["a"])), + ], + None, + ))); + + // Struct {x: 3, y: "c"} + let struct3 = ScalarValue::Struct(Arc::new(StructArray::new( + struct_fields.clone(), + vec![ + Arc::new(Int32Array::from(vec![3])), + Arc::new(StringArray::from(vec!["c"])), + ], + None, + ))); + + // Test: a IN ({1, "a"}, {3, "c"}) + let list = vec![lit(struct1.clone()), lit(struct3.clone())]; + in_list_raw!( + batch, + list.clone(), + &false, + vec![Some(true), Some(false), Some(true)], + Arc::clone(&col_a), + &schema + ); + + // Test: a NOT IN ({1, "a"}, {3, "c"}) + in_list_raw!( + batch, + list, + 
&true, + vec![Some(false), Some(true), Some(false)], + Arc::clone(&col_a), + &schema + ); + + Ok(()) + } + + #[test] + fn in_list_struct_with_nulls() -> Result<()> { + // Create schema with a struct column + let struct_fields = Fields::from(vec![ + Field::new("x", DataType::Int32, false), + Field::new("y", DataType::Utf8, false), + ]); + let schema = Schema::new(vec![Field::new( + "a", + DataType::Struct(struct_fields.clone()), + true, + )]); + + // Create test data with a null struct + let x_array = Arc::new(Int32Array::from(vec![1, 2])); + let y_array = Arc::new(StringArray::from(vec!["a", "b"])); + let struct_array = StructArray::new( + struct_fields.clone(), + vec![x_array, y_array], + Some(NullBuffer::from(vec![true, false])), + ); + + let col_a = col("a", &schema)?; + let batch = + RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(struct_array)])?; + + // Create literal struct for the IN list + let struct1 = ScalarValue::Struct(Arc::new(StructArray::new( + struct_fields.clone(), + vec![ + Arc::new(Int32Array::from(vec![1])), + Arc::new(StringArray::from(vec!["a"])), + ], + None, + ))); + + // Test: a IN ({1, "a"}) + let list = vec![lit(struct1.clone())]; + in_list_raw!( + batch, + list.clone(), + &false, + vec![Some(true), None], + Arc::clone(&col_a), + &schema + ); + + // Test: a NOT IN ({1, "a"}) + in_list_raw!( + batch, + list, + &true, + vec![Some(false), None], + Arc::clone(&col_a), + &schema + ); + + Ok(()) + } + + #[test] + fn in_list_struct_with_null_in_list() -> Result<()> { + // Create schema with a struct column + let struct_fields = Fields::from(vec![ + Field::new("x", DataType::Int32, false), + Field::new("y", DataType::Utf8, false), + ]); + let schema = Schema::new(vec![Field::new( + "a", + DataType::Struct(struct_fields.clone()), + true, + )]); + + // Create test data + let x_array = Arc::new(Int32Array::from(vec![1, 2, 3])); + let y_array = Arc::new(StringArray::from(vec!["a", "b", "c"])); + let struct_array = + StructArray::new(struct_fields.clone(), vec![x_array, y_array], None); + + let col_a = col("a", &schema)?; + let batch = + RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(struct_array)])?; + + // Create literal structs including a NULL + let struct1 = ScalarValue::Struct(Arc::new(StructArray::new( + struct_fields.clone(), + vec![ + Arc::new(Int32Array::from(vec![1])), + Arc::new(StringArray::from(vec!["a"])), + ], + None, + ))); + + let null_struct = ScalarValue::Struct(Arc::new(StructArray::new_null( + struct_fields.clone(), + 1, + ))); + + // Test: a IN ({1, "a"}, NULL) + let list = vec![lit(struct1), lit(null_struct.clone())]; + in_list_raw!( + batch, + list.clone(), + &false, + vec![Some(true), None, None], + Arc::clone(&col_a), + &schema + ); + + // Test: a NOT IN ({1, "a"}, NULL) + in_list_raw!( + batch, + list, + &true, + vec![Some(false), None, None], + Arc::clone(&col_a), + &schema + ); + + Ok(()) + } + + #[test] + fn in_list_nested_struct() -> Result<()> { + // Create nested struct schema + let inner_struct_fields = Fields::from(vec![ + Field::new("a", DataType::Int32, false), + Field::new("b", DataType::Utf8, false), + ]); + let outer_struct_fields = Fields::from(vec![ + Field::new( + "inner", + DataType::Struct(inner_struct_fields.clone()), + false, + ), + Field::new("c", DataType::Int32, false), + ]); + let schema = Schema::new(vec![Field::new( + "x", + DataType::Struct(outer_struct_fields.clone()), + true, + )]); + + // Create test data with nested structs + let inner1 = Arc::new(StructArray::new( + 
inner_struct_fields.clone(), + vec![ + Arc::new(Int32Array::from(vec![1, 2])), + Arc::new(StringArray::from(vec!["x", "y"])), + ], + None, + )); + let c_array = Arc::new(Int32Array::from(vec![10, 20])); + let outer_array = + StructArray::new(outer_struct_fields.clone(), vec![inner1, c_array], None); + + let col_x = col("x", &schema)?; + let batch = + RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(outer_array)])?; + + // Create a nested struct literal matching the first row + let inner_match = Arc::new(StructArray::new( + inner_struct_fields.clone(), + vec![ + Arc::new(Int32Array::from(vec![1])), + Arc::new(StringArray::from(vec!["x"])), + ], + None, + )); + let outer_match = ScalarValue::Struct(Arc::new(StructArray::new( + outer_struct_fields.clone(), + vec![inner_match, Arc::new(Int32Array::from(vec![10]))], + None, + ))); + + // Test: x IN ({{1, "x"}, 10}) + let list = vec![lit(outer_match)]; + in_list_raw!( + batch, + list.clone(), + &false, + vec![Some(true), Some(false)], + Arc::clone(&col_x), + &schema + ); + + // Test: x NOT IN ({{1, "x"}, 10}) + in_list_raw!( + batch, + list, + &true, + vec![Some(false), Some(true)], + Arc::clone(&col_x), + &schema + ); + + Ok(()) + } + + #[test] + fn in_list_struct_with_exprs_not_array() -> Result<()> { + // Test InList using expressions (not the array constructor) with structs + // By using InListExpr::new directly, we bypass the array optimization + // and use the Exprs variant, testing the expression evaluation path + + // Create schema with a struct column {x: Int32, y: Utf8} + let struct_fields = Fields::from(vec![ + Field::new("x", DataType::Int32, false), + Field::new("y", DataType::Utf8, false), + ]); + let schema = Schema::new(vec![Field::new( + "a", + DataType::Struct(struct_fields.clone()), + true, + )]); + + // Create test data: array of structs [{1, "a"}, {2, "b"}, {3, "c"}] + let x_array = Arc::new(Int32Array::from(vec![1, 2, 3])); + let y_array = Arc::new(StringArray::from(vec!["a", "b", "c"])); + let struct_array = + StructArray::new(struct_fields.clone(), vec![x_array, y_array], None); + + let col_a = col("a", &schema)?; + let batch = + RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(struct_array)])?; + + // Create struct literals with the SAME shape (so types are compatible) + // Struct {x: 1, y: "a"} + let struct1 = ScalarValue::Struct(Arc::new(StructArray::new( + struct_fields.clone(), + vec![ + Arc::new(Int32Array::from(vec![1])), + Arc::new(StringArray::from(vec!["a"])), + ], + None, + ))); + + // Struct {x: 3, y: "c"} + let struct3 = ScalarValue::Struct(Arc::new(StructArray::new( + struct_fields.clone(), + vec![ + Arc::new(Int32Array::from(vec![3])), + Arc::new(StringArray::from(vec!["c"])), + ], + None, + ))); + + // Create list of struct expressions + let list = vec![lit(struct1), lit(struct3)]; + + // Use InListExpr::new directly (not in_list()) to bypass array optimization + // This creates an InList without a static filter + let expr = Arc::new(InListExpr::new(Arc::clone(&col_a), list, false, None)); + + // Verify that the expression doesn't have a static filter + // by checking the display string does NOT contain "(SET)" + let display_string = expr.to_string(); + assert!( + !display_string.contains("(SET)"), + "Expected display string to NOT contain '(SET)' (should use Exprs variant), but got: {display_string}", + ); + + // Evaluate the expression + let result = expr.evaluate(&batch)?.into_array(batch.num_rows())?; + let result = as_boolean_array(&result); + + // Expected: first row {1, "a"} 
matches struct1, + // second row {2, "b"} doesn't match, + // third row {3, "c"} matches struct3 + let expected = BooleanArray::from(vec![Some(true), Some(false), Some(true)]); + assert_eq!(result, &expected); + + // Test NOT IN as well + let expr_not = Arc::new(InListExpr::new( + Arc::clone(&col_a), + vec![ + lit(ScalarValue::Struct(Arc::new(StructArray::new( + struct_fields.clone(), + vec![ + Arc::new(Int32Array::from(vec![1])), + Arc::new(StringArray::from(vec!["a"])), + ], + None, + )))), + lit(ScalarValue::Struct(Arc::new(StructArray::new( + struct_fields.clone(), + vec![ + Arc::new(Int32Array::from(vec![3])), + Arc::new(StringArray::from(vec!["c"])), + ], + None, + )))), + ], + true, + None, + )); + + let result_not = expr_not.evaluate(&batch)?.into_array(batch.num_rows())?; + let result_not = as_boolean_array(&result_not); + + let expected_not = BooleanArray::from(vec![Some(false), Some(true), Some(false)]); + assert_eq!(result_not, &expected_not); + + Ok(()) + } + + #[test] + fn test_in_list_null_handling_comprehensive() -> Result<()> { + // Comprehensive test demonstrating SQL three-valued logic for IN expressions + // This test explicitly shows all possible outcomes: true, false, and null + let schema = Schema::new(vec![Field::new("a", DataType::Int64, true)]); + + // Test data: [1, 2, 3, null] + // - 1 will match in both lists + // - 2 will not match in either list + // - 3 will not match in either list + // - null is always null + let a = Int64Array::from(vec![Some(1), Some(2), Some(3), None]); + let col_a = col("a", &schema)?; + let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; + + // Case 1: List WITHOUT null - demonstrates true/false/null outcomes + // "a IN (1, 4)" - 1 matches, 2 and 3 don't match, null is null + let list = vec![lit(1i64), lit(4i64)]; + in_list!( + batch, + list, + &false, + vec![ + Some(true), // 1 is in the list → true + Some(false), // 2 is not in the list → false + Some(false), // 3 is not in the list → false + None, // null IN (...) → null (SQL three-valued logic) + ], + Arc::clone(&col_a), + &schema + ); + + // Case 2: List WITH null - demonstrates null propagation for non-matches + // "a IN (1, NULL)" - 1 matches (true), 2/3 don't match but list has null (null), null is null + let list = vec![lit(1i64), lit(ScalarValue::Int64(None))]; + in_list!( + batch, + list, + &false, + vec![ + Some(true), // 1 is in the list → true (found match) + None, // 2 is not in list, but list has NULL → null (might match NULL) + None, // 3 is not in list, but list has NULL → null (might match NULL) + None, // null IN (...) 
→ null (SQL three-valued logic) + ], + Arc::clone(&col_a), + &schema + ); + + Ok(()) + } + + #[test] + fn test_in_list_with_only_nulls() -> Result<()> { + // Edge case: IN list contains ONLY null values + let schema = Schema::new(vec![Field::new("a", DataType::Int64, true)]); + let a = Int64Array::from(vec![Some(1), Some(2), None]); + let col_a = col("a", &schema)?; + let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; + + // "a IN (NULL, NULL)" - list has only nulls + let list = vec![lit(ScalarValue::Int64(None)), lit(ScalarValue::Int64(None))]; + + // All results should be NULL because: + // - Non-null values (1, 2) can't match anything concrete, but list might contain matching value + // - NULL value is always NULL in IN expressions + in_list!( + batch, + list.clone(), + &false, + vec![None, None, None], + Arc::clone(&col_a), + &schema + ); + + // "a NOT IN (NULL, NULL)" - list has only nulls + // All results should still be NULL due to three-valued logic + in_list!( + batch, + list, + &true, + vec![None, None, None], + Arc::clone(&col_a), + &schema + ); + + Ok(()) + } + + #[test] + fn test_in_list_multiple_nulls_deduplication() -> Result<()> { + // Test that multiple NULLs in the list are handled correctly + // This verifies deduplication doesn't break null handling + let schema = Schema::new(vec![Field::new("a", DataType::Int64, true)]); + let col_a = col("a", &schema)?; + + // Create array with multiple nulls: [1, 2, NULL, NULL, 3, NULL] + let array = Arc::new(Int64Array::from(vec![ + Some(1), + Some(2), + None, + None, + Some(3), + None, + ])) as ArrayRef; + + // Create InListExpr from array + let expr = Arc::new(InListExpr::try_new_from_array( + Arc::clone(&col_a), + array, + false, + )?) as Arc; + + // Create test data: [1, 2, 3, 4, null] + let a = Int64Array::from(vec![Some(1), Some(2), Some(3), Some(4), None]); + let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; + + // Evaluate the expression + let result = expr.evaluate(&batch)?.into_array(batch.num_rows())?; + let result = as_boolean_array(&result); + + // Expected behavior with multiple NULLs in list: + // - Values in the list (1,2,3) → true + // - Values not in the list (4) → NULL (because list contains NULL) + // - NULL input → NULL + let expected = BooleanArray::from(vec![ + Some(true), // 1 is in list + Some(true), // 2 is in list + Some(true), // 3 is in list + None, // 4 not in list, but list has NULLs + None, // NULL input + ]); + assert_eq!(result, &expected); + + Ok(()) + } + + #[test] + fn test_not_in_null_handling_comprehensive() -> Result<()> { + // Comprehensive test demonstrating SQL three-valued logic for NOT IN expressions + // This test explicitly shows all possible outcomes for NOT IN: true, false, and null + let schema = Schema::new(vec![Field::new("a", DataType::Int64, true)]); + + // Test data: [1, 2, 3, null] + let a = Int64Array::from(vec![Some(1), Some(2), Some(3), None]); + let col_a = col("a", &schema)?; + let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; + + // Case 1: List WITHOUT null - demonstrates true/false/null outcomes for NOT IN + // "a NOT IN (1, 4)" - 1 matches (false), 2 and 3 don't match (true), null is null + let list = vec![lit(1i64), lit(4i64)]; + in_list!( + batch, + list, + &true, + vec![ + Some(false), // 1 is in the list → NOT IN returns false + Some(true), // 2 is not in the list → NOT IN returns true + Some(true), // 3 is not in the list → NOT IN returns true + None, // null NOT IN (...) 
→ null (SQL three-valued logic) + ], + Arc::clone(&col_a), + &schema + ); + + // Case 2: List WITH null - demonstrates null propagation for NOT IN + // "a NOT IN (1, NULL)" - 1 matches (false), 2/3 don't match but list has null (null), null is null + let list = vec![lit(1i64), lit(ScalarValue::Int64(None))]; + in_list!( + batch, + list, + &true, + vec![ + Some(false), // 1 is in the list → NOT IN returns false + None, // 2 is not in known values, but list has NULL → null (can't prove it's not in list) + None, // 3 is not in known values, but list has NULL → null (can't prove it's not in list) + None, // null NOT IN (...) → null (SQL three-valued logic) + ], + Arc::clone(&col_a), + &schema + ); + + Ok(()) + } + + #[test] + fn test_in_list_null_type_column() -> Result<()> { + // Test with a column that has DataType::Null (not just nullable values) + // All values in a NullArray are null by definition + let schema = Schema::new(vec![Field::new("a", DataType::Null, true)]); + let a = NullArray::new(3); + let col_a = col("a", &schema)?; + let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; + + // "null_column IN (1, 2)" - comparing Null type against Int64 list + // Note: This tests type coercion behavior between Null and Int64 + let list = vec![lit(1i64), lit(2i64)]; + + // All results should be NULL because: + // - Every value in the column is null (DataType::Null) + // - null IN (anything) always returns null per SQL three-valued logic + in_list!( + batch, + list.clone(), + &false, + vec![None, None, None], + Arc::clone(&col_a), + &schema + ); + + // "null_column NOT IN (1, 2)" + // Same behavior for NOT IN - null NOT IN (anything) is still null + in_list!( + batch, + list, + &true, + vec![None, None, None], + Arc::clone(&col_a), + &schema + ); + + Ok(()) + } + + #[test] + fn test_in_list_null_type_list() -> Result<()> { + // Test with a list that has DataType::Null + let schema = Schema::new(vec![Field::new("a", DataType::Int64, true)]); + let a = Int64Array::from(vec![Some(1), Some(2), None]); + let col_a = col("a", &schema)?; + + // Create a NullArray as the list + let null_array = Arc::new(NullArray::new(2)) as ArrayRef; + + // Try to create InListExpr with a NullArray list + // This tests whether try_new_from_array can handle Null type arrays + let expr = Arc::new(InListExpr::try_new_from_array( + Arc::clone(&col_a), + null_array, + false, + )?) as Arc; + let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; + let result = expr.evaluate(&batch)?.into_array(batch.num_rows())?; + let result = as_boolean_array(&result); + + // If it succeeds, all results should be NULL + // because the list contains only null type values + let expected = BooleanArray::from(vec![None, None, None]); + assert_eq!(result, &expected); + + Ok(()) + } + + #[test] + fn test_in_list_null_type_both() -> Result<()> { + // Test when both column and list are DataType::Null + let schema = Schema::new(vec![Field::new("a", DataType::Null, true)]); + let a = NullArray::new(3); + let col_a = col("a", &schema)?; + + // Create a NullArray as the list + let null_array = Arc::new(NullArray::new(2)) as ArrayRef; + + // Try to create InListExpr with both Null types + let expr = Arc::new(InListExpr::try_new_from_array( + Arc::clone(&col_a), + null_array, + false, + )?) 
as Arc; + + let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; + let result = expr.evaluate(&batch)?.into_array(batch.num_rows())?; + let result = as_boolean_array(&result); + + // If successful, all results should be NULL + // null IN [null, null] -> null + let expected = BooleanArray::from(vec![None, None, None]); + assert_eq!(result, &expected); + + Ok(()) + } + + #[test] + fn test_in_list_comprehensive_null_handling() -> Result<()> { + // Comprehensive test for IN LIST operations with various NULL handling scenarios. + // This test covers the key cases validated against DuckDB as the source of truth. + // + // Note: Some scalar literal tests (like NULL IN (1, 2)) are omitted as they + // appear to expose an issue with static filter optimization. These are covered + // by existing tests like in_list_no_cols(). + + let schema = Arc::new(Schema::new(vec![Field::new("b", DataType::Int32, true)])); + let col_b = col("b", &schema)?; + let null_i32 = ScalarValue::Int32(None); + + // Helper to create a batch + let make_batch = |values: Vec>| -> Result { + let array = Arc::new(Int32Array::from(values)); + Ok(RecordBatch::try_new(Arc::clone(&schema), vec![array])?) + }; + + // Helper to run a test + let run_test = |batch: &RecordBatch, + expr: Arc, + list: Vec>, + expected: Vec>| + -> Result<()> { + let in_expr = in_list(expr, list, &false, schema.as_ref())?; + let result = in_expr.evaluate(batch)?.into_array(batch.num_rows())?; + let result = as_boolean_array(&result); + assert_eq!(result, &BooleanArray::from(expected)); + Ok(()) + }; + + // ======================================================================== + // COLUMN TESTS - col(b) IN [1, 2] + // ======================================================================== + + // [1] IN (1, 2) => [TRUE] + let batch = make_batch(vec![Some(1)])?; + run_test( + &batch, + Arc::clone(&col_b), + vec![lit(1i32), lit(2i32)], + vec![Some(true)], + )?; + + // [1, 2] IN (1, 2) => [TRUE, TRUE] + let batch = make_batch(vec![Some(1), Some(2)])?; + run_test( + &batch, + Arc::clone(&col_b), + vec![lit(1i32), lit(2i32)], + vec![Some(true), Some(true)], + )?; + + // [3, 4] IN (1, 2) => [FALSE, FALSE] + let batch = make_batch(vec![Some(3), Some(4)])?; + run_test( + &batch, + Arc::clone(&col_b), + vec![lit(1i32), lit(2i32)], + vec![Some(false), Some(false)], + )?; + + // [1, NULL] IN (1, 2) => [TRUE, NULL] + let batch = make_batch(vec![Some(1), None])?; + run_test( + &batch, + Arc::clone(&col_b), + vec![lit(1i32), lit(2i32)], + vec![Some(true), None], + )?; + + // [3, NULL] IN (1, 2) => [FALSE, NULL] (no match, NULL is NULL) + let batch = make_batch(vec![Some(3), None])?; + run_test( + &batch, + Arc::clone(&col_b), + vec![lit(1i32), lit(2i32)], + vec![Some(false), None], + )?; + + // ======================================================================== + // COLUMN WITH NULL IN LIST - col(b) IN [NULL, 1] + // ======================================================================== + + // [1] IN (NULL, 1) => [TRUE] (found match) + let batch = make_batch(vec![Some(1)])?; + run_test( + &batch, + Arc::clone(&col_b), + vec![lit(null_i32.clone()), lit(1i32)], + vec![Some(true)], + )?; + + // [2] IN (NULL, 1) => [NULL] (no match, but list has NULL) + let batch = make_batch(vec![Some(2)])?; + run_test( + &batch, + Arc::clone(&col_b), + vec![lit(null_i32.clone()), lit(1i32)], + vec![None], + )?; + + // [NULL] IN (NULL, 1) => [NULL] + let batch = make_batch(vec![None])?; + run_test( + &batch, + Arc::clone(&col_b), + 
vec![lit(null_i32.clone()), lit(1i32)], + vec![None], + )?; + + // ======================================================================== + // COLUMN WITH ALL NULLS IN LIST - col(b) IN [NULL, NULL] + // ======================================================================== + + // [1] IN (NULL, NULL) => [NULL] + let batch = make_batch(vec![Some(1)])?; + run_test( + &batch, + Arc::clone(&col_b), + vec![lit(null_i32.clone()), lit(null_i32.clone())], + vec![None], + )?; + + // [NULL] IN (NULL, NULL) => [NULL] + let batch = make_batch(vec![None])?; + run_test( + &batch, + Arc::clone(&col_b), + vec![lit(null_i32.clone()), lit(null_i32.clone())], + vec![None], + )?; + + // ======================================================================== + // LITERAL IN LIST WITH COLUMN - lit(1) IN [2, col(b)] + // ======================================================================== + + // 1 IN (2, [1]) => [TRUE] (matches column value) + let batch = make_batch(vec![Some(1)])?; + run_test( + &batch, + lit(1i32), + vec![lit(2i32), Arc::clone(&col_b)], + vec![Some(true)], + )?; + + // 1 IN (2, [3]) => [FALSE] (no match) + let batch = make_batch(vec![Some(3)])?; + run_test( + &batch, + lit(1i32), + vec![lit(2i32), Arc::clone(&col_b)], + vec![Some(false)], + )?; + + // 1 IN (2, [NULL]) => [NULL] (no match, column is NULL) + let batch = make_batch(vec![None])?; + run_test( + &batch, + lit(1i32), + vec![lit(2i32), Arc::clone(&col_b)], + vec![None], + )?; + + // ======================================================================== + // COLUMN IN LIST CONTAINING ITSELF - col(b) IN [1, col(b)] + // ======================================================================== + + // [1] IN (1, [1]) => [TRUE] (always matches - either list literal or itself) + let batch = make_batch(vec![Some(1)])?; + run_test( + &batch, + Arc::clone(&col_b), + vec![lit(1i32), Arc::clone(&col_b)], + vec![Some(true)], + )?; + + // [2] IN (1, [2]) => [TRUE] (matches itself) + let batch = make_batch(vec![Some(2)])?; + run_test( + &batch, + Arc::clone(&col_b), + vec![lit(1i32), Arc::clone(&col_b)], + vec![Some(true)], + )?; + + // [NULL] IN (1, [NULL]) => [NULL] (NULL is never equal to anything) + let batch = make_batch(vec![None])?; + run_test( + &batch, + Arc::clone(&col_b), + vec![lit(1i32), Arc::clone(&col_b)], + vec![None], + )?; + + Ok(()) + } + + #[test] + fn test_in_list_scalar_literal_cases() -> Result<()> { + // Test scalar literal cases (both NULL and non-NULL) to ensure SQL three-valued + // logic is correctly implemented. This covers the important case where a scalar + // value is tested against a list containing NULL. + + let schema = Arc::new(Schema::new(vec![Field::new("b", DataType::Int32, true)])); + let null_i32 = ScalarValue::Int32(None); + + // Helper to create a batch + let make_batch = |values: Vec>| -> Result { + let array = Arc::new(Int32Array::from(values)); + Ok(RecordBatch::try_new(Arc::clone(&schema), vec![array])?) 
+ }; + + // Helper to run a test + let run_test = |batch: &RecordBatch, + expr: Arc, + list: Vec>, + negated: bool, + expected: Vec>| + -> Result<()> { + let in_expr = in_list(expr, list, &negated, schema.as_ref())?; + let result = in_expr.evaluate(batch)?.into_array(batch.num_rows())?; + let result = as_boolean_array(&result); + let expected_array = BooleanArray::from(expected); + assert_eq!( + result, + &expected_array, + "Expected {:?}, got {:?}", + expected_array, + result.iter().collect::>() + ); + Ok(()) + }; + + let batch = make_batch(vec![Some(1)])?; + + // ======================================================================== + // NULL LITERAL TESTS + // According to SQL semantics, NULL IN (any_list) should always return NULL + // ======================================================================== + + // NULL IN (1, 1) => NULL + run_test( + &batch, + lit(null_i32.clone()), + vec![lit(1i32), lit(1i32)], + false, + vec![None], + )?; + + // NULL IN (NULL, 1) => NULL + run_test( + &batch, + lit(null_i32.clone()), + vec![lit(null_i32.clone()), lit(1i32)], + false, + vec![None], + )?; + + // NULL IN (NULL, NULL) => NULL + run_test( + &batch, + lit(null_i32.clone()), + vec![lit(null_i32.clone()), lit(null_i32.clone())], + false, + vec![None], + )?; + + // ======================================================================== + // NON-NULL SCALAR LITERALS WITH NULL IN LIST - Int32 + // When a scalar value is NOT in a list containing NULL, the result is NULL + // When a scalar value IS in the list, the result is TRUE (NULL doesn't matter) + // ======================================================================== + + // 3 IN (0, 1, 2, NULL) => NULL (not in list, but list has NULL) + run_test( + &batch, + lit(3i32), + vec![lit(0i32), lit(1i32), lit(2i32), lit(null_i32.clone())], + false, + vec![None], + )?; + + // 3 NOT IN (0, 1, 2, NULL) => NULL (not in list, but list has NULL) + run_test( + &batch, + lit(3i32), + vec![lit(0i32), lit(1i32), lit(2i32), lit(null_i32.clone())], + true, + vec![None], + )?; + + // 1 IN (0, 1, 2, NULL) => TRUE (found match, NULL doesn't matter) + run_test( + &batch, + lit(1i32), + vec![lit(0i32), lit(1i32), lit(2i32), lit(null_i32.clone())], + false, + vec![Some(true)], + )?; + + // 1 NOT IN (0, 1, 2, NULL) => FALSE (found match, NULL doesn't matter) + run_test( + &batch, + lit(1i32), + vec![lit(0i32), lit(1i32), lit(2i32), lit(null_i32.clone())], + true, + vec![Some(false)], + )?; + + // ======================================================================== + // NON-NULL SCALAR LITERALS WITH NULL IN LIST - String + // Same semantics as Int32 but with string type + // ======================================================================== + + let schema_str = + Arc::new(Schema::new(vec![Field::new("s", DataType::Utf8, true)])); + let batch_str = RecordBatch::try_new( + Arc::clone(&schema_str), + vec![Arc::new(StringArray::from(vec![Some("dummy")]))], + )?; + let null_str = ScalarValue::Utf8(None); + + let run_test_str = |expr: Arc, + list: Vec>, + negated: bool, + expected: Vec>| + -> Result<()> { + let in_expr = in_list(expr, list, &negated, schema_str.as_ref())?; + let result = in_expr + .evaluate(&batch_str)? 
+ .into_array(batch_str.num_rows())?; + let result = as_boolean_array(&result); + let expected_array = BooleanArray::from(expected); + assert_eq!( + result, + &expected_array, + "Expected {:?}, got {:?}", + expected_array, + result.iter().collect::>() + ); + Ok(()) + }; + + // 'c' IN ('a', 'b', NULL) => NULL (not in list, but list has NULL) + run_test_str( + lit("c"), + vec![lit("a"), lit("b"), lit(null_str.clone())], + false, + vec![None], + )?; + + // 'c' NOT IN ('a', 'b', NULL) => NULL (not in list, but list has NULL) + run_test_str( + lit("c"), + vec![lit("a"), lit("b"), lit(null_str.clone())], + true, + vec![None], + )?; + + // 'a' IN ('a', 'b', NULL) => TRUE (found match, NULL doesn't matter) + run_test_str( + lit("a"), + vec![lit("a"), lit("b"), lit(null_str.clone())], + false, + vec![Some(true)], + )?; + + // 'a' NOT IN ('a', 'b', NULL) => FALSE (found match, NULL doesn't matter) + run_test_str( + lit("a"), + vec![lit("a"), lit("b"), lit(null_str.clone())], + true, + vec![Some(false)], + )?; + + Ok(()) + } + + #[test] + fn test_in_list_tuple_cases() -> Result<()> { + // Test tuple/struct cases from the original request: (lit, lit) IN (lit, lit) + // These test row-wise comparisons like (1, 2) IN ((1, 2), (3, 4)) + + let schema = Arc::new(Schema::new(vec![Field::new("b", DataType::Int32, true)])); + + // Helper to create struct scalars for tuple comparisons + let make_struct = |v1: Option, v2: Option| -> ScalarValue { + let fields = Fields::from(vec![ + Field::new("field_0", DataType::Int32, true), + Field::new("field_1", DataType::Int32, true), + ]); + ScalarValue::Struct(Arc::new(StructArray::new( + fields, + vec![ + Arc::new(Int32Array::from(vec![v1])), + Arc::new(Int32Array::from(vec![v2])), + ], + None, + ))) + }; + + // Need a single row batch for scalar tests + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![Arc::new(Int32Array::from(vec![Some(1)]))], + )?; + + // Helper to run tuple tests + let run_tuple_test = |lhs: ScalarValue, + list: Vec, + expected: Vec>| + -> Result<()> { + let expr = in_list( + lit(lhs), + list.into_iter().map(lit).collect(), + &false, + schema.as_ref(), + )?; + let result = expr.evaluate(&batch)?.into_array(batch.num_rows())?; + let result = as_boolean_array(&result); + assert_eq!(result, &BooleanArray::from(expected)); + Ok(()) + }; + + // (NULL, NULL) IN ((1, 2)) => FALSE (tuples don't match) + run_tuple_test( + make_struct(None, None), + vec![make_struct(Some(1), Some(2))], + vec![Some(false)], + )?; + + // (NULL, NULL) IN ((NULL, 1)) => FALSE + run_tuple_test( + make_struct(None, None), + vec![make_struct(None, Some(1))], + vec![Some(false)], + )?; + + // (NULL, NULL) IN ((NULL, NULL)) => TRUE (exact match including nulls) + run_tuple_test( + make_struct(None, None), + vec![make_struct(None, None)], + vec![Some(true)], + )?; + + // (NULL, 1) IN ((1, 2)) => FALSE + run_tuple_test( + make_struct(None, Some(1)), + vec![make_struct(Some(1), Some(2))], + vec![Some(false)], + )?; + + // (NULL, 1) IN ((NULL, 1)) => TRUE (exact match) + run_tuple_test( + make_struct(None, Some(1)), + vec![make_struct(None, Some(1))], + vec![Some(true)], + )?; + + // (NULL, 1) IN ((NULL, NULL)) => FALSE + run_tuple_test( + make_struct(None, Some(1)), + vec![make_struct(None, None)], + vec![Some(false)], + )?; + + // (1, 2) IN ((1, 2)) => TRUE + run_tuple_test( + make_struct(Some(1), Some(2)), + vec![make_struct(Some(1), Some(2))], + vec![Some(true)], + )?; + + // (1, 3) IN ((1, 2)) => FALSE + run_tuple_test( + make_struct(Some(1), Some(3)), + 
vec![make_struct(Some(1), Some(2))], + vec![Some(false)], + )?; + + // (4, 4) IN ((1, 2)) => FALSE + run_tuple_test( + make_struct(Some(4), Some(4)), + vec![make_struct(Some(1), Some(2))], + vec![Some(false)], + )?; + + // (1, 1) IN ((NULL, 1)) => FALSE + run_tuple_test( + make_struct(Some(1), Some(1)), + vec![make_struct(None, Some(1))], + vec![Some(false)], + )?; + + // (1, 1) IN ((NULL, NULL)) => FALSE + run_tuple_test( + make_struct(Some(1), Some(1)), + vec![make_struct(None, None)], + vec![Some(false)], + )?; + + Ok(()) + } } diff --git a/datafusion/physical-plan/src/joins/utils.rs b/datafusion/physical-plan/src/joins/utils.rs index 9b589b674cc5b..1d03a35d2cbc4 100644 --- a/datafusion/physical-plan/src/joins/utils.rs +++ b/datafusion/physical-plan/src/joins/utils.rs @@ -1669,7 +1669,7 @@ pub fn update_hash( hash_map: &mut dyn JoinHashMapType, offset: usize, random_state: &RandomState, - hashes_buffer: &mut Vec, + hashes_buffer: &mut [u64], deleted_offset: usize, fifo_hashmap: bool, ) -> Result<()> { diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index 00629c392df48..36ea5f6fc5add 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -6407,10 +6407,9 @@ physical_plan 03)----CoalescePartitionsExec 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] 05)--------ProjectionExec: expr=[] -06)----------CoalesceBatchesExec: target_batch_size=8192 -07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8View)), 1, 32) IN ([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c]) -08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] +06)----------FilterExec: substr(md5(CAST(value@0 AS Utf8View)), 1, 32) IN (SET) ([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c]) +07)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +08)--------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] query I with test AS (SELECT substr(md5(i::text)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) @@ -6436,10 +6435,9 @@ physical_plan 03)----CoalescePartitionsExec 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] 05)--------ProjectionExec: expr=[] -06)----------CoalesceBatchesExec: target_batch_size=8192 -07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8View)), 1, 32) IN ([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c]) -08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] +06)----------FilterExec: substr(md5(CAST(value@0 AS Utf8View)), 1, 32) IN (SET) ([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c]) +07)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +08)--------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] query I with test AS (SELECT substr(md5(i::text)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) @@ -6465,10 +6463,9 @@ physical_plan 03)----CoalescePartitionsExec 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] 05)--------ProjectionExec: expr=[] -06)----------CoalesceBatchesExec: target_batch_size=8192 -07)------------FilterExec: 
substr(md5(CAST(value@0 AS Utf8View)), 1, 32) IN ([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c]) -08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] +06)----------FilterExec: substr(md5(CAST(value@0 AS Utf8View)), 1, 32) IN (SET) ([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c]) +07)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +08)--------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] query I with test AS (SELECT substr(md5(i::text)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) @@ -6494,10 +6491,9 @@ physical_plan 03)----CoalescePartitionsExec 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] 05)--------ProjectionExec: expr=[] -06)----------CoalesceBatchesExec: target_batch_size=8192 -07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8View)), 1, 32) IN ([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c]) -08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] +06)----------FilterExec: substr(md5(CAST(value@0 AS Utf8View)), 1, 32) IN (SET) ([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c]) +07)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +08)--------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] query I with test AS (SELECT substr(md5(i::text)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) @@ -6523,10 +6519,9 @@ physical_plan 03)----CoalescePartitionsExec 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] 05)--------ProjectionExec: expr=[] -06)----------CoalesceBatchesExec: target_batch_size=8192 -07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8View)), 1, 32) IN ([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c]) -08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] +06)----------FilterExec: substr(md5(CAST(value@0 AS Utf8View)), 1, 32) IN (SET) ([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c]) +07)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +08)--------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] query I with test AS (SELECT substr(md5(i::text)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) diff --git a/datafusion/sqllogictest/test_files/expr.slt b/datafusion/sqllogictest/test_files/expr.slt index 87345b833e264..df88d26c9c9de 100644 --- a/datafusion/sqllogictest/test_files/expr.slt +++ b/datafusion/sqllogictest/test_files/expr.slt @@ -1066,6 +1066,213 @@ SELECT '2' NOT IN ('a','b',NULL,1) ---- NULL +# ======================================================================== +# Comprehensive IN LIST tests with NULL handling +# These tests validate SQL three-valued logic for IN operations +# ======================================================================== + +# test_in_list_null_literals +# NULL IN (any_list) should always return NULL per SQL three-valued logic + +query B +SELECT NULL IN (1, 1) +---- +NULL + +query B +SELECT NULL IN (NULL, 1) +---- +NULL + +query B 
+SELECT NULL IN (NULL, NULL) +---- +NULL + +# test_in_list_with_columns +# Create test table for column-based IN LIST tests + +statement ok +CREATE OR REPLACE TABLE in_list_test(b INT) AS VALUES (1), (2), (3), (4), (NULL); + +# Test: b IN (1, 2) with various values + +query B +SELECT b IN (1, 2) FROM in_list_test WHERE b = 1; +---- +true + +query IB +SELECT b, b IN (1, 2) FROM in_list_test WHERE b IN (1, 2) ORDER BY b; +---- +1 true +2 true + +query IB +SELECT b, b IN (1, 2) FROM in_list_test WHERE b IN (3, 4) ORDER BY b; +---- +3 false +4 false + +query B +SELECT b IN (1, 2) FROM in_list_test WHERE b = 1; +---- +true + +query B +SELECT b IN (1, 2) FROM in_list_test WHERE b = 3; +---- +false + +query B +SELECT b IN (1, 2) FROM in_list_test WHERE b IS NULL; +---- +NULL + +# Test: b IN (NULL, 1) - list contains NULL + +query B +SELECT b IN (NULL, 1) FROM in_list_test WHERE b = 1; +---- +true + +query B +SELECT b IN (NULL, 1) FROM in_list_test WHERE b = 2; +---- +NULL + +query B +SELECT b IN (NULL, 1) FROM in_list_test WHERE b IS NULL; +---- +NULL + +# Test: b IN (NULL, NULL) - list contains only NULLs + +query B +SELECT b IN (NULL, NULL) FROM in_list_test WHERE b = 1; +---- +NULL + +query B +SELECT b IN (NULL, NULL) FROM in_list_test WHERE b IS NULL; +---- +NULL + +# Test: literal IN (list_with_column) - column appears in the list + +statement ok +CREATE OR REPLACE TABLE in_list_col_test(b INT) AS VALUES (1), (3), (NULL); + +query B +SELECT 1 IN (2, b) FROM in_list_col_test WHERE b = 1; +---- +true + +query B +SELECT 1 IN (2, b) FROM in_list_col_test WHERE b = 3; +---- +false + +query B +SELECT 1 IN (2, b) FROM in_list_col_test WHERE b IS NULL; +---- +NULL + +# Test: b IN (1, b) - column references itself in list + +query B +SELECT b IN (1, b) FROM in_list_col_test WHERE b = 1; +---- +true + +query B +SELECT b IN (1, b) FROM in_list_col_test WHERE b = 3; +---- +true + +query B +SELECT b IN (1, b) FROM in_list_col_test WHERE b IS NULL; +---- +NULL + +# test_in_list_tuples +# Test tuple/row-wise IN comparisons using struct syntax +# Note: Using arrow_cast for precise type control + +# (NULL, NULL) IN ((1, 2)) => FALSE +query B +SELECT struct(arrow_cast(NULL, 'Int32'), arrow_cast(NULL, 'Int32')) IN (struct(1, 2)) +---- +false + +# (NULL, NULL) IN ((NULL, 1)) => FALSE +query B +SELECT struct(arrow_cast(NULL, 'Int32'), arrow_cast(NULL, 'Int32')) IN (struct(arrow_cast(NULL, 'Int32'), 1)) +---- +false + +# (NULL, NULL) IN ((NULL, NULL)) => TRUE (exact match) +query B +SELECT struct(arrow_cast(NULL, 'Int32'), arrow_cast(NULL, 'Int32')) IN (struct(arrow_cast(NULL, 'Int32'), arrow_cast(NULL, 'Int32'))) +---- +true + +# (NULL, 1) IN ((1, 2)) => FALSE +query B +SELECT struct(arrow_cast(NULL, 'Int32'), 1) IN (struct(1, 2)) +---- +false + +# (NULL, 1) IN ((NULL, 1)) => TRUE (exact match) +query B +SELECT struct(arrow_cast(NULL, 'Int32'), 1) IN (struct(arrow_cast(NULL, 'Int32'), 1)) +---- +true + +# (NULL, 1) IN ((NULL, NULL)) => FALSE +query B +SELECT struct(arrow_cast(NULL, 'Int32'), 1) IN (struct(arrow_cast(NULL, 'Int32'), arrow_cast(NULL, 'Int32'))) +---- +false + +# (1, 2) IN ((1, 2)) => TRUE +query B +SELECT struct(1, 2) IN (struct(1, 2)) +---- +true + +# (1, 3) IN ((1, 2)) => FALSE +query B +SELECT struct(1, 3) IN (struct(1, 2)) +---- +false + +# (4, 4) IN ((1, 2)) => FALSE +query B +SELECT struct(4, 4) IN (struct(1, 2)) +---- +false + +# (1, 1) IN ((NULL, 1)) => FALSE +query B +SELECT struct(1, 1) IN (struct(NULL, 1)) +---- +false + +# (1, 1) IN ((NULL, NULL)) => FALSE +query B +SELECT struct(1, 1) 
IN (struct(NULL, NULL)) +---- +false + +# Cleanup test tables + +statement ok +DROP TABLE in_list_test; + +statement ok +DROP TABLE in_list_col_test; + query T SELECT encode('tom','base64'); ---- diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q19.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q19.slt.part index 4960ad1f4a914..d20f090fa5b8f 100644 --- a/datafusion/sqllogictest/test_files/tpch/plans/q19.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/plans/q19.slt.part @@ -69,15 +69,13 @@ physical_plan 03)----CoalescePartitionsExec 04)------AggregateExec: mode=Partial, gby=[], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] 05)--------CoalesceBatchesExec: target_batch_size=8192 -06)----------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_partkey@0, p_partkey@0)], filter=p_brand@1 = Brand#12 AND p_container@3 IN ([SM CASE, SM BOX, SM PACK, SM PKG]) AND l_quantity@0 >= Some(100),15,2 AND l_quantity@0 <= Some(1100),15,2 AND p_size@2 <= 5 OR p_brand@1 = Brand#23 AND p_container@3 IN ([MED BAG, MED BOX, MED PKG, MED PACK]) AND l_quantity@0 >= Some(1000),15,2 AND l_quantity@0 <= Some(2000),15,2 AND p_size@2 <= 10 OR p_brand@1 = Brand#34 AND p_container@3 IN ([LG CASE, LG BOX, LG PACK, LG PKG]) AND l_quantity@0 >= Some(2000),15,2 AND l_quantity@0 <= Some(3000),15,2 AND p_size@2 <= 15, projection=[l_extendedprice@2, l_discount@3] +06)----------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_partkey@0, p_partkey@0)], filter=p_brand@1 = Brand#12 AND p_container@3 IN (SET) ([SM CASE, SM BOX, SM PACK, SM PKG]) AND l_quantity@0 >= Some(100),15,2 AND l_quantity@0 <= Some(1100),15,2 AND p_size@2 <= 5 OR p_brand@1 = Brand#23 AND p_container@3 IN (SET) ([MED BAG, MED BOX, MED PKG, MED PACK]) AND l_quantity@0 >= Some(1000),15,2 AND l_quantity@0 <= Some(2000),15,2 AND p_size@2 <= 10 OR p_brand@1 = Brand#34 AND p_container@3 IN (SET) ([LG CASE, LG BOX, LG PACK, LG PKG]) AND l_quantity@0 >= Some(2000),15,2 AND l_quantity@0 <= Some(3000),15,2 AND p_size@2 <= 15, projection=[l_extendedprice@2, l_discount@3] 07)------------CoalesceBatchesExec: target_batch_size=8192 08)--------------RepartitionExec: partitioning=Hash([l_partkey@0], 4), input_partitions=4 -09)----------------CoalesceBatchesExec: target_batch_size=8192 -10)------------------FilterExec: (l_quantity@1 >= Some(100),15,2 AND l_quantity@1 <= Some(1100),15,2 OR l_quantity@1 >= Some(1000),15,2 AND l_quantity@1 <= Some(2000),15,2 OR l_quantity@1 >= Some(2000),15,2 AND l_quantity@1 <= Some(3000),15,2) AND (l_shipmode@5 = AIR OR l_shipmode@5 = AIR REG) AND l_shipinstruct@4 = DELIVER IN PERSON, projection=[l_partkey@0, l_quantity@1, l_extendedprice@2, l_discount@3] -11)--------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_partkey, l_quantity, l_extendedprice, l_discount, l_shipinstruct, l_shipmode], file_type=csv, has_header=false -12)------------CoalesceBatchesExec: target_batch_size=8192 -13)--------------RepartitionExec: partitioning=Hash([p_partkey@0], 4), input_partitions=4 -14)----------------CoalesceBatchesExec: target_batch_size=8192 -15)------------------FilterExec: (p_brand@1 = Brand#12 AND 
p_container@3 IN ([SM CASE, SM BOX, SM PACK, SM PKG]) AND p_size@2 <= 5 OR p_brand@1 = Brand#23 AND p_container@3 IN ([MED BAG, MED BOX, MED PKG, MED PACK]) AND p_size@2 <= 10 OR p_brand@1 = Brand#34 AND p_container@3 IN ([LG CASE, LG BOX, LG PACK, LG PKG]) AND p_size@2 <= 15) AND p_size@2 >= 1 -16)--------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -17)----------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl]]}, projection=[p_partkey, p_brand, p_size, p_container], file_type=csv, has_header=false +09)----------------FilterExec: (l_quantity@1 >= Some(100),15,2 AND l_quantity@1 <= Some(1100),15,2 OR l_quantity@1 >= Some(1000),15,2 AND l_quantity@1 <= Some(2000),15,2 OR l_quantity@1 >= Some(2000),15,2 AND l_quantity@1 <= Some(3000),15,2) AND (l_shipmode@5 = AIR OR l_shipmode@5 = AIR REG) AND l_shipinstruct@4 = DELIVER IN PERSON, projection=[l_partkey@0, l_quantity@1, l_extendedprice@2, l_discount@3] +10)------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_partkey, l_quantity, l_extendedprice, l_discount, l_shipinstruct, l_shipmode], file_type=csv, has_header=false +11)------------CoalesceBatchesExec: target_batch_size=8192 +12)--------------RepartitionExec: partitioning=Hash([p_partkey@0], 4), input_partitions=4 +13)----------------FilterExec: (p_brand@1 = Brand#12 AND p_container@3 IN (SET) ([SM CASE, SM BOX, SM PACK, SM PKG]) AND p_size@2 <= 5 OR p_brand@1 = Brand#23 AND p_container@3 IN (SET) ([MED BAG, MED BOX, MED PKG, MED PACK]) AND p_size@2 <= 10 OR p_brand@1 = Brand#34 AND p_container@3 IN (SET) ([LG CASE, LG BOX, LG PACK, LG PKG]) AND p_size@2 <= 15) AND p_size@2 >= 1 +14)------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +15)--------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl]]}, projection=[p_partkey, p_brand, p_size, p_container], file_type=csv, has_header=false diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q22.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q22.slt.part index 22476156b80d8..a9d95fb1ab79f 100644 --- a/datafusion/sqllogictest/test_files/tpch/plans/q22.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/plans/q22.slt.part @@ -90,17 +90,15 @@ physical_plan 14)--------------------------HashJoinExec: mode=Partitioned, join_type=LeftAnti, on=[(c_custkey@0, o_custkey@0)], projection=[c_phone@1, c_acctbal@2] 15)----------------------------CoalesceBatchesExec: target_batch_size=8192 16)------------------------------RepartitionExec: partitioning=Hash([c_custkey@0], 4), input_partitions=4 -17)--------------------------------CoalesceBatchesExec: target_batch_size=8192 -18)----------------------------------FilterExec: substr(c_phone@1, 1, 2) IN ([13, 31, 23, 29, 30, 18, 17]) -19)------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -20)--------------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl]]}, 
projection=[c_custkey, c_phone, c_acctbal], file_type=csv, has_header=false -21)----------------------------CoalesceBatchesExec: target_batch_size=8192 -22)------------------------------RepartitionExec: partitioning=Hash([o_custkey@0], 4), input_partitions=4 -23)--------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_custkey], file_type=csv, has_header=false -24)--------------------AggregateExec: mode=Final, gby=[], aggr=[avg(customer.c_acctbal)] -25)----------------------CoalescePartitionsExec -26)------------------------AggregateExec: mode=Partial, gby=[], aggr=[avg(customer.c_acctbal)] -27)--------------------------CoalesceBatchesExec: target_batch_size=8192 -28)----------------------------FilterExec: c_acctbal@1 > Some(0),15,2 AND substr(c_phone@0, 1, 2) IN ([13, 31, 23, 29, 30, 18, 17]), projection=[c_acctbal@1] -29)------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -30)--------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl]]}, projection=[c_phone, c_acctbal], file_type=csv, has_header=false +17)--------------------------------FilterExec: substr(c_phone@1, 1, 2) IN (SET) ([13, 31, 23, 29, 30, 18, 17]) +18)----------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +19)------------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl]]}, projection=[c_custkey, c_phone, c_acctbal], file_type=csv, has_header=false +20)----------------------------CoalesceBatchesExec: target_batch_size=8192 +21)------------------------------RepartitionExec: partitioning=Hash([o_custkey@0], 4), input_partitions=4 +22)--------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_custkey], file_type=csv, has_header=false +23)--------------------AggregateExec: mode=Final, gby=[], aggr=[avg(customer.c_acctbal)] +24)----------------------CoalescePartitionsExec +25)------------------------AggregateExec: mode=Partial, gby=[], aggr=[avg(customer.c_acctbal)] +26)--------------------------FilterExec: c_acctbal@1 > Some(0),15,2 AND substr(c_phone@0, 1, 2) IN (SET) ([13, 31, 23, 29, 30, 18, 17]), projection=[c_acctbal@1] +27)----------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +28)------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl]]}, projection=[c_phone, c_acctbal], file_type=csv, has_header=false From 34fbf14f46c68bc40f2302ffa6e0b9b867dc7420 Mon Sep 17 00:00:00 2001 From: Emily Matheys <55631053+EmilyMatt@users.noreply.github.com> Date: Wed, 12 Nov 2025 
14:10:55 +0200 Subject: [PATCH 2/6] feat: Add evaluate_to_arrays function (#18446) ## Which issue does this PR close? - Closes #18330 . ## Rationale for this change Reduce code duplication. ## What changes are included in this PR? A util function replacing many calls which are using the same code. ## Are these changes tested? No logic should change whatsoever, so each area which now uses this code should have it's own tests and benchmarks unmodified. ## Are there any user-facing changes? Yes, there is now a new pub function. No other changes to API. --------- Co-authored-by: Martin Grigorov (cherry picked from commit 76b4156aff9033680c907430d98de4dd274b1fd0) --- datafusion/expr-common/src/columnar_value.rs | 100 +++++++++++++++++- datafusion/physical-expr-common/src/utils.rs | 21 ++++ .../window/standard_window_function_expr.rs | 9 +- .../physical-expr/src/window/window_expr.rs | 9 +- .../physical-plan/src/aggregates/mod.rs | 45 +++----- .../src/aggregates/no_grouping.rs | 11 +- .../physical-plan/src/joins/hash_join/exec.rs | 9 +- .../src/joins/hash_join/stream.rs | 7 +- .../src/joins/symmetric_hash_join.rs | 11 +- datafusion/physical-plan/src/joins/utils.rs | 6 +- datafusion/physical-plan/src/projection.rs | 10 +- .../physical-plan/src/repartition/mod.rs | 7 +- datafusion/physical-plan/src/sorts/stream.rs | 7 +- 13 files changed, 153 insertions(+), 99 deletions(-) diff --git a/datafusion/expr-common/src/columnar_value.rs b/datafusion/expr-common/src/columnar_value.rs index a21ad5bbbcc30..f7bbd41120f17 100644 --- a/datafusion/expr-common/src/columnar_value.rs +++ b/datafusion/expr-common/src/columnar_value.rs @@ -113,10 +113,12 @@ impl ColumnarValue { } } - /// Convert a columnar value into an Arrow [`ArrayRef`] with the specified - /// number of rows. [`Self::Scalar`] is converted by repeating the same - /// scalar multiple times which is not as efficient as handling the scalar - /// directly. + /// Convert any [`Self::Scalar`] into an Arrow [`ArrayRef`] with the specified + /// number of rows by repeating the same scalar multiple times, + /// which is not as efficient as handling the scalar directly. + /// [`Self::Array`] will just be returned as is. + /// + /// See [`Self::into_array_of_size`] if you need to validate the length of the output array. /// /// See [`Self::values_to_arrays`] to convert multiple columnar values into /// arrays of the same length. @@ -135,6 +137,38 @@ impl ColumnarValue { /// number of rows. [`Self::Scalar`] is converted by repeating the same /// scalar multiple times which is not as efficient as handling the scalar /// directly. + /// This validates that if this is [`Self::Array`], it has the expected length. + /// + /// See [`Self::values_to_arrays`] to convert multiple columnar values into + /// arrays of the same length. + /// + /// # Errors + /// + /// Errors if `self` is a Scalar that fails to be converted into an array of size or + /// if the array length does not match the expected length + pub fn into_array_of_size(self, num_rows: usize) -> Result { + match self { + ColumnarValue::Array(array) => { + if array.len() == num_rows { + Ok(array) + } else { + internal_err!( + "Array length {} does not match expected length {}", + array.len(), + num_rows + ) + } + } + ColumnarValue::Scalar(scalar) => scalar.to_array_of_size(num_rows), + } + } + + /// Convert any [`Self::Scalar`] into an Arrow [`ArrayRef`] with the specified + /// number of rows by repeating the same scalar multiple times, + /// which is not as efficient as handling the scalar directly. 
+ /// [`Self::Array`] will just be returned as is. + /// + /// See [`Self::to_array_of_size`] if you need to validate the length of the output array. /// /// See [`Self::values_to_arrays`] to convert multiple columnar values into /// arrays of the same length. @@ -149,6 +183,36 @@ impl ColumnarValue { }) } + /// Convert a columnar value into an Arrow [`ArrayRef`] with the specified + /// number of rows. [`Self::Scalar`] is converted by repeating the same + /// scalar multiple times which is not as efficient as handling the scalar + /// directly. + /// This validates that if this is [`Self::Array`], it has the expected length. + /// + /// See [`Self::values_to_arrays`] to convert multiple columnar values into + /// arrays of the same length. + /// + /// # Errors + /// + /// Errors if `self` is a Scalar that fails to be converted into an array of size or + /// if the array length does not match the expected length + pub fn to_array_of_size(&self, num_rows: usize) -> Result { + match self { + ColumnarValue::Array(array) => { + if array.len() == num_rows { + Ok(Arc::clone(array)) + } else { + internal_err!( + "Array length {} does not match expected length {}", + array.len(), + num_rows + ) + } + } + ColumnarValue::Scalar(scalar) => scalar.to_array_of_size(num_rows), + } + } + /// Null columnar values are implemented as a null array in order to pass batch /// num_rows pub fn create_null_array(num_rows: usize) -> Self { @@ -249,6 +313,34 @@ mod tests { use super::*; use arrow::array::Int32Array; + #[test] + fn into_array_of_size() { + // Array case + let arr = make_array(1, 3); + let arr_columnar_value = ColumnarValue::Array(Arc::clone(&arr)); + assert_eq!(&arr_columnar_value.into_array_of_size(3).unwrap(), &arr); + + // Scalar case + let scalar_columnar_value = ColumnarValue::Scalar(ScalarValue::Int32(Some(42))); + let expected_array = make_array(42, 100); + assert_eq!( + &scalar_columnar_value.into_array_of_size(100).unwrap(), + &expected_array + ); + + // Array case with wrong size + let arr = make_array(1, 3); + let arr_columnar_value = ColumnarValue::Array(Arc::clone(&arr)); + let result = arr_columnar_value.into_array_of_size(5); + let err = result.unwrap_err(); + assert!( + err.to_string().starts_with( + "Internal error: Array length 3 does not match expected length 5" + ), + "Found: {err}" + ); + } + #[test] fn values_to_arrays() { // (input, expected) diff --git a/datafusion/physical-expr-common/src/utils.rs b/datafusion/physical-expr-common/src/utils.rs index 05b216ab75ebc..230252eb8958c 100644 --- a/datafusion/physical-expr-common/src/utils.rs +++ b/datafusion/physical-expr-common/src/utils.rs @@ -22,6 +22,7 @@ use crate::tree_node::ExprContext; use arrow::array::{make_array, Array, ArrayRef, BooleanArray, MutableArrayData}; use arrow::compute::{and_kleene, is_not_null, SlicesIterator}; +use arrow::record_batch::RecordBatch; use datafusion_common::Result; use datafusion_expr_common::sort_properties::ExprProperties; @@ -91,6 +92,26 @@ pub fn scatter(mask: &BooleanArray, truthy: &dyn Array) -> Result { Ok(make_array(data)) } +/// Evaluates expressions against a record batch. +/// This will convert the resulting ColumnarValues to ArrayRefs, +/// duplicating any ScalarValues that may have been returned, +/// and validating that the returned arrays all have the same +/// number of rows as the input batch. 
+#[inline] +pub fn evaluate_expressions_to_arrays<'a>( + exprs: impl IntoIterator>, + batch: &RecordBatch, +) -> Result> { + let num_rows = batch.num_rows(); + exprs + .into_iter() + .map(|e| { + e.evaluate(batch) + .and_then(|col| col.into_array_of_size(num_rows)) + }) + .collect::>>() +} + #[cfg(test)] mod tests { use std::sync::Arc; diff --git a/datafusion/physical-expr/src/window/standard_window_function_expr.rs b/datafusion/physical-expr/src/window/standard_window_function_expr.rs index ca7c3a4db3d4f..9b1213450c2fb 100644 --- a/datafusion/physical-expr/src/window/standard_window_function_expr.rs +++ b/datafusion/physical-expr/src/window/standard_window_function_expr.rs @@ -23,6 +23,7 @@ use arrow::record_batch::RecordBatch; use datafusion_common::Result; use datafusion_expr::{LimitEffect, PartitionEvaluator}; +use datafusion_physical_expr_common::utils::evaluate_expressions_to_arrays; use std::any::Any; use std::sync::Arc; @@ -57,13 +58,7 @@ pub trait StandardWindowFunctionExpr: Send + Sync + std::fmt::Debug { /// /// Typically, the resulting vector is a single element vector. fn evaluate_args(&self, batch: &RecordBatch) -> Result> { - self.expressions() - .iter() - .map(|e| { - e.evaluate(batch) - .and_then(|v| v.into_array(batch.num_rows())) - }) - .collect() + evaluate_expressions_to_arrays(&self.expressions(), batch) } /// Create a [`PartitionEvaluator`] for evaluating the function on diff --git a/datafusion/physical-expr/src/window/window_expr.rs b/datafusion/physical-expr/src/window/window_expr.rs index a6b5bf1871161..47f970d276e00 100644 --- a/datafusion/physical-expr/src/window/window_expr.rs +++ b/datafusion/physical-expr/src/window/window_expr.rs @@ -41,6 +41,7 @@ use datafusion_expr::window_state::{ use datafusion_expr::{Accumulator, PartitionEvaluator, WindowFrame, WindowFrameBound}; use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr; +use datafusion_physical_expr_common::utils::evaluate_expressions_to_arrays; use indexmap::IndexMap; /// Common trait for [window function] implementations @@ -90,13 +91,7 @@ pub trait WindowExpr: Send + Sync + Debug { /// Evaluate the window function arguments against the batch and return /// array ref, normally the resulting `Vec` is a single element one. fn evaluate_args(&self, batch: &RecordBatch) -> Result> { - self.expressions() - .iter() - .map(|e| { - e.evaluate(batch) - .and_then(|v| v.into_array(batch.num_rows())) - }) - .collect() + evaluate_expressions_to_arrays(&self.expressions(), batch) } /// Evaluate the window function values against the batch diff --git a/datafusion/physical-plan/src/aggregates/mod.rs b/datafusion/physical-plan/src/aggregates/mod.rs index 30d1441f5773e..18090b6422f62 100644 --- a/datafusion/physical-plan/src/aggregates/mod.rs +++ b/datafusion/physical-plan/src/aggregates/mod.rs @@ -59,6 +59,7 @@ use datafusion_physical_expr_common::sort_expr::{ }; use datafusion_expr::utils::AggregateOrderSensitivity; +use datafusion_physical_expr_common::utils::evaluate_expressions_to_arrays; use itertools::Itertools; pub mod group_values; @@ -1434,25 +1435,14 @@ pub fn finalize_aggregation( } } -/// Evaluates expressions against a record batch. -fn evaluate( - expr: &[Arc], - batch: &RecordBatch, -) -> Result> { - expr.iter() - .map(|expr| { - expr.evaluate(batch) - .and_then(|v| v.into_array(batch.num_rows())) - }) - .collect() -} - -/// Evaluates expressions against a record batch. +/// Evaluates groups of expressions against a record batch. 
pub fn evaluate_many( expr: &[Vec>], batch: &RecordBatch, ) -> Result>> { - expr.iter().map(|expr| evaluate(expr, batch)).collect() + expr.iter() + .map(|expr| evaluate_expressions_to_arrays(expr, batch)) + .collect() } fn evaluate_optional( @@ -1506,23 +1496,14 @@ pub fn evaluate_group_by( group_by: &PhysicalGroupBy, batch: &RecordBatch, ) -> Result>> { - let exprs: Vec = group_by - .expr - .iter() - .map(|(expr, _)| { - let value = expr.evaluate(batch)?; - value.into_array(batch.num_rows()) - }) - .collect::>>()?; - - let null_exprs: Vec = group_by - .null_expr - .iter() - .map(|(expr, _)| { - let value = expr.evaluate(batch)?; - value.into_array(batch.num_rows()) - }) - .collect::>>()?; + let exprs = evaluate_expressions_to_arrays( + group_by.expr.iter().map(|(expr, _)| expr), + batch, + )?; + let null_exprs = evaluate_expressions_to_arrays( + group_by.null_expr.iter().map(|(expr, _)| expr), + batch, + )?; group_by .groups diff --git a/datafusion/physical-plan/src/aggregates/no_grouping.rs b/datafusion/physical-plan/src/aggregates/no_grouping.rs index 9474a5f88c92a..fc398427ac1f0 100644 --- a/datafusion/physical-plan/src/aggregates/no_grouping.rs +++ b/datafusion/physical-plan/src/aggregates/no_grouping.rs @@ -33,12 +33,12 @@ use std::borrow::Cow; use std::sync::Arc; use std::task::{Context, Poll}; +use super::AggregateExec; use crate::filter::batch_filter; use datafusion_execution::memory_pool::{MemoryConsumer, MemoryReservation}; +use datafusion_physical_expr_common::utils::evaluate_expressions_to_arrays; use futures::stream::{Stream, StreamExt}; -use super::AggregateExec; - /// stream struct for aggregation without grouping columns pub(crate) struct AggregateStream { stream: BoxStream<'static, Result>, @@ -219,13 +219,8 @@ fn aggregate_batch( None => Cow::Borrowed(&batch), }; - let n_rows = batch.num_rows(); - // 1.3 - let values = expr - .iter() - .map(|e| e.evaluate(&batch).and_then(|v| v.into_array(n_rows))) - .collect::>>()?; + let values = evaluate_expressions_to_arrays(expr, batch.as_ref())?; // 1.4 let size_pre = accum.size(); diff --git a/datafusion/physical-plan/src/joins/hash_join/exec.rs b/datafusion/physical-plan/src/joins/hash_join/exec.rs index a250ce542e6b1..55bdc16306bc7 100644 --- a/datafusion/physical-plan/src/joins/hash_join/exec.rs +++ b/datafusion/physical-plan/src/joins/hash_join/exec.rs @@ -77,6 +77,7 @@ use datafusion_physical_expr::{PhysicalExpr, PhysicalExprRef}; use ahash::RandomState; use datafusion_physical_expr_common::physical_expr::fmt_sql; +use datafusion_physical_expr_common::utils::evaluate_expressions_to_arrays; use futures::TryStreamExt; use parking_lot::Mutex; @@ -1465,13 +1466,7 @@ async fn collect_left_input( BooleanBufferBuilder::new(0) }; - let left_values = on_left - .iter() - .map(|c| { - c.evaluate(&single_batch)? 
- .into_array(single_batch.num_rows()) - }) - .collect::>>()?; + let left_values = evaluate_expressions_to_arrays(&on_left, &single_batch)?; // Compute bounds for dynamic filter if enabled let bounds = match bounds_accumulators { diff --git a/datafusion/physical-plan/src/joins/hash_join/stream.rs b/datafusion/physical-plan/src/joins/hash_join/stream.rs index bb3465365ec96..ced9ef564e142 100644 --- a/datafusion/physical-plan/src/joins/hash_join/stream.rs +++ b/datafusion/physical-plan/src/joins/hash_join/stream.rs @@ -51,6 +51,7 @@ use datafusion_common::{ use datafusion_physical_expr::PhysicalExprRef; use ahash::RandomState; +use datafusion_physical_expr_common::utils::evaluate_expressions_to_arrays; use futures::{ready, Stream, StreamExt}; /// Represents build-side of hash join. @@ -447,11 +448,7 @@ impl HashJoinStream { } Some(Ok(batch)) => { // Precalculate hash values for fetched batch - let keys_values = self - .on_right - .iter() - .map(|c| c.evaluate(&batch)?.into_array(batch.num_rows())) - .collect::>>()?; + let keys_values = evaluate_expressions_to_arrays(&self.on_right, &batch)?; self.hashes_buffer.clear(); self.hashes_buffer.resize(batch.num_rows(), 0); diff --git a/datafusion/physical-plan/src/joins/symmetric_hash_join.rs b/datafusion/physical-plan/src/joins/symmetric_hash_join.rs index be4646e88bd76..371dc49c7326f 100644 --- a/datafusion/physical-plan/src/joins/symmetric_hash_join.rs +++ b/datafusion/physical-plan/src/joins/symmetric_hash_join.rs @@ -78,6 +78,7 @@ use datafusion_physical_expr_common::physical_expr::{fmt_sql, PhysicalExprRef}; use datafusion_physical_expr_common::sort_expr::{LexOrdering, OrderingRequirements}; use ahash::RandomState; +use datafusion_physical_expr_common::utils::evaluate_expressions_to_arrays; use futures::{ready, Stream, StreamExt}; use parking_lot::Mutex; @@ -1065,14 +1066,8 @@ fn lookup_join_hashmap( hashes_buffer: &mut Vec, deleted_offset: Option, ) -> Result<(UInt64Array, UInt32Array)> { - let keys_values = probe_on - .iter() - .map(|c| c.evaluate(probe_batch)?.into_array(probe_batch.num_rows())) - .collect::>>()?; - let build_join_values = build_on - .iter() - .map(|c| c.evaluate(build_batch)?.into_array(build_batch.num_rows())) - .collect::>>()?; + let keys_values = evaluate_expressions_to_arrays(probe_on, probe_batch)?; + let build_join_values = evaluate_expressions_to_arrays(build_on, build_batch)?; hashes_buffer.clear(); hashes_buffer.resize(probe_batch.num_rows(), 0); diff --git a/datafusion/physical-plan/src/joins/utils.rs b/datafusion/physical-plan/src/joins/utils.rs index 1d03a35d2cbc4..6eb131deab705 100644 --- a/datafusion/physical-plan/src/joins/utils.rs +++ b/datafusion/physical-plan/src/joins/utils.rs @@ -75,6 +75,7 @@ use datafusion_physical_expr::{ }; use datafusion_physical_expr_common::datum::compare_op_for_nested; +use datafusion_physical_expr_common::utils::evaluate_expressions_to_arrays; use futures::future::{BoxFuture, Shared}; use futures::{ready, FutureExt}; use parking_lot::Mutex; @@ -1674,10 +1675,7 @@ pub fn update_hash( fifo_hashmap: bool, ) -> Result<()> { // evaluate the keys - let keys_values = on - .iter() - .map(|c| c.evaluate(batch)?.into_array(batch.num_rows())) - .collect::>>()?; + let keys_values = evaluate_expressions_to_arrays(on, batch)?; // calculate the hash values let hash_values = create_hashes(&keys_values, random_state, hashes_buffer)?; diff --git a/datafusion/physical-plan/src/projection.rs b/datafusion/physical-plan/src/projection.rs index ead2196860cde..8bc2bcd6f2e9a 100644 --- 
a/datafusion/physical-plan/src/projection.rs +++ b/datafusion/physical-plan/src/projection.rs @@ -57,6 +57,7 @@ pub use datafusion_physical_expr::projection::{ update_expr, ProjectionExpr, ProjectionExprs, }; +use datafusion_physical_expr_common::utils::evaluate_expressions_to_arrays; use futures::stream::{Stream, StreamExt}; use log::trace; @@ -357,14 +358,7 @@ impl ProjectionStream { fn batch_project(&self, batch: &RecordBatch) -> Result { // Records time on drop let _timer = self.baseline_metrics.elapsed_compute().timer(); - let arrays = self - .expr - .iter() - .map(|expr| { - expr.evaluate(batch) - .and_then(|v| v.into_array(batch.num_rows())) - }) - .collect::>>()?; + let arrays = evaluate_expressions_to_arrays(&self.expr, batch)?; if arrays.is_empty() { let options = diff --git a/datafusion/physical-plan/src/repartition/mod.rs b/datafusion/physical-plan/src/repartition/mod.rs index 74cf798895998..46fba6c520f47 100644 --- a/datafusion/physical-plan/src/repartition/mod.rs +++ b/datafusion/physical-plan/src/repartition/mod.rs @@ -58,6 +58,7 @@ use crate::filter_pushdown::{ ChildPushdownResult, FilterDescription, FilterPushdownPhase, FilterPushdownPropagation, }; +use datafusion_physical_expr_common::utils::evaluate_expressions_to_arrays; use futures::stream::Stream; use futures::{FutureExt, StreamExt, TryStreamExt}; use log::trace; @@ -489,10 +490,8 @@ impl BatchPartitioner { // Tracking time required for distributing indexes across output partitions let timer = self.timer.timer(); - let arrays = exprs - .iter() - .map(|expr| expr.evaluate(&batch)?.into_array(batch.num_rows())) - .collect::>>()?; + let arrays = + evaluate_expressions_to_arrays(exprs.as_slice(), &batch)?; hash_buffer.clear(); hash_buffer.resize(batch.num_rows(), 0); diff --git a/datafusion/physical-plan/src/sorts/stream.rs b/datafusion/physical-plan/src/sorts/stream.rs index 97dd1761b14cf..f419247c82b7e 100644 --- a/datafusion/physical-plan/src/sorts/stream.rs +++ b/datafusion/physical-plan/src/sorts/stream.rs @@ -25,6 +25,7 @@ use arrow::row::{RowConverter, Rows, SortField}; use datafusion_common::{internal_datafusion_err, Result}; use datafusion_execution::memory_pool::MemoryReservation; use datafusion_physical_expr_common::sort_expr::LexOrdering; +use datafusion_physical_expr_common::utils::evaluate_expressions_to_arrays; use futures::stream::{Fuse, StreamExt}; use std::marker::PhantomData; use std::sync::Arc; @@ -164,11 +165,7 @@ impl RowCursorStream { batch: &RecordBatch, stream_idx: usize, ) -> Result { - let cols = self - .column_expressions - .iter() - .map(|expr| expr.evaluate(batch)?.into_array(batch.num_rows())) - .collect::>>()?; + let cols = evaluate_expressions_to_arrays(&self.column_expressions, batch)?; // At this point, ownership should of this Rows should be unique let mut rows = self.rows.take_next(stream_idx)?; From ee7d2b520c4a4f0a8ab1c304226e1cd023beb333 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Thu, 20 Nov 2025 08:28:09 +0800 Subject: [PATCH 3/6] Refactor state management in `HashJoinExec` and use CASE expressions for more precise filters (#18451) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Background This PR is part of an EPIC to push down hash table references from HashJoinExec into scans. The EPIC is tracked in https://github.com/apache/datafusion/issues/17171. A "target state" is tracked in https://github.com/apache/datafusion/pull/18393. 
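As a rough illustration of that target state, the dynamic filter pushed into the probe-side scan ends up rendered along the following lines, so each probe row is only tested against the bounds collected from the build partition it hashes to. This is only a sketch of the expression shape: the column name, the bounds, and the partition count are placeholders, not output copied from this PR.

```text
CASE hash_repartition % 4
  WHEN 0 THEN d@0 >= aa AND d@0 <= ab
  WHEN 2 THEN d@0 >= ba AND d@0 <= bb
  ELSE false
END
```

Partitions that contributed no build-side rows have no WHEN arm and fall through to the `ELSE false` branch, so their probe rows are filtered out entirely; the snapshot updates later in this patch show the exact expressions the tests produce.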
There is a series of PRs to get us to this target state in smaller more reviewable changes that are still valuable on their own: - https://github.com/apache/datafusion/pull/18448 - https://github.com/apache/datafusion/pull/18449 (depends on https://github.com/apache/datafusion/pull/18448) - (This PR): https://github.com/apache/datafusion/pull/18451 ## Changes in this PR This PR refactors state management in HashJoinExec to make filter pushdown more efficient and prepare for pushing down membership tests. - Refactor internal data structures to clean up state management and make usage more idiomatic (use `Option` instead of comparing integers, etc.) - Uses CASE expressions to evaluate pushed-down filters selectively by partition Example: `CASE hash_repartition % N WHEN partition_id THEN condition ELSE false END` --------- Co-authored-by: Lía Adriana (cherry picked from commit 5b0aa37c8562d141c6c9e0a026115b4e6b905ca2) --- .../physical_optimizer/filter_pushdown/mod.rs | 291 ++++++++++++- .../filter_pushdown/util.rs | 14 +- .../physical-plan/src/joins/hash_join/exec.rs | 98 ++--- .../physical-plan/src/joins/hash_join/mod.rs | 1 + .../joins/hash_join/partitioned_hash_eval.rs | 158 +++++++ .../src/joins/hash_join/shared_bounds.rs | 408 +++++++++++------- .../src/joins/hash_join/stream.rs | 54 ++- .../physical-plan/src/repartition/mod.rs | 11 +- 8 files changed, 796 insertions(+), 239 deletions(-) create mode 100644 datafusion/physical-plan/src/joins/hash_join/partitioned_hash_eval.rs diff --git a/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs b/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs index de61149508904..fcc02988781c3 100644 --- a/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs +++ b/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs @@ -18,7 +18,7 @@ use std::sync::{Arc, LazyLock}; use arrow::{ - array::record_batch, + array::{record_batch, Float64Array, Int32Array, RecordBatch, StringArray}, datatypes::{DataType, Field, Schema, SchemaRef}, util::pretty::pretty_format_batches, }; @@ -278,7 +278,7 @@ async fn test_dynamic_filter_pushdown_through_hash_join_with_topk() { - SortExec: TopK(fetch=2), expr=[e@4 ASC], preserve_partitioning=[false], filter=[e@4 IS NULL OR e@4 < bb] - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, d@0)] - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, e, f], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ d@0 >= aa AND d@0 <= ab ] AND DynamicFilter [ e@1 IS NULL OR e@1 < bb ] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, e, f], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ CASE hash_repartition % 1 WHEN 0 THEN d@0 >= aa AND d@0 <= ab ELSE false END ] AND DynamicFilter [ e@1 IS NULL OR e@1 < bb ] " ); } @@ -1308,7 +1308,7 @@ async fn test_hashjoin_dynamic_filter_pushdown_partitioned() { - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1 - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 >= ab AND a@0 <= ab AND b@1 >= bb AND b@1 <= bb OR a@0 >= aa AND a@0 <= aa AND b@1 >= ba AND b@1 <= ba ] + 
- DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ CASE hash_repartition % 12 WHEN 2 THEN a@0 >= ab AND a@0 <= ab AND b@1 >= bb AND b@1 <= bb WHEN 4 THEN a@0 >= aa AND a@0 <= aa AND b@1 >= ba AND b@1 <= ba ELSE false END ] " ); @@ -1325,7 +1325,7 @@ async fn test_hashjoin_dynamic_filter_pushdown_partitioned() { - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1 - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 >= aa AND a@0 <= ab AND b@1 >= ba AND b@1 <= bb ] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ CASE hash_repartition % 12 WHEN 0 THEN a@0 >= aa AND a@0 <= ab AND b@1 >= ba AND b@1 <= bb ELSE false END ] " ); @@ -1670,8 +1670,8 @@ async fn test_nested_hashjoin_dynamic_filter_pushdown() { - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, b@0)] - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, x], file_type=test, pushdown_supported=true - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c@1, d@0)] - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[b, c, y], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ b@0 >= aa AND b@0 <= ab ] - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, z], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ d@0 >= ca AND d@0 <= cb ] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[b, c, y], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ CASE hash_repartition % 1 WHEN 0 THEN b@0 >= aa AND b@0 <= ab ELSE false END ] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, z], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ CASE hash_repartition % 1 WHEN 0 THEN d@0 >= ca AND d@0 <= cb ELSE false END ] " ); } @@ -2333,3 +2333,282 @@ fn test_pushdown_with_computed_grouping_key() { " ); } + +#[tokio::test] +async fn test_hashjoin_dynamic_filter_all_partitions_empty() { + use datafusion_common::JoinType; + use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode}; + + // Test scenario where all build-side partitions are empty + // This validates the code path that sets the filter to `false` when no rows can match + + // Create empty build side + let build_batches = vec![]; + let build_side_schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Utf8, false), + Field::new("b", DataType::Utf8, false), + ])); + let build_scan = TestScanBuilder::new(Arc::clone(&build_side_schema)) + .with_support(true) + .with_batches(build_batches) + .build(); + + // Create probe side with some data + let probe_batches = vec![record_batch!( + ("a", Utf8, ["aa", "ab", "ac"]), + ("b", Utf8, ["ba", "bb", "bc"]) + ) + .unwrap()]; + let probe_side_schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Utf8, false), + Field::new("b", DataType::Utf8, false), + ])); + let probe_scan = TestScanBuilder::new(Arc::clone(&probe_side_schema)) + .with_support(true) + .with_batches(probe_batches) + .build(); + + // Create RepartitionExec nodes for both sides + let partition_count = 4; + + 
let build_hash_exprs = vec![ + col("a", &build_side_schema).unwrap(), + col("b", &build_side_schema).unwrap(), + ]; + let build_repartition = Arc::new( + RepartitionExec::try_new( + build_scan, + Partitioning::Hash(build_hash_exprs, partition_count), + ) + .unwrap(), + ); + let build_coalesce = Arc::new(CoalesceBatchesExec::new(build_repartition, 8192)); + + let probe_hash_exprs = vec![ + col("a", &probe_side_schema).unwrap(), + col("b", &probe_side_schema).unwrap(), + ]; + let probe_repartition = Arc::new( + RepartitionExec::try_new( + Arc::clone(&probe_scan), + Partitioning::Hash(probe_hash_exprs, partition_count), + ) + .unwrap(), + ); + let probe_coalesce = Arc::new(CoalesceBatchesExec::new(probe_repartition, 8192)); + + // Create HashJoinExec + let on = vec![ + ( + col("a", &build_side_schema).unwrap(), + col("a", &probe_side_schema).unwrap(), + ), + ( + col("b", &build_side_schema).unwrap(), + col("b", &probe_side_schema).unwrap(), + ), + ]; + let hash_join = Arc::new( + HashJoinExec::try_new( + build_coalesce, + probe_coalesce, + on, + None, + &JoinType::Inner, + None, + PartitionMode::Partitioned, + datafusion_common::NullEquality::NullEqualsNothing, + ) + .unwrap(), + ); + + let plan = + Arc::new(CoalesceBatchesExec::new(hash_join, 8192)) as Arc; + + // Apply the filter pushdown optimizer + let mut config = SessionConfig::new(); + config.options_mut().execution.parquet.pushdown_filters = true; + let optimizer = FilterPushdown::new_post_optimization(); + let plan = optimizer.optimize(plan, config.options()).unwrap(); + + insta::assert_snapshot!( + format_plan_for_test(&plan), + @r" + - CoalesceBatchesExec: target_batch_size=8192 + - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)] + - CoalesceBatchesExec: target_batch_size=8192 + - RepartitionExec: partitioning=Hash([a@0, b@1], 4), input_partitions=1 + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b], file_type=test, pushdown_supported=true + - CoalesceBatchesExec: target_batch_size=8192 + - RepartitionExec: partitioning=Hash([a@0, b@1], 4), input_partitions=1 + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ] + " + ); + + // Put some data through the plan to check that the filter is updated to reflect the TopK state + let session_ctx = SessionContext::new_with_config(config); + session_ctx.register_object_store( + ObjectStoreUrl::parse("test://").unwrap().as_ref(), + Arc::new(InMemory::new()), + ); + let state = session_ctx.state(); + let task_ctx = state.task_ctx(); + // Execute all partitions (required for partitioned hash join coordination) + let _batches = collect(Arc::clone(&plan), Arc::clone(&task_ctx)) + .await + .unwrap(); + + // Test that filters are pushed down correctly to each side of the join + insta::assert_snapshot!( + format_plan_for_test(&plan), + @r" + - CoalesceBatchesExec: target_batch_size=8192 + - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)] + - CoalesceBatchesExec: target_batch_size=8192 + - RepartitionExec: partitioning=Hash([a@0, b@1], 4), input_partitions=1 + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b], file_type=test, pushdown_supported=true + - CoalesceBatchesExec: target_batch_size=8192 + - RepartitionExec: partitioning=Hash([a@0, b@1], 4), input_partitions=1 + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b], file_type=test, pushdown_supported=true, 
predicate=DynamicFilter [ false ] + " + ); +} + +#[tokio::test] +async fn test_hashjoin_dynamic_filter_with_nulls() { + use datafusion_common::JoinType; + use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode}; + + // Test scenario where build side has NULL values in join keys + // This validates NULL handling in bounds computation and filter generation + + // Create build side with NULL values + let build_batch = RecordBatch::try_new( + Arc::new(Schema::new(vec![ + Field::new("a", DataType::Utf8, true), // nullable + Field::new("b", DataType::Int32, true), // nullable + ])), + vec![ + Arc::new(StringArray::from(vec![Some("aa"), None, Some("ab")])), + Arc::new(Int32Array::from(vec![Some(1), Some(2), None])), + ], + ) + .unwrap(); + let build_batches = vec![build_batch]; + let build_side_schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Utf8, true), + Field::new("b", DataType::Int32, true), + ])); + let build_scan = TestScanBuilder::new(Arc::clone(&build_side_schema)) + .with_support(true) + .with_batches(build_batches) + .build(); + + // Create probe side with nullable fields + let probe_batch = RecordBatch::try_new( + Arc::new(Schema::new(vec![ + Field::new("a", DataType::Utf8, true), + Field::new("b", DataType::Int32, true), + Field::new("c", DataType::Float64, false), + ])), + vec![ + Arc::new(StringArray::from(vec![ + Some("aa"), + Some("ab"), + Some("ac"), + None, + ])), + Arc::new(Int32Array::from(vec![Some(1), Some(3), Some(4), Some(5)])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0])), + ], + ) + .unwrap(); + let probe_batches = vec![probe_batch]; + let probe_side_schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Utf8, true), + Field::new("b", DataType::Int32, true), + Field::new("c", DataType::Float64, false), + ])); + let probe_scan = TestScanBuilder::new(Arc::clone(&probe_side_schema)) + .with_support(true) + .with_batches(probe_batches) + .build(); + + // Create HashJoinExec in CollectLeft mode (simpler for this test) + let on = vec![ + ( + col("a", &build_side_schema).unwrap(), + col("a", &probe_side_schema).unwrap(), + ), + ( + col("b", &build_side_schema).unwrap(), + col("b", &probe_side_schema).unwrap(), + ), + ]; + let hash_join = Arc::new( + HashJoinExec::try_new( + build_scan, + Arc::clone(&probe_scan), + on, + None, + &JoinType::Inner, + None, + PartitionMode::CollectLeft, + datafusion_common::NullEquality::NullEqualsNothing, + ) + .unwrap(), + ); + + let plan = + Arc::new(CoalesceBatchesExec::new(hash_join, 8192)) as Arc; + + // Apply the filter pushdown optimizer + let mut config = SessionConfig::new(); + config.options_mut().execution.parquet.pushdown_filters = true; + let optimizer = FilterPushdown::new_post_optimization(); + let plan = optimizer.optimize(plan, config.options()).unwrap(); + + insta::assert_snapshot!( + format_plan_for_test(&plan), + @r" + - CoalesceBatchesExec: target_batch_size=8192 + - HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b], file_type=test, pushdown_supported=true + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ] + " + ); + + // Put some data through the plan to check that the filter is updated to reflect the TopK state + let session_ctx = SessionContext::new_with_config(config); + session_ctx.register_object_store( + ObjectStoreUrl::parse("test://").unwrap().as_ref(), + 
Arc::new(InMemory::new()), + ); + let state = session_ctx.state(); + let task_ctx = state.task_ctx(); + // Execute all partitions (required for partitioned hash join coordination) + let batches = collect(Arc::clone(&plan), Arc::clone(&task_ctx)) + .await + .unwrap(); + + // Test that filters are pushed down correctly to each side of the join + insta::assert_snapshot!( + format_plan_for_test(&plan), + @r" + - CoalesceBatchesExec: target_batch_size=8192 + - HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b], file_type=test, pushdown_supported=true + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 >= aa AND a@0 <= ab AND b@1 >= 1 AND b@1 <= 2 ] + " + ); + + #[rustfmt::skip] + let expected = [ + "+----+---+----+---+-----+", + "| a | b | a | b | c |", + "+----+---+----+---+-----+", + "| aa | 1 | aa | 1 | 1.0 |", + "+----+---+----+---+-----+", + ]; + assert_batches_eq!(&expected, &batches); +} diff --git a/datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs b/datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs index 7d8a9c7c2125c..a89e0d31b1e24 100644 --- a/datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs +++ b/datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs @@ -61,6 +61,9 @@ pub struct TestOpener { impl FileOpener for TestOpener { fn open(&self, _partitioned_file: PartitionedFile) -> Result { let mut batches = self.batches.clone(); + if self.batches.is_empty() { + return Ok((async { Ok(TestStream::new(vec![]).boxed()) }).boxed()); + } if let Some(batch_size) = self.batch_size { let batch = concat_batches(&batches[0].schema(), &batches)?; let mut new_batches = Vec::new(); @@ -335,11 +338,12 @@ impl TestStream { /// least one entry in data (for the schema) pub fn new(data: Vec) -> Self { // check that there is at least one entry in data and that all batches have the same schema - assert!(!data.is_empty(), "data must not be empty"); - assert!( - data.iter().all(|batch| batch.schema() == data[0].schema()), - "all batches must have the same schema" - ); + if let Some(first) = data.first() { + assert!( + data.iter().all(|batch| batch.schema() == first.schema()), + "all batches must have the same schema" + ); + } Self { data, ..Default::default() diff --git a/datafusion/physical-plan/src/joins/hash_join/exec.rs b/datafusion/physical-plan/src/joins/hash_join/exec.rs index 55bdc16306bc7..c717f262a5121 100644 --- a/datafusion/physical-plan/src/joins/hash_join/exec.rs +++ b/datafusion/physical-plan/src/joins/hash_join/exec.rs @@ -26,7 +26,9 @@ use crate::filter_pushdown::{ ChildPushdownResult, FilterDescription, FilterPushdownPhase, FilterPushdownPropagation, }; -use crate::joins::hash_join::shared_bounds::{ColumnBounds, SharedBoundsAccumulator}; +use crate::joins::hash_join::shared_bounds::{ + ColumnBounds, PartitionBounds, SharedBuildAccumulator, +}; use crate::joins::hash_join::stream::{ BuildSide, BuildSideInitialState, HashJoinStream, HashJoinStreamState, }; @@ -40,6 +42,7 @@ use crate::projection::{ try_embed_projection, try_pushdown_through_join, EmbeddedProjection, JoinData, ProjectionExec, }; +use crate::repartition::REPARTITION_RANDOM_STATE; use crate::spill::get_record_batch_memory_size; use crate::ExecutionPlanProperties; use crate::{ @@ -88,7 +91,8 @@ const HASH_JOIN_SEED: RandomState = /// HashTable and input data for the left 
(build side) of a join pub(super) struct JoinLeftData { /// The hash table with indices into `batch` - pub(super) hash_map: Box, + /// Arc is used to allow sharing with SharedBuildAccumulator for hash map pushdown + pub(super) hash_map: Arc, /// The input rows for the build side batch: RecordBatch, /// The build side on expressions values @@ -103,32 +107,13 @@ pub(super) struct JoinLeftData { /// This could hide potential out-of-memory issues, especially when upstream operators increase their memory consumption. /// The MemoryReservation ensures proper tracking of memory resources throughout the join operation's lifecycle. _reservation: MemoryReservation, - /// Bounds computed from the build side for dynamic filter pushdown - pub(super) bounds: Option>, + /// Bounds computed from the build side for dynamic filter pushdown. + /// If the partition is empty (no rows) this will be None. + /// If the partition has some rows this will be Some with the bounds for each join key column. + pub(super) bounds: Option, } impl JoinLeftData { - /// Create a new `JoinLeftData` from its parts - pub(super) fn new( - hash_map: Box, - batch: RecordBatch, - values: Vec, - visited_indices_bitmap: SharedBitmapBuilder, - probe_threads_counter: AtomicUsize, - reservation: MemoryReservation, - bounds: Option>, - ) -> Self { - Self { - hash_map, - batch, - values, - visited_indices_bitmap, - probe_threads_counter, - _reservation: reservation, - bounds, - } - } - /// return a reference to the hash map pub(super) fn hash_map(&self) -> &dyn JoinHashMapType { &*self.hash_map @@ -363,9 +348,9 @@ pub struct HashJoinExec { struct HashJoinExecDynamicFilter { /// Dynamic filter that we'll update with the results of the build side once that is done. filter: Arc, - /// Bounds accumulator to keep track of the min/max bounds on the join keys for each partition. + /// Build accumulator to collect build-side information (hash maps and/or bounds) from each partition. /// It is lazily initialized during execution to make sure we use the actual execution time partition counts. 
- bounds_accumulator: OnceLock>, + build_accumulator: OnceLock>, } impl fmt::Debug for HashJoinExec { @@ -976,8 +961,10 @@ impl ExecutionPlan for HashJoinExec { let batch_size = context.session_config().batch_size(); - // Initialize bounds_accumulator lazily with runtime partition counts (only if enabled) - let bounds_accumulator = enable_dynamic_filter_pushdown + // Initialize build_accumulator lazily with runtime partition counts (only if enabled) + // Use RepartitionExec's random state (seeds: 0,0,0,0) for partition routing + let repartition_random_state = REPARTITION_RANDOM_STATE; + let build_accumulator = enable_dynamic_filter_pushdown .then(|| { self.dynamic_filter.as_ref().map(|df| { let filter = Arc::clone(&df.filter); @@ -986,13 +973,14 @@ impl ExecutionPlan for HashJoinExec { .iter() .map(|(_, right_expr)| Arc::clone(right_expr)) .collect::>(); - Some(Arc::clone(df.bounds_accumulator.get_or_init(|| { - Arc::new(SharedBoundsAccumulator::new_from_partition_mode( + Some(Arc::clone(df.build_accumulator.get_or_init(|| { + Arc::new(SharedBuildAccumulator::new_from_partition_mode( self.mode, self.left.as_ref(), self.right.as_ref(), filter, on_right, + repartition_random_state, )) }))) }) @@ -1035,7 +1023,7 @@ impl ExecutionPlan for HashJoinExec { batch_size, vec![], self.right.output_ordering().is_some(), - bounds_accumulator, + build_accumulator, self.mode, ))) } @@ -1196,7 +1184,7 @@ impl ExecutionPlan for HashJoinExec { cache: self.cache.clone(), dynamic_filter: Some(HashJoinExecDynamicFilter { filter: dynamic_filter, - bounds_accumulator: OnceLock::new(), + build_accumulator: OnceLock::new(), }), }); result = result.with_updated_node(new_node as Arc); @@ -1302,14 +1290,14 @@ impl BuildSideState { reservation: MemoryReservation, on_left: Vec>, schema: &SchemaRef, - should_compute_bounds: bool, + should_compute_dynamic_filters: bool, ) -> Result { Ok(Self { batches: Vec::new(), num_rows: 0, metrics, reservation, - bounds_accumulators: should_compute_bounds + bounds_accumulators: should_compute_dynamic_filters .then(|| { on_left .iter() @@ -1339,13 +1327,13 @@ impl BuildSideState { /// * `reservation` - Memory reservation tracker for the hash table and data /// * `with_visited_indices_bitmap` - Whether to track visited indices (for outer joins) /// * `probe_threads_count` - Number of threads that will probe this hash table -/// * `should_compute_bounds` - Whether to compute min/max bounds for dynamic filtering +/// * `should_compute_dynamic_filters` - Whether to compute min/max bounds for dynamic filtering /// /// # Dynamic Filter Coordination -/// When `should_compute_bounds` is true, this function computes the min/max bounds +/// When `should_compute_dynamic_filters` is true, this function computes the min/max bounds /// for each join key column but does NOT update the dynamic filter. Instead, the /// bounds are stored in the returned `JoinLeftData` and later coordinated by -/// `SharedBoundsAccumulator` to ensure all partitions contribute their bounds +/// `SharedBuildAccumulator` to ensure all partitions contribute their bounds /// before updating the filter exactly once. 
/// /// # Returns @@ -1360,7 +1348,7 @@ async fn collect_left_input( reservation: MemoryReservation, with_visited_indices_bitmap: bool, probe_threads_count: usize, - should_compute_bounds: bool, + should_compute_dynamic_filters: bool, ) -> Result { let schema = left_stream.schema(); @@ -1372,7 +1360,7 @@ async fn collect_left_input( reservation, on_left.clone(), &schema, - should_compute_bounds, + should_compute_dynamic_filters, )?; let state = left_stream @@ -1416,6 +1404,7 @@ async fn collect_left_input( // Use `u32` indices for the JoinHashMap when num_rows ≤ u32::MAX, otherwise use the // `u64` indice variant + // Arc is used instead of Box to allow sharing with SharedBuildAccumulator for hash map pushdown let mut hashmap: Box = if num_rows > u32::MAX as usize { let estimated_hashtable_size = estimate_memory_size::<(u64, u64)>(num_rows, fixed_size_u64)?; @@ -1451,22 +1440,22 @@ async fn collect_left_input( offset += batch.num_rows(); } // Merge all batches into a single batch, so we can directly index into the arrays - let single_batch = concat_batches(&schema, batches_iter)?; + let batch = concat_batches(&schema, batches_iter)?; // Reserve additional memory for visited indices bitmap and create shared builder let visited_indices_bitmap = if with_visited_indices_bitmap { - let bitmap_size = bit_util::ceil(single_batch.num_rows(), 8); + let bitmap_size = bit_util::ceil(batch.num_rows(), 8); reservation.try_grow(bitmap_size)?; metrics.build_mem_used.add(bitmap_size); - let mut bitmap_buffer = BooleanBufferBuilder::new(single_batch.num_rows()); + let mut bitmap_buffer = BooleanBufferBuilder::new(batch.num_rows()); bitmap_buffer.append_n(num_rows, false); bitmap_buffer } else { BooleanBufferBuilder::new(0) }; - let left_values = evaluate_expressions_to_arrays(&on_left, &single_batch)?; + let left_values = evaluate_expressions_to_arrays(&on_left, &batch)?; // Compute bounds for dynamic filter if enabled let bounds = match bounds_accumulators { @@ -1475,20 +1464,23 @@ async fn collect_left_input( .into_iter() .map(CollectLeftAccumulator::evaluate) .collect::>>()?; - Some(bounds) + Some(PartitionBounds::new(bounds)) } _ => None, }; - let data = JoinLeftData::new( - hashmap, - single_batch, - left_values.clone(), - Mutex::new(visited_indices_bitmap), - AtomicUsize::new(probe_threads_count), - reservation, + // Convert Box to Arc for sharing with SharedBuildAccumulator + let hash_map: Arc = hashmap.into(); + + let data = JoinLeftData { + hash_map, + batch, + values: left_values, + visited_indices_bitmap: Mutex::new(visited_indices_bitmap), + probe_threads_counter: AtomicUsize::new(probe_threads_count), + _reservation: reservation, bounds, - ); + }; Ok(data) } diff --git a/datafusion/physical-plan/src/joins/hash_join/mod.rs b/datafusion/physical-plan/src/joins/hash_join/mod.rs index 7f1e5cae13a3e..6c073e7a9cff5 100644 --- a/datafusion/physical-plan/src/joins/hash_join/mod.rs +++ b/datafusion/physical-plan/src/joins/hash_join/mod.rs @@ -20,5 +20,6 @@ pub use exec::HashJoinExec; mod exec; +mod partitioned_hash_eval; mod shared_bounds; mod stream; diff --git a/datafusion/physical-plan/src/joins/hash_join/partitioned_hash_eval.rs b/datafusion/physical-plan/src/joins/hash_join/partitioned_hash_eval.rs new file mode 100644 index 0000000000000..527642ade07e1 --- /dev/null +++ b/datafusion/physical-plan/src/joins/hash_join/partitioned_hash_eval.rs @@ -0,0 +1,158 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Hash computation and hash table lookup expressions for dynamic filtering + +use std::{any::Any, fmt::Display, hash::Hash, sync::Arc}; + +use ahash::RandomState; +use arrow::{ + array::UInt64Array, + datatypes::{DataType, Schema}, +}; +use datafusion_common::Result; +use datafusion_expr::ColumnarValue; +use datafusion_physical_expr_common::physical_expr::{ + DynHash, PhysicalExpr, PhysicalExprRef, +}; + +use crate::hash_utils::create_hashes; + +/// Physical expression that computes hash values for a set of columns +/// +/// This expression computes the hash of join key columns using a specific RandomState. +/// It returns a UInt64Array containing the hash values. +/// +/// This is used for: +/// - Computing routing hashes (with RepartitionExec's 0,0,0,0 seeds) +/// - Computing lookup hashes (with HashJoin's 'J','O','I','N' seeds) +pub(super) struct HashExpr { + /// Columns to hash + on_columns: Vec, + /// Random state for hashing + random_state: RandomState, + /// Description for display + description: String, +} + +impl HashExpr { + /// Create a new HashExpr + /// + /// # Arguments + /// * `on_columns` - Columns to hash + /// * `random_state` - RandomState for hashing + /// * `description` - Description for debugging (e.g., "hash_repartition", "hash_join") + pub(super) fn new( + on_columns: Vec, + random_state: RandomState, + description: String, + ) -> Self { + Self { + on_columns, + random_state, + description, + } + } +} + +impl std::fmt::Debug for HashExpr { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let cols = self + .on_columns + .iter() + .map(|e| e.to_string()) + .collect::>() + .join(", "); + write!(f, "{}({})", self.description, cols) + } +} + +impl Hash for HashExpr { + fn hash(&self, state: &mut H) { + self.on_columns.dyn_hash(state); + self.description.hash(state); + } +} + +impl PartialEq for HashExpr { + fn eq(&self, other: &Self) -> bool { + self.on_columns == other.on_columns && self.description == other.description + } +} + +impl Eq for HashExpr {} + +impl Display for HashExpr { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.description) + } +} + +impl PhysicalExpr for HashExpr { + fn as_any(&self) -> &dyn Any { + self + } + + fn children(&self) -> Vec<&Arc> { + self.on_columns.iter().collect() + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> Result> { + Ok(Arc::new(HashExpr::new( + children, + self.random_state.clone(), + self.description.clone(), + ))) + } + + fn data_type(&self, _input_schema: &Schema) -> Result { + Ok(DataType::UInt64) + } + + fn nullable(&self, _input_schema: &Schema) -> Result { + Ok(false) + } + + fn evaluate( + &self, + batch: &arrow::record_batch::RecordBatch, + ) -> Result { + let num_rows = batch.num_rows(); + + // Evaluate columns + let keys_values = 
self + .on_columns + .iter() + .map(|c| c.evaluate(batch)?.into_array(num_rows)) + .collect::>>()?; + + // Compute hashes + let mut hashes_buffer = vec![0; num_rows]; + create_hashes(&keys_values, &self.random_state, &mut hashes_buffer)?; + + Ok(ColumnarValue::Array(Arc::new(UInt64Array::from( + hashes_buffer, + )))) + } + + fn fmt_sql(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.description) + } +} diff --git a/datafusion/physical-plan/src/joins/hash_join/shared_bounds.rs b/datafusion/physical-plan/src/joins/hash_join/shared_bounds.rs index e83f89e4e5d78..cb727f40a20a2 100644 --- a/datafusion/physical-plan/src/joins/hash_join/shared_bounds.rs +++ b/datafusion/physical-plan/src/joins/hash_join/shared_bounds.rs @@ -15,22 +15,25 @@ // specific language governing permissions and limitations // under the License. -//! Utilities for shared bounds. Used in dynamic filter pushdown in Hash Joins. +//! Utilities for shared build-side information. Used in dynamic filter pushdown in Hash Joins. // TODO: include the link to the Dynamic Filter blog post. use std::fmt; use std::sync::Arc; +use crate::joins::hash_join::partitioned_hash_eval::HashExpr; use crate::joins::PartitionMode; use crate::ExecutionPlan; use crate::ExecutionPlanProperties; +use ahash::RandomState; use datafusion_common::{Result, ScalarValue}; use datafusion_expr::Operator; -use datafusion_physical_expr::expressions::{lit, BinaryExpr, DynamicFilterPhysicalExpr}; +use datafusion_physical_expr::expressions::{ + lit, BinaryExpr, CaseExpr, DynamicFilterPhysicalExpr, +}; use datafusion_physical_expr::{PhysicalExpr, PhysicalExprRef}; -use itertools::Itertools; use parking_lot::Mutex; use tokio::sync::Barrier; @@ -54,23 +57,14 @@ impl ColumnBounds { /// This contains the min/max values computed from one partition's build-side data. #[derive(Debug, Clone)] pub(crate) struct PartitionBounds { - /// Partition identifier for debugging and determinism (not strictly necessary) - partition: usize, /// Min/max bounds for each join key column in this partition. /// Index corresponds to the join key expression index. column_bounds: Vec, } impl PartitionBounds { - pub(crate) fn new(partition: usize, column_bounds: Vec) -> Self { - Self { - partition, - column_bounds, - } - } - - pub(crate) fn len(&self) -> usize { - self.column_bounds.len() + pub(crate) fn new(column_bounds: Vec) -> Self { + Self { column_bounds } } pub(crate) fn get_column_bounds(&self, index: usize) -> Option<&ColumnBounds> { @@ -78,18 +72,70 @@ impl PartitionBounds { } } -/// Coordinates dynamic filter bounds collection across multiple partitions +/// Creates a bounds predicate from partition bounds. +/// +/// Returns a bound predicate (col >= min AND col <= max) for all key columns in the ON expression that have computed bounds from the build phase. +/// +/// Returns `None` if no column bounds are available. 
+fn create_bounds_predicate( + on_right: &[PhysicalExprRef], + bounds: &PartitionBounds, +) -> Option> { + let mut column_predicates = Vec::new(); + + for (col_idx, right_expr) in on_right.iter().enumerate() { + if let Some(column_bounds) = bounds.get_column_bounds(col_idx) { + // Create predicate: col >= min AND col <= max + let min_expr = Arc::new(BinaryExpr::new( + Arc::clone(right_expr), + Operator::GtEq, + lit(column_bounds.min.clone()), + )) as Arc; + let max_expr = Arc::new(BinaryExpr::new( + Arc::clone(right_expr), + Operator::LtEq, + lit(column_bounds.max.clone()), + )) as Arc; + let range_expr = Arc::new(BinaryExpr::new(min_expr, Operator::And, max_expr)) + as Arc; + column_predicates.push(range_expr); + } + } + + if column_predicates.is_empty() { + None + } else { + Some( + column_predicates + .into_iter() + .reduce(|acc, pred| { + Arc::new(BinaryExpr::new(acc, Operator::And, pred)) + as Arc + }) + .unwrap(), + ) + } +} + +/// Coordinates build-side information collection across multiple partitions /// -/// This structure ensures that dynamic filters are built with complete information from all -/// relevant partitions before being applied to probe-side scans. Incomplete filters would +/// This structure collects information from the build side (hash tables and/or bounds) and +/// ensures that dynamic filters are built with complete information from all relevant +/// partitions before being applied to probe-side scans. Incomplete filters would /// incorrectly eliminate valid join results. /// /// ## Synchronization Strategy /// -/// 1. Each partition computes bounds from its build-side data -/// 2. Bounds are stored in the shared vector -/// 3. A barrier tracks how many partitions have reported their bounds -/// 4. When the last partition reports, bounds are merged and the filter is updated exactly once +/// 1. Each partition computes information from its build-side data (hash maps and/or bounds) +/// 2. Information is stored in the shared state +/// 3. A barrier tracks how many partitions have reported +/// 4. When the last partition reports, information is merged and the filter is updated exactly once +/// +/// ## Hash Map vs Bounds +/// +/// - **Hash Maps (Partitioned mode)**: Collects Arc references to hash tables from each partition. +/// Creates a `PartitionedHashLookupPhysicalExpr` that routes rows to the correct partition's hash table. +/// - **Bounds (CollectLeft mode)**: Collects min/max bounds and creates range predicates. /// /// ## Partition Counting /// @@ -101,25 +147,57 @@ impl PartitionBounds { /// /// All fields use a single mutex to ensure correct coordination between concurrent /// partition executions. -pub(crate) struct SharedBoundsAccumulator { - /// Shared state protected by a single mutex to avoid ordering concerns - inner: Mutex, +pub(crate) struct SharedBuildAccumulator { + /// Build-side data protected by a single mutex to avoid ordering concerns + inner: Mutex, barrier: Barrier, /// Dynamic filter for pushdown to probe side dynamic_filter: Arc, - /// Right side join expressions needed for creating filter bounds + /// Right side join expressions needed for creating filter expressions on_right: Vec, + /// Random state for partitioning (RepartitionExec's hash function with 0,0,0,0 seeds) + /// Used for PartitionedHashLookupPhysicalExpr + repartition_random_state: RandomState, +} + +#[derive(Clone)] +pub(crate) enum PartitionBuildDataReport { + Partitioned { + partition_id: usize, + /// Bounds computed from this partition's build side. 
+ /// If the partition is empty (no rows) this will be None. + bounds: Option, + }, + CollectLeft { + /// Bounds computed from the collected build side. + /// If the build side is empty (no rows) this will be None. + bounds: Option, + }, +} + +#[derive(Clone)] +struct PartitionedBuildData { + partition_id: usize, + bounds: PartitionBounds, +} + +#[derive(Clone)] +struct CollectLeftBuildData { + bounds: PartitionBounds, } -/// State protected by SharedBoundsAccumulator's mutex -struct SharedBoundsState { - /// Bounds from completed partitions. - /// Each element represents the column bounds computed by one partition. - bounds: Vec, +/// Build-side data organized by partition mode +enum AccumulatedBuildData { + Partitioned { + partitions: Vec>, + }, + CollectLeft { + data: Option, + }, } -impl SharedBoundsAccumulator { - /// Creates a new SharedBoundsAccumulator configured for the given partition mode +impl SharedBuildAccumulator { + /// Creates a new SharedBuildAccumulator configured for the given partition mode /// /// This method calculates how many times `collect_build_side` will be called based on the /// partition mode's execution pattern. This count is critical for determining when we have @@ -137,12 +215,12 @@ impl SharedBoundsAccumulator { /// `collect_build_side` once. Expected calls = number of build partitions. /// /// - **Auto**: Placeholder mode resolved during optimization. Uses 1 as safe default since - /// the actual mode will be determined and a new bounds_accumulator created before execution. + /// the actual mode will be determined and a new accumulator created before execution. /// /// ## Why This Matters /// /// We cannot build a partial filter from some partitions - it would incorrectly eliminate - /// valid join results. We must wait until we have complete bounds information from ALL + /// valid join results. We must wait until we have complete information from ALL /// relevant partitions before updating the dynamic filter. pub(crate) fn new_from_partition_mode( partition_mode: PartitionMode, @@ -150,6 +228,7 @@ impl SharedBoundsAccumulator { right_child: &dyn ExecutionPlan, dynamic_filter: Arc, on_right: Vec, + repartition_random_state: RandomState, ) -> Self { // Troubleshooting: If partition counts are incorrect, verify this logic matches // the actual execution pattern in collect_build_side() @@ -165,140 +244,171 @@ impl SharedBoundsAccumulator { // Default value, will be resolved during optimization (does not exist once `execute()` is called; will be replaced by one of the other two) PartitionMode::Auto => unreachable!("PartitionMode::Auto should not be present at execution time. This is a bug in DataFusion, please report it!"), }; + + let mode_data = match partition_mode { + PartitionMode::Partitioned => AccumulatedBuildData::Partitioned { + partitions: vec![None; left_child.output_partitioning().partition_count()], + }, + PartitionMode::CollectLeft => AccumulatedBuildData::CollectLeft { + data: None, + }, + PartitionMode::Auto => unreachable!("PartitionMode::Auto should not be present at execution time. This is a bug in DataFusion, please report it!"), + }; + Self { - inner: Mutex::new(SharedBoundsState { - bounds: Vec::with_capacity(expected_calls), - }), + inner: Mutex::new(mode_data), barrier: Barrier::new(expected_calls), dynamic_filter, on_right, + repartition_random_state, } } - /// Create a filter expression from individual partition bounds using OR logic. 
- /// - /// This creates a filter where each partition's bounds form a conjunction (AND) - /// of column range predicates, and all partitions are combined with OR. - /// - /// For example, with 2 partitions and 2 columns: - /// ((col0 >= p0_min0 AND col0 <= p0_max0 AND col1 >= p0_min1 AND col1 <= p0_max1) - /// OR - /// (col0 >= p1_min0 AND col0 <= p1_max0 AND col1 >= p1_min1 AND col1 <= p1_max1)) - pub(crate) fn create_filter_from_partition_bounds( - &self, - bounds: &[PartitionBounds], - ) -> Result> { - if bounds.is_empty() { - return Ok(lit(true)); - } - - // Create a predicate for each partition - let mut partition_predicates = Vec::with_capacity(bounds.len()); - - for partition_bounds in bounds.iter().sorted_by_key(|b| b.partition) { - // Create range predicates for each join key in this partition - let mut column_predicates = Vec::with_capacity(partition_bounds.len()); - - for (col_idx, right_expr) in self.on_right.iter().enumerate() { - if let Some(column_bounds) = partition_bounds.get_column_bounds(col_idx) { - // Create predicate: col >= min AND col <= max - let min_expr = Arc::new(BinaryExpr::new( - Arc::clone(right_expr), - Operator::GtEq, - lit(column_bounds.min.clone()), - )) as Arc; - let max_expr = Arc::new(BinaryExpr::new( - Arc::clone(right_expr), - Operator::LtEq, - lit(column_bounds.max.clone()), - )) as Arc; - let range_expr = - Arc::new(BinaryExpr::new(min_expr, Operator::And, max_expr)) - as Arc; - column_predicates.push(range_expr); - } - } - - // Combine all column predicates for this partition with AND - if !column_predicates.is_empty() { - let partition_predicate = column_predicates - .into_iter() - .reduce(|acc, pred| { - Arc::new(BinaryExpr::new(acc, Operator::And, pred)) - as Arc - }) - .unwrap(); - partition_predicates.push(partition_predicate); - } - } - - // Combine all partition predicates with OR - let combined_predicate = partition_predicates - .into_iter() - .reduce(|acc, pred| { - Arc::new(BinaryExpr::new(acc, Operator::Or, pred)) - as Arc - }) - .unwrap_or_else(|| lit(true)); - - Ok(combined_predicate) - } - - /// Report bounds from a completed partition and update dynamic filter if all partitions are done - /// - /// This method coordinates the dynamic filter updates across all partitions. It stores the - /// bounds from the current partition, increments the completion counter, and when all - /// partitions have reported, creates an OR'd filter from individual partition bounds. + /// Report build-side data from a partition /// - /// This method is async and uses a [`tokio::sync::Barrier`] to wait for all partitions - /// to report their bounds. Once that occurs, the method will resolve for all callers and the - /// dynamic filter will be updated exactly once. - /// - /// # Note - /// - /// As barriers are reusable, it is likely an error to call this method more times than the - /// total number of partitions - as it can lead to pending futures that never resolve. We rely - /// on correct usage from the caller rather than imposing additional checks here. If this is a concern, - /// consider making the resulting future shared so the ready result can be reused. + /// This unified method handles both CollectLeft and Partitioned modes. 
When all partitions + /// have reported (barrier wait), the leader builds the appropriate filter expression: + /// - CollectLeft: Simple conjunction of bounds and membership check + /// - Partitioned: CASE expression routing to per-partition filters /// /// # Arguments - /// * `left_side_partition_id` - The identifier for the **left-side** partition reporting its bounds - /// * `partition_bounds` - The bounds computed by this partition (if any) + /// * `data` - Build data including hash map, pushdown strategy, and bounds /// /// # Returns - /// * `Result<()>` - Ok if successful, Err if filter update failed - pub(crate) async fn report_partition_bounds( + /// * `Result<()>` - Ok if successful, Err if filter update failed or mode mismatch + pub(crate) async fn report_build_data( &self, - left_side_partition_id: usize, - partition_bounds: Option>, + data: PartitionBuildDataReport, ) -> Result<()> { - // Store bounds in the accumulator - this runs once per partition - if let Some(bounds) = partition_bounds { + // Store data in the accumulator + { let mut guard = self.inner.lock(); - let should_push = if let Some(last_bound) = guard.bounds.last() { - // In `PartitionMode::CollectLeft`, all streams on the left side share the same partition id (0). - // Since this function can be called multiple times for that same partition, we must deduplicate - // by checking against the last recorded bound. - last_bound.partition != left_side_partition_id - } else { - true - }; - - if should_push { - guard - .bounds - .push(PartitionBounds::new(left_side_partition_id, bounds)); + match (data, &mut *guard) { + // Partitioned mode + ( + PartitionBuildDataReport::Partitioned { + partition_id, + bounds, + }, + AccumulatedBuildData::Partitioned { partitions }, + ) => { + if let Some(bounds) = bounds { + partitions[partition_id] = Some(PartitionedBuildData { + partition_id, + bounds, + }); + } + } + // CollectLeft mode (store once, deduplicate across partitions) + ( + PartitionBuildDataReport::CollectLeft { bounds }, + AccumulatedBuildData::CollectLeft { data }, + ) => { + match (bounds, data) { + (None, _) | (_, Some(_)) => { + // No bounds reported or already reported; do nothing + } + (Some(new_bounds), data) => { + // First report, store the bounds + *data = Some(CollectLeftBuildData { bounds: new_bounds }); + } + } + } + // Mismatched modes - should never happen + _ => { + return datafusion_common::internal_err!( + "Build data mode mismatch in report_build_data" + ); + } } } + // Wait for all partitions to report if self.barrier.wait().await.is_leader() { - // All partitions have reported, so we can update the filter + // All partitions have reported, so we can create and update the filter let inner = self.inner.lock(); - if !inner.bounds.is_empty() { - let filter_expr = - self.create_filter_from_partition_bounds(&inner.bounds)?; - self.dynamic_filter.update(filter_expr)?; + + match &*inner { + // CollectLeft: Simple conjunction of bounds and membership check + AccumulatedBuildData::CollectLeft { data } => { + if let Some(partition_data) = data { + // Create bounds check expression (if bounds available) + let Some(filter_expr) = create_bounds_predicate( + &self.on_right, + &partition_data.bounds, + ) else { + // No bounds available, nothing to update + return Ok(()); + }; + + self.dynamic_filter.update(filter_expr)?; + } + } + // Partitioned: CASE expression routing to per-partition filters + AccumulatedBuildData::Partitioned { partitions } => { + // Collect all partition data, skipping empty partitions + let 
partition_data: Vec<_> = + partitions.iter().filter_map(|p| p.as_ref()).collect(); + + if partition_data.is_empty() { + // All partitions are empty: no rows can match, skip the probe side entirely + self.dynamic_filter.update(lit(false))?; + return Ok(()); + } + + // Build a CASE expression that combines range checks AND membership checks + // CASE (hash_repartition(join_keys) % num_partitions) + // WHEN 0 THEN (col >= min_0 AND col <= max_0 AND ...) + // WHEN 1 THEN (col >= min_1 AND col <= max_1 AND ...) + // ... + // ELSE false + // END + + let num_partitions = partitions.len(); + + // Create base expression: hash_repartition(join_keys) % num_partitions + let routing_hash_expr = Arc::new(HashExpr::new( + self.on_right.clone(), + self.repartition_random_state.clone(), + "hash_repartition".to_string(), + )) + as Arc; + + let modulo_expr = Arc::new(BinaryExpr::new( + routing_hash_expr, + Operator::Modulo, + lit(ScalarValue::UInt64(Some(num_partitions as u64))), + )) as Arc; + + // Create WHEN branches for each partition + let when_then_branches: Vec<( + Arc, + Arc, + )> = partition_data + .into_iter() + .map(|pdata| -> Result<_> { + // WHEN partition_id + let when_expr = + lit(ScalarValue::UInt64(Some(pdata.partition_id as u64))); + + // Create bounds check expression for this partition (if bounds available) + let bounds_expr = + create_bounds_predicate(&self.on_right, &pdata.bounds) + .unwrap_or_else(|| lit(true)); // No bounds means all rows pass + + Ok((when_expr, bounds_expr)) + }) + .collect::>>()?; + + let case_expr = Arc::new(CaseExpr::try_new( + Some(modulo_expr), + when_then_branches, + Some(lit(false)), // ELSE false + )?) as Arc; + + self.dynamic_filter.update(case_expr)?; + } } self.dynamic_filter.mark_complete(); } @@ -307,8 +417,8 @@ impl SharedBoundsAccumulator { } } -impl fmt::Debug for SharedBoundsAccumulator { +impl fmt::Debug for SharedBuildAccumulator { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "SharedBoundsAccumulator") + write!(f, "SharedBuildAccumulator") } } diff --git a/datafusion/physical-plan/src/joins/hash_join/stream.rs b/datafusion/physical-plan/src/joins/hash_join/stream.rs index ced9ef564e142..ea6584c61dcd2 100644 --- a/datafusion/physical-plan/src/joins/hash_join/stream.rs +++ b/datafusion/physical-plan/src/joins/hash_join/stream.rs @@ -24,7 +24,9 @@ use std::sync::Arc; use std::task::Poll; use crate::joins::hash_join::exec::JoinLeftData; -use crate::joins::hash_join::shared_bounds::SharedBoundsAccumulator; +use crate::joins::hash_join::shared_bounds::{ + PartitionBuildDataReport, SharedBuildAccumulator, +}; use crate::joins::utils::{ equal_rows_arr, get_final_indices_from_shared_bitmap, OnceFut, }; @@ -206,11 +208,11 @@ pub(super) struct HashJoinStream { hashes_buffer: Vec, /// Specifies whether the right side has an ordering to potentially preserve right_side_ordered: bool, - /// Shared bounds accumulator for coordinating dynamic filter updates (optional) - bounds_accumulator: Option>, - /// Optional future to signal when bounds have been reported by all partitions + /// Shared build accumulator for coordinating dynamic filter updates (collects hash maps and/or bounds, optional) + build_accumulator: Option>, + /// Optional future to signal when build information has been reported by all partitions /// and the dynamic filter has been updated - bounds_waiter: Option>, + build_waiter: Option>, /// Partitioning mode to use mode: PartitionMode, @@ -315,7 +317,7 @@ impl HashJoinStream { batch_size: usize, hashes_buffer: Vec, 
right_side_ordered: bool, - bounds_accumulator: Option>, + build_accumulator: Option>, mode: PartitionMode, ) -> Self { Self { @@ -334,8 +336,8 @@ impl HashJoinStream { batch_size, hashes_buffer, right_side_ordered, - bounds_accumulator, - bounds_waiter: None, + build_accumulator, + build_waiter: None, mode, } } @@ -370,12 +372,12 @@ impl HashJoinStream { } } - /// Optional step to wait until bounds have been reported by all partitions. - /// This state is only entered if a bounds accumulator is present. + /// Optional step to wait until build-side information (hash maps or bounds) has been reported by all partitions. + /// This state is only entered if a build accumulator is present. /// /// ## Why wait? /// - /// The dynamic filter is only built once all partitions have reported their bounds. + /// The dynamic filter is only built once all partitions have reported their information (hash maps or bounds). /// If we do not wait here, the probe-side scan may start before the filter is ready. /// This can lead to the probe-side scan missing the opportunity to apply the filter /// and skip reading unnecessary data. @@ -383,7 +385,7 @@ impl HashJoinStream { &mut self, cx: &mut std::task::Context<'_>, ) -> Poll>>> { - if let Some(ref mut fut) = self.bounds_waiter { + if let Some(ref mut fut) = self.build_waiter { ready!(fut.get_shared(cx))?; } self.state = HashJoinStreamState::FetchProbeBatch; @@ -406,12 +408,13 @@ impl HashJoinStream { .get_shared(cx))?; build_timer.done(); - // Handle dynamic filter bounds accumulation + // Handle dynamic filter build-side information accumulation // // Dynamic filter coordination between partitions: - // Report bounds to the accumulator which will handle synchronization and filter updates - if let Some(ref bounds_accumulator) = self.bounds_accumulator { - let bounds_accumulator = Arc::clone(bounds_accumulator); + // Report hash maps (Partitioned mode) or bounds (CollectLeft mode) to the accumulator + // which will handle synchronization and filter updates + if let Some(ref build_accumulator) = self.build_accumulator { + let build_accumulator = Arc::clone(build_accumulator); let left_side_partition_id = match self.mode { PartitionMode::Partitioned => self.partition, @@ -419,11 +422,20 @@ impl HashJoinStream { PartitionMode::Auto => unreachable!("PartitionMode::Auto should not be present at execution time. 
This is a bug in DataFusion, please report it!"), }; - let left_data_bounds = left_data.bounds.clone(); - self.bounds_waiter = Some(OnceFut::new(async move { - bounds_accumulator - .report_partition_bounds(left_side_partition_id, left_data_bounds) - .await + let build_data = match self.mode { + PartitionMode::Partitioned => PartitionBuildDataReport::Partitioned { + partition_id: left_side_partition_id, + bounds: left_data.bounds.clone(), + }, + PartitionMode::CollectLeft => PartitionBuildDataReport::CollectLeft { + bounds: left_data.bounds.clone(), + }, + PartitionMode::Auto => unreachable!( + "PartitionMode::Auto should not be present at execution time" + ), + }; + self.build_waiter = Some(OnceFut::new(async move { + build_accumulator.report_build_data(build_data).await })); self.state = HashJoinStreamState::WaitPartitionBoundsReport; } else { diff --git a/datafusion/physical-plan/src/repartition/mod.rs b/datafusion/physical-plan/src/repartition/mod.rs index 46fba6c520f47..09ac0e58d3ff1 100644 --- a/datafusion/physical-plan/src/repartition/mod.rs +++ b/datafusion/physical-plan/src/repartition/mod.rs @@ -407,7 +407,6 @@ pub struct BatchPartitioner { enum BatchPartitionerState { Hash { - random_state: ahash::RandomState, exprs: Vec>, num_partitions: usize, hash_buffer: Vec, @@ -418,6 +417,11 @@ enum BatchPartitionerState { }, } +/// Fixed RandomState used for hash repartitioning to ensure consistent behavior across +/// executions and runs. +pub const REPARTITION_RANDOM_STATE: ahash::RandomState = + ahash::RandomState::with_seeds(0, 0, 0, 0); + impl BatchPartitioner { /// Create a new [`BatchPartitioner`] with the provided [`Partitioning`] /// @@ -433,8 +437,6 @@ impl BatchPartitioner { Partitioning::Hash(exprs, num_partitions) => BatchPartitionerState::Hash { exprs, num_partitions, - // Use fixed random hash - random_state: ahash::RandomState::with_seeds(0, 0, 0, 0), hash_buffer: vec![], }, other => return not_impl_err!("Unsupported repartitioning scheme {other:?}"), @@ -482,7 +484,6 @@ impl BatchPartitioner { Box::new(std::iter::once(Ok((idx, batch)))) } BatchPartitionerState::Hash { - random_state, exprs, num_partitions: partitions, hash_buffer, @@ -496,7 +497,7 @@ impl BatchPartitioner { hash_buffer.clear(); hash_buffer.resize(batch.num_rows(), 0); - create_hashes(&arrays, random_state, hash_buffer)?; + create_hashes(&arrays, &REPARTITION_RANDOM_STATE, hash_buffer)?; let mut indices: Vec<_> = (0..*partitions) .map(|_| Vec::with_capacity(batch.num_rows())) From e49840fed7b91c9084b1714b14ccb833c644caeb Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Tue, 9 Dec 2025 10:54:45 -0600 Subject: [PATCH 4/6] Push down InList or hash table references from HashJoinExec depending on the size of the build side (#18393) This PR is part of an EPIC to push down hash table references from HashJoinExec into scans. The EPIC is tracked in https://github.com/apache/datafusion/issues/17171. A "target state" is tracked in https://github.com/apache/datafusion/pull/18393 (*this PR*). 
There is a series of PRs to get us to this target state in smaller more reviewable changes that are still valuable on their own: - https://github.com/apache/datafusion/pull/18448 - https://github.com/apache/datafusion/pull/18449 (depends on https://github.com/apache/datafusion/pull/18448) - https://github.com/apache/datafusion/pull/18451 As those are merged I will rebase this PR to keep track of the "remaining work", and we can use this PR to explore big picture ideas or benchmarks of the final state. (cherry picked from commit c0e8bb501a7b62bb40c87edf16d46ee53bdaff73) --- Cargo.lock | 1 + datafusion/common/src/config.rs | 30 ++ .../physical_optimizer/filter_pushdown/mod.rs | 456 +++++++++++++++++- .../physical-expr/src/expressions/in_list.rs | 8 + datafusion/physical-plan/Cargo.toml | 1 + .../physical-plan/src/joins/hash_join/exec.rs | 63 ++- .../src/joins/hash_join/inlist_builder.rs | 133 +++++ .../physical-plan/src/joins/hash_join/mod.rs | 1 + .../joins/hash_join/partitioned_hash_eval.rs | 140 +++++- .../src/joins/hash_join/shared_bounds.rs | 380 +++++++++++---- .../src/joins/hash_join/stream.rs | 23 +- .../physical-plan/src/joins/join_hash_map.rs | 11 + .../src/joins/stream_join_utils.rs | 4 + datafusion/sqllogictest/test_files/array.slt | 35 +- .../test_files/information_schema.slt | 4 + datafusion/sqllogictest/test_files/joins.slt | 63 +++ docs/source/user-guide/configs.md | 238 ++++----- 17 files changed, 1333 insertions(+), 258 deletions(-) create mode 100644 datafusion/physical-plan/src/joins/hash_join/inlist_builder.rs diff --git a/Cargo.lock b/Cargo.lock index 3780a8be542d2..832ec5d5b4813 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2551,6 +2551,7 @@ dependencies = [ "datafusion-common-runtime", "datafusion-execution", "datafusion-expr", + "datafusion-functions", "datafusion-functions-aggregate", "datafusion-functions-aggregate-common", "datafusion-functions-window", diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 0ed499da04757..4fd794a517562 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -971,6 +971,36 @@ config_namespace! { /// will be collected into a single partition pub hash_join_single_partition_threshold_rows: usize, default = 1024 * 128 + /// Maximum size in bytes for the build side of a hash join to be pushed down as an InList expression for dynamic filtering. + /// Build sides larger than this will use hash table lookups instead. + /// Set to 0 to always use hash table lookups. + /// + /// InList pushdown can be more efficient for small build sides because it can result in better + /// statistics pruning as well as use any bloom filters present on the scan side. + /// InList expressions are also more transparent and easier to serialize over the network in distributed uses of DataFusion. + /// On the other hand InList pushdown requires making a copy of the data and thus adds some overhead to the build side and uses more memory. + /// + /// This setting is per-partition, so we may end up using `hash_join_inlist_pushdown_max_size` * `target_partitions` memory. + /// + /// The default is 128kB per partition. + /// This should allow point lookup joins (e.g. joining on a unique primary key) to use InList pushdown in most cases + /// but avoids excessive memory usage or overhead for larger joins. 
+ pub hash_join_inlist_pushdown_max_size: usize, default = 128 * 1024 + + /// Maximum number of distinct values (rows) in the build side of a hash join to be pushed down as an InList expression for dynamic filtering. + /// Build sides with more rows than this will use hash table lookups instead. + /// Set to 0 to always use hash table lookups. + /// + /// This provides an additional limit beyond `hash_join_inlist_pushdown_max_size` to prevent + /// very large IN lists that might not provide much benefit over hash table lookups. + /// + /// This uses the deduplicated row count once the build side has been evaluated. + /// + /// The default is 150 values per partition. + /// This is inspired by Trino's `max-filter-keys-per-column` setting. + /// See: + pub hash_join_inlist_pushdown_max_distinct_values: usize, default = 150 + /// The default filter selectivity used by Filter Statistics /// when an exact selectivity cannot be determined. Valid values are /// between 0 (no selectivity) and 100 (all rows are selected). diff --git a/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs b/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs index fcc02988781c3..880c330690540 100644 --- a/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs +++ b/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs @@ -272,13 +272,14 @@ async fn test_dynamic_filter_pushdown_through_hash_join_with_topk() { stream.next().await.unwrap().unwrap(); // Test that filters are pushed down correctly to each side of the join + // NOTE: We dropped the CASE expression here because we now optimize that away if there's only 1 partition insta::assert_snapshot!( format_plan_for_test(&plan), @r" - SortExec: TopK(fetch=2), expr=[e@4 ASC], preserve_partitioning=[false], filter=[e@4 IS NULL OR e@4 < bb] - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, d@0)] - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, e, f], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ CASE hash_repartition % 1 WHEN 0 THEN d@0 >= aa AND d@0 <= ab ELSE false END ] AND DynamicFilter [ e@1 IS NULL OR e@1 < bb ] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, e, f], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ d@0 >= aa AND d@0 <= ab AND d@0 IN (SET) ([aa, ab]) ] AND DynamicFilter [ e@1 IS NULL OR e@1 < bb ] " ); } @@ -1077,7 +1078,7 @@ async fn test_hashjoin_dynamic_filter_pushdown() { @r" - HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)] - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 >= aa AND a@0 <= ab AND b@1 >= ba AND b@1 <= bb ] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 >= aa AND a@0 <= ab AND b@1 >= ba AND b@1 <= bb AND struct(a@0, b@1) IN (SET) ([{c0:aa,c1:ba}, {c0:ab,c1:bb}]) ] " ); } @@ -1308,10 +1309,14 @@ async fn test_hashjoin_dynamic_filter_pushdown_partitioned() { - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - CoalesceBatchesExec: 
target_batch_size=8192 - RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1 - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ CASE hash_repartition % 12 WHEN 2 THEN a@0 >= ab AND a@0 <= ab AND b@1 >= bb AND b@1 <= bb WHEN 4 THEN a@0 >= aa AND a@0 <= aa AND b@1 >= ba AND b@1 <= ba ELSE false END ] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ CASE hash_repartition % 12 WHEN 2 THEN a@0 >= ab AND a@0 <= ab AND b@1 >= bb AND b@1 <= bb AND struct(a@0, b@1) IN (SET) ([{c0:ab,c1:bb}]) WHEN 4 THEN a@0 >= aa AND a@0 <= aa AND b@1 >= ba AND b@1 <= ba AND struct(a@0, b@1) IN (SET) ([{c0:aa,c1:ba}]) ELSE false END ] " ); + // When hash collisions force all data into a single partition, we optimize away the CASE expression. + // This avoids calling create_hashes() for every row on the probe side, since hash % 1 == 0 always, + // meaning the WHEN 0 branch would always match. This optimization is also important for primary key + // joins or any scenario where all build-side data naturally lands in one partition. #[cfg(feature = "force_hash_collisions")] insta::assert_snapshot!( format!("{}", format_plan_for_test(&plan)), @@ -1325,7 +1330,7 @@ async fn test_hashjoin_dynamic_filter_pushdown_partitioned() { - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1 - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ CASE hash_repartition % 12 WHEN 0 THEN a@0 >= aa AND a@0 <= ab AND b@1 >= ba AND b@1 <= bb ELSE false END ] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 >= aa AND a@0 <= ab AND b@1 >= ba AND b@1 <= bb AND struct(a@0, b@1) IN (SET) ([{c0:aa,c1:ba}, {c0:ab,c1:bb}]) ] " ); @@ -1502,7 +1507,7 @@ async fn test_hashjoin_dynamic_filter_pushdown_collect_left() { - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1 - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 >= aa AND a@0 <= ab AND b@1 >= ba AND b@1 <= bb ] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 >= aa AND a@0 <= ab AND b@1 >= ba AND b@1 <= bb AND struct(a@0, b@1) IN (SET) ([{c0:aa,c1:ba}, {c0:ab,c1:bb}]) ] " ); @@ -1670,8 +1675,8 @@ async fn test_nested_hashjoin_dynamic_filter_pushdown() { - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, b@0)] - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, x], file_type=test, pushdown_supported=true - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c@1, d@0)] - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[b, c, y], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ CASE hash_repartition % 1 WHEN 0 THEN b@0 >= aa AND b@0 <= ab ELSE 
false END ] - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, z], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ CASE hash_repartition % 1 WHEN 0 THEN d@0 >= ca AND d@0 <= cb ELSE false END ] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[b, c, y], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ b@0 >= aa AND b@0 <= ab AND b@0 IN (SET) ([aa, ab]) ] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, z], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ d@0 >= ca AND d@0 <= cb AND d@0 IN (SET) ([ca, cb]) ] " ); } @@ -2598,7 +2603,7 @@ async fn test_hashjoin_dynamic_filter_with_nulls() { - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)] - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b], file_type=test, pushdown_supported=true - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 >= aa AND a@0 <= ab AND b@1 >= 1 AND b@1 <= 2 ] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 >= aa AND a@0 <= ab AND b@1 >= 1 AND b@1 <= 2 AND struct(a@0, b@1) IN (SET) ([{c0:aa,c1:1}, {c0:,c1:2}, {c0:ab,c1:}]) ] " ); @@ -2612,3 +2617,438 @@ async fn test_hashjoin_dynamic_filter_with_nulls() { ]; assert_batches_eq!(&expected, &batches); } + +/// Test that when hash_join_inlist_pushdown_max_size is set to a very small value, +/// the HashTable strategy is used instead of InList strategy, even with small build sides. +/// This test is identical to test_hashjoin_dynamic_filter_pushdown_partitioned except +/// for the config setting that forces the HashTable strategy. 
+#[tokio::test] +async fn test_hashjoin_hash_table_pushdown_partitioned() { + use datafusion_common::JoinType; + use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode}; + + // Create build side with limited values + let build_batches = vec![record_batch!( + ("a", Utf8, ["aa", "ab"]), + ("b", Utf8, ["ba", "bb"]), + ("c", Float64, [1.0, 2.0]) // Extra column not used in join + ) + .unwrap()]; + let build_side_schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Utf8, false), + Field::new("b", DataType::Utf8, false), + Field::new("c", DataType::Float64, false), + ])); + let build_scan = TestScanBuilder::new(Arc::clone(&build_side_schema)) + .with_support(true) + .with_batches(build_batches) + .build(); + + // Create probe side with more values + let probe_batches = vec![record_batch!( + ("a", Utf8, ["aa", "ab", "ac", "ad"]), + ("b", Utf8, ["ba", "bb", "bc", "bd"]), + ("e", Float64, [1.0, 2.0, 3.0, 4.0]) // Extra column not used in join + ) + .unwrap()]; + let probe_side_schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Utf8, false), + Field::new("b", DataType::Utf8, false), + Field::new("e", DataType::Float64, false), + ])); + let probe_scan = TestScanBuilder::new(Arc::clone(&probe_side_schema)) + .with_support(true) + .with_batches(probe_batches) + .build(); + + // Create RepartitionExec nodes for both sides with hash partitioning on join keys + let partition_count = 12; + + // Build side: DataSource -> RepartitionExec (Hash) -> CoalesceBatchesExec + let build_hash_exprs = vec![ + col("a", &build_side_schema).unwrap(), + col("b", &build_side_schema).unwrap(), + ]; + let build_repartition = Arc::new( + RepartitionExec::try_new( + build_scan, + Partitioning::Hash(build_hash_exprs, partition_count), + ) + .unwrap(), + ); + let build_coalesce = Arc::new(CoalesceBatchesExec::new(build_repartition, 8192)); + + // Probe side: DataSource -> RepartitionExec (Hash) -> CoalesceBatchesExec + let probe_hash_exprs = vec![ + col("a", &probe_side_schema).unwrap(), + col("b", &probe_side_schema).unwrap(), + ]; + let probe_repartition = Arc::new( + RepartitionExec::try_new( + Arc::clone(&probe_scan), + Partitioning::Hash(probe_hash_exprs, partition_count), + ) + .unwrap(), + ); + let probe_coalesce = Arc::new(CoalesceBatchesExec::new(probe_repartition, 8192)); + + // Create HashJoinExec with partitioned inputs + let on = vec![ + ( + col("a", &build_side_schema).unwrap(), + col("a", &probe_side_schema).unwrap(), + ), + ( + col("b", &build_side_schema).unwrap(), + col("b", &probe_side_schema).unwrap(), + ), + ]; + let hash_join = Arc::new( + HashJoinExec::try_new( + build_coalesce, + probe_coalesce, + on, + None, + &JoinType::Inner, + None, + PartitionMode::Partitioned, + datafusion_common::NullEquality::NullEqualsNothing, + ) + .unwrap(), + ); + + // Top-level CoalesceBatchesExec + let cb = + Arc::new(CoalesceBatchesExec::new(hash_join, 8192)) as Arc; + // Top-level CoalescePartitionsExec + let cp = Arc::new(CoalescePartitionsExec::new(cb)) as Arc; + // Add a sort for deterministic output + let plan = Arc::new(SortExec::new( + LexOrdering::new(vec![PhysicalSortExpr::new( + col("a", &probe_side_schema).unwrap(), + SortOptions::new(true, false), // descending, nulls_first + )]) + .unwrap(), + cp, + )) as Arc; + + // Apply the optimization with config setting that forces HashTable strategy + let session_config = SessionConfig::default() + .with_batch_size(10) + .set_usize("datafusion.optimizer.hash_join_inlist_pushdown_max_size", 1) + 
.set_bool("datafusion.execution.parquet.pushdown_filters", true) + .set_bool("datafusion.optimizer.enable_dynamic_filter_pushdown", true); + let plan = FilterPushdown::new_post_optimization() + .optimize(plan, session_config.options()) + .unwrap(); + let session_ctx = SessionContext::new_with_config(session_config); + session_ctx.register_object_store( + ObjectStoreUrl::parse("test://").unwrap().as_ref(), + Arc::new(InMemory::new()), + ); + let state = session_ctx.state(); + let task_ctx = state.task_ctx(); + let batches = collect(Arc::clone(&plan), Arc::clone(&task_ctx)) + .await + .unwrap(); + + // Verify that hash_lookup is used instead of IN (SET) + let plan_str = format_plan_for_test(&plan).to_string(); + assert!( + plan_str.contains("hash_lookup"), + "Expected hash_lookup in plan but got: {plan_str}" + ); + assert!( + !plan_str.contains("IN (SET)"), + "Expected no IN (SET) in plan but got: {plan_str}" + ); + + let result = format!("{}", pretty_format_batches(&batches).unwrap()); + + let probe_scan_metrics = probe_scan.metrics().unwrap(); + + // The probe side had 4 rows, but after applying the dynamic filter only 2 rows should remain. + assert_eq!(probe_scan_metrics.output_rows().unwrap(), 2); + + // Results should be identical to the InList version + insta::assert_snapshot!( + result, + @r" + +----+----+-----+----+----+-----+ + | a | b | c | a | b | e | + +----+----+-----+----+----+-----+ + | ab | bb | 2.0 | ab | bb | 2.0 | + | aa | ba | 1.0 | aa | ba | 1.0 | + +----+----+-----+----+----+-----+ + ", + ); +} + +/// Test that when hash_join_inlist_pushdown_max_size is set to a very small value, +/// the HashTable strategy is used instead of InList strategy in CollectLeft mode. +/// This test is identical to test_hashjoin_dynamic_filter_pushdown_collect_left except +/// for the config setting that forces the HashTable strategy. 
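+///
+/// With the HashTable strategy the pushed-down dynamic filter combines the per-column
+/// bounds checks with a `hash_lookup` membership predicate rather than an `IN (SET) (...)`
+/// list, which is what the assertions below verify.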
+#[tokio::test] +async fn test_hashjoin_hash_table_pushdown_collect_left() { + use datafusion_common::JoinType; + use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode}; + + let build_batches = vec![record_batch!( + ("a", Utf8, ["aa", "ab"]), + ("b", Utf8, ["ba", "bb"]), + ("c", Float64, [1.0, 2.0]) // Extra column not used in join + ) + .unwrap()]; + let build_side_schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Utf8, false), + Field::new("b", DataType::Utf8, false), + Field::new("c", DataType::Float64, false), + ])); + let build_scan = TestScanBuilder::new(Arc::clone(&build_side_schema)) + .with_support(true) + .with_batches(build_batches) + .build(); + + // Create probe side with more values + let probe_batches = vec![record_batch!( + ("a", Utf8, ["aa", "ab", "ac", "ad"]), + ("b", Utf8, ["ba", "bb", "bc", "bd"]), + ("e", Float64, [1.0, 2.0, 3.0, 4.0]) // Extra column not used in join + ) + .unwrap()]; + let probe_side_schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Utf8, false), + Field::new("b", DataType::Utf8, false), + Field::new("e", DataType::Float64, false), + ])); + let probe_scan = TestScanBuilder::new(Arc::clone(&probe_side_schema)) + .with_support(true) + .with_batches(probe_batches) + .build(); + + // Create RepartitionExec nodes for both sides with hash partitioning on join keys + let partition_count = 12; + + // Probe side: DataSource -> RepartitionExec(Hash) -> CoalesceBatchesExec + let probe_hash_exprs = vec![ + col("a", &probe_side_schema).unwrap(), + col("b", &probe_side_schema).unwrap(), + ]; + let probe_repartition = Arc::new( + RepartitionExec::try_new( + Arc::clone(&probe_scan), + Partitioning::Hash(probe_hash_exprs, partition_count), // create multi partitions on probSide + ) + .unwrap(), + ); + let probe_coalesce = Arc::new(CoalesceBatchesExec::new(probe_repartition, 8192)); + + let on = vec![ + ( + col("a", &build_side_schema).unwrap(), + col("a", &probe_side_schema).unwrap(), + ), + ( + col("b", &build_side_schema).unwrap(), + col("b", &probe_side_schema).unwrap(), + ), + ]; + let hash_join = Arc::new( + HashJoinExec::try_new( + build_scan, + probe_coalesce, + on, + None, + &JoinType::Inner, + None, + PartitionMode::CollectLeft, + datafusion_common::NullEquality::NullEqualsNothing, + ) + .unwrap(), + ); + + // Top-level CoalesceBatchesExec + let cb = + Arc::new(CoalesceBatchesExec::new(hash_join, 8192)) as Arc; + // Top-level CoalescePartitionsExec + let cp = Arc::new(CoalescePartitionsExec::new(cb)) as Arc; + // Add a sort for deterministic output + let plan = Arc::new(SortExec::new( + LexOrdering::new(vec![PhysicalSortExpr::new( + col("a", &probe_side_schema).unwrap(), + SortOptions::new(true, false), // descending, nulls_first + )]) + .unwrap(), + cp, + )) as Arc; + + // Apply the optimization with config setting that forces HashTable strategy + let session_config = SessionConfig::default() + .with_batch_size(10) + .set_usize("datafusion.optimizer.hash_join_inlist_pushdown_max_size", 1) + .set_bool("datafusion.execution.parquet.pushdown_filters", true) + .set_bool("datafusion.optimizer.enable_dynamic_filter_pushdown", true); + let plan = FilterPushdown::new_post_optimization() + .optimize(plan, session_config.options()) + .unwrap(); + let session_ctx = SessionContext::new_with_config(session_config); + session_ctx.register_object_store( + ObjectStoreUrl::parse("test://").unwrap().as_ref(), + Arc::new(InMemory::new()), + ); + let state = session_ctx.state(); + let task_ctx = state.task_ctx(); + let batches = 
collect(Arc::clone(&plan), Arc::clone(&task_ctx)) + .await + .unwrap(); + + // Verify that hash_lookup is used instead of IN (SET) + let plan_str = format_plan_for_test(&plan).to_string(); + assert!( + plan_str.contains("hash_lookup"), + "Expected hash_lookup in plan but got: {plan_str}" + ); + assert!( + !plan_str.contains("IN (SET)"), + "Expected no IN (SET) in plan but got: {plan_str}" + ); + + let result = format!("{}", pretty_format_batches(&batches).unwrap()); + + let probe_scan_metrics = probe_scan.metrics().unwrap(); + + // The probe side had 4 rows, but after applying the dynamic filter only 2 rows should remain. + assert_eq!(probe_scan_metrics.output_rows().unwrap(), 2); + + // Results should be identical to the InList version + insta::assert_snapshot!( + result, + @r" + +----+----+-----+----+----+-----+ + | a | b | c | a | b | e | + +----+----+-----+----+----+-----+ + | ab | bb | 2.0 | ab | bb | 2.0 | + | aa | ba | 1.0 | aa | ba | 1.0 | + +----+----+-----+----+----+-----+ + ", + ); +} + +/// Test HashTable strategy with integer multi-column join keys. +/// Verifies that hash_lookup works correctly with integer data types. +#[tokio::test] +async fn test_hashjoin_hash_table_pushdown_integer_keys() { + use datafusion_common::JoinType; + use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode}; + + // Create build side with integer keys + let build_batches = vec![record_batch!( + ("id1", Int32, [1, 2]), + ("id2", Int32, [10, 20]), + ("value", Float64, [100.0, 200.0]) + ) + .unwrap()]; + let build_side_schema = Arc::new(Schema::new(vec![ + Field::new("id1", DataType::Int32, false), + Field::new("id2", DataType::Int32, false), + Field::new("value", DataType::Float64, false), + ])); + let build_scan = TestScanBuilder::new(Arc::clone(&build_side_schema)) + .with_support(true) + .with_batches(build_batches) + .build(); + + // Create probe side with more integer rows + let probe_batches = vec![record_batch!( + ("id1", Int32, [1, 2, 3, 4]), + ("id2", Int32, [10, 20, 30, 40]), + ("data", Utf8, ["a", "b", "c", "d"]) + ) + .unwrap()]; + let probe_side_schema = Arc::new(Schema::new(vec![ + Field::new("id1", DataType::Int32, false), + Field::new("id2", DataType::Int32, false), + Field::new("data", DataType::Utf8, false), + ])); + let probe_scan = TestScanBuilder::new(Arc::clone(&probe_side_schema)) + .with_support(true) + .with_batches(probe_batches) + .build(); + + // Create join on multiple integer columns + let on = vec![ + ( + col("id1", &build_side_schema).unwrap(), + col("id1", &probe_side_schema).unwrap(), + ), + ( + col("id2", &build_side_schema).unwrap(), + col("id2", &probe_side_schema).unwrap(), + ), + ]; + let hash_join = Arc::new( + HashJoinExec::try_new( + build_scan, + Arc::clone(&probe_scan), + on, + None, + &JoinType::Inner, + None, + PartitionMode::CollectLeft, + datafusion_common::NullEquality::NullEqualsNothing, + ) + .unwrap(), + ); + + let plan = + Arc::new(CoalesceBatchesExec::new(hash_join, 8192)) as Arc; + + // Apply optimization with forced HashTable strategy + let session_config = SessionConfig::default() + .with_batch_size(10) + .set_usize("datafusion.optimizer.hash_join_inlist_pushdown_max_size", 1) + .set_bool("datafusion.execution.parquet.pushdown_filters", true) + .set_bool("datafusion.optimizer.enable_dynamic_filter_pushdown", true); + let plan = FilterPushdown::new_post_optimization() + .optimize(plan, session_config.options()) + .unwrap(); + let session_ctx = SessionContext::new_with_config(session_config); + session_ctx.register_object_store( + 
ObjectStoreUrl::parse("test://").unwrap().as_ref(), + Arc::new(InMemory::new()), + ); + let state = session_ctx.state(); + let task_ctx = state.task_ctx(); + let batches = collect(Arc::clone(&plan), Arc::clone(&task_ctx)) + .await + .unwrap(); + + // Verify hash_lookup is used + let plan_str = format_plan_for_test(&plan).to_string(); + assert!( + plan_str.contains("hash_lookup"), + "Expected hash_lookup in plan but got: {plan_str}" + ); + assert!( + !plan_str.contains("IN (SET)"), + "Expected no IN (SET) in plan but got: {plan_str}" + ); + + let result = format!("{}", pretty_format_batches(&batches).unwrap()); + + let probe_scan_metrics = probe_scan.metrics().unwrap(); + // Only 2 rows from probe side match the build side + assert_eq!(probe_scan_metrics.output_rows().unwrap(), 2); + + insta::assert_snapshot!( + result, + @r" + +-----+-----+-------+-----+-----+------+ + | id1 | id2 | value | id1 | id2 | data | + +-----+-----+-------+-----+-----+------+ + | 1 | 10 | 100.0 | 1 | 10 | a | + | 2 | 20 | 200.0 | 2 | 20 | b | + +-----+-----+-------+-----+-----+------+ + ", + ); +} diff --git a/datafusion/physical-expr/src/expressions/in_list.rs b/datafusion/physical-expr/src/expressions/in_list.rs index 10197f1e97b28..7b258ddf7435b 100644 --- a/datafusion/physical-expr/src/expressions/in_list.rs +++ b/datafusion/physical-expr/src/expressions/in_list.rs @@ -320,6 +320,14 @@ impl InListExpr { &self.list } + pub fn is_empty(&self) -> bool { + self.list.is_empty() + } + + pub fn len(&self) -> usize { + self.list.len() + } + /// Is this negated e.g. NOT IN LIST pub fn negated(&self) -> bool { self.negated diff --git a/datafusion/physical-plan/Cargo.toml b/datafusion/physical-plan/Cargo.toml index 5858deb83c83c..4049063d34850 100644 --- a/datafusion/physical-plan/Cargo.toml +++ b/datafusion/physical-plan/Cargo.toml @@ -56,6 +56,7 @@ datafusion-common = { workspace = true } datafusion-common-runtime = { workspace = true, default-features = true } datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } +datafusion-functions = { workspace = true } datafusion-functions-aggregate-common = { workspace = true } datafusion-functions-window-common = { workspace = true } datafusion-physical-expr = { workspace = true, default-features = true } diff --git a/datafusion/physical-plan/src/joins/hash_join/exec.rs b/datafusion/physical-plan/src/joins/hash_join/exec.rs index c717f262a5121..5fb09490bfd33 100644 --- a/datafusion/physical-plan/src/joins/hash_join/exec.rs +++ b/datafusion/physical-plan/src/joins/hash_join/exec.rs @@ -26,8 +26,9 @@ use crate::filter_pushdown::{ ChildPushdownResult, FilterDescription, FilterPushdownPhase, FilterPushdownPropagation, }; +use crate::joins::hash_join::inlist_builder::build_struct_inlist_values; use crate::joins::hash_join::shared_bounds::{ - ColumnBounds, PartitionBounds, SharedBuildAccumulator, + ColumnBounds, PartitionBounds, PushdownStrategy, SharedBuildAccumulator, }; use crate::joins::hash_join::stream::{ BuildSide, BuildSideInitialState, HashJoinStream, HashJoinStreamState, @@ -85,7 +86,7 @@ use futures::TryStreamExt; use parking_lot::Mutex; /// Hard-coded seed to ensure hash values from the hash join differ from `RepartitionExec`, avoiding collisions. 
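+/// It is `pub(crate)` so that `shared_bounds::create_membership_predicate` can build
+/// probe-side `HashExpr` lookups that hash with the same seed as the build-side hash table.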
-const HASH_JOIN_SEED: RandomState = +pub(crate) const HASH_JOIN_SEED: RandomState = RandomState::with_seeds('J' as u64, 'O' as u64, 'I' as u64, 'N' as u64); /// HashTable and input data for the left (build side) of a join @@ -111,6 +112,9 @@ pub(super) struct JoinLeftData { /// If the partition is empty (no rows) this will be None. /// If the partition has some rows this will be Some with the bounds for each join key column. pub(super) bounds: Option, + /// Membership testing strategy for filter pushdown + /// Contains either InList values for small build sides or hash table reference for large build sides + pub(super) membership: PushdownStrategy, } impl JoinLeftData { @@ -134,6 +138,11 @@ impl JoinLeftData { &self.visited_indices_bitmap } + /// returns a reference to the InList values for filter pushdown + pub(super) fn membership(&self) -> &PushdownStrategy { + &self.membership + } + /// Decrements the counter of running threads, and returns `true` /// if caller is the last running thread pub(super) fn report_probe_completed(&self) -> bool { @@ -931,6 +940,16 @@ impl ExecutionPlan for HashJoinExec { need_produce_result_in_final(self.join_type), self.right().output_partitioning().partition_count(), enable_dynamic_filter_pushdown, + context + .session_config() + .options() + .optimizer + .hash_join_inlist_pushdown_max_size, + context + .session_config() + .options() + .optimizer + .hash_join_inlist_pushdown_max_distinct_values, )) })?, PartitionMode::Partitioned => { @@ -949,6 +968,16 @@ impl ExecutionPlan for HashJoinExec { need_produce_result_in_final(self.join_type), 1, enable_dynamic_filter_pushdown, + context + .session_config() + .options() + .optimizer + .hash_join_inlist_pushdown_max_size, + context + .session_config() + .options() + .optimizer + .hash_join_inlist_pushdown_max_distinct_values, )) } PartitionMode::Auto => { @@ -1349,6 +1378,8 @@ async fn collect_left_input( with_visited_indices_bitmap: bool, probe_threads_count: usize, should_compute_dynamic_filters: bool, + max_inlist_size: usize, + max_inlist_distinct_values: usize, ) -> Result { let schema = left_stream.schema(); @@ -1472,6 +1503,29 @@ async fn collect_left_input( // Convert Box to Arc for sharing with SharedBuildAccumulator let hash_map: Arc = hashmap.into(); + let membership = if num_rows == 0 { + PushdownStrategy::Empty + } else { + // If the build side is small enough we can use IN list pushdown. + // If it's too big we fall back to pushing down a reference to the hash table. + // See `PushdownStrategy` for more details. + let estimated_size = left_values + .iter() + .map(|arr| arr.get_array_memory_size()) + .sum::(); + if left_values.is_empty() + || left_values[0].is_empty() + || estimated_size > max_inlist_size + || hash_map.len() > max_inlist_distinct_values + { + PushdownStrategy::HashTable(Arc::clone(&hash_map)) + } else if let Some(in_list_values) = build_struct_inlist_values(&left_values)? 
{ + PushdownStrategy::InList(in_list_values) + } else { + PushdownStrategy::HashTable(Arc::clone(&hash_map)) + } + }; + let data = JoinLeftData { hash_map, batch, @@ -1480,6 +1534,7 @@ async fn collect_left_input( probe_threads_counter: AtomicUsize::new(probe_threads_count), _reservation: reservation, bounds, + membership, }; Ok(data) @@ -4525,7 +4580,7 @@ mod tests { )?; join.dynamic_filter = Some(HashJoinExecDynamicFilter { filter: dynamic_filter, - bounds_accumulator: OnceLock::new(), + build_accumulator: OnceLock::new(), }); // Execute the join @@ -4573,7 +4628,7 @@ mod tests { )?; join.dynamic_filter = Some(HashJoinExecDynamicFilter { filter: dynamic_filter, - bounds_accumulator: OnceLock::new(), + build_accumulator: OnceLock::new(), }); // Execute the join diff --git a/datafusion/physical-plan/src/joins/hash_join/inlist_builder.rs b/datafusion/physical-plan/src/joins/hash_join/inlist_builder.rs new file mode 100644 index 0000000000000..7dccc5b0ba7c2 --- /dev/null +++ b/datafusion/physical-plan/src/joins/hash_join/inlist_builder.rs @@ -0,0 +1,133 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Utilities for building InList expressions from hash join build side data + +use std::sync::Arc; + +use arrow::array::{ArrayRef, StructArray}; +use arrow::datatypes::{Field, FieldRef, Fields}; +use arrow::downcast_dictionary_array; +use arrow_schema::DataType; +use datafusion_common::Result; + +pub(super) fn build_struct_fields(data_types: &[DataType]) -> Result { + data_types + .iter() + .enumerate() + .map(|(i, dt)| Ok(Field::new(format!("c{i}"), dt.clone(), true))) + .collect() +} + +/// Flattens dictionary-encoded arrays to their underlying value arrays. +/// Non-dictionary arrays are returned as-is. +fn flatten_dictionary_array(array: &ArrayRef) -> ArrayRef { + downcast_dictionary_array! { + array => { + // Recursively flatten in case of nested dictionaries + flatten_dictionary_array(array.values()) + } + _ => Arc::clone(array) + } +} + +/// Builds InList values from join key column arrays. +/// +/// If `join_key_arrays` is: +/// 1. A single array, let's say Int32, this will produce a flat +/// InList expression where the lookup is expected to be scalar Int32 values, +/// that is: this will produce `IN LIST (1, 2, 3)` expected to be used as `2 IN LIST (1, 2, 3)`. +/// 2. An Int32 array and a Utf8 array, this will produce a Struct InList expression +/// where the lookup is expected to be Struct values with two fields (Int32, Utf8), +/// that is: this will produce `IN LIST ((1, "a"), (2, "b"))` expected to be used as `(2, "b") IN LIST ((1, "a"), (2, "b"))`. +/// The field names of the struct are auto-generated as "c0", "c1", ... and should match the struct expression used in the join keys. 
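+///
+/// For example (a rough sketch with illustrative values), the multi-column case packs the
+/// join key columns row-wise into a single struct array:
+///
+/// ```text
+/// join_key_arrays: [Int32: [1, 2], Utf8: ["a", "b"]]
+/// result:          Struct{c0: Int32, c1: Utf8} = [{c0: 1, c1: "a"}, {c0: 2, c1: "b"}]
+/// ```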
+/// +/// Note that this function does not deduplicate values - deduplication will happen later +/// when building an InList expression from this array via `InListExpr::try_new_from_array`. +/// +/// Returns `None` if the estimated size exceeds `max_size_bytes` or if the number of rows +/// exceeds `max_distinct_values`. +pub(super) fn build_struct_inlist_values( + join_key_arrays: &[ArrayRef], +) -> Result> { + // Flatten any dictionary-encoded arrays + let flattened_arrays: Vec = join_key_arrays + .iter() + .map(flatten_dictionary_array) + .collect(); + + // Build the source array/struct + let source_array: ArrayRef = if flattened_arrays.len() == 1 { + // Single column: use directly + Arc::clone(&flattened_arrays[0]) + } else { + // Multi-column: build StructArray once from all columns + let fields = build_struct_fields( + &flattened_arrays + .iter() + .map(|arr| arr.data_type().clone()) + .collect::>(), + )?; + + // Build field references with proper Arc wrapping + let arrays_with_fields: Vec<(FieldRef, ArrayRef)> = fields + .iter() + .cloned() + .zip(flattened_arrays.iter().cloned()) + .collect(); + + Arc::new(StructArray::from(arrays_with_fields)) + }; + + Ok(Some(source_array)) +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow::array::{Int32Array, StringArray}; + use arrow_schema::DataType; + use std::sync::Arc; + + #[test] + fn test_build_single_column_inlist_array() { + let array = Arc::new(Int32Array::from(vec![1, 2, 3, 2, 1])) as ArrayRef; + let result = build_struct_inlist_values(std::slice::from_ref(&array)) + .unwrap() + .unwrap(); + + assert!(array.eq(&result)); + } + + #[test] + fn test_build_multi_column_inlist() { + let array1 = Arc::new(Int32Array::from(vec![1, 2, 3, 2, 1])) as ArrayRef; + let array2 = + Arc::new(StringArray::from(vec!["a", "b", "c", "b", "a"])) as ArrayRef; + + let result = build_struct_inlist_values(&[array1, array2]) + .unwrap() + .unwrap(); + + assert_eq!( + *result.data_type(), + DataType::Struct( + build_struct_fields(&[DataType::Int32, DataType::Utf8]).unwrap() + ) + ); + } +} diff --git a/datafusion/physical-plan/src/joins/hash_join/mod.rs b/datafusion/physical-plan/src/joins/hash_join/mod.rs index 6c073e7a9cff5..ac1c54f4f6034 100644 --- a/datafusion/physical-plan/src/joins/hash_join/mod.rs +++ b/datafusion/physical-plan/src/joins/hash_join/mod.rs @@ -20,6 +20,7 @@ pub use exec::HashJoinExec; mod exec; +mod inlist_builder; mod partitioned_hash_eval; mod shared_bounds; mod stream; diff --git a/datafusion/physical-plan/src/joins/hash_join/partitioned_hash_eval.rs b/datafusion/physical-plan/src/joins/hash_join/partitioned_hash_eval.rs index 527642ade07e1..9b0ae2ab47a42 100644 --- a/datafusion/physical-plan/src/joins/hash_join/partitioned_hash_eval.rs +++ b/datafusion/physical-plan/src/joins/hash_join/partitioned_hash_eval.rs @@ -21,16 +21,18 @@ use std::{any::Any, fmt::Display, hash::Hash, sync::Arc}; use ahash::RandomState; use arrow::{ - array::UInt64Array, + array::{BooleanArray, UInt64Array}, + buffer::MutableBuffer, datatypes::{DataType, Schema}, + util::bit_util, }; -use datafusion_common::Result; +use datafusion_common::{internal_datafusion_err, internal_err, Result}; use datafusion_expr::ColumnarValue; use datafusion_physical_expr_common::physical_expr::{ DynHash, PhysicalExpr, PhysicalExprRef, }; -use crate::hash_utils::create_hashes; +use crate::{hash_utils::create_hashes, joins::utils::JoinHashMapType}; /// Physical expression that computes hash values for a set of columns /// @@ -156,3 +158,135 @@ impl PhysicalExpr for HashExpr 
{ write!(f, "{}", self.description) } } + +/// Physical expression that checks if hash values exist in a hash table +/// +/// Takes a UInt64Array of hash values and checks membership in a hash table. +/// Returns a BooleanArray indicating which hashes exist. +pub struct HashTableLookupExpr { + /// Expression that computes hash values (should be a HashExpr) + hash_expr: PhysicalExprRef, + /// Hash table to check against + hash_map: Arc, + /// Description for display + description: String, +} + +impl HashTableLookupExpr { + /// Create a new HashTableLookupExpr + /// + /// # Arguments + /// * `hash_expr` - Expression that computes hash values + /// * `hash_map` - Hash table to check membership + /// * `description` - Description for debugging + pub(super) fn new( + hash_expr: PhysicalExprRef, + hash_map: Arc, + description: String, + ) -> Self { + Self { + hash_expr, + hash_map, + description, + } + } +} + +impl std::fmt::Debug for HashTableLookupExpr { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}({:?})", self.description, self.hash_expr) + } +} + +impl Hash for HashTableLookupExpr { + fn hash(&self, state: &mut H) { + self.hash_expr.dyn_hash(state); + self.description.hash(state); + } +} + +impl PartialEq for HashTableLookupExpr { + fn eq(&self, other: &Self) -> bool { + Arc::ptr_eq(&self.hash_expr, &other.hash_expr) + && self.description == other.description + } +} + +impl Eq for HashTableLookupExpr {} + +impl Display for HashTableLookupExpr { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.description) + } +} + +impl PhysicalExpr for HashTableLookupExpr { + fn as_any(&self) -> &dyn Any { + self + } + + fn children(&self) -> Vec<&Arc> { + vec![&self.hash_expr] + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> Result> { + if children.len() != 1 { + return internal_err!( + "HashTableLookupExpr expects exactly 1 child, got {}", + children.len() + ); + } + Ok(Arc::new(HashTableLookupExpr::new( + Arc::clone(&children[0]), + Arc::clone(&self.hash_map), + self.description.clone(), + ))) + } + + fn data_type(&self, _input_schema: &Schema) -> Result { + Ok(DataType::Boolean) + } + + fn nullable(&self, _input_schema: &Schema) -> Result { + Ok(false) + } + + fn evaluate( + &self, + batch: &arrow::record_batch::RecordBatch, + ) -> Result { + let num_rows = batch.num_rows(); + + // Evaluate hash expression to get hash values + let hash_array = self.hash_expr.evaluate(batch)?.into_array(num_rows)?; + let hash_array = hash_array.as_any().downcast_ref::().ok_or( + internal_datafusion_err!( + "HashTableLookupExpr expects UInt64Array from hash expression" + ), + )?; + + // Check each hash against the hash table + let mut buf = MutableBuffer::from_len_zeroed(bit_util::ceil(num_rows, 8)); + for (idx, hash_value) in hash_array.values().iter().enumerate() { + // Use get_matched_indices to check - if it returns any indices, the hash exists + let (matched_indices, _) = self + .hash_map + .get_matched_indices(Box::new(std::iter::once((idx, hash_value))), None); + + if !matched_indices.is_empty() { + bit_util::set_bit(buf.as_slice_mut(), idx); + } + } + + Ok(ColumnarValue::Array(Arc::new( + BooleanArray::new_from_packed(buf, 0, num_rows), + ))) + } + + fn fmt_sql(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.description) + } +} diff --git a/datafusion/physical-plan/src/joins/hash_join/shared_bounds.rs b/datafusion/physical-plan/src/joins/hash_join/shared_bounds.rs index 
cb727f40a20a2..a77dd14075975 100644 --- a/datafusion/physical-plan/src/joins/hash_join/shared_bounds.rs +++ b/datafusion/physical-plan/src/joins/hash_join/shared_bounds.rs @@ -21,18 +21,25 @@ use std::fmt; use std::sync::Arc; -use crate::joins::hash_join::partitioned_hash_eval::HashExpr; +use crate::joins::hash_join::exec::HASH_JOIN_SEED; +use crate::joins::hash_join::inlist_builder::build_struct_fields; +use crate::joins::hash_join::partitioned_hash_eval::{HashExpr, HashTableLookupExpr}; +use crate::joins::utils::JoinHashMapType; use crate::joins::PartitionMode; use crate::ExecutionPlan; use crate::ExecutionPlanProperties; use ahash::RandomState; +use arrow::array::ArrayRef; +use arrow::datatypes::{DataType, Field, Schema}; +use datafusion_common::config::ConfigOptions; use datafusion_common::{Result, ScalarValue}; use datafusion_expr::Operator; +use datafusion_functions::core::r#struct as struct_func; use datafusion_physical_expr::expressions::{ - lit, BinaryExpr, CaseExpr, DynamicFilterPhysicalExpr, + lit, BinaryExpr, CaseExpr, DynamicFilterPhysicalExpr, InListExpr, }; -use datafusion_physical_expr::{PhysicalExpr, PhysicalExprRef}; +use datafusion_physical_expr::{PhysicalExpr, PhysicalExprRef, ScalarFunctionExpr}; use parking_lot::Mutex; use tokio::sync::Barrier; @@ -72,11 +79,77 @@ impl PartitionBounds { } } -/// Creates a bounds predicate from partition bounds. +/// Creates a membership predicate for filter pushdown. /// -/// Returns a bound predicate (col >= min AND col <= max) for all key columns in the ON expression that have computed bounds from the build phase. +/// If `inlist_values` is provided (for small build sides), creates an InList expression. +/// Otherwise, creates a HashTableLookup expression (for large build sides). +/// +/// Supports both single-column and multi-column joins using struct expressions. +fn create_membership_predicate( + on_right: &[PhysicalExprRef], + pushdown: PushdownStrategy, + random_state: &RandomState, + schema: &Schema, +) -> Result>> { + match pushdown { + // Use InList expression for small build sides + PushdownStrategy::InList(in_list_array) => { + // Build the expression to compare against + let expr = if on_right.len() == 1 { + // Single column: col IN (val1, val2, ...) + Arc::clone(&on_right[0]) + } else { + let fields = build_struct_fields( + on_right + .iter() + .map(|r| r.data_type(schema)) + .collect::>>()? + .as_ref(), + )?; + + // The return field name and the function field name don't really matter here. + let return_field = + Arc::new(Field::new("struct", DataType::Struct(fields), true)); + + Arc::new(ScalarFunctionExpr::new( + "struct", + struct_func(), + on_right.to_vec(), + return_field, + Arc::new(ConfigOptions::default()), + )) as Arc + }; + + // Use in_list_from_array() helper to create InList with static_filter optimization (hash-based lookup) + Ok(Some(Arc::new(InListExpr::try_new_from_array( + expr, + in_list_array, + false, + )?))) + } + // Use hash table lookup for large build sides + PushdownStrategy::HashTable(hash_map) => { + let lookup_hash_expr = Arc::new(HashExpr::new( + on_right.to_vec(), + random_state.clone(), + "hash_join".to_string(), + )) as Arc; + + Ok(Some(Arc::new(HashTableLookupExpr::new( + lookup_hash_expr, + hash_map, + "hash_lookup".to_string(), + )) as Arc)) + } + // Empty partition - should not create a filter for this + PushdownStrategy::Empty => Ok(None), + } +} + +/// Creates a bounds predicate from partition bounds. /// /// Returns `None` if no column bounds are available. 
+/// Returns a combined predicate (col >= min AND col <= max) for all columns with bounds. fn create_bounds_predicate( on_right: &[PhysicalExprRef], bounds: &PartitionBounds, @@ -158,41 +231,48 @@ pub(crate) struct SharedBuildAccumulator { /// Random state for partitioning (RepartitionExec's hash function with 0,0,0,0 seeds) /// Used for PartitionedHashLookupPhysicalExpr repartition_random_state: RandomState, + /// Schema of the probe (right) side for evaluating filter expressions + probe_schema: Arc, } +/// Strategy for filter pushdown (decided at collection time) #[derive(Clone)] -pub(crate) enum PartitionBuildDataReport { +pub(crate) enum PushdownStrategy { + /// Use InList for small build sides (< 128MB) + InList(ArrayRef), + /// Use hash table lookup for large build sides + HashTable(Arc), + /// There was no data in this partition, do not build a dynamic filter for it + Empty, +} + +/// Build-side data reported by a single partition +pub(crate) enum PartitionBuildData { Partitioned { partition_id: usize, - /// Bounds computed from this partition's build side. - /// If the partition is empty (no rows) this will be None. - bounds: Option, + pushdown: PushdownStrategy, + bounds: PartitionBounds, }, CollectLeft { - /// Bounds computed from the collected build side. - /// If the build side is empty (no rows) this will be None. - bounds: Option, + pushdown: PushdownStrategy, + bounds: PartitionBounds, }, } +/// Per-partition accumulated data (Partitioned mode) #[derive(Clone)] -struct PartitionedBuildData { - partition_id: usize, - bounds: PartitionBounds, -} - -#[derive(Clone)] -struct CollectLeftBuildData { +struct PartitionData { bounds: PartitionBounds, + pushdown: PushdownStrategy, } /// Build-side data organized by partition mode enum AccumulatedBuildData { Partitioned { - partitions: Vec>, + partitions: Vec>, }, CollectLeft { - data: Option, + data: Option, }, } @@ -261,6 +341,7 @@ impl SharedBuildAccumulator { dynamic_filter, on_right, repartition_random_state, + probe_schema: right_child.schema(), } } @@ -276,10 +357,7 @@ impl SharedBuildAccumulator { /// /// # Returns /// * `Result<()>` - Ok if successful, Err if filter update failed or mode mismatch - pub(crate) async fn report_build_data( - &self, - data: PartitionBuildDataReport, - ) -> Result<()> { + pub(crate) async fn report_build_data(&self, data: PartitionBuildData) -> Result<()> { // Store data in the accumulator { let mut guard = self.inner.lock(); @@ -287,32 +365,23 @@ impl SharedBuildAccumulator { match (data, &mut *guard) { // Partitioned mode ( - PartitionBuildDataReport::Partitioned { + PartitionBuildData::Partitioned { partition_id, + pushdown, bounds, }, AccumulatedBuildData::Partitioned { partitions }, ) => { - if let Some(bounds) = bounds { - partitions[partition_id] = Some(PartitionedBuildData { - partition_id, - bounds, - }); - } + partitions[partition_id] = Some(PartitionData { pushdown, bounds }); } // CollectLeft mode (store once, deduplicate across partitions) ( - PartitionBuildDataReport::CollectLeft { bounds }, + PartitionBuildData::CollectLeft { pushdown, bounds }, AccumulatedBuildData::CollectLeft { data }, ) => { - match (bounds, data) { - (None, _) | (_, Some(_)) => { - // No bounds reported or already reported; do nothing - } - (Some(new_bounds), data) => { - // First report, store the bounds - *data = Some(CollectLeftBuildData { bounds: new_bounds }); - } + // Deduplicate - all partitions report the same data in CollectLeft + if data.is_none() { + *data = Some(PartitionData { pushdown, bounds }); } 
} // Mismatched modes - should never happen @@ -333,81 +402,182 @@ impl SharedBuildAccumulator { // CollectLeft: Simple conjunction of bounds and membership check AccumulatedBuildData::CollectLeft { data } => { if let Some(partition_data) = data { + // Create membership predicate (InList for small build sides, hash lookup otherwise) + let membership_expr = create_membership_predicate( + &self.on_right, + partition_data.pushdown.clone(), + &HASH_JOIN_SEED, + self.probe_schema.as_ref(), + )?; + // Create bounds check expression (if bounds available) - let Some(filter_expr) = create_bounds_predicate( + let bounds_expr = create_bounds_predicate( &self.on_right, &partition_data.bounds, - ) else { - // No bounds available, nothing to update - return Ok(()); - }; - - self.dynamic_filter.update(filter_expr)?; + ); + + // Combine membership and bounds expressions for multi-layer optimization: + // - Bounds (min/max): Enable statistics-based pruning (Parquet row group/file skipping) + // - Membership (InList/hash lookup): Enables: + // * Precise filtering (exact value matching) + // * Bloom filter utilization (if present in Parquet files) + // * Better pruning for data types where min/max isn't effective (e.g., UUIDs) + // Together, they provide complementary benefits and maximize data skipping. + // Only update the filter if we have something to push down + if let Some(filter_expr) = match (membership_expr, bounds_expr) { + (Some(membership), Some(bounds)) => { + // Both available: combine with AND + Some(Arc::new(BinaryExpr::new( + bounds, + Operator::And, + membership, + )) + as Arc) + } + (Some(membership), None) => { + // Membership available but no bounds + // This is reachable when we have data but bounds aren't available + // (e.g., unsupported data types or no columns with bounds) + Some(membership) + } + (None, Some(bounds)) => { + // Bounds available but no membership. + // This should be unreachable in practice: we can always push down a reference + // to the hash table. + // But it seems safer to handle it defensively. + Some(bounds) + } + (None, None) => { + // No filter available (e.g., empty build side) + // Don't update the filter, but continue to mark complete + None + } + } { + self.dynamic_filter.update(filter_expr)?; + } } } // Partitioned: CASE expression routing to per-partition filters AccumulatedBuildData::Partitioned { partitions } => { - // Collect all partition data, skipping empty partitions + // Collect all partition data (should all be Some at this point) let partition_data: Vec<_> = partitions.iter().filter_map(|p| p.as_ref()).collect(); - if partition_data.is_empty() { - // All partitions are empty: no rows can match, skip the probe side entirely - self.dynamic_filter.update(lit(false))?; - return Ok(()); - } + if !partition_data.is_empty() { + // Build a CASE expression that combines range checks AND membership checks + // CASE (hash_repartition(join_keys) % num_partitions) + // WHEN 0 THEN (col >= min_0 AND col <= max_0 AND ...) AND membership_check_0 + // WHEN 1 THEN (col >= min_1 AND col <= max_1 AND ...) AND membership_check_1 + // ... 
+ // ELSE false + // END + + let num_partitions = partition_data.len(); + + // Create base expression: hash_repartition(join_keys) % num_partitions + let routing_hash_expr = Arc::new(HashExpr::new( + self.on_right.clone(), + self.repartition_random_state.clone(), + "hash_repartition".to_string(), + )) + as Arc; + + let modulo_expr = Arc::new(BinaryExpr::new( + routing_hash_expr, + Operator::Modulo, + lit(ScalarValue::UInt64(Some(num_partitions as u64))), + )) + as Arc; + + // Create WHEN branches for each partition + let when_then_branches: Vec<( + Arc, + Arc, + )> = partitions + .iter() + .enumerate() + .filter_map(|(partition_id, partition_opt)| { + partition_opt.as_ref().and_then(|partition| { + // Skip empty partitions - they would always return false anyway + match &partition.pushdown { + PushdownStrategy::Empty => None, + _ => Some((partition_id, partition)), + } + }) + }) + .map(|(partition_id, partition)| -> Result<_> { + // WHEN partition_id + let when_expr = + lit(ScalarValue::UInt64(Some(partition_id as u64))); + + // THEN: Combine bounds check AND membership predicate + + // 1. Create membership predicate (InList for small build sides, hash lookup otherwise) + let membership_expr = create_membership_predicate( + &self.on_right, + partition.pushdown.clone(), + &HASH_JOIN_SEED, + self.probe_schema.as_ref(), + )?; + + // 2. Create bounds check expression for this partition (if bounds available) + let bounds_expr = create_bounds_predicate( + &self.on_right, + &partition.bounds, + ); + + // 3. Combine membership and bounds expressions + let then_expr = match (membership_expr, bounds_expr) { + (Some(membership), Some(bounds)) => { + // Both available: combine with AND + Arc::new(BinaryExpr::new( + bounds, + Operator::And, + membership, + )) + as Arc + } + (Some(membership), None) => { + // Membership available but no bounds (e.g., unsupported data types) + membership + } + (None, Some(bounds)) => { + // Bounds available but no membership. + // This should be unreachable in practice: we can always push down a reference + // to the hash table. + // But it seems safer to handle it defensively. + bounds + } + (None, None) => { + // No filter for this partition - should not happen due to filter_map above + // but handle defensively by returning a "true" literal + lit(true) + } + }; + + Ok((when_expr, then_expr)) + }) + .collect::>>()?; + + // Optimize for single partition: skip CASE expression entirely + let filter_expr = if when_then_branches.is_empty() { + // All partitions are empty: no rows can match + lit(false) + } else if when_then_branches.len() == 1 { + // Single partition: just use the condition directly + // since hash % 1 == 0 always, the WHEN 0 branch will always match + Arc::clone(&when_then_branches[0].1) + } else { + // Multiple partitions: create CASE expression + Arc::new(CaseExpr::try_new( + Some(modulo_expr), + when_then_branches, + Some(lit(false)), // ELSE false + )?) as Arc + }; - // Build a CASE expression that combines range checks AND membership checks - // CASE (hash_repartition(join_keys) % num_partitions) - // WHEN 0 THEN (col >= min_0 AND col <= max_0 AND ...) - // WHEN 1 THEN (col >= min_1 AND col <= max_1 AND ...) - // ... 
- // ELSE false - // END - - let num_partitions = partitions.len(); - - // Create base expression: hash_repartition(join_keys) % num_partitions - let routing_hash_expr = Arc::new(HashExpr::new( - self.on_right.clone(), - self.repartition_random_state.clone(), - "hash_repartition".to_string(), - )) - as Arc; - - let modulo_expr = Arc::new(BinaryExpr::new( - routing_hash_expr, - Operator::Modulo, - lit(ScalarValue::UInt64(Some(num_partitions as u64))), - )) as Arc; - - // Create WHEN branches for each partition - let when_then_branches: Vec<( - Arc, - Arc, - )> = partition_data - .into_iter() - .map(|pdata| -> Result<_> { - // WHEN partition_id - let when_expr = - lit(ScalarValue::UInt64(Some(pdata.partition_id as u64))); - - // Create bounds check expression for this partition (if bounds available) - let bounds_expr = - create_bounds_predicate(&self.on_right, &pdata.bounds) - .unwrap_or_else(|| lit(true)); // No bounds means all rows pass - - Ok((when_expr, bounds_expr)) - }) - .collect::>>()?; - - let case_expr = Arc::new(CaseExpr::try_new( - Some(modulo_expr), - when_then_branches, - Some(lit(false)), // ELSE false - )?) as Arc; - - self.dynamic_filter.update(case_expr)?; + self.dynamic_filter.update(filter_expr)?; + } } } self.dynamic_filter.mark_complete(); diff --git a/datafusion/physical-plan/src/joins/hash_join/stream.rs b/datafusion/physical-plan/src/joins/hash_join/stream.rs index ea6584c61dcd2..626516c11e0c1 100644 --- a/datafusion/physical-plan/src/joins/hash_join/stream.rs +++ b/datafusion/physical-plan/src/joins/hash_join/stream.rs @@ -25,7 +25,7 @@ use std::task::Poll; use crate::joins::hash_join::exec::JoinLeftData; use crate::joins::hash_join::shared_bounds::{ - PartitionBuildDataReport, SharedBuildAccumulator, + PartitionBounds, PartitionBuildData, SharedBuildAccumulator, }; use crate::joins::utils::{ equal_rows_arr, get_final_indices_from_shared_bitmap, OnceFut, @@ -422,18 +422,31 @@ impl HashJoinStream { PartitionMode::Auto => unreachable!("PartitionMode::Auto should not be present at execution time. 
This is a bug in DataFusion, please report it!"), }; + // Determine pushdown strategy based on availability of InList values + let pushdown = left_data.membership().clone(); + + // Construct the appropriate build data enum variant based on partition mode let build_data = match self.mode { - PartitionMode::Partitioned => PartitionBuildDataReport::Partitioned { + PartitionMode::Partitioned => PartitionBuildData::Partitioned { partition_id: left_side_partition_id, - bounds: left_data.bounds.clone(), + pushdown, + bounds: left_data + .bounds + .clone() + .unwrap_or_else(|| PartitionBounds::new(vec![])), }, - PartitionMode::CollectLeft => PartitionBuildDataReport::CollectLeft { - bounds: left_data.bounds.clone(), + PartitionMode::CollectLeft => PartitionBuildData::CollectLeft { + pushdown, + bounds: left_data + .bounds + .clone() + .unwrap_or_else(|| PartitionBounds::new(vec![])), }, PartitionMode::Auto => unreachable!( "PartitionMode::Auto should not be present at execution time" ), }; + self.build_waiter = Some(OnceFut::new(async move { build_accumulator.report_build_data(build_data).await })); diff --git a/datafusion/physical-plan/src/joins/join_hash_map.rs b/datafusion/physical-plan/src/joins/join_hash_map.rs index bdd4bfeeb0fbe..16012fe9014c4 100644 --- a/datafusion/physical-plan/src/joins/join_hash_map.rs +++ b/datafusion/physical-plan/src/joins/join_hash_map.rs @@ -117,6 +117,9 @@ pub trait JoinHashMapType: Send + Sync { /// Returns `true` if the join hash map contains no entries. fn is_empty(&self) -> bool; + + /// Returns the number of entries in the join hash map. + fn len(&self) -> usize; } pub struct JoinHashMapU32 { @@ -183,6 +186,10 @@ impl JoinHashMapType for JoinHashMapU32 { fn is_empty(&self) -> bool { self.map.is_empty() } + + fn len(&self) -> usize { + self.map.len() + } } pub struct JoinHashMapU64 { @@ -249,6 +256,10 @@ impl JoinHashMapType for JoinHashMapU64 { fn is_empty(&self) -> bool { self.map.is_empty() } + + fn len(&self) -> usize { + self.map.len() + } } // Type of offsets for obtaining indices from JoinHashMap. 
diff --git a/datafusion/physical-plan/src/joins/stream_join_utils.rs b/datafusion/physical-plan/src/joins/stream_join_utils.rs index 80221a77992ce..0be7e0782c904 100644 --- a/datafusion/physical-plan/src/joins/stream_join_utils.rs +++ b/datafusion/physical-plan/src/joins/stream_join_utils.rs @@ -95,6 +95,10 @@ impl JoinHashMapType for PruningJoinHashMap { fn is_empty(&self) -> bool { self.map.is_empty() } + + fn len(&self) -> usize { + self.map.len() + } } /// The `PruningJoinHashMap` is similar to a regular `JoinHashMap`, but with diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index 36ea5f6fc5add..606c8ce655997 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -6407,9 +6407,10 @@ physical_plan 03)----CoalescePartitionsExec 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] 05)--------ProjectionExec: expr=[] -06)----------FilterExec: substr(md5(CAST(value@0 AS Utf8View)), 1, 32) IN (SET) ([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c]) -07)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -08)--------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] +06)----------CoalesceBatchesExec: target_batch_size=8192 +07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8View)), 1, 32) IN (SET) ([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c]) +08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] query I with test AS (SELECT substr(md5(i::text)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) @@ -6435,9 +6436,10 @@ physical_plan 03)----CoalescePartitionsExec 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] 05)--------ProjectionExec: expr=[] -06)----------FilterExec: substr(md5(CAST(value@0 AS Utf8View)), 1, 32) IN (SET) ([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c]) -07)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -08)--------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] +06)----------CoalesceBatchesExec: target_batch_size=8192 +07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8View)), 1, 32) IN (SET) ([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c]) +08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] query I with test AS (SELECT substr(md5(i::text)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) @@ -6463,9 +6465,10 @@ physical_plan 03)----CoalescePartitionsExec 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] 05)--------ProjectionExec: expr=[] -06)----------FilterExec: substr(md5(CAST(value@0 AS Utf8View)), 1, 32) IN (SET) ([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c]) -07)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -08)--------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] +06)----------CoalesceBatchesExec: target_batch_size=8192 +07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8View)), 1, 32) IN (SET) ([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c]) +08)--------------RepartitionExec: 
partitioning=RoundRobinBatch(4), input_partitions=1 +09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] query I with test AS (SELECT substr(md5(i::text)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) @@ -6491,9 +6494,10 @@ physical_plan 03)----CoalescePartitionsExec 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] 05)--------ProjectionExec: expr=[] -06)----------FilterExec: substr(md5(CAST(value@0 AS Utf8View)), 1, 32) IN (SET) ([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c]) -07)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -08)--------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] +06)----------CoalesceBatchesExec: target_batch_size=8192 +07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8View)), 1, 32) IN (SET) ([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c]) +08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] query I with test AS (SELECT substr(md5(i::text)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) @@ -6519,9 +6523,10 @@ physical_plan 03)----CoalescePartitionsExec 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] 05)--------ProjectionExec: expr=[] -06)----------FilterExec: substr(md5(CAST(value@0 AS Utf8View)), 1, 32) IN (SET) ([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c]) -07)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -08)--------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] +06)----------CoalesceBatchesExec: target_batch_size=8192 +07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8View)), 1, 32) IN (SET) ([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c]) +08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] query I with test AS (SELECT substr(md5(i::text)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index 7a34b240bd7c7..f9513eb662f9e 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -299,6 +299,8 @@ datafusion.optimizer.enable_topk_dynamic_filter_pushdown true datafusion.optimizer.enable_window_limits true datafusion.optimizer.expand_views_at_output false datafusion.optimizer.filter_null_join_keys false +datafusion.optimizer.hash_join_inlist_pushdown_max_distinct_values 150 +datafusion.optimizer.hash_join_inlist_pushdown_max_size 131072 datafusion.optimizer.hash_join_single_partition_threshold 1048576 datafusion.optimizer.hash_join_single_partition_threshold_rows 131072 datafusion.optimizer.max_passes 3 @@ -420,6 +422,8 @@ datafusion.optimizer.enable_topk_dynamic_filter_pushdown true When set to true, datafusion.optimizer.enable_window_limits true When set to true, the optimizer will attempt to push limit operations past window functions, if possible datafusion.optimizer.expand_views_at_output false When set to true, if the returned type is a view type then the output will be coerced to a non-view. 
Coerces `Utf8View` to `LargeUtf8`, and `BinaryView` to `LargeBinary`. datafusion.optimizer.filter_null_join_keys false When set to true, the optimizer will insert filters before a join between a nullable and non-nullable column to filter out nulls on the nullable side. This filter can add additional overhead when the file format does not fully support predicate push down. +datafusion.optimizer.hash_join_inlist_pushdown_max_distinct_values 150 Maximum number of distinct values (rows) in the build side of a hash join to be pushed down as an InList expression for dynamic filtering. Build sides with more rows than this will use hash table lookups instead. Set to 0 to always use hash table lookups. This provides an additional limit beyond `hash_join_inlist_pushdown_max_size` to prevent very large IN lists that might not provide much benefit over hash table lookups. This uses the deduplicated row count once the build side has been evaluated. The default is 150 values per partition. This is inspired by Trino's `max-filter-keys-per-column` setting. See: +datafusion.optimizer.hash_join_inlist_pushdown_max_size 131072 Maximum size in bytes for the build side of a hash join to be pushed down as an InList expression for dynamic filtering. Build sides larger than this will use hash table lookups instead. Set to 0 to always use hash table lookups. InList pushdown can be more efficient for small build sides because it can result in better statistics pruning as well as use any bloom filters present on the scan side. InList expressions are also more transparent and easier to serialize over the network in distributed uses of DataFusion. On the other hand InList pushdown requires making a copy of the data and thus adds some overhead to the build side and uses more memory. This setting is per-partition, so we may end up using `hash_join_inlist_pushdown_max_size` * `target_partitions` memory. The default is 128kB per partition. This should allow point lookup joins (e.g. joining on a unique primary key) to use InList pushdown in most cases but avoids excessive memory usage or overhead for larger joins. 
datafusion.optimizer.hash_join_single_partition_threshold 1048576 The maximum estimated size in bytes for one input side of a HashJoin will be collected into a single partition datafusion.optimizer.hash_join_single_partition_threshold_rows 131072 The maximum estimated size in rows for one input side of a HashJoin will be collected into a single partition datafusion.optimizer.max_passes 3 Number of times that the optimizer will attempt to optimize the plan diff --git a/datafusion/sqllogictest/test_files/joins.slt b/datafusion/sqllogictest/test_files/joins.slt index 4bdf2e5da9632..0a0a8c9559030 100644 --- a/datafusion/sqllogictest/test_files/joins.slt +++ b/datafusion/sqllogictest/test_files/joins.slt @@ -5238,3 +5238,66 @@ set datafusion.explain.physical_plan_only = false; statement ok set datafusion.optimizer.enable_piecewise_merge_join = false; + +# Test hash join with columns named c0, c1, c2 +# These names match the internal naming pattern in inlist_builder.rs +# Regression test for https://github.com/apache/datafusion/pull/18393#discussion_r2601145291 + +statement ok +CREATE TABLE t1_c_source(c0 INT, c1 VARCHAR, c2 INT) AS VALUES +(1, 'a', 100), +(2, 'b', 200), +(3, 'c', 300); + +statement ok +CREATE TABLE t2_c_source(c0 INT, c1 VARCHAR) AS VALUES +(1, 'x'), +(3, 'z'); + +query I +COPY t1_c_source TO 'test_files/scratch/joins/t1_c.parquet' STORED AS PARQUET; +---- +3 + +query I +COPY t2_c_source TO 'test_files/scratch/joins/t2_c.parquet' STORED AS PARQUET; +---- +2 + +statement ok +CREATE EXTERNAL TABLE t1_c(c0 INT, c1 VARCHAR, c2 INT) +STORED AS PARQUET +LOCATION 'test_files/scratch/joins/t1_c.parquet'; + +statement ok +CREATE EXTERNAL TABLE t2_c(c0 INT, c1 VARCHAR) +STORED AS PARQUET +LOCATION 'test_files/scratch/joins/t2_c.parquet'; + +# Test single-column join with column named c0 +query ITI rowsort +SELECT t1.c0, t1.c1, t1.c2 +FROM t1_c t1 +INNER JOIN t2_c t2 ON t1.c0 = t2.c0; +---- +1 a 100 +3 c 300 + +# Test multi-column join with columns named c0, c1 +query ITI rowsort +SELECT t1.c0, t1.c1, t1.c2 +FROM t1_c t1 +INNER JOIN t2_c t2 ON t1.c0 = t2.c0 AND t1.c1 = t2.c1; +---- + +statement ok +DROP TABLE t1_c_source; + +statement ok +DROP TABLE t2_c_source; + +statement ok +DROP TABLE t1_c; + +statement ok +DROP TABLE t2_c; diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 6e5e063a12926..f993c61b235da 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -63,124 +63,126 @@ SET datafusion.execution.target_partitions = '1'; The following configuration settings are available: -| key | default | description | -| ----------------------------------------------------------------------- | ------------------------- | 
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| datafusion.catalog.create_default_catalog_and_schema | true | Whether the default catalog and schema should be created automatically. | -| datafusion.catalog.default_catalog | datafusion | The default catalog name - this impacts what SQL queries use if not specified | -| datafusion.catalog.default_schema | public | The default schema name - this impacts what SQL queries use if not specified | -| datafusion.catalog.information_schema | false | Should DataFusion provide access to `information_schema` virtual tables for displaying schema information | -| datafusion.catalog.location | NULL | Location scanned to load tables for `default` schema | -| datafusion.catalog.format | NULL | Type of `TableProvider` to use when loading `default` schema | -| datafusion.catalog.has_header | true | Default value for `format.has_header` for `CREATE EXTERNAL TABLE` if not specified explicitly in the statement. | -| datafusion.catalog.newlines_in_values | false | Specifies whether newlines in (quoted) CSV values are supported. This is the default value for `format.newlines_in_values` for `CREATE EXTERNAL TABLE` if not specified explicitly in the statement. Parsing newlines in quoted values may be affected by execution behaviour such as parallel file scanning. Setting this to `true` ensures that newlines in values are parsed successfully, which may reduce performance. | -| datafusion.execution.batch_size | 8192 | Default batch size while creating new batches, it's especially useful for buffer-in-memory batches since creating tiny batches would result in too much metadata memory consumption | -| datafusion.execution.coalesce_batches | true | When set to true, record batches will be examined between each operator and small batches will be coalesced into larger batches. This is helpful when there are highly selective filters or joins that could produce tiny output batches. The target batch size is determined by the configuration setting | -| datafusion.execution.collect_statistics | true | Should DataFusion collect statistics when first creating a table. Has no effect after the table is created. Applies to the default `ListingTableProvider` in DataFusion. Defaults to true. | -| datafusion.execution.target_partitions | 0 | Number of partitions for query execution. Increasing partitions can increase concurrency. Defaults to the number of CPU cores on the system | -| datafusion.execution.time_zone | NULL | The default time zone Some functions, e.g. 
`now` return timestamps in this time zone | -| datafusion.execution.parquet.enable_page_index | true | (reading) If true, reads the Parquet data page level metadata (the Page Index), if present, to reduce the I/O and number of rows decoded. | -| datafusion.execution.parquet.pruning | true | (reading) If true, the parquet reader attempts to skip entire row groups based on the predicate in the query and the metadata (min/max values) stored in the parquet file | -| datafusion.execution.parquet.skip_metadata | true | (reading) If true, the parquet reader skip the optional embedded metadata that may be in the file Schema. This setting can help avoid schema conflicts when querying multiple parquet files with schemas containing compatible types but different metadata | -| datafusion.execution.parquet.metadata_size_hint | 524288 | (reading) If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer Default setting to 512 KiB, which should be sufficient for most parquet files, it can reduce one I/O operation per parquet file. If the metadata is larger than the hint, two reads will still be performed. | -| datafusion.execution.parquet.pushdown_filters | false | (reading) If true, filter expressions are be applied during the parquet decoding operation to reduce the number of rows decoded. This optimization is sometimes called "late materialization". | -| datafusion.execution.parquet.reorder_filters | false | (reading) If true, filter expressions evaluated during the parquet decoding operation will be reordered heuristically to minimize the cost of evaluation. If false, the filters are applied in the same order as written in the query | -| datafusion.execution.parquet.schema_force_view_types | true | (reading) If true, parquet reader will read columns of `Utf8/Utf8Large` with `Utf8View`, and `Binary/BinaryLarge` with `BinaryView`. | -| datafusion.execution.parquet.binary_as_string | false | (reading) If true, parquet reader will read columns of `Binary/LargeBinary` with `Utf8`, and `BinaryView` with `Utf8View`. Parquet files generated by some legacy writers do not correctly set the UTF8 flag for strings, causing string columns to be loaded as BLOB instead. | -| datafusion.execution.parquet.coerce_int96 | NULL | (reading) If true, parquet reader will read columns of physical type int96 as originating from a different resolution than nanosecond. This is useful for reading data from systems like Spark which stores microsecond resolution timestamps in an int96 allowing it to write values with a larger date range than 64-bit timestamps with nanosecond resolution. | -| datafusion.execution.parquet.bloom_filter_on_read | true | (reading) Use any available bloom filters when reading parquet files | -| datafusion.execution.parquet.max_predicate_cache_size | NULL | (reading) The maximum predicate cache size, in bytes. When `pushdown_filters` is enabled, sets the maximum memory used to cache the results of predicate evaluation between filter evaluation and output generation. Decreasing this value will reduce memory usage, but may increase IO and CPU usage. None means use the default parquet reader setting. 0 means no caching. 
| -| datafusion.execution.parquet.data_pagesize_limit | 1048576 | (writing) Sets best effort maximum size of data page in bytes | -| datafusion.execution.parquet.write_batch_size | 1024 | (writing) Sets write_batch_size in bytes | -| datafusion.execution.parquet.writer_version | 1.0 | (writing) Sets parquet writer version valid values are "1.0" and "2.0" | -| datafusion.execution.parquet.skip_arrow_metadata | false | (writing) Skip encoding the embedded arrow metadata in the KV_meta This is analogous to the `ArrowWriterOptions::with_skip_arrow_metadata`. Refer to | -| datafusion.execution.parquet.compression | zstd(3) | (writing) Sets default parquet compression codec. Valid values are: uncompressed, snappy, gzip(level), lzo, brotli(level), lz4, zstd(level), and lz4_raw. These values are not case sensitive. If NULL, uses default parquet writer setting Note that this default setting is not the same as the default parquet writer setting. | -| datafusion.execution.parquet.dictionary_enabled | true | (writing) Sets if dictionary encoding is enabled. If NULL, uses default parquet writer setting | -| datafusion.execution.parquet.dictionary_page_size_limit | 1048576 | (writing) Sets best effort maximum dictionary page size, in bytes | -| datafusion.execution.parquet.statistics_enabled | page | (writing) Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting | -| datafusion.execution.parquet.max_row_group_size | 1048576 | (writing) Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read. | -| datafusion.execution.parquet.created_by | datafusion version 51.0.0 | (writing) Sets "created by" property | -| datafusion.execution.parquet.column_index_truncate_length | 64 | (writing) Sets column index truncate length | -| datafusion.execution.parquet.statistics_truncate_length | 64 | (writing) Sets statistics truncate length. If NULL, uses default parquet writer setting | -| datafusion.execution.parquet.data_page_row_count_limit | 20000 | (writing) Sets best effort maximum number of rows in data page | -| datafusion.execution.parquet.encoding | NULL | (writing) Sets default encoding for any column. Valid values are: plain, plain_dictionary, rle, bit_packed, delta_binary_packed, delta_length_byte_array, delta_byte_array, rle_dictionary, and byte_stream_split. These values are not case sensitive. If NULL, uses default parquet writer setting | -| datafusion.execution.parquet.bloom_filter_on_write | false | (writing) Write bloom filters for all columns when creating parquet files | -| datafusion.execution.parquet.bloom_filter_fpp | NULL | (writing) Sets bloom filter false positive probability. If NULL, uses default parquet writer setting | -| datafusion.execution.parquet.bloom_filter_ndv | NULL | (writing) Sets bloom filter number of distinct values. If NULL, uses default parquet writer setting | -| datafusion.execution.parquet.allow_single_file_parallelism | true | (writing) Controls whether DataFusion will attempt to speed up writing parquet files by serializing them in parallel. Each column in each row group in each output file are serialized in parallel leveraging a maximum possible core count of n_files*n_row_groups*n_columns. 
| -| datafusion.execution.parquet.maximum_parallel_row_group_writers | 1 | (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame. | -| datafusion.execution.parquet.maximum_buffered_record_batches_per_stream | 2 | (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame. | -| datafusion.execution.planning_concurrency | 0 | Fan-out during initial physical planning. This is mostly use to plan `UNION` children in parallel. Defaults to the number of CPU cores on the system | -| datafusion.execution.skip_physical_aggregate_schema_check | false | When set to true, skips verifying that the schema produced by planning the input of `LogicalPlan::Aggregate` exactly matches the schema of the input plan. When set to false, if the schema does not match exactly (including nullability and metadata), a planning error will be raised. This is used to workaround bugs in the planner that are now caught by the new schema verification step. | -| datafusion.execution.spill_compression | uncompressed | Sets the compression codec used when spilling data to disk. Since datafusion writes spill files using the Arrow IPC Stream format, only codecs supported by the Arrow IPC Stream Writer are allowed. Valid values are: uncompressed, lz4_frame, zstd. Note: lz4_frame offers faster (de)compression, but typically results in larger spill files. In contrast, zstd achieves higher compression ratios at the cost of slower (de)compression speed. | -| datafusion.execution.sort_spill_reservation_bytes | 10485760 | Specifies the reserved memory for each spillable sort operation to facilitate an in-memory merge. When a sort operation spills to disk, the in-memory data must be sorted and merged before being written to a file. This setting reserves a specific amount of memory for that in-memory sort/merge process. Note: This setting is irrelevant if the sort operation cannot spill (i.e., if there's no `DiskManager` configured). | -| datafusion.execution.sort_in_place_threshold_bytes | 1048576 | When sorting, below what size should data be concatenated and sorted in a single RecordBatch rather than sorted in batches and merged. | -| datafusion.execution.max_spill_file_size_bytes | 134217728 | Maximum size in bytes for individual spill files before rotating to a new file. When operators spill data to disk (e.g., RepartitionExec), they write multiple batches to the same file until this size limit is reached, then rotate to a new file. This reduces syscall overhead compared to one-file-per-batch while preventing files from growing too large. A larger value reduces file creation overhead but may hold more disk space. A smaller value creates more files but allows finer-grained space reclamation as files can be deleted once fully consumed. 
Now only `RepartitionExec` supports this spill file rotation feature, other spilling operators may create spill files larger than the limit. Default: 128 MB | -| datafusion.execution.meta_fetch_concurrency | 32 | Number of files to read in parallel when inferring schema and statistics | -| datafusion.execution.minimum_parallel_output_files | 4 | Guarantees a minimum level of output files running in parallel. RecordBatches will be distributed in round robin fashion to each parallel writer. Each writer is closed and a new file opened once soft_max_rows_per_output_file is reached. | -| datafusion.execution.soft_max_rows_per_output_file | 50000000 | Target number of rows in output files when writing multiple. This is a soft max, so it can be exceeded slightly. There also will be one file smaller than the limit if the total number of rows written is not roughly divisible by the soft max | -| datafusion.execution.max_buffered_batches_per_output_file | 2 | This is the maximum number of RecordBatches buffered for each output file being worked. Higher values can potentially give faster write performance at the cost of higher peak memory consumption | -| datafusion.execution.listing_table_ignore_subdirectory | true | Should sub directories be ignored when scanning directories for data files. Defaults to true (ignores subdirectories), consistent with Hive. Note that this setting does not affect reading partitioned tables (e.g. `/table/year=2021/month=01/data.parquet`). | -| datafusion.execution.listing_table_factory_infer_partitions | true | Should a `ListingTable` created through the `ListingTableFactory` infer table partitions from Hive compliant directories. Defaults to true (partition columns are inferred and will be represented in the table schema). | -| datafusion.execution.enable_recursive_ctes | true | Should DataFusion support recursive CTEs | -| datafusion.execution.split_file_groups_by_statistics | false | Attempt to eliminate sorts by packing & sorting files with non-overlapping statistics into the same file groups. Currently experimental | -| datafusion.execution.keep_partition_by_columns | false | Should DataFusion keep the columns used for partition_by in the output RecordBatches | -| datafusion.execution.skip_partial_aggregation_probe_ratio_threshold | 0.8 | Aggregation ratio (number of distinct groups / number of input rows) threshold for skipping partial aggregation. If the value is greater then partial aggregation will skip aggregation for further input | -| datafusion.execution.skip_partial_aggregation_probe_rows_threshold | 100000 | Number of input rows partial aggregation partition should process, before aggregation ratio check and trying to switch to skipping aggregation mode | -| datafusion.execution.use_row_number_estimates_to_optimize_partitioning | false | Should DataFusion use row number estimates at the input to decide whether increasing parallelism is beneficial or not. By default, only exact row numbers (not estimates) are used for this decision. Setting this flag to `true` will likely produce better plans. if the source of statistics is accurate. We plan to make this the default in the future. | -| datafusion.execution.enforce_batch_size_in_joins | false | Should DataFusion enforce batch size in joins or not. By default, DataFusion will not enforce batch size in joins. Enforcing batch size in joins can reduce memory usage when joining large tables with a highly-selective join filter, but is also slightly slower. 
| -| datafusion.execution.objectstore_writer_buffer_size | 10485760 | Size (bytes) of data buffer DataFusion uses when writing output files. This affects the size of the data chunks that are uploaded to remote object stores (e.g. AWS S3). If very large (>= 100 GiB) output files are being written, it may be necessary to increase this size to avoid errors from the remote end point. | -| datafusion.optimizer.enable_distinct_aggregation_soft_limit | true | When set to true, the optimizer will push a limit operation into grouped aggregations which have no aggregate expressions, as a soft limit, emitting groups once the limit is reached, before all rows in the group are read. | -| datafusion.optimizer.enable_round_robin_repartition | true | When set to true, the physical plan optimizer will try to add round robin repartitioning to increase parallelism to leverage more CPU cores | -| datafusion.optimizer.enable_topk_aggregation | true | When set to true, the optimizer will attempt to perform limit operations during aggregations, if possible | -| datafusion.optimizer.enable_window_limits | true | When set to true, the optimizer will attempt to push limit operations past window functions, if possible | -| datafusion.optimizer.enable_topk_dynamic_filter_pushdown | true | When set to true, the optimizer will attempt to push down TopK dynamic filters into the file scan phase. | -| datafusion.optimizer.enable_join_dynamic_filter_pushdown | true | When set to true, the optimizer will attempt to push down Join dynamic filters into the file scan phase. | -| datafusion.optimizer.enable_dynamic_filter_pushdown | true | When set to true attempts to push down dynamic filters generated by operators (topk & join) into the file scan phase. For example, for a query such as `SELECT * FROM t ORDER BY timestamp DESC LIMIT 10`, the optimizer will attempt to push down the current top 10 timestamps that the TopK operator references into the file scans. This means that if we already have 10 timestamps in the year 2025 any files that only have timestamps in the year 2024 can be skipped / pruned at various stages in the scan. The config will suppress `enable_join_dynamic_filter_pushdown` & `enable_topk_dynamic_filter_pushdown` So if you disable `enable_topk_dynamic_filter_pushdown`, then enable `enable_dynamic_filter_pushdown`, the `enable_topk_dynamic_filter_pushdown` will be overridden. | -| datafusion.optimizer.filter_null_join_keys | false | When set to true, the optimizer will insert filters before a join between a nullable and non-nullable column to filter out nulls on the nullable side. This filter can add additional overhead when the file format does not fully support predicate push down. | -| datafusion.optimizer.repartition_aggregations | true | Should DataFusion repartition data using the aggregate keys to execute aggregates in parallel using the provided `target_partitions` level | -| datafusion.optimizer.repartition_file_min_size | 10485760 | Minimum total files size in bytes to perform file scan repartitioning. 
| -| datafusion.optimizer.repartition_joins | true | Should DataFusion repartition data using the join keys to execute joins in parallel using the provided `target_partitions` level | -| datafusion.optimizer.allow_symmetric_joins_without_pruning | true | Should DataFusion allow symmetric hash joins for unbounded data sources even when its inputs do not have any ordering or filtering If the flag is not enabled, the SymmetricHashJoin operator will be unable to prune its internal buffers, resulting in certain join types - such as Full, Left, LeftAnti, LeftSemi, Right, RightAnti, and RightSemi - being produced only at the end of the execution. This is not typical in stream processing. Additionally, without proper design for long runner execution, all types of joins may encounter out-of-memory errors. | -| datafusion.optimizer.repartition_file_scans | true | When set to `true`, datasource partitions will be repartitioned to achieve maximum parallelism. This applies to both in-memory partitions and FileSource's file groups (1 group is 1 partition). For FileSources, only Parquet and CSV formats are currently supported. If set to `true` for a FileSource, all files will be repartitioned evenly (i.e., a single large file might be partitioned into smaller chunks) for parallel scanning. If set to `false` for a FileSource, different files will be read in parallel, but repartitioning won't happen within a single file. If set to `true` for an in-memory source, all memtable's partitions will have their batches repartitioned evenly to the desired number of `target_partitions`. Repartitioning can change the total number of partitions and batches per partition, but does not slice the initial record tables provided to the MemTable on creation. | -| datafusion.optimizer.repartition_windows | true | Should DataFusion repartition data using the partitions keys to execute window functions in parallel using the provided `target_partitions` level | -| datafusion.optimizer.repartition_sorts | true | Should DataFusion execute sorts in a per-partition fashion and merge afterwards instead of coalescing first and sorting globally. With this flag is enabled, plans in the form below `text "SortExec: [a@0 ASC]", " CoalescePartitionsExec", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", ` would turn into the plan below which performs better in multithreaded environments `text "SortPreservingMergeExec: [a@0 ASC]", " SortExec: [a@0 ASC]", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", ` | -| datafusion.optimizer.prefer_existing_sort | false | When true, DataFusion will opportunistically remove sorts when the data is already sorted, (i.e. setting `preserve_order` to true on `RepartitionExec` and using `SortPreservingMergeExec`) When false, DataFusion will maximize plan parallelism using `RepartitionExec` even if this requires subsequently resorting data using a `SortExec`. | -| datafusion.optimizer.skip_failed_rules | false | When set to true, the logical plan optimizer will produce warning messages if any optimization rules produce errors and then proceed to the next rule. 
When set to false, any rules that produce errors will cause the query to fail | -| datafusion.optimizer.max_passes | 3 | Number of times that the optimizer will attempt to optimize the plan | -| datafusion.optimizer.top_down_join_key_reordering | true | When set to true, the physical plan optimizer will run a top down process to reorder the join keys | -| datafusion.optimizer.prefer_hash_join | true | When set to true, the physical plan optimizer will prefer HashJoin over SortMergeJoin. HashJoin can work more efficiently than SortMergeJoin but consumes more memory | -| datafusion.optimizer.enable_piecewise_merge_join | false | When set to true, piecewise merge join is enabled. PiecewiseMergeJoin is currently experimental. Physical planner will opt for PiecewiseMergeJoin when there is only one range filter. | -| datafusion.optimizer.hash_join_single_partition_threshold | 1048576 | The maximum estimated size in bytes for one input side of a HashJoin will be collected into a single partition | -| datafusion.optimizer.hash_join_single_partition_threshold_rows | 131072 | The maximum estimated size in rows for one input side of a HashJoin will be collected into a single partition | -| datafusion.optimizer.default_filter_selectivity | 20 | The default filter selectivity used by Filter Statistics when an exact selectivity cannot be determined. Valid values are between 0 (no selectivity) and 100 (all rows are selected). | -| datafusion.optimizer.prefer_existing_union | false | When set to true, the optimizer will not attempt to convert Union to Interleave | -| datafusion.optimizer.expand_views_at_output | false | When set to true, if the returned type is a view type then the output will be coerced to a non-view. Coerces `Utf8View` to `LargeUtf8`, and `BinaryView` to `LargeBinary`. | -| datafusion.explain.logical_plan_only | false | When set to true, the explain statement will only print logical plans | -| datafusion.explain.physical_plan_only | false | When set to true, the explain statement will only print physical plans | -| datafusion.explain.show_statistics | false | When set to true, the explain statement will print operator statistics for physical plans | -| datafusion.explain.show_sizes | true | When set to true, the explain statement will print the partition sizes | -| datafusion.explain.show_schema | false | When set to true, the explain statement will print schema information | -| datafusion.explain.format | indent | Display format of explain. Default is "indent". When set to "tree", it will print the plan in a tree-rendered format. | -| datafusion.explain.tree_maximum_render_width | 240 | (format=tree only) Maximum total width of the rendered tree. When set to 0, the tree will have no width limit. | -| datafusion.explain.analyze_level | dev | Verbosity level for "EXPLAIN ANALYZE". Default is "dev" "summary" shows common metrics for high-level insights. "dev" provides deep operator-level introspection for developers. | -| datafusion.sql_parser.parse_float_as_decimal | false | When set to true, SQL parser will parse float as decimal type | -| datafusion.sql_parser.enable_ident_normalization | true | When set to true, SQL parser will normalize ident (convert ident to lowercase when not quoted) | -| datafusion.sql_parser.enable_options_value_normalization | false | When set to true, SQL parser will normalize options value (convert value to lowercase). Note that this option is ignored and will be removed in the future. All case-insensitive values are normalized automatically. 
| -| datafusion.sql_parser.dialect | generic | Configure the SQL dialect used by DataFusion's parser; supported values include: Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, Ansi, DuckDB and Databricks. | -| datafusion.sql_parser.support_varchar_with_length | true | If true, permit lengths for `VARCHAR` such as `VARCHAR(20)`, but ignore the length. If false, error if a `VARCHAR` with a length is specified. The Arrow type system does not have a notion of maximum string length and thus DataFusion can not enforce such limits. | -| datafusion.sql_parser.map_string_types_to_utf8view | true | If true, string types (VARCHAR, CHAR, Text, and String) are mapped to `Utf8View` during SQL planning. If false, they are mapped to `Utf8`. Default is true. | -| datafusion.sql_parser.collect_spans | false | When set to true, the source locations relative to the original SQL query (i.e. [`Span`](https://docs.rs/sqlparser/latest/sqlparser/tokenizer/struct.Span.html)) will be collected and recorded in the logical plan nodes. | -| datafusion.sql_parser.recursion_limit | 50 | Specifies the recursion depth limit when parsing complex SQL Queries | -| datafusion.sql_parser.default_null_ordering | nulls_max | Specifies the default null ordering for query results. There are 4 options: - `nulls_max`: Nulls appear last in ascending order. - `nulls_min`: Nulls appear first in ascending order. - `nulls_first`: Nulls always be first in any order. - `nulls_last`: Nulls always be last in any order. By default, `nulls_max` is used to follow Postgres's behavior. postgres rule: | -| datafusion.format.safe | true | If set to `true` any formatting errors will be written to the output instead of being converted into a [`std::fmt::Error`] | -| datafusion.format.null | | Format string for nulls | -| datafusion.format.date_format | %Y-%m-%d | Date format for date arrays | -| datafusion.format.datetime_format | %Y-%m-%dT%H:%M:%S%.f | Format for DateTime arrays | -| datafusion.format.timestamp_format | %Y-%m-%dT%H:%M:%S%.f | Timestamp format for timestamp arrays | -| datafusion.format.timestamp_tz_format | NULL | Timestamp format for timestamp with timezone arrays. When `None`, ISO 8601 format is used. | -| datafusion.format.time_format | %H:%M:%S%.f | Time format for time arrays | -| datafusion.format.duration_format | pretty | Duration format. 
Can be either `"pretty"` or `"ISO8601"` | -| datafusion.format.types_info | false | Show types in visual representation batches | +| key | default | description | +| ----------------------------------------------------------------------- | ------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| datafusion.catalog.create_default_catalog_and_schema | true | Whether the default catalog and schema should be created automatically. | +| datafusion.catalog.default_catalog | datafusion | The default catalog name - this impacts what SQL queries use if not specified | +| datafusion.catalog.default_schema | public | The default schema name - this impacts what SQL queries use if not specified | +| datafusion.catalog.information_schema | false | Should DataFusion provide access to `information_schema` virtual tables for displaying schema information | +| datafusion.catalog.location | NULL | Location scanned to load tables for `default` schema | +| datafusion.catalog.format | NULL | Type of `TableProvider` to use when loading `default` schema | +| datafusion.catalog.has_header | true | Default value for `format.has_header` for `CREATE EXTERNAL TABLE` if not specified explicitly in the statement. | +| datafusion.catalog.newlines_in_values | false | Specifies whether newlines in (quoted) CSV values are supported. This is the default value for `format.newlines_in_values` for `CREATE EXTERNAL TABLE` if not specified explicitly in the statement. Parsing newlines in quoted values may be affected by execution behaviour such as parallel file scanning. Setting this to `true` ensures that newlines in values are parsed successfully, which may reduce performance. | +| datafusion.execution.batch_size | 8192 | Default batch size while creating new batches, it's especially useful for buffer-in-memory batches since creating tiny batches would result in too much metadata memory consumption | +| datafusion.execution.coalesce_batches | true | When set to true, record batches will be examined between each operator and small batches will be coalesced into larger batches. This is helpful when there are highly selective filters or joins that could produce tiny output batches. The target batch size is determined by the configuration setting | +| datafusion.execution.collect_statistics | true | Should DataFusion collect statistics when first creating a table. Has no effect after the table is created. Applies to the default `ListingTableProvider` in DataFusion. Defaults to true. | +| datafusion.execution.target_partitions | 0 | Number of partitions for query execution. 
Increasing partitions can increase concurrency. Defaults to the number of CPU cores on the system | +| datafusion.execution.time_zone | NULL | The default time zone Some functions, e.g. `now` return timestamps in this time zone | +| datafusion.execution.parquet.enable_page_index | true | (reading) If true, reads the Parquet data page level metadata (the Page Index), if present, to reduce the I/O and number of rows decoded. | +| datafusion.execution.parquet.pruning | true | (reading) If true, the parquet reader attempts to skip entire row groups based on the predicate in the query and the metadata (min/max values) stored in the parquet file | +| datafusion.execution.parquet.skip_metadata | true | (reading) If true, the parquet reader skip the optional embedded metadata that may be in the file Schema. This setting can help avoid schema conflicts when querying multiple parquet files with schemas containing compatible types but different metadata | +| datafusion.execution.parquet.metadata_size_hint | 524288 | (reading) If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer Default setting to 512 KiB, which should be sufficient for most parquet files, it can reduce one I/O operation per parquet file. If the metadata is larger than the hint, two reads will still be performed. | +| datafusion.execution.parquet.pushdown_filters | false | (reading) If true, filter expressions are be applied during the parquet decoding operation to reduce the number of rows decoded. This optimization is sometimes called "late materialization". | +| datafusion.execution.parquet.reorder_filters | false | (reading) If true, filter expressions evaluated during the parquet decoding operation will be reordered heuristically to minimize the cost of evaluation. If false, the filters are applied in the same order as written in the query | +| datafusion.execution.parquet.schema_force_view_types | true | (reading) If true, parquet reader will read columns of `Utf8/Utf8Large` with `Utf8View`, and `Binary/BinaryLarge` with `BinaryView`. | +| datafusion.execution.parquet.binary_as_string | false | (reading) If true, parquet reader will read columns of `Binary/LargeBinary` with `Utf8`, and `BinaryView` with `Utf8View`. Parquet files generated by some legacy writers do not correctly set the UTF8 flag for strings, causing string columns to be loaded as BLOB instead. | +| datafusion.execution.parquet.coerce_int96 | NULL | (reading) If true, parquet reader will read columns of physical type int96 as originating from a different resolution than nanosecond. This is useful for reading data from systems like Spark which stores microsecond resolution timestamps in an int96 allowing it to write values with a larger date range than 64-bit timestamps with nanosecond resolution. | +| datafusion.execution.parquet.bloom_filter_on_read | true | (reading) Use any available bloom filters when reading parquet files | +| datafusion.execution.parquet.max_predicate_cache_size | NULL | (reading) The maximum predicate cache size, in bytes. When `pushdown_filters` is enabled, sets the maximum memory used to cache the results of predicate evaluation between filter evaluation and output generation. Decreasing this value will reduce memory usage, but may increase IO and CPU usage. None means use the default parquet reader setting. 0 means no caching. 
| +| datafusion.execution.parquet.data_pagesize_limit | 1048576 | (writing) Sets best effort maximum size of data page in bytes | +| datafusion.execution.parquet.write_batch_size | 1024 | (writing) Sets write_batch_size in bytes | +| datafusion.execution.parquet.writer_version | 1.0 | (writing) Sets parquet writer version valid values are "1.0" and "2.0" | +| datafusion.execution.parquet.skip_arrow_metadata | false | (writing) Skip encoding the embedded arrow metadata in the KV_meta This is analogous to the `ArrowWriterOptions::with_skip_arrow_metadata`. Refer to | +| datafusion.execution.parquet.compression | zstd(3) | (writing) Sets default parquet compression codec. Valid values are: uncompressed, snappy, gzip(level), lzo, brotli(level), lz4, zstd(level), and lz4_raw. These values are not case sensitive. If NULL, uses default parquet writer setting Note that this default setting is not the same as the default parquet writer setting. | +| datafusion.execution.parquet.dictionary_enabled | true | (writing) Sets if dictionary encoding is enabled. If NULL, uses default parquet writer setting | +| datafusion.execution.parquet.dictionary_page_size_limit | 1048576 | (writing) Sets best effort maximum dictionary page size, in bytes | +| datafusion.execution.parquet.statistics_enabled | page | (writing) Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting | +| datafusion.execution.parquet.max_row_group_size | 1048576 | (writing) Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read. | +| datafusion.execution.parquet.created_by | datafusion version 51.0.0 | (writing) Sets "created by" property | +| datafusion.execution.parquet.column_index_truncate_length | 64 | (writing) Sets column index truncate length | +| datafusion.execution.parquet.statistics_truncate_length | 64 | (writing) Sets statistics truncate length. If NULL, uses default parquet writer setting | +| datafusion.execution.parquet.data_page_row_count_limit | 20000 | (writing) Sets best effort maximum number of rows in data page | +| datafusion.execution.parquet.encoding | NULL | (writing) Sets default encoding for any column. Valid values are: plain, plain_dictionary, rle, bit_packed, delta_binary_packed, delta_length_byte_array, delta_byte_array, rle_dictionary, and byte_stream_split. These values are not case sensitive. If NULL, uses default parquet writer setting | +| datafusion.execution.parquet.bloom_filter_on_write | false | (writing) Write bloom filters for all columns when creating parquet files | +| datafusion.execution.parquet.bloom_filter_fpp | NULL | (writing) Sets bloom filter false positive probability. If NULL, uses default parquet writer setting | +| datafusion.execution.parquet.bloom_filter_ndv | NULL | (writing) Sets bloom filter number of distinct values. If NULL, uses default parquet writer setting | +| datafusion.execution.parquet.allow_single_file_parallelism | true | (writing) Controls whether DataFusion will attempt to speed up writing parquet files by serializing them in parallel. Each column in each row group in each output file are serialized in parallel leveraging a maximum possible core count of n_files*n_row_groups*n_columns. 
| +| datafusion.execution.parquet.maximum_parallel_row_group_writers | 1 | (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame. | +| datafusion.execution.parquet.maximum_buffered_record_batches_per_stream | 2 | (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame. | +| datafusion.execution.planning_concurrency | 0 | Fan-out during initial physical planning. This is mostly use to plan `UNION` children in parallel. Defaults to the number of CPU cores on the system | +| datafusion.execution.skip_physical_aggregate_schema_check | false | When set to true, skips verifying that the schema produced by planning the input of `LogicalPlan::Aggregate` exactly matches the schema of the input plan. When set to false, if the schema does not match exactly (including nullability and metadata), a planning error will be raised. This is used to workaround bugs in the planner that are now caught by the new schema verification step. | +| datafusion.execution.spill_compression | uncompressed | Sets the compression codec used when spilling data to disk. Since datafusion writes spill files using the Arrow IPC Stream format, only codecs supported by the Arrow IPC Stream Writer are allowed. Valid values are: uncompressed, lz4_frame, zstd. Note: lz4_frame offers faster (de)compression, but typically results in larger spill files. In contrast, zstd achieves higher compression ratios at the cost of slower (de)compression speed. | +| datafusion.execution.sort_spill_reservation_bytes | 10485760 | Specifies the reserved memory for each spillable sort operation to facilitate an in-memory merge. When a sort operation spills to disk, the in-memory data must be sorted and merged before being written to a file. This setting reserves a specific amount of memory for that in-memory sort/merge process. Note: This setting is irrelevant if the sort operation cannot spill (i.e., if there's no `DiskManager` configured). | +| datafusion.execution.sort_in_place_threshold_bytes | 1048576 | When sorting, below what size should data be concatenated and sorted in a single RecordBatch rather than sorted in batches and merged. | +| datafusion.execution.max_spill_file_size_bytes | 134217728 | Maximum size in bytes for individual spill files before rotating to a new file. When operators spill data to disk (e.g., RepartitionExec), they write multiple batches to the same file until this size limit is reached, then rotate to a new file. This reduces syscall overhead compared to one-file-per-batch while preventing files from growing too large. A larger value reduces file creation overhead but may hold more disk space. A smaller value creates more files but allows finer-grained space reclamation as files can be deleted once fully consumed. 
Now only `RepartitionExec` supports this spill file rotation feature, other spilling operators may create spill files larger than the limit. Default: 128 MB | +| datafusion.execution.meta_fetch_concurrency | 32 | Number of files to read in parallel when inferring schema and statistics | +| datafusion.execution.minimum_parallel_output_files | 4 | Guarantees a minimum level of output files running in parallel. RecordBatches will be distributed in round robin fashion to each parallel writer. Each writer is closed and a new file opened once soft_max_rows_per_output_file is reached. | +| datafusion.execution.soft_max_rows_per_output_file | 50000000 | Target number of rows in output files when writing multiple. This is a soft max, so it can be exceeded slightly. There also will be one file smaller than the limit if the total number of rows written is not roughly divisible by the soft max | +| datafusion.execution.max_buffered_batches_per_output_file | 2 | This is the maximum number of RecordBatches buffered for each output file being worked. Higher values can potentially give faster write performance at the cost of higher peak memory consumption | +| datafusion.execution.listing_table_ignore_subdirectory | true | Should sub directories be ignored when scanning directories for data files. Defaults to true (ignores subdirectories), consistent with Hive. Note that this setting does not affect reading partitioned tables (e.g. `/table/year=2021/month=01/data.parquet`). | +| datafusion.execution.listing_table_factory_infer_partitions | true | Should a `ListingTable` created through the `ListingTableFactory` infer table partitions from Hive compliant directories. Defaults to true (partition columns are inferred and will be represented in the table schema). | +| datafusion.execution.enable_recursive_ctes | true | Should DataFusion support recursive CTEs | +| datafusion.execution.split_file_groups_by_statistics | false | Attempt to eliminate sorts by packing & sorting files with non-overlapping statistics into the same file groups. Currently experimental | +| datafusion.execution.keep_partition_by_columns | false | Should DataFusion keep the columns used for partition_by in the output RecordBatches | +| datafusion.execution.skip_partial_aggregation_probe_ratio_threshold | 0.8 | Aggregation ratio (number of distinct groups / number of input rows) threshold for skipping partial aggregation. If the value is greater then partial aggregation will skip aggregation for further input | +| datafusion.execution.skip_partial_aggregation_probe_rows_threshold | 100000 | Number of input rows partial aggregation partition should process, before aggregation ratio check and trying to switch to skipping aggregation mode | +| datafusion.execution.use_row_number_estimates_to_optimize_partitioning | false | Should DataFusion use row number estimates at the input to decide whether increasing parallelism is beneficial or not. By default, only exact row numbers (not estimates) are used for this decision. Setting this flag to `true` will likely produce better plans. if the source of statistics is accurate. We plan to make this the default in the future. | +| datafusion.execution.enforce_batch_size_in_joins | false | Should DataFusion enforce batch size in joins or not. By default, DataFusion will not enforce batch size in joins. Enforcing batch size in joins can reduce memory usage when joining large tables with a highly-selective join filter, but is also slightly slower. 
| +| datafusion.execution.objectstore_writer_buffer_size | 10485760 | Size (bytes) of data buffer DataFusion uses when writing output files. This affects the size of the data chunks that are uploaded to remote object stores (e.g. AWS S3). If very large (>= 100 GiB) output files are being written, it may be necessary to increase this size to avoid errors from the remote end point. | +| datafusion.optimizer.enable_distinct_aggregation_soft_limit | true | When set to true, the optimizer will push a limit operation into grouped aggregations which have no aggregate expressions, as a soft limit, emitting groups once the limit is reached, before all rows in the group are read. | +| datafusion.optimizer.enable_round_robin_repartition | true | When set to true, the physical plan optimizer will try to add round robin repartitioning to increase parallelism to leverage more CPU cores | +| datafusion.optimizer.enable_topk_aggregation | true | When set to true, the optimizer will attempt to perform limit operations during aggregations, if possible | +| datafusion.optimizer.enable_window_limits | true | When set to true, the optimizer will attempt to push limit operations past window functions, if possible | +| datafusion.optimizer.enable_topk_dynamic_filter_pushdown | true | When set to true, the optimizer will attempt to push down TopK dynamic filters into the file scan phase. | +| datafusion.optimizer.enable_join_dynamic_filter_pushdown | true | When set to true, the optimizer will attempt to push down Join dynamic filters into the file scan phase. | +| datafusion.optimizer.enable_dynamic_filter_pushdown | true | When set to true attempts to push down dynamic filters generated by operators (topk & join) into the file scan phase. For example, for a query such as `SELECT * FROM t ORDER BY timestamp DESC LIMIT 10`, the optimizer will attempt to push down the current top 10 timestamps that the TopK operator references into the file scans. This means that if we already have 10 timestamps in the year 2025 any files that only have timestamps in the year 2024 can be skipped / pruned at various stages in the scan. The config will suppress `enable_join_dynamic_filter_pushdown` & `enable_topk_dynamic_filter_pushdown` So if you disable `enable_topk_dynamic_filter_pushdown`, then enable `enable_dynamic_filter_pushdown`, the `enable_topk_dynamic_filter_pushdown` will be overridden. | +| datafusion.optimizer.filter_null_join_keys | false | When set to true, the optimizer will insert filters before a join between a nullable and non-nullable column to filter out nulls on the nullable side. This filter can add additional overhead when the file format does not fully support predicate push down. | +| datafusion.optimizer.repartition_aggregations | true | Should DataFusion repartition data using the aggregate keys to execute aggregates in parallel using the provided `target_partitions` level | +| datafusion.optimizer.repartition_file_min_size | 10485760 | Minimum total files size in bytes to perform file scan repartitioning. 
| +| datafusion.optimizer.repartition_joins | true | Should DataFusion repartition data using the join keys to execute joins in parallel using the provided `target_partitions` level | +| datafusion.optimizer.allow_symmetric_joins_without_pruning | true | Should DataFusion allow symmetric hash joins for unbounded data sources even when its inputs do not have any ordering or filtering If the flag is not enabled, the SymmetricHashJoin operator will be unable to prune its internal buffers, resulting in certain join types - such as Full, Left, LeftAnti, LeftSemi, Right, RightAnti, and RightSemi - being produced only at the end of the execution. This is not typical in stream processing. Additionally, without proper design for long runner execution, all types of joins may encounter out-of-memory errors. | +| datafusion.optimizer.repartition_file_scans | true | When set to `true`, datasource partitions will be repartitioned to achieve maximum parallelism. This applies to both in-memory partitions and FileSource's file groups (1 group is 1 partition). For FileSources, only Parquet and CSV formats are currently supported. If set to `true` for a FileSource, all files will be repartitioned evenly (i.e., a single large file might be partitioned into smaller chunks) for parallel scanning. If set to `false` for a FileSource, different files will be read in parallel, but repartitioning won't happen within a single file. If set to `true` for an in-memory source, all memtable's partitions will have their batches repartitioned evenly to the desired number of `target_partitions`. Repartitioning can change the total number of partitions and batches per partition, but does not slice the initial record tables provided to the MemTable on creation. | +| datafusion.optimizer.repartition_windows | true | Should DataFusion repartition data using the partitions keys to execute window functions in parallel using the provided `target_partitions` level | +| datafusion.optimizer.repartition_sorts | true | Should DataFusion execute sorts in a per-partition fashion and merge afterwards instead of coalescing first and sorting globally. With this flag is enabled, plans in the form below `text "SortExec: [a@0 ASC]", " CoalescePartitionsExec", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", ` would turn into the plan below which performs better in multithreaded environments `text "SortPreservingMergeExec: [a@0 ASC]", " SortExec: [a@0 ASC]", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", ` | +| datafusion.optimizer.prefer_existing_sort | false | When true, DataFusion will opportunistically remove sorts when the data is already sorted, (i.e. setting `preserve_order` to true on `RepartitionExec` and using `SortPreservingMergeExec`) When false, DataFusion will maximize plan parallelism using `RepartitionExec` even if this requires subsequently resorting data using a `SortExec`. | +| datafusion.optimizer.skip_failed_rules | false | When set to true, the logical plan optimizer will produce warning messages if any optimization rules produce errors and then proceed to the next rule. 
When set to false, any rules that produce errors will cause the query to fail | +| datafusion.optimizer.max_passes | 3 | Number of times that the optimizer will attempt to optimize the plan | +| datafusion.optimizer.top_down_join_key_reordering | true | When set to true, the physical plan optimizer will run a top down process to reorder the join keys | +| datafusion.optimizer.prefer_hash_join | true | When set to true, the physical plan optimizer will prefer HashJoin over SortMergeJoin. HashJoin can work more efficiently than SortMergeJoin but consumes more memory | +| datafusion.optimizer.enable_piecewise_merge_join | false | When set to true, piecewise merge join is enabled. PiecewiseMergeJoin is currently experimental. Physical planner will opt for PiecewiseMergeJoin when there is only one range filter. | +| datafusion.optimizer.hash_join_single_partition_threshold | 1048576 | The maximum estimated size in bytes for one input side of a HashJoin will be collected into a single partition | +| datafusion.optimizer.hash_join_single_partition_threshold_rows | 131072 | The maximum estimated size in rows for one input side of a HashJoin will be collected into a single partition | +| datafusion.optimizer.hash_join_inlist_pushdown_max_size | 131072 | Maximum size in bytes for the build side of a hash join to be pushed down as an InList expression for dynamic filtering. Build sides larger than this will use hash table lookups instead. Set to 0 to always use hash table lookups. InList pushdown can be more efficient for small build sides because it can result in better statistics pruning as well as use any bloom filters present on the scan side. InList expressions are also more transparent and easier to serialize over the network in distributed uses of DataFusion. On the other hand InList pushdown requires making a copy of the data and thus adds some overhead to the build side and uses more memory. This setting is per-partition, so we may end up using `hash_join_inlist_pushdown_max_size` \* `target_partitions` memory. The default is 128kB per partition. This should allow point lookup joins (e.g. joining on a unique primary key) to use InList pushdown in most cases but avoids excessive memory usage or overhead for larger joins. | +| datafusion.optimizer.hash_join_inlist_pushdown_max_distinct_values | 150 | Maximum number of distinct values (rows) in the build side of a hash join to be pushed down as an InList expression for dynamic filtering. Build sides with more rows than this will use hash table lookups instead. Set to 0 to always use hash table lookups. This provides an additional limit beyond `hash_join_inlist_pushdown_max_size` to prevent very large IN lists that might not provide much benefit over hash table lookups. This uses the deduplicated row count once the build side has been evaluated. The default is 150 values per partition. This is inspired by Trino's `max-filter-keys-per-column` setting. See: | +| datafusion.optimizer.default_filter_selectivity | 20 | The default filter selectivity used by Filter Statistics when an exact selectivity cannot be determined. Valid values are between 0 (no selectivity) and 100 (all rows are selected). | +| datafusion.optimizer.prefer_existing_union | false | When set to true, the optimizer will not attempt to convert Union to Interleave | +| datafusion.optimizer.expand_views_at_output | false | When set to true, if the returned type is a view type then the output will be coerced to a non-view. 
Coerces `Utf8View` to `LargeUtf8`, and `BinaryView` to `LargeBinary`. | +| datafusion.explain.logical_plan_only | false | When set to true, the explain statement will only print logical plans | +| datafusion.explain.physical_plan_only | false | When set to true, the explain statement will only print physical plans | +| datafusion.explain.show_statistics | false | When set to true, the explain statement will print operator statistics for physical plans | +| datafusion.explain.show_sizes | true | When set to true, the explain statement will print the partition sizes | +| datafusion.explain.show_schema | false | When set to true, the explain statement will print schema information | +| datafusion.explain.format | indent | Display format of explain. Default is "indent". When set to "tree", it will print the plan in a tree-rendered format. | +| datafusion.explain.tree_maximum_render_width | 240 | (format=tree only) Maximum total width of the rendered tree. When set to 0, the tree will have no width limit. | +| datafusion.explain.analyze_level | dev | Verbosity level for "EXPLAIN ANALYZE". Default is "dev" "summary" shows common metrics for high-level insights. "dev" provides deep operator-level introspection for developers. | +| datafusion.sql_parser.parse_float_as_decimal | false | When set to true, SQL parser will parse float as decimal type | +| datafusion.sql_parser.enable_ident_normalization | true | When set to true, SQL parser will normalize ident (convert ident to lowercase when not quoted) | +| datafusion.sql_parser.enable_options_value_normalization | false | When set to true, SQL parser will normalize options value (convert value to lowercase). Note that this option is ignored and will be removed in the future. All case-insensitive values are normalized automatically. | +| datafusion.sql_parser.dialect | generic | Configure the SQL dialect used by DataFusion's parser; supported values include: Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, Ansi, DuckDB and Databricks. | +| datafusion.sql_parser.support_varchar_with_length | true | If true, permit lengths for `VARCHAR` such as `VARCHAR(20)`, but ignore the length. If false, error if a `VARCHAR` with a length is specified. The Arrow type system does not have a notion of maximum string length and thus DataFusion can not enforce such limits. | +| datafusion.sql_parser.map_string_types_to_utf8view | true | If true, string types (VARCHAR, CHAR, Text, and String) are mapped to `Utf8View` during SQL planning. If false, they are mapped to `Utf8`. Default is true. | +| datafusion.sql_parser.collect_spans | false | When set to true, the source locations relative to the original SQL query (i.e. [`Span`](https://docs.rs/sqlparser/latest/sqlparser/tokenizer/struct.Span.html)) will be collected and recorded in the logical plan nodes. | +| datafusion.sql_parser.recursion_limit | 50 | Specifies the recursion depth limit when parsing complex SQL Queries | +| datafusion.sql_parser.default_null_ordering | nulls_max | Specifies the default null ordering for query results. There are 4 options: - `nulls_max`: Nulls appear last in ascending order. - `nulls_min`: Nulls appear first in ascending order. - `nulls_first`: Nulls always be first in any order. - `nulls_last`: Nulls always be last in any order. By default, `nulls_max` is used to follow Postgres's behavior. 
postgres rule: | +| datafusion.format.safe | true | If set to `true` any formatting errors will be written to the output instead of being converted into a [`std::fmt::Error`] | +| datafusion.format.null | | Format string for nulls | +| datafusion.format.date_format | %Y-%m-%d | Date format for date arrays | +| datafusion.format.datetime_format | %Y-%m-%dT%H:%M:%S%.f | Format for DateTime arrays | +| datafusion.format.timestamp_format | %Y-%m-%dT%H:%M:%S%.f | Timestamp format for timestamp arrays | +| datafusion.format.timestamp_tz_format | NULL | Timestamp format for timestamp with timezone arrays. When `None`, ISO 8601 format is used. | +| datafusion.format.time_format | %H:%M:%S%.f | Time format for time arrays | +| datafusion.format.duration_format | pretty | Duration format. Can be either `"pretty"` or `"ISO8601"` | +| datafusion.format.types_info | false | Show types in visual representation batches | # Runtime Configuration Settings From 3a23f9852bc28a7cb3ac92bb984456bfa81bfdc8 Mon Sep 17 00:00:00 2001 From: LiaCastaneda Date: Mon, 22 Dec 2025 10:00:09 +0100 Subject: [PATCH 5/6] fmt --- .../physical-expr/src/expressions/in_list.rs | 2 +- .../test_files/tpch/plans/q19.slt.part | 16 +++++++----- .../test_files/tpch/plans/q22.slt.part | 26 ++++++++++--------- 3 files changed, 24 insertions(+), 20 deletions(-) diff --git a/datafusion/physical-expr/src/expressions/in_list.rs b/datafusion/physical-expr/src/expressions/in_list.rs index 7b258ddf7435b..d315a8069b5cf 100644 --- a/datafusion/physical-expr/src/expressions/in_list.rs +++ b/datafusion/physical-expr/src/expressions/in_list.rs @@ -33,7 +33,7 @@ use arrow::datatypes::*; use arrow::util::bit_iterator::BitIndexIterator; use datafusion_common::hash_utils::with_hashes; use datafusion_common::{ - DFSchema, HashSet, Result, ScalarValue, exec_datafusion_err, exec_err, internal_err + exec_datafusion_err, exec_err, internal_err, DFSchema, HashSet, Result, ScalarValue, }; use datafusion_expr::{expr_vec_fmt, ColumnarValue}; diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q19.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q19.slt.part index d20f090fa5b8f..7401a1b7b63cc 100644 --- a/datafusion/sqllogictest/test_files/tpch/plans/q19.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/plans/q19.slt.part @@ -72,10 +72,12 @@ physical_plan 06)----------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_partkey@0, p_partkey@0)], filter=p_brand@1 = Brand#12 AND p_container@3 IN (SET) ([SM CASE, SM BOX, SM PACK, SM PKG]) AND l_quantity@0 >= Some(100),15,2 AND l_quantity@0 <= Some(1100),15,2 AND p_size@2 <= 5 OR p_brand@1 = Brand#23 AND p_container@3 IN (SET) ([MED BAG, MED BOX, MED PKG, MED PACK]) AND l_quantity@0 >= Some(1000),15,2 AND l_quantity@0 <= Some(2000),15,2 AND p_size@2 <= 10 OR p_brand@1 = Brand#34 AND p_container@3 IN (SET) ([LG CASE, LG BOX, LG PACK, LG PKG]) AND l_quantity@0 >= Some(2000),15,2 AND l_quantity@0 <= Some(3000),15,2 AND p_size@2 <= 15, projection=[l_extendedprice@2, l_discount@3] 07)------------CoalesceBatchesExec: target_batch_size=8192 08)--------------RepartitionExec: partitioning=Hash([l_partkey@0], 4), input_partitions=4 -09)----------------FilterExec: (l_quantity@1 >= Some(100),15,2 AND l_quantity@1 <= Some(1100),15,2 OR l_quantity@1 >= Some(1000),15,2 AND l_quantity@1 <= Some(2000),15,2 OR l_quantity@1 >= Some(2000),15,2 AND l_quantity@1 <= Some(3000),15,2) AND (l_shipmode@5 = AIR OR l_shipmode@5 = AIR REG) AND l_shipinstruct@4 = DELIVER IN PERSON, projection=[l_partkey@0, l_quantity@1, 
l_extendedprice@2, l_discount@3] -10)------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_partkey, l_quantity, l_extendedprice, l_discount, l_shipinstruct, l_shipmode], file_type=csv, has_header=false -11)------------CoalesceBatchesExec: target_batch_size=8192 -12)--------------RepartitionExec: partitioning=Hash([p_partkey@0], 4), input_partitions=4 -13)----------------FilterExec: (p_brand@1 = Brand#12 AND p_container@3 IN (SET) ([SM CASE, SM BOX, SM PACK, SM PKG]) AND p_size@2 <= 5 OR p_brand@1 = Brand#23 AND p_container@3 IN (SET) ([MED BAG, MED BOX, MED PKG, MED PACK]) AND p_size@2 <= 10 OR p_brand@1 = Brand#34 AND p_container@3 IN (SET) ([LG CASE, LG BOX, LG PACK, LG PKG]) AND p_size@2 <= 15) AND p_size@2 >= 1 -14)------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -15)--------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl]]}, projection=[p_partkey, p_brand, p_size, p_container], file_type=csv, has_header=false +09)----------------CoalesceBatchesExec: target_batch_size=8192 +10)------------------FilterExec: (l_quantity@1 >= Some(100),15,2 AND l_quantity@1 <= Some(1100),15,2 OR l_quantity@1 >= Some(1000),15,2 AND l_quantity@1 <= Some(2000),15,2 OR l_quantity@1 >= Some(2000),15,2 AND l_quantity@1 <= Some(3000),15,2) AND (l_shipmode@5 = AIR OR l_shipmode@5 = AIR REG) AND l_shipinstruct@4 = DELIVER IN PERSON, projection=[l_partkey@0, l_quantity@1, l_extendedprice@2, l_discount@3] +11)--------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_partkey, l_quantity, l_extendedprice, l_discount, l_shipinstruct, l_shipmode], file_type=csv, has_header=false +12)------------CoalesceBatchesExec: target_batch_size=8192 +13)--------------RepartitionExec: partitioning=Hash([p_partkey@0], 4), input_partitions=4 +14)----------------CoalesceBatchesExec: target_batch_size=8192 +15)------------------FilterExec: (p_brand@1 = Brand#12 AND p_container@3 IN (SET) ([SM CASE, SM BOX, SM PACK, SM PKG]) AND p_size@2 <= 5 OR p_brand@1 = Brand#23 AND p_container@3 IN (SET) ([MED BAG, MED BOX, MED PKG, MED PACK]) AND p_size@2 <= 10 OR p_brand@1 = Brand#34 AND p_container@3 IN (SET) ([LG CASE, LG BOX, LG PACK, LG PKG]) AND p_size@2 <= 15) AND p_size@2 >= 1 +16)--------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +17)----------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl]]}, projection=[p_partkey, p_brand, p_size, p_container], file_type=csv, has_header=false diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q22.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q22.slt.part index a9d95fb1ab79f..ecd3145075dbb 100644 
--- a/datafusion/sqllogictest/test_files/tpch/plans/q22.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/plans/q22.slt.part @@ -90,15 +90,17 @@ physical_plan 14)--------------------------HashJoinExec: mode=Partitioned, join_type=LeftAnti, on=[(c_custkey@0, o_custkey@0)], projection=[c_phone@1, c_acctbal@2] 15)----------------------------CoalesceBatchesExec: target_batch_size=8192 16)------------------------------RepartitionExec: partitioning=Hash([c_custkey@0], 4), input_partitions=4 -17)--------------------------------FilterExec: substr(c_phone@1, 1, 2) IN (SET) ([13, 31, 23, 29, 30, 18, 17]) -18)----------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -19)------------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl]]}, projection=[c_custkey, c_phone, c_acctbal], file_type=csv, has_header=false -20)----------------------------CoalesceBatchesExec: target_batch_size=8192 -21)------------------------------RepartitionExec: partitioning=Hash([o_custkey@0], 4), input_partitions=4 -22)--------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_custkey], file_type=csv, has_header=false -23)--------------------AggregateExec: mode=Final, gby=[], aggr=[avg(customer.c_acctbal)] -24)----------------------CoalescePartitionsExec -25)------------------------AggregateExec: mode=Partial, gby=[], aggr=[avg(customer.c_acctbal)] -26)--------------------------FilterExec: c_acctbal@1 > Some(0),15,2 AND substr(c_phone@0, 1, 2) IN (SET) ([13, 31, 23, 29, 30, 18, 17]), projection=[c_acctbal@1] -27)----------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -28)------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl]]}, projection=[c_phone, c_acctbal], file_type=csv, has_header=false +17)--------------------------------CoalesceBatchesExec: target_batch_size=8192 +18)----------------------------------FilterExec: substr(c_phone@1, 1, 2) IN (SET) ([13, 31, 23, 29, 30, 18, 17]) +19)------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +20)--------------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl]]}, projection=[c_custkey, c_phone, c_acctbal], file_type=csv, has_header=false +21)----------------------------CoalesceBatchesExec: target_batch_size=8192 +22)------------------------------RepartitionExec: partitioning=Hash([o_custkey@0], 4), input_partitions=4 +23)--------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_custkey], file_type=csv, 
has_header=false
+24)--------------------AggregateExec: mode=Final, gby=[], aggr=[avg(customer.c_acctbal)]
+25)----------------------CoalescePartitionsExec
+26)------------------------AggregateExec: mode=Partial, gby=[], aggr=[avg(customer.c_acctbal)]
+27)--------------------------CoalesceBatchesExec: target_batch_size=8192
+28)----------------------------FilterExec: c_acctbal@1 > Some(0),15,2 AND substr(c_phone@0, 1, 2) IN (SET) ([13, 31, 23, 29, 30, 18, 17]), projection=[c_acctbal@1]
+29)------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+30)--------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl]]}, projection=[c_phone, c_acctbal], file_type=csv, has_header=false

From b26f8a9b65a350987df29cc5fd41cd796300f0ce Mon Sep 17 00:00:00 2001
From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com>
Date: Sat, 13 Dec 2025 08:52:20 -0600
Subject: [PATCH 6/6] replace HashTableLookupExpr with lit(true) in proto serialization (#19300)

HashTableLookupExpr *errors* when serializing now, and would break any users using joins + protobuf.
---
 .../physical-plan/src/joins/hash_join/mod.rs  |  1 +
 .../joins/hash_join/partitioned_hash_eval.rs  |  8 ++-
 datafusion/physical-plan/src/joins/mod.rs     |  8 ++-
 .../proto/src/physical_plan/to_proto.rs       | 25 ++++++++++
 .../tests/cases/roundtrip_physical_plan.rs    | 50 ++++++++++++++++++-
 5 files changed, 86 insertions(+), 6 deletions(-)

diff --git a/datafusion/physical-plan/src/joins/hash_join/mod.rs b/datafusion/physical-plan/src/joins/hash_join/mod.rs
index ac1c54f4f6034..352209e9c3f75 100644
--- a/datafusion/physical-plan/src/joins/hash_join/mod.rs
+++ b/datafusion/physical-plan/src/joins/hash_join/mod.rs
@@ -18,6 +18,7 @@
 //! [`HashJoinExec`] Partitioned Hash Join Operator
 
 pub use exec::HashJoinExec;
+pub use partitioned_hash_eval::HashTableLookupExpr;
 
 mod exec;
 mod inlist_builder;
diff --git a/datafusion/physical-plan/src/joins/hash_join/partitioned_hash_eval.rs b/datafusion/physical-plan/src/joins/hash_join/partitioned_hash_eval.rs
index 9b0ae2ab47a42..8cc93684b4609 100644
--- a/datafusion/physical-plan/src/joins/hash_join/partitioned_hash_eval.rs
+++ b/datafusion/physical-plan/src/joins/hash_join/partitioned_hash_eval.rs
@@ -42,7 +42,7 @@ use crate::{hash_utils::create_hashes, joins::utils::JoinHashMapType};
 /// This is used for:
 /// - Computing routing hashes (with RepartitionExec's 0,0,0,0 seeds)
 /// - Computing lookup hashes (with HashJoin's 'J','O','I','N' seeds)
-pub(super) struct HashExpr {
+pub struct HashExpr {
     /// Columns to hash
     on_columns: Vec<PhysicalExprRef>,
     /// Random state for hashing
@@ -179,7 +179,11 @@ impl HashTableLookupExpr {
     /// * `hash_expr` - Expression that computes hash values
     /// * `hash_map` - Hash table to check membership
     /// * `description` - Description for debugging
-    pub(super) fn new(
+    ///
+    /// # Note
+    /// This is public for internal testing purposes only and is not
+    /// guaranteed to be stable across versions.
+    pub fn new(
+        hash_expr: PhysicalExprRef,
+        hash_map: Arc<dyn JoinHashMapType>,
+        description: String,
diff --git a/datafusion/physical-plan/src/joins/mod.rs b/datafusion/physical-plan/src/joins/mod.rs
index b0c28cf994f71..0ca77b3cae982 100644
--- a/datafusion/physical-plan/src/joins/mod.rs
+++ b/datafusion/physical-plan/src/joins/mod.rs
@@ -20,7 +20,7 @@
 use arrow::array::BooleanBufferBuilder;
 pub use cross_join::CrossJoinExec;
 use datafusion_physical_expr::PhysicalExprRef;
-pub use hash_join::HashJoinExec;
+pub use hash_join::{HashJoinExec, HashTableLookupExpr};
 pub use nested_loop_join::NestedLoopJoinExec;
 use parking_lot::Mutex;
 // Note: SortMergeJoin is not used in plans yet
@@ -37,7 +37,11 @@ mod symmetric_hash_join;
 pub mod utils;
 
 mod join_filter;
-mod join_hash_map;
+/// Hash map implementations for join operations.
+///
+/// Note: This module is public for internal testing purposes only
+/// and is not guaranteed to be stable across versions.
+pub mod join_hash_map;
 
 #[cfg(test)]
 pub mod test_utils;
diff --git a/datafusion/proto/src/physical_plan/to_proto.rs b/datafusion/proto/src/physical_plan/to_proto.rs
index dc0a78dbccf11..f230c10cd451e 100644
--- a/datafusion/proto/src/physical_plan/to_proto.rs
+++ b/datafusion/proto/src/physical_plan/to_proto.rs
@@ -41,6 +41,7 @@ use datafusion_physical_plan::expressions::{
     BinaryExpr, CaseExpr, CastExpr, Column, InListExpr, IsNotNullExpr, IsNullExpr,
     Literal, NegativeExpr, NotExpr, TryCastExpr, UnKnownColumn,
 };
+use datafusion_physical_plan::joins::HashTableLookupExpr;
 use datafusion_physical_plan::udaf::AggregateFunctionExpr;
 use datafusion_physical_plan::windows::{PlainAggregateWindowExpr, WindowUDFExpr};
 use datafusion_physical_plan::{Partitioning, PhysicalExpr, WindowExpr};
@@ -226,6 +227,30 @@ pub fn serialize_physical_expr(
     let value = snapshot_physical_expr(Arc::clone(value))?;
     let expr = value.as_any();
 
+    // HashTableLookupExpr is used for dynamic filter pushdown in hash joins.
+    // It contains an Arc<dyn JoinHashMapType> (the build-side hash table) which
+    // cannot be serialized - the hash table is a runtime structure built during
+    // execution on the build side.
+    //
+    // We replace it with lit(true) which is safe because:
+    // 1. The filter is a performance optimization, not a correctness requirement
+    // 2. lit(true) passes all rows, so no valid rows are incorrectly filtered out
+    // 3. The join itself will still produce correct results, just without the
+    //    benefit of early filtering on the probe side
+    //
+    // In distributed execution, the remote worker won't have access to the hash
+    // table anyway, so the best we can do is skip this optimization.
+    if expr.downcast_ref::<HashTableLookupExpr>().is_some() {
+        let value = datafusion_proto_common::ScalarValue {
+            value: Some(datafusion_proto_common::scalar_value::Value::BoolValue(
+                true,
+            )),
+        };
+        return Ok(protobuf::PhysicalExprNode {
+            expr_type: Some(protobuf::physical_expr_node::ExprType::Literal(value)),
+        });
+    }
+
     if let Some(expr) = expr.downcast_ref::<Column>() {
         Ok(protobuf::PhysicalExprNode {
             expr_type: Some(protobuf::physical_expr_node::ExprType::Column(
diff --git a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs
index c8b2bc02e447b..c3bea24831aa7 100644
--- a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs
+++ b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs
@@ -78,8 +78,8 @@ use datafusion::physical_plan::expressions::{
 };
 use datafusion::physical_plan::filter::FilterExec;
 use datafusion::physical_plan::joins::{
-    HashJoinExec, NestedLoopJoinExec, PartitionMode, SortMergeJoinExec,
-    StreamJoinPartitionMode, SymmetricHashJoinExec,
+    HashJoinExec, HashTableLookupExpr, NestedLoopJoinExec, PartitionMode,
+    SortMergeJoinExec, StreamJoinPartitionMode, SymmetricHashJoinExec,
 };
 use datafusion::physical_plan::limit::{GlobalLimitExec, LocalLimitExec};
 use datafusion::physical_plan::placeholder_row::PlaceholderRowExec;
@@ -113,6 +113,7 @@ use datafusion_expr::{
 use datafusion_functions_aggregate::average::avg_udaf;
 use datafusion_functions_aggregate::nth_value::nth_value_udaf;
 use datafusion_functions_aggregate::string_agg::string_agg_udaf;
+use datafusion_physical_plan::joins::join_hash_map::JoinHashMapU32;
 use datafusion_proto::physical_plan::{
     AsExecutionPlan, DefaultPhysicalExtensionCodec, PhysicalExtensionCodec,
 };
@@ -2264,3 +2265,48 @@ async fn roundtrip_listing_table_with_schema_metadata() -> Result<()> {
     roundtrip_test(plan)
 }
+
+/// Test that HashTableLookupExpr serializes to lit(true)
+///
+/// HashTableLookupExpr contains a runtime hash table that cannot be serialized.
+/// The serialization code replaces it with lit(true) which is safe because
+/// it's a performance optimization filter, not a correctness requirement.
+#[test]
+fn roundtrip_hash_table_lookup_expr_to_lit() -> Result<()> {
+    // Create a simple schema and input plan
+    let schema = Arc::new(Schema::new(vec![Field::new("col", DataType::Int64, false)]));
+    let input = Arc::new(EmptyExec::new(schema.clone()));
+
+    // Create a HashTableLookupExpr - it will be replaced with lit(true) during serialization
+    let hash_map = Arc::new(JoinHashMapU32::with_capacity(0));
+    let hash_expr: Arc<dyn PhysicalExpr> = Arc::new(Column::new("col", 0));
+    let lookup_expr: Arc<dyn PhysicalExpr> = Arc::new(HashTableLookupExpr::new(
+        hash_expr,
+        hash_map,
+        "test_lookup".to_string(),
+    ));
+
+    // Create a filter with the lookup expression
+    let filter = Arc::new(FilterExec::try_new(lookup_expr, input)?);
+
+    // Serialize
+    let ctx = SessionContext::new();
+    let codec = DefaultPhysicalExtensionCodec {};
+    let proto: protobuf::PhysicalPlanNode =
+        protobuf::PhysicalPlanNode::try_from_physical_plan(filter.clone(), &codec)
+            .expect("serialization should succeed");
+
+    // Deserialize
+    let result: Arc<dyn ExecutionPlan> = proto
+        .try_into_physical_plan(&ctx.task_ctx(), &codec)
+        .expect("deserialization should succeed");
+
+    // The deserialized plan should have lit(true) instead of HashTableLookupExpr
+    // Verify the filter predicate is a Literal(true)
+    let result_filter = result.as_any().downcast_ref::<FilterExec>().unwrap();
+    let predicate = result_filter.predicate();
+    let literal = predicate.as_any().downcast_ref::<Literal>().unwrap();
+    assert_eq!(*literal.value(), ScalarValue::Boolean(Some(true)));
+
+    Ok(())
+}
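
For downstream users of `datafusion-proto`, the practical effect of this last commit is that a plan whose predicate embeds a `HashTableLookupExpr` now serializes cleanly, with the lookup degraded to a literal `true` on the wire. The sketch below is not part of the patch: it is a minimal, self-contained version of the round trip, assuming the same public APIs the test above exercises (`HashTableLookupExpr::new`, `JoinHashMapU32::with_capacity`, `try_from_physical_plan`, `try_into_physical_plan`); the schema, column name, and description string are illustrative.

```rust
// Illustrative end-to-end check (not part of the patch). Crate paths follow the
// imports used in the round-trip test; "col" and "example_lookup" are made up.
use std::sync::Arc;

use datafusion::arrow::datatypes::{DataType, Field, Schema};
use datafusion::common::ScalarValue;
use datafusion::error::Result;
use datafusion::physical_plan::empty::EmptyExec;
use datafusion::physical_plan::expressions::{Column, Literal};
use datafusion::physical_plan::filter::FilterExec;
use datafusion::physical_plan::joins::HashTableLookupExpr;
use datafusion::physical_plan::{ExecutionPlan, PhysicalExpr};
use datafusion::prelude::SessionContext;
use datafusion_physical_plan::joins::join_hash_map::JoinHashMapU32;
use datafusion_proto::physical_plan::{AsExecutionPlan, DefaultPhysicalExtensionCodec};
use datafusion_proto::protobuf;

fn main() -> Result<()> {
    // A trivial plan: FilterExec whose predicate is a HashTableLookupExpr over
    // an (empty) build-side hash table.
    let schema = Arc::new(Schema::new(vec![Field::new("col", DataType::Int64, false)]));
    let input = Arc::new(EmptyExec::new(schema));
    let lookup: Arc<dyn PhysicalExpr> = Arc::new(HashTableLookupExpr::new(
        Arc::new(Column::new("col", 0)),
        Arc::new(JoinHashMapU32::with_capacity(0)),
        "example_lookup".to_string(),
    ));
    let plan: Arc<dyn ExecutionPlan> = Arc::new(FilterExec::try_new(lookup, input)?);

    // Serialization no longer errors; the lookup is rewritten on the way out.
    let ctx = SessionContext::new();
    let codec = DefaultPhysicalExtensionCodec {};
    let proto = protobuf::PhysicalPlanNode::try_from_physical_plan(plan, &codec)?;
    let decoded = proto.try_into_physical_plan(&ctx.task_ctx(), &codec)?;

    // The decoded predicate is lit(true) instead of the hash table lookup.
    let filter = decoded.as_any().downcast_ref::<FilterExec>().unwrap();
    let literal = filter
        .predicate()
        .as_any()
        .downcast_ref::<Literal>()
        .unwrap();
    assert_eq!(*literal.value(), ScalarValue::Boolean(Some(true)));
    Ok(())
}
```

Because the substitution happens inside `serialize_physical_expr`, any serialized expression tree that carries a `HashTableLookupExpr`, not just a `FilterExec` predicate, is degraded the same way.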