From 424b73d9a65d9c240bfadae2982b70afcc263c46 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 15 Oct 2025 12:36:02 -0700 Subject: [PATCH 001/109] Refactor: split test_window_partial_constant_and_set_monotonicity into multiple tests (#17952) --- .../physical_optimizer/enforce_sorting.rs | 1216 +----------- .../enforce_sorting_monotonicity.rs | 1715 +++++++++++++++++ .../core/tests/physical_optimizer/mod.rs | 1 + 3 files changed, 1724 insertions(+), 1208 deletions(-) create mode 100644 datafusion/core/tests/physical_optimizer/enforce_sorting_monotonicity.rs diff --git a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs index a2c604a84e76f..ad77a453350f8 100644 --- a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs +++ b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs @@ -33,16 +33,12 @@ use arrow::compute::SortOptions; use arrow::datatypes::{DataType, SchemaRef}; use datafusion_common::config::ConfigOptions; use datafusion_common::tree_node::{TreeNode, TransformedResult}; -use datafusion_common::{Result, ScalarValue, TableReference}; +use datafusion_common::{Result, TableReference}; use datafusion_datasource::file_scan_config::FileScanConfigBuilder; use datafusion_datasource::source::DataSourceExec; use datafusion_expr_common::operator::Operator; -use datafusion_expr::{JoinType, SortExpr, WindowFrame, WindowFrameBound, WindowFrameUnits, WindowFunctionDefinition}; +use datafusion_expr::{JoinType, SortExpr}; use datafusion_execution::object_store::ObjectStoreUrl; -use datafusion_functions_aggregate::average::avg_udaf; -use datafusion_functions_aggregate::count::count_udaf; -use datafusion_functions_aggregate::min_max::{max_udaf, min_udaf}; -use datafusion_physical_expr_common::physical_expr::PhysicalExpr; use datafusion_physical_expr_common::sort_expr::{ LexOrdering, PhysicalSortExpr, PhysicalSortRequirement, OrderingRequirements }; @@ -52,8 +48,7 @@ use 
datafusion_physical_plan::limit::{GlobalLimitExec, LocalLimitExec}; use datafusion_physical_plan::repartition::RepartitionExec; use datafusion_physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; use datafusion_physical_plan::sorts::sort::SortExec; -use datafusion_physical_plan::windows::{create_window_expr, BoundedWindowAggExec, WindowAggExec}; -use datafusion_physical_plan::{displayable, get_plan_string, ExecutionPlan, InputOrderMode}; +use datafusion_physical_plan::{displayable, get_plan_string, ExecutionPlan}; use datafusion::datasource::physical_plan::CsvSource; use datafusion::datasource::listing::PartitionedFile; use datafusion_physical_optimizer::enforce_sorting::{EnforceSorting, PlanWithCorrespondingCoalescePartitions, PlanWithCorrespondingSort, parallelize_sorts, ensure_sorting}; @@ -93,13 +88,13 @@ fn csv_exec_sorted( /// Runs the sort enforcement optimizer and asserts the plan /// against the original and expected plans -struct EnforceSortingTest { +pub(crate) struct EnforceSortingTest { plan: Arc, repartition_sorts: bool, } impl EnforceSortingTest { - fn new(plan: Arc) -> Self { + pub(crate) fn new(plan: Arc) -> Self { Self { plan, repartition_sorts: false, @@ -107,14 +102,14 @@ impl EnforceSortingTest { } /// Set whether to repartition sorts - fn with_repartition_sorts(mut self, repartition_sorts: bool) -> Self { + pub(crate) fn with_repartition_sorts(mut self, repartition_sorts: bool) -> Self { self.repartition_sorts = repartition_sorts; self } /// Runs the enforce sorting test and returns a string with the input and /// optimized plan as strings for snapshot comparison using insta - fn run(&self) -> String { + pub(crate) fn run(&self) -> String { let mut config = ConfigOptions::new(); config.optimizer.repartition_sorts = self.repartition_sorts; @@ -2487,1203 +2482,8 @@ async fn test_not_replaced_with_partial_sort_for_unbounded_input() -> Result<()> "); Ok(()) } -// aal here -#[tokio::test] -async fn 
test_window_partial_constant_and_set_monotonicity() -> Result<()> { - let input_schema = create_test_schema()?; - let ordering = [sort_expr_options( - "nullable_col", - &input_schema, - SortOptions { - descending: false, - nulls_first: false, - }, - )] - .into(); - let source = parquet_exec_with_sort(input_schema.clone(), vec![ordering]) as _; - - // Macro for testing window function optimization with snapshots - macro_rules! test_window_case { - ( - partition_by: $partition_by:expr, - window_frame: $window_frame:expr, - func: ($func_def:expr, $func_name:expr, $func_args:expr), - required_sort: [$($col:expr, $asc:expr, $nulls_first:expr),*], - @ $expected:literal - ) => {{ - let partition_by_exprs = if $partition_by { - vec![col("nullable_col", &input_schema)?] - } else { - vec![] - }; - - let window_expr = create_window_expr( - &$func_def, - $func_name, - &$func_args, - &partition_by_exprs, - &[], - $window_frame, - Arc::clone(&input_schema), - false, - false, - None, - )?; - - let window_exec = if window_expr.uses_bounded_memory() { - Arc::new(BoundedWindowAggExec::try_new( - vec![window_expr], - Arc::clone(&source), - InputOrderMode::Sorted, - $partition_by, - )?) as Arc - } else { - Arc::new(WindowAggExec::try_new( - vec![window_expr], - Arc::clone(&source), - $partition_by, - )?) 
as Arc - }; - - let output_schema = window_exec.schema(); - let sort_expr = vec![ - $( - sort_expr_options( - $col, - &output_schema, - SortOptions { - descending: !$asc, - nulls_first: $nulls_first, - }, - ) - ),* - ]; - let ordering = LexOrdering::new(sort_expr).unwrap(); - let physical_plan = sort_exec(ordering, window_exec); - - let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true); - - assert_snapshot!(test.run(), @ $expected); - - Result::<(), datafusion_common::DataFusionError>::Ok(()) - }}; - } - - // Function definition - Alias of the resulting column - Arguments of the function - #[derive(Clone)] - struct WindowFuncParam(WindowFunctionDefinition, String, Vec>); - let function_arg_ordered = vec![col("nullable_col", &input_schema)?]; - let function_arg_unordered = vec![col("non_nullable_col", &input_schema)?]; - let fn_count_on_ordered = WindowFuncParam( - WindowFunctionDefinition::AggregateUDF(count_udaf()), - "count".to_string(), - function_arg_ordered.clone(), - ); - let fn_max_on_ordered = WindowFuncParam( - WindowFunctionDefinition::AggregateUDF(max_udaf()), - "max".to_string(), - function_arg_ordered.clone(), - ); - let fn_min_on_ordered = WindowFuncParam( - WindowFunctionDefinition::AggregateUDF(min_udaf()), - "min".to_string(), - function_arg_ordered.clone(), - ); - let fn_avg_on_ordered = WindowFuncParam( - WindowFunctionDefinition::AggregateUDF(avg_udaf()), - "avg".to_string(), - function_arg_ordered, - ); - let fn_count_on_unordered = WindowFuncParam( - WindowFunctionDefinition::AggregateUDF(count_udaf()), - "count".to_string(), - function_arg_unordered.clone(), - ); - let fn_max_on_unordered = WindowFuncParam( - WindowFunctionDefinition::AggregateUDF(max_udaf()), - "max".to_string(), - function_arg_unordered.clone(), - ); - let fn_min_on_unordered = WindowFuncParam( - WindowFunctionDefinition::AggregateUDF(min_udaf()), - "min".to_string(), - function_arg_unordered.clone(), - ); - let fn_avg_on_unordered = 
WindowFuncParam( - WindowFunctionDefinition::AggregateUDF(avg_udaf()), - "avg".to_string(), - function_arg_unordered, - ); - - // ============================================REGION STARTS============================================ - // WindowAggExec + Plain(unbounded preceding, unbounded following) + no partition_by + on ordered column - // Case 0: - test_window_case!( - partition_by: false, - window_frame: Arc::new(WindowFrame::new(None)), - func: (fn_count_on_ordered.0.clone(), fn_count_on_ordered.1.clone(), fn_count_on_ordered.2.clone()), - required_sort: ["nullable_col", true, false, "count", true, false], - @ r#" - Input Plan: - SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - - Optimized Plan: - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - "# - )?; - - // Case 1: - test_window_case!( - partition_by: false, - window_frame: Arc::new(WindowFrame::new(None)), - func: (fn_max_on_ordered.0.clone(), fn_max_on_ordered.1.clone(), fn_max_on_ordered.2.clone()), - required_sort: ["nullable_col", true, false, "max", false, false], - @ r#" - Input Plan: - SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 DESC 
NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - - Optimized Plan: - WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - "# - )?; - - // Case 2: - test_window_case!( - partition_by: false, - window_frame: Arc::new(WindowFrame::new(None)), - func: (fn_min_on_ordered.0.clone(), fn_min_on_ordered.1.clone(), fn_min_on_ordered.2.clone()), - required_sort: ["min", false, false, "nullable_col", true, false], - @ r#" - Input Plan: - SortExec: expr=[min@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - - Optimized Plan: - WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: 
Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - "# - )?; - - // Case 3: - test_window_case!( - partition_by: false, - window_frame: Arc::new(WindowFrame::new(None)), - func: (fn_avg_on_ordered.0.clone(), fn_avg_on_ordered.1.clone(), fn_avg_on_ordered.2.clone()), - required_sort: ["avg", true, false, "nullable_col", true, false], - @ r#" -Input Plan: -SortExec: expr=[avg@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - -Optimized Plan: -WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# - )?; - // =============================================REGION ENDS============================================= - // = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = - // ============================================REGION STARTS============================================ - // WindowAggExec + Plain(unbounded preceding, unbounded following) + no partition_by + on unordered column - // Case 4: - test_window_case!( - partition_by: false, - window_frame: 
Arc::new(WindowFrame::new(None)), - func: (fn_count_on_unordered.0.clone(), fn_count_on_unordered.1.clone(), fn_count_on_unordered.2.clone()), - required_sort: ["non_nullable_col", true, false, "count", true, false], - @ r#" -Input Plan: -SortExec: expr=[non_nullable_col@1 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - -Optimized Plan: -SortExec: expr=[non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# - )?; - - // Case 5: - test_window_case!( - partition_by: false, - window_frame: Arc::new(WindowFrame::new(None)), - func: (fn_max_on_unordered.0.clone(), fn_max_on_unordered.1.clone(), fn_max_on_unordered.2.clone()), - required_sort: ["non_nullable_col", false, false, "max", false, false], - @ r#" -Input Plan: -SortExec: expr=[non_nullable_col@1 DESC NULLS LAST, max@2 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] 
- DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - -Optimized Plan: -SortExec: expr=[non_nullable_col@1 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# - )?; - - // Case 6: - test_window_case!( - partition_by: false, - window_frame: Arc::new(WindowFrame::new(None)), - func: (fn_min_on_unordered.0.clone(), fn_min_on_unordered.1.clone(), fn_min_on_unordered.2.clone()), - required_sort: ["min", true, false, "non_nullable_col", true, false], - @ r#" -Input Plan: -SortExec: expr=[min@2 ASC NULLS LAST, non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - -Optimized Plan: -SortExec: expr=[non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], 
output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# - )?; - - // Case 7: - test_window_case!( - partition_by: false, - window_frame: Arc::new(WindowFrame::new(None)), - func: (fn_avg_on_unordered.0.clone(), fn_avg_on_unordered.1.clone(), fn_avg_on_unordered.2.clone()), - required_sort: ["avg", false, false, "nullable_col", false, false], - @ r#" -Input Plan: -SortExec: expr=[avg@2 DESC NULLS LAST, nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - -Optimized Plan: -SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# - )?; - // =============================================REGION ENDS============================================= - // = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = - // ============================================REGION STARTS============================================ - // WindowAggExec + Plain(unbounded preceding, unbounded following) + partition_by + on ordered column - // Case 8: - test_window_case!( - partition_by: true, - window_frame: Arc::new(WindowFrame::new(None)), - func: 
(fn_count_on_ordered.0.clone(), fn_count_on_ordered.1.clone(), fn_count_on_ordered.2.clone()), - required_sort: ["nullable_col", true, false, "count", true, false], - @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - -Optimized Plan: -WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# - )?; - - // Case 9: - test_window_case!( - partition_by: true, - window_frame: Arc::new(WindowFrame::new(None)), - func: (fn_max_on_ordered.0.clone(), fn_max_on_ordered.1.clone(), fn_max_on_ordered.2.clone()), - required_sort: ["nullable_col", true, false, "max", false, false], - @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], 
file_type=parquet - -Optimized Plan: -WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# - )?; - - // Case 10: - test_window_case!( - partition_by: true, - window_frame: Arc::new(WindowFrame::new(None)), - func: (fn_min_on_ordered.0.clone(), fn_min_on_ordered.1.clone(), fn_min_on_ordered.2.clone()), - required_sort: ["min", false, false, "nullable_col", true, false], - @ r#" - Input / Optimized Plan: - SortExec: expr=[min@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - "# - )?; - - // Case 11: - test_window_case!( - partition_by: true, - window_frame: Arc::new(WindowFrame::new(None)), - func: (fn_avg_on_ordered.0.clone(), fn_avg_on_ordered.1.clone(), fn_avg_on_ordered.2.clone()), - required_sort: ["avg", true, false, "nullable_col", true, false], - @ r#" - Input / Optimized Plan: - SortExec: expr=[avg@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] 
- DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - "# - )?; - // =============================================REGION ENDS============================================= - // = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = - // ============================================REGION STARTS============================================ - // WindowAggExec + Plain(unbounded preceding, unbounded following) + partition_by + on unordered column - // Case 12: - test_window_case!( - partition_by: true, - window_frame: Arc::new(WindowFrame::new(None)), - func: (fn_count_on_unordered.0.clone(), fn_count_on_unordered.1.clone(), fn_count_on_unordered.2.clone()), - required_sort: ["non_nullable_col", true, false, "count", true, false], - @ r#" - Input / Optimized Plan: - SortExec: expr=[non_nullable_col@1 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - "# - )?; - - // Case 13: - test_window_case!( - partition_by: true, - window_frame: Arc::new(WindowFrame::new(None)), - func: (fn_max_on_unordered.0.clone(), fn_max_on_unordered.1.clone(), fn_max_on_unordered.2.clone()), - required_sort: ["non_nullable_col", true, false, "max", false, false], - @ r#" - Input / Optimized Plan: - SortExec: expr=[non_nullable_col@1 ASC NULLS LAST, max@2 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, 
dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - "# - )?; - - // Case 14: - test_window_case!( - partition_by: true, - window_frame: Arc::new(WindowFrame::new(None)), - func: (fn_min_on_unordered.0.clone(), fn_min_on_unordered.1.clone(), fn_min_on_unordered.2.clone()), - required_sort: ["min", false, false, "non_nullable_col", true, false], - @ r#" - Input / Optimized Plan: - SortExec: expr=[min@2 DESC NULLS LAST, non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - "# - )?; - - // Case 15: - test_window_case!( - partition_by: true, - window_frame: Arc::new(WindowFrame::new(None)), - func: (fn_avg_on_unordered.0.clone(), fn_avg_on_unordered.1.clone(), fn_avg_on_unordered.2.clone()), - required_sort: ["avg", true, false, "nullable_col", true, false], - @ r#" - Input / Optimized Plan: - SortExec: expr=[avg@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], 
output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - "# - )?; - - // =============================================REGION ENDS============================================= - // = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = - // ============================================REGION STARTS============================================ - // WindowAggExec + Sliding(current row, unbounded following) + no partition_by + on ordered column - // Case 16: - test_window_case!( - partition_by: false, - window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()), - func: (fn_count_on_ordered.0.clone(), fn_count_on_ordered.1.clone(), fn_count_on_ordered.2.clone()), - required_sort: ["nullable_col", true, false, "count", false, false], - @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - -Optimized Plan: -WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# - )?; - - // Case 17: - test_window_case!( - partition_by: false, - window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()), - func: (fn_max_on_ordered.0.clone(), fn_max_on_ordered.1.clone(), 
fn_max_on_ordered.2.clone()), - required_sort: ["max", false, true, "nullable_col", true, false], - @ r#" -Input Plan: -SortExec: expr=[max@2 DESC, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - -Optimized Plan: -WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# - )?; - - // Case 18: - test_window_case!( - partition_by: false, - window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()), - func: (fn_min_on_ordered.0.clone(), fn_min_on_ordered.1.clone(), fn_min_on_ordered.2.clone()), - required_sort: ["min", true, true, "nullable_col", true, false], - @ r#" -Input Plan: -SortExec: expr=[min@2 ASC, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - -Optimized Plan: -WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, 
dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# - )?; - - // Case 19: - test_window_case!( - partition_by: false, - window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()), - func: (fn_avg_on_ordered.0.clone(), fn_avg_on_ordered.1.clone(), fn_avg_on_ordered.2.clone()), - required_sort: ["avg", false, false, "nullable_col", true, false], - @ r#" - Input / Optimized Plan: - SortExec: expr=[avg@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - "# - )?; - // =============================================REGION ENDS============================================= - // = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = - // ============================================REGION STARTS============================================ - // WindowAggExec + Sliding(current row, unbounded following) + no partition_by + on unordered column - // Case 20: - test_window_case!( - partition_by: false, - window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()), - func: (fn_count_on_unordered.0.clone(), fn_count_on_unordered.1.clone(), fn_count_on_unordered.2.clone()), - required_sort: ["nullable_col", true, false, "count", true, false], - @ r#" - Input / Optimized Plan: - SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 
ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - "# - )?; - - // Case 21: - test_window_case!( - partition_by: false, - window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()), - func: (fn_max_on_unordered.0.clone(), fn_max_on_unordered.1.clone(), fn_max_on_unordered.2.clone()), - required_sort: ["nullable_col", true, false, "max", false, true], - @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 DESC], preserve_partitioning=[false] - WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - -Optimized Plan: -WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# - )?; - - // Case 22: - test_window_case!( - partition_by: false, - window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()), - func: (fn_min_on_unordered.0.clone(), fn_min_on_unordered.1.clone(), fn_min_on_unordered.2.clone()), - required_sort: ["min", true, 
false, "nullable_col", true, false], - @ r#" - Input / Optimized Plan: - SortExec: expr=[min@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - "# - )?; - - // Case 23: - test_window_case!( - partition_by: false, - window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()), - func: (fn_avg_on_unordered.0.clone(), fn_avg_on_unordered.1.clone(), fn_avg_on_unordered.2.clone()), - required_sort: ["avg", false, false, "nullable_col", true, false], - @ r#" - Input / Optimized Plan: - SortExec: expr=[avg@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - "# - )?; - // =============================================REGION ENDS============================================= - // = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = - // ============================================REGION STARTS============================================ - // WindowAggExec + Sliding(current row, unbounded following) + partition_by + on ordered column - // Case 24: - test_window_case!( - partition_by: true, - window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()), 
- func: (fn_count_on_ordered.0.clone(), fn_count_on_ordered.1.clone(), fn_count_on_ordered.2.clone()), - required_sort: ["nullable_col", true, false, "count", false, false], - @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - -Optimized Plan: -WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# - )?; - - // Case 25: - test_window_case!( - partition_by: true, - window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()), - func: (fn_max_on_ordered.0.clone(), fn_max_on_ordered.1.clone(), fn_max_on_ordered.2.clone()), - required_sort: ["nullable_col", true, false, "max", true, false], - @ r#" - Input / Optimized Plan: - SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], 
file_type=parquet - "# - )?; - - // Case 26: - test_window_case!( - partition_by: true, - window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()), - func: (fn_min_on_ordered.0.clone(), fn_min_on_ordered.1.clone(), fn_min_on_ordered.2.clone()), - required_sort: ["min", false, false], - @ r#" - Input / Optimized Plan: - SortExec: expr=[min@2 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - "# - )?; - - // Case 27: - test_window_case!( - partition_by: true, - window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()), - func: (fn_avg_on_ordered.0.clone(), fn_avg_on_ordered.1.clone(), fn_avg_on_ordered.2.clone()), - required_sort: ["avg", false, false], - @ r#" - Input / Optimized Plan: - SortExec: expr=[avg@2 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - "# - )?; - // =============================================REGION ENDS============================================= - // = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = - // ============================================REGION STARTS============================================ - // WindowAggExec + Sliding(current row, unbounded following) 
+ partition_by + on unordered column - // Case 28: - test_window_case!( - partition_by: true, - window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()), - func: (fn_count_on_unordered.0.clone(), fn_count_on_unordered.1.clone(), fn_count_on_unordered.2.clone()), - required_sort: ["count", false, false, "nullable_col", true, false], - @ r#" - Input / Optimized Plan: - SortExec: expr=[count@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - "# - )?; - - // Case 29: - test_window_case!( - partition_by: true, - window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()), - func: (fn_max_on_unordered.0.clone(), fn_max_on_unordered.1.clone(), fn_max_on_unordered.2.clone()), - required_sort: ["nullable_col", true, false, "max", false, true], - @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 DESC], preserve_partitioning=[false] - WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - -Optimized Plan: -WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] 
- DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# - )?; - - // Case 30: - test_window_case!( - partition_by: true, - window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()), - func: (fn_min_on_unordered.0.clone(), fn_min_on_unordered.1.clone(), fn_min_on_unordered.2.clone()), - required_sort: ["min", false, false], - @ r#" - Input / Optimized Plan: - SortExec: expr=[min@2 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - "# - )?; - - // Case 31: - test_window_case!( - partition_by: true, - window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()), - func: (fn_avg_on_unordered.0.clone(), fn_avg_on_unordered.1.clone(), fn_avg_on_unordered.2.clone()), - required_sort: ["nullable_col", true, false, "avg", true, false], - @ r#" - Input / Optimized Plan: - SortExec: expr=[nullable_col@0 ASC NULLS LAST, avg@2 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - "# - )?; - - // =============================================REGION ENDS============================================= - // = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 
= = = = = = = = = = = = = = = = = = = = = - // ============================================REGION STARTS============================================ - // BoundedWindowAggExec + Plain(unbounded preceding, unbounded following) + no partition_by + on ordered column - // Case 32: - test_window_case!( - partition_by: false, - window_frame: Arc::new(WindowFrame::new(Some(true))), - func: (fn_count_on_ordered.0.clone(), fn_count_on_ordered.1.clone(), fn_count_on_ordered.2.clone()), - required_sort: ["nullable_col", true, false, "count", true, false], - @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - -Optimized Plan: -BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# - )?; - - // Case 33: - test_window_case!( - partition_by: false, - window_frame: Arc::new(WindowFrame::new(Some(true))), - func: (fn_max_on_ordered.0.clone(), fn_max_on_ordered.1.clone(), fn_max_on_ordered.2.clone()), - required_sort: ["max", false, false, "nullable_col", true, false], - @ r#" - Input / Optimized Plan: - SortExec: expr=[max@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[max: Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: 
false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - "# - )?; - - // Case 34: - test_window_case!( - partition_by: false, - window_frame: Arc::new(WindowFrame::new(Some(true))), - func: (fn_min_on_ordered.0.clone(), fn_min_on_ordered.1.clone(), fn_min_on_ordered.2.clone()), - required_sort: ["min", false, false, "nullable_col", true, false], - @ r#" -Input Plan: -SortExec: expr=[min@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - -Optimized Plan: -BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# - )?; - - // Case 35: - test_window_case!( - partition_by: false, - window_frame: Arc::new(WindowFrame::new(Some(true))), - func: (fn_avg_on_ordered.0.clone(), fn_avg_on_ordered.1.clone(), fn_avg_on_ordered.2.clone()), - required_sort: ["nullable_col", true, false, "avg", true, false], - @ r#" - Input / Optimized Plan: - SortExec: expr=[nullable_col@0 ASC NULLS LAST, avg@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: 
{} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - "# - )?; - // =============================================REGION ENDS============================================= - // = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = - // ============================================REGION STARTS============================================ - // BoundedWindowAggExec + Plain(unbounded preceding, unbounded following) + no partition_by + on unordered column - // Case 36: - test_window_case!( - partition_by: false, - window_frame: Arc::new(WindowFrame::new(Some(true))), - func: (fn_count_on_unordered.0.clone(), fn_count_on_unordered.1.clone(), fn_count_on_unordered.2.clone()), - required_sort: ["nullable_col", true, false, "count", true, true], - @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - -Optimized Plan: -BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# - )?; - - // Case 37: - test_window_case!( - partition_by: false, - window_frame: Arc::new(WindowFrame::new(Some(true))), - func: 
(fn_max_on_unordered.0.clone(), fn_max_on_unordered.1.clone(), fn_max_on_unordered.2.clone()), - required_sort: ["max", true, false, "nullable_col", true, false], - @ r#" -Input Plan: -SortExec: expr=[max@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[max: Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - -Optimized Plan: -BoundedWindowAggExec: wdw=[max: Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# - )?; - - // Case 38: - test_window_case!( - partition_by: false, - window_frame: Arc::new(WindowFrame::new(Some(true))), - func: (fn_min_on_unordered.0.clone(), fn_min_on_unordered.1.clone(), fn_min_on_unordered.2.clone()), - required_sort: ["min", false, true, "nullable_col", true, false], - @ r#" - Input / Optimized Plan: - SortExec: expr=[min@2 DESC, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - "# - )?; - - // Case 39: - test_window_case!( - partition_by: false, - window_frame: Arc::new(WindowFrame::new(Some(true))), - func: 
(fn_avg_on_unordered.0.clone(), fn_avg_on_unordered.1.clone(), fn_avg_on_unordered.2.clone()), - required_sort: ["avg", true, false], - @ r#" - Input / Optimized Plan: - SortExec: expr=[avg@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - "# - )?; - // =============================================REGION ENDS============================================= - // = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = - // ============================================REGION STARTS============================================ - // BoundedWindowAggExec + Plain(unbounded preceding, unbounded following) + partition_by + on ordered column - // Case 40: - test_window_case!( - partition_by: true, - window_frame: Arc::new(WindowFrame::new(Some(true))), - func: (fn_count_on_ordered.0.clone(), fn_count_on_ordered.1.clone(), fn_count_on_ordered.2.clone()), - required_sort: ["nullable_col", true, false, "count", true, false], - @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - -Optimized Plan: -BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, 
metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# - )?; - - // Case 41: - test_window_case!( - partition_by: true, - window_frame: Arc::new(WindowFrame::new(Some(true))), - func: (fn_max_on_ordered.0.clone(), fn_max_on_ordered.1.clone(), fn_max_on_ordered.2.clone()), - required_sort: ["max", true, false, "nullable_col", true, false], - @ r#" - Input / Optimized Plan: - SortExec: expr=[max@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[max: Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - "# - )?; - - // Case 42: - test_window_case!( - partition_by: true, - window_frame: Arc::new(WindowFrame::new(Some(true))), - func: (fn_min_on_ordered.0.clone(), fn_min_on_ordered.1.clone(), fn_min_on_ordered.2.clone()), - required_sort: ["min", false, false, "nullable_col", true, false], - @ r#" - Input / Optimized Plan: - SortExec: expr=[min@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - "# - )?; - - // Case 43: - test_window_case!( - partition_by: true, - window_frame: Arc::new(WindowFrame::new(Some(true))), - func: 
(fn_avg_on_ordered.0.clone(), fn_avg_on_ordered.1.clone(), fn_avg_on_ordered.2.clone()), - required_sort: ["nullable_col", true, false, "avg", true, false], - @ r#" - Input / Optimized Plan: - SortExec: expr=[nullable_col@0 ASC NULLS LAST, avg@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - "# - )?; - - // =============================================REGION ENDS============================================= - // = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = - // ============================================REGION STARTS============================================ - // BoundedWindowAggExec + Plain(unbounded preceding, unbounded following) + partition_by + on unordered column - // Case 44: - test_window_case!( - partition_by: true, - window_frame: Arc::new(WindowFrame::new(Some(true))), - func: (fn_count_on_unordered.0.clone(), fn_count_on_unordered.1.clone(), fn_count_on_unordered.2.clone()), - required_sort: ["count", true, true], - @ r#" - Input / Optimized Plan: - SortExec: expr=[count@2 ASC], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - "# - )?; - - // Case 45: - test_window_case!( - partition_by: true, - window_frame: Arc::new(WindowFrame::new(Some(true))), - func: 
(fn_max_on_unordered.0.clone(), fn_max_on_unordered.1.clone(), fn_max_on_unordered.2.clone()), - required_sort: ["nullable_col", true, false, "max", false, false], - @ r#" - Input / Optimized Plan: - SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 DESC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[max: Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - "# - )?; - - // Case 46: - test_window_case!( - partition_by: true, - window_frame: Arc::new(WindowFrame::new(Some(true))), - func: (fn_min_on_unordered.0.clone(), fn_min_on_unordered.1.clone(), fn_min_on_unordered.2.clone()), - required_sort: ["nullable_col", true, false, "min", false, false], - @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST, min@2 DESC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - -Optimized Plan: -BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# - )?; - - // Case 47: - test_window_case!( - partition_by: true, - window_frame: Arc::new(WindowFrame::new(Some(true))), - func: 
(fn_avg_on_unordered.0.clone(), fn_avg_on_unordered.1.clone(), fn_avg_on_unordered.2.clone()), - required_sort: ["nullable_col", true, false], - @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - -Optimized Plan: -BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# - )?; - - // =============================================REGION ENDS============================================= - // = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = - // ============================================REGION STARTS============================================ - // BoundedWindowAggExec + Sliding(bounded preceding, bounded following) + no partition_by + on ordered column - // Case 48: - test_window_case!( - partition_by: false, - window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32)?), WindowFrameBound::CurrentRow)), - func: (fn_count_on_ordered.0.clone(), fn_count_on_ordered.1.clone(), fn_count_on_ordered.2.clone()), - required_sort: ["count", true, false, "nullable_col", true, false], - @ r#" -Input Plan: -SortExec: expr=[count@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - 
BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - -Optimized Plan: -BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# - )?; - - // Case 49: - test_window_case!( - partition_by: false, - window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32)?), WindowFrameBound::Following(ScalarValue::new_one(&DataType::UInt32)?))), - func: (fn_max_on_ordered.0.clone(), fn_max_on_ordered.1.clone(), fn_max_on_ordered.2.clone()), - required_sort: ["max", true, false], - @ r#" - Input / Optimized Plan: - SortExec: expr=[max@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[max: Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - "# - )?; - - // Case 50: - test_window_case!( - partition_by: false, - window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32)?), WindowFrameBound::CurrentRow)), - func: (fn_min_on_ordered.0.clone(), fn_min_on_ordered.1.clone(), fn_min_on_ordered.2.clone()), - 
required_sort: ["nullable_col", true, false, "min", false, false], - @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST, min@2 DESC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - -Optimized Plan: -BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# - )?; - - // Case 51: - test_window_case!( - partition_by: false, - window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32)?), WindowFrameBound::CurrentRow)), - func: (fn_avg_on_ordered.0.clone(), fn_avg_on_ordered.1.clone(), fn_avg_on_ordered.2.clone()), - required_sort: ["avg", true, false], - @ r#" - Input / Optimized Plan: - SortExec: expr=[avg@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - "# - )?; - - // =============================================REGION ENDS============================================= - // = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 
= = = = = = = = = = - // ============================================REGION STARTS============================================ - // BoundedWindowAggExec + Sliding(bounded preceding, bounded following) + no partition_by + on unordered column - // Case 52: - test_window_case!( - partition_by: false, - window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32)?), WindowFrameBound::Following(ScalarValue::new_one(&DataType::UInt32)?))), - func: (fn_count_on_unordered.0.clone(), fn_count_on_unordered.1.clone(), fn_count_on_unordered.2.clone()), - required_sort: ["count", true, false, "nullable_col", true, false], - @ r#" - Input / Optimized Plan: - SortExec: expr=[count@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - "# - )?; - - // Case 53: - test_window_case!( - partition_by: false, - window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32)?), WindowFrameBound::CurrentRow)), - func: (fn_max_on_unordered.0.clone(), fn_max_on_unordered.1.clone(), fn_max_on_unordered.2.clone()), - required_sort: ["nullable_col", true, false, "max", true, false], - @ r#" - Input / Optimized Plan: - SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[max: Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 
group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - "# - )?; - - // Case 54: - test_window_case!( - partition_by: false, - window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32)?), WindowFrameBound::CurrentRow)), - func: (fn_min_on_unordered.0.clone(), fn_min_on_unordered.1.clone(), fn_min_on_unordered.2.clone()), - required_sort: ["min", true, false], - @ r#" - Input / Optimized Plan: - SortExec: expr=[min@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - "# - )?; - - // Case 55: - test_window_case!( - partition_by: false, - window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32)?), WindowFrameBound::Following(ScalarValue::new_one(&DataType::UInt32)?))), - func: (fn_avg_on_unordered.0.clone(), fn_avg_on_unordered.1.clone(), fn_avg_on_unordered.2.clone()), - required_sort: ["nullable_col", true, false], - @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - -Optimized Plan: -BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: 
true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# - )?; - - // =============================================REGION ENDS============================================= - // = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = - // ============================================REGION STARTS============================================ - // BoundedWindowAggExec + Sliding(bounded preceding, bounded following) + partition_by + on ordered column - // Case 56: - test_window_case!( - partition_by: true, - window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32)?), WindowFrameBound::CurrentRow)), - func: (fn_count_on_ordered.0.clone(), fn_count_on_ordered.1.clone(), fn_count_on_ordered.2.clone()), - required_sort: ["count", true, false, "nullable_col", true, false], - @ r#" -Input Plan: -SortExec: expr=[count@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - -Optimized Plan: -BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], 
file_type=parquet -"# - )?; - - // Case 57: - test_window_case!( - partition_by: true, - window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32)?), WindowFrameBound::Following(ScalarValue::new_one(&DataType::UInt32)?))), - func: (fn_max_on_ordered.0.clone(), fn_max_on_ordered.1.clone(), fn_max_on_ordered.2.clone()), - required_sort: ["nullable_col", true, false, "max", true, false], - @ r#" - Input / Optimized Plan: - SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[max: Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - "# - )?; - - // Case 58: - test_window_case!( - partition_by: true, - window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32)?), WindowFrameBound::CurrentRow)), - func: (fn_min_on_ordered.0.clone(), fn_min_on_ordered.1.clone(), fn_min_on_ordered.2.clone()), - required_sort: ["min", false, false, "nullable_col", true, false], - @ r#" - Input / Optimized Plan: - SortExec: expr=[min@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - "# - )?; - - // Case 59: - test_window_case!( - partition_by: true, - window_frame: 
Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32)?), WindowFrameBound::CurrentRow)), - func: (fn_avg_on_ordered.0.clone(), fn_avg_on_ordered.1.clone(), fn_avg_on_ordered.2.clone()), - required_sort: ["avg", true, false], - @ r#" - Input / Optimized Plan: - SortExec: expr=[avg@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - "# - )?; - // =============================================REGION ENDS============================================= - // = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = - // ============================================REGION STARTS============================================ - // BoundedWindowAggExec + Sliding(bounded preceding, bounded following) + partition_by + on unordered column - // Case 60: - test_window_case!( - partition_by: true, - window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32)?), WindowFrameBound::CurrentRow)), - func: (fn_count_on_unordered.0.clone(), fn_count_on_unordered.1.clone(), fn_count_on_unordered.2.clone()), - required_sort: ["nullable_col", true, false, "count", true, false], - @ r#" - Input / Optimized Plan: - SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: 
file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - "# - )?; - - // Case 61: - test_window_case!( - partition_by: true, - window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32)?), WindowFrameBound::CurrentRow)), - func: (fn_max_on_unordered.0.clone(), fn_max_on_unordered.1.clone(), fn_max_on_unordered.2.clone()), - required_sort: ["nullable_col", true, false, "max", true, true], - @ r#" - Input / Optimized Plan: - SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 ASC], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[max: Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - "# - )?; - - // Case 62: - test_window_case!( - partition_by: true, - window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32)?), WindowFrameBound::CurrentRow)), - func: (fn_min_on_unordered.0.clone(), fn_min_on_unordered.1.clone(), fn_min_on_unordered.2.clone()), - required_sort: ["nullable_col", true, false, "min", false, false], - @ r#" - Input / Optimized Plan: - SortExec: expr=[nullable_col@0 ASC NULLS LAST, min@2 DESC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - "# - )?; - - // Case 63: - 
test_window_case!( - partition_by: true, - window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32)?), WindowFrameBound::CurrentRow)), - func: (fn_avg_on_unordered.0.clone(), fn_avg_on_unordered.1.clone(), fn_avg_on_unordered.2.clone()), - required_sort: ["nullable_col", true, false], - @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet - -Optimized Plan: -BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# - )?; - // =============================================REGION ENDS============================================= - - Ok(()) -} +// Test that verifies that an orthogonal sort (a sort on columns not in the input ordering) #[test] fn test_removes_unused_orthogonal_sort() -> Result<()> { let schema = create_test_schema3()?; diff --git a/datafusion/core/tests/physical_optimizer/enforce_sorting_monotonicity.rs b/datafusion/core/tests/physical_optimizer/enforce_sorting_monotonicity.rs new file mode 100644 index 0000000000000..7d6c0484b624b --- /dev/null +++ b/datafusion/core/tests/physical_optimizer/enforce_sorting_monotonicity.rs @@ -0,0 +1,1715 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::physical_optimizer::test_utils::{ + create_test_schema, parquet_exec_with_sort, sort_exec, sort_expr_options, +}; +use arrow::datatypes::DataType; +use arrow_schema::SortOptions; +use datafusion::common::ScalarValue; +use datafusion::logical_expr::WindowFrameBound; +use datafusion::logical_expr::WindowFrameUnits; +use datafusion_expr::{WindowFrame, WindowFunctionDefinition}; +use datafusion_functions_aggregate::average::avg_udaf; +use datafusion_functions_aggregate::count::count_udaf; +use datafusion_functions_aggregate::min_max::{max_udaf, min_udaf}; +use datafusion_physical_expr::expressions::col; +use datafusion_physical_expr_common::physical_expr::PhysicalExpr; +use datafusion_physical_expr_common::sort_expr::LexOrdering; +use datafusion_physical_plan::windows::{ + create_window_expr, BoundedWindowAggExec, WindowAggExec, +}; +use datafusion_physical_plan::{ExecutionPlan, InputOrderMode}; +use insta::assert_snapshot; +use std::sync::{Arc, LazyLock}; + +// Function definition - Alias of the resulting column - Arguments of the function +#[derive(Clone)] +struct WindowFuncParam( + WindowFunctionDefinition, + &'static str, + Vec>, +); + +fn function_arg_ordered() -> Vec> { + let input_schema = create_test_schema().unwrap(); + vec![col("nullable_col", 
&input_schema).unwrap()] +} +fn function_arg_unordered() -> Vec> { + let input_schema = create_test_schema().unwrap(); + vec![col("non_nullable_col", &input_schema).unwrap()] +} + +fn fn_count_on_ordered() -> WindowFuncParam { + WindowFuncParam( + WindowFunctionDefinition::AggregateUDF(count_udaf()), + "count", + function_arg_ordered(), + ) +} + +fn fn_max_on_ordered() -> WindowFuncParam { + WindowFuncParam( + WindowFunctionDefinition::AggregateUDF(max_udaf()), + "max", + function_arg_ordered(), + ) +} + +fn fn_min_on_ordered() -> WindowFuncParam { + WindowFuncParam( + WindowFunctionDefinition::AggregateUDF(min_udaf()), + "min", + function_arg_ordered(), + ) +} + +fn fn_avg_on_ordered() -> WindowFuncParam { + WindowFuncParam( + WindowFunctionDefinition::AggregateUDF(avg_udaf()), + "avg", + function_arg_ordered(), + ) +} + +fn fn_count_on_unordered() -> WindowFuncParam { + WindowFuncParam( + WindowFunctionDefinition::AggregateUDF(count_udaf()), + "count", + function_arg_unordered(), + ) +} + +fn fn_max_on_unordered() -> WindowFuncParam { + WindowFuncParam( + WindowFunctionDefinition::AggregateUDF(max_udaf()), + "max", + function_arg_unordered(), + ) +} +fn fn_min_on_unordered() -> WindowFuncParam { + WindowFuncParam( + WindowFunctionDefinition::AggregateUDF(min_udaf()), + "min", + function_arg_unordered(), + ) +} + +fn fn_avg_on_unordered() -> WindowFuncParam { + WindowFuncParam( + WindowFunctionDefinition::AggregateUDF(avg_udaf()), + "avg", + function_arg_unordered(), + ) +} + +struct TestWindowCase { + partition_by: bool, + window_frame: Arc, + func: WindowFuncParam, + required_sort: Vec<(&'static str, bool, bool)>, // (column name, ascending, nulls_first) +} +impl TestWindowCase { + fn source() -> Arc { + static SOURCE: LazyLock> = LazyLock::new(|| { + let input_schema = create_test_schema().unwrap(); + let ordering = [sort_expr_options( + "nullable_col", + &input_schema, + SortOptions { + descending: false, + nulls_first: false, + }, + )] + .into(); + 
parquet_exec_with_sort(input_schema.clone(), vec![ordering]) + }); + Arc::clone(&SOURCE) + } + + // runs the window test case and returns the string representation of the plan + fn run(self) -> String { + let input_schema = create_test_schema().unwrap(); + let source = Self::source(); + + let Self { + partition_by, + window_frame, + func: WindowFuncParam(func_def, func_name, func_args), + required_sort, + } = self; + let partition_by_exprs = if partition_by { + vec![col("nullable_col", &input_schema).unwrap()] + } else { + vec![] + }; + + let window_expr = create_window_expr( + &func_def, + func_name.to_string(), + &func_args, + &partition_by_exprs, + &[], + window_frame, + Arc::clone(&input_schema), + false, + false, + None, + ) + .unwrap(); + + let window_exec = if window_expr.uses_bounded_memory() { + Arc::new( + BoundedWindowAggExec::try_new( + vec![window_expr], + Arc::clone(&source), + InputOrderMode::Sorted, + partition_by, + ) + .unwrap(), + ) as Arc + } else { + Arc::new( + WindowAggExec::try_new( + vec![window_expr], + Arc::clone(&source), + partition_by, + ) + .unwrap(), + ) as Arc + }; + + let output_schema = window_exec.schema(); + let sort_expr = required_sort.into_iter().map(|(col, asc, nulls_first)| { + sort_expr_options( + col, + &output_schema, + SortOptions { + descending: !asc, + nulls_first, + }, + ) + }); + let ordering = LexOrdering::new(sort_expr).unwrap(); + let physical_plan = sort_exec(ordering, window_exec); + + crate::physical_optimizer::enforce_sorting::EnforceSortingTest::new(physical_plan) + .with_repartition_sorts(true) + .run() + } +} +#[test] +fn test_window_partial_constant_and_set_monotonicity_0() { + // ============================================REGION STARTS============================================ + // WindowAggExec + Plain(unbounded preceding, unbounded following) + no partition_by + on ordered column + // Case 0: + assert_snapshot!(TestWindowCase { + partition_by: false, + window_frame: Arc::new(WindowFrame::new(None)), 
+ func: fn_count_on_ordered(), + required_sort: vec![ + ("nullable_col", true, false), + ("count", true, false), + ], + }.run(), + @ r#" + Input Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + + Optimized Plan: + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# + ); +} + +#[test] +fn test_window_partial_constant_and_set_monotonicity_1() { + assert_snapshot!(TestWindowCase { + partition_by: false, + window_frame: Arc::new(WindowFrame::new(None)), + func: fn_max_on_ordered(), + required_sort: vec![ + ("nullable_col", true, false), + ("max", false, false), + ], + }.run(), + @ r#" + Input Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 DESC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], 
file_type=parquet + + Optimized Plan: + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# + ); +} + +#[test] +fn test_window_partial_constant_and_set_monotonicity_2() { + assert_snapshot!(TestWindowCase { + partition_by: false, + window_frame: Arc::new(WindowFrame::new(None)), + func: fn_min_on_ordered(), + required_sort: vec![ + ("min", false, false), + ("nullable_col", true, false), + ], + }.run(), + @ r#" +Input Plan: +SortExec: expr=[min@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + +Optimized Plan: +WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet +"# + ); +} + +#[test] +fn test_window_partial_constant_and_set_monotonicity_3() { + assert_snapshot!(TestWindowCase { + partition_by: false, + window_frame: Arc::new(WindowFrame::new(None)), + func: fn_avg_on_ordered(), + 
required_sort: vec![ + ("avg", true, false), + ("nullable_col", true, false), + ], + }.run(), + @ r#" +Input Plan: +SortExec: expr=[avg@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + +Optimized Plan: +WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet +"# + ); +} + +#[test] +fn test_window_partial_constant_and_set_monotonicity_4() { + assert_snapshot!(TestWindowCase { + partition_by: false, + window_frame: Arc::new(WindowFrame::new(None)), + func: fn_count_on_unordered(), + required_sort: vec![ + ("non_nullable_col", true, false), + ("count", true, false), + ], + }.run(), + @ r#" +Input Plan: +SortExec: expr=[non_nullable_col@1 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + +Optimized Plan: 
+SortExec: expr=[non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet +"# + ); +} + +#[test] +fn test_window_partial_constant_and_set_monotonicity_5() { + assert_snapshot!(TestWindowCase { + partition_by: false, + window_frame: Arc::new(WindowFrame::new(None)), + func: fn_max_on_unordered(), + required_sort: vec![ + ("non_nullable_col", false, false), + ("max", false, false), + ], + }.run(), + @ r#" +Input Plan: +SortExec: expr=[non_nullable_col@1 DESC NULLS LAST, max@2 DESC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + +Optimized Plan: +SortExec: expr=[non_nullable_col@1 DESC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet +"# + ); +} + +#[test] +fn 
test_window_partial_constant_and_set_monotonicity_6() { + assert_snapshot!(TestWindowCase { + partition_by: false, + window_frame: Arc::new(WindowFrame::new(None)), + func: fn_min_on_unordered(), + required_sort: vec![ + ("min", true, false), + ("non_nullable_col", true, false), + ], + }.run(), + @ r#" +Input Plan: +SortExec: expr=[min@2 ASC NULLS LAST, non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + +Optimized Plan: +SortExec: expr=[non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet +"# + ); +} + +#[test] +fn test_window_partial_constant_and_set_monotonicity_7() { + assert_snapshot!(TestWindowCase { + partition_by: false, + window_frame: Arc::new(WindowFrame::new(None)), + func: fn_avg_on_unordered(), + required_sort: vec![ + ("avg", false, false), + ("nullable_col", false, false), + ], + }.run(), + @ r#" +Input Plan: +SortExec: expr=[avg@2 DESC NULLS LAST, nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, 
start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + +Optimized Plan: +SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet +"# + ); +} + +// =============================================REGION ENDS============================================= +// = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = +// ============================================REGION STARTS============================================ + +#[test] +fn test_window_partial_constant_and_set_monotonicity_8() { + assert_snapshot!(TestWindowCase { + partition_by: true, + window_frame: Arc::new(WindowFrame::new(None)), + func: fn_count_on_ordered(), + required_sort: vec![ + ("nullable_col", true, false), + ("count", true, false), + ], + }.run(), + @ r#" +Input Plan: +SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + +Optimized Plan: 
+WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet +"# + ); +} + +#[test] +fn test_window_partial_constant_and_set_monotonicity_9() { + assert_snapshot!(TestWindowCase { + partition_by: true, + window_frame: Arc::new(WindowFrame::new(None)), + func: fn_max_on_ordered(), + required_sort: vec![ + ("nullable_col", true, false), + ("max", false, false), + ], + }.run(), + @ r#" +Input Plan: +SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 DESC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + +Optimized Plan: +WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet +"# + ); +} + +#[test] +fn test_window_partial_constant_and_set_monotonicity_10() { + assert_snapshot!(TestWindowCase { + partition_by: true, + window_frame: Arc::new(WindowFrame::new(None)), + func: fn_min_on_ordered(), + required_sort: vec![ + ("min", 
false, false), + ("nullable_col", true, false), + ], + }.run(), + @ r#" + Input / Optimized Plan: + SortExec: expr=[min@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# + ); +} + +#[test] +fn test_window_partial_constant_and_set_monotonicity_11() { + assert_snapshot!(TestWindowCase { + partition_by: true, + window_frame: Arc::new(WindowFrame::new(None)), + func: fn_avg_on_ordered(), + required_sort: vec![ + ("avg", true, false), + ("nullable_col", true, false), + ], + }.run(), + @ r#" + Input / Optimized Plan: + SortExec: expr=[avg@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# + ); +} + +// =============================================REGION ENDS============================================= +// = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = +// ============================================REGION STARTS============================================ +// WindowAggExec + Plain(unbounded preceding, unbounded following) + partition_by + on unordered column +// Case 12: +#[test] +fn 
test_window_partial_constant_and_set_monotonicity_12() { + assert_snapshot!(TestWindowCase { + partition_by: true, + window_frame: Arc::new(WindowFrame::new(None)), + func: fn_count_on_unordered(), + required_sort: vec![ + ("non_nullable_col", true, false), + ("count", true, false), + ], + }.run(), + @ r#" + Input / Optimized Plan: + SortExec: expr=[non_nullable_col@1 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# + ); +} + +// Case 13: +#[test] +fn test_window_partial_constant_and_set_monotonicity_13() { + assert_snapshot!(TestWindowCase { + partition_by: true, + window_frame: Arc::new(WindowFrame::new(None)), + func: fn_max_on_unordered(), + required_sort: vec![ + ("non_nullable_col", true, false), + ("max", false, false), + ], + }.run(), + @ r#" + Input / Optimized Plan: + SortExec: expr=[non_nullable_col@1 ASC NULLS LAST, max@2 DESC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# + ); +} + +// Case 14: +#[test] +fn test_window_partial_constant_and_set_monotonicity_14() { + assert_snapshot!(TestWindowCase { + partition_by: true, + window_frame: Arc::new(WindowFrame::new(None)), + func: 
fn_min_on_unordered(), + required_sort: vec![ + ("min", false, false), + ("non_nullable_col", true, false), + ], + }.run(), + @ r#" + Input / Optimized Plan: + SortExec: expr=[min@2 DESC NULLS LAST, non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# + ); +} + +// Case 15: +#[test] +fn test_window_partial_constant_and_set_monotonicity_15() { + assert_snapshot!(TestWindowCase { + partition_by: true, + window_frame: Arc::new(WindowFrame::new(None)), + func: fn_avg_on_unordered(), + required_sort: vec![ + ("avg", true, false), + ("nullable_col", true, false), + ], + }.run(), + @ r#" + Input / Optimized Plan: + SortExec: expr=[avg@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# + ); +} + +// =============================================REGION ENDS============================================= +// = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = +// ============================================REGION STARTS============================================ +// WindowAggExec + Sliding(current row, unbounded following) + no 
partition_by + on ordered column +// Case 16: +#[test] +fn test_window_partial_constant_and_set_monotonicity_16() { + assert_snapshot!(TestWindowCase { + partition_by: false, + window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()), + func: fn_count_on_ordered(), + required_sort: vec![ + ("nullable_col", true, false), + ("count", false, false), + ], + }.run(), + @ r#" +Input Plan: +SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 DESC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + +Optimized Plan: +WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet +"# + ); +} + +// Case 17: +#[test] +fn test_window_partial_constant_and_set_monotonicity_17() { + assert_snapshot!(TestWindowCase { + partition_by: false, + window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()), + func: fn_max_on_ordered(), + required_sort: vec![ + ("max", false, true), + ("nullable_col", true, false), + ], + }.run(), + @ r#" +Input Plan: +SortExec: expr=[max@2 DESC, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: 
CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + +Optimized Plan: +WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet +"# + ); +} + +// Case 18: +#[test] +fn test_window_partial_constant_and_set_monotonicity_18() { + assert_snapshot!(TestWindowCase { + partition_by: false, + window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()), + func: fn_min_on_ordered(), + required_sort: vec![ + ("min", true, true), + ("nullable_col", true, false), + ], + }.run(), + @ r#" +Input Plan: +SortExec: expr=[min@2 ASC, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + +Optimized Plan: +WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet +"# + ); +} + +// Case 19: +#[test] +fn 
test_window_partial_constant_and_set_monotonicity_19() { + assert_snapshot!(TestWindowCase { + partition_by: false, + window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()), + func: fn_avg_on_ordered(), + required_sort: vec![ + ("avg", false, false), + ("nullable_col", true, false), + ], + }.run(), + @ r#" + Input / Optimized Plan: + SortExec: expr=[avg@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# + ); +} + +// =============================================REGION ENDS============================================= +// = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = +// ============================================REGION STARTS============================================ +// WindowAggExec + Sliding(current row, unbounded following) + no partition_by + on unordered column +// Case 20: +#[test] +fn test_window_partial_constant_and_set_monotonicity_20() { + assert_snapshot!(TestWindowCase { + partition_by: false, + window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()), + func: fn_count_on_unordered(), + required_sort: vec![ + ("nullable_col", true, false), + ("count", true, false), + ], + }.run(), + @ r#" + Input / Optimized Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: 
Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# + ); +} + +// Case 21: +#[test] +fn test_window_partial_constant_and_set_monotonicity_21() { + assert_snapshot!(TestWindowCase { + partition_by: false, + window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()), + func: fn_max_on_unordered(), + required_sort: vec![ + ("nullable_col", true, false), + ("max", false, true), + ], + }.run(), + @ r#" +Input Plan: +SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 DESC], preserve_partitioning=[false] + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + +Optimized Plan: +WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet +"# + ); +} + +// Case 22: +#[test] +fn test_window_partial_constant_and_set_monotonicity_22() { + assert_snapshot!(TestWindowCase { + partition_by: false, + window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()), + func: fn_min_on_unordered(), + required_sort: vec![ + ("min", true, false), + ("nullable_col", true, false), + ], + }.run(), + @ r#" + Input / Optimized Plan: + SortExec: expr=[min@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + 
WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# + ); +} + +// Case 23: +#[test] +fn test_window_partial_constant_and_set_monotonicity_23() { + assert_snapshot!(TestWindowCase { + partition_by: false, + window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()), + func: fn_avg_on_unordered(), + required_sort: vec![ + ("avg", false, false), + ("nullable_col", true, false), + ], + }.run(), + @ r#" + Input / Optimized Plan: + SortExec: expr=[avg@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# + ); +} + +// =============================================REGION ENDS============================================= +// = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = +// ============================================REGION STARTS============================================ +// WindowAggExec + Sliding(current row, unbounded following) + partition_by + on ordered column +// Case 24: +#[test] +fn test_window_partial_constant_and_set_monotonicity_24() { + assert_snapshot!(TestWindowCase { + partition_by: true, + window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()), + func: fn_count_on_ordered(), + required_sort: 
vec![ + ("nullable_col", true, false), + ("count", false, false), + ], + }.run(), + @ r#" +Input Plan: +SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 DESC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + +Optimized Plan: +WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet +"# + ); +} + +// Case 25: +#[test] +fn test_window_partial_constant_and_set_monotonicity_25() { + assert_snapshot!(TestWindowCase { + partition_by: true, + window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()), + func: fn_max_on_ordered(), + required_sort: vec![ + ("nullable_col", true, false), + ("max", true, false), + ], + }.run(), + @ r#" + Input / Optimized Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# + ); +} + +// Case 26: +#[test] +fn 
test_window_partial_constant_and_set_monotonicity_26() { + assert_snapshot!(TestWindowCase { + partition_by: true, + window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()), + func: fn_min_on_ordered(), + required_sort: vec![ + ("min", false, false), + ], + }.run(), + @ r#" + Input / Optimized Plan: + SortExec: expr=[min@2 DESC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "#); +} + +// Case 27: +#[test] +fn test_window_partial_constant_and_set_monotonicity_27() { + assert_snapshot!( + TestWindowCase { + partition_by: true, + window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()), + func: fn_avg_on_ordered(), + required_sort: vec![ + ("avg", false, false), + ], + }.run(), + @ r#" + Input / Optimized Plan: + SortExec: expr=[avg@2 DESC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "#); +} + +// =============================================REGION ENDS============================================= +// = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = +// ============================================REGION STARTS============================================ +// WindowAggExec + Sliding(current row, 
unbounded following) + partition_by + on unordered column + +// Case 28: +#[test] +fn test_window_partial_constant_and_set_monotonicity_28() { + assert_snapshot!( + TestWindowCase { + partition_by: true, + window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()), + func: fn_count_on_unordered(), + required_sort: vec![ + ("count", false, false), + ("nullable_col", true, false), + ], + }.run(), + @ r#" + Input / Optimized Plan: + SortExec: expr=[count@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# + ); +} + +// Case 29: +#[test] +fn test_window_partial_constant_and_set_monotonicity_29() { + assert_snapshot!(TestWindowCase { + partition_by: true, + window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()), + func: fn_max_on_unordered(), + required_sort: vec![ + ("nullable_col", true, false), + ("max", false, true), + ], + }.run(), + @ r#" +Input Plan: +SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 DESC], preserve_partitioning=[false] + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + +Optimized Plan: +WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: 
WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet +"#) +} + +// Case 30: +#[test] +fn test_window_partial_constant_and_set_monotonicity_30() { + assert_snapshot!(TestWindowCase { + partition_by: true, + window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()), + func: fn_min_on_unordered(), + required_sort: vec![ + ("min", false, false), + ], + }.run(), + @ r#" + Input / Optimized Plan: + SortExec: expr=[min@2 DESC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "#); +} + +// Case 31: +#[test] +fn test_window_partial_constant_and_set_monotonicity_31() { + assert_snapshot!(TestWindowCase { + partition_by: true, + window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()), + func: fn_avg_on_unordered(), + required_sort: vec![ + ("nullable_col", true, false), + ("avg", true, false), + ], + }.run(), + @ r#" + Input / Optimized Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, avg@2 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + 
"# + ); +} + +// =============================================REGION ENDS============================================= +// = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = +// ============================================REGION STARTS============================================ +// BoundedWindowAggExec + Plain(unbounded preceding, current row) + no partition_by + on ordered column + +// Case 32: +#[test] +fn test_window_partial_constant_and_set_monotonicity_32() { + assert_snapshot!(TestWindowCase { + partition_by: false, + window_frame: Arc::new(WindowFrame::new(Some(true))), + func: fn_count_on_ordered(), + required_sort: vec![ + ("nullable_col", true, false), + ("count", true, false), + ], + }.run(), + @ r#" +Input Plan: +SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + +Optimized Plan: +BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet +"# + ); +} + +// Case 33: +#[test] +fn test_window_partial_constant_and_set_monotonicity_33() { + assert_snapshot!(TestWindowCase { + partition_by: false, + window_frame: Arc::new(WindowFrame::new(Some(true))), + func: fn_max_on_ordered(), + required_sort: vec![ + ("max", false, false), + ("nullable_col", true, false), + ], + 
}.run(), + @ r#" + Input / Optimized Plan: + SortExec: expr=[max@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[max: Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# + ); +} + +// Case 34: +#[test] +fn test_window_partial_constant_and_set_monotonicity_34() { + assert_snapshot!(TestWindowCase { + partition_by: false, + window_frame: Arc::new(WindowFrame::new(Some(true))), + func: fn_min_on_ordered(), + required_sort: vec![ + ("min", false, false), + ("nullable_col", true, false), + ], + }.run(), + @ r#" +Input Plan: +SortExec: expr=[min@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + +Optimized Plan: +BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet +"# + ); +} +// Case 35: +#[test] +fn test_window_partial_constant_and_set_monotonicity_35() { + assert_snapshot!(TestWindowCase { + partition_by: false, + window_frame: Arc::new(WindowFrame::new(Some(true))), + func: fn_avg_on_ordered(), + 
required_sort: vec![ + ("nullable_col", true, false), + ("avg", true, false), + ], + }.run(), + @ r#" + Input / Optimized Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, avg@2 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# + ); +} + +// =============================================REGION ENDS============================================= +// = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = +// ============================================REGION STARTS============================================ +// BoundedWindowAggExec + Plain(unbounded preceding, current row) + no partition_by + on unordered column + +// Case 36: +#[test] +fn test_window_partial_constant_and_set_monotonicity_36() { + assert_snapshot!(TestWindowCase { + partition_by: false, + window_frame: Arc::new(WindowFrame::new(Some(true))), + func: fn_count_on_unordered(), + required_sort: vec![ + ("nullable_col", true, false), + ("count", true, true), + ], + }.run(), + @ r#" +Input Plan: +SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + +Optimized Plan: +BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, 
dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet +"# + ); +} + +// Case 37: +#[test] +fn test_window_partial_constant_and_set_monotonicity_37() { + assert_snapshot!(TestWindowCase { + partition_by: false, + window_frame: Arc::new(WindowFrame::new(Some(true))), + func: fn_max_on_unordered(), + required_sort: vec![ + ("max", true, false), + ("nullable_col", true, false), + ], + }.run(), + @ r#" +Input Plan: +SortExec: expr=[max@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[max: Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + +Optimized Plan: +BoundedWindowAggExec: wdw=[max: Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet +"# + ); +} + +// Case 38: +#[test] +fn test_window_partial_constant_and_set_monotonicity_38() { + assert_snapshot!(TestWindowCase { + partition_by: false, + window_frame: Arc::new(WindowFrame::new(Some(true))), + func: fn_min_on_unordered(), + required_sort: vec![ + ("min", false, true), + ("nullable_col", true, false), + ], + }.run(), + @ r#" + Input / Optimized Plan: + SortExec: expr=[min@2 DESC, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[min: 
Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# + ); +} + +// Case 39: +#[test] +fn test_window_partial_constant_and_set_monotonicity_39() { + assert_snapshot!(TestWindowCase { + partition_by: false, + window_frame: Arc::new(WindowFrame::new(Some(true))), + func: fn_avg_on_unordered(), + required_sort: vec![ + ("avg", true, false), + ], + }.run(), + @ r#" + Input / Optimized Plan: + SortExec: expr=[avg@2 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# + ); +} + +// =============================================REGION ENDS============================================= +// = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = +// ============================================REGION STARTS============================================ +// BoundedWindowAggExec + Plain(unbounded preceding, current row) + partition_by + on ordered column + +// Case 40: +#[test] +fn test_window_partial_constant_and_set_monotonicity_40() { + assert_snapshot!(TestWindowCase { + partition_by: true, + window_frame: Arc::new(WindowFrame::new(Some(true))), + func: fn_count_on_ordered(), + required_sort: vec![ + ("nullable_col", true, false), + ("count", true, false), + ], + }.run(), + @ r#" +Input Plan: +SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], 
preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + +Optimized Plan: +BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet +"# + ); +} + +// Case 41: +#[test] +fn test_window_partial_constant_and_set_monotonicity_41() { + assert_snapshot!(TestWindowCase { + partition_by: true, + window_frame: Arc::new(WindowFrame::new(Some(true))), + func: fn_max_on_ordered(), + required_sort: vec![ + ("max", true, false), + ("nullable_col", true, false), + ], + }.run(), + @ r#" + Input / Optimized Plan: + SortExec: expr=[max@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[max: Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# + ); +} + +// Case 42: +#[test] +fn test_window_partial_constant_and_set_monotonicity_42() { + assert_snapshot!(TestWindowCase { + partition_by: true, + window_frame: Arc::new(WindowFrame::new(Some(true))), + func: fn_min_on_ordered(), + required_sort: vec![ + ("min", false, false), + ("nullable_col", true, false), + ], + }.run(), + @ r#" + 
Input / Optimized Plan: + SortExec: expr=[min@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# + ); +} + +// Case 43: +#[test] +fn test_window_partial_constant_and_set_monotonicity_43() { + assert_snapshot!(TestWindowCase { + partition_by: true, + window_frame: Arc::new(WindowFrame::new(Some(true))), + func: fn_avg_on_ordered(), + required_sort: vec![ + ("nullable_col", true, false), + ("avg", true, false), + ], + }.run(), + @ r#" + Input / Optimized Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, avg@2 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# + ); +} + +// =============================================REGION ENDS============================================= +// = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = +// ============================================REGION STARTS============================================ +// BoundedWindowAggExec + Plain(unbounded preceding, unbounded following) + partition_by + on unordered column + +// Case 44: +#[test] +fn test_window_partial_constant_and_set_monotonicity_44() { + assert_snapshot!(TestWindowCase { + partition_by: true, + window_frame: Arc::new(WindowFrame::new(Some(true))), + func: 
fn_count_on_unordered(), + required_sort: vec![ + ("count", true, true), + ], + }.run(), + @ r#" + Input / Optimized Plan: + SortExec: expr=[count@2 ASC], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# + ); +} + +// Case 45: +#[test] +fn test_window_partial_constant_and_set_monotonicity_45() { + assert_snapshot!(TestWindowCase { + partition_by: true, + window_frame: Arc::new(WindowFrame::new(Some(true))), + func: fn_max_on_unordered(), + required_sort: vec![ + ("nullable_col", true, false), + ("max", false, false), + ], + }.run(), + @ r#" + Input / Optimized Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 DESC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[max: Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# + ); +} + +// Case 46: +#[test] +fn test_window_partial_constant_and_set_monotonicity_46() { + assert_snapshot!(TestWindowCase { + partition_by: true, + window_frame: Arc::new(WindowFrame::new(Some(true))), + func: fn_min_on_unordered(), + required_sort: vec![ + ("nullable_col", true, false), + ("min", false, false), + ], + }.run(), + @ r#" +Input Plan: +SortExec: expr=[nullable_col@0 ASC NULLS LAST, min@2 DESC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: 
false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + +Optimized Plan: +BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet +"# + ); +} + +// Case 47: +#[test] +fn test_window_partial_constant_and_set_monotonicity_47() { + assert_snapshot!(TestWindowCase { + partition_by: true, + window_frame: Arc::new(WindowFrame::new(Some(true))), + func: fn_avg_on_unordered(), + required_sort: vec![ + ("nullable_col", true, false), + ], + }.run(), + @ r#" +Input Plan: +SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + +Optimized Plan: +BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet +"# + ); +} + +// =============================================REGION ENDS============================================= +// = = = = = = = = = = = = = = = = = = = = = = = = 
= = = = = = = = = = = = = = = = = = = = = = = = = = = +// ============================================REGION STARTS============================================ +// BoundedWindowAggExec + Sliding(bounded preceding, bounded following) + no partition_by + on ordered column + +// Case 48: +#[test] +fn test_window_partial_constant_and_set_monotonicity_48() { + assert_snapshot!(TestWindowCase { + partition_by: false, + window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32).unwrap()), WindowFrameBound::CurrentRow)), + func: fn_count_on_ordered(), + required_sort: vec![ + ("count", true, false), + ("nullable_col", true, false), + ], + }.run(), + @ r#" +Input Plan: +SortExec: expr=[count@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + +Optimized Plan: +BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet +"# + ); +} + +// Case 49: +#[test] +fn test_window_partial_constant_and_set_monotonicity_49() { + assert_snapshot!(TestWindowCase { + partition_by: false, + window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32).unwrap()), WindowFrameBound::Following(ScalarValue::new_one(&DataType::UInt32).unwrap()))), 
+ func: fn_max_on_ordered(), + required_sort: vec![ + ("max", true, false), + ], + }.run(), + @ r#" + Input / Optimized Plan: + SortExec: expr=[max@2 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[max: Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# + ); +} + +// Case 50: +#[test] +fn test_window_partial_constant_and_set_monotonicity_50() { + assert_snapshot!(TestWindowCase { + partition_by: false, + window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32).unwrap()), WindowFrameBound::CurrentRow)), + func: fn_min_on_ordered(), + required_sort: vec![ + ("nullable_col", true, false), + ("min", false, false), + ], + }.run(), + @ r#" +Input Plan: +SortExec: expr=[nullable_col@0 ASC NULLS LAST, min@2 DESC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + +Optimized Plan: +BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet +"# + ); +} + +// Case 51: +#[test] +fn test_window_partial_constant_and_set_monotonicity_51() { + 
assert_snapshot!(TestWindowCase { + partition_by: false, + window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32).unwrap()), WindowFrameBound::CurrentRow)), + func: fn_avg_on_ordered(), + required_sort: vec![ + ("avg", true, false), + ], + }.run(), + @ r#" + Input / Optimized Plan: + SortExec: expr=[avg@2 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# + ); +} + +// =============================================REGION ENDS============================================= +// = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = +// ============================================REGION STARTS============================================ +// BoundedWindowAggExec + Sliding(bounded preceding, bounded following) + no partition_by + on unordered column + +// Case 52: +#[test] +fn test_window_partial_constant_and_set_monotonicity_52() { + assert_snapshot!(TestWindowCase { + partition_by: false, + window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32).unwrap()), WindowFrameBound::Following(ScalarValue::new_one(&DataType::UInt32).unwrap()))), + func: fn_count_on_unordered(), + required_sort: vec![ + ("count", true, false), + ("nullable_col", true, false), + ], + }.run(), + @ r#" + Input / Optimized Plan: + SortExec: expr=[count@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 
0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# + ); +} + +// Case 53: +#[test] +fn test_window_partial_constant_and_set_monotonicity_53() { + assert_snapshot!(TestWindowCase { + partition_by: false, + window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32).unwrap()), WindowFrameBound::CurrentRow)), + func: fn_max_on_unordered(), + required_sort: vec![ + ("nullable_col", true, false), + ("max", true, false), + ], + }.run(), + @ r#" + Input / Optimized Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[max: Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# + ); +} + +// Case 54: +#[test] +fn test_window_partial_constant_and_set_monotonicity_54() { + assert_snapshot!(TestWindowCase { + partition_by: false, + window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32).unwrap()), WindowFrameBound::CurrentRow)), + func: fn_min_on_unordered(), + required_sort: vec![ + ("min", true, false), + ], + }.run(), + @ r#" + Input / Optimized Plan: + SortExec: expr=[min@2 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: 
file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# + ); +} + +// Case 55: +#[test] +fn test_window_partial_constant_and_set_monotonicity_55() { + assert_snapshot!(TestWindowCase { + partition_by: false, + window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32).unwrap()), WindowFrameBound::Following(ScalarValue::new_one(&DataType::UInt32).unwrap()))), + func: fn_avg_on_unordered(), + required_sort: vec![ + ("nullable_col", true, false), + ], + }.run(), + @ r#" +Input Plan: +SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + +Optimized Plan: +BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet +"# + ); +} + +// =============================================REGION ENDS============================================= +// = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = +// ============================================REGION STARTS============================================ +// BoundedWindowAggExec + Sliding(bounded preceding, bounded following) + partition_by + on ordered column + +// Case 56: +#[test] +fn 
test_window_partial_constant_and_set_monotonicity_56() { + assert_snapshot!(TestWindowCase { + partition_by: true, + window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32).unwrap()), WindowFrameBound::CurrentRow)), + func: fn_count_on_ordered(), + required_sort: vec![ + ("count", true, false), + ("nullable_col", true, false), + ], + }.run(), + @ r#" +Input Plan: +SortExec: expr=[count@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + +Optimized Plan: +BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet +"# + ); +} + +// Case 57: +#[test] +fn test_window_partial_constant_and_set_monotonicity_57() { + assert_snapshot!(TestWindowCase { + partition_by: true, + window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32).unwrap()), WindowFrameBound::Following(ScalarValue::new_one(&DataType::UInt32).unwrap()))), + func: fn_max_on_ordered(), + required_sort: vec![ + ("nullable_col", true, false), + ("max", true, false), + ], + }.run(), + @ r#" + Input / Optimized Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[max: Field 
{ name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# + ); +} + +// Case 58: +#[test] +fn test_window_partial_constant_and_set_monotonicity_58() { + assert_snapshot!(TestWindowCase { + partition_by: true, + window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32).unwrap()), WindowFrameBound::CurrentRow)), + func: fn_min_on_ordered(), + required_sort: vec![ + ("min", false, false), + ("nullable_col", true, false), + ], + }.run(), + @ r#" + Input / Optimized Plan: + SortExec: expr=[min@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# + ); +} + +// Case 59: +#[test] +fn test_window_partial_constant_and_set_monotonicity_59() { + assert_snapshot!(TestWindowCase { + partition_by: true, + window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32).unwrap()), WindowFrameBound::CurrentRow)), + func: fn_avg_on_ordered(), + required_sort: vec![ + ("avg", true, false), + ], + }.run(), + @ r#" + Input / Optimized Plan: + SortExec: expr=[avg@2 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 
PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# + ); +} + +// =============================================REGION ENDS============================================= +// = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = +// ============================================REGION STARTS============================================ +// BoundedWindowAggExec + Sliding(bounded preceding, bounded following) + partition_by + on unordered column + +// Case 60: +#[test] +fn test_window_partial_constant_and_set_monotonicity_60() { + assert_snapshot!(TestWindowCase { + partition_by: true, + window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32).unwrap()), WindowFrameBound::CurrentRow)), + func: fn_count_on_unordered(), + required_sort: vec![ + ("nullable_col", true, false), + ("count", true, false), + ], + }.run(), + @ r#" + Input / Optimized Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# + ); +} + +// Case 61: +#[test] +fn test_window_partial_constant_and_set_monotonicity_61() { + assert_snapshot!(TestWindowCase { + partition_by: true, + window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32).unwrap()), WindowFrameBound::CurrentRow)), + func: fn_max_on_unordered(), + required_sort: 
vec![ + ("nullable_col", true, false), + ("max", true, true), + ], + }.run(), + @ r#" + Input / Optimized Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 ASC], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[max: Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# + ); +} + +// Case 62: +#[test] +fn test_window_partial_constant_and_set_monotonicity_62() { + assert_snapshot!(TestWindowCase { + partition_by: true, + window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32).unwrap()), WindowFrameBound::CurrentRow)), + func: fn_min_on_unordered(), + required_sort: vec![ + ("nullable_col", true, false), + ("min", false, false), + ], + }.run(), + @ r#" + Input / Optimized Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, min@2 DESC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# + ); +} + +// Case 63: +#[test] +fn test_window_partial_constant_and_set_monotonicity_63() { + assert_snapshot!(TestWindowCase { + partition_by: true, + window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32).unwrap()), WindowFrameBound::CurrentRow)), + func: fn_avg_on_unordered(), + required_sort: vec![ + ("nullable_col", true, false), + ], + }.run(), + @ r#" +Input 
Plan: +SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + +Optimized Plan: +BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet +"# + ); +} +// =============================================REGION ENDS============================================= diff --git a/datafusion/core/tests/physical_optimizer/mod.rs b/datafusion/core/tests/physical_optimizer/mod.rs index 777c26e80e902..936c02eb2a02d 100644 --- a/datafusion/core/tests/physical_optimizer/mod.rs +++ b/datafusion/core/tests/physical_optimizer/mod.rs @@ -21,6 +21,7 @@ mod aggregate_statistics; mod combine_partial_final_agg; mod enforce_distribution; mod enforce_sorting; +mod enforce_sorting_monotonicity; mod filter_pushdown; mod join_selection; mod limit_pushdown; From a61a9c2c10ae9cb153acc12f44d554d55af59c2d Mon Sep 17 00:00:00 2001 From: peasee <98815791+peasee@users.noreply.github.com> Date: Thu, 16 Oct 2025 05:55:21 +1000 Subject: [PATCH 002/109] fix: Ensure ListingTable partitions are pruned when filters are not used (#17958) * fix: Prune partitions when no filters are defined * fix: Formatting * chore: Cargo fmt * chore: Clippy --- datafusion/catalog-listing/src/helpers.rs | 23 +++++++-- .../core/src/datasource/listing/table.rs | 47 +++++++++++++++++++ 2 files changed, 66 insertions(+), 4 deletions(-) diff --git 
a/datafusion/catalog-listing/src/helpers.rs b/datafusion/catalog-listing/src/helpers.rs index 00e9c71df3489..82cc36867939e 100644 --- a/datafusion/catalog-listing/src/helpers.rs +++ b/datafusion/catalog-listing/src/helpers.rs @@ -156,6 +156,7 @@ pub fn split_files( chunks } +#[derive(Debug)] pub struct Partition { /// The path to the partition, including the table prefix path: Path, @@ -245,7 +246,16 @@ async fn prune_partitions( partition_cols: &[(String, DataType)], ) -> Result> { if filters.is_empty() { - return Ok(partitions); + // prune partitions which don't contain the partition columns + return Ok(partitions + .into_iter() + .filter(|p| { + let cols = partition_cols.iter().map(|x| x.0.as_str()); + !parse_partitions_for_path(table_path, &p.path, cols) + .unwrap_or_default() + .is_empty() + }) + .collect()); } let mut builders: Vec<_> = (0..partition_cols.len()) @@ -432,6 +442,7 @@ pub async fn pruned_partition_list<'a>( } let partition_prefix = evaluate_partition_prefix(partition_cols, filters); + let partitions = list_partitions(store, table_path, partition_cols.len(), partition_prefix) .await?; @@ -502,12 +513,12 @@ where let subpath = table_path.strip_prefix(file_path)?; let mut part_values = vec![]; - for (part, pn) in subpath.zip(table_partition_cols) { + for (part, expected_partition) in subpath.zip(table_partition_cols) { match part.split_once('=') { - Some((name, val)) if name == pn => part_values.push(val), + Some((name, val)) if name == expected_partition => part_values.push(val), _ => { debug!( - "Ignoring file: file_path='{file_path}', table_path='{table_path}', part='{part}', partition_col='{pn}'", + "Ignoring file: file_path='{file_path}', table_path='{table_path}', part='{part}', partition_col='{expected_partition}'", ); return None; } @@ -594,6 +605,8 @@ mod tests { ("tablepath/mypartition=val1/notparquetfile", 100), ("tablepath/mypartition=val1/ignoresemptyfile.parquet", 0), ("tablepath/file.parquet", 100), + 
("tablepath/notapartition/file.parquet", 100), + ("tablepath/notmypartition=val1/file.parquet", 100), ]); let filter = Expr::eq(col("mypartition"), lit("val1")); let pruned = pruned_partition_list( @@ -619,6 +632,8 @@ mod tests { ("tablepath/mypartition=val2/file.parquet", 100), ("tablepath/mypartition=val1/ignoresemptyfile.parquet", 0), ("tablepath/mypartition=val1/other=val3/file.parquet", 100), + ("tablepath/notapartition/file.parquet", 100), + ("tablepath/notmypartition=val1/file.parquet", 100), ]); let filter = Expr::eq(col("mypartition"), lit("val1")); let pruned = pruned_partition_list( diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index 3ce58938d77e4..4ffb6d41864f3 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -2732,6 +2732,52 @@ mod tests { Ok(()) } + #[tokio::test] + async fn test_listing_table_prunes_extra_files_in_hive() -> Result<()> { + let files = [ + "bucket/test/pid=1/file1", + "bucket/test/pid=1/file2", + "bucket/test/pid=2/file3", + "bucket/test/pid=2/file4", + "bucket/test/other/file5", + ]; + + let ctx = SessionContext::new(); + register_test_store(&ctx, &files.iter().map(|f| (*f, 10)).collect::>()); + + let opt = ListingOptions::new(Arc::new(JsonFormat::default())) + .with_file_extension_opt(Some("")) + .with_table_partition_cols(vec![("pid".to_string(), DataType::Int32)]); + + let table_path = ListingTableUrl::parse("test:///bucket/test/").unwrap(); + let schema = Schema::new(vec![Field::new("a", DataType::Boolean, false)]); + let config = ListingTableConfig::new(table_path) + .with_listing_options(opt) + .with_schema(Arc::new(schema)); + + let table = ListingTable::try_new(config)?; + + let (file_list, _) = table.list_files_for_scan(&ctx.state(), &[], None).await?; + assert_eq!(file_list.len(), 1); + + let files = file_list[0].clone(); + + assert_eq!( + files + .iter() + .map(|f| f.path().to_string()) + 
.collect::>(), + vec![ + "bucket/test/pid=1/file1", + "bucket/test/pid=1/file2", + "bucket/test/pid=2/file3", + "bucket/test/pid=2/file4", + ] + ); + + Ok(()) + } + #[cfg(feature = "parquet")] #[tokio::test] async fn test_table_stats_behaviors() -> Result<()> { @@ -2750,6 +2796,7 @@ mod tests { let config_default = ListingTableConfig::new(table_path.clone()) .with_listing_options(opt_default) .with_schema(schema_default); + let table_default = ListingTable::try_new(config_default)?; let exec_default = table_default.scan(&state, None, &[], None).await?; From 41fdab9d29244b62fe5537db6fd5decdb046d339 Mon Sep 17 00:00:00 2001 From: Tobias Schwarzinger Date: Wed, 15 Oct 2025 21:56:16 +0200 Subject: [PATCH 003/109] Push Down Filter Subexpressions in Nested Loop Joins as Projections (#17906) * Check-in NestedLoopJoinProjectionPushDown * Update Cargo.lock * Add some comments * Update slts that are affected by the nl-join-projection-push-down * please lints * Move code into projection_pushdown.rs * Remove explicit coalesce batches * Docs --- Cargo.lock | 1 + datafusion/physical-optimizer/Cargo.toml | 1 + .../src/projection_pushdown.rs | 748 +++++++++++++++++- .../sqllogictest/test_files/join.slt.part | 5 +- datafusion/sqllogictest/test_files/joins.slt | 10 +- .../test_files/tpch/plans/q11.slt.part | 97 +-- .../test_files/tpch/plans/q22.slt.part | 41 +- 7 files changed, 824 insertions(+), 79 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 00bd64f21eb11..bbf64d5262e29 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2452,6 +2452,7 @@ dependencies = [ "datafusion-execution", "datafusion-expr", "datafusion-expr-common", + "datafusion-functions", "datafusion-physical-expr", "datafusion-physical-expr-common", "datafusion-physical-plan", diff --git a/datafusion/physical-optimizer/Cargo.toml b/datafusion/physical-optimizer/Cargo.toml index 15466cd86bb04..4df011fc0a05e 100644 --- a/datafusion/physical-optimizer/Cargo.toml +++ b/datafusion/physical-optimizer/Cargo.toml @@ -52,5 
+52,6 @@ recursive = { workspace = true, optional = true } [dev-dependencies] datafusion-expr = { workspace = true } +datafusion-functions = { workspace = true } insta = { workspace = true } tokio = { workspace = true } diff --git a/datafusion/physical-optimizer/src/projection_pushdown.rs b/datafusion/physical-optimizer/src/projection_pushdown.rs index 34affcbd4a19b..987e3cb6f713e 100644 --- a/datafusion/physical-optimizer/src/projection_pushdown.rs +++ b/datafusion/physical-optimizer/src/projection_pushdown.rs @@ -20,18 +20,32 @@ //! projections one by one if the operator below is amenable to this. If a //! projection reaches a source, it can even disappear from the plan entirely. -use std::sync::Arc; - use crate::PhysicalOptimizerRule; +use arrow::datatypes::{Fields, Schema, SchemaRef}; +use datafusion_common::alias::AliasGenerator; +use std::collections::HashSet; +use std::sync::Arc; use datafusion_common::config::ConfigOptions; -use datafusion_common::tree_node::{TransformedResult, TreeNode}; -use datafusion_common::Result; -use datafusion_physical_plan::projection::remove_unnecessary_projections; +use datafusion_common::tree_node::{ + Transformed, TransformedResult, TreeNode, TreeNodeRecursion, +}; +use datafusion_common::{JoinSide, JoinType, Result}; +use datafusion_physical_expr::expressions::Column; +use datafusion_physical_expr_common::physical_expr::PhysicalExpr; +use datafusion_physical_plan::joins::utils::{ColumnIndex, JoinFilter}; +use datafusion_physical_plan::joins::NestedLoopJoinExec; +use datafusion_physical_plan::projection::{ + remove_unnecessary_projections, ProjectionExec, +}; use datafusion_physical_plan::ExecutionPlan; /// This rule inspects `ProjectionExec`'s in the given physical plan and tries to /// remove or swap with its child. +/// +/// Furthermore, tries to push down projections from nested loop join filters that only depend on +/// one side of the join. 
By pushing these projections down, functions that only depend on one side +/// of the join no longer need to be evaluated for the cartesian product of the two sides. #[derive(Default, Debug)] pub struct ProjectionPushdown {} @@ -48,6 +62,20 @@ impl PhysicalOptimizerRule for ProjectionPushdown { plan: Arc, _config: &ConfigOptions, ) -> Result> { + let alias_generator = AliasGenerator::new(); + let plan = plan + .transform_up(|plan| { + match plan.as_any().downcast_ref::() { + None => Ok(Transformed::no(plan)), + Some(hash_join) => try_push_down_join_filter( + Arc::clone(&plan), + hash_join, + &alias_generator, + ), + } + }) + .map(|t| t.data)?; + plan.transform_down(remove_unnecessary_projections).data() } @@ -59,3 +87,713 @@ impl PhysicalOptimizerRule for ProjectionPushdown { true } } + +/// Tries to push down parts of the filter. +/// +/// See [JoinFilterRewriter] for details. +fn try_push_down_join_filter( + original_plan: Arc, + join: &NestedLoopJoinExec, + alias_generator: &AliasGenerator, +) -> Result>> { + // Mark joins are currently not supported.
+ if matches!(join.join_type(), JoinType::LeftMark | JoinType::RightMark) { + return Ok(Transformed::no(original_plan)); + } + + let projections = join.projection(); + let Some(filter) = join.filter() else { + return Ok(Transformed::no(original_plan)); + }; + + let original_lhs_length = join.left().schema().fields().len(); + let original_rhs_length = join.right().schema().fields().len(); + + let lhs_rewrite = try_push_down_projection( + Arc::clone(&join.right().schema()), + Arc::clone(join.left()), + JoinSide::Left, + filter.clone(), + alias_generator, + )?; + let rhs_rewrite = try_push_down_projection( + Arc::clone(&lhs_rewrite.data.0.schema()), + Arc::clone(join.right()), + JoinSide::Right, + lhs_rewrite.data.1, + alias_generator, + )?; + if !lhs_rewrite.transformed && !rhs_rewrite.transformed { + return Ok(Transformed::no(original_plan)); + } + + let join_filter = minimize_join_filter( + Arc::clone(rhs_rewrite.data.1.expression()), + rhs_rewrite.data.1.column_indices().to_vec(), + lhs_rewrite.data.0.schema().as_ref(), + rhs_rewrite.data.0.schema().as_ref(), + ); + + let new_lhs_length = lhs_rewrite.data.0.schema().fields.len(); + let projections = match projections { + None => match join.join_type() { + JoinType::Inner | JoinType::Left | JoinType::Right | JoinType::Full => { + // Build projections that ignore the newly projected columns. 
+ let mut projections = Vec::new(); + projections.extend(0..original_lhs_length); + projections.extend(new_lhs_length..new_lhs_length + original_rhs_length); + projections + } + JoinType::LeftSemi | JoinType::LeftAnti => { + // Only return original left columns + let mut projections = Vec::new(); + projections.extend(0..original_lhs_length); + projections + } + JoinType::RightSemi | JoinType::RightAnti => { + // Only return original right columns + let mut projections = Vec::new(); + projections.extend(0..original_rhs_length); + projections + } + _ => unreachable!("Unsupported join type"), + }, + Some(projections) => { + let rhs_offset = new_lhs_length - original_lhs_length; + projections + .iter() + .map(|idx| { + if *idx >= original_lhs_length { + idx + rhs_offset + } else { + *idx + } + }) + .collect() + } + }; + + Ok(Transformed::yes(Arc::new(NestedLoopJoinExec::try_new( + lhs_rewrite.data.0, + rhs_rewrite.data.0, + Some(join_filter), + join.join_type(), + Some(projections), + )?))) +} + +/// Tries to push down parts of `expr` into the `join_side`. 
+fn try_push_down_projection( + other_schema: SchemaRef, + plan: Arc, + join_side: JoinSide, + join_filter: JoinFilter, + alias_generator: &AliasGenerator, +) -> Result, JoinFilter)>> { + let expr = Arc::clone(join_filter.expression()); + let original_plan_schema = plan.schema(); + let mut rewriter = JoinFilterRewriter::new( + join_side, + original_plan_schema.as_ref(), + join_filter.column_indices().to_vec(), + alias_generator, + ); + let new_expr = rewriter.rewrite(expr)?; + + if new_expr.transformed { + let new_join_side = + ProjectionExec::try_new(rewriter.join_side_projections, plan)?; + let new_schema = Arc::clone(&new_join_side.schema()); + + let (lhs_schema, rhs_schema) = match join_side { + JoinSide::Left => (new_schema, other_schema), + JoinSide::Right => (other_schema, new_schema), + JoinSide::None => unreachable!("Mark join not supported"), + }; + let intermediate_schema = rewriter + .intermediate_column_indices + .iter() + .map(|ci| match ci.side { + JoinSide::Left => Arc::clone(&lhs_schema.fields[ci.index]), + JoinSide::Right => Arc::clone(&rhs_schema.fields[ci.index]), + JoinSide::None => unreachable!("Mark join not supported"), + }) + .collect::(); + + let join_filter = JoinFilter::new( + new_expr.data, + rewriter.intermediate_column_indices, + Arc::new(Schema::new(intermediate_schema)), + ); + Ok(Transformed::yes((Arc::new(new_join_side), join_filter))) + } else { + Ok(Transformed::no((plan, join_filter))) + } +} + +/// Creates a new [JoinFilter] and tries to minimize the internal schema. +/// +/// This could eliminate some columns that were only part of a computation that has been pushed +/// down. As this computation is now materialized on one side of the join, the original input +/// columns are not needed anymore. 
+fn minimize_join_filter( + expr: Arc, + old_column_indices: Vec, + lhs_schema: &Schema, + rhs_schema: &Schema, +) -> JoinFilter { + let mut used_columns = HashSet::new(); + expr.apply(|expr| { + if let Some(col) = expr.as_any().downcast_ref::() { + used_columns.insert(col.index()); + } + Ok(TreeNodeRecursion::Continue) + }) + .expect("Closure cannot fail"); + + let new_column_indices = old_column_indices + .iter() + .enumerate() + .filter(|(idx, _)| used_columns.contains(idx)) + .map(|(_, ci)| ci.clone()) + .collect::>(); + let fields = new_column_indices + .iter() + .map(|ci| match ci.side { + JoinSide::Left => lhs_schema.field(ci.index).clone(), + JoinSide::Right => rhs_schema.field(ci.index).clone(), + JoinSide::None => unreachable!("Mark join not supported"), + }) + .collect::(); + + let final_expr = expr + .transform_up(|expr| match expr.as_any().downcast_ref::() { + None => Ok(Transformed::no(expr)), + Some(column) => { + let new_idx = used_columns + .iter() + .filter(|idx| **idx < column.index()) + .count(); + let new_column = Column::new(column.name(), new_idx); + Ok(Transformed::yes( + Arc::new(new_column) as Arc + )) + } + }) + .expect("Closure cannot fail"); + + JoinFilter::new( + final_expr.data, + new_column_indices, + Arc::new(Schema::new(fields)), + ) +} + +/// Implements the push-down machinery. +/// +/// The rewriter starts at the top of the filter expression and traverses the expression tree. For +/// each (sub-)expression, the rewriter checks whether it only refers to one side of the join. If +/// this is never the case, no subexpressions of the filter can be pushed down. If there is a +/// subexpression that can be computed using only one side of the join, the entire subexpression is +/// pushed down to the join side. 
+struct JoinFilterRewriter<'a> { + join_side: JoinSide, + join_side_schema: &'a Schema, + join_side_projections: Vec<(Arc, String)>, + intermediate_column_indices: Vec, + alias_generator: &'a AliasGenerator, +} + +impl<'a> JoinFilterRewriter<'a> { + /// Creates a new [JoinFilterRewriter]. + fn new( + join_side: JoinSide, + join_side_schema: &'a Schema, + column_indices: Vec, + alias_generator: &'a AliasGenerator, + ) -> Self { + let projections = join_side_schema + .fields() + .iter() + .enumerate() + .map(|(idx, field)| { + ( + Arc::new(Column::new(field.name(), idx)) as Arc, + field.name().to_string(), + ) + }) + .collect(); + + Self { + join_side, + join_side_schema, + join_side_projections: projections, + intermediate_column_indices: column_indices, + alias_generator, + } + } + + /// Executes the push-down machinery on `expr`. + /// + /// See the [JoinFilterRewriter] for further information. + fn rewrite( + &mut self, + expr: Arc, + ) -> Result>> { + let depends_on_this_side = self.depends_on_join_side(&expr, self.join_side)?; + // We don't push down things that do not depend on this side (other side or no side). + if !depends_on_this_side { + return Ok(Transformed::no(expr)); + } + + // Recurse if there is a dependency to both sides or if the entire expression is volatile. + let depends_on_other_side = + self.depends_on_join_side(&expr, self.join_side.negate())?; + let is_volatile = is_volatile_expression_tree(expr.as_ref()); + if depends_on_other_side || is_volatile { + return expr.map_children(|expr| self.rewrite(expr)); + } + + // There is only a dependency on this side. + + // If this expression has no children, we do not push down, as it should already be a column + // reference. + if expr.children().is_empty() { + return Ok(Transformed::no(expr)); + } + + // Otherwise, we push down a projection. 
+ let alias = self.alias_generator.next("join_proj_push_down"); + let idx = self.create_new_column(alias.clone(), expr)?; + + Ok(Transformed::yes( + Arc::new(Column::new(&alias, idx)) as Arc + )) + } + + /// Creates a new column in the current join side. + fn create_new_column( + &mut self, + name: String, + expr: Arc, + ) -> Result { + // First, add a new projection. The expression must be rewritten, as it is no longer + // executed against the filter schema. + let new_idx = self.join_side_projections.len(); + let rewritten_expr = expr.transform_up(|expr| { + Ok(match expr.as_any().downcast_ref::() { + None => Transformed::no(expr), + Some(column) => { + let intermediate_column = + &self.intermediate_column_indices[column.index()]; + assert_eq!(intermediate_column.side, self.join_side); + + let join_side_index = intermediate_column.index; + let field = self.join_side_schema.field(join_side_index); + let new_column = Column::new(field.name(), join_side_index); + Transformed::yes(Arc::new(new_column) as Arc) + } + }) + })?; + self.join_side_projections.push((rewritten_expr.data, name)); + + // Then, update the column indices + let new_intermediate_idx = self.intermediate_column_indices.len(); + let idx = ColumnIndex { + index: new_idx, + side: self.join_side, + }; + self.intermediate_column_indices.push(idx); + + Ok(new_intermediate_idx) + } + + /// Checks whether the entire expression depends on the given `join_side`. 
+ fn depends_on_join_side( + &mut self, + expr: &Arc, + join_side: JoinSide, + ) -> Result { + let mut result = false; + expr.apply(|expr| match expr.as_any().downcast_ref::() { + None => Ok(TreeNodeRecursion::Continue), + Some(c) => { + let column_index = &self.intermediate_column_indices[c.index()]; + if column_index.side == join_side { + result = true; + return Ok(TreeNodeRecursion::Stop); + } + Ok(TreeNodeRecursion::Continue) + } + })?; + + Ok(result) + } +} + +fn is_volatile_expression_tree(expr: &dyn PhysicalExpr) -> bool { + if expr.is_volatile_node() { + return true; + } + + expr.children() + .iter() + .map(|expr| is_volatile_expression_tree(expr.as_ref())) + .reduce(|lhs, rhs| lhs || rhs) + .unwrap_or(false) +} + +#[cfg(test)] +mod test { + use super::*; + use arrow::datatypes::{DataType, Field, FieldRef, Schema}; + use datafusion_expr_common::operator::Operator; + use datafusion_functions::math::random; + use datafusion_physical_expr::expressions::{binary, lit}; + use datafusion_physical_expr::ScalarFunctionExpr; + use datafusion_physical_expr_common::physical_expr::PhysicalExpr; + use datafusion_physical_plan::displayable; + use datafusion_physical_plan::empty::EmptyExec; + use insta::assert_snapshot; + use std::sync::Arc; + + #[tokio::test] + async fn no_computation_does_not_project() -> Result<()> { + let (left_schema, right_schema) = create_simple_schemas(); + let optimized_plan = run_test( + left_schema, + right_schema, + a_x(), + None, + a_greater_than_x, + JoinType::Inner, + )?; + + assert_snapshot!(optimized_plan, @r" + NestedLoopJoinExec: join_type=Inner, filter=a@0 > x@1 + EmptyExec + EmptyExec + "); + Ok(()) + } + + #[tokio::test] + async fn simple_push_down() -> Result<()> { + let (left_schema, right_schema) = create_simple_schemas(); + let optimized_plan = run_test( + left_schema, + right_schema, + a_x(), + None, + a_plus_one_greater_than_x_plus_one, + JoinType::Inner, + )?; + + assert_snapshot!(optimized_plan, @r" + NestedLoopJoinExec: 
join_type=Inner, filter=join_proj_push_down_1@0 > join_proj_push_down_2@1, projection=[a@0, x@2] + ProjectionExec: expr=[a@0 as a, a@0 + 1 as join_proj_push_down_1] + EmptyExec + ProjectionExec: expr=[x@0 as x, x@0 + 1 as join_proj_push_down_2] + EmptyExec + "); + Ok(()) + } + + #[tokio::test] + async fn does_not_push_down_short_circuiting_expressions() -> Result<()> { + let (left_schema, right_schema) = create_simple_schemas(); + let optimized_plan = run_test( + left_schema, + right_schema, + a_x(), + None, + |schema| { + binary( + lit(false), + Operator::And, + a_plus_one_greater_than_x_plus_one(schema)?, + schema, + ) + }, + JoinType::Inner, + )?; + + assert_snapshot!(optimized_plan, @r" + NestedLoopJoinExec: join_type=Inner, filter=false AND join_proj_push_down_1@0 > join_proj_push_down_2@1, projection=[a@0, x@2] + ProjectionExec: expr=[a@0 as a, a@0 + 1 as join_proj_push_down_1] + EmptyExec + ProjectionExec: expr=[x@0 as x, x@0 + 1 as join_proj_push_down_2] + EmptyExec + "); + Ok(()) + } + + #[tokio::test] + async fn does_not_push_down_volatile_functions() -> Result<()> { + let (left_schema, right_schema) = create_simple_schemas(); + let optimized_plan = run_test( + left_schema, + right_schema, + a_x(), + None, + a_plus_rand_greater_than_x, + JoinType::Inner, + )?; + + assert_snapshot!(optimized_plan, @r" + NestedLoopJoinExec: join_type=Inner, filter=a@0 + rand() > x@1 + EmptyExec + EmptyExec + "); + Ok(()) + } + + #[tokio::test] + async fn complex_schema_push_down() -> Result<()> { + let (left_schema, right_schema) = create_complex_schemas(); + + let optimized_plan = run_test( + left_schema, + right_schema, + a_b_x_z(), + None, + a_plus_b_greater_than_x_plus_z, + JoinType::Inner, + )?; + + assert_snapshot!(optimized_plan, @r" + NestedLoopJoinExec: join_type=Inner, filter=join_proj_push_down_1@0 > join_proj_push_down_2@1, projection=[a@0, b@1, c@2, x@4, y@5, z@6] + ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c, a@0 + b@1 as join_proj_push_down_1] + 
EmptyExec + ProjectionExec: expr=[x@0 as x, y@1 as y, z@2 as z, x@0 + z@2 as join_proj_push_down_2] + EmptyExec + "); + Ok(()) + } + + #[tokio::test] + async fn push_down_with_existing_projections() -> Result<()> { + let (left_schema, right_schema) = create_complex_schemas(); + + let optimized_plan = run_test( + left_schema, + right_schema, + a_b_x_z(), + Some(vec![1, 3, 5]), // ("b", "x", "z") + a_plus_b_greater_than_x_plus_z, + JoinType::Inner, + )?; + + assert_snapshot!(optimized_plan, @r" + NestedLoopJoinExec: join_type=Inner, filter=join_proj_push_down_1@0 > join_proj_push_down_2@1, projection=[b@1, x@4, z@6] + ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c, a@0 + b@1 as join_proj_push_down_1] + EmptyExec + ProjectionExec: expr=[x@0 as x, y@1 as y, z@2 as z, x@0 + z@2 as join_proj_push_down_2] + EmptyExec + "); + Ok(()) + } + + #[tokio::test] + async fn left_semi_join_projection() -> Result<()> { + let (left_schema, right_schema) = create_simple_schemas(); + + let left_semi_join_plan = run_test( + left_schema.clone(), + right_schema.clone(), + a_x(), + None, + a_plus_one_greater_than_x_plus_one, + JoinType::LeftSemi, + )?; + + assert_snapshot!(left_semi_join_plan, @r" + NestedLoopJoinExec: join_type=LeftSemi, filter=join_proj_push_down_1@0 > join_proj_push_down_2@1, projection=[a@0] + ProjectionExec: expr=[a@0 as a, a@0 + 1 as join_proj_push_down_1] + EmptyExec + ProjectionExec: expr=[x@0 as x, x@0 + 1 as join_proj_push_down_2] + EmptyExec + "); + Ok(()) + } + + #[tokio::test] + async fn right_semi_join_projection() -> Result<()> { + let (left_schema, right_schema) = create_simple_schemas(); + let right_semi_join_plan = run_test( + left_schema, + right_schema, + a_x(), + None, + a_plus_one_greater_than_x_plus_one, + JoinType::RightSemi, + )?; + assert_snapshot!(right_semi_join_plan, @r" + NestedLoopJoinExec: join_type=RightSemi, filter=join_proj_push_down_1@0 > join_proj_push_down_2@1, projection=[x@0] + ProjectionExec: expr=[a@0 as a, a@0 + 1 as 
join_proj_push_down_1] + EmptyExec + ProjectionExec: expr=[x@0 as x, x@0 + 1 as join_proj_push_down_2] + EmptyExec + "); + Ok(()) + } + + fn run_test( + left_schema: Schema, + right_schema: Schema, + column_indices: Vec, + existing_projections: Option>, + filter_expr_builder: impl FnOnce(&Schema) -> Result>, + join_type: JoinType, + ) -> Result { + let left = Arc::new(EmptyExec::new(Arc::new(left_schema.clone()))); + let right = Arc::new(EmptyExec::new(Arc::new(right_schema.clone()))); + + let join_fields: Vec<_> = column_indices + .iter() + .map(|ci| match ci.side { + JoinSide::Left => left_schema.field(ci.index).clone(), + JoinSide::Right => right_schema.field(ci.index).clone(), + JoinSide::None => unreachable!(), + }) + .collect(); + let join_schema = Arc::new(Schema::new(join_fields)); + + let filter_expr = filter_expr_builder(join_schema.as_ref())?; + + let join_filter = JoinFilter::new(filter_expr, column_indices, join_schema); + + let join = NestedLoopJoinExec::try_new( + left, + right, + Some(join_filter), + &join_type, + existing_projections, + )?; + + let optimizer = ProjectionPushdown::new(); + let optimized_plan = optimizer.optimize(Arc::new(join), &Default::default())?; + + let displayable_plan = displayable(optimized_plan.as_ref()).indent(false); + Ok(displayable_plan.to_string()) + } + + fn create_simple_schemas() -> (Schema, Schema) { + let left_schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); + let right_schema = Schema::new(vec![Field::new("x", DataType::Int32, false)]); + + (left_schema, right_schema) + } + + fn create_complex_schemas() -> (Schema, Schema) { + let left_schema = Schema::new(vec![ + Field::new("a", DataType::Int32, false), + Field::new("b", DataType::Int32, false), + Field::new("c", DataType::Int32, false), + ]); + + let right_schema = Schema::new(vec![ + Field::new("x", DataType::Int32, false), + Field::new("y", DataType::Int32, false), + Field::new("z", DataType::Int32, false), + ]); + + (left_schema, 
right_schema) + } + + fn a_x() -> Vec { + vec![ + ColumnIndex { + index: 0, + side: JoinSide::Left, + }, + ColumnIndex { + index: 0, + side: JoinSide::Right, + }, + ] + } + + fn a_b_x_z() -> Vec { + vec![ + ColumnIndex { + index: 0, + side: JoinSide::Left, + }, + ColumnIndex { + index: 1, + side: JoinSide::Left, + }, + ColumnIndex { + index: 0, + side: JoinSide::Right, + }, + ColumnIndex { + index: 2, + side: JoinSide::Right, + }, + ] + } + + fn a_plus_one_greater_than_x_plus_one( + join_schema: &Schema, + ) -> Result> { + let left_expr = binary( + Arc::new(Column::new("a", 0)), + Operator::Plus, + lit(1), + join_schema, + )?; + let right_expr = binary( + Arc::new(Column::new("x", 1)), + Operator::Plus, + lit(1), + join_schema, + )?; + binary(left_expr, Operator::Gt, right_expr, join_schema) + } + + fn a_plus_rand_greater_than_x(join_schema: &Schema) -> Result> { + let left_expr = binary( + Arc::new(Column::new("a", 0)), + Operator::Plus, + Arc::new(ScalarFunctionExpr::new( + "rand", + random(), + vec![], + FieldRef::new(Field::new("out", DataType::Float64, false)), + Arc::new(ConfigOptions::default()), + )), + join_schema, + )?; + let right_expr = Arc::new(Column::new("x", 1)); + binary(left_expr, Operator::Gt, right_expr, join_schema) + } + + fn a_greater_than_x(join_schema: &Schema) -> Result> { + binary( + Arc::new(Column::new("a", 0)), + Operator::Gt, + Arc::new(Column::new("x", 1)), + join_schema, + ) + } + + fn a_plus_b_greater_than_x_plus_z( + join_schema: &Schema, + ) -> Result> { + let lhs = binary( + Arc::new(Column::new("a", 0)), + Operator::Plus, + Arc::new(Column::new("b", 1)), + join_schema, + )?; + let rhs = binary( + Arc::new(Column::new("x", 2)), + Operator::Plus, + Arc::new(Column::new("z", 3)), + join_schema, + )?; + binary(lhs, Operator::Gt, rhs, join_schema) + } +} diff --git a/datafusion/sqllogictest/test_files/join.slt.part b/datafusion/sqllogictest/test_files/join.slt.part index 2abe654a96c8c..fe3356af88fcc 100644 --- 
a/datafusion/sqllogictest/test_files/join.slt.part +++ b/datafusion/sqllogictest/test_files/join.slt.part @@ -849,9 +849,10 @@ logical_plan 05)----TableScan: department projection=[dept_name] physical_plan 01)ProjectionExec: expr=[emp_id@1 as emp_id, name@2 as name, dept_name@0 as dept_name] -02)--NestedLoopJoinExec: join_type=Right, filter=name@0 = Alice OR name@0 = Bob +02)--NestedLoopJoinExec: join_type=Right, filter=join_proj_push_down_1@0, projection=[dept_name@0, emp_id@1, name@2] 03)----DataSourceExec: partitions=1, partition_sizes=[1] -04)----DataSourceExec: partitions=1, partition_sizes=[1] +04)----ProjectionExec: expr=[emp_id@0 as emp_id, name@1 as name, name@1 = Alice OR name@1 = Bob as join_proj_push_down_1] +05)------DataSourceExec: partitions=1, partition_sizes=[1] query ITT rowsort SELECT e.emp_id, e.name, d.dept_name diff --git a/datafusion/sqllogictest/test_files/joins.slt b/datafusion/sqllogictest/test_files/joins.slt index 96d2bad086e66..9472395da6418 100644 --- a/datafusion/sqllogictest/test_files/joins.slt +++ b/datafusion/sqllogictest/test_files/joins.slt @@ -3519,10 +3519,12 @@ logical_plan 04)--SubqueryAlias: t2 05)----TableScan: annotated_data projection=[a0, a, b, c, d] physical_plan -01)NestedLoopJoinExec: join_type=Inner, filter=example(CAST(a@0 AS Float64), CAST(a@1 AS Float64)) > 3 -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true -03)--RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true +01)NestedLoopJoinExec: join_type=Inner, filter=example(join_proj_push_down_1@0, join_proj_push_down_2@1) > 3, 
projection=[a0@0, a@1, b@2, c@3, d@4, a0@6, a@7, b@8, c@9, d@10] +02)--ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, CAST(a@1 AS Float64) as join_proj_push_down_1] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true +04)--ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, CAST(a@1 AS Float64) as join_proj_push_down_2] +05)----RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +06)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true #### # Config teardown diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q11.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q11.slt.part index a6225daae4362..6b03d708c7fa2 100644 --- a/datafusion/sqllogictest/test_files/tpch/plans/q11.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/plans/q11.slt.part @@ -75,51 +75,52 @@ logical_plan physical_plan 01)SortExec: TopK(fetch=10), expr=[value@1 DESC], preserve_partitioning=[false] 02)--ProjectionExec: expr=[ps_partkey@0 as ps_partkey, sum(partsupp.ps_supplycost * partsupp.ps_availqty)@1 as value] -03)----NestedLoopJoinExec: join_type=Inner, filter=CAST(sum(partsupp.ps_supplycost * partsupp.ps_availqty)@0 AS Decimal128(38, 15)) > sum(partsupp.ps_supplycost * partsupp.ps_availqty) * Float64(0.0001)@1, projection=[ps_partkey@0, sum(partsupp.ps_supplycost * partsupp.ps_availqty)@1] -04)------CoalescePartitionsExec -05)--------AggregateExec: mode=FinalPartitioned, gby=[ps_partkey@0 as ps_partkey], aggr=[sum(partsupp.ps_supplycost * partsupp.ps_availqty)] -06)----------CoalesceBatchesExec: target_batch_size=8192 -07)------------RepartitionExec: 
partitioning=Hash([ps_partkey@0], 4), input_partitions=4 -08)--------------AggregateExec: mode=Partial, gby=[ps_partkey@0 as ps_partkey], aggr=[sum(partsupp.ps_supplycost * partsupp.ps_availqty)] -09)----------------CoalesceBatchesExec: target_batch_size=8192 -10)------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_nationkey@3, n_nationkey@0)], projection=[ps_partkey@0, ps_availqty@1, ps_supplycost@2] -11)--------------------CoalesceBatchesExec: target_batch_size=8192 -12)----------------------RepartitionExec: partitioning=Hash([s_nationkey@3], 4), input_partitions=4 -13)------------------------CoalesceBatchesExec: target_batch_size=8192 -14)--------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(ps_suppkey@1, s_suppkey@0)], projection=[ps_partkey@0, ps_availqty@2, ps_supplycost@3, s_nationkey@5] -15)----------------------------CoalesceBatchesExec: target_batch_size=8192 -16)------------------------------RepartitionExec: partitioning=Hash([ps_suppkey@1], 4), input_partitions=4 -17)--------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:0..2932049], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:2932049..5864098], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:5864098..8796147], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:8796147..11728193]]}, projection=[ps_partkey, ps_suppkey, ps_availqty, ps_supplycost], file_type=csv, has_header=false -18)----------------------------CoalesceBatchesExec: target_batch_size=8192 -19)------------------------------RepartitionExec: partitioning=Hash([s_suppkey@0], 4), input_partitions=4 -20)--------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -21)----------------------------------DataSourceExec: file_groups={1 group: 
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_nationkey], file_type=csv, has_header=false -22)--------------------CoalesceBatchesExec: target_batch_size=8192 -23)----------------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=4 -24)------------------------CoalesceBatchesExec: target_batch_size=8192 -25)--------------------------FilterExec: n_name@1 = GERMANY, projection=[n_nationkey@0] -26)----------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -27)------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name], file_type=csv, has_header=false -28)------ProjectionExec: expr=[CAST(CAST(sum(partsupp.ps_supplycost * partsupp.ps_availqty)@0 AS Float64) * 0.0001 AS Decimal128(38, 15)) as sum(partsupp.ps_supplycost * partsupp.ps_availqty) * Float64(0.0001)] -29)--------AggregateExec: mode=Final, gby=[], aggr=[sum(partsupp.ps_supplycost * partsupp.ps_availqty)] -30)----------CoalescePartitionsExec -31)------------AggregateExec: mode=Partial, gby=[], aggr=[sum(partsupp.ps_supplycost * partsupp.ps_availqty)] -32)--------------CoalesceBatchesExec: target_batch_size=8192 -33)----------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_nationkey@2, n_nationkey@0)], projection=[ps_availqty@0, ps_supplycost@1] -34)------------------CoalesceBatchesExec: target_batch_size=8192 -35)--------------------RepartitionExec: partitioning=Hash([s_nationkey@2], 4), input_partitions=4 -36)----------------------CoalesceBatchesExec: target_batch_size=8192 -37)------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(ps_suppkey@0, s_suppkey@0)], projection=[ps_availqty@1, ps_supplycost@2, s_nationkey@4] -38)--------------------------CoalesceBatchesExec: target_batch_size=8192 -39)----------------------------RepartitionExec: 
partitioning=Hash([ps_suppkey@0], 4), input_partitions=4 -40)------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:0..2932049], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:2932049..5864098], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:5864098..8796147], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:8796147..11728193]]}, projection=[ps_suppkey, ps_availqty, ps_supplycost], file_type=csv, has_header=false -41)--------------------------CoalesceBatchesExec: target_batch_size=8192 -42)----------------------------RepartitionExec: partitioning=Hash([s_suppkey@0], 4), input_partitions=4 -43)------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -44)--------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_nationkey], file_type=csv, has_header=false -45)------------------CoalesceBatchesExec: target_batch_size=8192 -46)--------------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=4 -47)----------------------CoalesceBatchesExec: target_batch_size=8192 -48)------------------------FilterExec: n_name@1 = GERMANY, projection=[n_nationkey@0] -49)--------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -50)----------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name], file_type=csv, has_header=false +03)----NestedLoopJoinExec: join_type=Inner, filter=join_proj_push_down_1@1 > sum(partsupp.ps_supplycost * partsupp.ps_availqty) * Float64(0.0001)@0, projection=[ps_partkey@0, sum(partsupp.ps_supplycost * partsupp.ps_availqty)@1, sum(partsupp.ps_supplycost * 
partsupp.ps_availqty) * Float64(0.0001)@3] +04)------ProjectionExec: expr=[ps_partkey@0 as ps_partkey, sum(partsupp.ps_supplycost * partsupp.ps_availqty)@1 as sum(partsupp.ps_supplycost * partsupp.ps_availqty), CAST(sum(partsupp.ps_supplycost * partsupp.ps_availqty)@1 AS Decimal128(38, 15)) as join_proj_push_down_1] +05)--------CoalescePartitionsExec +06)----------AggregateExec: mode=FinalPartitioned, gby=[ps_partkey@0 as ps_partkey], aggr=[sum(partsupp.ps_supplycost * partsupp.ps_availqty)] +07)------------CoalesceBatchesExec: target_batch_size=8192 +08)--------------RepartitionExec: partitioning=Hash([ps_partkey@0], 4), input_partitions=4 +09)----------------AggregateExec: mode=Partial, gby=[ps_partkey@0 as ps_partkey], aggr=[sum(partsupp.ps_supplycost * partsupp.ps_availqty)] +10)------------------CoalesceBatchesExec: target_batch_size=8192 +11)--------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_nationkey@3, n_nationkey@0)], projection=[ps_partkey@0, ps_availqty@1, ps_supplycost@2] +12)----------------------CoalesceBatchesExec: target_batch_size=8192 +13)------------------------RepartitionExec: partitioning=Hash([s_nationkey@3], 4), input_partitions=4 +14)--------------------------CoalesceBatchesExec: target_batch_size=8192 +15)----------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(ps_suppkey@1, s_suppkey@0)], projection=[ps_partkey@0, ps_availqty@2, ps_supplycost@3, s_nationkey@5] +16)------------------------------CoalesceBatchesExec: target_batch_size=8192 +17)--------------------------------RepartitionExec: partitioning=Hash([ps_suppkey@1], 4), input_partitions=4 +18)----------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:0..2932049], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:2932049..5864098], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:5864098..8796147], 
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:8796147..11728193]]}, projection=[ps_partkey, ps_suppkey, ps_availqty, ps_supplycost], file_type=csv, has_header=false +19)------------------------------CoalesceBatchesExec: target_batch_size=8192 +20)--------------------------------RepartitionExec: partitioning=Hash([s_suppkey@0], 4), input_partitions=4 +21)----------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +22)------------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_nationkey], file_type=csv, has_header=false +23)----------------------CoalesceBatchesExec: target_batch_size=8192 +24)------------------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=4 +25)--------------------------CoalesceBatchesExec: target_batch_size=8192 +26)----------------------------FilterExec: n_name@1 = GERMANY, projection=[n_nationkey@0] +27)------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +28)--------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name], file_type=csv, has_header=false +29)------ProjectionExec: expr=[CAST(CAST(sum(partsupp.ps_supplycost * partsupp.ps_availqty)@0 AS Float64) * 0.0001 AS Decimal128(38, 15)) as sum(partsupp.ps_supplycost * partsupp.ps_availqty) * Float64(0.0001)] +30)--------AggregateExec: mode=Final, gby=[], aggr=[sum(partsupp.ps_supplycost * partsupp.ps_availqty)] +31)----------CoalescePartitionsExec +32)------------AggregateExec: mode=Partial, gby=[], aggr=[sum(partsupp.ps_supplycost * partsupp.ps_availqty)] +33)--------------CoalesceBatchesExec: target_batch_size=8192 +34)----------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_nationkey@2, 
n_nationkey@0)], projection=[ps_availqty@0, ps_supplycost@1] +35)------------------CoalesceBatchesExec: target_batch_size=8192 +36)--------------------RepartitionExec: partitioning=Hash([s_nationkey@2], 4), input_partitions=4 +37)----------------------CoalesceBatchesExec: target_batch_size=8192 +38)------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(ps_suppkey@0, s_suppkey@0)], projection=[ps_availqty@1, ps_supplycost@2, s_nationkey@4] +39)--------------------------CoalesceBatchesExec: target_batch_size=8192 +40)----------------------------RepartitionExec: partitioning=Hash([ps_suppkey@0], 4), input_partitions=4 +41)------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:0..2932049], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:2932049..5864098], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:5864098..8796147], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:8796147..11728193]]}, projection=[ps_suppkey, ps_availqty, ps_supplycost], file_type=csv, has_header=false +42)--------------------------CoalesceBatchesExec: target_batch_size=8192 +43)----------------------------RepartitionExec: partitioning=Hash([s_suppkey@0], 4), input_partitions=4 +44)------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +45)--------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_nationkey], file_type=csv, has_header=false +46)------------------CoalesceBatchesExec: target_batch_size=8192 +47)--------------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=4 +48)----------------------CoalesceBatchesExec: target_batch_size=8192 +49)------------------------FilterExec: n_name@1 = GERMANY, projection=[n_nationkey@0] 
+50)--------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +51)----------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name], file_type=csv, has_header=false diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q22.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q22.slt.part index fc9c01843cc75..22476156b80d8 100644 --- a/datafusion/sqllogictest/test_files/tpch/plans/q22.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/plans/q22.slt.part @@ -83,23 +83,24 @@ physical_plan 07)------------AggregateExec: mode=Partial, gby=[cntrycode@0 as cntrycode], aggr=[count(Int64(1)), sum(custsale.c_acctbal)] 08)--------------ProjectionExec: expr=[substr(c_phone@0, 1, 2) as cntrycode, c_acctbal@1 as c_acctbal] 09)----------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -10)------------------NestedLoopJoinExec: join_type=Inner, filter=CAST(c_acctbal@0 AS Decimal128(19, 6)) > avg(customer.c_acctbal)@1 -11)--------------------CoalescePartitionsExec -12)----------------------CoalesceBatchesExec: target_batch_size=8192 -13)------------------------HashJoinExec: mode=Partitioned, join_type=LeftAnti, on=[(c_custkey@0, o_custkey@0)], projection=[c_phone@1, c_acctbal@2] -14)--------------------------CoalesceBatchesExec: target_batch_size=8192 -15)----------------------------RepartitionExec: partitioning=Hash([c_custkey@0], 4), input_partitions=4 -16)------------------------------CoalesceBatchesExec: target_batch_size=8192 -17)--------------------------------FilterExec: substr(c_phone@1, 1, 2) IN ([13, 31, 23, 29, 30, 18, 17]) -18)----------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -19)------------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl]]}, 
projection=[c_custkey, c_phone, c_acctbal], file_type=csv, has_header=false -20)--------------------------CoalesceBatchesExec: target_batch_size=8192 -21)----------------------------RepartitionExec: partitioning=Hash([o_custkey@0], 4), input_partitions=4 -22)------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_custkey], file_type=csv, has_header=false -23)--------------------AggregateExec: mode=Final, gby=[], aggr=[avg(customer.c_acctbal)] -24)----------------------CoalescePartitionsExec -25)------------------------AggregateExec: mode=Partial, gby=[], aggr=[avg(customer.c_acctbal)] -26)--------------------------CoalesceBatchesExec: target_batch_size=8192 -27)----------------------------FilterExec: c_acctbal@1 > Some(0),15,2 AND substr(c_phone@0, 1, 2) IN ([13, 31, 23, 29, 30, 18, 17]), projection=[c_acctbal@1] -28)------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -29)--------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl]]}, projection=[c_phone, c_acctbal], file_type=csv, has_header=false +10)------------------NestedLoopJoinExec: join_type=Inner, filter=join_proj_push_down_1@1 > avg(customer.c_acctbal)@0, projection=[c_phone@0, c_acctbal@1, avg(customer.c_acctbal)@3] +11)--------------------ProjectionExec: expr=[c_phone@0 as c_phone, c_acctbal@1 as c_acctbal, CAST(c_acctbal@1 AS Decimal128(19, 6)) as join_proj_push_down_1] +12)----------------------CoalescePartitionsExec +13)------------------------CoalesceBatchesExec: target_batch_size=8192 
+14)--------------------------HashJoinExec: mode=Partitioned, join_type=LeftAnti, on=[(c_custkey@0, o_custkey@0)], projection=[c_phone@1, c_acctbal@2] +15)----------------------------CoalesceBatchesExec: target_batch_size=8192 +16)------------------------------RepartitionExec: partitioning=Hash([c_custkey@0], 4), input_partitions=4 +17)--------------------------------CoalesceBatchesExec: target_batch_size=8192 +18)----------------------------------FilterExec: substr(c_phone@1, 1, 2) IN ([13, 31, 23, 29, 30, 18, 17]) +19)------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +20)--------------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl]]}, projection=[c_custkey, c_phone, c_acctbal], file_type=csv, has_header=false +21)----------------------------CoalesceBatchesExec: target_batch_size=8192 +22)------------------------------RepartitionExec: partitioning=Hash([o_custkey@0], 4), input_partitions=4 +23)--------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_custkey], file_type=csv, has_header=false +24)--------------------AggregateExec: mode=Final, gby=[], aggr=[avg(customer.c_acctbal)] +25)----------------------CoalescePartitionsExec +26)------------------------AggregateExec: mode=Partial, gby=[], aggr=[avg(customer.c_acctbal)] +27)--------------------------CoalesceBatchesExec: target_batch_size=8192 +28)----------------------------FilterExec: c_acctbal@1 > Some(0),15,2 AND substr(c_phone@0, 1, 2) IN ([13, 31, 23, 29, 30, 18, 17]), 
projection=[c_acctbal@1] +29)------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +30)--------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl]]}, projection=[c_phone, c_acctbal], file_type=csv, has_header=false From 264030cca76d0bdb4d8809f252b422e72624a345 Mon Sep 17 00:00:00 2001 From: Oleks V Date: Wed, 15 Oct 2025 13:06:02 -0700 Subject: [PATCH 004/109] feat: support Spark `concat` string function (#18063) * chore: Extend backtrace coverage * fmt * part2 * feedback * clippy * feat: support Spark `concat` * clippy * comments * test * doc --- .../spark/src/function/string/concat.rs | 306 ++++++++++++++++++ datafusion/spark/src/function/string/mod.rs | 8 + .../test_files/spark/string/concat.slt | 48 +++ 3 files changed, 362 insertions(+) create mode 100644 datafusion/spark/src/function/string/concat.rs create mode 100644 datafusion/sqllogictest/test_files/spark/string/concat.slt diff --git a/datafusion/spark/src/function/string/concat.rs b/datafusion/spark/src/function/string/concat.rs new file mode 100644 index 0000000000000..0e981e7c37224 --- /dev/null +++ b/datafusion/spark/src/function/string/concat.rs @@ -0,0 +1,306 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::{Array, ArrayBuilder}; +use arrow::datatypes::DataType; +use datafusion_common::{Result, ScalarValue}; +use datafusion_expr::{ + ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignature, + Volatility, +}; +use datafusion_functions::string::concat::ConcatFunc; +use std::any::Any; +use std::sync::Arc; + +/// Spark-compatible `concat` expression +/// +/// +/// Concatenates multiple input strings into a single string. +/// Returns NULL if any input is NULL. +#[derive(Debug, PartialEq, Eq, Hash)] +pub struct SparkConcat { + signature: Signature, +} + +impl Default for SparkConcat { + fn default() -> Self { + Self::new() + } +} + +impl SparkConcat { + pub fn new() -> Self { + Self { + signature: Signature::one_of( + vec![TypeSignature::UserDefined, TypeSignature::Nullary], + Volatility::Immutable, + ), + } + } +} + +impl ScalarUDFImpl for SparkConcat { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "concat" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Utf8) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + spark_concat(args) + } + + fn coerce_types(&self, arg_types: &[DataType]) -> Result> { + // Accept any string types, including zero arguments + Ok(arg_types.to_vec()) + } +} + +/// Concatenates strings, returning NULL if any input is NULL +/// This is a Spark-specific wrapper around DataFusion's concat that returns NULL +/// if any argument is NULL (Spark behavior), whereas DataFusion's concat ignores NULLs. 
+fn spark_concat(args: ScalarFunctionArgs) -> Result { + let ScalarFunctionArgs { + args: arg_values, + arg_fields, + number_rows, + return_field, + config_options, + } = args; + + // Handle zero-argument case: return empty string + if arg_values.is_empty() { + return Ok(ColumnarValue::Scalar(ScalarValue::Utf8( + Some(String::new()), + ))); + } + + // Step 1: Check for NULL mask in incoming args + let null_mask = compute_null_mask(&arg_values, number_rows)?; + + // If all scalars and any is NULL, return NULL immediately + if null_mask.is_none() { + return Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None))); + } + + // Step 2: Delegate to DataFusion's concat + let concat_func = ConcatFunc::new(); + let func_args = ScalarFunctionArgs { + args: arg_values, + arg_fields, + number_rows, + return_field, + config_options, + }; + let result = concat_func.invoke_with_args(func_args)?; + + // Step 3: Apply NULL mask to result + apply_null_mask(result, null_mask) +} + +/// Compute NULL mask for the arguments +/// Returns None if all scalars and any is NULL, or a Vector of +/// boolean representing the null mask for incoming arrays +fn compute_null_mask( + args: &[ColumnarValue], + number_rows: usize, +) -> Result>> { + // Check if all arguments are scalars + let all_scalars = args + .iter() + .all(|arg| matches!(arg, ColumnarValue::Scalar(_))); + + if all_scalars { + // For scalars, check if any is NULL + for arg in args { + if let ColumnarValue::Scalar(scalar) = arg { + if scalar.is_null() { + // Return None to indicate all values should be NULL + return Ok(None); + } + } + } + // No NULLs in scalars + Ok(Some(vec![])) + } else { + // For arrays, compute NULL mask for each row + let array_len = args + .iter() + .find_map(|arg| match arg { + ColumnarValue::Array(array) => Some(array.len()), + _ => None, + }) + .unwrap_or(number_rows); + + // Convert all scalars to arrays for uniform processing + let arrays: Result> = args + .iter() + .map(|arg| match arg { + 
ColumnarValue::Array(array) => Ok(Arc::clone(array)), + ColumnarValue::Scalar(scalar) => scalar.to_array_of_size(array_len), + }) + .collect(); + let arrays = arrays?; + + // Compute NULL mask + let mut null_mask = vec![false; array_len]; + for array in &arrays { + for (i, null_flag) in null_mask.iter_mut().enumerate().take(array_len) { + if array.is_null(i) { + *null_flag = true; + } + } + } + + Ok(Some(null_mask)) + } +} + +/// Apply NULL mask to the result +fn apply_null_mask( + result: ColumnarValue, + null_mask: Option>, +) -> Result { + match (result, null_mask) { + // Scalar with NULL mask means return NULL + (ColumnarValue::Scalar(_), None) => { + Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None))) + } + // Scalar without NULL mask, return as-is + (scalar @ ColumnarValue::Scalar(_), Some(mask)) if mask.is_empty() => Ok(scalar), + // Array with NULL mask + (ColumnarValue::Array(array), Some(null_mask)) if !null_mask.is_empty() => { + let array_len = array.len(); + let return_type = array.data_type(); + + let mut builder: Box = match return_type { + DataType::Utf8 => { + let string_array = array + .as_any() + .downcast_ref::() + .unwrap(); + let mut builder = + arrow::array::StringBuilder::with_capacity(array_len, 0); + for (i, &is_null) in null_mask.iter().enumerate().take(array_len) { + if is_null || string_array.is_null(i) { + builder.append_null(); + } else { + builder.append_value(string_array.value(i)); + } + } + Box::new(builder) + } + DataType::LargeUtf8 => { + let string_array = array + .as_any() + .downcast_ref::() + .unwrap(); + let mut builder = + arrow::array::LargeStringBuilder::with_capacity(array_len, 0); + for (i, &is_null) in null_mask.iter().enumerate().take(array_len) { + if is_null || string_array.is_null(i) { + builder.append_null(); + } else { + builder.append_value(string_array.value(i)); + } + } + Box::new(builder) + } + DataType::Utf8View => { + let string_array = array + .as_any() + .downcast_ref::() + .unwrap(); + let mut builder = 
+ arrow::array::StringViewBuilder::with_capacity(array_len); + for (i, &is_null) in null_mask.iter().enumerate().take(array_len) { + if is_null || string_array.is_null(i) { + builder.append_null(); + } else { + builder.append_value(string_array.value(i)); + } + } + Box::new(builder) + } + _ => { + return datafusion_common::exec_err!( + "Unsupported return type for concat: {:?}", + return_type + ); + } + }; + + Ok(ColumnarValue::Array(builder.finish())) + } + // Array without NULL mask, return as-is + (array @ ColumnarValue::Array(_), _) => Ok(array), + // Shouldn't happen + (scalar, _) => Ok(scalar), + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::function::utils::test::test_scalar_function; + use arrow::array::StringArray; + use arrow::datatypes::DataType; + use datafusion_common::Result; + + #[test] + fn test_concat_basic() -> Result<()> { + test_scalar_function!( + SparkConcat::new(), + vec![ + ColumnarValue::Scalar(ScalarValue::Utf8(Some("Spark".to_string()))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("SQL".to_string()))), + ], + Ok(Some("SparkSQL")), + &str, + DataType::Utf8, + StringArray + ); + Ok(()) + } + + #[test] + fn test_concat_with_null() -> Result<()> { + test_scalar_function!( + SparkConcat::new(), + vec![ + ColumnarValue::Scalar(ScalarValue::Utf8(Some("Spark".to_string()))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("SQL".to_string()))), + ColumnarValue::Scalar(ScalarValue::Utf8(None)), + ], + Ok(None), + &str, + DataType::Utf8, + StringArray + ); + Ok(()) + } +} diff --git a/datafusion/spark/src/function/string/mod.rs b/datafusion/spark/src/function/string/mod.rs index 3115c1e960fa8..480984f02159b 100644 --- a/datafusion/spark/src/function/string/mod.rs +++ b/datafusion/spark/src/function/string/mod.rs @@ -17,6 +17,7 @@ pub mod ascii; pub mod char; +pub mod concat; pub mod elt; pub mod format_string; pub mod ilike; @@ -30,6 +31,7 @@ use std::sync::Arc; make_udf_function!(ascii::SparkAscii, ascii); 
make_udf_function!(char::CharFunc, char); +make_udf_function!(concat::SparkConcat, concat); make_udf_function!(ilike::SparkILike, ilike); make_udf_function!(length::SparkLengthFunc, length); make_udf_function!(elt::SparkElt, elt); @@ -50,6 +52,11 @@ pub mod expr_fn { "Returns the ASCII character having the binary equivalent to col. If col is larger than 256 the result is equivalent to char(col % 256).", arg1 )); + export_functions!(( + concat, + "Concatenates multiple input strings into a single string. Returns NULL if any input is NULL.", + args + )); export_functions!(( elt, "Returns the n-th input (1-indexed), e.g. returns 2nd input when n is 2. The function returns NULL if the index is 0 or exceeds the length of the array.", @@ -86,6 +93,7 @@ pub fn functions() -> Vec> { vec![ ascii(), char(), + concat(), elt(), ilike(), length(), diff --git a/datafusion/sqllogictest/test_files/spark/string/concat.slt b/datafusion/sqllogictest/test_files/spark/string/concat.slt new file mode 100644 index 0000000000000..0b796a54a69e8 --- /dev/null +++ b/datafusion/sqllogictest/test_files/spark/string/concat.slt @@ -0,0 +1,48 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +query T +SELECT concat('Spark', 'SQL'); +---- +SparkSQL + +query T +SELECT concat('Spark', 'SQL', NULL); +---- +NULL + +query T +SELECT concat('', '1', '', '2'); +---- +12 + +query T +SELECT concat(); +---- +(empty) + +query T +SELECT concat(''); +---- +(empty) + + +query T +SELECT concat(a, b, c) from (select 'a' a, 'b' b, 'c' c union all select null a, 'b', 'c') order by 1 nulls last; +---- +abc +NULL \ No newline at end of file From 4153adf2c0f6e317ef476febfdc834208bd46622 Mon Sep 17 00:00:00 2001 From: "xudong.w" Date: Thu, 16 Oct 2025 14:28:58 +0800 Subject: [PATCH 005/109] Add independent configs for topk/join dynamic filter (#18090) * Add independent configs for topk/join dynamic filter * fix ci * update doc * fix typo --- datafusion/common/src/config.rs | 26 +- .../physical-plan/src/joins/hash_join/exec.rs | 2 +- datafusion/physical-plan/src/sorts/sort.rs | 2 +- .../dynamic_filter_pushdown_config.slt | 339 ++++++++++++++++++ .../test_files/information_schema.slt | 6 +- docs/source/user-guide/configs.md | 4 +- 6 files changed, 374 insertions(+), 5 deletions(-) create mode 100644 datafusion/sqllogictest/test_files/dynamic_filter_pushdown_config.slt diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 39d730eaafb49..9bde637f43794 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -745,11 +745,21 @@ config_namespace! { /// past window functions, if possible pub enable_window_limits: bool, default = true - /// When set to true attempts to push down dynamic filters generated by operators into the file scan phase. + /// When set to true, the optimizer will attempt to push down TopK dynamic filters + /// into the file scan phase. + pub enable_topk_dynamic_filter_pushdown: bool, default = true + + /// When set to true, the optimizer will attempt to push down Join dynamic filters + /// into the file scan phase. 
+ pub enable_join_dynamic_filter_pushdown: bool, default = true + + /// When set to true attempts to push down dynamic filters generated by operators (topk & join) into the file scan phase. /// For example, for a query such as `SELECT * FROM t ORDER BY timestamp DESC LIMIT 10`, the optimizer /// will attempt to push down the current top 10 timestamps that the TopK operator references into the file scans. /// This means that if we already have 10 timestamps in the year 2025 /// any files that only have timestamps in the year 2024 can be skipped / pruned at various stages in the scan. + /// The config will suppress `enable_join_dynamic_filter_pushdown` & `enable_topk_dynamic_filter_pushdown` + /// So if you disable `enable_topk_dynamic_filter_pushdown`, then enable `enable_dynamic_filter_pushdown`, the `enable_topk_dynamic_filter_pushdown` will be overridden. pub enable_dynamic_filter_pushdown: bool, default = true /// When set to true, the optimizer will insert filters before a join between @@ -1039,6 +1049,20 @@ impl ConfigOptions { }; if prefix == "datafusion" { + if key == "optimizer.enable_dynamic_filter_pushdown" { + let bool_value = value.parse::().map_err(|e| { + DataFusionError::Configuration(format!( + "Failed to parse '{value}' as bool: {e}", + )) + })?; + + { + self.optimizer.enable_dynamic_filter_pushdown = bool_value; + self.optimizer.enable_topk_dynamic_filter_pushdown = bool_value; + self.optimizer.enable_join_dynamic_filter_pushdown = bool_value; + } + return Ok(()); + } return ConfigField::set(self, key, value); } diff --git a/datafusion/physical-plan/src/joins/hash_join/exec.rs b/datafusion/physical-plan/src/joins/hash_join/exec.rs index 4c293b0498e77..b5fe5ee5cda14 100644 --- a/datafusion/physical-plan/src/joins/hash_join/exec.rs +++ b/datafusion/physical-plan/src/joins/hash_join/exec.rs @@ -1137,7 +1137,7 @@ impl ExecutionPlan for HashJoinExec { // Add dynamic filters in Post phase if enabled if matches!(phase, FilterPushdownPhase::Post) - && 
config.optimizer.enable_dynamic_filter_pushdown + && config.optimizer.enable_join_dynamic_filter_pushdown { // Add actual dynamic filter to right side (probe side) let dynamic_filter = Self::create_dynamic_filter(&self.on); diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index 7f47d60c735a3..bd798ab4f54b2 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -1355,7 +1355,7 @@ impl ExecutionPlan for SortExec { ChildFilterDescription::from_child(&parent_filters, self.input())?; if let Some(filter) = &self.filter { - if config.optimizer.enable_dynamic_filter_pushdown { + if config.optimizer.enable_topk_dynamic_filter_pushdown { child = child.with_self_filter(filter.read().expr()); } } diff --git a/datafusion/sqllogictest/test_files/dynamic_filter_pushdown_config.slt b/datafusion/sqllogictest/test_files/dynamic_filter_pushdown_config.slt new file mode 100644 index 0000000000000..e5cd6d88b08f4 --- /dev/null +++ b/datafusion/sqllogictest/test_files/dynamic_filter_pushdown_config.slt @@ -0,0 +1,339 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# Tests for dynamic filter pushdown configuration options +# - enable_topk_dynamic_filter_pushdown (for TopK dynamic filters) +# - enable_join_dynamic_filter_pushdown (for Join dynamic filters) +# - enable_dynamic_filter_pushdown (controls both) + +# Setup: Create parquet test files +statement ok +CREATE TABLE test_data(id INT, value INT, name VARCHAR) AS VALUES +(1, 100, 'a'), +(2, 200, 'b'), +(3, 300, 'c'), +(4, 400, 'd'), +(5, 500, 'e'), +(6, 600, 'f'), +(7, 700, 'g'), +(8, 800, 'h'), +(9, 900, 'i'), +(10, 1000, 'j'); + +statement ok +CREATE TABLE join_left(id INT, data VARCHAR) AS VALUES +(1, 'left1'), +(2, 'left2'), +(3, 'left3'), +(4, 'left4'), +(5, 'left5'); + +statement ok +CREATE TABLE join_right(id INT, info VARCHAR) AS VALUES +(1, 'right1'), +(3, 'right3'), +(5, 'right5'); + +# Copy data to parquet files +query I +COPY test_data TO 'test_files/scratch/dynamic_filter_pushdown_config/test_data.parquet' STORED AS PARQUET; +---- +10 + +query I +COPY join_left TO 'test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet' STORED AS PARQUET; +---- +5 + +query I +COPY join_right TO 'test_files/scratch/dynamic_filter_pushdown_config/join_right.parquet' STORED AS PARQUET; +---- +3 + +# Create external tables from parquet files +statement ok +CREATE EXTERNAL TABLE test_parquet(id INT, value INT, name VARCHAR) +STORED AS PARQUET +LOCATION 'test_files/scratch/dynamic_filter_pushdown_config/test_data.parquet'; + +statement ok +CREATE EXTERNAL TABLE left_parquet(id INT, data VARCHAR) +STORED AS PARQUET +LOCATION 'test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet'; + +statement ok +CREATE EXTERNAL TABLE right_parquet(id INT, info VARCHAR) +STORED AS PARQUET +LOCATION 'test_files/scratch/dynamic_filter_pushdown_config/join_right.parquet'; + +# Test 1: TopK dynamic filter pushdown with Parquet +query TT +EXPLAIN SELECT * FROM test_parquet ORDER BY value DESC LIMIT 3; +---- +logical_plan +01)Sort: test_parquet.value DESC NULLS FIRST, fetch=3 
+02)--TableScan: test_parquet projection=[id, value, name] +physical_plan +01)SortExec: TopK(fetch=3), expr=[value@1 DESC], preserve_partitioning=[false] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/test_data.parquet]]}, projection=[id, value, name], file_type=parquet, predicate=DynamicFilter [ empty ] + +# Disable TopK dynamic filter pushdown +statement ok +SET datafusion.optimizer.enable_topk_dynamic_filter_pushdown = false; + +query TT +EXPLAIN SELECT * FROM test_parquet ORDER BY value DESC LIMIT 3; +---- +logical_plan +01)Sort: test_parquet.value DESC NULLS FIRST, fetch=3 +02)--TableScan: test_parquet projection=[id, value, name] +physical_plan +01)SortExec: TopK(fetch=3), expr=[value@1 DESC], preserve_partitioning=[false] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/test_data.parquet]]}, projection=[id, value, name], file_type=parquet + +# Re-enable for next tests +statement ok +SET datafusion.optimizer.enable_topk_dynamic_filter_pushdown = true; + +# Test 2: Join dynamic filter pushdown with Parquet +query TT +EXPLAIN SELECT l.*, r.info +FROM left_parquet l +INNER JOIN right_parquet r ON l.id = r.id; +---- +logical_plan +01)Projection: l.id, l.data, r.info +02)--Inner Join: l.id = r.id +03)----SubqueryAlias: l +04)------TableScan: left_parquet projection=[id, data] +05)----SubqueryAlias: r +06)------TableScan: right_parquet projection=[id, info] +physical_plan +01)ProjectionExec: expr=[id@1 as id, data@2 as data, info@0 as info] +02)--CoalesceBatchesExec: target_batch_size=8192 +03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0)], projection=[info@1, id@2, data@3] +04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_right.parquet]]}, projection=[id, info], 
file_type=parquet +05)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id, data], file_type=parquet, predicate=DynamicFilter [ empty ] + +# Disable Join dynamic filter pushdown +statement ok +SET datafusion.optimizer.enable_join_dynamic_filter_pushdown = false; + +# With Join dynamic filter pushdown disabled, the left_parquet scan should NOT have predicate=DynamicFilter +query TT +EXPLAIN SELECT l.*, r.info +FROM left_parquet l +INNER JOIN right_parquet r ON l.id = r.id; +---- +logical_plan +01)Projection: l.id, l.data, r.info +02)--Inner Join: l.id = r.id +03)----SubqueryAlias: l +04)------TableScan: left_parquet projection=[id, data] +05)----SubqueryAlias: r +06)------TableScan: right_parquet projection=[id, info] +physical_plan +01)ProjectionExec: expr=[id@1 as id, data@2 as data, info@0 as info] +02)--CoalesceBatchesExec: target_batch_size=8192 +03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0)], projection=[info@1, id@2, data@3] +04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_right.parquet]]}, projection=[id, info], file_type=parquet +05)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id, data], file_type=parquet + +# Re-enable for next tests +statement ok +SET datafusion.optimizer.enable_join_dynamic_filter_pushdown = true; + +# Test 3: Test independent control + +# Disable TopK, keep Join enabled +statement ok +SET datafusion.optimizer.enable_topk_dynamic_filter_pushdown = false; + +statement ok +SET datafusion.optimizer.enable_join_dynamic_filter_pushdown = true; + +# Join should still have dynamic filter +query TT +EXPLAIN SELECT l.*, r.info +FROM left_parquet l +INNER JOIN right_parquet r ON l.id = r.id; +---- +logical_plan +01)Projection:
l.id, l.data, r.info +02)--Inner Join: l.id = r.id +03)----SubqueryAlias: l +04)------TableScan: left_parquet projection=[id, data] +05)----SubqueryAlias: r +06)------TableScan: right_parquet projection=[id, info] +physical_plan +01)ProjectionExec: expr=[id@1 as id, data@2 as data, info@0 as info] +02)--CoalesceBatchesExec: target_batch_size=8192 +03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0)], projection=[info@1, id@2, data@3] +04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_right.parquet]]}, projection=[id, info], file_type=parquet +05)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id, data], file_type=parquet, predicate=DynamicFilter [ empty ] + +# Enable TopK, disable Join +statement ok +SET datafusion.optimizer.enable_topk_dynamic_filter_pushdown = true; + +statement ok +SET datafusion.optimizer.enable_join_dynamic_filter_pushdown = false; + +# Join should NOT have dynamic filter +query TT +EXPLAIN SELECT l.*, r.info +FROM left_parquet l +INNER JOIN right_parquet r ON l.id = r.id; +---- +logical_plan +01)Projection: l.id, l.data, r.info +02)--Inner Join: l.id = r.id +03)----SubqueryAlias: l +04)------TableScan: left_parquet projection=[id, data] +05)----SubqueryAlias: r +06)------TableScan: right_parquet projection=[id, info] +physical_plan +01)ProjectionExec: expr=[id@1 as id, data@2 as data, info@0 as info] +02)--CoalesceBatchesExec: target_batch_size=8192 +03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0)], projection=[info@1, id@2, data@3] +04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_right.parquet]]}, projection=[id, info], file_type=parquet +05)------DataSourceExec: file_groups={1 group: 
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id, data], file_type=parquet + +# Test 4: Backward compatibility + +# First, set both new configs to specific values +statement ok +SET datafusion.optimizer.enable_topk_dynamic_filter_pushdown = true; + +statement ok +SET datafusion.optimizer.enable_join_dynamic_filter_pushdown = true; + +statement ok +set datafusion.catalog.information_schema = true + +# Setting the config should override both +statement ok +SET datafusion.optimizer.enable_dynamic_filter_pushdown = false; + +# Verify both configs are now false +query T +SELECT value FROM information_schema.df_settings +WHERE name = 'datafusion.optimizer.enable_topk_dynamic_filter_pushdown'; +---- +false + +query T +SELECT value FROM information_schema.df_settings +WHERE name = 'datafusion.optimizer.enable_join_dynamic_filter_pushdown'; +---- +false + +statement ok +set datafusion.catalog.information_schema = false + +# Join should NOT have dynamic filter +query TT +EXPLAIN SELECT l.*, r.info +FROM left_parquet l +INNER JOIN right_parquet r ON l.id = r.id; +---- +logical_plan +01)Projection: l.id, l.data, r.info +02)--Inner Join: l.id = r.id +03)----SubqueryAlias: l +04)------TableScan: left_parquet projection=[id, data] +05)----SubqueryAlias: r +06)------TableScan: right_parquet projection=[id, info] +physical_plan +01)ProjectionExec: expr=[id@1 as id, data@2 as data, info@0 as info] +02)--CoalesceBatchesExec: target_batch_size=8192 +03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0)], projection=[info@1, id@2, data@3] +04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_right.parquet]]}, projection=[id, info], file_type=parquet +05)------DataSourceExec: file_groups={1 group: 
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id, data], file_type=parquet + +# Re-enable +statement ok +SET datafusion.optimizer.enable_dynamic_filter_pushdown = true; + +statement ok +set datafusion.catalog.information_schema = true + +# Verify both configs are now true +query T +SELECT value FROM information_schema.df_settings +WHERE name = 'datafusion.optimizer.enable_topk_dynamic_filter_pushdown'; +---- +true + +query T +SELECT value FROM information_schema.df_settings +WHERE name = 'datafusion.optimizer.enable_join_dynamic_filter_pushdown'; +---- +true + +statement ok +set datafusion.catalog.information_schema = false + +# Join should have dynamic filter again +query TT +EXPLAIN SELECT l.*, r.info +FROM left_parquet l +INNER JOIN right_parquet r ON l.id = r.id; +---- +logical_plan +01)Projection: l.id, l.data, r.info +02)--Inner Join: l.id = r.id +03)----SubqueryAlias: l +04)------TableScan: left_parquet projection=[id, data] +05)----SubqueryAlias: r +06)------TableScan: right_parquet projection=[id, info] +physical_plan +01)ProjectionExec: expr=[id@1 as id, data@2 as data, info@0 as info] +02)--CoalesceBatchesExec: target_batch_size=8192 +03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0)], projection=[info@1, id@2, data@3] +04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_right.parquet]]}, projection=[id, info], file_type=parquet +05)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id, data], file_type=parquet, predicate=DynamicFilter [ empty ] + +# Cleanup + +statement ok +DROP TABLE test_data; + +statement ok +DROP TABLE join_left; + +statement ok +DROP TABLE join_right; + +statement ok +DROP TABLE test_parquet; + +statement ok +DROP TABLE 
left_parquet; + +statement ok +DROP TABLE right_parquet; + +# Reset configs to defaults +statement ok +SET datafusion.optimizer.enable_topk_dynamic_filter_pushdown = true; + +statement ok +SET datafusion.optimizer.enable_join_dynamic_filter_pushdown = true; + +statement ok +SET datafusion.optimizer.enable_dynamic_filter_pushdown = true; diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index 670992633bb85..a69a8d5c0d8f6 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -289,8 +289,10 @@ datafusion.optimizer.allow_symmetric_joins_without_pruning true datafusion.optimizer.default_filter_selectivity 20 datafusion.optimizer.enable_distinct_aggregation_soft_limit true datafusion.optimizer.enable_dynamic_filter_pushdown true +datafusion.optimizer.enable_join_dynamic_filter_pushdown true datafusion.optimizer.enable_round_robin_repartition true datafusion.optimizer.enable_topk_aggregation true +datafusion.optimizer.enable_topk_dynamic_filter_pushdown true datafusion.optimizer.enable_window_limits true datafusion.optimizer.expand_views_at_output false datafusion.optimizer.filter_null_join_keys false @@ -404,9 +406,11 @@ datafusion.format.types_info false Show types in visual representation batches datafusion.optimizer.allow_symmetric_joins_without_pruning true Should DataFusion allow symmetric hash joins for unbounded data sources even when its inputs do not have any ordering or filtering If the flag is not enabled, the SymmetricHashJoin operator will be unable to prune its internal buffers, resulting in certain join types - such as Full, Left, LeftAnti, LeftSemi, Right, RightAnti, and RightSemi - being produced only at the end of the execution. This is not typical in stream processing. Additionally, without proper design for long runner execution, all types of joins may encounter out-of-memory errors. 
datafusion.optimizer.default_filter_selectivity 20 The default filter selectivity used by Filter Statistics when an exact selectivity cannot be determined. Valid values are between 0 (no selectivity) and 100 (all rows are selected). datafusion.optimizer.enable_distinct_aggregation_soft_limit true When set to true, the optimizer will push a limit operation into grouped aggregations which have no aggregate expressions, as a soft limit, emitting groups once the limit is reached, before all rows in the group are read. -datafusion.optimizer.enable_dynamic_filter_pushdown true When set to true attempts to push down dynamic filters generated by operators into the file scan phase. For example, for a query such as `SELECT * FROM t ORDER BY timestamp DESC LIMIT 10`, the optimizer will attempt to push down the current top 10 timestamps that the TopK operator references into the file scans. This means that if we already have 10 timestamps in the year 2025 any files that only have timestamps in the year 2024 can be skipped / pruned at various stages in the scan. +datafusion.optimizer.enable_dynamic_filter_pushdown true When set to true attempts to push down dynamic filters generated by operators (topk & join) into the file scan phase. For example, for a query such as `SELECT * FROM t ORDER BY timestamp DESC LIMIT 10`, the optimizer will attempt to push down the current top 10 timestamps that the TopK operator references into the file scans. This means that if we already have 10 timestamps in the year 2025 any files that only have timestamps in the year 2024 can be skipped / pruned at various stages in the scan. The config will suppress `enable_join_dynamic_filter_pushdown` & `enable_topk_dynamic_filter_pushdown` So if you disable `enable_topk_dynamic_filter_pushdown`, then enable `enable_dynamic_filter_pushdown`, the `enable_topk_dynamic_filter_pushdown` will be overridden. 
+datafusion.optimizer.enable_join_dynamic_filter_pushdown true When set to true, the optimizer will attempt to push down Join dynamic filters into the file scan phase. datafusion.optimizer.enable_round_robin_repartition true When set to true, the physical plan optimizer will try to add round robin repartitioning to increase parallelism to leverage more CPU cores datafusion.optimizer.enable_topk_aggregation true When set to true, the optimizer will attempt to perform limit operations during aggregations, if possible +datafusion.optimizer.enable_topk_dynamic_filter_pushdown true When set to true, the optimizer will attempt to push down TopK dynamic filters into the file scan phase. datafusion.optimizer.enable_window_limits true When set to true, the optimizer will attempt to push limit operations past window functions, if possible datafusion.optimizer.expand_views_at_output false When set to true, if the returned type is a view type then the output will be coerced to a non-view. Coerces `Utf8View` to `LargeUtf8`, and `BinaryView` to `LargeBinary`. datafusion.optimizer.filter_null_join_keys false When set to true, the optimizer will insert filters before a join between a nullable and non-nullable column to filter out nulls on the nullable side. This filter can add additional overhead when the file format does not fully support predicate push down. 
diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 6bc7b90e893ad..ab3b11a8d833a 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -132,7 +132,9 @@ The following configuration settings are available: | datafusion.optimizer.enable_round_robin_repartition | true | When set to true, the physical plan optimizer will try to add round robin repartitioning to increase parallelism to leverage more CPU cores | | datafusion.optimizer.enable_topk_aggregation | true | When set to true, the optimizer will attempt to perform limit operations during aggregations, if possible | | datafusion.optimizer.enable_window_limits | true | When set to true, the optimizer will attempt to push limit operations past window functions, if possible | -| datafusion.optimizer.enable_dynamic_filter_pushdown | true | When set to true attempts to push down dynamic filters generated by operators into the file scan phase. For example, for a query such as `SELECT * FROM t ORDER BY timestamp DESC LIMIT 10`, the optimizer will attempt to push down the current top 10 timestamps that the TopK operator references into the file scans. This means that if we already have 10 timestamps in the year 2025 any files that only have timestamps in the year 2024 can be skipped / pruned at various stages in the scan. | +| datafusion.optimizer.enable_topk_dynamic_filter_pushdown | true | When set to true, the optimizer will attempt to push down TopK dynamic filters into the file scan phase. | +| datafusion.optimizer.enable_join_dynamic_filter_pushdown | true | When set to true, the optimizer will attempt to push down Join dynamic filters into the file scan phase. | +| datafusion.optimizer.enable_dynamic_filter_pushdown | true | When set to true attempts to push down dynamic filters generated by operators (topk & join) into the file scan phase. 
For example, for a query such as `SELECT * FROM t ORDER BY timestamp DESC LIMIT 10`, the optimizer will attempt to push down the current top 10 timestamps that the TopK operator references into the file scans. This means that if we already have 10 timestamps in the year 2025 any files that only have timestamps in the year 2024 can be skipped / pruned at various stages in the scan. The config will suppress `enable_join_dynamic_filter_pushdown` & `enable_topk_dynamic_filter_pushdown` So if you disable `enable_topk_dynamic_filter_pushdown`, then enable `enable_dynamic_filter_pushdown`, the `enable_topk_dynamic_filter_pushdown` will be overridden. | | datafusion.optimizer.filter_null_join_keys | false | When set to true, the optimizer will insert filters before a join between a nullable and non-nullable column to filter out nulls on the nullable side. This filter can add additional overhead when the file format does not fully support predicate push down. | | datafusion.optimizer.repartition_aggregations | true | Should DataFusion repartition data using the aggregate keys to execute aggregates in parallel using the provided `target_partitions` level | | datafusion.optimizer.repartition_file_min_size | 10485760 | Minimum total files size in bytes to perform file scan repartitioning. 
| From 3bca1bb6eb7251ef48dd639d371be2427f5ca696 Mon Sep 17 00:00:00 2001 From: Blake Orth Date: Thu, 16 Oct 2025 04:38:08 -0600 Subject: [PATCH 006/109] Adds Trace and Summary to CLI instrumented stores (#18064) - Adds the ability for a user to choose a summary only output for an instrumented object store when using the CLI - The existing "enabled" setting that displays both a summary and a detailed usage for each object store call has been renamed to `Trace` to improve clarity - Adds additional test cases for summary only and modifies existing tests to use trace - Updates user guide docs to reflect the CLI flag and command line changes --- datafusion-cli/src/command.rs | 17 +++++++--- datafusion-cli/src/main.rs | 2 +- .../src/object_storage/instrumented.rs | 31 ++++++++++++------- datafusion-cli/src/print_options.rs | 17 +++++----- datafusion-cli/tests/cli_integration.rs | 7 +++-- ...bject_store_profiling@s3_url_fallback.snap | 28 +++++++++++++++-- docs/source/user-guide/cli/usage.md | 4 +-- 7 files changed, 75 insertions(+), 31 deletions(-) diff --git a/datafusion-cli/src/command.rs b/datafusion-cli/src/command.rs index 48fb37e8a8880..3fbfe5680cfcd 100644 --- a/datafusion-cli/src/command.rs +++ b/datafusion-cli/src/command.rs @@ -128,7 +128,7 @@ impl Command { let profile_mode = mode .parse() .map_err(|_| - exec_datafusion_err!("Failed to parse input: {mode}. Valid options are disabled, enabled") + exec_datafusion_err!("Failed to parse input: {mode}. 
Valid options are disabled, summary, trace") )?; print_options .instrumented_registry @@ -165,7 +165,7 @@ impl Command { ("\\pset [NAME [VALUE]]", "set table output option\n(format)") } Self::ObjectStoreProfileMode(_) => ( - "\\object_store_profiling (disabled|enabled)", + "\\object_store_profiling (disabled|summary|trace)", "print or set object store profile mode", ), } @@ -312,13 +312,22 @@ mod tests { InstrumentedObjectStoreMode::default() ); - cmd = "object_store_profiling enabled" + cmd = "object_store_profiling summary" .parse() .expect("expected parse to succeed"); assert!(cmd.execute(&ctx, &mut print_options).await.is_ok()); assert_eq!( print_options.instrumented_registry.instrument_mode(), - InstrumentedObjectStoreMode::Enabled + InstrumentedObjectStoreMode::Summary + ); + + cmd = "object_store_profiling trace" + .parse() + .expect("expected parse to succeed"); + assert!(cmd.execute(&ctx, &mut print_options).await.is_ok()); + assert_eq!( + print_options.instrumented_registry.instrument_mode(), + InstrumentedObjectStoreMode::Trace ); cmd = "object_store_profiling does_not_exist" diff --git a/datafusion-cli/src/main.rs b/datafusion-cli/src/main.rs index 3dbe839d3c9b3..bdb2fdf5198e2 100644 --- a/datafusion-cli/src/main.rs +++ b/datafusion-cli/src/main.rs @@ -151,7 +151,7 @@ struct Args { #[clap( long, - help = "Specify the default object_store_profiling mode, defaults to 'disabled'.\n[possible values: disabled, enabled]", + help = "Specify the default object_store_profiling mode, defaults to 'disabled'.\n[possible values: disabled, summary, trace]", default_value_t = InstrumentedObjectStoreMode::Disabled )] object_store_profiling: InstrumentedObjectStoreMode, diff --git a/datafusion-cli/src/object_storage/instrumented.rs b/datafusion-cli/src/object_storage/instrumented.rs index 9252e0688c35a..cb96734f24645 100644 --- a/datafusion-cli/src/object_storage/instrumented.rs +++ b/datafusion-cli/src/object_storage/instrumented.rs @@ -48,8 +48,10 @@ pub enum 
InstrumentedObjectStoreMode { /// Disable collection of profiling data #[default] Disabled, - /// Enable collection of profiling data - Enabled, + /// Enable collection of profiling data and output a summary + Summary, + /// Enable collection of profiling data and output a summary and all details + Trace, } impl fmt::Display for InstrumentedObjectStoreMode { @@ -64,7 +66,8 @@ impl FromStr for InstrumentedObjectStoreMode { fn from_str(s: &str) -> std::result::Result { match s.to_lowercase().as_str() { "disabled" => Ok(Self::Disabled), - "enabled" => Ok(Self::Enabled), + "summary" => Ok(Self::Summary), + "trace" => Ok(Self::Trace), _ => Err(DataFusionError::Execution(format!("Unrecognized mode {s}"))), } } @@ -73,7 +76,8 @@ impl FromStr for InstrumentedObjectStoreMode { impl From for InstrumentedObjectStoreMode { fn from(value: u8) -> Self { match value { - 1 => InstrumentedObjectStoreMode::Enabled, + 1 => InstrumentedObjectStoreMode::Summary, + 2 => InstrumentedObjectStoreMode::Trace, _ => InstrumentedObjectStoreMode::Disabled, } } @@ -434,16 +438,21 @@ mod tests { InstrumentedObjectStoreMode::Disabled )); assert!(matches!( - "EnABlEd".parse().unwrap(), - InstrumentedObjectStoreMode::Enabled + "SUmMaRy".parse().unwrap(), + InstrumentedObjectStoreMode::Summary + )); + assert!(matches!( + "TRaCe".parse().unwrap(), + InstrumentedObjectStoreMode::Trace )); assert!("does_not_exist" .parse::() .is_err()); assert!(matches!(0.into(), InstrumentedObjectStoreMode::Disabled)); - assert!(matches!(1.into(), InstrumentedObjectStoreMode::Enabled)); - assert!(matches!(2.into(), InstrumentedObjectStoreMode::Disabled)); + assert!(matches!(1.into(), InstrumentedObjectStoreMode::Summary)); + assert!(matches!(2.into(), InstrumentedObjectStoreMode::Trace)); + assert!(matches!(3.into(), InstrumentedObjectStoreMode::Disabled)); } #[test] @@ -455,8 +464,8 @@ mod tests { InstrumentedObjectStoreMode::default() ); - reg = reg.with_profile_mode(InstrumentedObjectStoreMode::Enabled); - 
assert_eq!(reg.instrument_mode(), InstrumentedObjectStoreMode::Enabled); + reg = reg.with_profile_mode(InstrumentedObjectStoreMode::Trace); + assert_eq!(reg.instrument_mode(), InstrumentedObjectStoreMode::Trace); let store = object_store::memory::InMemory::new(); let url = "mem://test".parse().unwrap(); @@ -484,7 +493,7 @@ mod tests { let _ = instrumented.get(&path).await.unwrap(); assert!(instrumented.requests.lock().is_empty()); - instrumented.set_instrument_mode(InstrumentedObjectStoreMode::Enabled); + instrumented.set_instrument_mode(InstrumentedObjectStoreMode::Trace); assert!(instrumented.requests.lock().is_empty()); let _ = instrumented.get(&path).await.unwrap(); assert_eq!(instrumented.requests.lock().len(), 1); diff --git a/datafusion-cli/src/print_options.rs b/datafusion-cli/src/print_options.rs index f54de189b4ef5..01be736ca54df 100644 --- a/datafusion-cli/src/print_options.rs +++ b/datafusion-cli/src/print_options.rs @@ -188,20 +188,21 @@ impl PrintOptions { if !self.quiet { writeln!(writer, "{formatted_exec_details}")?; - if self.instrumented_registry.instrument_mode() - != InstrumentedObjectStoreMode::Disabled - { + let instrument_mode = self.instrumented_registry.instrument_mode(); + if instrument_mode != InstrumentedObjectStoreMode::Disabled { writeln!(writer, "{OBJECT_STORE_PROFILING_HEADER}")?; for store in self.instrumented_registry.stores() { let requests = store.take_requests(); if !requests.is_empty() { writeln!(writer, "{store}")?; - for req in requests.iter() { - writeln!(writer, "{req}")?; + if instrument_mode == InstrumentedObjectStoreMode::Trace { + for req in requests.iter() { + writeln!(writer, "{req}")?; + } + // Add an extra blank line to help visually organize the output + writeln!(writer)?; } - // Add an extra blank line to help visually organize the output - writeln!(writer)?; writeln!(writer, "Summaries:")?; let summaries = RequestSummary::summarize_by_operation(&requests); @@ -252,7 +253,7 @@ mod tests { print_output.clear(); 
print_options .instrumented_registry - .set_instrument_mode(InstrumentedObjectStoreMode::Enabled); + .set_instrument_mode(InstrumentedObjectStoreMode::Trace); print_options.write_output(&mut print_output, exec_out.clone())?; let out_str: String = print_output .clone() diff --git a/datafusion-cli/tests/cli_integration.rs b/datafusion-cli/tests/cli_integration.rs index a67924fef2537..56620346ed0fe 100644 --- a/datafusion-cli/tests/cli_integration.rs +++ b/datafusion-cli/tests/cli_integration.rs @@ -434,8 +434,11 @@ LOCATION 's3://data/cars.csv'; -- Initial query should not show any profiling as the object store is not instrumented yet SELECT * from CARS LIMIT 1; -\object_store_profiling enabled --- Query again to see the profiling output +\object_store_profiling trace +-- Query again to see the full profiling output +SELECT * from CARS LIMIT 1; +\object_store_profiling summary +-- Query again to see the summarized profiling output SELECT * from CARS LIMIT 1; \object_store_profiling disabled -- Final query should not show any profiling as we disabled it again diff --git a/datafusion-cli/tests/snapshots/object_store_profiling@s3_url_fallback.snap b/datafusion-cli/tests/snapshots/object_store_profiling@s3_url_fallback.snap index 50c6cc8eab99f..5c91800676a4d 100644 --- a/datafusion-cli/tests/snapshots/object_store_profiling@s3_url_fallback.snap +++ b/datafusion-cli/tests/snapshots/object_store_profiling@s3_url_fallback.snap @@ -8,7 +8,7 @@ info: AWS_ALLOW_HTTP: "true" AWS_ENDPOINT: "http://localhost:55031" AWS_SECRET_ACCESS_KEY: TEST-DataFusionPassword - stdin: "\n CREATE EXTERNAL TABLE CARS\nSTORED AS CSV\nLOCATION 's3://data/cars.csv';\n\n-- Initial query should not show any profiling as the object store is not instrumented yet\nSELECT * from CARS LIMIT 1;\n\\object_store_profiling enabled\n-- Query again to see the profiling output\nSELECT * from CARS LIMIT 1;\n\\object_store_profiling disabled\n-- Final query should not show any profiling as we disabled it 
again\nSELECT * from CARS LIMIT 1;\n" + stdin: "\n CREATE EXTERNAL TABLE CARS\nSTORED AS CSV\nLOCATION 's3://data/cars.csv';\n\n-- Initial query should not show any profiling as the object store is not instrumented yet\nSELECT * from CARS LIMIT 1;\n\\object_store_profiling trace\n-- Query again to see the full profiling output\nSELECT * from CARS LIMIT 1;\n\\object_store_profiling summary\n-- Query again to see the summarized profiling output\nSELECT * from CARS LIMIT 1;\n\\object_store_profiling disabled\n-- Final query should not show any profiling as we disabled it again\nSELECT * from CARS LIMIT 1;\n" snapshot_kind: text --- success: true @@ -26,7 +26,7 @@ exit_code: 0 1 row(s) fetched. [ELAPSED] -ObjectStore Profile mode set to Enabled +ObjectStore Profile mode set to Trace +-----+-------+---------------------+ | car | speed | time | +-----+-------+---------------------+ @@ -36,7 +36,7 @@ ObjectStore Profile mode set to Enabled [ELAPSED] Object Store Profiling -Instrumented Object Store: instrument_mode: Enabled, inner: AmazonS3(data) +Instrumented Object Store: instrument_mode: Trace, inner: AmazonS3(data) operation=Get duration=[DURATION] size=1006 path=cars.csv Summaries: @@ -50,6 +50,28 @@ size max: 1006 B size avg: 1006 B size sum: 1006 B +ObjectStore Profile mode set to Summary ++-----+-------+---------------------+ +| car | speed | time | ++-----+-------+---------------------+ +| red | 20.0 | 1996-04-12T12:05:03 | ++-----+-------+---------------------+ +1 row(s) fetched. 
+[ELAPSED] + +Object Store Profiling +Instrumented Object Store: instrument_mode: Summary, inner: AmazonS3(data) +Summaries: +Get +count: 1 +[SUMMARY_DURATION] +[SUMMARY_DURATION] +[SUMMARY_DURATION] +size min: 1006 B +size max: 1006 B +size avg: 1006 B +size sum: 1006 B + ObjectStore Profile mode set to Disabled +-----+-------+---------------------+ | car | speed | time | diff --git a/docs/source/user-guide/cli/usage.md b/docs/source/user-guide/cli/usage.md index 57a96c5d79003..29ed6b8183c26 100644 --- a/docs/source/user-guide/cli/usage.md +++ b/docs/source/user-guide/cli/usage.md @@ -65,7 +65,7 @@ OPTIONS: --object-store-profiling Specify the default object_store_profiling mode, defaults to 'disabled'. - [possible values: disabled, enabled] [default: Disabled] + [possible values: disabled, summary, trace] [default: Disabled] -p, --data-path Path to your data, default to current directory @@ -129,7 +129,7 @@ Available commands inside DataFusion CLI are: - Object Store Profiling Mode ```bash -> \object_store_profiling [disabled|enabled] +> \object_store_profiling [disabled|summary|trace] ``` ## Supported SQL From ec3ca719d6e0aee8f4ef2c9551e6374ee08ce469 Mon Sep 17 00:00:00 2001 From: Alex Huang Date: Thu, 16 Oct 2025 14:03:02 +0300 Subject: [PATCH 007/109] fix: Improve null handling in array_to_string function (#18076) * fix: Improve null handling in array_to_string function * chore --- datafusion/functions-nested/src/string.rs | 58 +++++++++++++------- datafusion/sqllogictest/test_files/array.slt | 2 +- 2 files changed, 38 insertions(+), 22 deletions(-) diff --git a/datafusion/functions-nested/src/string.rs b/datafusion/functions-nested/src/string.rs index 3373f7a9838e1..61caa3ac70764 100644 --- a/datafusion/functions-nested/src/string.rs +++ b/datafusion/functions-nested/src/string.rs @@ -369,27 +369,38 @@ pub(super) fn array_to_string_inner(args: &[ArrayRef]) -> Result { List(..) 
=> { let list_array = as_list_array(&arr)?; for i in 0..list_array.len() { - compute_array_to_string( - arg, - list_array.value(i), - delimiter.clone(), - null_string.clone(), - with_null_string, - )?; + if !list_array.is_null(i) { + compute_array_to_string( + arg, + list_array.value(i), + delimiter.clone(), + null_string.clone(), + with_null_string, + )?; + } else if with_null_string { + arg.push_str(&null_string); + arg.push_str(&delimiter); + } } Ok(arg) } FixedSizeList(..) => { let list_array = as_fixed_size_list_array(&arr)?; + for i in 0..list_array.len() { - compute_array_to_string( - arg, - list_array.value(i), - delimiter.clone(), - null_string.clone(), - with_null_string, - )?; + if !list_array.is_null(i) { + compute_array_to_string( + arg, + list_array.value(i), + delimiter.clone(), + null_string.clone(), + with_null_string, + )?; + } else if with_null_string { + arg.push_str(&null_string); + arg.push_str(&delimiter); + } } Ok(arg) @@ -397,13 +408,18 @@ pub(super) fn array_to_string_inner(args: &[ArrayRef]) -> Result { LargeList(..) 
=> { let list_array = as_large_list_array(&arr)?; for i in 0..list_array.len() { - compute_array_to_string( - arg, - list_array.value(i), - delimiter.clone(), - null_string.clone(), - with_null_string, - )?; + if !list_array.is_null(i) { + compute_array_to_string( + arg, + list_array.value(i), + delimiter.clone(), + null_string.clone(), + with_null_string, + )?; + } else if with_null_string { + arg.push_str(&null_string); + arg.push_str(&delimiter); + } } Ok(arg) diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index d8c29a323e945..f488204d6d7b6 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -4855,7 +4855,7 @@ h,-,-,-,o nil-2-nil-4-5 1|0|3 query T select array_to_string(arrow_cast([arrow_cast([NULL, 'a'], 'FixedSizeList(2, Utf8)'), NULL], 'FixedSizeList(2, FixedSizeList(2, Utf8))'), ',', '-'); ---- --,a,-,- +-,a,- # array_to_string with columns #1 From c8e0f1cf7bba4a955b09ffd5d9b1bec38d868e8a Mon Sep 17 00:00:00 2001 From: Alex Huang Date: Thu, 16 Oct 2025 14:41:28 +0300 Subject: [PATCH 008/109] feat: update .asf.yaml configuration settings (#18027) --- .asf.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.asf.yaml b/.asf.yaml index d71e7def36ad1..99fd6fac22c76 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -41,6 +41,7 @@ github: - sql enabled_merge_buttons: squash: true + squash_commit_message: PR_TITLE_AND_DESC merge: false rebase: false features: From 0a57e017c7a114936a3a6e1e16a4de6b44888342 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 16 Oct 2025 07:54:01 -0700 Subject: [PATCH 009/109] Fix extended tests on main to get CI green (#18096) ## Which issue does this PR close? - Closes https://github.com/apache/datafusion/issues/18084 ## Rationale for this change Some of the extended tests are failing because we have fixed case conditional evaluation and queries that (incorrectly) previously did not pass are now. 
## What changes are included in this PR? Update datafusion-testing pin ## Are these changes tested? I tested locally with: ```shell INCLUDE_SQLITE=true cargo test --profile release-nonlto --test sqllogictests ``` ## Are there any user-facing changes? No --- datafusion-testing | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion-testing b/datafusion-testing index 905df5f65cc9d..eccb0e4a42634 160000 --- a/datafusion-testing +++ b/datafusion-testing @@ -1 +1 @@ -Subproject commit 905df5f65cc9d0851719c21f5a4dd5cd77621f19 +Subproject commit eccb0e4a426344ef3faf534cd60e02e9c3afd3ac From 9bfa2ae770f03455eca1a0dc32e39a6a201cbe17 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 17 Oct 2025 01:55:06 +1100 Subject: [PATCH 010/109] chore(deps): bump taiki-e/install-action from 2.62.29 to 2.62.31 (#18094) Bumps [taiki-e/install-action](https://github.com/taiki-e/install-action) from 2.62.29 to 2.62.31.
Release notes

Sourced from taiki-e/install-action's releases.

2.62.31

  • Update protoc@latest to 3.33.0.

  • Update uv@latest to 0.9.3.

  • Update syft@latest to 1.34.1.

  • Update mise@latest to 2025.10.9.

  • Update cargo-shear@latest to 1.6.0.

2.62.30

  • Update vacuum@latest to 0.18.6.

  • Update zizmor@latest to 1.15.2.

Changelog

Sourced from taiki-e/install-action's changelog.

Changelog

All notable changes to this project will be documented in this file.

This project adheres to Semantic Versioning.

[Unreleased]

[2.62.31] - 2025-10-16

  • Update protoc@latest to 3.33.0.

  • Update uv@latest to 0.9.3.

  • Update syft@latest to 1.34.1.

  • Update mise@latest to 2025.10.9.

  • Update cargo-shear@latest to 1.6.0.

[2.62.30] - 2025-10-15

  • Update vacuum@latest to 0.18.6.

  • Update zizmor@latest to 1.15.2.

[2.62.29] - 2025-10-14

  • Update zizmor@latest to 1.15.1.

  • Update cargo-nextest@latest to 0.9.106.

  • Update mise@latest to 2025.10.8.

  • Update ubi@latest to 0.8.1.

[2.62.28] - 2025-10-11

  • Update release-plz@latest to 0.3.148.

  • Update cargo-sort@latest to 2.0.2.

  • Update cargo-binstall@latest to 1.15.7.

  • Update uv@latest to 0.9.2.

... (truncated)

Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=taiki-e/install-action&package-manager=github_actions&previous-version=2.62.29&new-version=2.62.31)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot merge` will merge this PR after your CI passes on it - `@dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@dependabot cancel merge` will cancel a previously requested merge and block automerging - `@dependabot reopen` will reopen this PR if it is closed - `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/audit.yml | 2 +- .github/workflows/rust.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/audit.yml b/.github/workflows/audit.yml index 5d5e9e270a65e..00bfa1e1b285f 100644 --- a/.github/workflows/audit.yml +++ b/.github/workflows/audit.yml @@ -42,7 +42,7 @@ jobs: steps: - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Install cargo-audit - uses: taiki-e/install-action@5b5de1b4da26ad411330c0454bdd72929bfcbeb2 # v2.62.29 + uses: taiki-e/install-action@0005e0116e92d8489d8d96fbff83f061c79ba95a # v2.62.31 with: tool: cargo-audit - name: Run audit check diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index ecdbf031b45b0..9fa033fce646f 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -412,7 +412,7 @@ jobs: sudo apt-get update -qq sudo apt-get install -y -qq clang - name: Setup wasm-pack - uses: taiki-e/install-action@5b5de1b4da26ad411330c0454bdd72929bfcbeb2 # v2.62.29 + uses: taiki-e/install-action@0005e0116e92d8489d8d96fbff83f061c79ba95a # v2.62.31 with: tool: wasm-pack - name: Run tests with headless mode @@ -739,7 +739,7 @@ jobs: - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - name: Install cargo-msrv - uses: taiki-e/install-action@5b5de1b4da26ad411330c0454bdd72929bfcbeb2 # v2.62.29 + uses: taiki-e/install-action@0005e0116e92d8489d8d96fbff83f061c79ba95a # v2.62.31 with: tool: cargo-msrv From b1723e5c6a6700ba939b03319377830511719aa2 Mon Sep 17 00:00:00 2001 From: Oleks V Date: Thu, 16 Oct 2025 09:29:22 -0700 Subject: [PATCH 011/109] chore: run extended suite on PRs for critical areas (#18088) ## Which issue does this PR close? - Closes #. 
Related to https://github.com/apache/datafusion/issues/18084 ## Rationale for this change Run extended suite on PRs for critical areas, to avoid post merge bugfixing ## What changes are included in this PR? ## Are these changes tested? ## Are there any user-facing changes? --------- Co-authored-by: Andrew Lamb --- .github/workflows/extended.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/extended.yml b/.github/workflows/extended.yml index 9343997e05682..23bd66a0cf352 100644 --- a/.github/workflows/extended.yml +++ b/.github/workflows/extended.yml @@ -36,6 +36,14 @@ on: # it is not expected to have many changes in these branches, # so running extended tests is not a burden - 'branch-*' + # Also run for changes to some critical areas that are most likely + # to trigger errors in extended tests + pull_request: + branches: [ '**' ] + paths: + - 'datafusion/physical*/**/*.rs' + - 'datafusion/expr*/**/*.rs' + - 'datafusion/optimizer/**/*.rs' workflow_dispatch: inputs: pr_number: From 4e03c92059d2aaa72bb8b3f076626f6111ec720d Mon Sep 17 00:00:00 2001 From: dario curreri <48800335+dariocurr@users.noreply.github.com> Date: Thu, 16 Oct 2025 19:21:24 +0200 Subject: [PATCH 012/109] refactor: add dialect enum (#18043) ## Which issue does this PR close? - Closes #18042 ## Rationale for this change This PR introduces a new dialect enum to improve type safety and code maintainability when handling different SQL dialects in DataFusion 1. Provide compile-time guarantees for dialect handling 2. Improve code readability and self-documentation 3. Enable better IDE support and autocomplete ## What changes are included in this PR? - Added a new `Dialect` enum to represent supported SQL dialects - Refactored existing code to use the new enum instead of previous representations - Modified tests to work with the new enum-based approach ## Are these changes tested? Yes ## Are there any user-facing changes? 
Yes, this is an API change: the type of the `dialect` field changed from `String` to `Dialect` --- Cargo.lock | 1 + datafusion-cli/Cargo.toml | 5 +- datafusion-cli/src/helper.rs | 19 ++-- datafusion-cli/src/highlighter.rs | 10 ++- .../examples/remote_catalog.rs | 4 +- datafusion/common/src/config.rs | 90 ++++++++++++++++++- datafusion/core/benches/sql_planner.rs | 7 +- .../core/src/execution/session_state.rs | 27 +++--- .../tests/user_defined/insert_operation.rs | 5 +- docs/source/library-user-guide/upgrading.md | 8 ++ 10 files changed, 141 insertions(+), 35 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bbf64d5262e29..7b09121595d67 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1936,6 +1936,7 @@ dependencies = [ "clap 4.5.48", "ctor", "datafusion", + "datafusion-common", "dirs", "env_logger", "futures", diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index d186cd711945d..53744e6c609b8 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -40,7 +40,7 @@ async-trait = { workspace = true } aws-config = "1.8.7" aws-credential-types = "1.2.7" chrono = { workspace = true } -clap = { version = "4.5.47", features = ["derive", "cargo"] } +clap = { version = "4.5.47", features = ["cargo", "derive"] } datafusion = { workspace = true, features = [ "avro", "compression", @@ -55,6 +55,7 @@ datafusion = { workspace = true, features = [ "sql", "unicode_expressions", ] } +datafusion-common = { workspace = true } dirs = "6.0.0" env_logger = { workspace = true } futures = { workspace = true } @@ -65,7 +66,7 @@ parking_lot = { workspace = true } parquet = { workspace = true, default-features = false } regex = { workspace = true } rustyline = "17.0" -tokio = { workspace = true, features = ["macros", "rt", "rt-multi-thread", "sync", "parking_lot", "signal"] } +tokio = { workspace = true, features = ["macros", "parking_lot", "rt", "rt-multi-thread", "signal", "sync"] } url = { workspace = true } [dev-dependencies] diff --git 
a/datafusion-cli/src/helper.rs b/datafusion-cli/src/helper.rs index 64c34c4737369..219637b3460e6 100644 --- a/datafusion-cli/src/helper.rs +++ b/datafusion-cli/src/helper.rs @@ -24,6 +24,7 @@ use crate::highlighter::{NoSyntaxHighlighter, SyntaxHighlighter}; use datafusion::sql::parser::{DFParser, Statement}; use datafusion::sql::sqlparser::dialect::dialect_from_str; +use datafusion_common::config::Dialect; use rustyline::completion::{Completer, FilenameCompleter, Pair}; use rustyline::error::ReadlineError; @@ -34,12 +35,12 @@ use rustyline::{Context, Helper, Result}; pub struct CliHelper { completer: FilenameCompleter, - dialect: String, + dialect: Dialect, highlighter: Box, } impl CliHelper { - pub fn new(dialect: &str, color: bool) -> Self { + pub fn new(dialect: &Dialect, color: bool) -> Self { let highlighter: Box = if !color { Box::new(NoSyntaxHighlighter {}) } else { @@ -47,20 +48,20 @@ impl CliHelper { }; Self { completer: FilenameCompleter::new(), - dialect: dialect.into(), + dialect: *dialect, highlighter, } } - pub fn set_dialect(&mut self, dialect: &str) { - if dialect != self.dialect { - self.dialect = dialect.to_string(); + pub fn set_dialect(&mut self, dialect: &Dialect) { + if *dialect != self.dialect { + self.dialect = *dialect; } } fn validate_input(&self, input: &str) -> Result { if let Some(sql) = input.strip_suffix(';') { - let dialect = match dialect_from_str(&self.dialect) { + let dialect = match dialect_from_str(self.dialect) { Some(dialect) => dialect, None => { return Ok(ValidationResult::Invalid(Some(format!( @@ -97,7 +98,7 @@ impl CliHelper { impl Default for CliHelper { fn default() -> Self { - Self::new("generic", false) + Self::new(&Dialect::Generic, false) } } @@ -289,7 +290,7 @@ mod tests { ); // valid in postgresql dialect - validator.set_dialect("postgresql"); + validator.set_dialect(&Dialect::PostgreSQL); let result = readline_direct(Cursor::new(r"select 1 # 2;".as_bytes()), &validator)?; assert!(matches!(result, 
ValidationResult::Valid(None))); diff --git a/datafusion-cli/src/highlighter.rs b/datafusion-cli/src/highlighter.rs index 7a886b94740bd..f4e57a2e3593a 100644 --- a/datafusion-cli/src/highlighter.rs +++ b/datafusion-cli/src/highlighter.rs @@ -27,6 +27,7 @@ use datafusion::sql::sqlparser::{ keywords::Keyword, tokenizer::{Token, Tokenizer}, }; +use datafusion_common::config; use rustyline::highlight::{CmdKind, Highlighter}; /// The syntax highlighter. @@ -36,7 +37,7 @@ pub struct SyntaxHighlighter { } impl SyntaxHighlighter { - pub fn new(dialect: &str) -> Self { + pub fn new(dialect: &config::Dialect) -> Self { let dialect = dialect_from_str(dialect).unwrap_or(Box::new(GenericDialect {})); Self { dialect } } @@ -93,13 +94,14 @@ impl Color { #[cfg(test)] mod tests { + use super::config::Dialect; use super::SyntaxHighlighter; use rustyline::highlight::Highlighter; #[test] fn highlighter_valid() { let s = "SElect col_a from tab_1;"; - let highlighter = SyntaxHighlighter::new("generic"); + let highlighter = SyntaxHighlighter::new(&Dialect::Generic); let out = highlighter.highlight(s, s.len()); assert_eq!( "\u{1b}[91mSElect\u{1b}[0m col_a \u{1b}[91mfrom\u{1b}[0m tab_1;", @@ -110,7 +112,7 @@ mod tests { #[test] fn highlighter_valid_with_new_line() { let s = "SElect col_a from tab_1\n WHERE col_b = 'なにか';"; - let highlighter = SyntaxHighlighter::new("generic"); + let highlighter = SyntaxHighlighter::new(&Dialect::Generic); let out = highlighter.highlight(s, s.len()); assert_eq!( "\u{1b}[91mSElect\u{1b}[0m col_a \u{1b}[91mfrom\u{1b}[0m tab_1\n \u{1b}[91mWHERE\u{1b}[0m col_b = \u{1b}[92m'なにか'\u{1b}[0m;", @@ -121,7 +123,7 @@ mod tests { #[test] fn highlighter_invalid() { let s = "SElect col_a from tab_1 WHERE col_b = ';"; - let highlighter = SyntaxHighlighter::new("generic"); + let highlighter = SyntaxHighlighter::new(&Dialect::Generic); let out = highlighter.highlight(s, s.len()); assert_eq!("SElect col_a from tab_1 WHERE col_b = ';", out); } diff --git 
a/datafusion-examples/examples/remote_catalog.rs b/datafusion-examples/examples/remote_catalog.rs index 70c0963545e08..74575554ec0af 100644 --- a/datafusion-examples/examples/remote_catalog.rs +++ b/datafusion-examples/examples/remote_catalog.rs @@ -75,8 +75,8 @@ async fn main() -> Result<()> { let state = ctx.state(); // First, parse the SQL (but don't plan it / resolve any table references) - let dialect = state.config().options().sql_parser.dialect.as_str(); - let statement = state.sql_to_statement(sql, dialect)?; + let dialect = state.config().options().sql_parser.dialect; + let statement = state.sql_to_statement(sql, &dialect)?; // Find all `TableReferences` in the parsed queries. These correspond to the // tables referred to by the query (in this case diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 9bde637f43794..126935a1de90b 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -258,7 +258,7 @@ config_namespace! { /// Configure the SQL dialect used by DataFusion's parser; supported values include: Generic, /// MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, Ansi, DuckDB and Databricks. - pub dialect: String, default = "generic".to_string() + pub dialect: Dialect, default = Dialect::Generic // no need to lowercase because `sqlparser::dialect_from_str`] is case-insensitive /// If true, permit lengths for `VARCHAR` such as `VARCHAR(20)`, but @@ -292,6 +292,94 @@ config_namespace! { } } +/// This is the SQL dialect used by DataFusion's parser. 
+/// This mirrors [sqlparser::dialect::Dialect](https://docs.rs/sqlparser/latest/sqlparser/dialect/trait.Dialect.html) +/// trait in order to offer an easier API and avoid adding the `sqlparser` dependency +#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)] +pub enum Dialect { + #[default] + Generic, + MySQL, + PostgreSQL, + Hive, + SQLite, + Snowflake, + Redshift, + MsSQL, + ClickHouse, + BigQuery, + Ansi, + DuckDB, + Databricks, +} + +impl AsRef for Dialect { + fn as_ref(&self) -> &str { + match self { + Self::Generic => "generic", + Self::MySQL => "mysql", + Self::PostgreSQL => "postgresql", + Self::Hive => "hive", + Self::SQLite => "sqlite", + Self::Snowflake => "snowflake", + Self::Redshift => "redshift", + Self::MsSQL => "mssql", + Self::ClickHouse => "clickhouse", + Self::BigQuery => "bigquery", + Self::Ansi => "ansi", + Self::DuckDB => "duckdb", + Self::Databricks => "databricks", + } + } +} + +impl FromStr for Dialect { + type Err = DataFusionError; + + fn from_str(s: &str) -> Result { + let value = match s.to_ascii_lowercase().as_str() { + "generic" => Self::Generic, + "mysql" => Self::MySQL, + "postgresql" | "postgres" => Self::PostgreSQL, + "hive" => Self::Hive, + "sqlite" => Self::SQLite, + "snowflake" => Self::Snowflake, + "redshift" => Self::Redshift, + "mssql" => Self::MsSQL, + "clickhouse" => Self::ClickHouse, + "bigquery" => Self::BigQuery, + "ansi" => Self::Ansi, + "duckdb" => Self::DuckDB, + "databricks" => Self::Databricks, + other => { + let error_message = format!( + "Invalid Dialect: {other}. 
Expected one of: Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, Ansi, DuckDB, Databricks" + ); + return Err(DataFusionError::Configuration(error_message)); + } + }; + Ok(value) + } +} + +impl ConfigField for Dialect { + fn visit(&self, v: &mut V, key: &str, description: &'static str) { + v.some(key, self, description) + } + + fn set(&mut self, _: &str, value: &str) -> Result<()> { + *self = Self::from_str(value)?; + Ok(()) + } +} + +impl Display for Dialect { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let str = self.as_ref(); + write!(f, "{str}") + } +} + #[derive(Debug, Default, Clone, Copy, PartialEq, Eq)] pub enum SpillCompression { Zstd, diff --git a/datafusion/core/benches/sql_planner.rs b/datafusion/core/benches/sql_planner.rs index 3be8668b2b8c4..83563099cad67 100644 --- a/datafusion/core/benches/sql_planner.rs +++ b/datafusion/core/benches/sql_planner.rs @@ -30,7 +30,7 @@ use criterion::Bencher; use datafusion::datasource::MemTable; use datafusion::execution::context::SessionContext; use datafusion::prelude::DataFrame; -use datafusion_common::ScalarValue; +use datafusion_common::{config::Dialect, ScalarValue}; use datafusion_expr::Expr::Literal; use datafusion_expr::{cast, col, lit, not, try_cast, when}; use datafusion_functions::expr_fn::{ @@ -288,7 +288,10 @@ fn benchmark_with_param_values_many_columns( } // SELECT max(attr0), ..., max(attrN) FROM t1. 
let query = format!("SELECT {aggregates} FROM t1"); - let statement = ctx.state().sql_to_statement(&query, "Generic").unwrap(); + let statement = ctx + .state() + .sql_to_statement(&query, &Dialect::Generic) + .unwrap(); let plan = rt.block_on(async { ctx.state().statement_to_plan(statement).await.unwrap() }); b.iter(|| { diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index b04004dd495c8..6749ddd7ab8d5 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -30,15 +30,14 @@ use crate::datasource::provider_as_source; use crate::execution::context::{EmptySerializerRegistry, FunctionFactory, QueryPlanner}; use crate::execution::SessionStateDefaults; use crate::physical_planner::{DefaultPhysicalPlanner, PhysicalPlanner}; +use arrow::datatypes::DataType; use datafusion_catalog::information_schema::{ InformationSchemaProvider, INFORMATION_SCHEMA, }; - -use arrow::datatypes::DataType; use datafusion_catalog::MemoryCatalogProviderList; use datafusion_catalog::{TableFunction, TableFunctionImpl}; use datafusion_common::alias::AliasGenerator; -use datafusion_common::config::{ConfigExtension, ConfigOptions, TableOptions}; +use datafusion_common::config::{ConfigExtension, ConfigOptions, Dialect, TableOptions}; use datafusion_common::display::{PlanType, StringifiedPlan, ToStringifiedPlan}; use datafusion_common::tree_node::TreeNode; use datafusion_common::{ @@ -374,7 +373,7 @@ impl SessionState { pub fn sql_to_statement( &self, sql: &str, - dialect: &str, + dialect: &Dialect, ) -> datafusion_common::Result { let dialect = dialect_from_str(dialect).ok_or_else(|| { plan_datafusion_err!( @@ -411,7 +410,7 @@ impl SessionState { pub fn sql_to_expr( &self, sql: &str, - dialect: &str, + dialect: &Dialect, ) -> datafusion_common::Result { self.sql_to_expr_with_alias(sql, dialect).map(|x| x.expr) } @@ -423,7 +422,7 @@ impl SessionState { pub fn sql_to_expr_with_alias( 
&self, sql: &str, - dialect: &str, + dialect: &Dialect, ) -> datafusion_common::Result { let dialect = dialect_from_str(dialect).ok_or_else(|| { plan_datafusion_err!( @@ -527,8 +526,8 @@ impl SessionState { &self, sql: &str, ) -> datafusion_common::Result { - let dialect = self.config.options().sql_parser.dialect.as_str(); - let statement = self.sql_to_statement(sql, dialect)?; + let dialect = self.config.options().sql_parser.dialect; + let statement = self.sql_to_statement(sql, &dialect)?; let plan = self.statement_to_plan(statement).await?; Ok(plan) } @@ -542,9 +541,9 @@ impl SessionState { sql: &str, df_schema: &DFSchema, ) -> datafusion_common::Result { - let dialect = self.config.options().sql_parser.dialect.as_str(); + let dialect = self.config.options().sql_parser.dialect; - let sql_expr = self.sql_to_expr_with_alias(sql, dialect)?; + let sql_expr = self.sql_to_expr_with_alias(sql, &dialect)?; let provider = SessionContextProvider { state: self, @@ -2034,6 +2033,7 @@ mod tests { use arrow::array::{ArrayRef, Int32Array, RecordBatch, StringArray}; use arrow::datatypes::{DataType, Field, Schema}; use datafusion_catalog::MemoryCatalogProviderList; + use datafusion_common::config::Dialect; use datafusion_common::DFSchema; use datafusion_common::Result; use datafusion_execution::config::SessionConfig; @@ -2059,8 +2059,8 @@ mod tests { let sql = "[1,2,3]"; let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]); let df_schema = DFSchema::try_from(schema)?; - let dialect = state.config.options().sql_parser.dialect.as_str(); - let sql_expr = state.sql_to_expr(sql, dialect)?; + let dialect = state.config.options().sql_parser.dialect; + let sql_expr = state.sql_to_expr(sql, &dialect)?; let query = SqlToRel::new_with_options(&provider, state.get_parser_options()); query.sql_to_expr(sql_expr, &df_schema, &mut PlannerContext::new()) @@ -2218,7 +2218,8 @@ mod tests { } let state = &context_provider.state; - let statement = state.sql_to_statement("select 
count(*) from t", "mysql")?; + let statement = + state.sql_to_statement("select count(*) from t", &Dialect::MySQL)?; let plan = SqlToRel::new(&context_provider).statement_to_plan(statement)?; state.create_physical_plan(&plan).await } diff --git a/datafusion/core/tests/user_defined/insert_operation.rs b/datafusion/core/tests/user_defined/insert_operation.rs index c8a4279a42110..e0a3e98604ae4 100644 --- a/datafusion/core/tests/user_defined/insert_operation.rs +++ b/datafusion/core/tests/user_defined/insert_operation.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use std::{any::Any, sync::Arc}; +use std::{any::Any, str::FromStr, sync::Arc}; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use async_trait::async_trait; @@ -24,6 +24,7 @@ use datafusion::{ prelude::{SessionConfig, SessionContext}, }; use datafusion_catalog::{Session, TableProvider}; +use datafusion_common::config::Dialect; use datafusion_expr::{dml::InsertOp, Expr, TableType}; use datafusion_physical_expr::{EquivalenceProperties, Partitioning}; use datafusion_physical_plan::execution_plan::SchedulingType; @@ -63,7 +64,7 @@ async fn assert_insert_op(ctx: &SessionContext, sql: &str, insert_op: InsertOp) fn session_ctx_with_dialect(dialect: impl Into) -> SessionContext { let mut config = SessionConfig::new(); let options = config.options_mut(); - options.sql_parser.dialect = dialect.into(); + options.sql_parser.dialect = Dialect::from_str(&dialect.into()).unwrap(); SessionContext::new_with_config(config) } diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index e93659872565b..0b9da1b5a86ae 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -86,6 +86,14 @@ See [issue #17601] for more details. 
[issue #17601]: https://github.com/apache/datafusion/issues/17601 +### `SessionState`'s `sql_to_statement` method takes `Dialect` rather than a `str` + +The `dialect` parameter of `sql_to_statement` method defined in `datafusion::execution::session_state::SessionState` +has changed from `&str` to `&Dialect`. +`Dialect` is an enum defined in the `datafusion-common` +crate under the `config` module that provides type safety +and better validation for SQL dialect selection + ## DataFusion `50.0.0` ### ListingTable automatically detects Hive Partitioned tables From ea83c2644eb559e55401ce2f7f975032e8d7845d Mon Sep 17 00:00:00 2001 From: Pepijn Van Eeckhoudt Date: Thu, 16 Oct 2025 19:21:48 +0200 Subject: [PATCH 013/109] #17982 Make `nvl` a thin wrapper for `coalesce` (#17991) ## Which issue does this PR close? - Closes #17982 ## Rationale for this change By making `NVLFunc` a wrapper for `CoalesceFunc` with a more restrictive signature the implementation automatically benefits from any optimisation work related to `coalesce`. ## What changes are included in this PR? - Make `NVLFunc` a thin wrapper of `CoalesceFunc`. This seemed like the simplest way to reuse the coalesce logic, but keep the stricter signature of `nvl`. - Add `ScalarUDF::conditional_arguments` as a more precise complement to `ScalarUDF::short_circuits`. By letting each function expose which arguments are eager and which are lazy, we provide more precise information to the optimizer which may enable better optimisation. ## Are these changes tested? Assumed to be covered by sql logic tests. Unit tests for the custom implementation were removed since those are no longer relevant. ## Are there any user-facing changes? The rewriting of `nvl` to `case when ... then ... else ... end` is visible in the physical query plan. 
--------- Co-authored-by: Andrew Lamb --- datafusion/expr/src/udf.rs | 55 +++- datafusion/functions/src/core/coalesce.rs | 11 +- datafusion/functions/src/core/nvl.rs | 240 +++--------------- .../optimizer/src/common_subexpr_eliminate.rs | 6 +- datafusion/sqllogictest/test_files/nvl.slt | 35 +++ .../test_files/string/string_view.slt | 2 +- .../source/user-guide/sql/scalar_functions.md | 2 +- 7 files changed, 141 insertions(+), 210 deletions(-) diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index d522158f7b6b7..de81ec5f0bacf 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -252,7 +252,21 @@ impl ScalarUDF { Ok(result) } - /// Get the circuits of inner implementation + /// Determines which of the arguments passed to this function are evaluated eagerly + /// and which may be evaluated lazily. + /// + /// See [ScalarUDFImpl::conditional_arguments] for more information. + pub fn conditional_arguments<'a>( + &self, + args: &'a [Expr], + ) -> Option<(Vec<&'a Expr>, Vec<&'a Expr>)> { + self.inner.conditional_arguments(args) + } + + /// Returns true if some of this `exprs` subexpressions may not be evaluated + /// and thus any side effects (like divide by zero) may not be encountered. + /// + /// See [ScalarUDFImpl::short_circuits] for more information. pub fn short_circuits(&self) -> bool { self.inner.short_circuits() } @@ -656,10 +670,42 @@ pub trait ScalarUDFImpl: Debug + DynEq + DynHash + Send + Sync { /// /// Setting this to true prevents certain optimizations such as common /// subexpression elimination + /// + /// When overriding this function to return `true`, [ScalarUDFImpl::conditional_arguments] can also be + /// overridden to report more accurately which arguments are eagerly evaluated and which ones + /// lazily. fn short_circuits(&self) -> bool { false } + /// Determines which of the arguments passed to this function are evaluated eagerly + /// and which may be evaluated lazily. 
+ /// + /// If this function returns `None`, all arguments are eagerly evaluated. + /// Returning `None` is a micro optimization that saves a needless `Vec` + /// allocation. + /// + /// If the function returns `Some`, returns (`eager`, `lazy`) where `eager` + /// are the arguments that are always evaluated, and `lazy` are the + /// arguments that may be evaluated lazily (i.e. may not be evaluated at all + /// in some cases). + /// + /// Implementations must ensure that the two returned `Vec`s are disjoint, + /// and that each argument from `args` is present in one of the two `Vec`s. + /// + /// When overriding this function, [ScalarUDFImpl::short_circuits] must + /// be overridden to return `true`. + fn conditional_arguments<'a>( + &self, + args: &'a [Expr], + ) -> Option<(Vec<&'a Expr>, Vec<&'a Expr>)> { + if self.short_circuits() { + Some((vec![], args.iter().collect())) + } else { + None + } + } + + /// Computes the output [`Interval`] for a [`ScalarUDFImpl`], given the input /// intervals.
/// @@ -845,6 +891,13 @@ impl ScalarUDFImpl for AliasedScalarUDFImpl { self.inner.simplify(args, info) } + fn conditional_arguments<'a>( + &self, + args: &'a [Expr], + ) -> Option<(Vec<&'a Expr>, Vec<&'a Expr>)> { + self.inner.conditional_arguments(args) + } + fn short_circuits(&self) -> bool { self.inner.short_circuits() } diff --git a/datafusion/functions/src/core/coalesce.rs b/datafusion/functions/src/core/coalesce.rs index 3fba539dd04b4..aab1f445d5590 100644 --- a/datafusion/functions/src/core/coalesce.rs +++ b/datafusion/functions/src/core/coalesce.rs @@ -47,7 +47,7 @@ use std::any::Any; )] #[derive(Debug, PartialEq, Eq, Hash)] pub struct CoalesceFunc { - signature: Signature, + pub(super) signature: Signature, } impl Default for CoalesceFunc { @@ -126,6 +126,15 @@ impl ScalarUDFImpl for CoalesceFunc { internal_err!("coalesce should have been simplified to case") } + fn conditional_arguments<'a>( + &self, + args: &'a [Expr], + ) -> Option<(Vec<&'a Expr>, Vec<&'a Expr>)> { + let eager = vec![&args[0]]; + let lazy = args[1..].iter().collect(); + Some((eager, lazy)) + } + fn short_circuits(&self) -> bool { true } diff --git a/datafusion/functions/src/core/nvl.rs b/datafusion/functions/src/core/nvl.rs index c8b34c4b17800..0b9968a88fc95 100644 --- a/datafusion/functions/src/core/nvl.rs +++ b/datafusion/functions/src/core/nvl.rs @@ -15,21 +15,19 @@ // specific language governing permissions and limitations // under the License. 
-use arrow::array::Array; -use arrow::compute::is_not_null; -use arrow::compute::kernels::zip::zip; -use arrow::datatypes::DataType; -use datafusion_common::{utils::take_function_args, Result}; +use crate::core::coalesce::CoalesceFunc; +use arrow::datatypes::{DataType, FieldRef}; +use datafusion_common::Result; +use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo}; use datafusion_expr::{ - ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature, - Volatility, + ColumnarValue, Documentation, Expr, ReturnFieldArgs, ScalarFunctionArgs, + ScalarUDFImpl, Signature, Volatility, }; use datafusion_macros::user_doc; -use std::sync::Arc; #[user_doc( doc_section(label = "Conditional Functions"), - description = "Returns _expression2_ if _expression1_ is NULL otherwise it returns _expression1_.", + description = "Returns _expression2_ if _expression1_ is NULL otherwise it returns _expression1_ and _expression2_ is not evaluated. This function can be used to substitute a default value for NULL values.", syntax_example = "nvl(expression1, expression2)", sql_example = r#"```sql > select nvl(null, 'a'); @@ -57,7 +55,7 @@ use std::sync::Arc; )] #[derive(Debug, PartialEq, Eq, Hash)] pub struct NVLFunc { - signature: Signature, + coalesce: CoalesceFunc, aliases: Vec, } @@ -90,11 +88,13 @@ impl Default for NVLFunc { impl NVLFunc { pub fn new() -> Self { Self { - signature: Signature::uniform( - 2, - SUPPORTED_NVL_TYPES.to_vec(), - Volatility::Immutable, - ), + coalesce: CoalesceFunc { + signature: Signature::uniform( + 2, + SUPPORTED_NVL_TYPES.to_vec(), + Volatility::Immutable, + ), + }, aliases: vec![String::from("ifnull")], } } @@ -110,209 +110,45 @@ impl ScalarUDFImpl for NVLFunc { } fn signature(&self) -> &Signature { - &self.signature + &self.coalesce.signature } fn return_type(&self, arg_types: &[DataType]) -> Result { - Ok(arg_types[0].clone()) + self.coalesce.return_type(arg_types) } - fn invoke_with_args(&self, args: ScalarFunctionArgs) -> 
Result { - nvl_func(&args.args) - } - - fn aliases(&self) -> &[String] { - &self.aliases - } - - fn documentation(&self) -> Option<&Documentation> { - self.doc() + fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result { + self.coalesce.return_field_from_args(args) } -} - -fn nvl_func(args: &[ColumnarValue]) -> Result { - let [lhs, rhs] = take_function_args("nvl/ifnull", args)?; - let (lhs_array, rhs_array) = match (lhs, rhs) { - (ColumnarValue::Array(lhs), ColumnarValue::Scalar(rhs)) => { - (Arc::clone(lhs), rhs.to_array_of_size(lhs.len())?) - } - (ColumnarValue::Array(lhs), ColumnarValue::Array(rhs)) => { - (Arc::clone(lhs), Arc::clone(rhs)) - } - (ColumnarValue::Scalar(lhs), ColumnarValue::Array(rhs)) => { - (lhs.to_array_of_size(rhs.len())?, Arc::clone(rhs)) - } - (ColumnarValue::Scalar(lhs), ColumnarValue::Scalar(rhs)) => { - let mut current_value = lhs; - if lhs.is_null() { - current_value = rhs; - } - return Ok(ColumnarValue::Scalar(current_value.clone())); - } - }; - let to_apply = is_not_null(&lhs_array)?; - let value = zip(&to_apply, &lhs_array, &rhs_array)?; - Ok(ColumnarValue::Array(value)) -} - -#[cfg(test)] -mod tests { - use std::sync::Arc; - - use arrow::array::*; - use super::*; - use datafusion_common::ScalarValue; - - #[test] - fn nvl_int32() -> Result<()> { - let a = Int32Array::from(vec![ - Some(1), - Some(2), - None, - None, - Some(3), - None, - None, - Some(4), - Some(5), - ]); - let a = ColumnarValue::Array(Arc::new(a)); - - let lit_array = ColumnarValue::Scalar(ScalarValue::Int32(Some(6i32))); - - let result = nvl_func(&[a, lit_array])?; - let result = result.into_array(0).expect("Failed to convert to array"); - - let expected = Arc::new(Int32Array::from(vec![ - Some(1), - Some(2), - Some(6), - Some(6), - Some(3), - Some(6), - Some(6), - Some(4), - Some(5), - ])) as ArrayRef; - assert_eq!(expected.as_ref(), result.as_ref()); - Ok(()) + fn simplify( + &self, + args: Vec, + info: &dyn SimplifyInfo, + ) -> Result { + 
self.coalesce.simplify(args, info) } - #[test] - // Ensure that arrays with no nulls can also invoke nvl() correctly - fn nvl_int32_non_nulls() -> Result<()> { - let a = Int32Array::from(vec![1, 3, 10, 7, 8, 1, 2, 4, 5]); - let a = ColumnarValue::Array(Arc::new(a)); - - let lit_array = ColumnarValue::Scalar(ScalarValue::Int32(Some(20i32))); - - let result = nvl_func(&[a, lit_array])?; - let result = result.into_array(0).expect("Failed to convert to array"); - - let expected = Arc::new(Int32Array::from(vec![ - Some(1), - Some(3), - Some(10), - Some(7), - Some(8), - Some(1), - Some(2), - Some(4), - Some(5), - ])) as ArrayRef; - assert_eq!(expected.as_ref(), result.as_ref()); - Ok(()) + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + self.coalesce.invoke_with_args(args) } - #[test] - fn nvl_boolean() -> Result<()> { - let a = BooleanArray::from(vec![Some(true), Some(false), None]); - let a = ColumnarValue::Array(Arc::new(a)); - - let lit_array = ColumnarValue::Scalar(ScalarValue::Boolean(Some(false))); - - let result = nvl_func(&[a, lit_array])?; - let result = result.into_array(0).expect("Failed to convert to array"); - - let expected = Arc::new(BooleanArray::from(vec![ - Some(true), - Some(false), - Some(false), - ])) as ArrayRef; - - assert_eq!(expected.as_ref(), result.as_ref()); - Ok(()) + fn conditional_arguments<'a>( + &self, + args: &'a [Expr], + ) -> Option<(Vec<&'a Expr>, Vec<&'a Expr>)> { + self.coalesce.conditional_arguments(args) } - #[test] - fn nvl_string() -> Result<()> { - let a = StringArray::from(vec![Some("foo"), Some("bar"), None, Some("baz")]); - let a = ColumnarValue::Array(Arc::new(a)); - - let lit_array = ColumnarValue::Scalar(ScalarValue::from("bax")); - - let result = nvl_func(&[a, lit_array])?; - let result = result.into_array(0).expect("Failed to convert to array"); - - let expected = Arc::new(StringArray::from(vec![ - Some("foo"), - Some("bar"), - Some("bax"), - Some("baz"), - ])) as ArrayRef; - - 
assert_eq!(expected.as_ref(), result.as_ref()); - Ok(()) + fn short_circuits(&self) -> bool { + self.coalesce.short_circuits() } - #[test] - fn nvl_literal_first() -> Result<()> { - let a = Int32Array::from(vec![Some(1), Some(2), None, None, Some(3), Some(4)]); - let a = ColumnarValue::Array(Arc::new(a)); - - let lit_array = ColumnarValue::Scalar(ScalarValue::Int32(Some(2i32))); - - let result = nvl_func(&[lit_array, a])?; - let result = result.into_array(0).expect("Failed to convert to array"); - - let expected = Arc::new(Int32Array::from(vec![ - Some(2), - Some(2), - Some(2), - Some(2), - Some(2), - Some(2), - ])) as ArrayRef; - assert_eq!(expected.as_ref(), result.as_ref()); - Ok(()) + fn aliases(&self) -> &[String] { + &self.aliases } - #[test] - fn nvl_scalar() -> Result<()> { - let a_null = ColumnarValue::Scalar(ScalarValue::Int32(None)); - let b_null = ColumnarValue::Scalar(ScalarValue::Int32(Some(2i32))); - - let result_null = nvl_func(&[a_null, b_null])?; - let result_null = result_null - .into_array(1) - .expect("Failed to convert to array"); - - let expected_null = Arc::new(Int32Array::from(vec![Some(2i32)])) as ArrayRef; - - assert_eq!(expected_null.as_ref(), result_null.as_ref()); - - let a_nnull = ColumnarValue::Scalar(ScalarValue::Int32(Some(2i32))); - let b_nnull = ColumnarValue::Scalar(ScalarValue::Int32(Some(1i32))); - - let result_nnull = nvl_func(&[a_nnull, b_nnull])?; - let result_nnull = result_nnull - .into_array(1) - .expect("Failed to convert to array"); - - let expected_nnull = Arc::new(Int32Array::from(vec![Some(2i32)])) as ArrayRef; - assert_eq!(expected_nnull.as_ref(), result_nnull.as_ref()); - - Ok(()) + fn documentation(&self) -> Option<&Documentation> { + self.doc() } } diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index ec1f8f991a8ee..2510068494591 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ 
b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -652,10 +652,8 @@ impl CSEController for ExprCSEController<'_> { // In case of `ScalarFunction`s we don't know which children are surely // executed so start visiting all children conditionally and stop the // recursion with `TreeNodeRecursion::Jump`. - Expr::ScalarFunction(ScalarFunction { func, args }) - if func.short_circuits() => - { - Some((vec![], args.iter().collect())) + Expr::ScalarFunction(ScalarFunction { func, args }) => { + func.conditional_arguments(args) } // In case of `And` and `Or` the first child is surely executed, but we diff --git a/datafusion/sqllogictest/test_files/nvl.slt b/datafusion/sqllogictest/test_files/nvl.slt index daab54307cc20..f4225148ab781 100644 --- a/datafusion/sqllogictest/test_files/nvl.slt +++ b/datafusion/sqllogictest/test_files/nvl.slt @@ -148,3 +148,38 @@ query T SELECT NVL(arrow_cast('a', 'Utf8View'), NULL); ---- a + +# nvl is implemented as a case, and short-circuits evaluation +# so the following query should not error +query I +SELECT NVL(1, 1/0); +---- +1 + +# but this one should +query error DataFusion error: Arrow error: Divide by zero error +SELECT NVL(NULL, 1/0); + +# Expect the query plan to show nvl as a case expression +query I +select NVL(int_field, 9999) FROM test; +---- +1 +2 +3 +9999 +4 +9999 + +# Expect the query plan to show nvl as a case expression +query TT +EXPLAIN select NVL(int_field, 9999) FROM test; +---- +logical_plan +01)Projection: CASE WHEN __common_expr_1 IS NOT NULL THEN __common_expr_1 ELSE Int64(9999) END AS nvl(test.int_field,Int64(9999)) +02)--Projection: CAST(test.int_field AS Int64) AS __common_expr_1 +03)----TableScan: test projection=[int_field] +physical_plan +01)ProjectionExec: expr=[CASE WHEN __common_expr_1@0 IS NOT NULL THEN __common_expr_1@0 ELSE 9999 END as nvl(test.int_field,Int64(9999))] +02)--ProjectionExec: expr=[CAST(int_field@0 AS Int64) as __common_expr_1] +03)----DataSourceExec: partitions=1, partition_sizes=[1] 
diff --git a/datafusion/sqllogictest/test_files/string/string_view.slt b/datafusion/sqllogictest/test_files/string/string_view.slt index fb67daa0b8405..4d30f572ad6fb 100644 --- a/datafusion/sqllogictest/test_files/string/string_view.slt +++ b/datafusion/sqllogictest/test_files/string/string_view.slt @@ -988,7 +988,7 @@ query TT EXPLAIN SELECT NVL(column1_utf8view, 'a') as c2 FROM test; ---- logical_plan -01)Projection: nvl(test.column1_utf8view, Utf8View("a")) AS c2 +01)Projection: CASE WHEN test.column1_utf8view IS NOT NULL THEN test.column1_utf8view ELSE Utf8View("a") END AS c2 02)--TableScan: test projection=[column1_utf8view] ## Ensure no casts for nullif diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index 9fcaac7628557..ec2faf8b3d5df 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -1056,7 +1056,7 @@ nullif(expression1, expression2) ### `nvl` -Returns _expression2_ if _expression1_ is NULL otherwise it returns _expression1_. +Returns _expression2_ if _expression1_ is NULL otherwise it returns _expression1_ and _expression2_ is not evaluated. This function can be used to substitute a default value for NULL values. 
```sql nvl(expression1, expression2) From 7c3b0d0a68d89ba0ac079c7d9adaa3d52ece1c39 Mon Sep 17 00:00:00 2001 From: Jeffrey Vo Date: Fri, 17 Oct 2025 04:23:36 +1100 Subject: [PATCH 014/109] minor: fix incorrect deprecation version & window docs (#18093) --- datafusion/functions-window-common/src/expr.rs | 2 +- datafusion/functions-window-common/src/partition.rs | 2 +- datafusion/functions/src/planner.rs | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/functions-window-common/src/expr.rs b/datafusion/functions-window-common/src/expr.rs index 774cd5182b30b..d72cd412f0175 100644 --- a/datafusion/functions-window-common/src/expr.rs +++ b/datafusion/functions-window-common/src/expr.rs @@ -37,7 +37,7 @@ impl<'a> ExpressionArgs<'a> { /// /// * `input_exprs` - The expressions passed as arguments /// to the user-defined window function. - /// * `input_types` - The data types corresponding to the + /// * `input_fields` - The fields corresponding to the /// arguments to the user-defined window function. /// pub fn new( diff --git a/datafusion/functions-window-common/src/partition.rs b/datafusion/functions-window-common/src/partition.rs index 61125e596130b..df0a815401177 100644 --- a/datafusion/functions-window-common/src/partition.rs +++ b/datafusion/functions-window-common/src/partition.rs @@ -42,7 +42,7 @@ impl<'a> PartitionEvaluatorArgs<'a> { /// /// * `input_exprs` - The expressions passed as arguments /// to the user-defined window function. - /// * `input_types` - The data types corresponding to the + /// * `input_fields` - The fields corresponding to the /// arguments to the user-defined window function. /// * `is_reversed` - Set to `true` if and only if the user-defined /// window function is reversible and is reversed. 
diff --git a/datafusion/functions/src/planner.rs b/datafusion/functions/src/planner.rs index 7228cdc07e727..ccd167997003e 100644 --- a/datafusion/functions/src/planner.rs +++ b/datafusion/functions/src/planner.rs @@ -25,7 +25,7 @@ use datafusion_expr::{ }; #[deprecated( - since = "0.50.0", + since = "50.0.0", note = "Use UnicodeFunctionPlanner and DateTimeFunctionPlanner instead" )] #[derive(Default, Debug)] From 337378ab81f6c7dab7da9000124c554d3b7ee568 Mon Sep 17 00:00:00 2001 From: Oleks V Date: Thu, 16 Oct 2025 12:36:01 -0700 Subject: [PATCH 015/109] chore: use `NullBuffer::union` for Spark `concat` (#18087) ## Which issue does this PR close? - Closes #. Followup on https://github.com/apache/datafusion/pull/18063#pullrequestreview-3341818221 ## Rationale for this change Use cheaper `NullBuffer::union` to apply null mask instead of iterator approach ## What changes are included in this PR? ## Are these changes tested? ## Are there any user-facing changes? --- .../spark/src/function/string/concat.rs | 141 +++++++----------- 1 file changed, 52 insertions(+), 89 deletions(-) diff --git a/datafusion/spark/src/function/string/concat.rs b/datafusion/spark/src/function/string/concat.rs index 0e981e7c37224..0dcc58d5bb8ed 100644 --- a/datafusion/spark/src/function/string/concat.rs +++ b/datafusion/spark/src/function/string/concat.rs @@ -15,7 +15,8 @@ // specific language governing permissions and limitations // under the License. -use arrow::array::{Array, ArrayBuilder}; +use arrow::array::Array; +use arrow::buffer::NullBuffer; use arrow::datatypes::DataType; use datafusion_common::{Result, ScalarValue}; use datafusion_expr::{ @@ -31,6 +32,10 @@ use std::sync::Arc; /// /// Concatenates multiple input strings into a single string. /// Returns NULL if any input is NULL. 
+/// +/// Differences with DataFusion concat: +/// - Support 0 arguments +/// - Return NULL if any input is NULL #[derive(Debug, PartialEq, Eq, Hash)] pub struct SparkConcat { signature: Signature, @@ -80,6 +85,16 @@ impl ScalarUDFImpl for SparkConcat { } } +/// Represents the null state for Spark concat +enum NullMaskResolution { + /// Return NULL as the result (e.g., scalar inputs with at least one NULL) + ReturnNull, + /// No null mask needed (e.g., all scalar inputs are non-NULL) + NoMask, + /// Null mask to apply for arrays + Apply(NullBuffer), +} + /// Concatenates strings, returning NULL if any input is NULL /// This is a Spark-specific wrapper around DataFusion's concat that returns NULL /// if any argument is NULL (Spark behavior), whereas DataFusion's concat ignores NULLs. @@ -103,7 +118,7 @@ fn spark_concat(args: ScalarFunctionArgs) -> Result { let null_mask = compute_null_mask(&arg_values, number_rows)?; // If all scalars and any is NULL, return NULL immediately - if null_mask.is_none() { + if matches!(null_mask, NullMaskResolution::ReturnNull) { return Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None))); } @@ -122,13 +137,11 @@ fn spark_concat(args: ScalarFunctionArgs) -> Result { apply_null_mask(result, null_mask) } -/// Compute NULL mask for the arguments -/// Returns None if all scalars and any is NULL, or a Vector of -/// boolean representing the null mask for incoming arrays +/// Compute NULL mask for the arguments using NullBuffer::union fn compute_null_mask( args: &[ColumnarValue], number_rows: usize, -) -> Result>> { +) -> Result { // Check if all arguments are scalars let all_scalars = args .iter() @@ -139,15 +152,14 @@ fn compute_null_mask( for arg in args { if let ColumnarValue::Scalar(scalar) = arg { if scalar.is_null() { - // Return None to indicate all values should be NULL - return Ok(None); + return Ok(NullMaskResolution::ReturnNull); } } } // No NULLs in scalars - Ok(Some(vec![])) + Ok(NullMaskResolution::NoMask) } else { - // For 
arrays, compute NULL mask for each row + // For arrays, compute NULL mask for each row using NullBuffer::union let array_len = args .iter() .find_map(|arg| match arg { @@ -166,99 +178,50 @@ fn compute_null_mask( .collect(); let arrays = arrays?; - // Compute NULL mask - let mut null_mask = vec![false; array_len]; - for array in &arrays { - for (i, null_flag) in null_mask.iter_mut().enumerate().take(array_len) { - if array.is_null(i) { - *null_flag = true; - } - } - } + // Use NullBuffer::union to combine all null buffers + let combined_nulls = arrays + .iter() + .map(|arr| arr.nulls()) + .fold(None, |acc, nulls| NullBuffer::union(acc.as_ref(), nulls)); - Ok(Some(null_mask)) + match combined_nulls { + Some(nulls) => Ok(NullMaskResolution::Apply(nulls)), + None => Ok(NullMaskResolution::NoMask), + } } } -/// Apply NULL mask to the result +/// Apply NULL mask to the result using NullBuffer::union fn apply_null_mask( result: ColumnarValue, - null_mask: Option>, + null_mask: NullMaskResolution, ) -> Result { match (result, null_mask) { - // Scalar with NULL mask means return NULL - (ColumnarValue::Scalar(_), None) => { + // Scalar with ReturnNull mask means return NULL + (ColumnarValue::Scalar(_), NullMaskResolution::ReturnNull) => { Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None))) } - // Scalar without NULL mask, return as-is - (scalar @ ColumnarValue::Scalar(_), Some(mask)) if mask.is_empty() => Ok(scalar), - // Array with NULL mask - (ColumnarValue::Array(array), Some(null_mask)) if !null_mask.is_empty() => { - let array_len = array.len(); - let return_type = array.data_type(); + // Scalar without mask, return as-is + (scalar @ ColumnarValue::Scalar(_), NullMaskResolution::NoMask) => Ok(scalar), + // Array with NULL mask - use NullBuffer::union to combine nulls + (ColumnarValue::Array(array), NullMaskResolution::Apply(null_mask)) => { + // Combine the result's existing nulls with our computed null mask + let combined_nulls = NullBuffer::union(array.nulls(), 
Some(&null_mask)); - let mut builder: Box = match return_type { - DataType::Utf8 => { - let string_array = array - .as_any() - .downcast_ref::() - .unwrap(); - let mut builder = - arrow::array::StringBuilder::with_capacity(array_len, 0); - for (i, &is_null) in null_mask.iter().enumerate().take(array_len) { - if is_null || string_array.is_null(i) { - builder.append_null(); - } else { - builder.append_value(string_array.value(i)); - } - } - Box::new(builder) - } - DataType::LargeUtf8 => { - let string_array = array - .as_any() - .downcast_ref::() - .unwrap(); - let mut builder = - arrow::array::LargeStringBuilder::with_capacity(array_len, 0); - for (i, &is_null) in null_mask.iter().enumerate().take(array_len) { - if is_null || string_array.is_null(i) { - builder.append_null(); - } else { - builder.append_value(string_array.value(i)); - } - } - Box::new(builder) - } - DataType::Utf8View => { - let string_array = array - .as_any() - .downcast_ref::() - .unwrap(); - let mut builder = - arrow::array::StringViewBuilder::with_capacity(array_len); - for (i, &is_null) in null_mask.iter().enumerate().take(array_len) { - if is_null || string_array.is_null(i) { - builder.append_null(); - } else { - builder.append_value(string_array.value(i)); - } - } - Box::new(builder) - } - _ => { - return datafusion_common::exec_err!( - "Unsupported return type for concat: {:?}", - return_type - ); - } - }; + // Create new array with combined nulls + let new_array = array + .into_data() + .into_builder() + .nulls(combined_nulls) + .build()?; - Ok(ColumnarValue::Array(builder.finish())) + Ok(ColumnarValue::Array(Arc::new(arrow::array::make_array( + new_array, + )))) } // Array without NULL mask, return as-is - (array @ ColumnarValue::Array(_), _) => Ok(array), - // Shouldn't happen + (array @ ColumnarValue::Array(_), NullMaskResolution::NoMask) => Ok(array), + // Edge cases that shouldn't happen in practice (scalar, _) => Ok(scalar), } } From cadf42955146fd91cf594e0375eb932752457f55 Mon Sep 
17 00:00:00 2001 From: Khanh Duong Date: Fri, 17 Oct 2025 09:56:36 +0900 Subject: [PATCH 016/109] feat: support `null_treatment`, `distinct`, and `filter` for window functions in proto (#18024) ## Which issue does this PR close? - Closes #17417. ## Rationale for this change - Support `null_treatment`, `distinct`, and `filter` for window function in proto. - Support `null_treatment` for aggregate udf in proto. ## What changes are included in this PR? - [x] Add `null_treatment`, `distinct`, `filter` fields to `WindowExprNode` message and handle them in `to/from_proto.rs`. - [x] Add `null_treatment` field to `AggregateUDFExprNode` message and handle them in `to/from_proto.rs`. - [ ] Docs update: I'm not sure where to add docs as declared in the issue description. ## Are these changes tested? - Add tests to `roundtrip_window` for respectnulls, ignorenulls, distinct, filter. - Add tests to `roundtrip_aggregate_udf` for respectnulls, ignorenulls. ## Are there any user-facing changes? N/A --------- Co-authored-by: Jeffrey Vo --- datafusion/proto/proto/datafusion.proto | 11 +- datafusion/proto/src/generated/pbjson.rs | 145 ++++++++++++++++++ datafusion/proto/src/generated/prost.rs | 36 ++++- .../proto/src/logical_plan/from_proto.rs | 87 +++++++---- datafusion/proto/src/logical_plan/to_proto.rs | 33 +++- .../tests/cases/roundtrip_logical_plan.rs | 93 +++++++++-- 6 files changed, 359 insertions(+), 46 deletions(-) diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto index ee9ac0e7902d3..11103472ae2ae 100644 --- a/datafusion/proto/proto/datafusion.proto +++ b/datafusion/proto/proto/datafusion.proto @@ -518,6 +518,7 @@ message AggregateUDFExprNode { LogicalExprNode filter = 3; repeated SortExprNode order_by = 4; optional bytes fun_definition = 6; + optional NullTreatment null_treatment = 7; } message ScalarUDFExprNode { @@ -538,6 +539,9 @@ message WindowExprNode { // repeated LogicalExprNode filter = 7; WindowFrame window_frame = 8; 
optional bytes fun_definition = 10; + optional NullTreatment null_treatment = 11; + bool distinct = 12; + LogicalExprNode filter = 13; } message BetweenNode { @@ -622,6 +626,11 @@ message WindowFrameBound { datafusion_common.ScalarValue bound_value = 2; } +enum NullTreatment { + RESPECT_NULLS = 0; + IGNORE_NULLS = 1; +} + /////////////////////////////////////////////////////////////////////////////////////////////////// // Arrow Data Types /////////////////////////////////////////////////////////////////////////////////////////////////// @@ -1365,4 +1374,4 @@ message SortMergeJoinExecNode { JoinFilter filter = 5; repeated SortExprNode sort_options = 6; datafusion_common.NullEquality null_equality = 7; -} \ No newline at end of file +} diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs index 29967d812000f..b34da2c312de0 100644 --- a/datafusion/proto/src/generated/pbjson.rs +++ b/datafusion/proto/src/generated/pbjson.rs @@ -596,6 +596,9 @@ impl serde::Serialize for AggregateUdfExprNode { if self.fun_definition.is_some() { len += 1; } + if self.null_treatment.is_some() { + len += 1; + } let mut struct_ser = serializer.serialize_struct("datafusion.AggregateUDFExprNode", len)?; if !self.fun_name.is_empty() { struct_ser.serialize_field("funName", &self.fun_name)?; @@ -617,6 +620,11 @@ impl serde::Serialize for AggregateUdfExprNode { #[allow(clippy::needless_borrows_for_generic_args)] struct_ser.serialize_field("funDefinition", pbjson::private::base64::encode(&v).as_str())?; } + if let Some(v) = self.null_treatment.as_ref() { + let v = NullTreatment::try_from(*v) + .map_err(|_| serde::ser::Error::custom(format!("Invalid variant {}", *v)))?; + struct_ser.serialize_field("nullTreatment", &v)?; + } struct_ser.end() } } @@ -636,6 +644,8 @@ impl<'de> serde::Deserialize<'de> for AggregateUdfExprNode { "orderBy", "fun_definition", "funDefinition", + "null_treatment", + "nullTreatment", ]; #[allow(clippy::enum_variant_names)] @@ 
-646,6 +656,7 @@ impl<'de> serde::Deserialize<'de> for AggregateUdfExprNode { Filter, OrderBy, FunDefinition, + NullTreatment, } impl<'de> serde::Deserialize<'de> for GeneratedField { fn deserialize(deserializer: D) -> std::result::Result @@ -673,6 +684,7 @@ impl<'de> serde::Deserialize<'de> for AggregateUdfExprNode { "filter" => Ok(GeneratedField::Filter), "orderBy" | "order_by" => Ok(GeneratedField::OrderBy), "funDefinition" | "fun_definition" => Ok(GeneratedField::FunDefinition), + "nullTreatment" | "null_treatment" => Ok(GeneratedField::NullTreatment), _ => Err(serde::de::Error::unknown_field(value, FIELDS)), } } @@ -698,6 +710,7 @@ impl<'de> serde::Deserialize<'de> for AggregateUdfExprNode { let mut filter__ = None; let mut order_by__ = None; let mut fun_definition__ = None; + let mut null_treatment__ = None; while let Some(k) = map_.next_key()? { match k { GeneratedField::FunName => { @@ -738,6 +751,12 @@ impl<'de> serde::Deserialize<'de> for AggregateUdfExprNode { map_.next_value::<::std::option::Option<::pbjson::private::BytesDeserialize<_>>>()?.map(|x| x.0) ; } + GeneratedField::NullTreatment => { + if null_treatment__.is_some() { + return Err(serde::de::Error::duplicate_field("nullTreatment")); + } + null_treatment__ = map_.next_value::<::std::option::Option>()?.map(|x| x as i32); + } } } Ok(AggregateUdfExprNode { @@ -747,6 +766,7 @@ impl<'de> serde::Deserialize<'de> for AggregateUdfExprNode { filter: filter__, order_by: order_by__.unwrap_or_default(), fun_definition: fun_definition__, + null_treatment: null_treatment__, }) } } @@ -13284,6 +13304,77 @@ impl<'de> serde::Deserialize<'de> for Not { deserializer.deserialize_struct("datafusion.Not", FIELDS, GeneratedVisitor) } } +impl serde::Serialize for NullTreatment { + #[allow(deprecated)] + fn serialize(&self, serializer: S) -> std::result::Result + where + S: serde::Serializer, + { + let variant = match self { + Self::RespectNulls => "RESPECT_NULLS", + Self::IgnoreNulls => "IGNORE_NULLS", + }; + 
serializer.serialize_str(variant) + } +} +impl<'de> serde::Deserialize<'de> for NullTreatment { + #[allow(deprecated)] + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + const FIELDS: &[&str] = &[ + "RESPECT_NULLS", + "IGNORE_NULLS", + ]; + + struct GeneratedVisitor; + + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = NullTreatment; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(formatter, "expected one of: {:?}", &FIELDS) + } + + fn visit_i64(self, v: i64) -> std::result::Result + where + E: serde::de::Error, + { + i32::try_from(v) + .ok() + .and_then(|x| x.try_into().ok()) + .ok_or_else(|| { + serde::de::Error::invalid_value(serde::de::Unexpected::Signed(v), &self) + }) + } + + fn visit_u64(self, v: u64) -> std::result::Result + where + E: serde::de::Error, + { + i32::try_from(v) + .ok() + .and_then(|x| x.try_into().ok()) + .ok_or_else(|| { + serde::de::Error::invalid_value(serde::de::Unexpected::Unsigned(v), &self) + }) + } + + fn visit_str(self, value: &str) -> std::result::Result + where + E: serde::de::Error, + { + match value { + "RESPECT_NULLS" => Ok(NullTreatment::RespectNulls), + "IGNORE_NULLS" => Ok(NullTreatment::IgnoreNulls), + _ => Err(serde::de::Error::unknown_variant(value, FIELDS)), + } + } + } + deserializer.deserialize_any(GeneratedVisitor) + } +} impl serde::Serialize for OptimizedLogicalPlanType { #[allow(deprecated)] fn serialize(&self, serializer: S) -> std::result::Result @@ -23514,6 +23605,15 @@ impl serde::Serialize for WindowExprNode { if self.fun_definition.is_some() { len += 1; } + if self.null_treatment.is_some() { + len += 1; + } + if self.distinct { + len += 1; + } + if self.filter.is_some() { + len += 1; + } if self.window_function.is_some() { len += 1; } @@ -23535,6 +23635,17 @@ impl serde::Serialize for WindowExprNode { #[allow(clippy::needless_borrows_for_generic_args)] 
struct_ser.serialize_field("funDefinition", pbjson::private::base64::encode(&v).as_str())?; } + if let Some(v) = self.null_treatment.as_ref() { + let v = NullTreatment::try_from(*v) + .map_err(|_| serde::ser::Error::custom(format!("Invalid variant {}", *v)))?; + struct_ser.serialize_field("nullTreatment", &v)?; + } + if self.distinct { + struct_ser.serialize_field("distinct", &self.distinct)?; + } + if let Some(v) = self.filter.as_ref() { + struct_ser.serialize_field("filter", v)?; + } if let Some(v) = self.window_function.as_ref() { match v { window_expr_node::WindowFunction::Udaf(v) => { @@ -23564,6 +23675,10 @@ impl<'de> serde::Deserialize<'de> for WindowExprNode { "windowFrame", "fun_definition", "funDefinition", + "null_treatment", + "nullTreatment", + "distinct", + "filter", "udaf", "udwf", ]; @@ -23575,6 +23690,9 @@ impl<'de> serde::Deserialize<'de> for WindowExprNode { OrderBy, WindowFrame, FunDefinition, + NullTreatment, + Distinct, + Filter, Udaf, Udwf, } @@ -23603,6 +23721,9 @@ impl<'de> serde::Deserialize<'de> for WindowExprNode { "orderBy" | "order_by" => Ok(GeneratedField::OrderBy), "windowFrame" | "window_frame" => Ok(GeneratedField::WindowFrame), "funDefinition" | "fun_definition" => Ok(GeneratedField::FunDefinition), + "nullTreatment" | "null_treatment" => Ok(GeneratedField::NullTreatment), + "distinct" => Ok(GeneratedField::Distinct), + "filter" => Ok(GeneratedField::Filter), "udaf" => Ok(GeneratedField::Udaf), "udwf" => Ok(GeneratedField::Udwf), _ => Err(serde::de::Error::unknown_field(value, FIELDS)), @@ -23629,6 +23750,9 @@ impl<'de> serde::Deserialize<'de> for WindowExprNode { let mut order_by__ = None; let mut window_frame__ = None; let mut fun_definition__ = None; + let mut null_treatment__ = None; + let mut distinct__ = None; + let mut filter__ = None; let mut window_function__ = None; while let Some(k) = map_.next_key()? 
{ match k { @@ -23664,6 +23788,24 @@ impl<'de> serde::Deserialize<'de> for WindowExprNode { map_.next_value::<::std::option::Option<::pbjson::private::BytesDeserialize<_>>>()?.map(|x| x.0) ; } + GeneratedField::NullTreatment => { + if null_treatment__.is_some() { + return Err(serde::de::Error::duplicate_field("nullTreatment")); + } + null_treatment__ = map_.next_value::<::std::option::Option>()?.map(|x| x as i32); + } + GeneratedField::Distinct => { + if distinct__.is_some() { + return Err(serde::de::Error::duplicate_field("distinct")); + } + distinct__ = Some(map_.next_value()?); + } + GeneratedField::Filter => { + if filter__.is_some() { + return Err(serde::de::Error::duplicate_field("filter")); + } + filter__ = map_.next_value()?; + } GeneratedField::Udaf => { if window_function__.is_some() { return Err(serde::de::Error::duplicate_field("udaf")); @@ -23684,6 +23826,9 @@ impl<'de> serde::Deserialize<'de> for WindowExprNode { order_by: order_by__.unwrap_or_default(), window_frame: window_frame__, fun_definition: fun_definition__, + null_treatment: null_treatment__, + distinct: distinct__.unwrap_or_default(), + filter: filter__, window_function: window_function__, }) } diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs index d3b5f566e98b7..2e1c482db65c4 100644 --- a/datafusion/proto/src/generated/prost.rs +++ b/datafusion/proto/src/generated/prost.rs @@ -605,7 +605,7 @@ pub mod logical_expr_node { TryCast(::prost::alloc::boxed::Box), /// window expressions #[prost(message, tag = "18")] - WindowExpr(super::WindowExprNode), + WindowExpr(::prost::alloc::boxed::Box), /// AggregateUDF expressions #[prost(message, tag = "19")] AggregateUdfExpr(::prost::alloc::boxed::Box), @@ -795,6 +795,8 @@ pub struct AggregateUdfExprNode { pub order_by: ::prost::alloc::vec::Vec, #[prost(bytes = "vec", optional, tag = "6")] pub fun_definition: ::core::option::Option<::prost::alloc::vec::Vec>, + #[prost(enumeration = "NullTreatment", optional, 
tag = "7")] + pub null_treatment: ::core::option::Option, } #[derive(Clone, PartialEq, ::prost::Message)] pub struct ScalarUdfExprNode { @@ -818,6 +820,12 @@ pub struct WindowExprNode { pub window_frame: ::core::option::Option, #[prost(bytes = "vec", optional, tag = "10")] pub fun_definition: ::core::option::Option<::prost::alloc::vec::Vec>, + #[prost(enumeration = "NullTreatment", optional, tag = "11")] + pub null_treatment: ::core::option::Option, + #[prost(bool, tag = "12")] + pub distinct: bool, + #[prost(message, optional, boxed, tag = "13")] + pub filter: ::core::option::Option<::prost::alloc::boxed::Box>, #[prost(oneof = "window_expr_node::WindowFunction", tags = "3, 9")] pub window_function: ::core::option::Option, } @@ -2129,6 +2137,32 @@ impl WindowFrameBoundType { } #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] +pub enum NullTreatment { + RespectNulls = 0, + IgnoreNulls = 1, +} +impl NullTreatment { + /// String value of the enum field names used in the ProtoBuf definition. + /// + /// The values are not transformed in any way and thus are considered stable + /// (if the ProtoBuf definition does not change) and safe for programmatic use. + pub fn as_str_name(&self) -> &'static str { + match self { + Self::RespectNulls => "RESPECT_NULLS", + Self::IgnoreNulls => "IGNORE_NULLS", + } + } + /// Creates an enum from field names used in the ProtoBuf definition. 
+ pub fn from_str_name(value: &str) -> ::core::option::Option { + match value { + "RESPECT_NULLS" => Some(Self::RespectNulls), + "IGNORE_NULLS" => Some(Self::IgnoreNulls), + _ => None, + } + } +} +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] +#[repr(i32)] pub enum DateUnit { Day = 0, DateMillisecond = 1, diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs index cbfa15183b5c1..ec6415adc4c9b 100644 --- a/datafusion/proto/src/logical_plan/from_proto.rs +++ b/datafusion/proto/src/logical_plan/from_proto.rs @@ -23,7 +23,7 @@ use datafusion_common::{ RecursionUnnestOption, Result, ScalarValue, TableReference, UnnestOptions, }; use datafusion_expr::dml::InsertOp; -use datafusion_expr::expr::{Alias, Placeholder, Sort}; +use datafusion_expr::expr::{Alias, NullTreatment, Placeholder, Sort}; use datafusion_expr::expr::{Unnest, WildcardOptions}; use datafusion_expr::{ expr::{self, InList, WindowFunction}, @@ -243,6 +243,15 @@ impl From for WriteOp { } } +impl From for NullTreatment { + fn from(t: protobuf::NullTreatment) -> Self { + match t { + protobuf::NullTreatment::RespectNulls => NullTreatment::RespectNulls, + protobuf::NullTreatment::IgnoreNulls => NullTreatment::IgnoreNulls, + } + } +} + pub fn parse_expr( proto: &protobuf::LogicalExprNode, registry: &dyn FunctionRegistry, @@ -301,9 +310,21 @@ pub fn parse_expr( exec_datafusion_err!("missing window frame during deserialization") })?; - // TODO: support null treatment, distinct, and filter in proto. 
- // See https://github.com/apache/datafusion/issues/17417 - match window_function { + let null_treatment = match expr.null_treatment { + Some(null_treatment) => { + let null_treatment = protobuf::NullTreatment::try_from(null_treatment) + .map_err(|_| { + proto_error(format!( + "Received a WindowExprNode message with unknown NullTreatment {}", + null_treatment + )) + })?; + Some(NullTreatment::from(null_treatment)) + } + None => None, + }; + + let agg_fn = match window_function { window_expr_node::WindowFunction::Udaf(udaf_name) => { let udaf_function = match &expr.fun_definition { Some(buf) => codec.try_decode_udaf(udaf_name, buf)?, @@ -311,17 +332,7 @@ pub fn parse_expr( .udaf(udaf_name) .or_else(|_| codec.try_decode_udaf(udaf_name, &[]))?, }; - - let args = parse_exprs(&expr.exprs, registry, codec)?; - Expr::from(WindowFunction::new( - expr::WindowFunctionDefinition::AggregateUDF(udaf_function), - args, - )) - .partition_by(partition_by) - .order_by(order_by) - .window_frame(window_frame) - .build() - .map_err(Error::DataFusionError) + expr::WindowFunctionDefinition::AggregateUDF(udaf_function) } window_expr_node::WindowFunction::Udwf(udwf_name) => { let udwf_function = match &expr.fun_definition { @@ -330,19 +341,28 @@ pub fn parse_expr( .udwf(udwf_name) .or_else(|_| codec.try_decode_udwf(udwf_name, &[]))?, }; - - let args = parse_exprs(&expr.exprs, registry, codec)?; - Expr::from(WindowFunction::new( - expr::WindowFunctionDefinition::WindowUDF(udwf_function), - args, - )) - .partition_by(partition_by) - .order_by(order_by) - .window_frame(window_frame) - .build() - .map_err(Error::DataFusionError) + expr::WindowFunctionDefinition::WindowUDF(udwf_function) } + }; + + let args = parse_exprs(&expr.exprs, registry, codec)?; + let mut builder = Expr::from(WindowFunction::new(agg_fn, args)) + .partition_by(partition_by) + .order_by(order_by) + .window_frame(window_frame) + .null_treatment(null_treatment); + + if expr.distinct { + builder = builder.distinct(); + }; + 
+ if let Some(filter) = + parse_optional_expr(expr.filter.as_deref(), registry, codec)? + { + builder = builder.filter(filter); } + + builder.build().map_err(Error::DataFusionError) } ExprType::Alias(alias) => Ok(Expr::Alias(Alias::new( parse_required_expr(alias.expr.as_deref(), registry, "expr", codec)?, @@ -571,6 +591,19 @@ pub fn parse_expr( .udaf(&pb.fun_name) .or_else(|_| codec.try_decode_udaf(&pb.fun_name, &[]))?, }; + let null_treatment = match pb.null_treatment { + Some(null_treatment) => { + let null_treatment = protobuf::NullTreatment::try_from(null_treatment) + .map_err(|_| { + proto_error(format!( + "Received an AggregateUdfExprNode message with unknown NullTreatment {}", + null_treatment + )) + })?; + Some(NullTreatment::from(null_treatment)) + } + None => None, + }; Ok(Expr::AggregateFunction(expr::AggregateFunction::new_udf( agg_fn, @@ -578,7 +611,7 @@ pub fn parse_expr( pb.distinct, parse_optional_expr(pb.filter.as_deref(), registry, codec)?.map(Box::new), parse_sorts(&pb.order_by, registry, codec)?, - None, + null_treatment, ))) } diff --git a/datafusion/proto/src/logical_plan/to_proto.rs b/datafusion/proto/src/logical_plan/to_proto.rs index 1be3300008c79..6238c2f1cdded 100644 --- a/datafusion/proto/src/logical_plan/to_proto.rs +++ b/datafusion/proto/src/logical_plan/to_proto.rs @@ -25,7 +25,7 @@ use datafusion_common::{NullEquality, TableReference, UnnestOptions}; use datafusion_expr::dml::InsertOp; use datafusion_expr::expr::{ self, AggregateFunctionParams, Alias, Between, BinaryExpr, Cast, GroupingSet, InList, - Like, Placeholder, ScalarFunction, Unnest, + Like, NullTreatment, Placeholder, ScalarFunction, Unnest, }; use datafusion_expr::WriteOp; use datafusion_expr::{ @@ -314,11 +314,9 @@ pub fn serialize_expr( ref partition_by, ref order_by, ref window_frame, - // TODO: support null treatment, distinct, and filter in proto. 
- // See https://github.com/apache/datafusion/issues/17417 - null_treatment: _, - distinct: _, - filter: _, + ref null_treatment, + ref distinct, + ref filter, }, } = window_fun.as_ref(); let mut buf = Vec::new(); @@ -342,16 +340,24 @@ pub fn serialize_expr( let window_frame: Option = Some(window_frame.try_into()?); + let window_expr = protobuf::WindowExprNode { exprs: serialize_exprs(args, codec)?, window_function: Some(window_function), partition_by, order_by, window_frame, + distinct: *distinct, + filter: match filter { + Some(e) => Some(Box::new(serialize_expr(e.as_ref(), codec)?)), + None => None, + }, + null_treatment: null_treatment + .map(|nt| protobuf::NullTreatment::from(nt).into()), fun_definition, }; protobuf::LogicalExprNode { - expr_type: Some(ExprType::WindowExpr(window_expr)), + expr_type: Some(ExprType::WindowExpr(Box::new(window_expr))), } } Expr::AggregateFunction(expr::AggregateFunction { @@ -362,7 +368,7 @@ pub fn serialize_expr( ref distinct, ref filter, ref order_by, - null_treatment: _, + ref null_treatment, }, }) => { let mut buf = Vec::new(); @@ -379,6 +385,8 @@ pub fn serialize_expr( }, order_by: serialize_sorts(order_by, codec)?, fun_definition: (!buf.is_empty()).then_some(buf), + null_treatment: null_treatment + .map(|nt| protobuf::NullTreatment::from(nt).into()), }, ))), } @@ -722,3 +730,12 @@ impl From<&WriteOp> for protobuf::dml_node::Type { } } } + +impl From for protobuf::NullTreatment { + fn from(t: NullTreatment) -> Self { + match t { + NullTreatment::RespectNulls => protobuf::NullTreatment::RespectNulls, + NullTreatment::IgnoreNulls => protobuf::NullTreatment::IgnoreNulls, + } + } +} diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs index c5d4b49092d91..3d51038eba72c 100644 --- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs @@ -73,8 +73,8 @@ use datafusion_common::{ }; use 
datafusion_expr::dml::CopyTo; use datafusion_expr::expr::{ - self, Between, BinaryExpr, Case, Cast, GroupingSet, InList, Like, ScalarFunction, - Unnest, WildcardOptions, + self, Between, BinaryExpr, Case, Cast, GroupingSet, InList, Like, NullTreatment, + ScalarFunction, Unnest, WildcardOptions, }; use datafusion_expr::logical_plan::{Extension, UserDefinedLogicalNodeCore}; use datafusion_expr::{ @@ -2190,7 +2190,11 @@ fn roundtrip_aggregate_udf() { Arc::new(vec![DataType::Float64, DataType::UInt32]), ); - let test_expr = Expr::AggregateFunction(expr::AggregateFunction::new_udf( + let ctx = SessionContext::new(); + ctx.register_udaf(dummy_agg.clone()); + + // null_treatment absent + let test_expr1 = Expr::AggregateFunction(expr::AggregateFunction::new_udf( Arc::new(dummy_agg.clone()), vec![lit(1.0_f64)], false, @@ -2199,10 +2203,29 @@ fn roundtrip_aggregate_udf() { None, )); - let ctx = SessionContext::new(); - ctx.register_udaf(dummy_agg); + // null_treatment respect nulls + let test_expr2 = Expr::AggregateFunction(expr::AggregateFunction::new_udf( + Arc::new(dummy_agg.clone()), + vec![lit(1.0_f64)], + true, + Some(Box::new(lit(true))), + vec![], + Some(NullTreatment::RespectNulls), + )); - roundtrip_expr_test(test_expr, ctx); + // null_treatment ignore nulls + let test_expr3 = Expr::AggregateFunction(expr::AggregateFunction::new_udf( + Arc::new(dummy_agg), + vec![lit(1.0_f64)], + true, + Some(Box::new(lit(true))), + vec![], + Some(NullTreatment::IgnoreNulls), + )); + + roundtrip_expr_test(test_expr1, ctx.clone()); + roundtrip_expr_test(test_expr2, ctx.clone()); + roundtrip_expr_test(test_expr3, ctx); } fn dummy_udf() -> ScalarUDF { @@ -2566,8 +2589,10 @@ fn roundtrip_window() { .window_frame(row_number_frame.clone()) .build() .unwrap(); + ctx.register_udwf(dummy_window_udf); - let text_expr7 = Expr::from(expr::WindowFunction::new( + // 7. 
test with average udaf + let test_expr7 = Expr::from(expr::WindowFunction::new( WindowFunctionDefinition::AggregateUDF(avg_udaf()), vec![col("col1")], )) @@ -2575,7 +2600,53 @@ fn roundtrip_window() { .build() .unwrap(); - ctx.register_udwf(dummy_window_udf); + // 8. test with respect nulls + let test_expr8 = Expr::from(expr::WindowFunction::new( + WindowFunctionDefinition::WindowUDF(rank_udwf()), + vec![], + )) + .partition_by(vec![col("col1")]) + .order_by(vec![col("col2").sort(true, false)]) + .window_frame(WindowFrame::new(Some(false))) + .null_treatment(NullTreatment::RespectNulls) + .build() + .unwrap(); + + // 9. test with ignore nulls + let test_expr9 = Expr::from(expr::WindowFunction::new( + WindowFunctionDefinition::WindowUDF(rank_udwf()), + vec![], + )) + .partition_by(vec![col("col1")]) + .order_by(vec![col("col2").sort(true, false)]) + .window_frame(WindowFrame::new(Some(false))) + .null_treatment(NullTreatment::IgnoreNulls) + .build() + .unwrap(); + + // 10. test with distinct is `true` + let test_expr10 = Expr::from(expr::WindowFunction::new( + WindowFunctionDefinition::WindowUDF(rank_udwf()), + vec![], + )) + .partition_by(vec![col("col1")]) + .order_by(vec![col("col2").sort(true, false)]) + .window_frame(WindowFrame::new(Some(false))) + .distinct() + .build() + .unwrap(); + + // 11. 
test with filter + let test_expr11 = Expr::from(expr::WindowFunction::new( + WindowFunctionDefinition::WindowUDF(rank_udwf()), + vec![], + )) + .partition_by(vec![col("col1")]) + .order_by(vec![col("col2").sort(true, false)]) + .window_frame(WindowFrame::new(Some(false))) + .filter(col("col1").eq(lit(1))) + .build() + .unwrap(); roundtrip_expr_test(test_expr1, ctx.clone()); roundtrip_expr_test(test_expr2, ctx.clone()); @@ -2583,7 +2654,11 @@ fn roundtrip_window() { roundtrip_expr_test(test_expr4, ctx.clone()); roundtrip_expr_test(test_expr5, ctx.clone()); roundtrip_expr_test(test_expr6, ctx.clone()); - roundtrip_expr_test(text_expr7, ctx); + roundtrip_expr_test(test_expr7, ctx.clone()); + roundtrip_expr_test(test_expr8, ctx.clone()); + roundtrip_expr_test(test_expr9, ctx.clone()); + roundtrip_expr_test(test_expr10, ctx.clone()); + roundtrip_expr_test(test_expr11, ctx); } #[tokio::test] From c84e3cf5a5a9f4f4b2a0f44a03a90ff0b9461df7 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Thu, 16 Oct 2025 23:05:54 -0500 Subject: [PATCH 017/109] feat: Add percentile_cont aggregate function (#17988) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary Adds exact `percentile_cont` aggregate function as the counterpart to the existing `approx_percentile_cont` function. ## What changes were made? 
### New Implementation - Created `percentile_cont.rs` with full implementation - `PercentileCont` struct implementing `AggregateUDFImpl` - `PercentileContAccumulator` for standard aggregation - `DistinctPercentileContAccumulator` for DISTINCT mode - `PercentileContGroupsAccumulator` for efficient grouped aggregation - `calculate_percentile` function with linear interpolation ### Features - **Exact calculation**: Stores all values in memory for precise results - **WITHIN GROUP syntax**: Supports `WITHIN GROUP (ORDER BY ...)` - **Interpolation**: Uses linear interpolation between values - **All numeric types**: Works with integers, floats, and decimals - **Ordered-set aggregate**: Properly marked as `is_ordered_set_aggregate()` - **GROUP BY support**: Efficient grouped aggregation via GroupsAccumulator ### Tests Added comprehensive tests in `aggregate.slt`: - Error conditions validation - Basic percentile calculations (0.0, 0.25, 0.5, 0.75, 1.0) - Comparison with `median` function - Ascending and descending order - GROUP BY aggregation - NULL handling - Edge cases (empty sets, single values) - Float interpolation - Various numeric data types ## Example Usage ```sql -- Basic usage with WITHIN GROUP syntax SELECT percentile_cont(0.75) WITHIN GROUP (ORDER BY column_name) FROM table_name; -- With GROUP BY SELECT category, percentile_cont(0.95) WITHIN GROUP (ORDER BY value) FROM sales GROUP BY category; -- Compare with median (percentile_cont(0.5) == median) SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY price) FROM products; ``` ## Performance Considerations Like `median`, this function stores all values in memory before computing results. For large datasets or when approximation is acceptable, use `approx_percentile_cont` instead. 
## Related Issues Closes #6714 🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: Claude --- datafusion-testing | 2 +- .../src/approx_percentile_cont.rs | 65 +- datafusion/functions-aggregate/src/lib.rs | 4 + .../src/percentile_cont.rs | 814 ++++++++++++++++++ datafusion/functions-aggregate/src/utils.rs | 72 ++ .../sqllogictest/test_files/aggregate.slt | 342 +++++++- .../user-guide/sql/aggregate_functions.md | 45 + 7 files changed, 1294 insertions(+), 50 deletions(-) create mode 100644 datafusion/functions-aggregate/src/percentile_cont.rs create mode 100644 datafusion/functions-aggregate/src/utils.rs diff --git a/datafusion-testing b/datafusion-testing index eccb0e4a42634..905df5f65cc9d 160000 --- a/datafusion-testing +++ b/datafusion-testing @@ -1 +1 @@ -Subproject commit eccb0e4a426344ef3faf534cd60e02e9c3afd3ac +Subproject commit 905df5f65cc9d0851719c21f5a4dd5cd77621f19 diff --git a/datafusion/functions-aggregate/src/approx_percentile_cont.rs b/datafusion/functions-aggregate/src/approx_percentile_cont.rs index 0deb09184b3f4..668280314e8d7 100644 --- a/datafusion/functions-aggregate/src/approx_percentile_cont.rs +++ b/datafusion/functions-aggregate/src/approx_percentile_cont.rs @@ -20,7 +20,7 @@ use std::fmt::{Debug, Formatter}; use std::mem::size_of_val; use std::sync::Arc; -use arrow::array::{Array, RecordBatch}; +use arrow::array::Array; use arrow::compute::{filter, is_not_null}; use arrow::datatypes::FieldRef; use arrow::{ @@ -28,19 +28,19 @@ use arrow::{ ArrayRef, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, UInt16Array, UInt32Array, UInt64Array, UInt8Array, }, - datatypes::{DataType, Field, Schema}, + datatypes::{DataType, Field}, }; use datafusion_common::{ - downcast_value, internal_err, not_impl_datafusion_err, not_impl_err, plan_err, - Result, ScalarValue, + downcast_value, internal_err, not_impl_err, plan_err, DataFusionError, Result, + ScalarValue, }; use 
datafusion_expr::expr::{AggregateFunction, Sort}; use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; use datafusion_expr::type_coercion::aggregates::{INTEGERS, NUMERICS}; use datafusion_expr::utils::format_state_name; use datafusion_expr::{ - Accumulator, AggregateUDFImpl, ColumnarValue, Documentation, Expr, Signature, - TypeSignature, Volatility, + Accumulator, AggregateUDFImpl, Documentation, Expr, Signature, TypeSignature, + Volatility, }; use datafusion_functions_aggregate_common::tdigest::{ TDigest, TryIntoF64, DEFAULT_MAX_SIZE, @@ -48,6 +48,8 @@ use datafusion_functions_aggregate_common::tdigest::{ use datafusion_macros::user_doc; use datafusion_physical_expr_common::physical_expr::PhysicalExpr; +use crate::utils::{get_scalar_value, validate_percentile_expr}; + create_func!(ApproxPercentileCont, approx_percentile_cont_udaf); /// Computes the approximate percentile continuous of a set of numbers @@ -164,7 +166,8 @@ impl ApproxPercentileCont { &self, args: AccumulatorArgs, ) -> Result { - let percentile = validate_input_percentile_expr(&args.exprs[1])?; + let percentile = + validate_percentile_expr(&args.exprs[1], "APPROX_PERCENTILE_CONT")?; let is_descending = args .order_bys @@ -214,45 +217,15 @@ impl ApproxPercentileCont { } } -fn get_scalar_value(expr: &Arc) -> Result { - let empty_schema = Arc::new(Schema::empty()); - let batch = RecordBatch::new_empty(Arc::clone(&empty_schema)); - if let ColumnarValue::Scalar(s) = expr.evaluate(&batch)? { - Ok(s) - } else { - internal_err!("Didn't expect ColumnarValue::Array") - } -} - -fn validate_input_percentile_expr(expr: &Arc) -> Result { - let percentile = match get_scalar_value(expr) - .map_err(|_| not_impl_datafusion_err!("Percentile value for 'APPROX_PERCENTILE_CONT' must be a literal, got: {expr}"))? 
{ - ScalarValue::Float32(Some(value)) => { - value as f64 - } - ScalarValue::Float64(Some(value)) => { - value - } - sv => { - return not_impl_err!( - "Percentile value for 'APPROX_PERCENTILE_CONT' must be Float32 or Float64 literal (got data type {})", - sv.data_type() - ) - } - }; - - // Ensure the percentile is between 0 and 1. - if !(0.0..=1.0).contains(&percentile) { - return plan_err!( - "Percentile value must be between 0.0 and 1.0 inclusive, {percentile} is invalid" - ); - } - Ok(percentile) -} - fn validate_input_max_size_expr(expr: &Arc) -> Result { - let max_size = match get_scalar_value(expr) - .map_err(|_| not_impl_datafusion_err!("Tdigest max_size value for 'APPROX_PERCENTILE_CONT' must be a literal, got: {expr}"))? { + let scalar_value = get_scalar_value(expr).map_err(|_e| { + DataFusionError::Plan( + "Tdigest max_size value for 'APPROX_PERCENTILE_CONT' must be a literal" + .to_string(), + ) + })?; + + let max_size = match scalar_value { ScalarValue::UInt8(Some(q)) => q as usize, ScalarValue::UInt16(Some(q)) => q as usize, ScalarValue::UInt32(Some(q)) => q as usize, @@ -262,7 +235,7 @@ fn validate_input_max_size_expr(expr: &Arc) -> Result { ScalarValue::Int16(Some(q)) if q > 0 => q as usize, ScalarValue::Int8(Some(q)) if q > 0 => q as usize, sv => { - return not_impl_err!( + return plan_err!( "Tdigest max_size value for 'APPROX_PERCENTILE_CONT' must be UInt > 0 literal (got data type {}).", sv.data_type() ) diff --git a/datafusion/functions-aggregate/src/lib.rs b/datafusion/functions-aggregate/src/lib.rs index 4f282301ce5bd..b56b2b118e73b 100644 --- a/datafusion/functions-aggregate/src/lib.rs +++ b/datafusion/functions-aggregate/src/lib.rs @@ -81,6 +81,7 @@ pub mod hyperloglog; pub mod median; pub mod min_max; pub mod nth_value; +pub mod percentile_cont; pub mod regr; pub mod stddev; pub mod string_agg; @@ -88,6 +89,7 @@ pub mod sum; pub mod variance; pub mod planner; +mod utils; use crate::approx_percentile_cont::approx_percentile_cont_udaf; use 
crate::approx_percentile_cont_with_weight::approx_percentile_cont_with_weight_udaf; @@ -123,6 +125,7 @@ pub mod expr_fn { pub use super::min_max::max; pub use super::min_max::min; pub use super::nth_value::nth_value; + pub use super::percentile_cont::percentile_cont; pub use super::regr::regr_avgx; pub use super::regr::regr_avgy; pub use super::regr::regr_count; @@ -171,6 +174,7 @@ pub fn all_default_aggregate_functions() -> Vec> { approx_distinct::approx_distinct_udaf(), approx_percentile_cont_udaf(), approx_percentile_cont_with_weight_udaf(), + percentile_cont::percentile_cont_udaf(), string_agg::string_agg_udaf(), bit_and_or_xor::bit_and_udaf(), bit_and_or_xor::bit_or_udaf(), diff --git a/datafusion/functions-aggregate/src/percentile_cont.rs b/datafusion/functions-aggregate/src/percentile_cont.rs new file mode 100644 index 0000000000000..8e9e9a3144d48 --- /dev/null +++ b/datafusion/functions-aggregate/src/percentile_cont.rs @@ -0,0 +1,814 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::fmt::{Debug, Formatter}; +use std::mem::{size_of, size_of_val}; +use std::sync::Arc; + +use arrow::array::{ + ArrowNumericType, BooleanArray, ListArray, PrimitiveArray, PrimitiveBuilder, +}; +use arrow::buffer::{OffsetBuffer, ScalarBuffer}; +use arrow::{ + array::{Array, ArrayRef, AsArray}, + datatypes::{ + ArrowNativeType, DataType, Decimal128Type, Decimal256Type, Decimal32Type, + Decimal64Type, Field, FieldRef, Float16Type, Float32Type, Float64Type, + }, +}; + +use arrow::array::ArrowNativeTypeOp; + +use datafusion_common::{ + internal_datafusion_err, internal_err, plan_err, DataFusionError, HashSet, Result, + ScalarValue, +}; +use datafusion_expr::expr::{AggregateFunction, Sort}; +use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; +use datafusion_expr::type_coercion::aggregates::NUMERICS; +use datafusion_expr::utils::format_state_name; +use datafusion_expr::{ + Accumulator, AggregateUDFImpl, Documentation, Expr, Signature, TypeSignature, + Volatility, +}; +use datafusion_expr::{EmitTo, GroupsAccumulator}; +use datafusion_functions_aggregate_common::aggregate::groups_accumulator::accumulate::accumulate; +use datafusion_functions_aggregate_common::aggregate::groups_accumulator::nulls::filtered_null_mask; +use datafusion_functions_aggregate_common::utils::Hashable; +use datafusion_macros::user_doc; + +use crate::utils::validate_percentile_expr; + +/// Precision multiplier for linear interpolation calculations. 
+/// +/// This value of 1,000,000 was chosen to balance precision with overflow safety: +/// - Provides 6 decimal places of precision for the fractional component +/// - Small enough to avoid overflow when multiplied with typical numeric values +/// - Sufficient precision for most statistical applications +/// +/// The interpolation formula: `lower + (upper - lower) * fraction` +/// is computed as: `lower + ((upper - lower) * (fraction * PRECISION)) / PRECISION` +/// to avoid floating-point operations on integer types while maintaining precision. +const INTERPOLATION_PRECISION: usize = 1_000_000; + +create_func!(PercentileCont, percentile_cont_udaf); + +/// Computes the exact percentile continuous of a set of numbers +pub fn percentile_cont(order_by: Sort, percentile: Expr) -> Expr { + let expr = order_by.expr.clone(); + let args = vec![expr, percentile]; + + Expr::AggregateFunction(AggregateFunction::new_udf( + percentile_cont_udaf(), + args, + false, + None, + vec![order_by], + None, + )) +} + +#[user_doc( + doc_section(label = "General Functions"), + description = "Returns the exact percentile of input values, interpolating between values if needed.", + syntax_example = "percentile_cont(percentile) WITHIN GROUP (ORDER BY expression)", + sql_example = r#"```sql +> SELECT percentile_cont(0.75) WITHIN GROUP (ORDER BY column_name) FROM table_name; ++----------------------------------------------------------+ +| percentile_cont(0.75) WITHIN GROUP (ORDER BY column_name) | ++----------------------------------------------------------+ +| 45.5 | ++----------------------------------------------------------+ +``` + +An alternate syntax is also supported: +```sql +> SELECT percentile_cont(column_name, 0.75) FROM table_name; ++---------------------------------------+ +| percentile_cont(column_name, 0.75) | ++---------------------------------------+ +| 45.5 | ++---------------------------------------+ +```"#, + standard_argument(name = "expression", prefix = "The"), + 
argument( + name = "percentile", + description = "Percentile to compute. Must be a float value between 0 and 1 (inclusive)." + ) +)] +/// PERCENTILE_CONT aggregate expression. This uses an exact calculation and stores all values +/// in memory before computing the result. If an approximation is sufficient then +/// APPROX_PERCENTILE_CONT provides a much more efficient solution. +/// +/// If using the distinct variation, the memory usage will be similarly high if the +/// cardinality is high as it stores all distinct values in memory before computing the +/// result, but if cardinality is low then memory usage will also be lower. +#[derive(PartialEq, Eq, Hash)] +pub struct PercentileCont { + signature: Signature, + aliases: Vec, +} + +impl Debug for PercentileCont { + fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { + f.debug_struct("PercentileCont") + .field("name", &self.name()) + .field("signature", &self.signature) + .finish() + } +} + +impl Default for PercentileCont { + fn default() -> Self { + Self::new() + } +} + +impl PercentileCont { + pub fn new() -> Self { + let mut variants = Vec::with_capacity(NUMERICS.len()); + // Accept any numeric value paired with a float64 percentile + for num in NUMERICS { + variants.push(TypeSignature::Exact(vec![num.clone(), DataType::Float64])); + } + Self { + signature: Signature::one_of(variants, Volatility::Immutable), + aliases: vec![String::from("quantile_cont")], + } + } + + fn create_accumulator(&self, args: AccumulatorArgs) -> Result> { + let percentile = validate_percentile_expr(&args.exprs[1], "PERCENTILE_CONT")?; + + let is_descending = args + .order_bys + .first() + .map(|sort_expr| sort_expr.options.descending) + .unwrap_or(false); + + let percentile = if is_descending { + 1.0 - percentile + } else { + percentile + }; + + macro_rules! 
helper { + ($t:ty, $dt:expr) => { + if args.is_distinct { + Ok(Box::new(DistinctPercentileContAccumulator::<$t> { + data_type: $dt.clone(), + distinct_values: HashSet::new(), + percentile, + })) + } else { + Ok(Box::new(PercentileContAccumulator::<$t> { + data_type: $dt.clone(), + all_values: vec![], + percentile, + })) + } + }; + } + + let input_dt = args.exprs[0].data_type(args.schema)?; + match input_dt { + // For integer types, use Float64 internally since percentile_cont returns Float64 + DataType::Int8 + | DataType::Int16 + | DataType::Int32 + | DataType::Int64 + | DataType::UInt8 + | DataType::UInt16 + | DataType::UInt32 + | DataType::UInt64 => helper!(Float64Type, DataType::Float64), + DataType::Float16 => helper!(Float16Type, input_dt), + DataType::Float32 => helper!(Float32Type, input_dt), + DataType::Float64 => helper!(Float64Type, input_dt), + DataType::Decimal32(_, _) => helper!(Decimal32Type, input_dt), + DataType::Decimal64(_, _) => helper!(Decimal64Type, input_dt), + DataType::Decimal128(_, _) => helper!(Decimal128Type, input_dt), + DataType::Decimal256(_, _) => helper!(Decimal256Type, input_dt), + _ => Err(DataFusionError::NotImplemented(format!( + "PercentileContAccumulator not supported for {} with {}", + args.name, input_dt, + ))), + } + } +} + +impl AggregateUDFImpl for PercentileCont { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn name(&self) -> &str { + "percentile_cont" + } + + fn aliases(&self) -> &[String] { + &self.aliases + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + if !arg_types[0].is_numeric() { + return plan_err!("percentile_cont requires numeric input types"); + } + // PERCENTILE_CONT performs linear interpolation and should return a float type + // For integer inputs, return Float64 (matching PostgreSQL/DuckDB behavior) + // For float inputs, preserve the float type + match &arg_types[0] { + DataType::Float16 | DataType::Float32 | 
DataType::Float64 => { + Ok(arg_types[0].clone()) + } + DataType::Decimal32(_, _) + | DataType::Decimal64(_, _) + | DataType::Decimal128(_, _) + | DataType::Decimal256(_, _) => Ok(arg_types[0].clone()), + DataType::UInt8 + | DataType::UInt16 + | DataType::UInt32 + | DataType::UInt64 + | DataType::Int8 + | DataType::Int16 + | DataType::Int32 + | DataType::Int64 => Ok(DataType::Float64), + // Shouldn't happen due to signature check, but just in case + dt => plan_err!( + "percentile_cont does not support input type {}, must be numeric", + dt + ), + } + } + + fn state_fields(&self, args: StateFieldsArgs) -> Result> { + //Intermediate state is a list of the elements we have collected so far + let input_type = args.input_fields[0].data_type().clone(); + // For integer types, we store as Float64 internally + let storage_type = match &input_type { + DataType::Int8 + | DataType::Int16 + | DataType::Int32 + | DataType::Int64 + | DataType::UInt8 + | DataType::UInt16 + | DataType::UInt32 + | DataType::UInt64 => DataType::Float64, + _ => input_type, + }; + + let field = Field::new_list_field(storage_type, true); + let state_name = if args.is_distinct { + "distinct_percentile_cont" + } else { + "percentile_cont" + }; + + Ok(vec![Field::new( + format_state_name(args.name, state_name), + DataType::List(Arc::new(field)), + true, + ) + .into()]) + } + + fn accumulator(&self, acc_args: AccumulatorArgs) -> Result> { + self.create_accumulator(acc_args) + } + + fn groups_accumulator_supported(&self, args: AccumulatorArgs) -> bool { + !args.is_distinct + } + + fn create_groups_accumulator( + &self, + args: AccumulatorArgs, + ) -> Result> { + let num_args = args.exprs.len(); + if num_args != 2 { + return internal_err!( + "percentile_cont should have 2 args, but found num args:{}", + args.exprs.len() + ); + } + + let percentile = validate_percentile_expr(&args.exprs[1], "PERCENTILE_CONT")?; + + let is_descending = args + .order_bys + .first() + .map(|sort_expr| 
sort_expr.options.descending) + .unwrap_or(false); + + let percentile = if is_descending { + 1.0 - percentile + } else { + percentile + }; + + macro_rules! helper { + ($t:ty, $dt:expr) => { + Ok(Box::new(PercentileContGroupsAccumulator::<$t>::new( + $dt, percentile, + ))) + }; + } + + let input_dt = args.exprs[0].data_type(args.schema)?; + match input_dt { + // For integer types, use Float64 internally since percentile_cont returns Float64 + DataType::Int8 + | DataType::Int16 + | DataType::Int32 + | DataType::Int64 + | DataType::UInt8 + | DataType::UInt16 + | DataType::UInt32 + | DataType::UInt64 => helper!(Float64Type, DataType::Float64), + DataType::Float16 => helper!(Float16Type, input_dt), + DataType::Float32 => helper!(Float32Type, input_dt), + DataType::Float64 => helper!(Float64Type, input_dt), + DataType::Decimal32(_, _) => helper!(Decimal32Type, input_dt), + DataType::Decimal64(_, _) => helper!(Decimal64Type, input_dt), + DataType::Decimal128(_, _) => helper!(Decimal128Type, input_dt), + DataType::Decimal256(_, _) => helper!(Decimal256Type, input_dt), + _ => Err(DataFusionError::NotImplemented(format!( + "PercentileContGroupsAccumulator not supported for {} with {}", + args.name, input_dt, + ))), + } + } + + fn supports_null_handling_clause(&self) -> bool { + false + } + + fn is_ordered_set_aggregate(&self) -> bool { + true + } + + fn documentation(&self) -> Option<&Documentation> { + self.doc() + } +} + +/// The percentile_cont accumulator accumulates the raw input values +/// as native types. +/// +/// The intermediate state is represented as a List of scalar values updated by +/// `merge_batch` and a `Vec` of native values that are converted to scalar values +/// in the final evaluation step so that we avoid expensive conversions and +/// allocations during `update_batch`. 
+struct PercentileContAccumulator { + data_type: DataType, + all_values: Vec, + percentile: f64, +} + +impl Debug for PercentileContAccumulator { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!( + f, + "PercentileContAccumulator({}, percentile={})", + self.data_type, self.percentile + ) + } +} + +impl Accumulator for PercentileContAccumulator { + fn state(&mut self) -> Result> { + // Convert `all_values` to `ListArray` and return a single List ScalarValue + + // Build offsets + let offsets = + OffsetBuffer::new(ScalarBuffer::from(vec![0, self.all_values.len() as i32])); + + // Build inner array + let values_array = PrimitiveArray::::new( + ScalarBuffer::from(std::mem::take(&mut self.all_values)), + None, + ) + .with_data_type(self.data_type.clone()); + + // Build the result list array + let list_array = ListArray::new( + Arc::new(Field::new_list_field(self.data_type.clone(), true)), + offsets, + Arc::new(values_array), + None, + ); + + Ok(vec![ScalarValue::List(Arc::new(list_array))]) + } + + fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> { + // Cast to target type if needed (e.g., integer to Float64) + let values = if values[0].data_type() != &self.data_type { + arrow::compute::cast(&values[0], &self.data_type)? + } else { + Arc::clone(&values[0]) + }; + + let values = values.as_primitive::(); + self.all_values.reserve(values.len() - values.null_count()); + self.all_values.extend(values.iter().flatten()); + Ok(()) + } + + fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> { + let array = states[0].as_list::(); + for v in array.iter().flatten() { + self.update_batch(&[v])? 
+ } + Ok(()) + } + + fn evaluate(&mut self) -> Result { + let d = std::mem::take(&mut self.all_values); + let value = calculate_percentile::(d, self.percentile); + ScalarValue::new_primitive::(value, &self.data_type) + } + + fn size(&self) -> usize { + size_of_val(self) + self.all_values.capacity() * size_of::() + } +} + +/// The percentile_cont groups accumulator accumulates the raw input values +/// +/// For calculating the exact percentile of groups, we need to store all values +/// of groups before final evaluation. +/// So values in each group will be stored in a `Vec`, and the total group values +/// will be actually organized as a `Vec>`. +/// +#[derive(Debug)] +struct PercentileContGroupsAccumulator { + data_type: DataType, + group_values: Vec>, + percentile: f64, +} + +impl PercentileContGroupsAccumulator { + pub fn new(data_type: DataType, percentile: f64) -> Self { + Self { + data_type, + group_values: Vec::new(), + percentile, + } + } +} + +impl GroupsAccumulator + for PercentileContGroupsAccumulator +{ + fn update_batch( + &mut self, + values: &[ArrayRef], + group_indices: &[usize], + opt_filter: Option<&BooleanArray>, + total_num_groups: usize, + ) -> Result<()> { + // For ordered-set aggregates, we only care about the ORDER BY column (first element) + // The percentile parameter is already stored in self.percentile + + // Cast to target type if needed (e.g., integer to Float64) + let values_array = if values[0].data_type() != &self.data_type { + arrow::compute::cast(&values[0], &self.data_type)? 
+ } else { + Arc::clone(&values[0]) + }; + + let values = values_array.as_primitive::(); + + // Push the `not nulls + not filtered` row into its group + self.group_values.resize(total_num_groups, Vec::new()); + accumulate( + group_indices, + values, + opt_filter, + |group_index, new_value| { + self.group_values[group_index].push(new_value); + }, + ); + + Ok(()) + } + + fn merge_batch( + &mut self, + values: &[ArrayRef], + group_indices: &[usize], + // Since aggregate filter should be applied in partial stage, in final stage there should be no filter + _opt_filter: Option<&BooleanArray>, + total_num_groups: usize, + ) -> Result<()> { + assert_eq!(values.len(), 1, "one argument to merge_batch"); + + let input_group_values = values[0].as_list::(); + + // Ensure group values big enough + self.group_values.resize(total_num_groups, Vec::new()); + + // Extend values to related groups + group_indices + .iter() + .zip(input_group_values.iter()) + .for_each(|(&group_index, values_opt)| { + if let Some(values) = values_opt { + let values = values.as_primitive::(); + self.group_values[group_index].extend(values.values().iter()); + } + }); + + Ok(()) + } + + fn state(&mut self, emit_to: EmitTo) -> Result> { + // Emit values + let emit_group_values = emit_to.take_needed(&mut self.group_values); + + // Build offsets + let mut offsets = Vec::with_capacity(self.group_values.len() + 1); + offsets.push(0); + let mut cur_len = 0_i32; + for group_value in &emit_group_values { + cur_len += group_value.len() as i32; + offsets.push(cur_len); + } + let offsets = OffsetBuffer::new(ScalarBuffer::from(offsets)); + + // Build inner array + let flatten_group_values = + emit_group_values.into_iter().flatten().collect::>(); + let group_values_array = + PrimitiveArray::::new(ScalarBuffer::from(flatten_group_values), None) + .with_data_type(self.data_type.clone()); + + // Build the result list array + let result_list_array = ListArray::new( + Arc::new(Field::new_list_field(self.data_type.clone(), 
true)), + offsets, + Arc::new(group_values_array), + None, + ); + + Ok(vec![Arc::new(result_list_array)]) + } + + fn evaluate(&mut self, emit_to: EmitTo) -> Result { + // Emit values + let emit_group_values = emit_to.take_needed(&mut self.group_values); + + // Calculate percentile for each group + let mut evaluate_result_builder = + PrimitiveBuilder::::new().with_data_type(self.data_type.clone()); + for values in emit_group_values { + let value = calculate_percentile::(values, self.percentile); + evaluate_result_builder.append_option(value); + } + + Ok(Arc::new(evaluate_result_builder.finish())) + } + + fn convert_to_state( + &self, + values: &[ArrayRef], + opt_filter: Option<&BooleanArray>, + ) -> Result> { + assert_eq!(values.len(), 1, "one argument to merge_batch"); + + // Cast to target type if needed (e.g., integer to Float64) + let values_array = if values[0].data_type() != &self.data_type { + arrow::compute::cast(&values[0], &self.data_type)? + } else { + Arc::clone(&values[0]) + }; + + let input_array = values_array.as_primitive::(); + + // Directly convert the input array to states, each row will be + // seen as a respective group. + // For detail, the `input_array` will be converted to a `ListArray`. + // And if row is `not null + not filtered`, it will be converted to a list + // with only one element; otherwise, this row in `ListArray` will be set + // to null. 
+ + // Reuse values buffer in `input_array` to build `values` in `ListArray` + let values = PrimitiveArray::::new(input_array.values().clone(), None) + .with_data_type(self.data_type.clone()); + + // `offsets` in `ListArray`, each row as a list element + let offset_end = i32::try_from(input_array.len()).map_err(|e| { + internal_datafusion_err!( + "cast array_len to i32 failed in convert_to_state of group percentile_cont, err:{e:?}" + ) + })?; + let offsets = (0..=offset_end).collect::>(); + // Safety: The offsets vector is constructed as a sequential range from 0 to input_array.len(), + // which guarantees all OffsetBuffer invariants: + // 1. Offsets are monotonically increasing (each element is prev + 1) + // 2. No offset exceeds the values array length (max offset = input_array.len()) + // 3. First offset is 0 and last offset equals the total length + // Therefore new_unchecked is safe to use here. + let offsets = unsafe { OffsetBuffer::new_unchecked(ScalarBuffer::from(offsets)) }; + + // `nulls` for converted `ListArray` + let nulls = filtered_null_mask(opt_filter, input_array); + + let converted_list_array = ListArray::new( + Arc::new(Field::new_list_field(self.data_type.clone(), true)), + offsets, + Arc::new(values), + nulls, + ); + + Ok(vec![Arc::new(converted_list_array)]) + } + + fn supports_convert_to_state(&self) -> bool { + true + } + + fn size(&self) -> usize { + self.group_values + .iter() + .map(|values| values.capacity() * size_of::()) + .sum::() + // account for size of self.group_values too + + self.group_values.capacity() * size_of::>() + } +} + +/// The distinct percentile_cont accumulator accumulates the raw input values +/// using a HashSet to eliminate duplicates. +/// +/// The intermediate state is represented as a List of scalar values updated by +/// `merge_batch` and a `Vec` of `ArrayRef` that are converted to scalar values +/// in the final evaluation step so that we avoid expensive conversions and +/// allocations during `update_batch`. 
+struct DistinctPercentileContAccumulator { + data_type: DataType, + distinct_values: HashSet>, + percentile: f64, +} + +impl Debug for DistinctPercentileContAccumulator { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!( + f, + "DistinctPercentileContAccumulator({}, percentile={})", + self.data_type, self.percentile + ) + } +} + +impl Accumulator for DistinctPercentileContAccumulator { + fn state(&mut self) -> Result> { + let all_values = self + .distinct_values + .iter() + .map(|x| ScalarValue::new_primitive::(Some(x.0), &self.data_type)) + .collect::>>()?; + + let arr = ScalarValue::new_list_nullable(&all_values, &self.data_type); + Ok(vec![ScalarValue::List(arr)]) + } + + fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> { + if values.is_empty() { + return Ok(()); + } + + // Cast to target type if needed (e.g., integer to Float64) + let values = if values[0].data_type() != &self.data_type { + arrow::compute::cast(&values[0], &self.data_type)? + } else { + Arc::clone(&values[0]) + }; + + let array = values.as_primitive::(); + match array.nulls().filter(|x| x.null_count() > 0) { + Some(n) => { + for idx in n.valid_indices() { + self.distinct_values.insert(Hashable(array.value(idx))); + } + } + None => array.values().iter().for_each(|x| { + self.distinct_values.insert(Hashable(*x)); + }), + } + Ok(()) + } + + fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> { + let array = states[0].as_list::(); + for v in array.iter().flatten() { + self.update_batch(&[v])? + } + Ok(()) + } + + fn evaluate(&mut self) -> Result { + let d = std::mem::take(&mut self.distinct_values) + .into_iter() + .map(|v| v.0) + .collect::>(); + let value = calculate_percentile::(d, self.percentile); + ScalarValue::new_primitive::(value, &self.data_type) + } + + fn size(&self) -> usize { + size_of_val(self) + self.distinct_values.capacity() * size_of::() + } +} + +/// Calculate the percentile value for a given set of values. 
+/// This function performs an exact calculation by sorting all values. +/// +/// The percentile is calculated using linear interpolation between closest ranks. +/// For percentile p and n values: +/// - If p * (n-1) is an integer, return the value at that position +/// - Otherwise, interpolate between the two closest values +fn calculate_percentile( + mut values: Vec, + percentile: f64, +) -> Option { + let cmp = |x: &T::Native, y: &T::Native| x.compare(*y); + + let len = values.len(); + if len == 0 { + None + } else if len == 1 { + Some(values[0]) + } else if percentile == 0.0 { + // Get minimum value + Some( + *values + .iter() + .min_by(|a, b| cmp(a, b)) + .expect("we checked for len > 0 a few lines above"), + ) + } else if percentile == 1.0 { + // Get maximum value + Some( + *values + .iter() + .max_by(|a, b| cmp(a, b)) + .expect("we checked for len > 0 a few lines above"), + ) + } else { + // Calculate the index using the formula: p * (n - 1) + let index = percentile * ((len - 1) as f64); + let lower_index = index.floor() as usize; + let upper_index = index.ceil() as usize; + + if lower_index == upper_index { + // Exact index, return the value at that position + let (_, value, _) = values.select_nth_unstable_by(lower_index, cmp); + Some(*value) + } else { + // Need to interpolate between two values + // First, partition at lower_index to get the lower value + let (_, lower_value, _) = values.select_nth_unstable_by(lower_index, cmp); + let lower_value = *lower_value; + + // Then partition at upper_index to get the upper value + let (_, upper_value, _) = values.select_nth_unstable_by(upper_index, cmp); + let upper_value = *upper_value; + + // Linear interpolation using wrapping arithmetic + // We use wrapping operations here (matching the approach in median.rs) because: + // 1. Both values come from the input data, so diff is bounded by the value range + // 2. 
fraction is between 0 and 1, and INTERPOLATION_PRECISION is small enough + // to prevent overflow when combined with typical numeric ranges + // 3. The result is guaranteed to be between lower_value and upper_value + // 4. For floating-point types, wrapping ops behave the same as standard ops + let fraction = index - (lower_index as f64); + let diff = upper_value.sub_wrapping(lower_value); + let interpolated = lower_value.add_wrapping( + diff.mul_wrapping(T::Native::usize_as( + (fraction * INTERPOLATION_PRECISION as f64) as usize, + )) + .div_wrapping(T::Native::usize_as(INTERPOLATION_PRECISION)), + ); + Some(interpolated) + } + } +} diff --git a/datafusion/functions-aggregate/src/utils.rs b/datafusion/functions-aggregate/src/utils.rs new file mode 100644 index 0000000000000..c058b64f95727 --- /dev/null +++ b/datafusion/functions-aggregate/src/utils.rs @@ -0,0 +1,72 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::sync::Arc; + +use arrow::array::RecordBatch; +use arrow::datatypes::Schema; +use datafusion_common::{internal_err, plan_err, DataFusionError, Result, ScalarValue}; +use datafusion_expr::ColumnarValue; +use datafusion_physical_expr_common::physical_expr::PhysicalExpr; + +/// Evaluates a physical expression to extract its scalar value. +/// +/// This is used to extract constant values from expressions (like percentile parameters) +/// by evaluating them against an empty record batch. +pub(crate) fn get_scalar_value(expr: &Arc) -> Result { + let empty_schema = Arc::new(Schema::empty()); + let batch = RecordBatch::new_empty(Arc::clone(&empty_schema)); + if let ColumnarValue::Scalar(s) = expr.evaluate(&batch)? { + Ok(s) + } else { + internal_err!("Didn't expect ColumnarValue::Array") + } +} + +/// Validates that a percentile expression is a literal float value between 0.0 and 1.0. +/// +/// Used by both `percentile_cont` and `approx_percentile_cont` to validate their +/// percentile parameters. +pub(crate) fn validate_percentile_expr( + expr: &Arc, + fn_name: &str, +) -> Result { + let scalar_value = get_scalar_value(expr).map_err(|_e| { + DataFusionError::Plan(format!( + "Percentile value for '{fn_name}' must be a literal" + )) + })?; + + let percentile = match scalar_value { + ScalarValue::Float32(Some(value)) => value as f64, + ScalarValue::Float64(Some(value)) => value, + sv => { + return plan_err!( + "Percentile value for '{fn_name}' must be Float32 or Float64 literal (got data type {})", + sv.data_type() + ) + } + }; + + // Ensure the percentile is between 0 and 1. 
+ if !(0.0..=1.0).contains(&percentile) { + return plan_err!( + "Percentile value must be between 0.0 and 1.0 inclusive, {percentile} is invalid" + ); + } + Ok(percentile) +} diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt index 9d6c7b11add6d..a5973afc0a93d 100644 --- a/datafusion/sqllogictest/test_files/aggregate.slt +++ b/datafusion/sqllogictest/test_files/aggregate.slt @@ -144,7 +144,7 @@ statement error Failed to coerce arguments to satisfy a call to 'approx_percenti SELECT approx_percentile_cont_with_weight(c2, c1) WITHIN GROUP (ORDER BY c3) FROM aggregate_test_100 # csv_query_approx_percentile_cont_with_histogram_bins -statement error DataFusion error: This feature is not implemented: Tdigest max_size value for 'APPROX_PERCENTILE_CONT' must be UInt > 0 literal \(got data type Int64\)\. +statement error DataFusion error: Error during planning: Tdigest max_size value for 'APPROX_PERCENTILE_CONT' must be UInt > 0 literal \(got data type Int64\)\. 
SELECT c1, approx_percentile_cont(0.95, -1000) WITHIN GROUP (ORDER BY c3) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 statement error Failed to coerce arguments to satisfy a call to 'approx_percentile_cont' function @@ -156,10 +156,10 @@ SELECT approx_percentile_cont(0.95, 111.1) WITHIN GROUP (ORDER BY c3) FROM aggre statement error DataFusion error: Error during planning: Failed to coerce arguments to satisfy a call to 'approx_percentile_cont' function: coercion from Float64, Float64, Float64 to the signature OneOf(.*) failed(.|\n)* SELECT approx_percentile_cont(0.95, 111.1) WITHIN GROUP (ORDER BY c12) FROM aggregate_test_100 -statement error DataFusion error: This feature is not implemented: Percentile value for 'APPROX_PERCENTILE_CONT' must be a literal +statement error DataFusion error: Error during planning: Percentile value for 'APPROX_PERCENTILE_CONT' must be a literal SELECT approx_percentile_cont(c12) WITHIN GROUP (ORDER BY c12) FROM aggregate_test_100 -statement error DataFusion error: This feature is not implemented: Tdigest max_size value for 'APPROX_PERCENTILE_CONT' must be a literal +statement error DataFusion error: Error during planning: Tdigest max_size value for 'APPROX_PERCENTILE_CONT' must be a literal SELECT approx_percentile_cont(0.95, c5) WITHIN GROUP (ORDER BY c12) FROM aggregate_test_100 statement error DataFusion error: Error during planning: \[IGNORE | RESPECT\] NULLS are not permitted for approx_percentile_cont @@ -3356,6 +3356,342 @@ c 4 d 4 e 4 +##################### +## percentile_cont tests (exact percentile calculation) +##################### + +# Test error conditions for percentile_cont +statement error DataFusion error: Error during planning: Percentile value must be between 0.0 and 1.0 inclusive +SELECT percentile_cont(1.5) WITHIN GROUP (ORDER BY c3) FROM aggregate_test_100 + +statement error DataFusion error: Error during planning: Percentile value must be between 0.0 and 1.0 inclusive +SELECT percentile_cont(-0.1) 
WITHIN GROUP (ORDER BY c3) FROM aggregate_test_100 + +statement error DataFusion error: Error during planning: Percentile value for 'PERCENTILE_CONT' must be a literal +SELECT percentile_cont(c2) WITHIN GROUP (ORDER BY c3) FROM aggregate_test_100 + +statement error DataFusion error: Error during planning: \[IGNORE | RESPECT\] NULLS are not permitted for percentile_cont +SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY c3) IGNORE NULLS FROM aggregate_test_100 + +statement error DataFusion error: Error during planning: \[IGNORE | RESPECT\] NULLS are not permitted for percentile_cont +SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY c3) RESPECT NULLS FROM aggregate_test_100 + +statement error DataFusion error: This feature is not implemented: Only a single ordering expression is permitted in a WITHIN GROUP clause +SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY c3, c2) FROM aggregate_test_100 + +# Not supported over sliding windows +query error DataFusion error: Error during planning: OVER and WITHIN GROUP clause cannot be used together +SELECT percentile_cont(0.5) +WITHIN GROUP (ORDER BY c3) +OVER (ROWS BETWEEN 4 PRECEDING AND CURRENT ROW) +FROM aggregate_test_100 + +# Test basic percentile_cont with WITHIN GROUP syntax +query R +SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY c2) FROM aggregate_test_100 +---- +3 + +query R +SELECT percentile_cont(0.0) WITHIN GROUP (ORDER BY c2) FROM aggregate_test_100 +---- +1 + +query R +SELECT percentile_cont(1.0) WITHIN GROUP (ORDER BY c2) FROM aggregate_test_100 +---- +5 + +query R +SELECT percentile_cont(0.25) WITHIN GROUP (ORDER BY c2) FROM aggregate_test_100 +---- +2 + +query R +SELECT percentile_cont(0.75) WITHIN GROUP (ORDER BY c2) FROM aggregate_test_100 +---- +4 + +# Test that percentile_cont(0.5) equals median +query I +SELECT median(c2) FROM aggregate_test_100 +---- +3 + +query R +SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY c2) FROM aggregate_test_100 +---- +3 + +# Test with descending order +query R 
+SELECT percentile_cont(0.95) WITHIN GROUP (ORDER BY c3 DESC) FROM aggregate_test_100 +---- +-101.25 + +query R +SELECT percentile_cont(0.05) WITHIN GROUP (ORDER BY c3 DESC) FROM aggregate_test_100 +---- +118.099998 + +# Test with GROUP BY +query TR +SELECT c1, percentile_cont(0.5) WITHIN GROUP (ORDER BY c3) FROM aggregate_test_100 GROUP BY c1 ORDER BY c1 +---- +a -25 +b 17 +c 1 +d 46.5 +e 64 + +query TR +SELECT c1, percentile_cont(0.95) WITHIN GROUP (ORDER BY c3) FROM aggregate_test_100 GROUP BY c1 ORDER BY c1 +---- +a 65 +b 68 +c 118 +d 123.299998 +e 112 + +# Test with NULLs +query R +SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY v) FROM (VALUES (1), (2), (3), (NULL), (NULL), (NULL)) as t (v) +---- +2 + +# Test with all NULLs +query R +SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY v) FROM (VALUES (CAST(NULL as INT))) as t (v) +---- +NULL + +# Test with empty set +query R +SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY v) FROM (VALUES (1)) as t (v) WHERE v > 10 +---- +NULL + +# Test with single value +query R +SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY v) FROM (VALUES (42)) as t (v) +---- +42 + +# Test with float values for interpolation +query R +SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY v) FROM (VALUES (1.0), (2.0), (3.0), (4.0)) as t (v) +---- +2.5 + +query R +SELECT percentile_cont(0.25) WITHIN GROUP (ORDER BY v) FROM (VALUES (1.0), (2.0), (3.0), (4.0)) as t (v) +---- +1.75 + +query R +SELECT percentile_cont(0.75) WITHIN GROUP (ORDER BY v) FROM (VALUES (1.0), (2.0), (3.0), (4.0)) as t (v) +---- +3.25 + +# Test with various numeric types +query R +SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY c7) FROM aggregate_test_100 +---- +134.5 + +query R +SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY c8) FROM aggregate_test_100 +---- +30634 + +query R +SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY c11) FROM aggregate_test_100 +---- +0.4906719 + +# Test edge case with two values (tests interpolation) +query R +SELECT 
percentile_cont(0.5) WITHIN GROUP (ORDER BY v) FROM (VALUES (10.0), (20.0)) as t (v) +---- +15 + +query R +SELECT percentile_cont(0.25) WITHIN GROUP (ORDER BY v) FROM (VALUES (10.0), (20.0)) as t (v) +---- +12.5 + +query R +SELECT percentile_cont(0.75) WITHIN GROUP (ORDER BY v) FROM (VALUES (10.0), (20.0)) as t (v) +---- +17.5 + +# Test integer inputs requiring interpolation (should return float) +query R +SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY v) FROM (VALUES (1), (2), (3), (4)) as t (v) +---- +2.5 + +query R +SELECT percentile_cont(0.25) WITHIN GROUP (ORDER BY v) FROM (VALUES (1), (2), (3), (4)) as t (v) +---- +1.75 + +query R +SELECT percentile_cont(0.75) WITHIN GROUP (ORDER BY v) FROM (VALUES (1), (2), (3), (4)) as t (v) +---- +3.25 + +# Test with exact percentile values (no interpolation needed) +query R +SELECT percentile_cont(0.0) WITHIN GROUP (ORDER BY v) FROM (VALUES (1), (2), (3), (4), (5)) as t (v) +---- +1 + +query R +SELECT percentile_cont(0.25) WITHIN GROUP (ORDER BY v) FROM (VALUES (1), (2), (3), (4), (5)) as t (v) +---- +2 + +query R +SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY v) FROM (VALUES (1), (2), (3), (4), (5)) as t (v) +---- +3 + +query R +SELECT percentile_cont(0.75) WITHIN GROUP (ORDER BY v) FROM (VALUES (1), (2), (3), (4), (5)) as t (v) +---- +4 + +query R +SELECT percentile_cont(1.0) WITHIN GROUP (ORDER BY v) FROM (VALUES (1), (2), (3), (4), (5)) as t (v) +---- +5 + +# Test with negative numbers +query R +SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY v) FROM (VALUES (-10), (-5), (0), (5), (10)) as t (v) +---- +0 + +query R +SELECT percentile_cont(0.25) WITHIN GROUP (ORDER BY v) FROM (VALUES (-10), (-5), (0), (5), (10)) as t (v) +---- +-5 + +query R +SELECT percentile_cont(0.75) WITHIN GROUP (ORDER BY v) FROM (VALUES (-10), (-5), (0), (5), (10)) as t (v) +---- +5 + +# Test comparison: percentile_cont should give exact results +query R +SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY c3) FROM aggregate_test_100 
+---- +15.5 + +# Compare with approx_percentile_cont (should be close but may not be exact) +query B +SELECT ABS(percentile_cont(0.5) WITHIN GROUP (ORDER BY c3) - approx_percentile_cont(0.5) WITHIN GROUP (ORDER BY c3)) < 5 FROM aggregate_test_100 +---- +true + +# Test percentile_cont without WITHIN GROUP clause (alternate syntax) +query R +SELECT percentile_cont(c2, 0.5) FROM aggregate_test_100 +---- +3 + +query R +SELECT percentile_cont(c2, 0.0) FROM aggregate_test_100 +---- +1 + +query R +SELECT percentile_cont(c2, 1.0) FROM aggregate_test_100 +---- +5 + +query R +SELECT percentile_cont(c2, 0.25) FROM aggregate_test_100 +---- +2 + +query R +SELECT percentile_cont(c2, 0.75) FROM aggregate_test_100 +---- +4 + +# Verify alternate syntax gives same results as WITHIN GROUP syntax +query B +SELECT percentile_cont(c2, 0.5) = percentile_cont(0.5) WITHIN GROUP (ORDER BY c2) FROM aggregate_test_100 +---- +true + +query B +SELECT percentile_cont(c3, 0.5) = percentile_cont(0.5) WITHIN GROUP (ORDER BY c3) FROM aggregate_test_100 +---- +true + +# Test alternate syntax with GROUP BY +query TR +SELECT c1, percentile_cont(c3, 0.5) FROM aggregate_test_100 GROUP BY c1 ORDER BY c1 +---- +a -25 +b 17 +c 1 +d 46.5 +e 64 + +# Verify alternate syntax with GROUP BY gives same results as WITHIN GROUP +query TB +SELECT c1, percentile_cont(c3, 0.95) = percentile_cont(0.95) WITHIN GROUP (ORDER BY c3) FROM aggregate_test_100 GROUP BY c1 ORDER BY c1 +---- +a true +b true +c true +d true +e true + +# Test ascending vs descending equivalence: percentile_cont(0.4) ASC should equal percentile_cont(0.6) DESC +# This tests the mathematical property that the pth percentile ascending = (1-p)th percentile descending +# Using a simple controlled dataset to demonstrate the property + +# Show 0.4 ascending +query R +SELECT percentile_cont(0.4) WITHIN GROUP (ORDER BY v) FROM (VALUES (1), (2), (3), (4), (5)) as t (v) +---- +2.6 + +# Show 0.6 descending (should be same as 0.4 ascending) +query R +SELECT 
percentile_cont(0.6) WITHIN GROUP (ORDER BY v DESC) FROM (VALUES (1), (2), (3), (4), (5)) as t (v) +---- +2.6 + +# Show 0.3 ascending +query R +SELECT percentile_cont(0.3) WITHIN GROUP (ORDER BY v) FROM (VALUES (10), (20), (30), (40), (50)) as t (v) +---- +21.99999 + +# Show 0.7 descending (should be same as 0.3 ascending) +query R +SELECT percentile_cont(0.7) WITHIN GROUP (ORDER BY v DESC) FROM (VALUES (10), (20), (30), (40), (50)) as t (v) +---- +22 + +# Show 0.25 ascending on larger dataset +query R +SELECT percentile_cont(0.25) WITHIN GROUP (ORDER BY v) FROM (VALUES (1), (2), (3), (4), (5), (6), (7), (8)) as t (v) +---- +2.75 + +# Show 0.75 descending (should be same as 0.25 ascending) +query R +SELECT percentile_cont(0.75) WITHIN GROUP (ORDER BY v DESC) FROM (VALUES (1), (2), (3), (4), (5), (6), (7), (8)) as t (v) +---- +2.75 + # array_agg_zero query ? SELECT ARRAY_AGG([]) diff --git a/docs/source/user-guide/sql/aggregate_functions.md b/docs/source/user-guide/sql/aggregate_functions.md index 205962031b1d0..f17e09f2ce9d0 100644 --- a/docs/source/user-guide/sql/aggregate_functions.md +++ b/docs/source/user-guide/sql/aggregate_functions.md @@ -65,6 +65,8 @@ Note: When no rows pass the filter, `COUNT` returns `0` while `SUM`/`AVG`/`MIN`/ - [mean](#mean) - [median](#median) - [min](#min) +- [percentile_cont](#percentile_cont) +- [quantile_cont](#quantile_cont) - [string_agg](#string_agg) - [sum](#sum) - [var](#var) @@ -388,6 +390,49 @@ min(expression) +----------------------+ ``` +### `percentile_cont` + +Returns the exact percentile of input values, interpolating between values if needed. + +```sql +percentile_cont(percentile) WITHIN GROUP (ORDER BY expression) +``` + +#### Arguments + +- **expression**: The expression to operate on. Can be a constant, column, or function, and any combination of operators. +- **percentile**: Percentile to compute. Must be a float value between 0 and 1 (inclusive). 
+ +#### Example + +```sql +> SELECT percentile_cont(0.75) WITHIN GROUP (ORDER BY column_name) FROM table_name; ++----------------------------------------------------------+ +| percentile_cont(0.75) WITHIN GROUP (ORDER BY column_name) | ++----------------------------------------------------------+ +| 45.5 | ++----------------------------------------------------------+ +``` + +An alternate syntax is also supported: + +```sql +> SELECT percentile_cont(column_name, 0.75) FROM table_name; ++---------------------------------------+ +| percentile_cont(column_name, 0.75) | ++---------------------------------------+ +| 45.5 | ++---------------------------------------+ +``` + +#### Aliases + +- quantile_cont + +### `quantile_cont` + +_Alias of [percentile_cont](#percentile_cont)._ + ### `string_agg` Concatenates the values of string expressions and places separator values between them. If ordering is required, strings are concatenated in the specified order. This aggregation function can only mix DISTINCT and ORDER BY if the ordering expression is exactly the same as the first argument expression. 
From 621a24978a7a9c6d2b27973d1853dbc8776a56b5 Mon Sep 17 00:00:00 2001 From: Jeffrey Vo Date: Fri, 17 Oct 2025 17:09:46 +1100 Subject: [PATCH 018/109] fix: Re-bump latest datafusion-testing module so extended tests succeed (#18110) Looks like #17988 accidentally reverted the bump from #18096 --- datafusion-testing | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion-testing b/datafusion-testing index 905df5f65cc9d..eccb0e4a42634 160000 --- a/datafusion-testing +++ b/datafusion-testing @@ -1 +1 @@ -Subproject commit 905df5f65cc9d0851719c21f5a4dd5cd77621f19 +Subproject commit eccb0e4a426344ef3faf534cd60e02e9c3afd3ac From ffe64e3103b037750aa927057a17984f8bf0bf7d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 17 Oct 2025 20:04:54 +1100 Subject: [PATCH 019/109] chore(deps): bump taiki-e/install-action from 2.62.31 to 2.62.33 (#18113) Bumps [taiki-e/install-action](https://github.com/taiki-e/install-action) from 2.62.31 to 2.62.33.
Release notes

Sourced from taiki-e/install-action's releases.

2.62.33

  • Update mise@latest to 2025.10.10.

2.62.32

  • Update syft@latest to 1.34.2.

  • Update vacuum@latest to 0.18.7.

Changelog

Sourced from taiki-e/install-action's changelog.

Changelog

All notable changes to this project will be documented in this file.

This project adheres to Semantic Versioning.

[Unreleased]

[2.62.33] - 2025-10-17

  • Update mise@latest to 2025.10.10.

[2.62.32] - 2025-10-16

  • Update syft@latest to 1.34.2.

  • Update vacuum@latest to 0.18.7.

[2.62.31] - 2025-10-16

  • Update protoc@latest to 3.33.0.

  • Update uv@latest to 0.9.3.

  • Update syft@latest to 1.34.1.

  • Update mise@latest to 2025.10.9.

  • Update cargo-shear@latest to 1.6.0.

[2.62.30] - 2025-10-15

  • Update vacuum@latest to 0.18.6.

  • Update zizmor@latest to 1.15.2.

[2.62.29] - 2025-10-14

  • Update zizmor@latest to 1.15.1.

  • Update cargo-nextest@latest to 0.9.106.

  • Update mise@latest to 2025.10.8.

  • Update ubi@latest to 0.8.1.

... (truncated)

Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=taiki-e/install-action&package-manager=github_actions&previous-version=2.62.31&new-version=2.62.33)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot merge` will merge this PR after your CI passes on it - `@dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@dependabot cancel merge` will cancel a previously requested merge and block automerging - `@dependabot reopen` will reopen this PR if it is closed - `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/audit.yml | 2 +- .github/workflows/rust.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/audit.yml b/.github/workflows/audit.yml index 00bfa1e1b285f..98e6c35ada3b4 100644 --- a/.github/workflows/audit.yml +++ b/.github/workflows/audit.yml @@ -42,7 +42,7 @@ jobs: steps: - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Install cargo-audit - uses: taiki-e/install-action@0005e0116e92d8489d8d96fbff83f061c79ba95a # v2.62.31 + uses: taiki-e/install-action@e43a5023a747770bfcb71ae048541a681714b951 # v2.62.33 with: tool: cargo-audit - name: Run audit check diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 9fa033fce646f..09be2f2ad9e4a 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -412,7 +412,7 @@ jobs: sudo apt-get update -qq sudo apt-get install -y -qq clang - name: Setup wasm-pack - uses: taiki-e/install-action@0005e0116e92d8489d8d96fbff83f061c79ba95a # v2.62.31 + uses: taiki-e/install-action@e43a5023a747770bfcb71ae048541a681714b951 # v2.62.33 with: tool: wasm-pack - name: Run tests with headless mode @@ -739,7 +739,7 @@ jobs: - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - name: Install cargo-msrv - uses: taiki-e/install-action@0005e0116e92d8489d8d96fbff83f061c79ba95a # v2.62.31 + uses: taiki-e/install-action@e43a5023a747770bfcb71ae048541a681714b951 # v2.62.33 with: tool: cargo-msrv From 0ae9fdcb0da0cfaf180848115705fe3a9b7de343 Mon Sep 17 00:00:00 2001 From: Enrico La Sala Date: Fri, 17 Oct 2025 11:29:46 +0200 Subject: [PATCH 020/109] Adding hiop as known user (#18114) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Which issue does this PR close? - Doesn't close an issue. 
## Rationale for this change Hi, we are hiop, a Serverless Data Logistic Platform. We use DataFusion as a core part of our backend engine, and it plays a crucial role in our data infrastructure. Our team members are passionate about the project and actively try to contribute to its development (@dariocurr). We’d love to have Hiop listed among the Known Users to show our support and help the DataFusion community continue to grow. ## What changes are included in this PR? Just adding hiop as known user ## Are these changes tested? ## Are there any user-facing changes? --- docs/source/user-guide/introduction.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/user-guide/introduction.md b/docs/source/user-guide/introduction.md index dc4825dc06dfb..778562d55ffcb 100644 --- a/docs/source/user-guide/introduction.md +++ b/docs/source/user-guide/introduction.md @@ -109,6 +109,7 @@ Here are some active projects using DataFusion: - [Funnel](https://funnel.io/) Data Platform powering Marketing Intelligence applications. - [GlareDB](https://github.com/GlareDB/glaredb) Fast SQL database for querying and analyzing distributed data. - [GreptimeDB] Open Source & Cloud Native Distributed Time Series Database +- [hiop](https://hiop.io) Serverless Data Logistic Platform - [HoraeDB] Distributed Time-Series Database - [Iceberg-rust](https://github.com/apache/iceberg-rust) Rust implementation of Apache Iceberg - [InfluxDB] Time Series Database From a9ecd683060ae019fe198a09911b46dee384d9d5 Mon Sep 17 00:00:00 2001 From: Jeffrey Vo Date: Fri, 17 Oct 2025 21:37:42 +1100 Subject: [PATCH 021/109] chore: remove unnecessary `skip_failed_rules` config in slt (#18117) ## Which issue does this PR close? - Closes #3695 - Closes #3797 ## Rationale for this change Was looking at above issues and I don't believe we skip the failed rules for any tests anymore (default for the config is also `false`), apart from this cleanup, so filing this PR so we can close the issues. 
Seems we only do in this `window.slt` test after this fix: https://github.com/apache/datafusion/blob/621a24978a7a9c6d2b27973d1853dbc8776a56b5/datafusion/sqllogictest/test_files/window.slt#L2587-L2611 Which seems intentional. ## What changes are included in this PR? Remove unnecessary `skip_failed_rules` config. ## Are these changes tested? Existing tests. ## Are there any user-facing changes? No. --- datafusion/sqllogictest/test_files/timestamps.slt | 8 -------- 1 file changed, 8 deletions(-) diff --git a/datafusion/sqllogictest/test_files/timestamps.slt b/datafusion/sqllogictest/test_files/timestamps.slt index 1a7ff41d64a66..38b599260de19 100644 --- a/datafusion/sqllogictest/test_files/timestamps.slt +++ b/datafusion/sqllogictest/test_files/timestamps.slt @@ -643,11 +643,7 @@ select date '1994-01-01' - interval '1' day as date; ---- 1993-12-31 - # cast_string_to_time() -statement ok -set datafusion.optimizer.skip_failed_rules = false - query DDDD select time '08:09:10.123456789' as time_nano, @@ -668,10 +664,6 @@ SELECT TIME '24:01:02' as time; query error Arrow error: Parser error: Invalid timezone "ZZ": failed to parse timezone SELECT TIMESTAMP '2023-12-05T21:58:10.45ZZ'; -statement ok -set datafusion.optimizer.skip_failed_rules = true - - # cast_to_timestamp_twice query P select to_timestamp(a) from (select to_timestamp(1) as a) A; From fe955058ac779dbf00e2e04ebd721aa2951a6537 Mon Sep 17 00:00:00 2001 From: Dmitrii Blaginin Date: Fri, 17 Oct 2025 11:38:30 +0100 Subject: [PATCH 022/109] move repartition to insta (#18106) Related https://github.com/apache/datafusion/pull/16324 https://github.com/apache/datafusion/pull/16617 almost there! 
--- .../physical-plan/src/repartition/mod.rs | 60 ++++++++----------- 1 file changed, 26 insertions(+), 34 deletions(-) diff --git a/datafusion/physical-plan/src/repartition/mod.rs b/datafusion/physical-plan/src/repartition/mod.rs index a5bf68a63c387..dafde268ba737 100644 --- a/datafusion/physical-plan/src/repartition/mod.rs +++ b/datafusion/physical-plan/src/repartition/mod.rs @@ -1782,16 +1782,12 @@ mod test { /// `$PLAN`: the plan to optimized /// macro_rules! assert_plan { - ($EXPECTED_PLAN_LINES: expr, $PLAN: expr) => { + ($PLAN: expr, @ $EXPECTED: expr) => { let formatted = crate::displayable($PLAN).indent(true).to_string(); - let actual: Vec<&str> = formatted.trim().lines().collect(); - let expected_plan_lines: Vec<&str> = $EXPECTED_PLAN_LINES - .iter().map(|s| *s).collect(); - - assert_eq!( - expected_plan_lines, actual, - "\n**Original Plan Mismatch\n\nexpected:\n\n{expected_plan_lines:#?}\nactual:\n\n{actual:#?}\n\n" + insta::assert_snapshot!( + formatted, + @$EXPECTED ); }; } @@ -1808,13 +1804,12 @@ mod test { .with_preserve_order(); // Repartition should preserve order - let expected_plan = [ - "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2, preserve_order=true, sort_exprs=c0@0 ASC", - " UnionExec", - " DataSourceExec: partitions=1, partition_sizes=[0], output_ordering=c0@0 ASC", - " DataSourceExec: partitions=1, partition_sizes=[0], output_ordering=c0@0 ASC", - ]; - assert_plan!(expected_plan, &exec); + assert_plan!(&exec, @r" + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2, preserve_order=true, sort_exprs=c0@0 ASC + UnionExec + DataSourceExec: partitions=1, partition_sizes=[0], output_ordering=c0@0 ASC + DataSourceExec: partitions=1, partition_sizes=[0], output_ordering=c0@0 ASC + "); Ok(()) } @@ -1824,16 +1819,15 @@ mod test { let sort_exprs = sort_exprs(&schema); let source = sorted_memory_exec(&schema, sort_exprs); // output is sorted, but has only a single partition, so no need to sort - let exec = 
RepartitionExec::try_new(source, Partitioning::RoundRobinBatch(10)) - .unwrap() + let exec = RepartitionExec::try_new(source, Partitioning::RoundRobinBatch(10))? .with_preserve_order(); // Repartition should not preserve order - let expected_plan = [ - "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: partitions=1, partition_sizes=[0], output_ordering=c0@0 ASC", - ]; - assert_plan!(expected_plan, &exec); + assert_plan!(&exec, @r" + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: partitions=1, partition_sizes=[0], output_ordering=c0@0 ASC + "); + Ok(()) } @@ -1848,13 +1842,12 @@ mod test { .with_preserve_order(); // Repartition should not preserve order, as there is no order to preserve - let expected_plan = [ - "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2", - " UnionExec", - " DataSourceExec: partitions=1, partition_sizes=[0]", - " DataSourceExec: partitions=1, partition_sizes=[0]", - ]; - assert_plan!(expected_plan, &exec); + assert_plan!(&exec, @r" + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2 + UnionExec + DataSourceExec: partitions=1, partition_sizes=[0] + DataSourceExec: partitions=1, partition_sizes=[0] + "); Ok(()) } @@ -1869,11 +1862,10 @@ mod test { .unwrap(); // Repartition should not preserve order - let expected_plan = [ - "RepartitionExec: partitioning=RoundRobinBatch(20), input_partitions=1", - " DataSourceExec: partitions=1, partition_sizes=[0], output_ordering=c0@0 ASC", - ]; - assert_plan!(expected_plan, exec.as_ref()); + assert_plan!(exec.as_ref(), @r" + RepartitionExec: partitioning=RoundRobinBatch(20), input_partitions=1 + DataSourceExec: partitions=1, partition_sizes=[0], output_ordering=c0@0 ASC + "); Ok(()) } From 3272ebe9989fb7fca2bbf9954658e197ed6a58cb Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 17 Oct 2025 06:43:54 -0400 Subject: [PATCH 023/109] refactor: move ListingTable over to the 
catalog-listing-table crate (#18080) ## Which issue does this PR close? - This addresses part of https://github.com/apache/datafusion/issues/17713 - Closes https://github.com/apache/datafusion/issues/14462 ## Rationale for this change In order to remove the `datafusion` core crate from `proto` as a dependency, we need to access `ListingTable` but it is within the `core` crate. There already exists a `datafusion-catalog-listing` which is bare and appears to be the place this should exist. ## What changes are included in this PR? Move `ListingTable` and some of its dependent structs over to the `datafusion-catalog-listing` crate. There is one dependency I wasn't able to remove from the `core` crate, which is inferring the listing table configuration options. That is because within this method it downcasts `Session` to `SessionState`. If a downstream user ever attempts to implement `Session` themselves, these methods also would not work. Because it would cause a circular dependency, we cannot also lift the method we need out of `SessionState` to `Session`. Instead I took the approach of splitting off the two methods that require `SessionState` as an extension trait for the listing table config. From the git diff this appears to be a large change (+1637/-1519) however the *vast* majority of that is copying the code from one file into another. I have added a comment on the significant change. ## Are these changes tested? Existing unit tests show no regression. This is just a code refactor. ## Are there any user-facing changes? Users may need to update their use paths. 
--- Cargo.lock | 3 + .../examples/custom_file_casts.rs | 2 +- .../examples/json_shredding.rs | 2 +- datafusion/catalog-listing/Cargo.toml | 6 + datafusion/catalog-listing/src/config.rs | 360 ++++ datafusion/catalog-listing/src/mod.rs | 7 + datafusion/catalog-listing/src/options.rs | 411 +++++ datafusion/catalog-listing/src/table.rs | 788 ++++++++ .../core/src/datasource/dynamic_file.rs | 1 + datafusion/core/src/datasource/listing/mod.rs | 3 +- .../core/src/datasource/listing/table.rs | 1596 +---------------- datafusion/core/tests/catalog/memory.rs | 2 +- .../core/tests/parquet/schema_adapter.rs | 4 +- 13 files changed, 1657 insertions(+), 1528 deletions(-) create mode 100644 datafusion/catalog-listing/src/config.rs create mode 100644 datafusion/catalog-listing/src/options.rs create mode 100644 datafusion/catalog-listing/src/table.rs diff --git a/Cargo.lock b/Cargo.lock index 7b09121595d67..0392c8147ad2c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1913,12 +1913,15 @@ dependencies = [ "datafusion-catalog", "datafusion-common", "datafusion-datasource", + "datafusion-datasource-parquet", "datafusion-execution", "datafusion-expr", "datafusion-physical-expr", + "datafusion-physical-expr-adapter", "datafusion-physical-expr-common", "datafusion-physical-plan", "futures", + "itertools 0.14.0", "log", "object_store", "tokio", diff --git a/datafusion-examples/examples/custom_file_casts.rs b/datafusion-examples/examples/custom_file_casts.rs index 65ca096820640..4d97ecd91dc64 100644 --- a/datafusion-examples/examples/custom_file_casts.rs +++ b/datafusion-examples/examples/custom_file_casts.rs @@ -25,7 +25,7 @@ use datafusion::common::not_impl_err; use datafusion::common::tree_node::{Transformed, TransformedResult, TreeNode}; use datafusion::common::{Result, ScalarValue}; use datafusion::datasource::listing::{ - ListingTable, ListingTableConfig, ListingTableUrl, + ListingTable, ListingTableConfig, ListingTableConfigExt, ListingTableUrl, }; use 
datafusion::execution::context::SessionContext; use datafusion::execution::object_store::ObjectStoreUrl; diff --git a/datafusion-examples/examples/json_shredding.rs b/datafusion-examples/examples/json_shredding.rs index c7d0146a001f7..a2e83bc9510ab 100644 --- a/datafusion-examples/examples/json_shredding.rs +++ b/datafusion-examples/examples/json_shredding.rs @@ -27,7 +27,7 @@ use datafusion::common::tree_node::{ }; use datafusion::common::{assert_contains, exec_datafusion_err, Result}; use datafusion::datasource::listing::{ - ListingTable, ListingTableConfig, ListingTableUrl, + ListingTable, ListingTableConfig, ListingTableConfigExt, ListingTableUrl, }; use datafusion::execution::context::SessionContext; use datafusion::execution::object_store::ObjectStoreUrl; diff --git a/datafusion/catalog-listing/Cargo.toml b/datafusion/catalog-listing/Cargo.toml index 69f952ae98407..4eaeed675a206 100644 --- a/datafusion/catalog-listing/Cargo.toml +++ b/datafusion/catalog-listing/Cargo.toml @@ -39,14 +39,17 @@ datafusion-datasource = { workspace = true } datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } datafusion-physical-expr = { workspace = true } +datafusion-physical-expr-adapter = { workspace = true } datafusion-physical-expr-common = { workspace = true } datafusion-physical-plan = { workspace = true } futures = { workspace = true } +itertools = { workspace = true } log = { workspace = true } object_store = { workspace = true } tokio = { workspace = true } [dev-dependencies] +datafusion-datasource-parquet = { workspace = true } [lints] workspace = true @@ -54,3 +57,6 @@ workspace = true [lib] name = "datafusion_catalog_listing" path = "src/mod.rs" + +[package.metadata.cargo-machete] +ignored = ["datafusion-datasource-parquet"] diff --git a/datafusion/catalog-listing/src/config.rs b/datafusion/catalog-listing/src/config.rs new file mode 100644 index 0000000000000..90f44de4fdbc8 --- /dev/null +++ b/datafusion/catalog-listing/src/config.rs @@ 
-0,0 +1,360 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::options::ListingOptions; +use arrow::datatypes::{DataType, Schema, SchemaRef}; +use datafusion_catalog::Session; +use datafusion_common::{config_err, internal_err}; +use datafusion_datasource::file_compression_type::FileCompressionType; +use datafusion_datasource::schema_adapter::SchemaAdapterFactory; +use datafusion_datasource::ListingTableUrl; +use datafusion_physical_expr_adapter::PhysicalExprAdapterFactory; +use std::str::FromStr; +use std::sync::Arc; + +/// Indicates the source of the schema for a [`crate::ListingTable`] +// PartialEq required for assert_eq! in tests +#[derive(Debug, Clone, Copy, PartialEq, Default)] +pub enum SchemaSource { + /// Schema is not yet set (initial state) + #[default] + Unset, + /// Schema was inferred from first table_path + Inferred, + /// Schema was specified explicitly via with_schema + Specified, +} + +/// Configuration for creating a [`crate::ListingTable`] +/// +/// # Schema Evolution Support +/// +/// This configuration supports schema evolution through the optional +/// [`SchemaAdapterFactory`]. 
You might want to override the default factory when you need: +/// +/// - **Type coercion requirements**: When you need custom logic for converting between +/// different Arrow data types (e.g., Int32 ↔ Int64, Utf8 ↔ LargeUtf8) +/// - **Column mapping**: You need to map columns with a legacy name to a new name +/// - **Custom handling of missing columns**: By default they are filled in with nulls, but you may e.g. want to fill them in with `0` or `""`. +/// +/// If not specified, a [`datafusion_datasource::schema_adapter::DefaultSchemaAdapterFactory`] +/// will be used, which handles basic schema compatibility cases. +/// +#[derive(Debug, Clone, Default)] +pub struct ListingTableConfig { + /// Paths on the `ObjectStore` for creating [`crate::ListingTable`]. + /// They should share the same schema and object store. + pub table_paths: Vec, + /// Optional `SchemaRef` for the to be created [`crate::ListingTable`]. + /// + /// See details on [`ListingTableConfig::with_schema`] + pub file_schema: Option, + /// Optional [`ListingOptions`] for the to be created [`crate::ListingTable`]. + /// + /// See details on [`ListingTableConfig::with_listing_options`] + pub options: Option, + /// Tracks the source of the schema information + pub(crate) schema_source: SchemaSource, + /// Optional [`SchemaAdapterFactory`] for creating schema adapters + pub(crate) schema_adapter_factory: Option>, + /// Optional [`PhysicalExprAdapterFactory`] for creating physical expression adapters + pub(crate) expr_adapter_factory: Option>, +} + +impl ListingTableConfig { + /// Creates new [`ListingTableConfig`] for reading the specified URL + pub fn new(table_path: ListingTableUrl) -> Self { + Self { + table_paths: vec![table_path], + ..Default::default() + } + } + + /// Creates new [`ListingTableConfig`] with multiple table paths. 
+ /// + /// See `ListingTableConfigExt::infer_options` for details on what happens with multiple paths + pub fn new_with_multi_paths(table_paths: Vec) -> Self { + Self { + table_paths, + ..Default::default() + } + } + + /// Returns the source of the schema for this configuration + pub fn schema_source(&self) -> SchemaSource { + self.schema_source + } + /// Set the `schema` for the overall [`crate::ListingTable`] + /// + /// [`crate::ListingTable`] will automatically coerce, when possible, the schema + /// for individual files to match this schema. + /// + /// If a schema is not provided, it is inferred using + /// [`Self::infer_schema`]. + /// + /// If the schema is provided, it must contain only the fields in the file + /// without the table partitioning columns. + /// + /// # Example: Specifying Table Schema + /// ```rust + /// # use std::sync::Arc; + /// # use datafusion_catalog_listing::{ListingTableConfig, ListingOptions}; + /// # use datafusion_datasource::ListingTableUrl; + /// # use datafusion_datasource_parquet::file_format::ParquetFormat; + /// # use arrow::datatypes::{Schema, Field, DataType}; + /// # let table_paths = ListingTableUrl::parse("file:///path/to/data").unwrap(); + /// # let listing_options = ListingOptions::new(Arc::new(ParquetFormat::default())); + /// let schema = Arc::new(Schema::new(vec![ + /// Field::new("id", DataType::Int64, false), + /// Field::new("name", DataType::Utf8, true), + /// ])); + /// + /// let config = ListingTableConfig::new(table_paths) + /// .with_listing_options(listing_options) // Set options first + /// .with_schema(schema); // Then set schema + /// ``` + pub fn with_schema(self, schema: SchemaRef) -> Self { + // Note: We preserve existing options state, but downstream code may expect + // options to be set. Consider calling with_listing_options() or infer_options() + // before operations that require options to be present. 
+ debug_assert!( + self.options.is_some() || cfg!(test), + "ListingTableConfig::with_schema called without options set. \ + Consider calling with_listing_options() or infer_options() first to avoid panics in downstream code." + ); + + Self { + file_schema: Some(schema), + schema_source: SchemaSource::Specified, + ..self + } + } + + /// Add `listing_options` to [`ListingTableConfig`] + /// + /// If not provided, format and other options are inferred via + /// `ListingTableConfigExt::infer_options`. + /// + /// # Example: Configuring Parquet Files with Custom Options + /// ```rust + /// # use std::sync::Arc; + /// # use datafusion_catalog_listing::{ListingTableConfig, ListingOptions}; + /// # use datafusion_datasource::ListingTableUrl; + /// # use datafusion_datasource_parquet::file_format::ParquetFormat; + /// # let table_paths = ListingTableUrl::parse("file:///path/to/data").unwrap(); + /// let options = ListingOptions::new(Arc::new(ParquetFormat::default())) + /// .with_file_extension(".parquet") + /// .with_collect_stat(true); + /// + /// let config = ListingTableConfig::new(table_paths) + /// .with_listing_options(options); // Configure file format and options + /// ``` + pub fn with_listing_options(self, listing_options: ListingOptions) -> Self { + // Note: This method properly sets options, but be aware that downstream + // methods like infer_schema() and try_new() require both schema and options + // to be set to function correctly. + debug_assert!( + !self.table_paths.is_empty() || cfg!(test), + "ListingTableConfig::with_listing_options called without table_paths set. \ + Consider calling new() or new_with_multi_paths() first to establish table paths." 
+ ); + + Self { + options: Some(listing_options), + ..self + } + } + + /// Returns a tuple of `(file_extension, optional compression_extension)` + /// + /// For example a path ending with blah.test.csv.gz returns `("csv", Some("gz"))` + /// For example a path ending with blah.test.csv returns `("csv", None)` + pub fn infer_file_extension_and_compression_type( + path: &str, + ) -> datafusion_common::Result<(String, Option)> { + let mut exts = path.rsplit('.'); + + let split = exts.next().unwrap_or(""); + + let file_compression_type = FileCompressionType::from_str(split) + .unwrap_or(FileCompressionType::UNCOMPRESSED); + + if file_compression_type.is_compressed() { + let split2 = exts.next().unwrap_or(""); + Ok((split2.to_string(), Some(split.to_string()))) + } else { + Ok((split.to_string(), None)) + } + } + + /// Infer the [`SchemaRef`] based on `table_path`s. + /// + /// This method infers the table schema using the first `table_path`. + /// See [`ListingOptions::infer_schema`] for more details + /// + /// # Errors + /// * if `self.options` is not set. 
See [`Self::with_listing_options`] + pub async fn infer_schema( + self, + state: &dyn Session, + ) -> datafusion_common::Result { + match self.options { + Some(options) => { + let ListingTableConfig { + table_paths, + file_schema, + options: _, + schema_source, + schema_adapter_factory, + expr_adapter_factory: physical_expr_adapter_factory, + } = self; + + let (schema, new_schema_source) = match file_schema { + Some(schema) => (schema, schema_source), // Keep existing source if schema exists + None => { + if let Some(url) = table_paths.first() { + ( + options.infer_schema(state, url).await?, + SchemaSource::Inferred, + ) + } else { + (Arc::new(Schema::empty()), SchemaSource::Inferred) + } + } + }; + + Ok(Self { + table_paths, + file_schema: Some(schema), + options: Some(options), + schema_source: new_schema_source, + schema_adapter_factory, + expr_adapter_factory: physical_expr_adapter_factory, + }) + } + None => internal_err!("No `ListingOptions` set for inferring schema"), + } + } + + /// Infer the partition columns from `table_paths`. + /// + /// # Errors + /// * if `self.options` is not set. See [`Self::with_listing_options`] + pub async fn infer_partitions_from_path( + self, + state: &dyn Session, + ) -> datafusion_common::Result { + match self.options { + Some(options) => { + let Some(url) = self.table_paths.first() else { + return config_err!("No table path found"); + }; + let partitions = options + .infer_partitions(state, url) + .await? 
+ .into_iter() + .map(|col_name| { + ( + col_name, + DataType::Dictionary( + Box::new(DataType::UInt16), + Box::new(DataType::Utf8), + ), + ) + }) + .collect::>(); + let options = options.with_table_partition_cols(partitions); + Ok(Self { + table_paths: self.table_paths, + file_schema: self.file_schema, + options: Some(options), + schema_source: self.schema_source, + schema_adapter_factory: self.schema_adapter_factory, + expr_adapter_factory: self.expr_adapter_factory, + }) + } + None => config_err!("No `ListingOptions` set for inferring schema"), + } + } + + /// Set the [`SchemaAdapterFactory`] for the [`crate::ListingTable`] + /// + /// The schema adapter factory is used to create schema adapters that can + /// handle schema evolution and type conversions when reading files with + /// different schemas than the table schema. + /// + /// If not provided, a default schema adapter factory will be used. + /// + /// # Example: Custom Schema Adapter for Type Coercion + /// ```rust + /// # use std::sync::Arc; + /// # use datafusion_catalog_listing::{ListingTableConfig, ListingOptions}; + /// # use datafusion_datasource::schema_adapter::{SchemaAdapterFactory, SchemaAdapter}; + /// # use datafusion_datasource::ListingTableUrl; + /// # use datafusion_datasource_parquet::file_format::ParquetFormat; + /// # use arrow::datatypes::{SchemaRef, Schema, Field, DataType}; + /// # + /// # #[derive(Debug)] + /// # struct MySchemaAdapterFactory; + /// # impl SchemaAdapterFactory for MySchemaAdapterFactory { + /// # fn create(&self, _projected_table_schema: SchemaRef, _file_schema: SchemaRef) -> Box { + /// # unimplemented!() + /// # } + /// # } + /// # let table_paths = ListingTableUrl::parse("file:///path/to/data").unwrap(); + /// # let listing_options = ListingOptions::new(Arc::new(ParquetFormat::default())); + /// # let table_schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int64, false)])); + /// let config = ListingTableConfig::new(table_paths) + /// 
.with_listing_options(listing_options) + /// .with_schema(table_schema) + /// .with_schema_adapter_factory(Arc::new(MySchemaAdapterFactory)); + /// ``` + pub fn with_schema_adapter_factory( + self, + schema_adapter_factory: Arc, + ) -> Self { + Self { + schema_adapter_factory: Some(schema_adapter_factory), + ..self + } + } + + /// Get the [`SchemaAdapterFactory`] for this configuration + pub fn schema_adapter_factory(&self) -> Option<&Arc> { + self.schema_adapter_factory.as_ref() + } + + /// Set the [`PhysicalExprAdapterFactory`] for the [`crate::ListingTable`] + /// + /// The expression adapter factory is used to create physical expression adapters that can + /// handle schema evolution and type conversions when evaluating expressions + /// with different schemas than the table schema. + /// + /// If not provided, a default physical expression adapter factory will be used unless a custom + /// `SchemaAdapterFactory` is set, in which case only the `SchemaAdapterFactory` will be used. + /// + /// See for details on this transition. 
+ pub fn with_expr_adapter_factory( + self, + expr_adapter_factory: Arc, + ) -> Self { + Self { + expr_adapter_factory: Some(expr_adapter_factory), + ..self + } + } +} diff --git a/datafusion/catalog-listing/src/mod.rs b/datafusion/catalog-listing/src/mod.rs index 1322577b207ab..90d04b46b8067 100644 --- a/datafusion/catalog-listing/src/mod.rs +++ b/datafusion/catalog-listing/src/mod.rs @@ -24,4 +24,11 @@ // https://github.com/apache/datafusion/issues/11143 #![cfg_attr(not(test), deny(clippy::clone_on_ref_ptr))] +mod config; pub mod helpers; +mod options; +mod table; + +pub use config::{ListingTableConfig, SchemaSource}; +pub use options::ListingOptions; +pub use table::ListingTable; diff --git a/datafusion/catalog-listing/src/options.rs b/datafusion/catalog-listing/src/options.rs new file mode 100644 index 0000000000000..3cbf3573e9519 --- /dev/null +++ b/datafusion/catalog-listing/src/options.rs @@ -0,0 +1,411 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use arrow::datatypes::{DataType, SchemaRef}; +use datafusion_catalog::Session; +use datafusion_common::plan_err; +use datafusion_datasource::file_format::FileFormat; +use datafusion_datasource::ListingTableUrl; +use datafusion_execution::config::SessionConfig; +use datafusion_expr::SortExpr; +use futures::StreamExt; +use futures::{future, TryStreamExt}; +use itertools::Itertools; +use std::sync::Arc; + +/// Options for creating a [`crate::ListingTable`] +#[derive(Clone, Debug)] +pub struct ListingOptions { + /// A suffix on which files should be filtered (leave empty to + /// keep all files on the path) + pub file_extension: String, + /// The file format + pub format: Arc, + /// The expected partition column names in the folder structure. + /// See [Self::with_table_partition_cols] for details + pub table_partition_cols: Vec<(String, DataType)>, + /// Set true to try to guess statistics from the files. + /// This can add a lot of overhead as it will usually require files + /// to be opened and at least partially parsed. + pub collect_stat: bool, + /// Group files to avoid that the number of partitions exceeds + /// this limit + pub target_partitions: usize, + /// Optional pre-known sort order(s). Must be `SortExpr`s. + /// + /// DataFusion may take advantage of this ordering to omit sorts + /// or use more efficient algorithms. Currently sortedness must be + /// provided if it is known by some external mechanism, but may in + /// the future be automatically determined, for example using + /// parquet metadata. + /// + /// See + /// + /// NOTE: This attribute stores all equivalent orderings (the outer `Vec`) + /// where each ordering consists of an individual lexicographic + /// ordering (encapsulated by a `Vec`). If there aren't + /// multiple equivalent orderings, the outer `Vec` will have a + /// single element. 
+ pub file_sort_order: Vec>, +} + +impl ListingOptions { + /// Creates an options instance with the given format + /// Default values: + /// - use default file extension filter + /// - no input partition to discover + /// - one target partition + /// - do not collect statistics + pub fn new(format: Arc) -> Self { + Self { + file_extension: format.get_ext(), + format, + table_partition_cols: vec![], + collect_stat: false, + target_partitions: 1, + file_sort_order: vec![], + } + } + + /// Set options from [`SessionConfig`] and returns self. + /// + /// Currently this sets `target_partitions` and `collect_stat` + /// but if more options are added in the future that need to be coordinated + /// they will be synchronized through this method. + pub fn with_session_config_options(mut self, config: &SessionConfig) -> Self { + self = self.with_target_partitions(config.target_partitions()); + self = self.with_collect_stat(config.collect_statistics()); + self + } + + /// Set file extension on [`ListingOptions`] and returns self. + /// + /// # Example + /// ``` + /// # use std::sync::Arc; + /// # use datafusion_catalog_listing::ListingOptions; + /// # use datafusion_datasource_parquet::file_format::ParquetFormat; + /// + /// let listing_options = ListingOptions::new(Arc::new( + /// ParquetFormat::default() + /// )) + /// .with_file_extension(".parquet"); + /// + /// assert_eq!(listing_options.file_extension, ".parquet"); + /// ``` + pub fn with_file_extension(mut self, file_extension: impl Into) -> Self { + self.file_extension = file_extension.into(); + self + } + + /// Optionally set file extension on [`ListingOptions`] and returns self. 
+ /// + /// If `file_extension` is `None`, the file extension will not be changed + /// + /// # Example + /// ``` + /// # use std::sync::Arc; + /// # use datafusion_catalog_listing::ListingOptions; + /// # use datafusion_datasource_parquet::file_format::ParquetFormat; + /// + /// let extension = Some(".parquet"); + /// let listing_options = ListingOptions::new(Arc::new( + /// ParquetFormat::default() + /// )) + /// .with_file_extension_opt(extension); + /// + /// assert_eq!(listing_options.file_extension, ".parquet"); + /// ``` + pub fn with_file_extension_opt(mut self, file_extension: Option) -> Self + where + S: Into, + { + if let Some(file_extension) = file_extension { + self.file_extension = file_extension.into(); + } + self + } + + /// Set `table partition columns` on [`ListingOptions`] and returns self. + /// + /// "partition columns," used to support [Hive Partitioning], are + /// columns added to the data that is read, based on the folder + /// structure where the data resides. + /// + /// For example, give the following files in your filesystem: + /// + /// ```text + /// /mnt/nyctaxi/year=2022/month=01/tripdata.parquet + /// /mnt/nyctaxi/year=2021/month=12/tripdata.parquet + /// /mnt/nyctaxi/year=2021/month=11/tripdata.parquet + /// ``` + /// + /// A [`crate::ListingTable`] created at `/mnt/nyctaxi/` with partition + /// columns "year" and "month" will include new `year` and `month` + /// columns while reading the files. The `year` column would have + /// value `2022` and the `month` column would have value `01` for + /// the rows read from + /// `/mnt/nyctaxi/year=2022/month=01/tripdata.parquet` + /// + ///# Notes + /// + /// - If only one level (e.g. `year` in the example above) is + /// specified, the other levels are ignored but the files are + /// still read. + /// + /// - Files that don't follow this partitioning scheme will be + /// ignored. 
+ /// + /// - Since the columns have the same value for all rows read from + /// each individual file (such as dates), they are typically + /// dictionary encoded for efficiency. You may use + /// [`wrap_partition_type_in_dict`] to request a + /// dictionary-encoded type. + /// + /// - The partition columns are solely extracted from the file path. Especially they are NOT part of the parquet files itself. + /// + /// # Example + /// + /// ``` + /// # use std::sync::Arc; + /// # use arrow::datatypes::DataType; + /// # use datafusion_expr::col; + /// # use datafusion_catalog_listing::ListingOptions; + /// # use datafusion_datasource_parquet::file_format::ParquetFormat; + /// + /// // listing options for files with paths such as `/mnt/data/col_a=x/col_b=y/data.parquet` + /// // `col_a` and `col_b` will be included in the data read from those files + /// let listing_options = ListingOptions::new(Arc::new( + /// ParquetFormat::default() + /// )) + /// .with_table_partition_cols(vec![("col_a".to_string(), DataType::Utf8), + /// ("col_b".to_string(), DataType::Utf8)]); + /// + /// assert_eq!(listing_options.table_partition_cols, vec![("col_a".to_string(), DataType::Utf8), + /// ("col_b".to_string(), DataType::Utf8)]); + /// ``` + /// + /// [Hive Partitioning]: https://docs.cloudera.com/HDPDocuments/HDP2/HDP-2.1.3/bk_system-admin-guide/content/hive_partitioned_tables.html + /// [`wrap_partition_type_in_dict`]: datafusion_datasource::file_scan_config::wrap_partition_type_in_dict + pub fn with_table_partition_cols( + mut self, + table_partition_cols: Vec<(String, DataType)>, + ) -> Self { + self.table_partition_cols = table_partition_cols; + self + } + + /// Set stat collection on [`ListingOptions`] and returns self. 
+ /// + /// ``` + /// # use std::sync::Arc; + /// # use datafusion_catalog_listing::ListingOptions; + /// # use datafusion_datasource_parquet::file_format::ParquetFormat; + /// + /// let listing_options = ListingOptions::new(Arc::new( + /// ParquetFormat::default() + /// )) + /// .with_collect_stat(true); + /// + /// assert_eq!(listing_options.collect_stat, true); + /// ``` + pub fn with_collect_stat(mut self, collect_stat: bool) -> Self { + self.collect_stat = collect_stat; + self + } + + /// Set number of target partitions on [`ListingOptions`] and returns self. + /// + /// ``` + /// # use std::sync::Arc; + /// # use datafusion_catalog_listing::ListingOptions; + /// # use datafusion_datasource_parquet::file_format::ParquetFormat; + /// + /// let listing_options = ListingOptions::new(Arc::new( + /// ParquetFormat::default() + /// )) + /// .with_target_partitions(8); + /// + /// assert_eq!(listing_options.target_partitions, 8); + /// ``` + pub fn with_target_partitions(mut self, target_partitions: usize) -> Self { + self.target_partitions = target_partitions; + self + } + + /// Set file sort order on [`ListingOptions`] and returns self. + /// + /// ``` + /// # use std::sync::Arc; + /// # use datafusion_expr::col; + /// # use datafusion_catalog_listing::ListingOptions; + /// # use datafusion_datasource_parquet::file_format::ParquetFormat; + /// + /// // Tell datafusion that the files are sorted by column "a" + /// let file_sort_order = vec![vec![ + /// col("a").sort(true, true) + /// ]]; + /// + /// let listing_options = ListingOptions::new(Arc::new( + /// ParquetFormat::default() + /// )) + /// .with_file_sort_order(file_sort_order.clone()); + /// + /// assert_eq!(listing_options.file_sort_order, file_sort_order); + /// ``` + pub fn with_file_sort_order(mut self, file_sort_order: Vec>) -> Self { + self.file_sort_order = file_sort_order; + self + } + + /// Infer the schema of the files at the given path on the provided object store. 
+ /// + /// If the table_path contains one or more files (i.e. it is a directory / + /// prefix of files) their schema is merged by calling [`FileFormat::infer_schema`] + /// + /// Note: The inferred schema does not include any partitioning columns. + /// + /// This method is called as part of creating a [`crate::ListingTable`]. + pub async fn infer_schema<'a>( + &'a self, + state: &dyn Session, + table_path: &'a ListingTableUrl, + ) -> datafusion_common::Result { + let store = state.runtime_env().object_store(table_path)?; + + let files: Vec<_> = table_path + .list_all_files(state, store.as_ref(), &self.file_extension) + .await? + // Empty files cannot affect schema but may throw when trying to read for it + .try_filter(|object_meta| future::ready(object_meta.size > 0)) + .try_collect() + .await?; + + let schema = self.format.infer_schema(state, &store, &files).await?; + + Ok(schema) + } + + /// Infers the partition columns stored in `LOCATION` and compares + /// them with the columns provided in `PARTITIONED BY` to help prevent + /// accidental corrupts of partitioned tables. + /// + /// Allows specifying partial partitions. + pub async fn validate_partitions( + &self, + state: &dyn Session, + table_path: &ListingTableUrl, + ) -> datafusion_common::Result<()> { + if self.table_partition_cols.is_empty() { + return Ok(()); + } + + if !table_path.is_collection() { + return plan_err!( + "Can't create a partitioned table backed by a single file, \ + perhaps the URL is missing a trailing slash?" 
+ ); + } + + let inferred = self.infer_partitions(state, table_path).await?; + + // no partitioned files found on disk + if inferred.is_empty() { + return Ok(()); + } + + let table_partition_names = self + .table_partition_cols + .iter() + .map(|(col_name, _)| col_name.clone()) + .collect_vec(); + + if inferred.len() < table_partition_names.len() { + return plan_err!( + "Inferred partitions to be {:?}, but got {:?}", + inferred, + table_partition_names + ); + } + + // match prefix to allow creating tables with partial partitions + for (idx, col) in table_partition_names.iter().enumerate() { + if &inferred[idx] != col { + return plan_err!( + "Inferred partitions to be {:?}, but got {:?}", + inferred, + table_partition_names + ); + } + } + + Ok(()) + } + + /// Infer the partitioning at the given path on the provided object store. + /// For performance reasons, it doesn't read all the files on disk + /// and therefore may fail to detect invalid partitioning. + pub async fn infer_partitions( + &self, + state: &dyn Session, + table_path: &ListingTableUrl, + ) -> datafusion_common::Result> { + let store = state.runtime_env().object_store(table_path)?; + + // only use 10 files for inference + // This can fail to detect inconsistent partition keys + // A DFS traversal approach of the store can help here + let files: Vec<_> = table_path + .list_all_files(state, store.as_ref(), &self.file_extension) + .await? 
+ .take(10) + .try_collect() + .await?; + + let stripped_path_parts = files.iter().map(|file| { + table_path + .strip_prefix(&file.location) + .unwrap() + .collect_vec() + }); + + let partition_keys = stripped_path_parts + .map(|path_parts| { + path_parts + .into_iter() + .rev() + .skip(1) // get parents only; skip the file itself + .rev() + // Partitions are expected to follow the format "column_name=value", so we + // should ignore any path part that cannot be parsed into the expected format + .filter(|s| s.contains('=')) + .map(|s| s.split('=').take(1).collect()) + .collect_vec() + }) + .collect_vec(); + + match partition_keys.into_iter().all_equal_value() { + Ok(v) => Ok(v), + Err(None) => Ok(vec![]), + Err(Some(diff)) => { + let mut sorted_diff = [diff.0, diff.1]; + sorted_diff.sort(); + plan_err!("Found mixed partition values on disk {:?}", sorted_diff) + } + } + } +} diff --git a/datafusion/catalog-listing/src/table.rs b/datafusion/catalog-listing/src/table.rs new file mode 100644 index 0000000000000..e9ac1bf097a22 --- /dev/null +++ b/datafusion/catalog-listing/src/table.rs @@ -0,0 +1,788 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::config::SchemaSource; +use crate::helpers::{expr_applicable_for_cols, pruned_partition_list}; +use crate::{ListingOptions, ListingTableConfig}; +use arrow::datatypes::{Field, Schema, SchemaBuilder, SchemaRef}; +use async_trait::async_trait; +use datafusion_catalog::{ScanArgs, ScanResult, Session, TableProvider}; +use datafusion_common::stats::Precision; +use datafusion_common::{ + internal_datafusion_err, plan_err, project_schema, Constraints, DataFusionError, + SchemaExt, Statistics, +}; +use datafusion_datasource::file::FileSource; +use datafusion_datasource::file_groups::FileGroup; +use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder}; +use datafusion_datasource::file_sink_config::FileSinkConfig; +use datafusion_datasource::schema_adapter::{ + DefaultSchemaAdapterFactory, SchemaAdapter, SchemaAdapterFactory, +}; +use datafusion_datasource::{ + compute_all_files_statistics, ListingTableUrl, PartitionedFile, +}; +use datafusion_execution::cache::cache_manager::FileStatisticsCache; +use datafusion_execution::cache::cache_unit::DefaultFileStatisticsCache; +use datafusion_expr::dml::InsertOp; +use datafusion_expr::execution_props::ExecutionProps; +use datafusion_expr::{Expr, TableProviderFilterPushDown, TableType}; +use datafusion_physical_expr::create_lex_ordering; +use datafusion_physical_expr_adapter::PhysicalExprAdapterFactory; +use datafusion_physical_expr_common::sort_expr::LexOrdering; +use datafusion_physical_plan::empty::EmptyExec; +use datafusion_physical_plan::ExecutionPlan; +use futures::{future, stream, Stream, StreamExt, TryStreamExt}; +use object_store::ObjectStore; +use std::any::Any; +use std::collections::HashMap; +use std::sync::Arc; + +/// Built in [`TableProvider`] that reads data from one or more files as a single table. +/// +/// The files are read using an [`ObjectStore`] instance, for example from +/// local files or objects from AWS S3. 
+/// +/// # Features: +/// * Reading multiple files as a single table +/// * Hive style partitioning (e.g., directories named `date=2024-06-01`) +/// * Merges schemas from files with compatible but not identical schemas (see [`ListingTableConfig::file_schema`]) +/// * `limit`, `filter` and `projection` pushdown for formats that support it (e.g., +/// Parquet) +/// * Statistics collection and pruning based on file metadata +/// * Pre-existing sort order (see [`ListingOptions::file_sort_order`]) +/// * Metadata caching to speed up repeated queries (see [`FileMetadataCache`]) +/// * Statistics caching (see [`FileStatisticsCache`]) +/// +/// [`FileMetadataCache`]: datafusion_execution::cache::cache_manager::FileMetadataCache +/// +/// # Reading Directories and Hive Style Partitioning +/// +/// For example, given the `table1` directory (or object store prefix) +/// +/// ```text +/// table1 +/// ├── file1.parquet +/// └── file2.parquet +/// ``` +/// +/// A `ListingTable` would read the files `file1.parquet` and `file2.parquet` as +/// a single table, merging the schemas if the files have compatible but not +/// identical schemas. +/// +/// Given the `table2` directory (or object store prefix) +/// +/// ```text +/// table2 +/// ├── date=2024-06-01 +/// │ ├── file3.parquet +/// │ └── file4.parquet +/// └── date=2024-06-02 +/// └── file5.parquet +/// ``` +/// +/// A `ListingTable` would read the files `file3.parquet`, `file4.parquet`, and +/// `file5.parquet` as a single table, again merging schemas if necessary. +/// +/// Given the hive style partitioning structure (e.g,. directories named +/// `date=2024-06-01` and `date=2026-06-02`), `ListingTable` also adds a `date` +/// column when reading the table: +/// * The files in `table2/date=2024-06-01` will have the value `2024-06-01` +/// * The files in `table2/date=2024-06-02` will have the value `2024-06-02`. 
+/// +/// If the query has a predicate like `WHERE date = '2024-06-01'` +/// only the corresponding directory will be read. +/// +/// # See Also +/// +/// 1. [`ListingTableConfig`]: Configuration options +/// 1. [`DataSourceExec`]: `ExecutionPlan` used by `ListingTable` +/// +/// [`DataSourceExec`]: datafusion_datasource::source::DataSourceExec +/// +/// # Caching Metadata +/// +/// Some formats, such as Parquet, use the `FileMetadataCache` to cache file +/// metadata that is needed to execute but expensive to read, such as row +/// groups and statistics. The cache is scoped to the `SessionContext` and can +/// be configured via the [runtime config options]. +/// +/// [runtime config options]: https://datafusion.apache.org/user-guide/configs.html#runtime-configuration-settings +/// +/// # Example: Read a directory of parquet files using a [`ListingTable`] +/// +/// ```no_run +/// # use datafusion_common::Result; +/// # use std::sync::Arc; +/// # use datafusion_catalog::TableProvider; +/// # use datafusion_catalog_listing::{ListingOptions, ListingTable, ListingTableConfig}; +/// # use datafusion_datasource::ListingTableUrl; +/// # use datafusion_datasource_parquet::file_format::ParquetFormat;/// # +/// # use datafusion_catalog::Session; +/// async fn get_listing_table(session: &dyn Session) -> Result> { +/// let table_path = "/path/to/parquet"; +/// +/// // Parse the path +/// let table_path = ListingTableUrl::parse(table_path)?; +/// +/// // Create default parquet options +/// let file_format = ParquetFormat::new(); +/// let listing_options = ListingOptions::new(Arc::new(file_format)) +/// .with_file_extension(".parquet"); +/// +/// // Resolve the schema +/// let resolved_schema = listing_options +/// .infer_schema(session, &table_path) +/// .await?; +/// +/// let config = ListingTableConfig::new(table_path) +/// .with_listing_options(listing_options) +/// .with_schema(resolved_schema); +/// +/// // Create a new TableProvider +/// let provider = 
Arc::new(ListingTable::try_new(config)?); +/// +/// # Ok(provider) +/// # } +/// ``` +#[derive(Debug, Clone)] +pub struct ListingTable { + table_paths: Vec, + /// `file_schema` contains only the columns physically stored in the data files themselves. + /// - Represents the actual fields found in files like Parquet, CSV, etc. + /// - Used when reading the raw data from files + file_schema: SchemaRef, + /// `table_schema` combines `file_schema` + partition columns + /// - Partition columns are derived from directory paths (not stored in files) + /// - These are columns like "year=2022/month=01" in paths like `/data/year=2022/month=01/file.parquet` + table_schema: SchemaRef, + /// Indicates how the schema was derived (inferred or explicitly specified) + schema_source: SchemaSource, + /// Options used to configure the listing table such as the file format + /// and partitioning information + options: ListingOptions, + /// The SQL definition for this table, if any + definition: Option, + /// Cache for collected file statistics + collected_statistics: FileStatisticsCache, + /// Constraints applied to this table + constraints: Constraints, + /// Column default expressions for columns that are not physically present in the data files + column_defaults: HashMap, + /// Optional [`SchemaAdapterFactory`] for creating schema adapters + schema_adapter_factory: Option>, + /// Optional [`PhysicalExprAdapterFactory`] for creating physical expression adapters + expr_adapter_factory: Option>, +} + +impl ListingTable { + /// Create new [`ListingTable`] + /// + /// See documentation and example on [`ListingTable`] and [`ListingTableConfig`] + pub fn try_new(config: ListingTableConfig) -> datafusion_common::Result { + // Extract schema_source before moving other parts of the config + let schema_source = config.schema_source(); + + let file_schema = config + .file_schema + .ok_or_else(|| internal_datafusion_err!("No schema provided."))?; + + let options = config + .options + 
.ok_or_else(|| internal_datafusion_err!("No ListingOptions provided"))?; + + // Add the partition columns to the file schema + let mut builder = SchemaBuilder::from(file_schema.as_ref().to_owned()); + for (part_col_name, part_col_type) in &options.table_partition_cols { + builder.push(Field::new(part_col_name, part_col_type.clone(), false)); + } + + let table_schema = Arc::new( + builder + .finish() + .with_metadata(file_schema.metadata().clone()), + ); + + let table = Self { + table_paths: config.table_paths, + file_schema, + table_schema, + schema_source, + options, + definition: None, + collected_statistics: Arc::new(DefaultFileStatisticsCache::default()), + constraints: Constraints::default(), + column_defaults: HashMap::new(), + schema_adapter_factory: config.schema_adapter_factory, + expr_adapter_factory: config.expr_adapter_factory, + }; + + Ok(table) + } + + /// Assign constraints + pub fn with_constraints(mut self, constraints: Constraints) -> Self { + self.constraints = constraints; + self + } + + /// Assign column defaults + pub fn with_column_defaults( + mut self, + column_defaults: HashMap, + ) -> Self { + self.column_defaults = column_defaults; + self + } + + /// Set the [`FileStatisticsCache`] used to cache parquet file statistics. + /// + /// Setting a statistics cache on the `SessionContext` can avoid refetching statistics + /// multiple times in the same session. + /// + /// If `None`, creates a new [`DefaultFileStatisticsCache`] scoped to this query. 
+ pub fn with_cache(mut self, cache: Option) -> Self { + self.collected_statistics = + cache.unwrap_or_else(|| Arc::new(DefaultFileStatisticsCache::default())); + self + } + + /// Specify the SQL definition for this table, if any + pub fn with_definition(mut self, definition: Option) -> Self { + self.definition = definition; + self + } + + /// Get paths ref + pub fn table_paths(&self) -> &Vec { + &self.table_paths + } + + /// Get options ref + pub fn options(&self) -> &ListingOptions { + &self.options + } + + /// Get the schema source + pub fn schema_source(&self) -> SchemaSource { + self.schema_source + } + + /// Set the [`SchemaAdapterFactory`] for this [`ListingTable`] + /// + /// The schema adapter factory is used to create schema adapters that can + /// handle schema evolution and type conversions when reading files with + /// different schemas than the table schema. + /// + /// # Example: Adding Schema Evolution Support + /// ```rust + /// # use std::sync::Arc; + /// # use datafusion_catalog_listing::{ListingTable, ListingTableConfig, ListingOptions}; + /// # use datafusion_datasource::ListingTableUrl; + /// # use datafusion_datasource::schema_adapter::{DefaultSchemaAdapterFactory, SchemaAdapter}; + /// # use datafusion_datasource_parquet::file_format::ParquetFormat; + /// # use arrow::datatypes::{SchemaRef, Schema, Field, DataType}; + /// # let table_path = ListingTableUrl::parse("file:///path/to/data").unwrap(); + /// # let options = ListingOptions::new(Arc::new(ParquetFormat::default())); + /// # let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int64, false)])); + /// # let config = ListingTableConfig::new(table_path).with_listing_options(options).with_schema(schema); + /// # let table = ListingTable::try_new(config).unwrap(); + /// let table_with_evolution = table + /// .with_schema_adapter_factory(Arc::new(DefaultSchemaAdapterFactory)); + /// ``` + /// See [`ListingTableConfig::with_schema_adapter_factory`] for an example of custom 
SchemaAdapterFactory. + pub fn with_schema_adapter_factory( + self, + schema_adapter_factory: Arc, + ) -> Self { + Self { + schema_adapter_factory: Some(schema_adapter_factory), + ..self + } + } + + /// Get the [`SchemaAdapterFactory`] for this table + pub fn schema_adapter_factory(&self) -> Option<&Arc> { + self.schema_adapter_factory.as_ref() + } + + /// Creates a schema adapter for mapping between file and table schemas + /// + /// Uses the configured schema adapter factory if available, otherwise falls back + /// to the default implementation. + fn create_schema_adapter(&self) -> Box { + let table_schema = self.schema(); + match &self.schema_adapter_factory { + Some(factory) => { + factory.create_with_projected_schema(Arc::clone(&table_schema)) + } + None => DefaultSchemaAdapterFactory::from_schema(Arc::clone(&table_schema)), + } + } + + /// Creates a file source and applies schema adapter factory if available + fn create_file_source_with_schema_adapter( + &self, + ) -> datafusion_common::Result> { + let mut source = self.options.format.file_source(); + // Apply schema adapter to source if available + // + // The source will use this SchemaAdapter to adapt data batches as they flow up the plan. + // Note: ListingTable also creates a SchemaAdapter in `scan()` but that is only used to adapt collected statistics. + if let Some(factory) = &self.schema_adapter_factory { + source = source.with_schema_adapter_factory(Arc::clone(factory))?; + } + Ok(source) + } + + /// If file_sort_order is specified, creates the appropriate physical expressions + pub fn try_create_output_ordering( + &self, + execution_props: &ExecutionProps, + ) -> datafusion_common::Result> { + create_lex_ordering( + &self.table_schema, + &self.options.file_sort_order, + execution_props, + ) + } +} + +// Expressions can be used for partition pruning if they can be evaluated using +// only the partition columns and there are partition columns. 
+/// Returns `true` when `expr` is applicable to the given partition columns
+/// (as decided by `expr_applicable_for_cols`) and can therefore be used for
+/// partition pruning.
+///
+/// Always returns `false` when `partition_column_names` is empty, so tables
+/// without partition columns never classify a filter as a partition filter.
+fn can_be_evaluated_for_partition_pruning(
+    partition_column_names: &[&str],
+    expr: &Expr,
+) -> bool {
+    !partition_column_names.is_empty()
+        && expr_applicable_for_cols(partition_column_names, expr)
+}
+
+#[async_trait]
+impl TableProvider for ListingTable {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn schema(&self) -> SchemaRef {
+        Arc::clone(&self.table_schema)
+    }
+
+    fn constraints(&self) -> Option<&Constraints> {
+        Some(&self.constraints)
+    }
+
+    fn table_type(&self) -> TableType {
+        TableType::Base
+    }
+
+    /// Packs `projection`, `filters` and `limit` into [`ScanArgs`] and
+    /// delegates to [`Self::scan_with_args`], returning the inner plan.
+    async fn scan(
+        &self,
+        state: &dyn Session,
+        projection: Option<&Vec<usize>>,
+        filters: &[Expr],
+        limit: Option<usize>,
+    ) -> datafusion_common::Result<Arc<dyn ExecutionPlan>> {
+        let options = ScanArgs::default()
+            .with_projection(projection.map(|p| p.as_slice()))
+            .with_filters(Some(filters))
+            .with_limit(limit);
+        Ok(self.scan_with_args(state, options).await?.into_inner())
+    }
+
+    /// Builds the physical scan for this table.
+    ///
+    /// Filters are split into those that only need partition columns (used to
+    /// prune the file listing) and the remaining ones. The `limit` is only
+    /// forwarded to the file listing when no non-partition filters remain.
+    /// An `EmptyExec` is returned when no files are listed or when the table
+    /// has no paths.
+    async fn scan_with_args<'a>(
+        &self,
+        state: &dyn Session,
+        args: ScanArgs<'a>,
+    ) -> datafusion_common::Result<ScanResult> {
+        let projection = args.projection().map(|p| p.to_vec());
+        let filters = args.filters().map(|f| f.to_vec()).unwrap_or_default();
+        let limit = args.limit();
+
+        // extract types of partition columns
+        let table_partition_cols = self
+            .options
+            .table_partition_cols
+            .iter()
+            .map(|col| Ok(self.table_schema.field_with_name(&col.0)?.clone()))
+            .collect::<datafusion_common::Result<Vec<_>>>()?;
+
+        let table_partition_col_names = table_partition_cols
+            .iter()
+            .map(|field| field.name().as_str())
+            .collect::<Vec<_>>();
+
+        // If the filters can be resolved using only partition cols, there is no need to
+        // pushdown it to TableScan, otherwise, `unhandled` pruning predicates will be generated
+        let (partition_filters, filters): (Vec<_>, Vec<_>) =
+            filters.iter().cloned().partition(|filter| {
+                can_be_evaluated_for_partition_pruning(&table_partition_col_names, filter)
+            });
+
+        // We should not limit the number of partitioned files to scan if there are filters and limit
+        // at the same time. This is because the limit should be applied after the filters are applied.
+        let statistic_file_limit = if filters.is_empty() { limit } else { None };
+
+        let (mut partitioned_file_lists, statistics) = self
+            .list_files_for_scan(state, &partition_filters, statistic_file_limit)
+            .await?;
+
+        // if no files need to be read, return an `EmptyExec`
+        if partitioned_file_lists.is_empty() {
+            let projected_schema = project_schema(&self.schema(), projection.as_ref())?;
+            return Ok(ScanResult::new(Arc::new(EmptyExec::new(projected_schema))));
+        }
+
+        let output_ordering = self.try_create_output_ordering(state.execution_props())?;
+        // Optionally regroup the files by statistics; any failure is logged and
+        // the original grouping is kept.
+        match state
+            .config_options()
+            .execution
+            .split_file_groups_by_statistics
+            .then(|| {
+                output_ordering.first().map(|output_ordering| {
+                    FileScanConfig::split_groups_by_statistics_with_target_partitions(
+                        &self.table_schema,
+                        &partitioned_file_lists,
+                        output_ordering,
+                        self.options.target_partitions,
+                    )
+                })
+            })
+            .flatten()
+        {
+            Some(Err(e)) => log::debug!("failed to split file groups by statistics: {e}"),
+            Some(Ok(new_groups)) => {
+                if new_groups.len() <= self.options.target_partitions {
+                    partitioned_file_lists = new_groups;
+                } else {
+                    log::debug!("attempted to split file groups by statistics, but there were more file groups than target_partitions; falling back to unordered")
+                }
+            }
+            None => {} // no ordering required
+        };
+
+        let Some(object_store_url) =
+            self.table_paths.first().map(ListingTableUrl::object_store)
+        else {
+            return Ok(ScanResult::new(Arc::new(EmptyExec::new(Arc::new(
+                Schema::empty(),
+            )))));
+        };
+
+        let file_source = self.create_file_source_with_schema_adapter()?;
+
+        // create the execution plan
+        let plan = self
+            .options
+            .format
+            .create_physical_plan(
+                state,
+                FileScanConfigBuilder::new(
+                    object_store_url,
+                    Arc::clone(&self.file_schema),
+                    file_source,
+                )
+                .with_file_groups(partitioned_file_lists)
+                .with_constraints(self.constraints.clone())
+                .with_statistics(statistics)
+                .with_projection(projection)
+                .with_limit(limit)
+                .with_output_ordering(output_ordering)
+                .with_table_partition_cols(table_partition_cols)
+                .with_expr_adapter(self.expr_adapter_factory.clone())
+                .build(),
+            )
+            .await?;
+
+        Ok(ScanResult::new(plan))
+    }
+
+    /// Reports `Exact` for every filter that can be evaluated for partition
+    /// pruning and `Inexact` for all other filters.
+    fn supports_filters_pushdown(
+        &self,
+        filters: &[&Expr],
+    ) -> datafusion_common::Result<Vec<TableProviderFilterPushDown>> {
+        let partition_column_names = self
+            .options
+            .table_partition_cols
+            .iter()
+            .map(|col| col.0.as_str())
+            .collect::<Vec<_>>();
+        filters
+            .iter()
+            .map(|filter| {
+                if can_be_evaluated_for_partition_pruning(&partition_column_names, filter)
+                {
+                    // if filter can be handled by partition pruning, it is exact
+                    return Ok(TableProviderFilterPushDown::Exact);
+                }
+
+                Ok(TableProviderFilterPushDown::Inexact)
+            })
+            .collect()
+    }
+
+    fn get_table_definition(&self) -> Option<&str> {
+        self.definition.as_deref()
+    }
+
+    /// Creates a writer plan that inserts `input` into this table.
+    ///
+    /// # Errors
+    /// Returns an error when the schema of `input` is not logically equivalent
+    /// to the table schema, when the table has no paths, or when the first
+    /// table path is not a collection (a single file).
+    async fn insert_into(
+        &self,
+        state: &dyn Session,
+        input: Arc<dyn ExecutionPlan>,
+        insert_op: InsertOp,
+    ) -> datafusion_common::Result<Arc<dyn ExecutionPlan>> {
+        // Check that the schema of the plan matches the schema of this table.
+        self.schema()
+            .logically_equivalent_names_and_types(&input.schema())?;
+
+        // `scan_with_args` and `list_files_for_scan` tolerate an empty path
+        // list, so report an error here instead of panicking on `[0]`.
+        let Some(table_path) = self.table_paths().first() else {
+            return plan_err!(
+                "Inserting into a ListingTable requires at least one table path"
+            );
+        };
+        if !table_path.is_collection() {
+            return plan_err!(
+                "Inserting into a ListingTable backed by a single file is not supported, URL is possibly missing a trailing `/`. \
+                 To append to an existing file use StreamTable, e.g. by using CREATE UNBOUNDED EXTERNAL TABLE"
+            );
+        }
+
+        // Get the object store for the table path.
+        let store = state.runtime_env().object_store(table_path)?;
+
+        let file_list_stream = pruned_partition_list(
+            state,
+            store.as_ref(),
+            table_path,
+            &[],
+            &self.options.file_extension,
+            &self.options.table_partition_cols,
+        )
+        .await?;
+
+        let file_group = file_list_stream.try_collect::<Vec<_>>().await?.into();
+        let keep_partition_by_columns =
+            state.config_options().execution.keep_partition_by_columns;
+
+        // Sink related option, apart from format
+        let config = FileSinkConfig {
+            original_url: String::default(),
+            object_store_url: table_path.object_store(),
+            table_paths: self.table_paths().clone(),
+            file_group,
+            output_schema: self.schema(),
+            table_partition_cols: self.options.table_partition_cols.clone(),
+            insert_op,
+            keep_partition_by_columns,
+            file_extension: self.options().format.get_ext(),
+        };
+
+        let orderings = self.try_create_output_ordering(state.execution_props())?;
+        // It is sufficient to pass only one of the equivalent orderings:
+        let order_requirements = orderings.into_iter().next().map(Into::into);
+
+        self.options()
+            .format
+            .create_writer_physical_plan(input, state, config, order_requirements)
+            .await
+    }
+
+    fn get_column_default(&self, column: &str) -> Option<&Expr> {
+        self.column_defaults.get(column)
+    }
+}
+
+impl ListingTable {
+    /// Get the list of files for a scan as well as the file level statistics.
+    /// The list is grouped to let the execution plan know how the files should
+    /// be distributed to different threads / executors.
+    pub async fn list_files_for_scan<'a>(
+        &'a self,
+        ctx: &'a dyn Session,
+        filters: &'a [Expr],
+        limit: Option<usize>,
+    ) -> datafusion_common::Result<(Vec<FileGroup>, Statistics)> {
+        let store = if let Some(url) = self.table_paths.first() {
+            ctx.runtime_env().object_store(url)?
+        } else {
+            return Ok((vec![], Statistics::new_unknown(&self.file_schema)));
+        };
+        // list files (with partitions)
+        let file_list = future::try_join_all(self.table_paths.iter().map(|table_path| {
+            pruned_partition_list(
+                ctx,
+                store.as_ref(),
+                table_path,
+                filters,
+                &self.options.file_extension,
+                &self.options.table_partition_cols,
+            )
+        }))
+        .await?;
+        let meta_fetch_concurrency =
+            ctx.config_options().execution.meta_fetch_concurrency;
+        let file_list = stream::iter(file_list).flatten_unordered(meta_fetch_concurrency);
+        // collect the statistics if required by the config
+        let files = file_list
+            .map(|part_file| async {
+                let part_file = part_file?;
+                let statistics = if self.options.collect_stat {
+                    self.do_collect_statistics(ctx, &store, &part_file).await?
+                } else {
+                    Arc::new(Statistics::new_unknown(&self.file_schema))
+                };
+                Ok(part_file.with_statistics(statistics))
+            })
+            .boxed()
+            // same setting as the listing above; reuse the value already read
+            .buffer_unordered(meta_fetch_concurrency);
+
+        let (file_group, inexact_stats) =
+            get_files_with_limit(files, limit, self.options.collect_stat).await?;
+
+        let file_groups = file_group.split_files(self.options.target_partitions);
+        let (mut file_groups, mut stats) = compute_all_files_statistics(
+            file_groups,
+            self.schema(),
+            self.options.collect_stat,
+            inexact_stats,
+        )?;
+
+        let schema_adapter = self.create_schema_adapter();
+        let (schema_mapper, _) = schema_adapter.map_schema(self.file_schema.as_ref())?;
+
+        // Run both the table-level and the per-group column statistics
+        // through the schema mapper.
+        stats.column_statistics =
+            schema_mapper.map_column_statistics(&stats.column_statistics)?;
+        file_groups.iter_mut().try_for_each(|file_group| {
+            if let Some(stat) = file_group.statistics_mut() {
+                stat.column_statistics =
+                    schema_mapper.map_column_statistics(&stat.column_statistics)?;
+            }
+            Ok::<_, DataFusionError>(())
+        })?;
+        Ok((file_groups, stats))
+    }
+
+    /// Collects statistics for a given partitioned file.
+    ///
+    /// This method first checks if the statistics for the given file are already cached.
+    /// If they are, it returns the cached statistics.
+    /// If they are not, it infers the statistics from the file and stores them in the cache.
+    async fn do_collect_statistics(
+        &self,
+        ctx: &dyn Session,
+        store: &Arc<dyn ObjectStore>,
+        part_file: &PartitionedFile,
+    ) -> datafusion_common::Result<Arc<Statistics>> {
+        match self
+            .collected_statistics
+            .get_with_extra(&part_file.object_meta.location, &part_file.object_meta)
+        {
+            Some(statistics) => Ok(statistics),
+            None => {
+                let statistics = self
+                    .options
+                    .format
+                    .infer_stats(
+                        ctx,
+                        store,
+                        Arc::clone(&self.file_schema),
+                        &part_file.object_meta,
+                    )
+                    .await?;
+                let statistics = Arc::new(statistics);
+                self.collected_statistics.put_with_extra(
+                    &part_file.object_meta.location,
+                    Arc::clone(&statistics),
+                    &part_file.object_meta,
+                );
+                Ok(statistics)
+            }
+        }
+    }
+}
+
+/// Processes a stream of partitioned files and returns a `FileGroup` containing the files.
+///
+/// This function collects files from the provided stream until either:
+/// 1. The stream is exhausted
+/// 2. The accumulated number of rows exceeds the provided `limit` (if specified)
+///
+/// # Arguments
+/// * `files` - A stream of `Result<PartitionedFile>` items to process
+/// * `limit` - An optional row count limit. If provided, the function will stop collecting files
+///   once the accumulated number of rows exceeds this limit
+/// * `collect_stats` - Whether to collect and accumulate statistics from the files
+///
+/// # Returns
+/// A `Result` containing a `FileGroup` with the collected files
+/// and a boolean indicating whether the statistics are inexact.
+///
+/// # Note
+/// The function will continue processing files if statistics are not available or if the
+/// limit is not provided. If `collect_stats` is false, statistics won't be accumulated
+/// but files will still be collected.
+async fn get_files_with_limit(
+    files: impl Stream<Item = datafusion_common::Result<PartitionedFile>>,
+    limit: Option<usize>,
+    collect_stats: bool,
+) -> datafusion_common::Result<(FileGroup, bool)> {
+    let mut file_group = FileGroup::default();
+    // Fusing the stream allows us to call next safely even once it is finished.
+    let mut all_files = Box::pin(files.fuse());
+    // Running row count over the collected files; stays `Absent` until a file
+    // with statistics is seen while `collect_stats` is enabled.
+    let mut num_rows = Precision::Absent;
+
+    while let Some(file_result) = all_files.next().await {
+        let file = file_result?;
+
+        // Update the accumulated row count from the file statistics
+        if collect_stats {
+            if let Some(file_stats) = &file.statistics {
+                num_rows = if file_group.is_empty() {
+                    // For the first file, just take its row count
+                    file_stats.num_rows
+                } else {
+                    // For subsequent files, accumulate the counts
+                    num_rows.add(&file_stats.num_rows)
+                };
+            }
+        }
+
+        // Always add the file to our group
+        file_group.push(file);
+
+        // Stop as soon as the limit is exceeded (only possible when the row
+        // count is exact). Breaking here, rather than on the next iteration,
+        // leaves every unprocessed file in the stream so the check below can
+        // observe it; pulling one more item first would silently drop a file
+        // and report exact statistics when exactly one file remained.
+        if let Some(limit) = limit {
+            if let Precision::Exact(row_count) = num_rows {
+                if row_count > limit {
+                    break;
+                }
+            }
+        }
+    }
+    // If we still have files in the stream, it means that the limit kicked
+    // in, and the statistic could have been different had we processed the
+    // files in a different order.
+    let inexact_stats = all_files.next().await.is_some();
+    Ok((file_group, inexact_stats))
+}
diff --git a/datafusion/core/src/datasource/dynamic_file.rs b/datafusion/core/src/datasource/dynamic_file.rs index b30d53e586911..256a11ba693b5 100644 --- a/datafusion/core/src/datasource/dynamic_file.rs +++ b/datafusion/core/src/datasource/dynamic_file.rs @@ -20,6 +20,7 @@ use std::sync::Arc; +use crate::datasource::listing::ListingTableConfigExt; use crate::datasource::listing::{ListingTable, ListingTableConfig, ListingTableUrl}; use crate::datasource::TableProvider; use crate::error::Result; diff --git a/datafusion/core/src/datasource/listing/mod.rs b/datafusion/core/src/datasource/listing/mod.rs index a58db55bccb61..c206566a65941 100644 --- a/datafusion/core/src/datasource/listing/mod.rs +++ b/datafusion/core/src/datasource/listing/mod.rs @@ -20,7 +20,8 @@ mod table; pub use datafusion_catalog_listing::helpers; +pub use datafusion_catalog_listing::{ListingOptions, ListingTable, ListingTableConfig}; pub use datafusion_datasource::{ FileRange, ListingTableUrl, PartitionedFile, PartitionedFileStream, }; -pub use table::{ListingOptions, ListingTable, ListingTableConfig}; +pub use table::ListingTableConfigExt; diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index 4ffb6d41864f3..3333b70676203 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -15,226 +15,42 @@ // specific language governing permissions and limitations // under the License. -//! The table implementation.
- -use super::{ - helpers::{expr_applicable_for_cols, pruned_partition_list}, - ListingTableUrl, PartitionedFile, -}; -use crate::{ - datasource::file_format::{file_compression_type::FileCompressionType, FileFormat}, - datasource::physical_plan::FileSinkConfig, - execution::context::SessionState, -}; -use arrow::datatypes::{DataType, Field, SchemaBuilder, SchemaRef}; -use arrow_schema::Schema; +use crate::execution::SessionState; use async_trait::async_trait; -use datafusion_catalog::{ScanArgs, ScanResult, Session, TableProvider}; -use datafusion_common::{ - config_datafusion_err, config_err, internal_datafusion_err, internal_err, plan_err, - project_schema, stats::Precision, Constraints, DataFusionError, Result, SchemaExt, -}; -use datafusion_datasource::{ - compute_all_files_statistics, - file::FileSource, - file_groups::FileGroup, - file_scan_config::{FileScanConfig, FileScanConfigBuilder}, - schema_adapter::{DefaultSchemaAdapterFactory, SchemaAdapter, SchemaAdapterFactory}, -}; -use datafusion_execution::{ - cache::{cache_manager::FileStatisticsCache, cache_unit::DefaultFileStatisticsCache}, - config::SessionConfig, -}; -use datafusion_expr::execution_props::ExecutionProps; -use datafusion_expr::{ - dml::InsertOp, Expr, SortExpr, TableProviderFilterPushDown, TableType, -}; -use datafusion_physical_expr::create_lex_ordering; -use datafusion_physical_expr_adapter::PhysicalExprAdapterFactory; -use datafusion_physical_expr_common::sort_expr::LexOrdering; -use datafusion_physical_plan::{empty::EmptyExec, ExecutionPlan, Statistics}; -use futures::{future, stream, Stream, StreamExt, TryStreamExt}; -use itertools::Itertools; -use object_store::ObjectStore; -use std::{any::Any, collections::HashMap, str::FromStr, sync::Arc}; - -/// Indicates the source of the schema for a [`ListingTable`] -// PartialEq required for assert_eq! 
in tests -#[derive(Debug, Clone, Copy, PartialEq, Default)] -pub enum SchemaSource { - /// Schema is not yet set (initial state) - #[default] - Unset, - /// Schema was inferred from first table_path - Inferred, - /// Schema was specified explicitly via with_schema - Specified, -} +use datafusion_catalog_listing::{ListingOptions, ListingTableConfig}; +use datafusion_common::{config_datafusion_err, internal_datafusion_err}; +use datafusion_session::Session; +use futures::StreamExt; +use std::collections::HashMap; -/// Configuration for creating a [`ListingTable`] -/// -/// # Schema Evolution Support -/// -/// This configuration supports schema evolution through the optional -/// [`SchemaAdapterFactory`]. You might want to override the default factory when you need: +/// Extension trait for [`ListingTableConfig`] that supports inferring schemas /// -/// - **Type coercion requirements**: When you need custom logic for converting between -/// different Arrow data types (e.g., Int32 ↔ Int64, Utf8 ↔ LargeUtf8) -/// - **Column mapping**: You need to map columns with a legacy name to a new name -/// - **Custom handling of missing columns**: By default they are filled in with nulls, but you may e.g. want to fill them in with `0` or `""`. -/// -/// If not specified, a [`DefaultSchemaAdapterFactory`] will be used, which handles -/// basic schema compatibility cases. -/// -#[derive(Debug, Clone, Default)] -pub struct ListingTableConfig { - /// Paths on the `ObjectStore` for creating `ListingTable`. - /// They should share the same schema and object store. - pub table_paths: Vec, - /// Optional `SchemaRef` for the to be created `ListingTable`. - /// - /// See details on [`ListingTableConfig::with_schema`] - pub file_schema: Option, - /// Optional [`ListingOptions`] for the to be created [`ListingTable`]. 
- /// - /// See details on [`ListingTableConfig::with_listing_options`] - pub options: Option, - /// Tracks the source of the schema information - schema_source: SchemaSource, - /// Optional [`SchemaAdapterFactory`] for creating schema adapters - schema_adapter_factory: Option>, - /// Optional [`PhysicalExprAdapterFactory`] for creating physical expression adapters - expr_adapter_factory: Option>, -} - -impl ListingTableConfig { - /// Creates new [`ListingTableConfig`] for reading the specified URL - pub fn new(table_path: ListingTableUrl) -> Self { - Self { - table_paths: vec![table_path], - ..Default::default() - } - } - - /// Creates new [`ListingTableConfig`] with multiple table paths. - /// - /// See [`Self::infer_options`] for details on what happens with multiple paths - pub fn new_with_multi_paths(table_paths: Vec) -> Self { - Self { - table_paths, - ..Default::default() - } - } - - /// Returns the source of the schema for this configuration - pub fn schema_source(&self) -> SchemaSource { - self.schema_source - } - /// Set the `schema` for the overall [`ListingTable`] - /// - /// [`ListingTable`] will automatically coerce, when possible, the schema - /// for individual files to match this schema. - /// - /// If a schema is not provided, it is inferred using - /// [`Self::infer_schema`]. - /// - /// If the schema is provided, it must contain only the fields in the file - /// without the table partitioning columns. 
- /// - /// # Example: Specifying Table Schema - /// ```rust - /// # use std::sync::Arc; - /// # use datafusion::datasource::listing::{ListingTableConfig, ListingOptions, ListingTableUrl}; - /// # use datafusion::datasource::file_format::parquet::ParquetFormat; - /// # use arrow::datatypes::{Schema, Field, DataType}; - /// # let table_paths = ListingTableUrl::parse("file:///path/to/data").unwrap(); - /// # let listing_options = ListingOptions::new(Arc::new(ParquetFormat::default())); - /// let schema = Arc::new(Schema::new(vec![ - /// Field::new("id", DataType::Int64, false), - /// Field::new("name", DataType::Utf8, true), - /// ])); - /// - /// let config = ListingTableConfig::new(table_paths) - /// .with_listing_options(listing_options) // Set options first - /// .with_schema(schema); // Then set schema - /// ``` - pub fn with_schema(self, schema: SchemaRef) -> Self { - // Note: We preserve existing options state, but downstream code may expect - // options to be set. Consider calling with_listing_options() or infer_options() - // before operations that require options to be present. - debug_assert!( - self.options.is_some() || cfg!(test), - "ListingTableConfig::with_schema called without options set. \ - Consider calling with_listing_options() or infer_options() first to avoid panics in downstream code." - ); - - Self { - file_schema: Some(schema), - schema_source: SchemaSource::Specified, - ..self - } - } - - /// Add `listing_options` to [`ListingTableConfig`] - /// - /// If not provided, format and other options are inferred via - /// [`Self::infer_options`]. 
- /// - /// # Example: Configuring Parquet Files with Custom Options - /// ```rust - /// # use std::sync::Arc; - /// # use datafusion::datasource::listing::{ListingTableConfig, ListingOptions, ListingTableUrl}; - /// # use datafusion::datasource::file_format::parquet::ParquetFormat; - /// # let table_paths = ListingTableUrl::parse("file:///path/to/data").unwrap(); - /// let options = ListingOptions::new(Arc::new(ParquetFormat::default())) - /// .with_file_extension(".parquet") - /// .with_collect_stat(true); - /// - /// let config = ListingTableConfig::new(table_paths) - /// .with_listing_options(options); // Configure file format and options - /// ``` - pub fn with_listing_options(self, listing_options: ListingOptions) -> Self { - // Note: This method properly sets options, but be aware that downstream - // methods like infer_schema() and try_new() require both schema and options - // to be set to function correctly. - debug_assert!( - !self.table_paths.is_empty() || cfg!(test), - "ListingTableConfig::with_listing_options called without table_paths set. \ - Consider calling new() or new_with_multi_paths() first to establish table paths." 
- ); - - Self { - options: Some(listing_options), - ..self - } - } - - /// Returns a tuple of `(file_extension, optional compression_extension)` - /// - /// For example a path ending with blah.test.csv.gz returns `("csv", Some("gz"))` - /// For example a path ending with blah.test.csv returns `("csv", None)` - fn infer_file_extension_and_compression_type( - path: &str, - ) -> Result<(String, Option)> { - let mut exts = path.rsplit('.'); - - let split = exts.next().unwrap_or(""); - - let file_compression_type = FileCompressionType::from_str(split) - .unwrap_or(FileCompressionType::UNCOMPRESSED); - - if file_compression_type.is_compressed() { - let split2 = exts.next().unwrap_or(""); - Ok((split2.to_string(), Some(split.to_string()))) - } else { - Ok((split.to_string(), None)) - } - } - +/// This trait exists because the following inference methods only +/// work for [`SessionState`] implementations of [`Session`]. +/// See [`ListingTableConfig`] for the remaining inference methods. +#[async_trait] +pub trait ListingTableConfigExt { /// Infer `ListingOptions` based on `table_path` and file suffix. /// /// The format is inferred based on the first `table_path`. - pub async fn infer_options(self, state: &dyn Session) -> Result { + async fn infer_options( + self, + state: &dyn Session, + ) -> datafusion_common::Result; + + /// Convenience method to call both [`Self::infer_options`] and [`ListingTableConfig::infer_schema`] + async fn infer( + self, + state: &dyn Session, + ) -> datafusion_common::Result; +} + +#[async_trait] +impl ListingTableConfigExt for ListingTableConfig { + async fn infer_options( + self, + state: &dyn Session, + ) -> datafusion_common::Result { let store = if let Some(url) = self.table_paths.first() { state.runtime_env().object_store(url)? 
} else { @@ -281,1299 +97,19 @@ impl ListingTableConfig { .with_target_partitions(state.config().target_partitions()) .with_collect_stat(state.config().collect_statistics()); - Ok(Self { - table_paths: self.table_paths, - file_schema: self.file_schema, - options: Some(listing_options), - schema_source: self.schema_source, - schema_adapter_factory: self.schema_adapter_factory, - expr_adapter_factory: self.expr_adapter_factory, - }) - } - - /// Infer the [`SchemaRef`] based on `table_path`s. - /// - /// This method infers the table schema using the first `table_path`. - /// See [`ListingOptions::infer_schema`] for more details - /// - /// # Errors - /// * if `self.options` is not set. See [`Self::with_listing_options`] - pub async fn infer_schema(self, state: &dyn Session) -> Result { - match self.options { - Some(options) => { - let ListingTableConfig { - table_paths, - file_schema, - options: _, - schema_source, - schema_adapter_factory, - expr_adapter_factory: physical_expr_adapter_factory, - } = self; - - let (schema, new_schema_source) = match file_schema { - Some(schema) => (schema, schema_source), // Keep existing source if schema exists - None => { - if let Some(url) = table_paths.first() { - ( - options.infer_schema(state, url).await?, - SchemaSource::Inferred, - ) - } else { - (Arc::new(Schema::empty()), SchemaSource::Inferred) - } - } - }; - - Ok(Self { - table_paths, - file_schema: Some(schema), - options: Some(options), - schema_source: new_schema_source, - schema_adapter_factory, - expr_adapter_factory: physical_expr_adapter_factory, - }) - } - None => internal_err!("No `ListingOptions` set for inferring schema"), - } + Ok(self.with_listing_options(listing_options)) } - /// Convenience method to call both [`Self::infer_options`] and [`Self::infer_schema`] - pub async fn infer(self, state: &dyn Session) -> Result { + async fn infer(self, state: &dyn Session) -> datafusion_common::Result { self.infer_options(state).await?.infer_schema(state).await } - - 
/// Infer the partition columns from `table_paths`. - /// - /// # Errors - /// * if `self.options` is not set. See [`Self::with_listing_options`] - pub async fn infer_partitions_from_path(self, state: &dyn Session) -> Result { - match self.options { - Some(options) => { - let Some(url) = self.table_paths.first() else { - return config_err!("No table path found"); - }; - let partitions = options - .infer_partitions(state, url) - .await? - .into_iter() - .map(|col_name| { - ( - col_name, - DataType::Dictionary( - Box::new(DataType::UInt16), - Box::new(DataType::Utf8), - ), - ) - }) - .collect::>(); - let options = options.with_table_partition_cols(partitions); - Ok(Self { - table_paths: self.table_paths, - file_schema: self.file_schema, - options: Some(options), - schema_source: self.schema_source, - schema_adapter_factory: self.schema_adapter_factory, - expr_adapter_factory: self.expr_adapter_factory, - }) - } - None => config_err!("No `ListingOptions` set for inferring schema"), - } - } - - /// Set the [`SchemaAdapterFactory`] for the [`ListingTable`] - /// - /// The schema adapter factory is used to create schema adapters that can - /// handle schema evolution and type conversions when reading files with - /// different schemas than the table schema. - /// - /// If not provided, a default schema adapter factory will be used. 
- /// - /// # Example: Custom Schema Adapter for Type Coercion - /// ```rust - /// # use std::sync::Arc; - /// # use datafusion::datasource::listing::{ListingTableConfig, ListingOptions, ListingTableUrl}; - /// # use datafusion::datasource::schema_adapter::{SchemaAdapterFactory, SchemaAdapter}; - /// # use datafusion::datasource::file_format::parquet::ParquetFormat; - /// # use arrow::datatypes::{SchemaRef, Schema, Field, DataType}; - /// # - /// # #[derive(Debug)] - /// # struct MySchemaAdapterFactory; - /// # impl SchemaAdapterFactory for MySchemaAdapterFactory { - /// # fn create(&self, _projected_table_schema: SchemaRef, _file_schema: SchemaRef) -> Box { - /// # unimplemented!() - /// # } - /// # } - /// # let table_paths = ListingTableUrl::parse("file:///path/to/data").unwrap(); - /// # let listing_options = ListingOptions::new(Arc::new(ParquetFormat::default())); - /// # let table_schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int64, false)])); - /// let config = ListingTableConfig::new(table_paths) - /// .with_listing_options(listing_options) - /// .with_schema(table_schema) - /// .with_schema_adapter_factory(Arc::new(MySchemaAdapterFactory)); - /// ``` - pub fn with_schema_adapter_factory( - self, - schema_adapter_factory: Arc, - ) -> Self { - Self { - schema_adapter_factory: Some(schema_adapter_factory), - ..self - } - } - - /// Get the [`SchemaAdapterFactory`] for this configuration - pub fn schema_adapter_factory(&self) -> Option<&Arc> { - self.schema_adapter_factory.as_ref() - } - - /// Set the [`PhysicalExprAdapterFactory`] for the [`ListingTable`] - /// - /// The expression adapter factory is used to create physical expression adapters that can - /// handle schema evolution and type conversions when evaluating expressions - /// with different schemas than the table schema. 
- /// - /// If not provided, a default physical expression adapter factory will be used unless a custom - /// `SchemaAdapterFactory` is set, in which case only the `SchemaAdapterFactory` will be used. - /// - /// See for details on this transition. - pub fn with_expr_adapter_factory( - self, - expr_adapter_factory: Arc, - ) -> Self { - Self { - expr_adapter_factory: Some(expr_adapter_factory), - ..self - } - } -} - -/// Options for creating a [`ListingTable`] -#[derive(Clone, Debug)] -pub struct ListingOptions { - /// A suffix on which files should be filtered (leave empty to - /// keep all files on the path) - pub file_extension: String, - /// The file format - pub format: Arc, - /// The expected partition column names in the folder structure. - /// See [Self::with_table_partition_cols] for details - pub table_partition_cols: Vec<(String, DataType)>, - /// Set true to try to guess statistics from the files. - /// This can add a lot of overhead as it will usually require files - /// to be opened and at least partially parsed. - pub collect_stat: bool, - /// Group files to avoid that the number of partitions exceeds - /// this limit - pub target_partitions: usize, - /// Optional pre-known sort order(s). Must be `SortExpr`s. - /// - /// DataFusion may take advantage of this ordering to omit sorts - /// or use more efficient algorithms. Currently sortedness must be - /// provided if it is known by some external mechanism, but may in - /// the future be automatically determined, for example using - /// parquet metadata. - /// - /// See - /// - /// NOTE: This attribute stores all equivalent orderings (the outer `Vec`) - /// where each ordering consists of an individual lexicographic - /// ordering (encapsulated by a `Vec`). If there aren't - /// multiple equivalent orderings, the outer `Vec` will have a - /// single element. 
- pub file_sort_order: Vec>, -} - -impl ListingOptions { - /// Creates an options instance with the given format - /// Default values: - /// - use default file extension filter - /// - no input partition to discover - /// - one target partition - /// - do not collect statistics - pub fn new(format: Arc) -> Self { - Self { - file_extension: format.get_ext(), - format, - table_partition_cols: vec![], - collect_stat: false, - target_partitions: 1, - file_sort_order: vec![], - } - } - - /// Set options from [`SessionConfig`] and returns self. - /// - /// Currently this sets `target_partitions` and `collect_stat` - /// but if more options are added in the future that need to be coordinated - /// they will be synchronized through this method. - pub fn with_session_config_options(mut self, config: &SessionConfig) -> Self { - self = self.with_target_partitions(config.target_partitions()); - self = self.with_collect_stat(config.collect_statistics()); - self - } - - /// Set file extension on [`ListingOptions`] and returns self. - /// - /// # Example - /// ``` - /// # use std::sync::Arc; - /// # use datafusion::prelude::SessionContext; - /// # use datafusion::datasource::{listing::ListingOptions, file_format::parquet::ParquetFormat}; - /// - /// let listing_options = ListingOptions::new(Arc::new( - /// ParquetFormat::default() - /// )) - /// .with_file_extension(".parquet"); - /// - /// assert_eq!(listing_options.file_extension, ".parquet"); - /// ``` - pub fn with_file_extension(mut self, file_extension: impl Into) -> Self { - self.file_extension = file_extension.into(); - self - } - - /// Optionally set file extension on [`ListingOptions`] and returns self. 
- /// - /// If `file_extension` is `None`, the file extension will not be changed - /// - /// # Example - /// ``` - /// # use std::sync::Arc; - /// # use datafusion::prelude::SessionContext; - /// # use datafusion::datasource::{listing::ListingOptions, file_format::parquet::ParquetFormat}; - /// let extension = Some(".parquet"); - /// let listing_options = ListingOptions::new(Arc::new( - /// ParquetFormat::default() - /// )) - /// .with_file_extension_opt(extension); - /// - /// assert_eq!(listing_options.file_extension, ".parquet"); - /// ``` - pub fn with_file_extension_opt(mut self, file_extension: Option) -> Self - where - S: Into, - { - if let Some(file_extension) = file_extension { - self.file_extension = file_extension.into(); - } - self - } - - /// Set `table partition columns` on [`ListingOptions`] and returns self. - /// - /// "partition columns," used to support [Hive Partitioning], are - /// columns added to the data that is read, based on the folder - /// structure where the data resides. - /// - /// For example, give the following files in your filesystem: - /// - /// ```text - /// /mnt/nyctaxi/year=2022/month=01/tripdata.parquet - /// /mnt/nyctaxi/year=2021/month=12/tripdata.parquet - /// /mnt/nyctaxi/year=2021/month=11/tripdata.parquet - /// ``` - /// - /// A [`ListingTable`] created at `/mnt/nyctaxi/` with partition - /// columns "year" and "month" will include new `year` and `month` - /// columns while reading the files. The `year` column would have - /// value `2022` and the `month` column would have value `01` for - /// the rows read from - /// `/mnt/nyctaxi/year=2022/month=01/tripdata.parquet` - /// - ///# Notes - /// - /// - If only one level (e.g. `year` in the example above) is - /// specified, the other levels are ignored but the files are - /// still read. - /// - /// - Files that don't follow this partitioning scheme will be - /// ignored. 
- /// - /// - Since the columns have the same value for all rows read from - /// each individual file (such as dates), they are typically - /// dictionary encoded for efficiency. You may use - /// [`wrap_partition_type_in_dict`] to request a - /// dictionary-encoded type. - /// - /// - The partition columns are solely extracted from the file path. Especially they are NOT part of the parquet files itself. - /// - /// # Example - /// - /// ``` - /// # use std::sync::Arc; - /// # use arrow::datatypes::DataType; - /// # use datafusion::prelude::col; - /// # use datafusion::datasource::{listing::ListingOptions, file_format::parquet::ParquetFormat}; - /// - /// // listing options for files with paths such as `/mnt/data/col_a=x/col_b=y/data.parquet` - /// // `col_a` and `col_b` will be included in the data read from those files - /// let listing_options = ListingOptions::new(Arc::new( - /// ParquetFormat::default() - /// )) - /// .with_table_partition_cols(vec![("col_a".to_string(), DataType::Utf8), - /// ("col_b".to_string(), DataType::Utf8)]); - /// - /// assert_eq!(listing_options.table_partition_cols, vec![("col_a".to_string(), DataType::Utf8), - /// ("col_b".to_string(), DataType::Utf8)]); - /// ``` - /// - /// [Hive Partitioning]: https://docs.cloudera.com/HDPDocuments/HDP2/HDP-2.1.3/bk_system-admin-guide/content/hive_partitioned_tables.html - /// [`wrap_partition_type_in_dict`]: crate::datasource::physical_plan::wrap_partition_type_in_dict - pub fn with_table_partition_cols( - mut self, - table_partition_cols: Vec<(String, DataType)>, - ) -> Self { - self.table_partition_cols = table_partition_cols; - self - } - - /// Set stat collection on [`ListingOptions`] and returns self. 
- /// - /// ``` - /// # use std::sync::Arc; - /// # use datafusion::datasource::{listing::ListingOptions, file_format::parquet::ParquetFormat}; - /// - /// let listing_options = ListingOptions::new(Arc::new( - /// ParquetFormat::default() - /// )) - /// .with_collect_stat(true); - /// - /// assert_eq!(listing_options.collect_stat, true); - /// ``` - pub fn with_collect_stat(mut self, collect_stat: bool) -> Self { - self.collect_stat = collect_stat; - self - } - - /// Set number of target partitions on [`ListingOptions`] and returns self. - /// - /// ``` - /// # use std::sync::Arc; - /// # use datafusion::datasource::{listing::ListingOptions, file_format::parquet::ParquetFormat}; - /// - /// let listing_options = ListingOptions::new(Arc::new( - /// ParquetFormat::default() - /// )) - /// .with_target_partitions(8); - /// - /// assert_eq!(listing_options.target_partitions, 8); - /// ``` - pub fn with_target_partitions(mut self, target_partitions: usize) -> Self { - self.target_partitions = target_partitions; - self - } - - /// Set file sort order on [`ListingOptions`] and returns self. - /// - /// ``` - /// # use std::sync::Arc; - /// # use datafusion::prelude::col; - /// # use datafusion::datasource::{listing::ListingOptions, file_format::parquet::ParquetFormat}; - /// - /// // Tell datafusion that the files are sorted by column "a" - /// let file_sort_order = vec![vec![ - /// col("a").sort(true, true) - /// ]]; - /// - /// let listing_options = ListingOptions::new(Arc::new( - /// ParquetFormat::default() - /// )) - /// .with_file_sort_order(file_sort_order.clone()); - /// - /// assert_eq!(listing_options.file_sort_order, file_sort_order); - /// ``` - pub fn with_file_sort_order(mut self, file_sort_order: Vec>) -> Self { - self.file_sort_order = file_sort_order; - self - } - - /// Infer the schema of the files at the given path on the provided object store. - /// - /// If the table_path contains one or more files (i.e. 
it is a directory / - /// prefix of files) their schema is merged by calling [`FileFormat::infer_schema`] - /// - /// Note: The inferred schema does not include any partitioning columns. - /// - /// This method is called as part of creating a [`ListingTable`]. - pub async fn infer_schema<'a>( - &'a self, - state: &dyn Session, - table_path: &'a ListingTableUrl, - ) -> Result { - let store = state.runtime_env().object_store(table_path)?; - - let files: Vec<_> = table_path - .list_all_files(state, store.as_ref(), &self.file_extension) - .await? - // Empty files cannot affect schema but may throw when trying to read for it - .try_filter(|object_meta| future::ready(object_meta.size > 0)) - .try_collect() - .await?; - - let schema = self.format.infer_schema(state, &store, &files).await?; - - Ok(schema) - } - - /// Infers the partition columns stored in `LOCATION` and compares - /// them with the columns provided in `PARTITIONED BY` to help prevent - /// accidental corrupts of partitioned tables. - /// - /// Allows specifying partial partitions. - pub async fn validate_partitions( - &self, - state: &dyn Session, - table_path: &ListingTableUrl, - ) -> Result<()> { - if self.table_partition_cols.is_empty() { - return Ok(()); - } - - if !table_path.is_collection() { - return plan_err!( - "Can't create a partitioned table backed by a single file, \ - perhaps the URL is missing a trailing slash?" 
- ); - } - - let inferred = self.infer_partitions(state, table_path).await?; - - // no partitioned files found on disk - if inferred.is_empty() { - return Ok(()); - } - - let table_partition_names = self - .table_partition_cols - .iter() - .map(|(col_name, _)| col_name.clone()) - .collect_vec(); - - if inferred.len() < table_partition_names.len() { - return plan_err!( - "Inferred partitions to be {:?}, but got {:?}", - inferred, - table_partition_names - ); - } - - // match prefix to allow creating tables with partial partitions - for (idx, col) in table_partition_names.iter().enumerate() { - if &inferred[idx] != col { - return plan_err!( - "Inferred partitions to be {:?}, but got {:?}", - inferred, - table_partition_names - ); - } - } - - Ok(()) - } - - /// Infer the partitioning at the given path on the provided object store. - /// For performance reasons, it doesn't read all the files on disk - /// and therefore may fail to detect invalid partitioning. - pub(crate) async fn infer_partitions( - &self, - state: &dyn Session, - table_path: &ListingTableUrl, - ) -> Result> { - let store = state.runtime_env().object_store(table_path)?; - - // only use 10 files for inference - // This can fail to detect inconsistent partition keys - // A DFS traversal approach of the store can help here - let files: Vec<_> = table_path - .list_all_files(state, store.as_ref(), &self.file_extension) - .await? 
- .take(10) - .try_collect() - .await?; - - let stripped_path_parts = files.iter().map(|file| { - table_path - .strip_prefix(&file.location) - .unwrap() - .collect_vec() - }); - - let partition_keys = stripped_path_parts - .map(|path_parts| { - path_parts - .into_iter() - .rev() - .skip(1) // get parents only; skip the file itself - .rev() - // Partitions are expected to follow the format "column_name=value", so we - // should ignore any path part that cannot be parsed into the expected format - .filter(|s| s.contains('=')) - .map(|s| s.split('=').take(1).collect()) - .collect_vec() - }) - .collect_vec(); - - match partition_keys.into_iter().all_equal_value() { - Ok(v) => Ok(v), - Err(None) => Ok(vec![]), - Err(Some(diff)) => { - let mut sorted_diff = [diff.0, diff.1]; - sorted_diff.sort(); - plan_err!("Found mixed partition values on disk {:?}", sorted_diff) - } - } - } -} - -/// Built in [`TableProvider`] that reads data from one or more files as a single table. -/// -/// The files are read using an [`ObjectStore`] instance, for example from -/// local files or objects from AWS S3. 
-/// -/// # Features: -/// * Reading multiple files as a single table -/// * Hive style partitioning (e.g., directories named `date=2024-06-01`) -/// * Merges schemas from files with compatible but not identical schemas (see [`ListingTableConfig::file_schema`]) -/// * `limit`, `filter` and `projection` pushdown for formats that support it (e.g., -/// Parquet) -/// * Statistics collection and pruning based on file metadata -/// * Pre-existing sort order (see [`ListingOptions::file_sort_order`]) -/// * Metadata caching to speed up repeated queries (see [`FileMetadataCache`]) -/// * Statistics caching (see [`FileStatisticsCache`]) -/// -/// [`FileMetadataCache`]: datafusion_execution::cache::cache_manager::FileMetadataCache -/// -/// # Reading Directories and Hive Style Partitioning -/// -/// For example, given the `table1` directory (or object store prefix) -/// -/// ```text -/// table1 -/// ├── file1.parquet -/// └── file2.parquet -/// ``` -/// -/// A `ListingTable` would read the files `file1.parquet` and `file2.parquet` as -/// a single table, merging the schemas if the files have compatible but not -/// identical schemas. -/// -/// Given the `table2` directory (or object store prefix) -/// -/// ```text -/// table2 -/// ├── date=2024-06-01 -/// │ ├── file3.parquet -/// │ └── file4.parquet -/// └── date=2024-06-02 -/// └── file5.parquet -/// ``` -/// -/// A `ListingTable` would read the files `file3.parquet`, `file4.parquet`, and -/// `file5.parquet` as a single table, again merging schemas if necessary. -/// -/// Given the hive style partitioning structure (e.g,. directories named -/// `date=2024-06-01` and `date=2026-06-02`), `ListingTable` also adds a `date` -/// column when reading the table: -/// * The files in `table2/date=2024-06-01` will have the value `2024-06-01` -/// * The files in `table2/date=2024-06-02` will have the value `2024-06-02`. 
-/// -/// If the query has a predicate like `WHERE date = '2024-06-01'` -/// only the corresponding directory will be read. -/// -/// # See Also -/// -/// 1. [`ListingTableConfig`]: Configuration options -/// 1. [`DataSourceExec`]: `ExecutionPlan` used by `ListingTable` -/// -/// [`DataSourceExec`]: crate::datasource::source::DataSourceExec -/// -/// # Caching Metadata -/// -/// Some formats, such as Parquet, use the `FileMetadataCache` to cache file -/// metadata that is needed to execute but expensive to read, such as row -/// groups and statistics. The cache is scoped to the [`SessionContext`] and can -/// be configured via the [runtime config options]. -/// -/// [`SessionContext`]: crate::prelude::SessionContext -/// [runtime config options]: https://datafusion.apache.org/user-guide/configs.html#runtime-configuration-settings -/// -/// # Example: Read a directory of parquet files using a [`ListingTable`] -/// -/// ```no_run -/// # use datafusion::prelude::SessionContext; -/// # use datafusion::error::Result; -/// # use std::sync::Arc; -/// # use datafusion::datasource::{ -/// # listing::{ -/// # ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl, -/// # }, -/// # file_format::parquet::ParquetFormat, -/// # }; -/// # #[tokio::main] -/// # async fn main() -> Result<()> { -/// let ctx = SessionContext::new(); -/// let session_state = ctx.state(); -/// let table_path = "/path/to/parquet"; -/// -/// // Parse the path -/// let table_path = ListingTableUrl::parse(table_path)?; -/// -/// // Create default parquet options -/// let file_format = ParquetFormat::new(); -/// let listing_options = ListingOptions::new(Arc::new(file_format)) -/// .with_file_extension(".parquet"); -/// -/// // Resolve the schema -/// let resolved_schema = listing_options -/// .infer_schema(&session_state, &table_path) -/// .await?; -/// -/// let config = ListingTableConfig::new(table_path) -/// .with_listing_options(listing_options) -/// .with_schema(resolved_schema); -/// -/// 
// Create a new TableProvider -/// let provider = Arc::new(ListingTable::try_new(config)?); -/// -/// // This provider can now be read as a dataframe: -/// let df = ctx.read_table(provider.clone()); -/// -/// // or registered as a named table: -/// ctx.register_table("my_table", provider); -/// -/// # Ok(()) -/// # } -/// ``` -#[derive(Debug, Clone)] -pub struct ListingTable { - table_paths: Vec, - /// `file_schema` contains only the columns physically stored in the data files themselves. - /// - Represents the actual fields found in files like Parquet, CSV, etc. - /// - Used when reading the raw data from files - file_schema: SchemaRef, - /// `table_schema` combines `file_schema` + partition columns - /// - Partition columns are derived from directory paths (not stored in files) - /// - These are columns like "year=2022/month=01" in paths like `/data/year=2022/month=01/file.parquet` - table_schema: SchemaRef, - /// Indicates how the schema was derived (inferred or explicitly specified) - schema_source: SchemaSource, - /// Options used to configure the listing table such as the file format - /// and partitioning information - options: ListingOptions, - /// The SQL definition for this table, if any - definition: Option, - /// Cache for collected file statistics - collected_statistics: FileStatisticsCache, - /// Constraints applied to this table - constraints: Constraints, - /// Column default expressions for columns that are not physically present in the data files - column_defaults: HashMap, - /// Optional [`SchemaAdapterFactory`] for creating schema adapters - schema_adapter_factory: Option>, - /// Optional [`PhysicalExprAdapterFactory`] for creating physical expression adapters - expr_adapter_factory: Option>, -} - -impl ListingTable { - /// Create new [`ListingTable`] - /// - /// See documentation and example on [`ListingTable`] and [`ListingTableConfig`] - pub fn try_new(config: ListingTableConfig) -> Result { - // Extract schema_source before moving other 
parts of the config - let schema_source = config.schema_source(); - - let file_schema = config - .file_schema - .ok_or_else(|| internal_datafusion_err!("No schema provided."))?; - - let options = config - .options - .ok_or_else(|| internal_datafusion_err!("No ListingOptions provided"))?; - - // Add the partition columns to the file schema - let mut builder = SchemaBuilder::from(file_schema.as_ref().to_owned()); - for (part_col_name, part_col_type) in &options.table_partition_cols { - builder.push(Field::new(part_col_name, part_col_type.clone(), false)); - } - - let table_schema = Arc::new( - builder - .finish() - .with_metadata(file_schema.metadata().clone()), - ); - - let table = Self { - table_paths: config.table_paths, - file_schema, - table_schema, - schema_source, - options, - definition: None, - collected_statistics: Arc::new(DefaultFileStatisticsCache::default()), - constraints: Constraints::default(), - column_defaults: HashMap::new(), - schema_adapter_factory: config.schema_adapter_factory, - expr_adapter_factory: config.expr_adapter_factory, - }; - - Ok(table) - } - - /// Assign constraints - pub fn with_constraints(mut self, constraints: Constraints) -> Self { - self.constraints = constraints; - self - } - - /// Assign column defaults - pub fn with_column_defaults( - mut self, - column_defaults: HashMap, - ) -> Self { - self.column_defaults = column_defaults; - self - } - - /// Set the [`FileStatisticsCache`] used to cache parquet file statistics. - /// - /// Setting a statistics cache on the `SessionContext` can avoid refetching statistics - /// multiple times in the same session. - /// - /// If `None`, creates a new [`DefaultFileStatisticsCache`] scoped to this query. 
- pub fn with_cache(mut self, cache: Option) -> Self { - self.collected_statistics = - cache.unwrap_or_else(|| Arc::new(DefaultFileStatisticsCache::default())); - self - } - - /// Specify the SQL definition for this table, if any - pub fn with_definition(mut self, definition: Option) -> Self { - self.definition = definition; - self - } - - /// Get paths ref - pub fn table_paths(&self) -> &Vec { - &self.table_paths - } - - /// Get options ref - pub fn options(&self) -> &ListingOptions { - &self.options - } - - /// Get the schema source - pub fn schema_source(&self) -> SchemaSource { - self.schema_source - } - - /// Set the [`SchemaAdapterFactory`] for this [`ListingTable`] - /// - /// The schema adapter factory is used to create schema adapters that can - /// handle schema evolution and type conversions when reading files with - /// different schemas than the table schema. - /// - /// # Example: Adding Schema Evolution Support - /// ```rust - /// # use std::sync::Arc; - /// # use datafusion::datasource::listing::{ListingTable, ListingTableConfig, ListingOptions, ListingTableUrl}; - /// # use datafusion::datasource::schema_adapter::{DefaultSchemaAdapterFactory, SchemaAdapter}; - /// # use datafusion::datasource::file_format::parquet::ParquetFormat; - /// # use arrow::datatypes::{SchemaRef, Schema, Field, DataType}; - /// # let table_path = ListingTableUrl::parse("file:///path/to/data").unwrap(); - /// # let options = ListingOptions::new(Arc::new(ParquetFormat::default())); - /// # let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int64, false)])); - /// # let config = ListingTableConfig::new(table_path).with_listing_options(options).with_schema(schema); - /// # let table = ListingTable::try_new(config).unwrap(); - /// let table_with_evolution = table - /// .with_schema_adapter_factory(Arc::new(DefaultSchemaAdapterFactory)); - /// ``` - /// See [`ListingTableConfig::with_schema_adapter_factory`] for an example of custom SchemaAdapterFactory. 
- pub fn with_schema_adapter_factory( - self, - schema_adapter_factory: Arc, - ) -> Self { - Self { - schema_adapter_factory: Some(schema_adapter_factory), - ..self - } - } - - /// Get the [`SchemaAdapterFactory`] for this table - pub fn schema_adapter_factory(&self) -> Option<&Arc> { - self.schema_adapter_factory.as_ref() - } - - /// Creates a schema adapter for mapping between file and table schemas - /// - /// Uses the configured schema adapter factory if available, otherwise falls back - /// to the default implementation. - fn create_schema_adapter(&self) -> Box { - let table_schema = self.schema(); - match &self.schema_adapter_factory { - Some(factory) => { - factory.create_with_projected_schema(Arc::clone(&table_schema)) - } - None => DefaultSchemaAdapterFactory::from_schema(Arc::clone(&table_schema)), - } - } - - /// Creates a file source and applies schema adapter factory if available - fn create_file_source_with_schema_adapter(&self) -> Result> { - let mut source = self.options.format.file_source(); - // Apply schema adapter to source if available - // - // The source will use this SchemaAdapter to adapt data batches as they flow up the plan. - // Note: ListingTable also creates a SchemaAdapter in `scan()` but that is only used to adapt collected statistics. - if let Some(factory) = &self.schema_adapter_factory { - source = source.with_schema_adapter_factory(Arc::clone(factory))?; - } - Ok(source) - } - - /// If file_sort_order is specified, creates the appropriate physical expressions - fn try_create_output_ordering( - &self, - execution_props: &ExecutionProps, - ) -> Result> { - create_lex_ordering( - &self.table_schema, - &self.options.file_sort_order, - execution_props, - ) - } -} - -// Expressions can be used for partition pruning if they can be evaluated using -// only the partition columns and there are partition columns. 
-fn can_be_evaluated_for_partition_pruning( - partition_column_names: &[&str], - expr: &Expr, -) -> bool { - !partition_column_names.is_empty() - && expr_applicable_for_cols(partition_column_names, expr) -} - -#[async_trait] -impl TableProvider for ListingTable { - fn as_any(&self) -> &dyn Any { - self - } - - fn schema(&self) -> SchemaRef { - Arc::clone(&self.table_schema) - } - - fn constraints(&self) -> Option<&Constraints> { - Some(&self.constraints) - } - - fn table_type(&self) -> TableType { - TableType::Base - } - - async fn scan( - &self, - state: &dyn Session, - projection: Option<&Vec>, - filters: &[Expr], - limit: Option, - ) -> Result> { - let options = ScanArgs::default() - .with_projection(projection.map(|p| p.as_slice())) - .with_filters(Some(filters)) - .with_limit(limit); - Ok(self.scan_with_args(state, options).await?.into_inner()) - } - - async fn scan_with_args<'a>( - &self, - state: &dyn Session, - args: ScanArgs<'a>, - ) -> Result { - let projection = args.projection().map(|p| p.to_vec()); - let filters = args.filters().map(|f| f.to_vec()).unwrap_or_default(); - let limit = args.limit(); - - // extract types of partition columns - let table_partition_cols = self - .options - .table_partition_cols - .iter() - .map(|col| Ok(self.table_schema.field_with_name(&col.0)?.clone())) - .collect::>>()?; - - let table_partition_col_names = table_partition_cols - .iter() - .map(|field| field.name().as_str()) - .collect::>(); - - // If the filters can be resolved using only partition cols, there is no need to - // pushdown it to TableScan, otherwise, `unhandled` pruning predicates will be generated - let (partition_filters, filters): (Vec<_>, Vec<_>) = - filters.iter().cloned().partition(|filter| { - can_be_evaluated_for_partition_pruning(&table_partition_col_names, filter) - }); - - // We should not limit the number of partitioned files to scan if there are filters and limit - // at the same time. 
This is because the limit should be applied after the filters are applied. - let statistic_file_limit = if filters.is_empty() { limit } else { None }; - - let (mut partitioned_file_lists, statistics) = self - .list_files_for_scan(state, &partition_filters, statistic_file_limit) - .await?; - - // if no files need to be read, return an `EmptyExec` - if partitioned_file_lists.is_empty() { - let projected_schema = project_schema(&self.schema(), projection.as_ref())?; - return Ok(ScanResult::new(Arc::new(EmptyExec::new(projected_schema)))); - } - - let output_ordering = self.try_create_output_ordering(state.execution_props())?; - match state - .config_options() - .execution - .split_file_groups_by_statistics - .then(|| { - output_ordering.first().map(|output_ordering| { - FileScanConfig::split_groups_by_statistics_with_target_partitions( - &self.table_schema, - &partitioned_file_lists, - output_ordering, - self.options.target_partitions, - ) - }) - }) - .flatten() - { - Some(Err(e)) => log::debug!("failed to split file groups by statistics: {e}"), - Some(Ok(new_groups)) => { - if new_groups.len() <= self.options.target_partitions { - partitioned_file_lists = new_groups; - } else { - log::debug!("attempted to split file groups by statistics, but there were more file groups than target_partitions; falling back to unordered") - } - } - None => {} // no ordering required - }; - - let Some(object_store_url) = - self.table_paths.first().map(ListingTableUrl::object_store) - else { - return Ok(ScanResult::new(Arc::new(EmptyExec::new(Arc::new( - Schema::empty(), - ))))); - }; - - let file_source = self.create_file_source_with_schema_adapter()?; - - // create the execution plan - let plan = self - .options - .format - .create_physical_plan( - state, - FileScanConfigBuilder::new( - object_store_url, - Arc::clone(&self.file_schema), - file_source, - ) - .with_file_groups(partitioned_file_lists) - .with_constraints(self.constraints.clone()) - .with_statistics(statistics) - 
.with_projection(projection) - .with_limit(limit) - .with_output_ordering(output_ordering) - .with_table_partition_cols(table_partition_cols) - .with_expr_adapter(self.expr_adapter_factory.clone()) - .build(), - ) - .await?; - - Ok(ScanResult::new(plan)) - } - - fn supports_filters_pushdown( - &self, - filters: &[&Expr], - ) -> Result> { - let partition_column_names = self - .options - .table_partition_cols - .iter() - .map(|col| col.0.as_str()) - .collect::>(); - filters - .iter() - .map(|filter| { - if can_be_evaluated_for_partition_pruning(&partition_column_names, filter) - { - // if filter can be handled by partition pruning, it is exact - return Ok(TableProviderFilterPushDown::Exact); - } - - Ok(TableProviderFilterPushDown::Inexact) - }) - .collect() - } - - fn get_table_definition(&self) -> Option<&str> { - self.definition.as_deref() - } - - async fn insert_into( - &self, - state: &dyn Session, - input: Arc, - insert_op: InsertOp, - ) -> Result> { - // Check that the schema of the plan matches the schema of this table. - self.schema() - .logically_equivalent_names_and_types(&input.schema())?; - - let table_path = &self.table_paths()[0]; - if !table_path.is_collection() { - return plan_err!( - "Inserting into a ListingTable backed by a single file is not supported, URL is possibly missing a trailing `/`. \ - To append to an existing file use StreamTable, e.g. by using CREATE UNBOUNDED EXTERNAL TABLE" - ); - } - - // Get the object store for the table path. 
- let store = state.runtime_env().object_store(table_path)?; - - let file_list_stream = pruned_partition_list( - state, - store.as_ref(), - table_path, - &[], - &self.options.file_extension, - &self.options.table_partition_cols, - ) - .await?; - - let file_group = file_list_stream.try_collect::>().await?.into(); - let keep_partition_by_columns = - state.config_options().execution.keep_partition_by_columns; - - // Sink related option, apart from format - let config = FileSinkConfig { - original_url: String::default(), - object_store_url: self.table_paths()[0].object_store(), - table_paths: self.table_paths().clone(), - file_group, - output_schema: self.schema(), - table_partition_cols: self.options.table_partition_cols.clone(), - insert_op, - keep_partition_by_columns, - file_extension: self.options().format.get_ext(), - }; - - let orderings = self.try_create_output_ordering(state.execution_props())?; - // It is sufficient to pass only one of the equivalent orderings: - let order_requirements = orderings.into_iter().next().map(Into::into); - - self.options() - .format - .create_writer_physical_plan(input, state, config, order_requirements) - .await - } - - fn get_column_default(&self, column: &str) -> Option<&Expr> { - self.column_defaults.get(column) - } -} - -impl ListingTable { - /// Get the list of files for a scan as well as the file level statistics. - /// The list is grouped to let the execution plan know how the files should - /// be distributed to different threads / executors. - async fn list_files_for_scan<'a>( - &'a self, - ctx: &'a dyn Session, - filters: &'a [Expr], - limit: Option, - ) -> Result<(Vec, Statistics)> { - let store = if let Some(url) = self.table_paths.first() { - ctx.runtime_env().object_store(url)? 
- } else { - return Ok((vec![], Statistics::new_unknown(&self.file_schema))); - }; - // list files (with partitions) - let file_list = future::try_join_all(self.table_paths.iter().map(|table_path| { - pruned_partition_list( - ctx, - store.as_ref(), - table_path, - filters, - &self.options.file_extension, - &self.options.table_partition_cols, - ) - })) - .await?; - let meta_fetch_concurrency = - ctx.config_options().execution.meta_fetch_concurrency; - let file_list = stream::iter(file_list).flatten_unordered(meta_fetch_concurrency); - // collect the statistics if required by the config - let files = file_list - .map(|part_file| async { - let part_file = part_file?; - let statistics = if self.options.collect_stat { - self.do_collect_statistics(ctx, &store, &part_file).await? - } else { - Arc::new(Statistics::new_unknown(&self.file_schema)) - }; - Ok(part_file.with_statistics(statistics)) - }) - .boxed() - .buffer_unordered(ctx.config_options().execution.meta_fetch_concurrency); - - let (file_group, inexact_stats) = - get_files_with_limit(files, limit, self.options.collect_stat).await?; - - let file_groups = file_group.split_files(self.options.target_partitions); - let (mut file_groups, mut stats) = compute_all_files_statistics( - file_groups, - self.schema(), - self.options.collect_stat, - inexact_stats, - )?; - - let schema_adapter = self.create_schema_adapter(); - let (schema_mapper, _) = schema_adapter.map_schema(self.file_schema.as_ref())?; - - stats.column_statistics = - schema_mapper.map_column_statistics(&stats.column_statistics)?; - file_groups.iter_mut().try_for_each(|file_group| { - if let Some(stat) = file_group.statistics_mut() { - stat.column_statistics = - schema_mapper.map_column_statistics(&stat.column_statistics)?; - } - Ok::<_, DataFusionError>(()) - })?; - Ok((file_groups, stats)) - } - - /// Collects statistics for a given partitioned file. - /// - /// This method first checks if the statistics for the given file are already cached. 
- /// If they are, it returns the cached statistics. - /// If they are not, it infers the statistics from the file and stores them in the cache. - async fn do_collect_statistics( - &self, - ctx: &dyn Session, - store: &Arc, - part_file: &PartitionedFile, - ) -> Result> { - match self - .collected_statistics - .get_with_extra(&part_file.object_meta.location, &part_file.object_meta) - { - Some(statistics) => Ok(statistics), - None => { - let statistics = self - .options - .format - .infer_stats( - ctx, - store, - Arc::clone(&self.file_schema), - &part_file.object_meta, - ) - .await?; - let statistics = Arc::new(statistics); - self.collected_statistics.put_with_extra( - &part_file.object_meta.location, - Arc::clone(&statistics), - &part_file.object_meta, - ); - Ok(statistics) - } - } - } -} - -/// Processes a stream of partitioned files and returns a `FileGroup` containing the files. -/// -/// This function collects files from the provided stream until either: -/// 1. The stream is exhausted -/// 2. The accumulated number of rows exceeds the provided `limit` (if specified) -/// -/// # Arguments -/// * `files` - A stream of `Result` items to process -/// * `limit` - An optional row count limit. If provided, the function will stop collecting files -/// once the accumulated number of rows exceeds this limit -/// * `collect_stats` - Whether to collect and accumulate statistics from the files -/// -/// # Returns -/// A `Result` containing a `FileGroup` with the collected files -/// and a boolean indicating whether the statistics are inexact. -/// -/// # Note -/// The function will continue processing files if statistics are not available or if the -/// limit is not provided. If `collect_stats` is false, statistics won't be accumulated -/// but files will still be collected. 
-async fn get_files_with_limit( - files: impl Stream>, - limit: Option, - collect_stats: bool, -) -> Result<(FileGroup, bool)> { - let mut file_group = FileGroup::default(); - // Fusing the stream allows us to call next safely even once it is finished. - let mut all_files = Box::pin(files.fuse()); - enum ProcessingState { - ReadingFiles, - ReachedLimit, - } - - let mut state = ProcessingState::ReadingFiles; - let mut num_rows = Precision::Absent; - - while let Some(file_result) = all_files.next().await { - // Early exit if we've already reached our limit - if matches!(state, ProcessingState::ReachedLimit) { - break; - } - - let file = file_result?; - - // Update file statistics regardless of state - if collect_stats { - if let Some(file_stats) = &file.statistics { - num_rows = if file_group.is_empty() { - // For the first file, just take its row count - file_stats.num_rows - } else { - // For subsequent files, accumulate the counts - num_rows.add(&file_stats.num_rows) - }; - } - } - - // Always add the file to our group - file_group.push(file); - - // Check if we've hit the limit (if one was specified) - if let Some(limit) = limit { - if let Precision::Exact(row_count) = num_rows { - if row_count > limit { - state = ProcessingState::ReachedLimit; - } - } - } - } - // If we still have files in the stream, it means that the limit kicked - // in, and the statistic could have been different had we processed the - // files in a different order. 
- let inexact_stats = all_files.next().await.is_some(); - Ok((file_group, inexact_stats)) } #[cfg(test)] mod tests { - use super::*; #[cfg(feature = "parquet")] use crate::datasource::file_format::parquet::ParquetFormat; + use crate::datasource::listing::table::ListingTableConfigExt; use crate::prelude::*; use crate::{ datasource::{ @@ -1587,21 +123,34 @@ mod tests { }, }; use arrow::{compute::SortOptions, record_batch::RecordBatch}; + use arrow_schema::{DataType, Field, Schema, SchemaRef}; + use datafusion_catalog::TableProvider; + use datafusion_catalog_listing::{ + ListingOptions, ListingTable, ListingTableConfig, SchemaSource, + }; use datafusion_common::{ - assert_contains, + assert_contains, plan_err, stats::Precision, test_util::{batches_to_string, datafusion_test_data}, - ColumnStatistics, ScalarValue, + ColumnStatistics, DataFusionError, Result, ScalarValue, }; + use datafusion_datasource::file_compression_type::FileCompressionType; + use datafusion_datasource::file_format::FileFormat; use datafusion_datasource::schema_adapter::{ SchemaAdapter, SchemaAdapterFactory, SchemaMapper, }; + use datafusion_datasource::ListingTableUrl; + use datafusion_expr::dml::InsertOp; use datafusion_expr::{BinaryExpr, LogicalPlanBuilder, Operator}; use datafusion_physical_expr::expressions::binary; use datafusion_physical_expr::PhysicalSortExpr; + use datafusion_physical_expr_common::sort_expr::LexOrdering; + use datafusion_physical_plan::empty::EmptyExec; use datafusion_physical_plan::{collect, ExecutionPlanProperties}; use rstest::rstest; + use std::collections::HashMap; use std::io::Write; + use std::sync::Arc; use tempfile::TempDir; use url::Url; @@ -1638,10 +187,13 @@ mod tests { let ctx = SessionContext::new(); let testdata = datafusion_test_data(); let filename = format!("{testdata}/aggregate_simple.csv"); - let table_path = ListingTableUrl::parse(filename).unwrap(); + let table_path = ListingTableUrl::parse(filename)?; // Test default schema source - let config = 
ListingTableConfig::new(table_path.clone()); + let format = CsvFormat::default(); + let options = ListingOptions::new(Arc::new(format)); + let config = + ListingTableConfig::new(table_path.clone()).with_listing_options(options); assert_eq!(config.schema_source(), SchemaSource::Unset); // Test schema source after setting a schema explicitly @@ -1650,18 +202,13 @@ mod tests { assert_eq!(config_with_schema.schema_source(), SchemaSource::Specified); // Test schema source after inferring schema - let format = CsvFormat::default(); - let options = ListingOptions::new(Arc::new(format)); - let config_with_options = config.with_listing_options(options.clone()); - assert_eq!(config_with_options.schema_source(), SchemaSource::Unset); + assert_eq!(config.schema_source(), SchemaSource::Unset); - let config_with_inferred = config_with_options.infer_schema(&ctx.state()).await?; + let config_with_inferred = config.infer_schema(&ctx.state()).await?; assert_eq!(config_with_inferred.schema_source(), SchemaSource::Inferred); // Test schema preservation through operations - let config_with_schema_and_options = config_with_schema - .clone() - .with_listing_options(options.clone()); + let config_with_schema_and_options = config_with_schema.clone(); assert_eq!( config_with_schema_and_options.schema_source(), SchemaSource::Specified @@ -1836,7 +383,7 @@ mod tests { .with_table_partition_cols(vec![(String::from("p1"), DataType::Utf8)]) .with_target_partitions(4); - let table_path = ListingTableUrl::parse("test:///table/").unwrap(); + let table_path = ListingTableUrl::parse("test:///table/")?; let file_schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Boolean, false)])); let config = ListingTableConfig::new(table_path) @@ -1872,7 +419,7 @@ mod tests { ) -> Result> { let testdata = crate::test_util::parquet_test_data(); let filename = format!("{testdata}/{name}"); - let table_path = ListingTableUrl::parse(filename).unwrap(); + let table_path = ListingTableUrl::parse(filename)?; let 
config = ListingTableConfig::new(table_path) .infer(&ctx.state()) @@ -1899,7 +446,7 @@ mod tests { let schema = Schema::new(vec![Field::new("a", DataType::Boolean, false)]); - let table_path = ListingTableUrl::parse(table_prefix).unwrap(); + let table_path = ListingTableUrl::parse(table_prefix)?; let config = ListingTableConfig::new(table_path) .with_listing_options(opt) .with_schema(Arc::new(schema)); @@ -2458,7 +1005,7 @@ mod tests { async fn test_infer_options_compressed_csv() -> Result<()> { let testdata = crate::test_util::arrow_test_data(); let filename = format!("{testdata}/csv/aggregate_test_100.csv.gz"); - let table_path = ListingTableUrl::parse(filename).unwrap(); + let table_path = ListingTableUrl::parse(filename)?; let ctx = SessionContext::new(); @@ -2479,12 +1026,15 @@ mod tests { let testdata = datafusion_test_data(); let filename = format!("{testdata}/aggregate_simple.csv"); - let table_path = ListingTableUrl::parse(filename).unwrap(); + let table_path = ListingTableUrl::parse(filename)?; let provided_schema = create_test_schema(); - let config = - ListingTableConfig::new(table_path).with_schema(Arc::clone(&provided_schema)); + let format = CsvFormat::default(); + let options = ListingOptions::new(Arc::new(format)); + let config = ListingTableConfig::new(table_path) + .with_listing_options(options) + .with_schema(Arc::clone(&provided_schema)); let config = config.infer(&ctx.state()).await?; @@ -2549,8 +1099,8 @@ mod tests { table_path1.clone(), table_path2.clone(), ]) - .with_schema(schema_3cols) - .with_listing_options(options.clone()); + .with_listing_options(options.clone()) + .with_schema(schema_3cols); let config2 = config2.infer_schema(&ctx.state()).await?; assert_eq!(config2.schema_source(), SchemaSource::Specified); @@ -2573,8 +1123,8 @@ mod tests { table_path1.clone(), table_path2.clone(), ]) - .with_schema(schema_4cols) - .with_listing_options(options.clone()); + .with_listing_options(options.clone()) + .with_schema(schema_4cols); let 
config3 = config3.infer_schema(&ctx.state()).await?; assert_eq!(config3.schema_source(), SchemaSource::Specified); @@ -2785,7 +1335,7 @@ mod tests { let testdata = crate::test_util::parquet_test_data(); let filename = format!("{}/{}", testdata, "alltypes_plain.parquet"); - let table_path = ListingTableUrl::parse(filename).unwrap(); + let table_path = ListingTableUrl::parse(filename)?; let ctx = SessionContext::new(); let state = ctx.state(); @@ -2932,7 +1482,7 @@ mod tests { let format = JsonFormat::default(); let opt = ListingOptions::new(Arc::new(format)).with_collect_stat(false); let schema = Schema::new(vec![Field::new("a", DataType::Boolean, false)]); - let table_path = ListingTableUrl::parse("test:///table/").unwrap(); + let table_path = ListingTableUrl::parse("test:///table/")?; let config = ListingTableConfig::new(table_path) .with_listing_options(opt) @@ -3146,7 +1696,7 @@ mod tests { let format = JsonFormat::default(); let opt = ListingOptions::new(Arc::new(format)).with_collect_stat(collect_stat); let schema = Schema::new(vec![Field::new("a", DataType::Boolean, false)]); - let table_path = ListingTableUrl::parse("test:///table/").unwrap(); + let table_path = ListingTableUrl::parse("test:///table/")?; let config = ListingTableConfig::new(table_path) .with_listing_options(opt) diff --git a/datafusion/core/tests/catalog/memory.rs b/datafusion/core/tests/catalog/memory.rs index ea9e71fc37467..06ed141b2e8bd 100644 --- a/datafusion/core/tests/catalog/memory.rs +++ b/datafusion/core/tests/catalog/memory.rs @@ -19,7 +19,7 @@ use arrow::datatypes::Schema; use datafusion::catalog::CatalogProvider; use datafusion::datasource::empty::EmptyTable; use datafusion::datasource::listing::{ - ListingTable, ListingTableConfig, ListingTableUrl, + ListingTable, ListingTableConfig, ListingTableConfigExt, ListingTableUrl, }; use datafusion::prelude::SessionContext; use datafusion_catalog::memory::*; diff --git a/datafusion/core/tests/parquet/schema_adapter.rs 
b/datafusion/core/tests/parquet/schema_adapter.rs index 4ae2fa9b4c399..40fc6176e212b 100644 --- a/datafusion/core/tests/parquet/schema_adapter.rs +++ b/datafusion/core/tests/parquet/schema_adapter.rs @@ -23,7 +23,9 @@ use arrow_schema::{DataType, Field, FieldRef, Schema, SchemaRef}; use bytes::{BufMut, BytesMut}; use datafusion::assert_batches_eq; use datafusion::common::Result; -use datafusion::datasource::listing::{ListingTable, ListingTableConfig}; +use datafusion::datasource::listing::{ + ListingTable, ListingTableConfig, ListingTableConfigExt, +}; use datafusion::prelude::{SessionConfig, SessionContext}; use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; use datafusion_common::DataFusionError; From 8e1d13a9c6e989b8a6216c6752a34fb3e0494e0f Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 17 Oct 2025 06:44:10 -0400 Subject: [PATCH 024/109] refactor: move arrow datasource to new `datafusion-datasource-arrow` crate (#18082) ## Which issue does this PR close? - This addresses part of https://github.com/apache/datafusion/issues/17713 but it does not close it. ## Rationale for this change In order to remove `core` from `proto` crate, we need `ArrowFormat` to be available. Similar to the other datasource types (csv, avro, json, parquet) this splits the Arrow IPC file format into its own crate. ## What changes are included in this PR? This is a straight refactor. Code is merely moved around. The size of the diff is the additional files that are required (cargo.toml, readme.md, etc) ## Are these changes tested? Existing unit tests. ## Are there any user-facing changes? Users that include `ArrowSource` may need to update their include paths. For most, the reexports will cover this need. 
--- .github/workflows/labeler/labeler-config.yml | 2 +- Cargo.lock | 25 +- Cargo.toml | 2 + datafusion/core/Cargo.toml | 4 +- .../core/src/datasource/file_format/arrow.rs | 509 +-------------- .../src/datasource/physical_plan/arrow.rs | 23 + .../core/src/datasource/physical_plan/mod.rs | 5 +- datafusion/datasource-arrow/Cargo.toml | 64 ++ datafusion/datasource-arrow/LICENSE.txt | 212 ++++++ datafusion/datasource-arrow/NOTICE.txt | 5 + datafusion/datasource-arrow/README.md | 34 + .../datasource-arrow/src/file_format.rs | 603 ++++++++++++++++++ datafusion/datasource-arrow/src/mod.rs | 25 + .../src/source.rs} | 5 +- .../tests/data/example.arrow | Bin .../sqllogictest/test_files/arrow_files.slt | 8 +- datafusion/sqllogictest/test_files/ddl.slt | 6 +- .../test_files/repartition_scan.slt | 4 +- dev/release/README.md | 1 + 19 files changed, 1012 insertions(+), 525 deletions(-) create mode 100644 datafusion/core/src/datasource/physical_plan/arrow.rs create mode 100644 datafusion/datasource-arrow/Cargo.toml create mode 100644 datafusion/datasource-arrow/LICENSE.txt create mode 100644 datafusion/datasource-arrow/NOTICE.txt create mode 100644 datafusion/datasource-arrow/README.md create mode 100644 datafusion/datasource-arrow/src/file_format.rs create mode 100644 datafusion/datasource-arrow/src/mod.rs rename datafusion/{core/src/datasource/physical_plan/arrow_file.rs => datasource-arrow/src/source.rs} (98%) rename datafusion/{core => datasource-arrow}/tests/data/example.arrow (100%) diff --git a/.github/workflows/labeler/labeler-config.yml b/.github/workflows/labeler/labeler-config.yml index e408130725215..38d88059dab70 100644 --- a/.github/workflows/labeler/labeler-config.yml +++ b/.github/workflows/labeler/labeler-config.yml @@ -58,7 +58,7 @@ execution: datasource: - changed-files: - - any-glob-to-any-file: ['datafusion/datasource/**/*', 'datafusion/datasource-avro/**/*', 'datafusion/datasource-csv/**/*', 'datafusion/datasource-json/**/*', 
'datafusion/datasource-parquet/**/*'] + - any-glob-to-any-file: ['datafusion/datasource/**/*', 'datafusion/datasource-avro/**/*', 'datafusion/datasource-arrow/**/*', 'datafusion/datasource-csv/**/*', 'datafusion/datasource-json/**/*', 'datafusion/datasource-parquet/**/*'] functions: - changed-files: diff --git a/Cargo.lock b/Cargo.lock index 0392c8147ad2c..d69ece6d3fb05 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1789,7 +1789,6 @@ name = "datafusion" version = "50.2.0" dependencies = [ "arrow", - "arrow-ipc", "arrow-schema", "async-trait", "bytes", @@ -1803,6 +1802,7 @@ dependencies = [ "datafusion-common", "datafusion-common-runtime", "datafusion-datasource", + "datafusion-datasource-arrow", "datafusion-datasource-avro", "datafusion-datasource-csv", "datafusion-datasource-json", @@ -2030,6 +2030,29 @@ dependencies = [ "zstd", ] +[[package]] +name = "datafusion-datasource-arrow" +version = "50.2.0" +dependencies = [ + "arrow", + "arrow-ipc", + "async-trait", + "bytes", + "chrono", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "itertools 0.14.0", + "object_store", + "tokio", +] + [[package]] name = "datafusion-datasource-avro" version = "50.2.0" diff --git a/Cargo.toml b/Cargo.toml index dd0b20de528af..79c14d6cca799 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,6 +22,7 @@ members = [ "datafusion/catalog", "datafusion/catalog-listing", "datafusion/datasource", + "datafusion/datasource-arrow", "datafusion/datasource-avro", "datafusion/datasource-csv", "datafusion/datasource-json", @@ -116,6 +117,7 @@ datafusion-catalog-listing = { path = "datafusion/catalog-listing", version = "5 datafusion-common = { path = "datafusion/common", version = "50.2.0", default-features = false } datafusion-common-runtime = { path = "datafusion/common-runtime", version = "50.2.0" } 
datafusion-datasource = { path = "datafusion/datasource", version = "50.2.0", default-features = false } +datafusion-datasource-arrow = { path = "datafusion/datasource-arrow", version = "50.2.0", default-features = false } datafusion-datasource-avro = { path = "datafusion/datasource-avro", version = "50.2.0", default-features = false } datafusion-datasource-csv = { path = "datafusion/datasource-csv", version = "50.2.0", default-features = false } datafusion-datasource-json = { path = "datafusion/datasource-json", version = "50.2.0", default-features = false } diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index d3bc4546588de..a5a715cea94f1 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -47,7 +47,7 @@ compression = [ "bzip2", "flate2", "zstd", - "arrow-ipc/zstd", + "datafusion-datasource-arrow/compression", "datafusion-datasource/compression", ] crypto_expressions = ["datafusion-functions/crypto_expressions"] @@ -109,7 +109,6 @@ extended_tests = [] [dependencies] arrow = { workspace = true } -arrow-ipc = { workspace = true } arrow-schema = { workspace = true, features = ["canonical_extension_types"] } async-trait = { workspace = true } bytes = { workspace = true } @@ -120,6 +119,7 @@ datafusion-catalog-listing = { workspace = true } datafusion-common = { workspace = true, features = ["object_store"] } datafusion-common-runtime = { workspace = true } datafusion-datasource = { workspace = true } +datafusion-datasource-arrow = { workspace = true } datafusion-datasource-avro = { workspace = true, optional = true } datafusion-datasource-csv = { workspace = true } datafusion-datasource-json = { workspace = true } diff --git a/datafusion/core/src/datasource/file_format/arrow.rs b/datafusion/core/src/datasource/file_format/arrow.rs index 25bc166d657a5..8701f96eb3b84 100644 --- a/datafusion/core/src/datasource/file_format/arrow.rs +++ b/datafusion/core/src/datasource/file_format/arrow.rs @@ -15,510 +15,5 @@ // specific 
language governing permissions and limitations // under the License. -//! [`ArrowFormat`]: Apache Arrow [`FileFormat`] abstractions -//! -//! Works with files following the [Arrow IPC format](https://arrow.apache.org/docs/format/Columnar.html#ipc-file-format) - -use std::any::Any; -use std::borrow::Cow; -use std::collections::HashMap; -use std::fmt::{self, Debug}; -use std::sync::Arc; - -use super::file_compression_type::FileCompressionType; -use super::write::demux::DemuxedStreamReceiver; -use super::write::SharedBuffer; -use super::FileFormatFactory; -use crate::datasource::file_format::write::get_writer_schema; -use crate::datasource::file_format::FileFormat; -use crate::datasource::physical_plan::{ArrowSource, FileSink, FileSinkConfig}; -use crate::error::Result; -use crate::physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan}; - -use arrow::datatypes::{Schema, SchemaRef}; -use arrow::error::ArrowError; -use arrow::ipc::convert::fb_to_schema; -use arrow::ipc::reader::FileReader; -use arrow::ipc::writer::IpcWriteOptions; -use arrow::ipc::{root_as_message, CompressionType}; -use datafusion_catalog::Session; -use datafusion_common::parsers::CompressionTypeVariant; -use datafusion_common::{ - internal_datafusion_err, not_impl_err, DataFusionError, GetExt, Statistics, - DEFAULT_ARROW_EXTENSION, -}; -use datafusion_common_runtime::{JoinSet, SpawnedTask}; -use datafusion_datasource::display::FileGroupDisplay; -use datafusion_datasource::file::FileSource; -use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder}; -use datafusion_datasource::sink::{DataSink, DataSinkExec}; -use datafusion_datasource::write::ObjectWriterBuilder; -use datafusion_execution::{SendableRecordBatchStream, TaskContext}; -use datafusion_expr::dml::InsertOp; -use datafusion_physical_expr_common::sort_expr::LexRequirement; - -use async_trait::async_trait; -use bytes::Bytes; -use datafusion_datasource::source::DataSourceExec; -use futures::stream::BoxStream; -use 
futures::StreamExt; -use object_store::{GetResultPayload, ObjectMeta, ObjectStore}; -use tokio::io::AsyncWriteExt; - -/// Initial writing buffer size. Note this is just a size hint for efficiency. It -/// will grow beyond the set value if needed. -const INITIAL_BUFFER_BYTES: usize = 1048576; - -/// If the buffered Arrow data exceeds this size, it is flushed to object store -const BUFFER_FLUSH_BYTES: usize = 1024000; - -#[derive(Default, Debug)] -/// Factory struct used to create [ArrowFormat] -pub struct ArrowFormatFactory; - -impl ArrowFormatFactory { - /// Creates an instance of [ArrowFormatFactory] - pub fn new() -> Self { - Self {} - } -} - -impl FileFormatFactory for ArrowFormatFactory { - fn create( - &self, - _state: &dyn Session, - _format_options: &HashMap, - ) -> Result> { - Ok(Arc::new(ArrowFormat)) - } - - fn default(&self) -> Arc { - Arc::new(ArrowFormat) - } - - fn as_any(&self) -> &dyn Any { - self - } -} - -impl GetExt for ArrowFormatFactory { - fn get_ext(&self) -> String { - // Removes the dot, i.e. ".parquet" -> "parquet" - DEFAULT_ARROW_EXTENSION[1..].to_string() - } -} - -/// Arrow `FileFormat` implementation. -#[derive(Default, Debug)] -pub struct ArrowFormat; - -#[async_trait] -impl FileFormat for ArrowFormat { - fn as_any(&self) -> &dyn Any { - self - } - - fn get_ext(&self) -> String { - ArrowFormatFactory::new().get_ext() - } - - fn get_ext_with_compression( - &self, - file_compression_type: &FileCompressionType, - ) -> Result { - let ext = self.get_ext(); - match file_compression_type.get_variant() { - CompressionTypeVariant::UNCOMPRESSED => Ok(ext), - _ => Err(internal_datafusion_err!( - "Arrow FileFormat does not support compression." 
- )), - } - } - - fn compression_type(&self) -> Option { - None - } - - async fn infer_schema( - &self, - _state: &dyn Session, - store: &Arc, - objects: &[ObjectMeta], - ) -> Result { - let mut schemas = vec![]; - for object in objects { - let r = store.as_ref().get(&object.location).await?; - let schema = match r.payload { - #[cfg(not(target_arch = "wasm32"))] - GetResultPayload::File(mut file, _) => { - let reader = FileReader::try_new(&mut file, None)?; - reader.schema() - } - GetResultPayload::Stream(stream) => { - infer_schema_from_file_stream(stream).await? - } - }; - schemas.push(schema.as_ref().clone()); - } - let merged_schema = Schema::try_merge(schemas)?; - Ok(Arc::new(merged_schema)) - } - - async fn infer_stats( - &self, - _state: &dyn Session, - _store: &Arc, - table_schema: SchemaRef, - _object: &ObjectMeta, - ) -> Result { - Ok(Statistics::new_unknown(&table_schema)) - } - - async fn create_physical_plan( - &self, - _state: &dyn Session, - conf: FileScanConfig, - ) -> Result> { - let source = Arc::new(ArrowSource::default()); - let config = FileScanConfigBuilder::from(conf) - .with_source(source) - .build(); - - Ok(DataSourceExec::from_data_source(config)) - } - - async fn create_writer_physical_plan( - &self, - input: Arc, - _state: &dyn Session, - conf: FileSinkConfig, - order_requirements: Option, - ) -> Result> { - if conf.insert_op != InsertOp::Append { - return not_impl_err!("Overwrites are not implemented yet for Arrow format"); - } - - let sink = Arc::new(ArrowFileSink::new(conf)); - - Ok(Arc::new(DataSinkExec::new(input, sink, order_requirements)) as _) - } - - fn file_source(&self) -> Arc { - Arc::new(ArrowSource::default()) - } -} - -/// Implements [`FileSink`] for writing to arrow_ipc files -struct ArrowFileSink { - config: FileSinkConfig, -} - -impl ArrowFileSink { - fn new(config: FileSinkConfig) -> Self { - Self { config } - } -} - -#[async_trait] -impl FileSink for ArrowFileSink { - fn config(&self) -> &FileSinkConfig { - 
&self.config - } - - async fn spawn_writer_tasks_and_join( - &self, - context: &Arc, - demux_task: SpawnedTask>, - mut file_stream_rx: DemuxedStreamReceiver, - object_store: Arc, - ) -> Result { - let mut file_write_tasks: JoinSet> = - JoinSet::new(); - - let ipc_options = - IpcWriteOptions::try_new(64, false, arrow_ipc::MetadataVersion::V5)? - .try_with_compression(Some(CompressionType::LZ4_FRAME))?; - while let Some((path, mut rx)) = file_stream_rx.recv().await { - let shared_buffer = SharedBuffer::new(INITIAL_BUFFER_BYTES); - let mut arrow_writer = arrow_ipc::writer::FileWriter::try_new_with_options( - shared_buffer.clone(), - &get_writer_schema(&self.config), - ipc_options.clone(), - )?; - let mut object_store_writer = ObjectWriterBuilder::new( - FileCompressionType::UNCOMPRESSED, - &path, - Arc::clone(&object_store), - ) - .with_buffer_size(Some( - context - .session_config() - .options() - .execution - .objectstore_writer_buffer_size, - )) - .build()?; - file_write_tasks.spawn(async move { - let mut row_count = 0; - while let Some(batch) = rx.recv().await { - row_count += batch.num_rows(); - arrow_writer.write(&batch)?; - let mut buff_to_flush = shared_buffer.buffer.try_lock().unwrap(); - if buff_to_flush.len() > BUFFER_FLUSH_BYTES { - object_store_writer - .write_all(buff_to_flush.as_slice()) - .await?; - buff_to_flush.clear(); - } - } - arrow_writer.finish()?; - let final_buff = shared_buffer.buffer.try_lock().unwrap(); - - object_store_writer.write_all(final_buff.as_slice()).await?; - object_store_writer.shutdown().await?; - Ok(row_count) - }); - } - - let mut row_count = 0; - while let Some(result) = file_write_tasks.join_next().await { - match result { - Ok(r) => { - row_count += r?; - } - Err(e) => { - if e.is_panic() { - std::panic::resume_unwind(e.into_panic()); - } else { - unreachable!(); - } - } - } - } - - demux_task - .join_unwind() - .await - .map_err(|e| DataFusionError::ExecutionJoin(Box::new(e)))??; - Ok(row_count as u64) - } -} - -impl Debug 
for ArrowFileSink { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("ArrowFileSink").finish() - } -} - -impl DisplayAs for ArrowFileSink { - fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match t { - DisplayFormatType::Default | DisplayFormatType::Verbose => { - write!(f, "ArrowFileSink(file_groups=",)?; - FileGroupDisplay(&self.config.file_group).fmt_as(t, f)?; - write!(f, ")") - } - DisplayFormatType::TreeRender => { - writeln!(f, "format: arrow")?; - write!(f, "file={}", &self.config.original_url) - } - } - } -} - -#[async_trait] -impl DataSink for ArrowFileSink { - fn as_any(&self) -> &dyn Any { - self - } - - fn schema(&self) -> &SchemaRef { - self.config.output_schema() - } - - async fn write_all( - &self, - data: SendableRecordBatchStream, - context: &Arc, - ) -> Result { - FileSink::write_all(self, data, context).await - } -} - -const ARROW_MAGIC: [u8; 6] = [b'A', b'R', b'R', b'O', b'W', b'1']; -const CONTINUATION_MARKER: [u8; 4] = [0xff; 4]; - -/// Custom implementation of inferring schema. Should eventually be moved upstream to arrow-rs. -/// See -async fn infer_schema_from_file_stream( - mut stream: BoxStream<'static, object_store::Result>, -) -> Result { - // Expected format: - // - 6 bytes - // - 2 bytes - // - 4 bytes, not present below v0.15.0 - // - 4 bytes - // - // - - // So in first read we need at least all known sized sections, - // which is 6 + 2 + 4 + 4 = 16 bytes. 
- let bytes = collect_at_least_n_bytes(&mut stream, 16, None).await?; - - // Files should start with these magic bytes - if bytes[0..6] != ARROW_MAGIC { - return Err(ArrowError::ParseError( - "Arrow file does not contain correct header".to_string(), - ))?; - } - - // Since continuation marker bytes added in later versions - let (meta_len, rest_of_bytes_start_index) = if bytes[8..12] == CONTINUATION_MARKER { - (&bytes[12..16], 16) - } else { - (&bytes[8..12], 12) - }; - - let meta_len = [meta_len[0], meta_len[1], meta_len[2], meta_len[3]]; - let meta_len = i32::from_le_bytes(meta_len); - - // Read bytes for Schema message - let block_data = if bytes[rest_of_bytes_start_index..].len() < meta_len as usize { - // Need to read more bytes to decode Message - let mut block_data = Vec::with_capacity(meta_len as usize); - // In case we had some spare bytes in our initial read chunk - block_data.extend_from_slice(&bytes[rest_of_bytes_start_index..]); - let size_to_read = meta_len as usize - block_data.len(); - let block_data = - collect_at_least_n_bytes(&mut stream, size_to_read, Some(block_data)).await?; - Cow::Owned(block_data) - } else { - // Already have the bytes we need - let end_index = meta_len as usize + rest_of_bytes_start_index; - let block_data = &bytes[rest_of_bytes_start_index..end_index]; - Cow::Borrowed(block_data) - }; - - // Decode Schema message - let message = root_as_message(&block_data).map_err(|err| { - ArrowError::ParseError(format!("Unable to read IPC message as metadata: {err:?}")) - })?; - let ipc_schema = message.header_as_schema().ok_or_else(|| { - ArrowError::IpcError("Unable to read IPC message as schema".to_string()) - })?; - let schema = fb_to_schema(ipc_schema); - - Ok(Arc::new(schema)) -} - -async fn collect_at_least_n_bytes( - stream: &mut BoxStream<'static, object_store::Result>, - n: usize, - extend_from: Option>, -) -> Result> { - let mut buf = extend_from.unwrap_or_else(|| Vec::with_capacity(n)); - // If extending existing buffer then 
ensure we read n additional bytes - let n = n + buf.len(); - while let Some(bytes) = stream.next().await.transpose()? { - buf.extend_from_slice(&bytes); - if buf.len() >= n { - break; - } - } - if buf.len() < n { - return Err(ArrowError::ParseError( - "Unexpected end of byte stream for Arrow IPC file".to_string(), - ))?; - } - Ok(buf) -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::execution::context::SessionContext; - - use chrono::DateTime; - use object_store::{chunked::ChunkedStore, memory::InMemory, path::Path}; - - #[tokio::test] - async fn test_infer_schema_stream() -> Result<()> { - let mut bytes = std::fs::read("tests/data/example.arrow")?; - bytes.truncate(bytes.len() - 20); // mangle end to show we don't need to read whole file - let location = Path::parse("example.arrow")?; - let in_memory_store: Arc = Arc::new(InMemory::new()); - in_memory_store.put(&location, bytes.into()).await?; - - let session_ctx = SessionContext::new(); - let state = session_ctx.state(); - let object_meta = ObjectMeta { - location, - last_modified: DateTime::default(), - size: u64::MAX, - e_tag: None, - version: None, - }; - - let arrow_format = ArrowFormat {}; - let expected = vec!["f0: Int64", "f1: Utf8", "f2: Boolean"]; - - // Test chunk sizes where too small so we keep having to read more bytes - // And when large enough that first read contains all we need - for chunk_size in [7, 3000] { - let store = Arc::new(ChunkedStore::new(in_memory_store.clone(), chunk_size)); - let inferred_schema = arrow_format - .infer_schema( - &state, - &(store.clone() as Arc), - std::slice::from_ref(&object_meta), - ) - .await?; - let actual_fields = inferred_schema - .fields() - .iter() - .map(|f| format!("{}: {:?}", f.name(), f.data_type())) - .collect::>(); - assert_eq!(expected, actual_fields); - } - - Ok(()) - } - - #[tokio::test] - async fn test_infer_schema_short_stream() -> Result<()> { - let mut bytes = std::fs::read("tests/data/example.arrow")?; - bytes.truncate(20); // 
should cause error that file shorter than expected - let location = Path::parse("example.arrow")?; - let in_memory_store: Arc = Arc::new(InMemory::new()); - in_memory_store.put(&location, bytes.into()).await?; - - let session_ctx = SessionContext::new(); - let state = session_ctx.state(); - let object_meta = ObjectMeta { - location, - last_modified: DateTime::default(), - size: u64::MAX, - e_tag: None, - version: None, - }; - - let arrow_format = ArrowFormat {}; - - let store = Arc::new(ChunkedStore::new(in_memory_store.clone(), 7)); - let err = arrow_format - .infer_schema( - &state, - &(store.clone() as Arc), - std::slice::from_ref(&object_meta), - ) - .await; - - assert!(err.is_err()); - assert_eq!( - "Arrow error: Parser error: Unexpected end of byte stream for Arrow IPC file", - err.unwrap_err().to_string().lines().next().unwrap() - ); - - Ok(()) - } -} +//! Re-exports the [`datafusion_datasource_arrow::file_format`] module, and contains tests for it. +pub use datafusion_datasource_arrow::file_format::*; diff --git a/datafusion/core/src/datasource/physical_plan/arrow.rs b/datafusion/core/src/datasource/physical_plan/arrow.rs new file mode 100644 index 0000000000000..392eaa8c4be49 --- /dev/null +++ b/datafusion/core/src/datasource/physical_plan/arrow.rs @@ -0,0 +1,23 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Reexports the [`datafusion_datasource_arrow::source`] module, containing [Arrow] based [`FileSource`]. +//! +//! [Arrow]: https://arrow.apache.org/docs/python/ipc.html +//! [`FileSource`]: datafusion_datasource::file::FileSource + +pub use datafusion_datasource_arrow::source::*; diff --git a/datafusion/core/src/datasource/physical_plan/mod.rs b/datafusion/core/src/datasource/physical_plan/mod.rs index 3a9dedaa028f2..1ac292e260fdf 100644 --- a/datafusion/core/src/datasource/physical_plan/mod.rs +++ b/datafusion/core/src/datasource/physical_plan/mod.rs @@ -17,7 +17,7 @@ //! Execution plans that read file formats -mod arrow_file; +pub mod arrow; pub mod csv; pub mod json; @@ -35,10 +35,9 @@ pub use datafusion_datasource_parquet::source::ParquetSource; #[cfg(feature = "parquet")] pub use datafusion_datasource_parquet::{ParquetFileMetrics, ParquetFileReaderFactory}; -pub use arrow_file::ArrowSource; - pub use json::{JsonOpener, JsonSource}; +pub use arrow::{ArrowOpener, ArrowSource}; pub use csv::{CsvOpener, CsvSource}; pub use datafusion_datasource::file::FileSource; pub use datafusion_datasource::file_groups::FileGroup; diff --git a/datafusion/datasource-arrow/Cargo.toml b/datafusion/datasource-arrow/Cargo.toml new file mode 100644 index 0000000000000..b3d1e3f2accc9 --- /dev/null +++ b/datafusion/datasource-arrow/Cargo.toml @@ -0,0 +1,64 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +name = "datafusion-datasource-arrow" +description = "datafusion-datasource-arrow" +readme = "README.md" +authors.workspace = true +edition.workspace = true +homepage.workspace = true +license.workspace = true +repository.workspace = true +rust-version.workspace = true +version.workspace = true + +[package.metadata.docs.rs] +all-features = true + +[dependencies] +arrow = { workspace = true } +arrow-ipc = { workspace = true } +async-trait = { workspace = true } +bytes = { workspace = true } +datafusion-common = { workspace = true, features = ["object_store"] } +datafusion-common-runtime = { workspace = true } +datafusion-datasource = { workspace = true } +datafusion-execution = { workspace = true } +datafusion-expr = { workspace = true } +datafusion-physical-expr-common = { workspace = true } +datafusion-physical-plan = { workspace = true } +datafusion-session = { workspace = true } +futures = { workspace = true } +itertools = { workspace = true } +object_store = { workspace = true } +tokio = { workspace = true } + +[dev-dependencies] +chrono = { workspace = true } + +[lints] +workspace = true + +[lib] +name = "datafusion_datasource_arrow" +path = "src/mod.rs" + +[features] +compression = [ + "arrow-ipc/zstd", +] diff --git a/datafusion/datasource-arrow/LICENSE.txt b/datafusion/datasource-arrow/LICENSE.txt new file mode 100644 index 0000000000000..d74c6b599d2ae --- /dev/null +++ b/datafusion/datasource-arrow/LICENSE.txt @@ -0,0 +1,212 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR 
USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. 
For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + +This project includes code from Apache Aurora. + +* dev/release/{release,changelog,release-candidate} are based on the scripts from + Apache Aurora + +Copyright: 2016 The Apache Software Foundation. +Home page: https://aurora.apache.org/ +License: http://www.apache.org/licenses/LICENSE-2.0 diff --git a/datafusion/datasource-arrow/NOTICE.txt b/datafusion/datasource-arrow/NOTICE.txt new file mode 100644 index 0000000000000..7f3c80d606c07 --- /dev/null +++ b/datafusion/datasource-arrow/NOTICE.txt @@ -0,0 +1,5 @@ +Apache DataFusion +Copyright 2019-2025 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). 
diff --git a/datafusion/datasource-arrow/README.md b/datafusion/datasource-arrow/README.md new file mode 100644 index 0000000000000..9901b52105dd4 --- /dev/null +++ b/datafusion/datasource-arrow/README.md @@ -0,0 +1,34 @@ + + +# Apache DataFusion Arrow DataSource + +[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format. + +This crate is a submodule of DataFusion that defines an Arrow-based file source. +It works with files following the [Arrow IPC format]. + +Most projects should use the [`datafusion`] crate directly, which re-exports +this module. If you are already using the [`datafusion`] crate, there is no +reason to use this crate directly in your project as well. + +[apache arrow]: https://arrow.apache.org/ +[apache datafusion]: https://datafusion.apache.org/ +[`datafusion`]: https://crates.io/crates/datafusion +[arrow ipc format]: https://arrow.apache.org/docs/format/Columnar.html#ipc-file-format diff --git a/datafusion/datasource-arrow/src/file_format.rs b/datafusion/datasource-arrow/src/file_format.rs new file mode 100644 index 0000000000000..3b85640804219 --- /dev/null +++ b/datafusion/datasource-arrow/src/file_format.rs @@ -0,0 +1,603 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied.
See the License for the +// specific language governing permissions and limitations +// under the License. + +//! [`ArrowFormat`]: Apache Arrow [`FileFormat`] abstractions +//! +//! Works with files following the [Arrow IPC format](https://arrow.apache.org/docs/format/Columnar.html#ipc-file-format) + +use std::any::Any; +use std::borrow::Cow; +use std::collections::HashMap; +use std::fmt::{self, Debug}; +use std::sync::Arc; + +use arrow::datatypes::{Schema, SchemaRef}; +use arrow::error::ArrowError; +use arrow::ipc::convert::fb_to_schema; +use arrow::ipc::reader::FileReader; +use arrow::ipc::writer::IpcWriteOptions; +use arrow::ipc::{root_as_message, CompressionType}; +use datafusion_common::error::Result; +use datafusion_common::parsers::CompressionTypeVariant; +use datafusion_common::{ + internal_datafusion_err, not_impl_err, DataFusionError, GetExt, Statistics, + DEFAULT_ARROW_EXTENSION, +}; +use datafusion_common_runtime::{JoinSet, SpawnedTask}; +use datafusion_datasource::display::FileGroupDisplay; +use datafusion_datasource::file::FileSource; +use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder}; +use datafusion_datasource::sink::{DataSink, DataSinkExec}; +use datafusion_datasource::write::{ + get_writer_schema, ObjectWriterBuilder, SharedBuffer, +}; +use datafusion_execution::{SendableRecordBatchStream, TaskContext}; +use datafusion_expr::dml::InsertOp; +use datafusion_physical_expr_common::sort_expr::LexRequirement; + +use crate::source::ArrowSource; +use async_trait::async_trait; +use bytes::Bytes; +use datafusion_datasource::file_compression_type::FileCompressionType; +use datafusion_datasource::file_format::{FileFormat, FileFormatFactory}; +use datafusion_datasource::file_sink_config::{FileSink, FileSinkConfig}; +use datafusion_datasource::source::DataSourceExec; +use datafusion_datasource::write::demux::DemuxedStreamReceiver; +use datafusion_physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan}; +use 
datafusion_session::Session; +use futures::stream::BoxStream; +use futures::StreamExt; +use object_store::{GetResultPayload, ObjectMeta, ObjectStore}; +use tokio::io::AsyncWriteExt; + +/// Initial writing buffer size. Note this is just a size hint for efficiency. It +/// will grow beyond the set value if needed. +const INITIAL_BUFFER_BYTES: usize = 1048576; + +/// If the buffered Arrow data exceeds this size, it is flushed to object store +const BUFFER_FLUSH_BYTES: usize = 1024000; + +#[derive(Default, Debug)] +/// Factory struct used to create [ArrowFormat] +pub struct ArrowFormatFactory; + +impl ArrowFormatFactory { + /// Creates an instance of [ArrowFormatFactory] + pub fn new() -> Self { + Self {} + } +} + +impl FileFormatFactory for ArrowFormatFactory { + fn create( + &self, + _state: &dyn Session, + _format_options: &HashMap, + ) -> Result> { + Ok(Arc::new(ArrowFormat)) + } + + fn default(&self) -> Arc { + Arc::new(ArrowFormat) + } + + fn as_any(&self) -> &dyn Any { + self + } +} + +impl GetExt for ArrowFormatFactory { + fn get_ext(&self) -> String { + // Removes the leading dot, e.g. ".arrow" -> "arrow" + DEFAULT_ARROW_EXTENSION[1..].to_string() + } +} + +/// Arrow `FileFormat` implementation. +#[derive(Default, Debug)] +pub struct ArrowFormat; + +#[async_trait] +impl FileFormat for ArrowFormat { + fn as_any(&self) -> &dyn Any { + self + } + + fn get_ext(&self) -> String { + ArrowFormatFactory::new().get_ext() + } + + fn get_ext_with_compression( + &self, + file_compression_type: &FileCompressionType, + ) -> Result { + let ext = self.get_ext(); + match file_compression_type.get_variant() { + CompressionTypeVariant::UNCOMPRESSED => Ok(ext), + _ => Err(internal_datafusion_err!( + "Arrow FileFormat does not support compression."
+ )), + } + } + + fn compression_type(&self) -> Option { + None + } + + async fn infer_schema( + &self, + _state: &dyn Session, + store: &Arc, + objects: &[ObjectMeta], + ) -> Result { + let mut schemas = vec![]; + for object in objects { + let r = store.as_ref().get(&object.location).await?; + let schema = match r.payload { + #[cfg(not(target_arch = "wasm32"))] + GetResultPayload::File(mut file, _) => { + let reader = FileReader::try_new(&mut file, None)?; + reader.schema() + } + GetResultPayload::Stream(stream) => { + infer_schema_from_file_stream(stream).await? + } + }; + schemas.push(schema.as_ref().clone()); + } + let merged_schema = Schema::try_merge(schemas)?; + Ok(Arc::new(merged_schema)) + } + + async fn infer_stats( + &self, + _state: &dyn Session, + _store: &Arc, + table_schema: SchemaRef, + _object: &ObjectMeta, + ) -> Result { + Ok(Statistics::new_unknown(&table_schema)) + } + + async fn create_physical_plan( + &self, + _state: &dyn Session, + conf: FileScanConfig, + ) -> Result> { + let source = Arc::new(ArrowSource::default()); + let config = FileScanConfigBuilder::from(conf) + .with_source(source) + .build(); + + Ok(DataSourceExec::from_data_source(config)) + } + + async fn create_writer_physical_plan( + &self, + input: Arc, + _state: &dyn Session, + conf: FileSinkConfig, + order_requirements: Option, + ) -> Result> { + if conf.insert_op != InsertOp::Append { + return not_impl_err!("Overwrites are not implemented yet for Arrow format"); + } + + let sink = Arc::new(ArrowFileSink::new(conf)); + + Ok(Arc::new(DataSinkExec::new(input, sink, order_requirements)) as _) + } + + fn file_source(&self) -> Arc { + Arc::new(ArrowSource::default()) + } +} + +/// Implements [`FileSink`] for writing to arrow_ipc files +struct ArrowFileSink { + config: FileSinkConfig, +} + +impl ArrowFileSink { + fn new(config: FileSinkConfig) -> Self { + Self { config } + } +} + +#[async_trait] +impl FileSink for ArrowFileSink { + fn config(&self) -> &FileSinkConfig { + 
&self.config + } + + async fn spawn_writer_tasks_and_join( + &self, + context: &Arc, + demux_task: SpawnedTask>, + mut file_stream_rx: DemuxedStreamReceiver, + object_store: Arc, + ) -> Result { + let mut file_write_tasks: JoinSet> = + JoinSet::new(); + + let ipc_options = + IpcWriteOptions::try_new(64, false, arrow_ipc::MetadataVersion::V5)? + .try_with_compression(Some(CompressionType::LZ4_FRAME))?; + while let Some((path, mut rx)) = file_stream_rx.recv().await { + let shared_buffer = SharedBuffer::new(INITIAL_BUFFER_BYTES); + let mut arrow_writer = arrow_ipc::writer::FileWriter::try_new_with_options( + shared_buffer.clone(), + &get_writer_schema(&self.config), + ipc_options.clone(), + )?; + let mut object_store_writer = ObjectWriterBuilder::new( + FileCompressionType::UNCOMPRESSED, + &path, + Arc::clone(&object_store), + ) + .with_buffer_size(Some( + context + .session_config() + .options() + .execution + .objectstore_writer_buffer_size, + )) + .build()?; + file_write_tasks.spawn(async move { + let mut row_count = 0; + while let Some(batch) = rx.recv().await { + row_count += batch.num_rows(); + arrow_writer.write(&batch)?; + let mut buff_to_flush = shared_buffer.buffer.try_lock().unwrap(); + if buff_to_flush.len() > BUFFER_FLUSH_BYTES { + object_store_writer + .write_all(buff_to_flush.as_slice()) + .await?; + buff_to_flush.clear(); + } + } + arrow_writer.finish()?; + let final_buff = shared_buffer.buffer.try_lock().unwrap(); + + object_store_writer.write_all(final_buff.as_slice()).await?; + object_store_writer.shutdown().await?; + Ok(row_count) + }); + } + + let mut row_count = 0; + while let Some(result) = file_write_tasks.join_next().await { + match result { + Ok(r) => { + row_count += r?; + } + Err(e) => { + if e.is_panic() { + std::panic::resume_unwind(e.into_panic()); + } else { + unreachable!(); + } + } + } + } + + demux_task + .join_unwind() + .await + .map_err(|e| DataFusionError::ExecutionJoin(Box::new(e)))??; + Ok(row_count as u64) + } +} + +impl Debug 
for ArrowFileSink { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("ArrowFileSink").finish() + } +} + +impl DisplayAs for ArrowFileSink { + fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match t { + DisplayFormatType::Default | DisplayFormatType::Verbose => { + write!(f, "ArrowFileSink(file_groups=",)?; + FileGroupDisplay(&self.config.file_group).fmt_as(t, f)?; + write!(f, ")") + } + DisplayFormatType::TreeRender => { + writeln!(f, "format: arrow")?; + write!(f, "file={}", &self.config.original_url) + } + } + } +} + +#[async_trait] +impl DataSink for ArrowFileSink { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> &SchemaRef { + self.config.output_schema() + } + + async fn write_all( + &self, + data: SendableRecordBatchStream, + context: &Arc, + ) -> Result { + FileSink::write_all(self, data, context).await + } +} + +const ARROW_MAGIC: [u8; 6] = [b'A', b'R', b'R', b'O', b'W', b'1']; +const CONTINUATION_MARKER: [u8; 4] = [0xff; 4]; + +/// Custom implementation of inferring schema. Should eventually be moved upstream to arrow-rs. +/// See +async fn infer_schema_from_file_stream( + mut stream: BoxStream<'static, object_store::Result>, +) -> Result { + // Expected format: + // <magic number "ARROW1"> - 6 bytes + // <empty padding bytes [to 8 byte boundary]> - 2 bytes + // <continuation: 0xFFFFFFFF> - 4 bytes, not present below v0.15.0 + // <metadata_size: int32> - 4 bytes + // <metadata_flatbuffer: bytes> + // <rest of file bytes> + + // So in first read we need at least all known sized sections, + // which is 6 + 2 + 4 + 4 = 16 bytes.
+ let bytes = collect_at_least_n_bytes(&mut stream, 16, None).await?; + + // Files should start with these magic bytes + if bytes[0..6] != ARROW_MAGIC { + return Err(ArrowError::ParseError( + "Arrow file does not contain correct header".to_string(), + ))?; + } + + // Since continuation marker bytes added in later versions + let (meta_len, rest_of_bytes_start_index) = if bytes[8..12] == CONTINUATION_MARKER { + (&bytes[12..16], 16) + } else { + (&bytes[8..12], 12) + }; + + let meta_len = [meta_len[0], meta_len[1], meta_len[2], meta_len[3]]; + let meta_len = i32::from_le_bytes(meta_len); + + // Read bytes for Schema message + let block_data = if bytes[rest_of_bytes_start_index..].len() < meta_len as usize { + // Need to read more bytes to decode Message + let mut block_data = Vec::with_capacity(meta_len as usize); + // In case we had some spare bytes in our initial read chunk + block_data.extend_from_slice(&bytes[rest_of_bytes_start_index..]); + let size_to_read = meta_len as usize - block_data.len(); + let block_data = + collect_at_least_n_bytes(&mut stream, size_to_read, Some(block_data)).await?; + Cow::Owned(block_data) + } else { + // Already have the bytes we need + let end_index = meta_len as usize + rest_of_bytes_start_index; + let block_data = &bytes[rest_of_bytes_start_index..end_index]; + Cow::Borrowed(block_data) + }; + + // Decode Schema message + let message = root_as_message(&block_data).map_err(|err| { + ArrowError::ParseError(format!("Unable to read IPC message as metadata: {err:?}")) + })?; + let ipc_schema = message.header_as_schema().ok_or_else(|| { + ArrowError::IpcError("Unable to read IPC message as schema".to_string()) + })?; + let schema = fb_to_schema(ipc_schema); + + Ok(Arc::new(schema)) +} + +async fn collect_at_least_n_bytes( + stream: &mut BoxStream<'static, object_store::Result>, + n: usize, + extend_from: Option>, +) -> Result> { + let mut buf = extend_from.unwrap_or_else(|| Vec::with_capacity(n)); + // If extending existing buffer then 
ensure we read n additional bytes + let n = n + buf.len(); + while let Some(bytes) = stream.next().await.transpose()? { + buf.extend_from_slice(&bytes); + if buf.len() >= n { + break; + } + } + if buf.len() < n { + return Err(ArrowError::ParseError( + "Unexpected end of byte stream for Arrow IPC file".to_string(), + ))?; + } + Ok(buf) +} + +#[cfg(test)] +mod tests { + use super::*; + + use chrono::DateTime; + use datafusion_common::config::TableOptions; + use datafusion_common::DFSchema; + use datafusion_execution::config::SessionConfig; + use datafusion_execution::runtime_env::RuntimeEnv; + use datafusion_expr::execution_props::ExecutionProps; + use datafusion_expr::{AggregateUDF, Expr, LogicalPlan, ScalarUDF, WindowUDF}; + use datafusion_physical_expr_common::physical_expr::PhysicalExpr; + use object_store::{chunked::ChunkedStore, memory::InMemory, path::Path}; + + struct MockSession { + config: SessionConfig, + runtime_env: Arc, + } + + impl MockSession { + fn new() -> Self { + Self { + config: SessionConfig::new(), + runtime_env: Arc::new(RuntimeEnv::default()), + } + } + } + + #[async_trait::async_trait] + impl Session for MockSession { + fn session_id(&self) -> &str { + unimplemented!() + } + + fn config(&self) -> &SessionConfig { + &self.config + } + + async fn create_physical_plan( + &self, + _logical_plan: &LogicalPlan, + ) -> Result> { + unimplemented!() + } + + fn create_physical_expr( + &self, + _expr: Expr, + _df_schema: &DFSchema, + ) -> Result> { + unimplemented!() + } + + fn scalar_functions(&self) -> &HashMap> { + unimplemented!() + } + + fn aggregate_functions(&self) -> &HashMap> { + unimplemented!() + } + + fn window_functions(&self) -> &HashMap> { + unimplemented!() + } + + fn runtime_env(&self) -> &Arc { + &self.runtime_env + } + + fn execution_props(&self) -> &ExecutionProps { + unimplemented!() + } + + fn as_any(&self) -> &dyn Any { + unimplemented!() + } + + fn table_options(&self) -> &TableOptions { + unimplemented!() + } + + fn 
table_options_mut(&mut self) -> &mut TableOptions { + unimplemented!() + } + + fn task_ctx(&self) -> Arc { + unimplemented!() + } + } + + #[tokio::test] + async fn test_infer_schema_stream() -> Result<()> { + let mut bytes = std::fs::read("tests/data/example.arrow")?; + bytes.truncate(bytes.len() - 20); // mangle end to show we don't need to read whole file + let location = Path::parse("example.arrow")?; + let in_memory_store: Arc = Arc::new(InMemory::new()); + in_memory_store.put(&location, bytes.into()).await?; + + let state = MockSession::new(); + let object_meta = ObjectMeta { + location, + last_modified: DateTime::default(), + size: u64::MAX, + e_tag: None, + version: None, + }; + + let arrow_format = ArrowFormat {}; + let expected = vec!["f0: Int64", "f1: Utf8", "f2: Boolean"]; + + // Test chunk sizes where too small so we keep having to read more bytes + // And when large enough that first read contains all we need + for chunk_size in [7, 3000] { + let store = Arc::new(ChunkedStore::new(in_memory_store.clone(), chunk_size)); + let inferred_schema = arrow_format + .infer_schema( + &state, + &(store.clone() as Arc), + std::slice::from_ref(&object_meta), + ) + .await?; + let actual_fields = inferred_schema + .fields() + .iter() + .map(|f| format!("{}: {:?}", f.name(), f.data_type())) + .collect::>(); + assert_eq!(expected, actual_fields); + } + + Ok(()) + } + + #[tokio::test] + async fn test_infer_schema_short_stream() -> Result<()> { + let mut bytes = std::fs::read("tests/data/example.arrow")?; + bytes.truncate(20); // should cause error that file shorter than expected + let location = Path::parse("example.arrow")?; + let in_memory_store: Arc = Arc::new(InMemory::new()); + in_memory_store.put(&location, bytes.into()).await?; + + let state = MockSession::new(); + let object_meta = ObjectMeta { + location, + last_modified: DateTime::default(), + size: u64::MAX, + e_tag: None, + version: None, + }; + + let arrow_format = ArrowFormat {}; + + let store = 
Arc::new(ChunkedStore::new(in_memory_store.clone(), 7)); + let err = arrow_format + .infer_schema( + &state, + &(store.clone() as Arc), + std::slice::from_ref(&object_meta), + ) + .await; + + assert!(err.is_err()); + assert_eq!( + "Arrow error: Parser error: Unexpected end of byte stream for Arrow IPC file", + err.unwrap_err().to_string().lines().next().unwrap() + ); + + Ok(()) + } +} diff --git a/datafusion/datasource-arrow/src/mod.rs b/datafusion/datasource-arrow/src/mod.rs new file mode 100644 index 0000000000000..18bb8792c3ffe --- /dev/null +++ b/datafusion/datasource-arrow/src/mod.rs @@ -0,0 +1,25 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// Make sure fast / cheap clones on Arc are explicit: +// https://github.com/apache/datafusion/issues/11143 +#![cfg_attr(not(test), deny(clippy::clone_on_ref_ptr))] + +pub mod file_format; +pub mod source; + +pub use file_format::*; diff --git a/datafusion/core/src/datasource/physical_plan/arrow_file.rs b/datafusion/datasource-arrow/src/source.rs similarity index 98% rename from datafusion/core/src/datasource/physical_plan/arrow_file.rs rename to datafusion/datasource-arrow/src/source.rs index b37dc499d4035..f43f11880182b 100644 --- a/datafusion/core/src/datasource/physical_plan/arrow_file.rs +++ b/datafusion/datasource-arrow/src/source.rs @@ -18,20 +18,21 @@ use std::any::Any; use std::sync::Arc; -use crate::datasource::physical_plan::{FileOpenFuture, FileOpener}; -use crate::error::Result; use datafusion_datasource::as_file_source; use datafusion_datasource::schema_adapter::SchemaAdapterFactory; use arrow::buffer::Buffer; use arrow::datatypes::SchemaRef; use arrow_ipc::reader::FileDecoder; +use datafusion_common::error::Result; use datafusion_common::{exec_datafusion_err, Statistics}; use datafusion_datasource::file::FileSource; use datafusion_datasource::file_scan_config::FileScanConfig; use datafusion_datasource::PartitionedFile; use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; +use datafusion_datasource::file_stream::FileOpenFuture; +use datafusion_datasource::file_stream::FileOpener; use futures::StreamExt; use itertools::Itertools; use object_store::{GetOptions, GetRange, GetResultPayload, ObjectStore}; diff --git a/datafusion/core/tests/data/example.arrow b/datafusion/datasource-arrow/tests/data/example.arrow similarity index 100% rename from datafusion/core/tests/data/example.arrow rename to datafusion/datasource-arrow/tests/data/example.arrow diff --git a/datafusion/sqllogictest/test_files/arrow_files.slt b/datafusion/sqllogictest/test_files/arrow_files.slt index 62453ec4bf3e6..b3975e0c3f471 100644 --- 
a/datafusion/sqllogictest/test_files/arrow_files.slt +++ b/datafusion/sqllogictest/test_files/arrow_files.slt @@ -29,7 +29,7 @@ statement ok CREATE EXTERNAL TABLE arrow_simple STORED AS ARROW -LOCATION '../core/tests/data/example.arrow'; +LOCATION '../datasource-arrow/tests/data/example.arrow'; # physical plan @@ -37,7 +37,7 @@ query TT EXPLAIN SELECT * FROM arrow_simple ---- logical_plan TableScan: arrow_simple projection=[f0, f1, f2] -physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/example.arrow]]}, projection=[f0, f1, f2], file_type=arrow +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/datasource-arrow/tests/data/example.arrow]]}, projection=[f0, f1, f2], file_type=arrow # correct content query ITB @@ -50,8 +50,8 @@ SELECT * FROM arrow_simple # Ensure that local files can not be read by default (a potential security issue) # (url table is only supported when DynamicFileCatalog is enabled) -statement error DataFusion error: Error during planning: table 'datafusion.public.../core/tests/data/example.arrow' not found -SELECT * FROM '../core/tests/data/example.arrow'; +statement error DataFusion error: Error during planning: table 'datafusion.public.../datasource-arrow/tests/data/example.arrow' not found +SELECT * FROM '../datasource-arrow/tests/data/example.arrow'; # ARROW partitioned table statement ok diff --git a/datafusion/sqllogictest/test_files/ddl.slt b/datafusion/sqllogictest/test_files/ddl.slt index 03ef08e1a5f83..bc6cbfab0caed 100644 --- a/datafusion/sqllogictest/test_files/ddl.slt +++ b/datafusion/sqllogictest/test_files/ddl.slt @@ -312,7 +312,7 @@ DROP TABLE aggregate_simple # Arrow format statement ok -CREATE external table arrow_simple STORED as ARROW LOCATION '../core/tests/data/example.arrow'; +CREATE external table arrow_simple STORED as ARROW LOCATION '../datasource-arrow/tests/data/example.arrow'; query ITB rowsort SELECT * FROM arrow_simple order by f1 LIMIT 1 @@ 
-796,7 +796,7 @@ logical_plan 02)--Values: (Int64(1), Int64(2), Int64(3)) query TT -explain CREATE EXTERNAL TEMPORARY TABLE tty STORED as ARROW LOCATION '../core/tests/data/example.arrow'; +explain CREATE EXTERNAL TEMPORARY TABLE tty STORED as ARROW LOCATION '../datasource-arrow/tests/data/example.arrow'; ---- logical_plan CreateExternalTable: Bare { table: "tty" } @@ -804,7 +804,7 @@ statement ok set datafusion.explain.logical_plan_only=false; statement error DataFusion error: This feature is not implemented: Temporary tables not supported -CREATE EXTERNAL TEMPORARY TABLE tty STORED as ARROW LOCATION '../core/tests/data/example.arrow'; +CREATE EXTERNAL TEMPORARY TABLE tty STORED as ARROW LOCATION '../datasource-arrow/tests/data/example.arrow'; statement error DataFusion error: This feature is not implemented: Temporary views not supported CREATE TEMPORARY VIEW y AS VALUES (1,2,3); diff --git a/datafusion/sqllogictest/test_files/repartition_scan.slt b/datafusion/sqllogictest/test_files/repartition_scan.slt index c536c8165c5a3..41718b3aebc27 100644 --- a/datafusion/sqllogictest/test_files/repartition_scan.slt +++ b/datafusion/sqllogictest/test_files/repartition_scan.slt @@ -244,7 +244,7 @@ DROP TABLE json_table; statement ok CREATE EXTERNAL TABLE arrow_table STORED AS ARROW -LOCATION '../core/tests/data/example.arrow'; +LOCATION '../datasource-arrow/tests/data/example.arrow'; # It would be great to see the file read as "4" groups with even sizes (offsets) eventually @@ -253,7 +253,7 @@ query TT EXPLAIN SELECT * FROM arrow_table ---- logical_plan TableScan: arrow_table projection=[f0, f1, f2] -physical_plan DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/core/tests/data/example.arrow:0..461], [WORKSPACE_ROOT/datafusion/core/tests/data/example.arrow:461..922], [WORKSPACE_ROOT/datafusion/core/tests/data/example.arrow:922..1383], [WORKSPACE_ROOT/datafusion/core/tests/data/example.arrow:1383..1842]]}, projection=[f0, f1, f2], file_type=arrow 
+physical_plan DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/datasource-arrow/tests/data/example.arrow:0..461], [WORKSPACE_ROOT/datafusion/datasource-arrow/tests/data/example.arrow:461..922], [WORKSPACE_ROOT/datafusion/datasource-arrow/tests/data/example.arrow:922..1383], [WORKSPACE_ROOT/datafusion/datasource-arrow/tests/data/example.arrow:1383..1842]]}, projection=[f0, f1, f2], file_type=arrow # correct content query ITB diff --git a/dev/release/README.md b/dev/release/README.md index d70e256f73831..1b78f8d13be98 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -295,6 +295,7 @@ Verify that the Cargo.toml in the tarball contains the correct version (cd datafusion/catalog && cargo publish) (cd datafusion/catalog-listing && cargo publish) (cd datafusion/functions-table && cargo publish) +(cd datafusion/datasource-arrow && cargo publish) (cd datafusion/datasource-csv && cargo publish) (cd datafusion/datasource-json && cargo publish) (cd datafusion/datasource-parquet && cargo publish) From dce59f850ce9cf3e2aa36402a1593a8b25432336 Mon Sep 17 00:00:00 2001 From: Blake Orth Date: Fri, 17 Oct 2025 07:30:15 -0600 Subject: [PATCH 025/109] Adds instrumentation to LIST operations in CLI (#18103) ## Which issue does this PR close? This does not fully close, but is an incremental building block component for: - https://github.com/apache/datafusion/issues/17207 The full context of how this code is likely to progress can be seen in the POC for this effort: - https://github.com/apache/datafusion/pull/17266 ## Rationale for this change Continued progress filling out the methods that are instrumented for the instrumented object store. ## What changes are included in this PR? - Adds instrumentation around basic list operations into the instrumented object store - Adds test cases for new code ## Are these changes tested? Yes. 
Example output: ```sql DataFusion CLI v50.2.0 > \object_store_profiling trace ObjectStore Profile mode set to Trace > CREATE EXTERNAL TABLE nyc_taxi_rides STORED AS PARQUET LOCATION 's3://altinity-clickhouse-data/nyc_taxi_rides/data/tripdata_parquet'; 0 row(s) fetched. Elapsed 2.679 seconds. Object Store Profiling Instrumented Object Store: instrument_mode: Trace, inner: AmazonS3(altinity-clickhouse-data) 2025-10-16T18:53:09.512970085+00:00 operation=List path=nyc_taxi_rides/data/tripdata_parquet Summaries: List count: 1 Instrumented Object Store: instrument_mode: Trace, inner: AmazonS3(altinity-clickhouse-data) 2025-10-16T18:53:09.929709943+00:00 operation=List path=nyc_taxi_rides/data/tripdata_parquet 2025-10-16T18:53:10.106757629+00:00 operation=List path=nyc_taxi_rides/data/tripdata_parquet 2025-10-16T18:53:10.220555058+00:00 operation=Get duration=0.230604s size=8 range: bytes=222192975-222192982 path=nyc_taxi_rides/data/tripdata_parquet/data-200901.parquet 2025-10-16T18:53:10.226399832+00:00 operation=Get duration=0.263826s size=8 range: bytes=233123927-233123934 path=nyc_taxi_rides/data/tripdata_parquet/data-201104.parquet 2025-10-16T18:53:10.226194195+00:00 operation=Get duration=0.269754s size=8 range: bytes=252843253-252843260 path=nyc_taxi_rides/data/tripdata_parquet/data-201103.parquet . . . 
2025-10-16T18:53:11.928787014+00:00 operation=Get duration=0.072248s size=18278 range: bytes=201384109-201402386 path=nyc_taxi_rides/data/tripdata_parquet/data-201509.parquet 2025-10-16T18:53:11.933475464+00:00 operation=Get duration=0.068880s size=17175 range: bytes=195411804-195428978 path=nyc_taxi_rides/data/tripdata_parquet/data-201601.parquet 2025-10-16T18:53:11.949629591+00:00 operation=Get duration=0.065645s size=19872 range: bytes=214807880-214827751 path=nyc_taxi_rides/data/tripdata_parquet/data-201603.parquet Summaries: List count: 2 Get count: 288 duration min: 0.060930s duration max: 0.444601s duration avg: 0.133339s size min: 8 B size max: 44247 B size avg: 18870 B size sum: 5434702 B > ``` ## Are there any user-facing changes? No-ish ## cc @alamb --- .../src/object_storage/instrumented.rs | 70 +++++++++++++++++-- 1 file changed, 64 insertions(+), 6 deletions(-) diff --git a/datafusion-cli/src/object_storage/instrumented.rs b/datafusion-cli/src/object_storage/instrumented.rs index cb96734f24645..8acece315f764 100644 --- a/datafusion-cli/src/object_storage/instrumented.rs +++ b/datafusion-cli/src/object_storage/instrumented.rs @@ -114,6 +114,11 @@ impl InstrumentedObjectStore { req.drain(..).collect() } + fn enabled(&self) -> bool { + self.instrument_mode.load(Ordering::Relaxed) + != InstrumentedObjectStoreMode::Disabled as u8 + } + async fn instrumented_get_opts( &self, location: &Path, @@ -138,6 +143,26 @@ impl InstrumentedObjectStore { Ok(ret) } + + fn instrumented_list( + &self, + prefix: Option<&Path>, + ) -> BoxStream<'static, Result> { + let timestamp = Utc::now(); + let ret = self.inner.list(prefix); + + self.requests.lock().push(RequestDetails { + op: Operation::List, + path: prefix.cloned().unwrap_or_else(|| Path::from("")), + timestamp, + duration: None, // list returns a stream, so the duration isn't meaningful + size: None, + range: None, + extra_display: None, + }); + + ret + } } impl fmt::Display for InstrumentedObjectStore { @@ -172,9 
+197,7 @@ impl ObjectStore for InstrumentedObjectStore { } async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { - if self.instrument_mode.load(Ordering::Relaxed) - != InstrumentedObjectStoreMode::Disabled as u8 - { + if self.enabled() { return self.instrumented_get_opts(location, options).await; } @@ -186,6 +209,10 @@ impl ObjectStore for InstrumentedObjectStore { } fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result> { + if self.enabled() { + return self.instrumented_list(prefix); + } + self.inner.list(prefix) } @@ -213,7 +240,7 @@ pub enum Operation { _Delete, Get, _Head, - _List, + List, _Put, } @@ -477,8 +504,9 @@ mod tests { assert_eq!(reg.stores().len(), 1); } - #[tokio::test] - async fn instrumented_store() { + // Returns an `InstrumentedObjectStore` with some data loaded for testing and the path to + // access the data + async fn setup_test_store() -> (InstrumentedObjectStore, Path) { let store = Arc::new(object_store::memory::InMemory::new()); let mode = AtomicU8::new(InstrumentedObjectStoreMode::default() as u8); let instrumented = InstrumentedObjectStore::new(store, mode); @@ -488,6 +516,13 @@ mod tests { let payload = PutPayload::from_static(b"test_data"); instrumented.put(&path, payload).await.unwrap(); + (instrumented, path) + } + + #[tokio::test] + async fn instrumented_store_get() { + let (instrumented, path) = setup_test_store().await; + // By default no requests should be instrumented/stored assert!(instrumented.requests.lock().is_empty()); let _ = instrumented.get(&path).await.unwrap(); @@ -511,6 +546,29 @@ mod tests { assert!(request.extra_display.is_none()); } + #[tokio::test] + async fn instrumented_store_list() { + let (instrumented, path) = setup_test_store().await; + + // By default no requests should be instrumented/stored + assert!(instrumented.requests.lock().is_empty()); + let _ = instrumented.list(Some(&path)); + assert!(instrumented.requests.lock().is_empty()); + + 
instrumented.set_instrument_mode(InstrumentedObjectStoreMode::Trace); + assert!(instrumented.requests.lock().is_empty()); + let _ = instrumented.list(Some(&path)); + assert_eq!(instrumented.requests.lock().len(), 1); + + let request = instrumented.take_requests().pop().unwrap(); + assert_eq!(request.op, Operation::List); + assert_eq!(request.path, path); + assert!(request.duration.is_none()); + assert!(request.size.is_none()); + assert!(request.range.is_none()); + assert!(request.extra_display.is_none()); + } + #[test] fn request_details() { let rd = RequestDetails { From 76050235b427d3e0f90c30a8222a8babe537ad9c Mon Sep 17 00:00:00 2001 From: Chen Chongchen Date: Fri, 17 Oct 2025 21:52:07 +0800 Subject: [PATCH 026/109] feat: spark udf array shuffle (#17674) ## Which issue does this PR close? ## Rationale for this change support shuffle udf ## What changes are included in this PR? support shuffle udf ## Are these changes tested? UT ## Are there any user-facing changes? No --- datafusion/spark/Cargo.toml | 2 +- datafusion/spark/src/function/array/mod.rs | 9 +- .../spark/src/function/array/shuffle.rs | 191 ++++++++++++++++++ .../test_files/spark/array/shuffle.slt | 113 +++++++++++ 4 files changed, 313 insertions(+), 2 deletions(-) create mode 100644 datafusion/spark/src/function/array/shuffle.rs create mode 100644 datafusion/sqllogictest/test_files/spark/array/shuffle.slt diff --git a/datafusion/spark/Cargo.toml b/datafusion/spark/Cargo.toml index b95cc31caec68..7f6210fb32bf6 100644 --- a/datafusion/spark/Cargo.toml +++ b/datafusion/spark/Cargo.toml @@ -46,12 +46,12 @@ datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } datafusion-functions = { workspace = true, features = ["crypto_expressions"] } log = { workspace = true } +rand = { workspace = true } sha1 = "0.10" url = { workspace = true } [dev-dependencies] criterion = { workspace = true } -rand = { workspace = true } [[bench]] harness = false diff --git 
a/datafusion/spark/src/function/array/mod.rs b/datafusion/spark/src/function/array/mod.rs index fed52a494281d..01056ba952984 100644 --- a/datafusion/spark/src/function/array/mod.rs +++ b/datafusion/spark/src/function/array/mod.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +pub mod shuffle; pub mod spark_array; use datafusion_expr::ScalarUDF; @@ -22,13 +23,19 @@ use datafusion_functions::make_udf_function; use std::sync::Arc; make_udf_function!(spark_array::SparkArray, array); +make_udf_function!(shuffle::SparkShuffle, shuffle); pub mod expr_fn { use datafusion_functions::export_functions; export_functions!((array, "Returns an array with the given elements.", args)); + export_functions!(( + shuffle, + "Returns a random permutation of the given array.", + args + )); } pub fn functions() -> Vec> { - vec![array()] + vec![array(), shuffle()] } diff --git a/datafusion/spark/src/function/array/shuffle.rs b/datafusion/spark/src/function/array/shuffle.rs new file mode 100644 index 0000000000000..abeafd3a93660 --- /dev/null +++ b/datafusion/spark/src/function/array/shuffle.rs @@ -0,0 +1,191 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::function::functions_nested_utils::make_scalar_function; +use arrow::array::{ + Array, ArrayRef, Capacities, FixedSizeListArray, GenericListArray, MutableArrayData, + OffsetSizeTrait, +}; +use arrow::buffer::OffsetBuffer; +use arrow::datatypes::DataType::{FixedSizeList, LargeList, List, Null}; +use arrow::datatypes::{DataType, FieldRef}; +use datafusion_common::cast::{ + as_fixed_size_list_array, as_large_list_array, as_list_array, +}; +use datafusion_common::{exec_err, utils::take_function_args, Result}; +use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; +use rand::rng; +use rand::seq::SliceRandom; +use std::any::Any; +use std::sync::Arc; + +#[derive(Debug, PartialEq, Eq, Hash)] +pub struct SparkShuffle { + signature: Signature, +} + +impl Default for SparkShuffle { + fn default() -> Self { + Self::new() + } +} + +impl SparkShuffle { + pub fn new() -> Self { + Self { + signature: Signature::arrays(1, None, Volatility::Volatile), + } + } +} + +impl ScalarUDFImpl for SparkShuffle { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "shuffle" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + Ok(arg_types[0].clone()) + } + + fn invoke_with_args( + &self, + args: datafusion_expr::ScalarFunctionArgs, + ) -> Result { + make_scalar_function(array_shuffle_inner)(&args.args) + } +} + +/// array_shuffle SQL function +pub fn array_shuffle_inner(arg: &[ArrayRef]) -> Result { + let [input_array] = take_function_args("shuffle", arg)?; + match &input_array.data_type() { + List(field) => { + let array = as_list_array(input_array)?; + general_array_shuffle::(array, field) + } + LargeList(field) => { + let array = as_large_list_array(input_array)?; + general_array_shuffle::(array, field) + } + FixedSizeList(field, _) => { + let array = as_fixed_size_list_array(input_array)?; + fixed_size_array_shuffle(array, field) + } + Null => 
Ok(Arc::clone(input_array)), + array_type => exec_err!("shuffle does not support type '{array_type}'."), + } +} + +fn general_array_shuffle( + array: &GenericListArray, + field: &FieldRef, +) -> Result { + let values = array.values(); + let original_data = values.to_data(); + let capacity = Capacities::Array(original_data.len()); + let mut offsets = vec![O::usize_as(0)]; + let mut nulls = vec![]; + let mut mutable = + MutableArrayData::with_capacities(vec![&original_data], false, capacity); + let mut rng = rng(); + + for (row_index, offset_window) in array.offsets().windows(2).enumerate() { + // skip the null value + if array.is_null(row_index) { + nulls.push(false); + offsets.push(offsets[row_index] + O::one()); + mutable.extend(0, 0, 1); + continue; + } + nulls.push(true); + let start = offset_window[0]; + let end = offset_window[1]; + let length = (end - start).to_usize().unwrap(); + + // Create indices and shuffle them + let mut indices: Vec = + (start.to_usize().unwrap()..end.to_usize().unwrap()).collect(); + indices.shuffle(&mut rng); + + // Add shuffled elements + for &index in &indices { + mutable.extend(0, index, index + 1); + } + + offsets.push(offsets[row_index] + O::usize_as(length)); + } + + let data = mutable.freeze(); + Ok(Arc::new(GenericListArray::::try_new( + Arc::clone(field), + OffsetBuffer::::new(offsets.into()), + arrow::array::make_array(data), + Some(nulls.into()), + )?)) +} + +fn fixed_size_array_shuffle( + array: &FixedSizeListArray, + field: &FieldRef, +) -> Result { + let values = array.values(); + let original_data = values.to_data(); + let capacity = Capacities::Array(original_data.len()); + let mut nulls = vec![]; + let mut mutable = + MutableArrayData::with_capacities(vec![&original_data], false, capacity); + let value_length = array.value_length() as usize; + let mut rng = rng(); + + for row_index in 0..array.len() { + // skip the null value + if array.is_null(row_index) { + nulls.push(false); + mutable.extend(0, 0, value_length); + 
continue; + } + nulls.push(true); + + let start = row_index * value_length; + let end = start + value_length; + + // Create indices and shuffle them + let mut indices: Vec = (start..end).collect(); + indices.shuffle(&mut rng); + + // Add shuffled elements + for &index in &indices { + mutable.extend(0, index, index + 1); + } + } + + let data = mutable.freeze(); + Ok(Arc::new(FixedSizeListArray::try_new( + Arc::clone(field), + array.value_length(), + arrow::array::make_array(data), + Some(nulls.into()), + )?)) +} diff --git a/datafusion/sqllogictest/test_files/spark/array/shuffle.slt b/datafusion/sqllogictest/test_files/spark/array/shuffle.slt new file mode 100644 index 0000000000000..cb3c77cac8fbb --- /dev/null +++ b/datafusion/sqllogictest/test_files/spark/array/shuffle.slt @@ -0,0 +1,113 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# Test shuffle function with simple arrays +query B +SELECT array_sort(shuffle([1, 2, 3, 4, 5, NULL])) = [NULL,1, 2, 3, 4, 5]; +---- +true + +query B +SELECT shuffle([1, 2, 3, 4, 5, NULL]) != [1, 2, 3, 4, 5, NULL]; +---- +true + +# Test shuffle function with string arrays + +query B +SELECT array_sort(shuffle(['a', 'b', 'c', 'd', 'e', 'f'])) = ['a', 'b', 'c', 'd', 'e', 'f']; +---- +true + +query B +SELECT shuffle(['a', 'b', 'c', 'd', 'e', 'f']) != ['a', 'b', 'c', 'd', 'e', 'f'];; +---- +true + +# Test shuffle function with empty array +query ? +SELECT shuffle([]); +---- +[] + +# Test shuffle function with single element +query ? +SELECT shuffle([42]); +---- +[42] + +# Test shuffle function with null array +query ? +SELECT shuffle(NULL); +---- +NULL + +# Test shuffle function with fixed size list arrays +query B +SELECT array_sort(shuffle(arrow_cast([1, 2, NULL, 3, 4, 5], 'FixedSizeList(6, Int64)'))) = [NULL, 1, 2, 3, 4, 5]; +---- +true + +query B +SELECT shuffle(arrow_cast([1, 2, NULL, 3, 4, 5], 'FixedSizeList(6, Int64)')) != [1, 2, NULL, 3, 4, 5]; +---- +true + +# Test shuffle on table data with different list types +statement ok +CREATE TABLE test_shuffle_list_types AS VALUES + ([1, 2, 3, 4]), + ([5, 6, 7, 8, 9]), + ([10]), + (NULL), + ([]); + +# Test shuffle with large list from table +query ? +SELECT array_sort(shuffle(column1)) FROM test_shuffle_list_types; +---- +[1, 2, 3, 4] +[5, 6, 7, 8, 9] +[10] +NULL +[] + +# Test fixed size list table +statement ok +CREATE TABLE test_shuffle_fixed_size AS VALUES + (arrow_cast([1, 2, 3], 'FixedSizeList(3, Int64)')), + (arrow_cast([4, 5, 6], 'FixedSizeList(3, Int64)')), + (arrow_cast([NULL, 8, 9], 'FixedSizeList(3, Int64)')), + (NULL); + +# Test shuffle with fixed size list from table +query ? 
+SELECT array_sort(shuffle(column1)) FROM test_shuffle_fixed_size; +---- +[1, 2, 3] +[4, 5, 6] +[NULL, 8, 9] +NULL + +# Clean up +statement ok +DROP TABLE test_shuffle_list_types; + +statement ok +DROP TABLE test_shuffle_fixed_size; + + From f0ab1369a200bc80b0a737bcbf54609b15b5015c Mon Sep 17 00:00:00 2001 From: Leonardo Yvens Date: Fri, 17 Oct 2025 15:57:43 +0200 Subject: [PATCH 027/109] make Union::try_new pub (#18125) ## Which issue does this PR close? - Closes #18126. ## Rationale for this change It's a useful constructor for users manipulating logical plans where they know the schemas will match exactly. We already expose other constructors for Union and constructors for logical plans. ## What changes are included in this PR? Makes `Union::try_new` a public function. ## Are these changes tested? Seems unnecessary. ## Are there any user-facing changes? The function is now public. Not a breaking change, but going forward changes to it would be breaking changes to users of the logical plan API. --- datafusion/expr/src/logical_plan/plan.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index b8200ab8a48c3..05a2564464c59 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -2753,7 +2753,8 @@ pub struct Union { impl Union { /// Constructs new Union instance deriving schema from inputs. - fn try_new(inputs: Vec>) -> Result { + /// Schema data types must match exactly. + pub fn try_new(inputs: Vec>) -> Result { let schema = Self::derive_schema_from_inputs(&inputs, false, false)?; Ok(Union { inputs, schema }) } From c9561049d96c4875efa6670f7a4b19556d00fb79 Mon Sep 17 00:00:00 2001 From: Chen Chongchen Date: Fri, 17 Oct 2025 22:22:12 +0800 Subject: [PATCH 028/109] fix: window unparsing (#17367) ## Which issue does this PR close? - Closes #17360.
## Rationale for this change In LogicalPlan::Filter unparsing, if there's a window expr, the filter should be converted to QUALIFY. Postgres must have an alias for a derived table; otherwise it will complain: ``` ERROR: subquery in FROM must have an alias. ``` This PR fixes that issue at the same time. ## What changes are included in this PR? If a window expr is found, convert the filter to QUALIFY. ## Are these changes tested? UT ## Are there any user-facing changes? No --------- Co-authored-by: Jeffrey Vo --- datafusion/sql/src/unparser/dialect.rs | 23 ++++++ datafusion/sql/src/unparser/plan.rs | 19 ++++- datafusion/sql/src/unparser/rewrite.rs | 66 +++++++++++++++++ datafusion/sql/tests/cases/plan_to_sql.rs | 86 +++++++++++++++++++++++ 4 files changed, 192 insertions(+), 2 deletions(-) diff --git a/datafusion/sql/src/unparser/dialect.rs b/datafusion/sql/src/unparser/dialect.rs index 647ad680674b0..834b0a97a47b0 100644 --- a/datafusion/sql/src/unparser/dialect.rs +++ b/datafusion/sql/src/unparser/dialect.rs @@ -207,6 +207,13 @@ pub trait Dialect: Send + Sync { Ok(None) } + /// Allows the dialect to support the QUALIFY clause + /// + /// Some dialects, like Postgres, do not support the QUALIFY clause + fn supports_qualify(&self) -> bool { + true + } + /// Allows the dialect to override logic of formatting datetime with tz into string.
fn timestamp_with_tz_to_string(&self, dt: DateTime, _unit: TimeUnit) -> String { dt.to_string() @@ -274,6 +281,14 @@ impl Dialect for DefaultDialect { pub struct PostgreSqlDialect {} impl Dialect for PostgreSqlDialect { + fn supports_qualify(&self) -> bool { + false + } + + fn requires_derived_table_alias(&self) -> bool { + true + } + fn identifier_quote_style(&self, _: &str) -> Option { Some('"') } @@ -424,6 +439,10 @@ impl Dialect for DuckDBDialect { pub struct MySqlDialect {} impl Dialect for MySqlDialect { + fn supports_qualify(&self) -> bool { + false + } + fn identifier_quote_style(&self, _: &str) -> Option { Some('`') } @@ -485,6 +504,10 @@ impl Dialect for MySqlDialect { pub struct SqliteDialect {} impl Dialect for SqliteDialect { + fn supports_qualify(&self) -> bool { + false + } + fn identifier_quote_style(&self, _: &str) -> Option { Some('`') } diff --git a/datafusion/sql/src/unparser/plan.rs b/datafusion/sql/src/unparser/plan.rs index b6c65614995a9..e7535338b7677 100644 --- a/datafusion/sql/src/unparser/plan.rs +++ b/datafusion/sql/src/unparser/plan.rs @@ -32,11 +32,11 @@ use super::{ }, Unparser, }; -use crate::unparser::ast::UnnestRelationBuilder; use crate::unparser::extension_unparser::{ UnparseToStatementResult, UnparseWithinStatementResult, }; use crate::unparser::utils::{find_unnest_node_until_relation, unproject_agg_exprs}; +use crate::unparser::{ast::UnnestRelationBuilder, rewrite::rewrite_qualify}; use crate::utils::UNNEST_PLACEHOLDER; use datafusion_common::{ internal_err, not_impl_err, @@ -95,7 +95,10 @@ pub fn plan_to_sql(plan: &LogicalPlan) -> Result { impl Unparser<'_> { pub fn plan_to_sql(&self, plan: &LogicalPlan) -> Result { - let plan = normalize_union_schema(plan)?; + let mut plan = normalize_union_schema(plan)?; + if !self.dialect.supports_qualify() { + plan = rewrite_qualify(plan)?; + } match plan { LogicalPlan::Projection(_) @@ -428,6 +431,18 @@ impl Unparser<'_> { unproject_agg_exprs(filter.predicate.clone(), agg, None)?; let 
filter_expr = self.expr_to_sql(&unprojected)?; select.having(Some(filter_expr)); + } else if let (Some(window), true) = ( + find_window_nodes_within_select( + plan, + None, + select.already_projected(), + ), + self.dialect.supports_qualify(), + ) { + let unprojected = + unproject_window_exprs(filter.predicate.clone(), &window)?; + let filter_expr = self.expr_to_sql(&unprojected)?; + select.qualify(Some(filter_expr)); } else { let filter_expr = self.expr_to_sql(&filter.predicate)?; select.selection(Some(filter_expr)); diff --git a/datafusion/sql/src/unparser/rewrite.rs b/datafusion/sql/src/unparser/rewrite.rs index aa480cf4fff92..c961f1d6f1f0c 100644 --- a/datafusion/sql/src/unparser/rewrite.rs +++ b/datafusion/sql/src/unparser/rewrite.rs @@ -100,6 +100,72 @@ fn rewrite_sort_expr_for_union(exprs: Vec) -> Result> { Ok(sort_exprs) } +/// Rewrite Filter plans that have a Window as their input by inserting a SubqueryAlias. +/// +/// When a Filter directly operates on a Window plan, it can cause issues during SQL unparsing +/// because window functions in a WHERE clause are not valid SQL. The solution is to wrap +/// the Window plan in a SubqueryAlias, effectively creating a derived table. 
+/// +/// Example transformation: +/// +/// Filter: condition +/// Window: window_function +/// TableScan: table +/// +/// becomes: +/// +/// Filter: condition +/// SubqueryAlias: __qualify_subquery +/// Projection: table.column1, table.column2 +/// Window: window_function +/// TableScan: table +/// +pub(super) fn rewrite_qualify(plan: LogicalPlan) -> Result { + let transformed_plan = plan.transform_up(|plan| match plan { + // Check if the filter's input is a Window plan + LogicalPlan::Filter(mut filter) => { + if matches!(&*filter.input, LogicalPlan::Window(_)) { + // Create a SubqueryAlias around the Window plan + let qualifier = filter + .input + .schema() + .iter() + .find_map(|(q, _)| q) + .map(|q| q.to_string()) + .unwrap_or_else(|| "__qualify_subquery".to_string()); + + // for Postgres, name of column for 'rank() over (...)' is 'rank' + // but in Datafusion, it is 'rank() over (...)' + // without projection, it's still an invalid sql in Postgres + + let project_exprs = filter + .input + .schema() + .iter() + .map(|(_, f)| datafusion_expr::col(f.name()).alias(f.name())) + .collect::>(); + + let input = + datafusion_expr::LogicalPlanBuilder::from(Arc::clone(&filter.input)) + .project(project_exprs)? 
+ .build()?; + + let subquery_alias = + datafusion_expr::SubqueryAlias::try_new(Arc::new(input), qualifier)?; + + filter.input = Arc::new(LogicalPlan::SubqueryAlias(subquery_alias)); + Ok(Transformed::yes(LogicalPlan::Filter(filter))) + } else { + Ok(Transformed::no(LogicalPlan::Filter(filter))) + } + } + + _ => Ok(Transformed::no(plan)), + }); + + transformed_plan.data() +} + /// Rewrite logic plan for query that order by columns are not in projections /// Plan before rewrite: /// diff --git a/datafusion/sql/tests/cases/plan_to_sql.rs b/datafusion/sql/tests/cases/plan_to_sql.rs index 7aa982dcf3dd9..5f76afb763cff 100644 --- a/datafusion/sql/tests/cases/plan_to_sql.rs +++ b/datafusion/sql/tests/cases/plan_to_sql.rs @@ -21,12 +21,14 @@ use datafusion_common::{ assert_contains, Column, DFSchema, DFSchemaRef, DataFusionError, Result, TableReference, }; +use datafusion_expr::expr::{WindowFunction, WindowFunctionParams}; use datafusion_expr::test::function_stub::{ count_udaf, max_udaf, min_udaf, sum, sum_udaf, }; use datafusion_expr::{ cast, col, lit, table_scan, wildcard, EmptyRelation, Expr, Extension, LogicalPlan, LogicalPlanBuilder, Union, UserDefinedLogicalNode, UserDefinedLogicalNodeCore, + WindowFrame, WindowFunctionDefinition, }; use datafusion_functions::unicode; use datafusion_functions_aggregate::grouping::grouping_udaf; @@ -2521,6 +2523,90 @@ fn test_unparse_left_semi_join_with_table_scan_projection() -> Result<()> { Ok(()) } +#[test] +fn test_unparse_window() -> Result<()> { + // SubqueryAlias: t + // Projection: t.k, t.v, rank() PARTITION BY [t.k] ORDER BY [t.v ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS r + // Filter: rank() PARTITION BY [t.k] ORDER BY [t.v ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW = UInt64(1) + // WindowAggr: windowExpr=[[rank() PARTITION BY [t.k] ORDER BY [t.v ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] + // TableScan: t projection=[k, v] + + let schema = 
Schema::new(vec![ + Field::new("k", DataType::Int32, false), + Field::new("v", DataType::Int32, false), + ]); + let window_expr = Expr::WindowFunction(Box::new(WindowFunction { + fun: WindowFunctionDefinition::WindowUDF(rank_udwf()), + params: WindowFunctionParams { + args: vec![], + partition_by: vec![col("k")], + order_by: vec![col("v").sort(true, true)], + window_frame: WindowFrame::new(None), + null_treatment: None, + distinct: false, + filter: None, + }, + })); + let table = table_scan(Some("test"), &schema, Some(vec![0, 1]))?.build()?; + let plan = LogicalPlanBuilder::window_plan(table, vec![window_expr.clone()])?; + + let name = plan.schema().fields().last().unwrap().name().clone(); + let plan = LogicalPlanBuilder::from(plan) + .filter(col(name.clone()).eq(lit(1i64)))? + .project(vec![col("k"), col("v"), col(name)])? + .build()?; + + let unparser = Unparser::new(&UnparserPostgreSqlDialect {}); + let sql = unparser.plan_to_sql(&plan)?; + assert_snapshot!( + sql, + @r#"SELECT "test"."k", "test"."v", "rank() PARTITION BY [test.k] ORDER BY [test.v ASC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING" FROM (SELECT "test"."k" AS "k", "test"."v" AS "v", rank() OVER (PARTITION BY "test"."k" ORDER BY "test"."v" ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS "rank() PARTITION BY [test.k] ORDER BY [test.v ASC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING" FROM "test") AS "test" WHERE ("rank() PARTITION BY [test.k] ORDER BY [test.v ASC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING" = 1)"# + ); + + let unparser = Unparser::new(&UnparserMySqlDialect {}); + let sql = unparser.plan_to_sql(&plan)?; + assert_snapshot!( + sql, + @r#"SELECT `test`.`k`, `test`.`v`, `rank() PARTITION BY [test.k] ORDER BY [test.v ASC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING` FROM (SELECT `test`.`k` AS `k`, `test`.`v` AS `v`, rank() OVER (PARTITION BY `test`.`k` ORDER BY 
`test`.`v` ASC ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `rank() PARTITION BY [test.k] ORDER BY [test.v ASC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING` FROM `test`) AS `test` WHERE (`rank() PARTITION BY [test.k] ORDER BY [test.v ASC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING` = 1)"# + ); + + let unparser = Unparser::new(&SqliteDialect {}); + let sql = unparser.plan_to_sql(&plan)?; + assert_snapshot!( + sql, + @r#"SELECT `test`.`k`, `test`.`v`, `rank() PARTITION BY [test.k] ORDER BY [test.v ASC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING` FROM (SELECT `test`.`k` AS `k`, `test`.`v` AS `v`, rank() OVER (PARTITION BY `test`.`k` ORDER BY `test`.`v` ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `rank() PARTITION BY [test.k] ORDER BY [test.v ASC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING` FROM `test`) AS `test` WHERE (`rank() PARTITION BY [test.k] ORDER BY [test.v ASC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING` = 1)"# + ); + + let unparser = Unparser::new(&DefaultDialect {}); + let sql = unparser.plan_to_sql(&plan)?; + assert_snapshot!( + sql, + @r#"SELECT test.k, test.v, rank() OVER (PARTITION BY test.k ORDER BY test.v ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) FROM test QUALIFY (rank() OVER (PARTITION BY test.k ORDER BY test.v ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) = 1)"# + ); + + // without table qualifier + let table = table_scan(Some("test"), &schema, Some(vec![0, 1]))?.build()?; + let table = LogicalPlanBuilder::from(table) + .project(vec![col("k").alias("k"), col("v").alias("v")])? 
+ .build()?; + let plan = LogicalPlanBuilder::window_plan(table, vec![window_expr])?; + + let name = plan.schema().fields().last().unwrap().name().clone(); + let plan = LogicalPlanBuilder::from(plan) + .filter(col(name.clone()).eq(lit(1i64)))? + .project(vec![col("k"), col("v"), col(name)])? + .build()?; + + let unparser = Unparser::new(&UnparserPostgreSqlDialect {}); + let sql = unparser.plan_to_sql(&plan)?; + assert_snapshot!( + sql, + @r#"SELECT "k", "v", "rank() PARTITION BY [k] ORDER BY [v ASC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING" FROM (SELECT "k" AS "k", "v" AS "v", rank() OVER (PARTITION BY "k" ORDER BY "v" ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS "rank() PARTITION BY [k] ORDER BY [v ASC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING" FROM (SELECT "test"."k" AS "k", "test"."v" AS "v" FROM "test") AS "derived_projection") AS "__qualify_subquery" WHERE ("rank() PARTITION BY [k] ORDER BY [v ASC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING" = 1)"# + ); + + Ok(()) +} + #[test] fn test_like_filter() { let statement = generate_round_trip_statement( From ec2402aee9bf510d3a98927cb3580850914fcf27 Mon Sep 17 00:00:00 2001 From: Yongting You <2010youy01@gmail.com> Date: Sat, 18 Oct 2025 00:22:21 +0800 Subject: [PATCH 029/109] feat: Support configurable `EXPLAIN ANALYZE` detail level (#18098) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Which issue does this PR close? - Closes #. ## Rationale for this change `EXPLAIN ANALYZE` can be used for profiling and displays the results alongside the EXPLAIN plan. The issue is that it currently shows too many low-level details. It would provide a better user experience if only the most commonly used metrics were shown by default, with more detailed metrics available through specific configuration options. 
### Example In `datafusion-cli`: ``` > CREATE EXTERNAL TABLE IF NOT EXISTS lineitem STORED AS parquet LOCATION '/Users/yongting/Code/datafusion/benchmarks/data/tpch_sf1/lineitem'; 0 row(s) fetched. Elapsed 0.000 seconds. explain analyze select * from lineitem where l_orderkey = 3000000; ``` The parquet reader includes a large number of low-level details: ``` metrics=[output_rows=19813, elapsed_compute=14ns, batches_split=0, bytes_scanned=2147308, file_open_errors=0, file_scan_errors=0, files_ranges_pruned_statistics=18, num_predicate_creation_errors=0, page_index_rows_matched=19813, page_index_rows_pruned=729088, predicate_cache_inner_records=0, predicate_cache_records=0, predicate_evaluation_errors=0, pushdown_rows_matched=0, pushdown_rows_pruned=0, row_groups_matched_bloom_filter=0, row_groups_matched_statistics=1, row_groups_pruned_bloom_filter=0, row_groups_pruned_statistics=0, bloom_filter_eval_time=21.997µs, metadata_load_time=273.83µs, page_index_eval_time=29.915µs, row_pushdown_eval_time=42ns, statistics_eval_time=76.248µs, time_elapsed_opening=4.02146ms, time_elapsed_processing=24.787461ms, time_elapsed_scanning_total=24.17671ms, time_elapsed_scanning_until_data=23.103665ms] ``` I believe only a subset of them is commonly used, for example `output_rows`, `metadata_load_time`, and how many file/row-group/pages are pruned, and it would be better to only display the most common ones by default. ### Existing `VERBOSE` keyword There is an existing verbose keyword in `EXPLAIN ANALYZE VERBOSE`; however, it turns on per-partition metrics instead of controlling the detail level. I think it would be hard to mix this partition control and the detail level introduced in this PR, so they're separated: the following config will be used for detail level and the semantics of `EXPLAIN ANALYZE VERBOSE` remain unchanged. ### This PR: configurable explain analyze level 1. Introduced a new config option `datafusion.explain.analyze_level`.
When set to `dev` (default value), all existing metrics will be shown. If set to `summary`, only `BaselineMetrics` will be displayed (i.e. `output_rows` and `elapsed_compute`). Note now we only include `BaselineMetrics` for simplicity, in the follow-up PRs we can figure out what's the commonly used metrics for each operator, and add them to `summary` analyze level, finally set the `summary` analyze level to default. 2. Add a `MetricType` field associated with `Metric` for detail level or potentially category in the future. For different configurations, a certain `MetricType` set will be shown accordingly. #### Demo ``` -- continuing the above example > set datafusion.explain.analyze_level = summary; 0 row(s) fetched. Elapsed 0.000 seconds. > explain analyze select * from lineitem where l_orderkey| plan_type | plan || Plan with Metrics | CoalesceBatchesExec: target_batch_size=8192, metrics=[output_rows=5, elapsed_compute=25.339µs] | | | FilterExec: l_orderkey@0 = 3000000, metrics=[output_rows=5, elapsed_compute=81.221µs] | | | DataSourceExec: file_groups={14 groups: [[Users/yongting/Code/datafusion/benchmarks/data/tpch_sf1/lineitem/part-0.parquet:0..11525426], [Users/yongting/Code/datafusion/benchmarks/data/tpch_sf1/lineitem/part-0.parquet:11525426..20311205, Users/yongting/Code/datafusion/benchmarks/data/tpch_sf1/lineitem/part-1.parquet:0..2739647], [Users/yongting/Code/datafusion/benchmarks/data/tpch_sf1/lineitem/part-1.parquet:2739647..14265073], [Users/yongting/Code/datafusion/benchmarks/data/tpch_sf1/lineitem/part-1.parquet:14265073..20193593, Users/yongting/Code/datafusion/benchmarks/data/tpch_sf1/lineitem/part-2.parquet:0..5596906], [Users/yongting/Code/datafusion/benchmarks/data/tpch_sf1/lineitem/part-2.parquet:5596906..17122332], ...]}, projection=[l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment], 
file_type=parquet, predicate=l_orderkey@0 = 3000000, pruning_predicate=l_orderkey_null_count@2 != row_count@3 AND l_orderkey_min@0 <= 3000000 AND 3000000 <= l_orderkey_max@1, required_guarantees=[l_orderkey in (3000000)], metrics=[output_rows=19813, elapsed_compute=14ns] | | | |row(s) fetched. Elapsed 0.025 seconds. ``` Only `BaselineMetrics` are shown. ## What changes are included in this PR? ## Are these changes tested? UT ## Are there any user-facing changes? No --------- Co-authored-by: Andrew Lamb --- datafusion/common/src/config.rs | 7 ++- datafusion/common/src/format.rs | 45 ++++++++++++++++ .../src/datasource/physical_plan/parquet.rs | 5 +- datafusion/core/src/physical_planner.rs | 8 +++ datafusion/core/tests/sql/explain_analyze.rs | 35 ++++++++++++ datafusion/physical-plan/src/analyze.rs | 19 ++++++- datafusion/physical-plan/src/display.rs | 33 ++++++++++++ .../physical-plan/src/metrics/baseline.rs | 12 +++-- .../physical-plan/src/metrics/builder.rs | 21 +++++++- datafusion/physical-plan/src/metrics/mod.rs | 54 ++++++++++++++++++- datafusion/proto/src/physical_plan/mod.rs | 2 + .../tests/cases/roundtrip_physical_plan.rs | 2 + .../test_files/information_schema.slt | 2 + .../test_files/spark/aggregate/avg.slt | 2 +- .../test_files/spark/string/concat.slt | 2 +- .../test_files/spark/string/format_string.slt | 8 +-- docs/source/user-guide/configs.md | 1 + docs/source/user-guide/sql/explain.md | 11 ++-- 18 files changed, 251 insertions(+), 18 deletions(-) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 126935a1de90b..52e35985698f0 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -22,7 +22,7 @@ use arrow_ipc::CompressionType; #[cfg(feature = "parquet_encryption")] use crate::encryption::{FileDecryptionProperties, FileEncryptionProperties}; use crate::error::_config_err; -use crate::format::ExplainFormat; +use crate::format::{ExplainAnalyzeLevel, ExplainFormat}; use 
crate::parsers::CompressionTypeVariant; use crate::utils::get_available_parallelism; use crate::{DataFusionError, Result}; @@ -991,6 +991,11 @@ config_namespace! { /// (format=tree only) Maximum total width of the rendered tree. /// When set to 0, the tree will have no width limit. pub tree_maximum_render_width: usize, default = 240 + + /// Verbosity level for "EXPLAIN ANALYZE". Default is "dev" + /// "summary" shows common metrics for high-level insights. + /// "dev" provides deep operator-level introspection for developers. + pub analyze_level: ExplainAnalyzeLevel, default = ExplainAnalyzeLevel::Dev } } diff --git a/datafusion/common/src/format.rs b/datafusion/common/src/format.rs index 06ec519ef356c..764190e1189bf 100644 --- a/datafusion/common/src/format.rs +++ b/datafusion/common/src/format.rs @@ -205,3 +205,48 @@ impl ConfigField for ExplainFormat { Ok(()) } } + +/// Verbosity levels controlling how `EXPLAIN ANALYZE` renders metrics +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum ExplainAnalyzeLevel { + /// Show a compact view containing high-level metrics + Summary, + /// Show a developer-focused view with per-operator details + Dev, + // When adding new enum, update the error message in `from_str()` accordingly. +} + +impl FromStr for ExplainAnalyzeLevel { + type Err = DataFusionError; + + fn from_str(level: &str) -> Result { + match level.to_lowercase().as_str() { + "summary" => Ok(ExplainAnalyzeLevel::Summary), + "dev" => Ok(ExplainAnalyzeLevel::Dev), + other => Err(DataFusionError::Configuration(format!( + "Invalid explain analyze level. Expected 'summary' or 'dev'. 
Got '{other}'" + ))), + } + } +} + +impl Display for ExplainAnalyzeLevel { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let s = match self { + ExplainAnalyzeLevel::Summary => "summary", + ExplainAnalyzeLevel::Dev => "dev", + }; + write!(f, "{s}") + } +} + +impl ConfigField for ExplainAnalyzeLevel { + fn visit(&self, v: &mut V, key: &str, description: &'static str) { + v.some(key, self, description) + } + + fn set(&mut self, _: &str, value: &str) -> Result<()> { + *self = ExplainAnalyzeLevel::from_str(value)?; + Ok(()) + } +} diff --git a/datafusion/core/src/datasource/physical_plan/parquet.rs b/datafusion/core/src/datasource/physical_plan/parquet.rs index d0774e57174ee..10a475c1cc9a6 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet.rs @@ -64,7 +64,9 @@ mod tests { use datafusion_physical_expr::planner::logical2physical; use datafusion_physical_plan::analyze::AnalyzeExec; use datafusion_physical_plan::collect; - use datafusion_physical_plan::metrics::{ExecutionPlanMetricsSet, MetricsSet}; + use datafusion_physical_plan::metrics::{ + ExecutionPlanMetricsSet, MetricType, MetricsSet, + }; use datafusion_physical_plan::{ExecutionPlan, ExecutionPlanProperties}; use chrono::{TimeZone, Utc}; @@ -238,6 +240,7 @@ mod tests { let analyze_exec = Arc::new(AnalyzeExec::new( false, false, + vec![MetricType::SUMMARY, MetricType::DEV], // use a new ParquetSource to avoid sharing execution metrics self.build_parquet_exec( Arc::clone(table_schema), diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index c28e56790e660..0fa17deea1295 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -62,6 +62,7 @@ use arrow::compute::SortOptions; use arrow::datatypes::Schema; use datafusion_catalog::ScanArgs; use datafusion_common::display::ToStringifiedPlan; +use datafusion_common::format::ExplainAnalyzeLevel; 
use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion, TreeNodeVisitor}; use datafusion_common::TableReference; use datafusion_common::{ @@ -90,6 +91,7 @@ use datafusion_physical_expr::{ use datafusion_physical_optimizer::PhysicalOptimizerRule; use datafusion_physical_plan::empty::EmptyExec; use datafusion_physical_plan::execution_plan::InvariantLevel; +use datafusion_physical_plan::metrics::MetricType; use datafusion_physical_plan::placeholder_row::PlaceholderRowExec; use datafusion_physical_plan::recursive_query::RecursiveQueryExec; use datafusion_physical_plan::unnest::ListUnnest; @@ -2073,9 +2075,15 @@ impl DefaultPhysicalPlanner { let input = self.create_physical_plan(&a.input, session_state).await?; let schema = Arc::clone(a.schema.inner()); let show_statistics = session_state.config_options().explain.show_statistics; + let analyze_level = session_state.config_options().explain.analyze_level; + let metric_types = match analyze_level { + ExplainAnalyzeLevel::Summary => vec![MetricType::SUMMARY], + ExplainAnalyzeLevel::Dev => vec![MetricType::SUMMARY, MetricType::DEV], + }; Ok(Arc::new(AnalyzeExec::new( a.verbose, show_statistics, + metric_types, input, schema, ))) diff --git a/datafusion/core/tests/sql/explain_analyze.rs b/datafusion/core/tests/sql/explain_analyze.rs index e082cabaadaff..54a57ed901162 100644 --- a/datafusion/core/tests/sql/explain_analyze.rs +++ b/datafusion/core/tests/sql/explain_analyze.rs @@ -22,6 +22,7 @@ use rstest::rstest; use datafusion::config::ConfigOptions; use datafusion::physical_plan::display::DisplayableExecutionPlan; use datafusion::physical_plan::metrics::Timestamp; +use datafusion_common::format::ExplainAnalyzeLevel; use object_store::path::Path; #[tokio::test] @@ -158,6 +159,40 @@ async fn explain_analyze_baseline_metrics() { fn nanos_from_timestamp(ts: &Timestamp) -> i64 { ts.value().unwrap().timestamp_nanos_opt().unwrap() } + +// Test different detail level for config `datafusion.explain.analyze_level` 
+#[tokio::test] +async fn explain_analyze_level() { + async fn collect_plan(level: ExplainAnalyzeLevel) -> String { + let mut config = SessionConfig::new(); + config.options_mut().explain.analyze_level = level; + let ctx = SessionContext::new_with_config(config); + let sql = "EXPLAIN ANALYZE \ + SELECT * \ + FROM generate_series(10) as t1(v1) \ + ORDER BY v1 DESC"; + let dataframe = ctx.sql(sql).await.unwrap(); + let batches = dataframe.collect().await.unwrap(); + arrow::util::pretty::pretty_format_batches(&batches) + .unwrap() + .to_string() + } + + for (level, needle, should_contain) in [ + (ExplainAnalyzeLevel::Summary, "spill_count", false), + (ExplainAnalyzeLevel::Summary, "output_rows", true), + (ExplainAnalyzeLevel::Dev, "spill_count", true), + (ExplainAnalyzeLevel::Dev, "output_rows", true), + ] { + let plan = collect_plan(level).await; + assert_eq!( + plan.contains(needle), + should_contain, + "plan for level {level:?} unexpected content: {plan}" + ); + } +} + #[tokio::test] async fn csv_explain_plans() { // This test verify the look of each plan in its full cycle plan creation diff --git a/datafusion/physical-plan/src/analyze.rs b/datafusion/physical-plan/src/analyze.rs index c095afe5e716e..c696cf5aa5e60 100644 --- a/datafusion/physical-plan/src/analyze.rs +++ b/datafusion/physical-plan/src/analyze.rs @@ -26,6 +26,7 @@ use super::{ SendableRecordBatchStream, }; use crate::display::DisplayableExecutionPlan; +use crate::metrics::MetricType; use crate::{DisplayFormatType, ExecutionPlan, Partitioning}; use arrow::{array::StringBuilder, datatypes::SchemaRef, record_batch::RecordBatch}; @@ -44,6 +45,8 @@ pub struct AnalyzeExec { verbose: bool, /// If statistics should be displayed show_statistics: bool, + /// Which metric categories should be displayed + metric_types: Vec, /// The input plan (the plan being analyzed) pub(crate) input: Arc, /// The output schema for RecordBatches of this exec node @@ -56,6 +59,7 @@ impl AnalyzeExec { pub fn new( verbose: bool, 
show_statistics: bool, + metric_types: Vec, input: Arc, schema: SchemaRef, ) -> Self { @@ -63,6 +67,7 @@ impl AnalyzeExec { AnalyzeExec { verbose, show_statistics, + metric_types, input, schema, cache, @@ -145,6 +150,7 @@ impl ExecutionPlan for AnalyzeExec { Ok(Arc::new(Self::new( self.verbose, self.show_statistics, + self.metric_types.clone(), children.pop().unwrap(), Arc::clone(&self.schema), ))) @@ -182,6 +188,7 @@ impl ExecutionPlan for AnalyzeExec { let captured_schema = Arc::clone(&self.schema); let verbose = self.verbose; let show_statistics = self.show_statistics; + let metric_types = self.metric_types.clone(); // future that gathers the results from all the tasks in the // JoinSet that computes the overall row count and final @@ -201,6 +208,7 @@ impl ExecutionPlan for AnalyzeExec { duration, captured_input, captured_schema, + &metric_types, ) }; @@ -219,6 +227,7 @@ fn create_output_batch( duration: std::time::Duration, input: Arc, schema: SchemaRef, + metric_types: &[MetricType], ) -> Result { let mut type_builder = StringBuilder::with_capacity(1, 1024); let mut plan_builder = StringBuilder::with_capacity(1, 1024); @@ -227,6 +236,7 @@ fn create_output_batch( type_builder.append_value("Plan with Metrics"); let annotated_plan = DisplayableExecutionPlan::with_metrics(input.as_ref()) + .set_metric_types(metric_types.to_vec()) .set_show_statistics(show_statistics) .indent(verbose) .to_string(); @@ -238,6 +248,7 @@ fn create_output_batch( type_builder.append_value("Plan with Full Metrics"); let annotated_plan = DisplayableExecutionPlan::with_full_metrics(input.as_ref()) + .set_metric_types(metric_types.to_vec()) .set_show_statistics(show_statistics) .indent(verbose) .to_string(); @@ -282,7 +293,13 @@ mod tests { let blocking_exec = Arc::new(BlockingExec::new(Arc::clone(&schema), 1)); let refs = blocking_exec.refs(); - let analyze_exec = Arc::new(AnalyzeExec::new(true, false, blocking_exec, schema)); + let analyze_exec = Arc::new(AnalyzeExec::new( + true, + 
false, + vec![MetricType::SUMMARY, MetricType::DEV], + blocking_exec, + schema, + )); let fut = collect(analyze_exec, task_ctx); let mut fut = fut.boxed(); diff --git a/datafusion/physical-plan/src/display.rs b/datafusion/physical-plan/src/display.rs index 2420edfc743da..35ca0b65ae294 100644 --- a/datafusion/physical-plan/src/display.rs +++ b/datafusion/physical-plan/src/display.rs @@ -28,6 +28,7 @@ use datafusion_common::display::{GraphvizBuilder, PlanType, StringifiedPlan}; use datafusion_expr::display_schema; use datafusion_physical_expr::LexOrdering; +use crate::metrics::MetricType; use crate::render_tree::RenderTree; use super::{accept, ExecutionPlan, ExecutionPlanVisitor}; @@ -120,11 +121,17 @@ pub struct DisplayableExecutionPlan<'a> { show_statistics: bool, /// If schema should be displayed. See [`Self::set_show_schema`] show_schema: bool, + /// Which metric categories should be included when rendering + metric_types: Vec, // (TreeRender) Maximum total width of the rendered tree tree_maximum_render_width: usize, } impl<'a> DisplayableExecutionPlan<'a> { + fn default_metric_types() -> Vec { + vec![MetricType::SUMMARY, MetricType::DEV] + } + /// Create a wrapper around an [`ExecutionPlan`] which can be /// pretty printed in a variety of ways pub fn new(inner: &'a dyn ExecutionPlan) -> Self { @@ -133,6 +140,7 @@ impl<'a> DisplayableExecutionPlan<'a> { show_metrics: ShowMetrics::None, show_statistics: false, show_schema: false, + metric_types: Self::default_metric_types(), tree_maximum_render_width: 240, } } @@ -146,6 +154,7 @@ impl<'a> DisplayableExecutionPlan<'a> { show_metrics: ShowMetrics::Aggregated, show_statistics: false, show_schema: false, + metric_types: Self::default_metric_types(), tree_maximum_render_width: 240, } } @@ -159,6 +168,7 @@ impl<'a> DisplayableExecutionPlan<'a> { show_metrics: ShowMetrics::Full, show_statistics: false, show_schema: false, + metric_types: Self::default_metric_types(), tree_maximum_render_width: 240, } } @@ -178,6 +188,12 
@@ impl<'a> DisplayableExecutionPlan<'a> { self } + /// Specify which metric types should be rendered alongside the plan + pub fn set_metric_types(mut self, metric_types: Vec) -> Self { + self.metric_types = metric_types; + self + } + /// Set the maximum render width for the tree format pub fn set_tree_maximum_render_width(mut self, width: usize) -> Self { self.tree_maximum_render_width = width; @@ -206,6 +222,7 @@ impl<'a> DisplayableExecutionPlan<'a> { show_metrics: ShowMetrics, show_statistics: bool, show_schema: bool, + metric_types: Vec, } impl fmt::Display for Wrapper<'_> { fn fmt(&self, f: &mut Formatter) -> fmt::Result { @@ -216,6 +233,7 @@ impl<'a> DisplayableExecutionPlan<'a> { show_metrics: self.show_metrics, show_statistics: self.show_statistics, show_schema: self.show_schema, + metric_types: &self.metric_types, }; accept(self.plan, &mut visitor) } @@ -226,6 +244,7 @@ impl<'a> DisplayableExecutionPlan<'a> { show_metrics: self.show_metrics, show_statistics: self.show_statistics, show_schema: self.show_schema, + metric_types: self.metric_types.clone(), } } @@ -245,6 +264,7 @@ impl<'a> DisplayableExecutionPlan<'a> { plan: &'a dyn ExecutionPlan, show_metrics: ShowMetrics, show_statistics: bool, + metric_types: Vec, } impl fmt::Display for Wrapper<'_> { fn fmt(&self, f: &mut Formatter) -> fmt::Result { @@ -255,6 +275,7 @@ impl<'a> DisplayableExecutionPlan<'a> { t, show_metrics: self.show_metrics, show_statistics: self.show_statistics, + metric_types: &self.metric_types, graphviz_builder: GraphvizBuilder::default(), parents: Vec::new(), }; @@ -272,6 +293,7 @@ impl<'a> DisplayableExecutionPlan<'a> { plan: self.inner, show_metrics: self.show_metrics, show_statistics: self.show_statistics, + metric_types: self.metric_types.clone(), } } @@ -306,6 +328,7 @@ impl<'a> DisplayableExecutionPlan<'a> { show_metrics: ShowMetrics, show_statistics: bool, show_schema: bool, + metric_types: Vec, } impl fmt::Display for Wrapper<'_> { @@ -317,6 +340,7 @@ impl<'a> 
DisplayableExecutionPlan<'a> { show_metrics: self.show_metrics, show_statistics: self.show_statistics, show_schema: self.show_schema, + metric_types: &self.metric_types, }; visitor.pre_visit(self.plan)?; Ok(()) @@ -328,6 +352,7 @@ impl<'a> DisplayableExecutionPlan<'a> { show_metrics: self.show_metrics, show_statistics: self.show_statistics, show_schema: self.show_schema, + metric_types: self.metric_types.clone(), } } @@ -382,6 +407,8 @@ struct IndentVisitor<'a, 'b> { show_statistics: bool, /// If schema should be displayed show_schema: bool, + /// Which metric types should be rendered + metric_types: &'a [MetricType], } impl ExecutionPlanVisitor for IndentVisitor<'_, '_> { @@ -394,6 +421,7 @@ impl ExecutionPlanVisitor for IndentVisitor<'_, '_> { ShowMetrics::Aggregated => { if let Some(metrics) = plan.metrics() { let metrics = metrics + .filter_by_metric_types(self.metric_types) .aggregate_by_name() .sorted_for_display() .timestamps_removed(); @@ -405,6 +433,7 @@ impl ExecutionPlanVisitor for IndentVisitor<'_, '_> { } ShowMetrics::Full => { if let Some(metrics) = plan.metrics() { + let metrics = metrics.filter_by_metric_types(self.metric_types); write!(self.f, ", metrics=[{metrics}]")?; } else { write!(self.f, ", metrics=[]")?; @@ -441,6 +470,8 @@ struct GraphvizVisitor<'a, 'b> { show_metrics: ShowMetrics, /// If statistics should be displayed show_statistics: bool, + /// Which metric types should be rendered + metric_types: &'a [MetricType], graphviz_builder: GraphvizBuilder, /// Used to record parent node ids when visiting a plan. 
@@ -478,6 +509,7 @@ impl ExecutionPlanVisitor for GraphvizVisitor<'_, '_> { ShowMetrics::Aggregated => { if let Some(metrics) = plan.metrics() { let metrics = metrics + .filter_by_metric_types(self.metric_types) .aggregate_by_name() .sorted_for_display() .timestamps_removed(); @@ -489,6 +521,7 @@ impl ExecutionPlanVisitor for GraphvizVisitor<'_, '_> { } ShowMetrics::Full => { if let Some(metrics) = plan.metrics() { + let metrics = metrics.filter_by_metric_types(self.metric_types); format!("metrics=[{metrics}]") } else { "metrics=[]".to_string() diff --git a/datafusion/physical-plan/src/metrics/baseline.rs b/datafusion/physical-plan/src/metrics/baseline.rs index 15efb8f90aa20..45cef58b5dd8c 100644 --- a/datafusion/physical-plan/src/metrics/baseline.rs +++ b/datafusion/physical-plan/src/metrics/baseline.rs @@ -62,9 +62,15 @@ impl BaselineMetrics { start_time.record(); Self { - end_time: MetricBuilder::new(metrics).end_timestamp(partition), - elapsed_compute: MetricBuilder::new(metrics).elapsed_compute(partition), - output_rows: MetricBuilder::new(metrics).output_rows(partition), + end_time: MetricBuilder::new(metrics) + .with_type(super::MetricType::SUMMARY) + .end_timestamp(partition), + elapsed_compute: MetricBuilder::new(metrics) + .with_type(super::MetricType::SUMMARY) + .elapsed_compute(partition), + output_rows: MetricBuilder::new(metrics) + .with_type(super::MetricType::SUMMARY) + .output_rows(partition), } } diff --git a/datafusion/physical-plan/src/metrics/builder.rs b/datafusion/physical-plan/src/metrics/builder.rs index dbda0a310ce52..74ba5a2a18343 100644 --- a/datafusion/physical-plan/src/metrics/builder.rs +++ b/datafusion/physical-plan/src/metrics/builder.rs @@ -19,6 +19,8 @@ use std::{borrow::Cow, sync::Arc}; +use crate::metrics::MetricType; + use super::{ Count, ExecutionPlanMetricsSet, Gauge, Label, Metric, MetricValue, Time, Timestamp, }; @@ -52,15 +54,23 @@ pub struct MetricBuilder<'a> { /// arbitrary name=value pairs identifying this metric 
labels: Vec