From c5fb60535ea2c7f5e31f39b4ab769db2eaf762e2 Mon Sep 17 00:00:00 2001 From: Michael Kleen Date: Tue, 4 Nov 2025 19:13:25 +0100 Subject: [PATCH 0001/1589] Move generate_series projection logic into LazyMemoryStream (#18373) ## Which issue does this PR close? - None, This is a follow-up for https://github.com/apache/datafusion/pull/18298 ## Rationale for this change This moves the projection logic from `generate_series` out of the generator into `LazyMemoryStream` as discussed in https://github.com/apache/datafusion/pull/18298#discussion_r2465670378 This makes the projection logic generic for all generators. ## What changes are included in this PR? The projection logic is moved from `generate_series` into the `LazyMemoryStream` and relevant tests, where `LazyMemoryStream` is used, are adapted accordingly. ## Are these changes tested? This is only a small refactoring; the changes are covered by the tests from https://github.com/apache/datafusion/pull/18298 ## Are there any user-facing changes? There is a new parameter added to LazyMemoryExec::try_new method --- .../functions-table/src/generate_series.rs | 23 ++++---------- datafusion/physical-plan/src/memory.rs | 30 ++++++++++++++++++- datafusion/proto/src/physical_plan/mod.rs | 3 +- 3 files changed, 36 insertions(+), 20 deletions(-) diff --git a/datafusion/functions-table/src/generate_series.rs b/datafusion/functions-table/src/generate_series.rs index c66e652147eb8..d71c5945aafcc 100644 --- a/datafusion/functions-table/src/generate_series.rs +++ b/datafusion/functions-table/src/generate_series.rs @@ -237,7 +237,6 @@ impl GenerateSeriesTable { pub fn as_generator( &self, batch_size: usize, - projection: Option>, ) -> Result>> { let generator: Arc> = match &self.args { GenSeriesArgs::ContainsNull { name } => Arc::new(RwLock::new(Empty { name })), @@ -256,7 +255,6 @@ impl GenerateSeriesTable { batch_size, include_end: *include_end, name, - projection, })), GenSeriesArgs::TimestampArgs { start, @@ -297,7 +295,6 @@ impl GenerateSeriesTable { batch_size, include_end: *include_end, name, - projection, })) } GenSeriesArgs::DateArgs { @@ -327,7 +324,6 @@ impl GenerateSeriesTable { batch_size, include_end: *include_end, name, - projection, })), }; @@ -345,7 +341,6 @@ pub struct GenericSeriesState { current: T, include_end: bool, name: &'static str, - projection: Option>, } impl GenericSeriesState { @@ -401,11 +396,7 @@ impl LazyBatchGenerator for GenericSeriesState { let array = self.current.create_array(buf)?; let batch = RecordBatch::try_new(Arc::clone(&self.schema), vec![array])?; - let projected = match self.projection.as_ref() { - Some(projection) => batch.project(projection)?, - None => batch, - }; - Ok(Some(projected)) + Ok(Some(batch)) } } @@ -481,14 +472,12 @@ impl TableProvider for GenerateSeriesTable { _limit: Option, ) -> Result> { let batch_size = state.config_options().execution.batch_size; - let schema = match projection { - Some(projection) => Arc::new(self.schema.project(projection)?), - None => self.schema(), - }; - - let generator = self.as_generator(batch_size, projection.cloned())?; + let generator = self.as_generator(batch_size)?; - Ok(Arc::new(LazyMemoryExec::try_new(schema, vec![generator])?)) + Ok(Arc::new( + LazyMemoryExec::try_new(self.schema(), vec![generator])? 
+ .with_projection(projection.cloned()), + )) } } diff --git a/datafusion/physical-plan/src/memory.rs b/datafusion/physical-plan/src/memory.rs index 1bf1e04efb53b..09710ae1e2edb 100644 --- a/datafusion/physical-plan/src/memory.rs +++ b/datafusion/physical-plan/src/memory.rs @@ -153,6 +153,8 @@ pub trait LazyBatchGenerator: Send + Sync + fmt::Debug + fmt::Display { pub struct LazyMemoryExec { /// Schema representing the data schema: SchemaRef, + /// Optional projection for which columns to load + projection: Option>, /// Functions to generate batches for each partition batch_generators: Vec>>, /// Plan properties cache storing equivalence properties, partitioning, and execution mode @@ -199,12 +201,28 @@ impl LazyMemoryExec { Ok(Self { schema, + projection: None, batch_generators: generators, cache, metrics: ExecutionPlanMetricsSet::new(), }) } + pub fn with_projection(mut self, projection: Option>) -> Self { + match projection.as_ref() { + Some(columns) => { + let projected = Arc::new(self.schema.project(columns).unwrap()); + self.cache = self.cache.with_eq_properties(EquivalenceProperties::new( + Arc::clone(&projected), + )); + self.schema = projected; + self.projection = projection; + self + } + _ => self, + } + } + pub fn try_set_partitioning(&mut self, partitioning: Partitioning) -> Result<()> { if partitioning.partition_count() != self.batch_generators.len() { internal_err!( @@ -320,6 +338,7 @@ impl ExecutionPlan for LazyMemoryExec { let stream = LazyMemoryStream { schema: Arc::clone(&self.schema), + projection: self.projection.clone(), generator: Arc::clone(&self.batch_generators[partition]), baseline_metrics, }; @@ -338,6 +357,8 @@ impl ExecutionPlan for LazyMemoryExec { /// Stream that generates record batches on demand pub struct LazyMemoryStream { schema: SchemaRef, + /// Optional projection for which columns to load + projection: Option>, /// Generator to produce batches /// /// Note: Idiomatically, DataFusion uses plan-time parallelism - each stream @@ -361,7 +382,14 @@ impl Stream for LazyMemoryStream { let batch = self.generator.write().generate_next_batch(); let poll = match batch { - Ok(Some(batch)) => Poll::Ready(Some(Ok(batch))), + Ok(Some(batch)) => { + // return just the columns requested + let batch = match self.projection.as_ref() { + Some(columns) => batch.project(columns)?, + None => batch, + }; + Poll::Ready(Some(Ok(batch))) + } Ok(None) => Poll::Ready(None), Err(e) => Poll::Ready(Some(Err(e))), }; diff --git a/datafusion/proto/src/physical_plan/mod.rs b/datafusion/proto/src/physical_plan/mod.rs index 0ebbb373f2d10..e5f4a1f7d0267 100644 --- a/datafusion/proto/src/physical_plan/mod.rs +++ b/datafusion/proto/src/physical_plan/mod.rs @@ -1940,8 +1940,7 @@ impl protobuf::PhysicalPlanNode { }; let table = GenerateSeriesTable::new(Arc::clone(&schema), args); - let generator = - table.as_generator(generate_series.target_batch_size as usize, None)?; + let generator = table.as_generator(generate_series.target_batch_size as usize)?; Ok(Arc::new(LazyMemoryExec::try_new(schema, vec![generator])?)) } From 2d5c10194364b4a55a1367b03b56d5313c62e0c8 Mon Sep 17 00:00:00 2001 From: Sergey Zhukov <62326549+cj-zhukov@users.noreply.github.com> Date: Tue, 4 Nov 2025 22:18:33 +0300 Subject: [PATCH 0002/1589] Consolidate flight examples (#18142) (#18442) ## Which issue does this PR close? - part of #https://github.com/apache/datafusion/issues/18142. 
## Rationale for this change As discussed in https://github.com/apache/datafusion/pull/18289 this PR is for consolidating all the `flight` examples into a single example binary. Then we can make sure we are agreed on the pattern and then we can apply it to the remaining examples ## What changes are included in this PR? ## Are these changes tested? ## Are there any user-facing changes? --------- Co-authored-by: Sergey Zhukov Co-authored-by: Andrew Lamb --- datafusion-examples/Cargo.toml | 12 --- datafusion-examples/README.md | 4 +- .../flight/{flight_client.rs => client.rs} | 3 +- datafusion-examples/examples/flight/main.rs | 94 +++++++++++++++++++ .../flight/{flight_server.rs => server.rs} | 3 +- .../{flight_sql_server.rs => sql_server.rs} | 3 +- 6 files changed, 99 insertions(+), 20 deletions(-) rename datafusion-examples/examples/flight/{flight_client.rs => client.rs} (97%) create mode 100644 datafusion-examples/examples/flight/main.rs rename datafusion-examples/examples/flight/{flight_server.rs => server.rs} (99%) rename datafusion-examples/examples/flight/{flight_sql_server.rs => sql_server.rs} (99%) diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml index bb0525e57753b..0ec410ecc6b29 100644 --- a/datafusion-examples/Cargo.toml +++ b/datafusion-examples/Cargo.toml @@ -32,18 +32,6 @@ rust-version = { workspace = true } [lints] workspace = true -[[example]] -name = "flight_sql_server" -path = "examples/flight/flight_sql_server.rs" - -[[example]] -name = "flight_server" -path = "examples/flight/flight_server.rs" - -[[example]] -name = "flight_client" -path = "examples/flight/flight_client.rs" - [[example]] name = "dataframe_to_s3" path = "examples/external_dependency/dataframe-to-s3.rs" diff --git a/datafusion-examples/README.md b/datafusion-examples/README.md index f1bcbcce82004..f6783a643f76e 100644 --- a/datafusion-examples/README.md +++ b/datafusion-examples/README.md @@ -65,7 +65,7 @@ cargo run --example dataframe - [`deserialize_to_struct.rs`](examples/deserialize_to_struct.rs): Convert query results (Arrow ArrayRefs) into Rust structs - [`expr_api.rs`](examples/expr_api.rs): Create, execute, simplify, analyze and coerce `Expr`s - [`file_stream_provider.rs`](examples/file_stream_provider.rs): Run a query on `FileStreamProvider` which implements `StreamProvider` for reading and writing to arbitrary stream sources / sinks. -- [`flight_sql_server.rs`](examples/flight/flight_sql_server.rs): Run DataFusion as a standalone process and execute SQL queries from JDBC clients +- [`flight/sql_server.rs`](examples/flight/sql_server.rs): Run DataFusion as a standalone process and execute SQL queries from Flight and and FlightSQL (e.g. JDBC) clients - [`function_factory.rs`](examples/function_factory.rs): Register `CREATE FUNCTION` handler to implement SQL macros - [`memory_pool_tracking.rs`](examples/memory_pool_tracking.rs): Demonstrates TrackConsumersPool for memory tracking and debugging with enhanced error messages - [`memory_pool_execution_plan.rs`](examples/memory_pool_execution_plan.rs): Shows how to implement memory-aware ExecutionPlan with memory reservation and spilling @@ -94,4 +94,4 @@ cargo run --example dataframe ## Distributed -- [`flight_client.rs`](examples/flight/flight_client.rs) and [`flight_server.rs`](examples/flight/flight_server.rs): Run DataFusion as a standalone process and execute SQL queries from a client using the Flight protocol. 
+- [`examples/flight/client.rs`](examples/flight/client.rs) and [`examples/flight/server.rs`](examples/flight/server.rs): Run DataFusion as a standalone process and execute SQL queries from a client using the Arrow Flight protocol. diff --git a/datafusion-examples/examples/flight/flight_client.rs b/datafusion-examples/examples/flight/client.rs similarity index 97% rename from datafusion-examples/examples/flight/flight_client.rs rename to datafusion-examples/examples/flight/client.rs index ff4b5903ad884..031beea47d57a 100644 --- a/datafusion-examples/examples/flight/flight_client.rs +++ b/datafusion-examples/examples/flight/client.rs @@ -30,8 +30,7 @@ use datafusion::arrow::util::pretty; /// This example shows how to wrap DataFusion with `FlightService` to support looking up schema information for /// Parquet files and executing SQL queries against them on a remote server. /// This example is run along-side the example `flight_server`. -#[tokio::main] -async fn main() -> Result<(), Box> { +pub async fn client() -> Result<(), Box> { let testdata = datafusion::test_util::parquet_test_data(); // Create Flight client diff --git a/datafusion-examples/examples/flight/main.rs b/datafusion-examples/examples/flight/main.rs new file mode 100644 index 0000000000000..a448789b353b9 --- /dev/null +++ b/datafusion-examples/examples/flight/main.rs @@ -0,0 +1,94 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! # Arrow Flight Examples +//! +//! These examples demonstrate Arrow Flight usage. +//! +//! Each subcommand runs a corresponding example: +//! - `client` — run DataFusion as a standalone process and execute SQL queries from a client using the Flight protocol +//! - `server` — run DataFusion as a standalone process and execute SQL queries from a client using the Flight protocol +//! 
- `sql_server` — run DataFusion as a standalone process and execute SQL queries from JDBC clients + +mod client; +mod server; +mod sql_server; + +use std::str::FromStr; + +use datafusion::error::{DataFusionError, Result}; + +enum ExampleKind { + Client, + Server, + SqlServer, +} + +impl AsRef for ExampleKind { + fn as_ref(&self) -> &str { + match self { + Self::Client => "client", + Self::Server => "server", + Self::SqlServer => "sql_server", + } + } +} + +impl FromStr for ExampleKind { + type Err = DataFusionError; + + fn from_str(s: &str) -> Result { + match s { + "client" => Ok(Self::Client), + "server" => Ok(Self::Server), + "sql_server" => Ok(Self::SqlServer), + _ => Err(DataFusionError::Execution(format!("Unknown example: {s}"))), + } + } +} + +impl ExampleKind { + const ALL: [Self; 3] = [Self::Client, Self::Server, Self::SqlServer]; + + const EXAMPLE_NAME: &str = "flight"; + + fn variants() -> Vec<&'static str> { + Self::ALL.iter().map(|x| x.as_ref()).collect() + } +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + let usage = format!( + "Usage: cargo run --example {} -- [{}]", + ExampleKind::EXAMPLE_NAME, + ExampleKind::variants().join("|") + ); + + let arg = std::env::args().nth(1).ok_or_else(|| { + eprintln!("{usage}"); + DataFusionError::Execution("Missing argument".to_string()) + })?; + + match arg.parse::()? { + ExampleKind::Client => client::client().await?, + ExampleKind::Server => server::server().await?, + ExampleKind::SqlServer => sql_server::sql_server().await?, + } + + Ok(()) +} diff --git a/datafusion-examples/examples/flight/flight_server.rs b/datafusion-examples/examples/flight/server.rs similarity index 99% rename from datafusion-examples/examples/flight/flight_server.rs rename to datafusion-examples/examples/flight/server.rs index 22265e415fbdb..dc75287cf2e2b 100644 --- a/datafusion-examples/examples/flight/flight_server.rs +++ b/datafusion-examples/examples/flight/server.rs @@ -194,8 +194,7 @@ fn to_tonic_err(e: datafusion::error::DataFusionError) -> Status { /// This example shows how to wrap DataFusion with `FlightService` to support looking up schema information for /// Parquet files and executing SQL queries against them on a remote server. /// This example is run along-side the example `flight_client`. -#[tokio::main] -async fn main() -> Result<(), Box> { +pub async fn server() -> Result<(), Box> { let addr = "0.0.0.0:50051".parse()?; let service = FlightServiceImpl {}; diff --git a/datafusion-examples/examples/flight/flight_sql_server.rs b/datafusion-examples/examples/flight/sql_server.rs similarity index 99% rename from datafusion-examples/examples/flight/flight_sql_server.rs rename to datafusion-examples/examples/flight/sql_server.rs index c35debec7d712..fc7d0817bd5fa 100644 --- a/datafusion-examples/examples/flight/flight_sql_server.rs +++ b/datafusion-examples/examples/flight/sql_server.rs @@ -69,8 +69,7 @@ macro_rules! 
status { /// Based heavily on Ballista's implementation: https://github.com/apache/datafusion-ballista/blob/main/ballista/scheduler/src/flight_sql.rs /// and the example in arrow-rs: https://github.com/apache/arrow-rs/blob/master/arrow-flight/examples/flight_sql_server.rs /// -#[tokio::main] -async fn main() -> Result<(), Box> { +pub async fn sql_server() -> Result<(), Box> { env_logger::init(); let addr = "0.0.0.0:50051".parse()?; let service = FlightSqlServiceImpl { From 1ed6e5138f6e18f2325739af7f03af6fc9611e53 Mon Sep 17 00:00:00 2001 From: bubulalabu Date: Wed, 5 Nov 2025 02:46:09 +0100 Subject: [PATCH 0003/1589] feat: support named arguments for aggregate and window udfs (#18389) ## Which issue does this PR close? Addresses portions of https://github.com/apache/datafusion/issues/17379. ## Rationale for this change Add support for aggregate and window UDFs in the same way as we did it for scalar UDFs here: https://github.com/apache/datafusion/pull/18019 ## Are these changes tested? Yes ## Are there any user-facing changes? Yes, the changes are user-facing, documented, purely additive and non-breaking. --- .../functions-aggregate/src/correlation.rs | 4 +- .../src/percentile_cont.rs | 4 +- datafusion/functions-window/src/lead_lag.rs | 8 +- datafusion/sql/src/expr/function.rs | 56 +++++++- .../test_files/named_arguments.slt | 132 ++++++++++++++++++ .../functions/adding-udfs.md | 48 ++----- 6 files changed, 210 insertions(+), 42 deletions(-) diff --git a/datafusion/functions-aggregate/src/correlation.rs b/datafusion/functions-aggregate/src/correlation.rs index 20f23662cadec..f2a464de41550 100644 --- a/datafusion/functions-aggregate/src/correlation.rs +++ b/datafusion/functions-aggregate/src/correlation.rs @@ -88,7 +88,9 @@ impl Correlation { signature: Signature::exact( vec![DataType::Float64, DataType::Float64], Volatility::Immutable, - ), + ) + .with_parameter_names(vec!["y".to_string(), "x".to_string()]) + .expect("valid parameter names for corr"), } } } diff --git a/datafusion/functions-aggregate/src/percentile_cont.rs b/datafusion/functions-aggregate/src/percentile_cont.rs index 7ef0f8baf08d9..1e06461e569fb 100644 --- a/datafusion/functions-aggregate/src/percentile_cont.rs +++ b/datafusion/functions-aggregate/src/percentile_cont.rs @@ -146,7 +146,9 @@ impl PercentileCont { variants.push(TypeSignature::Exact(vec![num.clone(), DataType::Float64])); } Self { - signature: Signature::one_of(variants, Volatility::Immutable), + signature: Signature::one_of(variants, Volatility::Immutable) + .with_parameter_names(vec!["expr".to_string(), "percentile".to_string()]) + .expect("valid parameter names for percentile_cont"), aliases: vec![String::from("quantile_cont")], } } diff --git a/datafusion/functions-window/src/lead_lag.rs b/datafusion/functions-window/src/lead_lag.rs index 3910a0be574d8..02d7fc290b32c 100644 --- a/datafusion/functions-window/src/lead_lag.rs +++ b/datafusion/functions-window/src/lead_lag.rs @@ -137,7 +137,13 @@ impl WindowShift { TypeSignature::Any(3), ], Volatility::Immutable, - ), + ) + .with_parameter_names(vec![ + "expr".to_string(), + "offset".to_string(), + "default".to_string(), + ]) + .expect("valid parameter names for lead/lag"), kind, } } diff --git a/datafusion/sql/src/expr/function.rs b/datafusion/sql/src/expr/function.rs index 2d20aaf523589..50e479af36204 100644 --- a/datafusion/sql/src/expr/function.rs +++ b/datafusion/sql/src/expr/function.rs @@ -386,7 +386,30 @@ impl SqlToRel<'_, S> { }; if let Ok(fun) = self.find_window_func(&name) { - let args = 
self.function_args_to_expr(args, schema, planner_context)?; + let (args, arg_names) = + self.function_args_to_expr_with_names(args, schema, planner_context)?; + + let resolved_args = if arg_names.iter().any(|name| name.is_some()) { + let signature = match &fun { + WindowFunctionDefinition::AggregateUDF(udaf) => udaf.signature(), + WindowFunctionDefinition::WindowUDF(udwf) => udwf.signature(), + }; + + if let Some(param_names) = &signature.parameter_names { + datafusion_expr::arguments::resolve_function_arguments( + param_names, + args, + arg_names, + )? + } else { + return plan_err!( + "Window function '{}' does not support named arguments", + name + ); + } + } else { + args + }; // Plan FILTER clause if present let filter = filter @@ -396,7 +419,7 @@ impl SqlToRel<'_, S> { let mut window_expr = RawWindowExpr { func_def: fun, - args, + args: resolved_args, partition_by, order_by, window_frame, @@ -464,8 +487,8 @@ impl SqlToRel<'_, S> { ); } - let mut args = - self.function_args_to_expr(args, schema, planner_context)?; + let (mut args, mut arg_names) = + self.function_args_to_expr_with_names(args, schema, planner_context)?; let order_by = if fm.supports_within_group_clause() { let within_group = self.order_by_to_sort_expr( @@ -479,6 +502,12 @@ impl SqlToRel<'_, S> { // Add the WITHIN GROUP ordering expressions to the front of the argument list // So function(arg) WITHIN GROUP (ORDER BY x) becomes function(x, arg) if !within_group.is_empty() { + // Prepend None arg names for each WITHIN GROUP expression + let within_group_count = within_group.len(); + arg_names = std::iter::repeat_n(None, within_group_count) + .chain(arg_names) + .collect(); + args = within_group .iter() .map(|sort| sort.expr.clone()) @@ -506,9 +535,26 @@ impl SqlToRel<'_, S> { .transpose()? .map(Box::new); + let resolved_args = if arg_names.iter().any(|name| name.is_some()) { + if let Some(param_names) = &fm.signature().parameter_names { + datafusion_expr::arguments::resolve_function_arguments( + param_names, + args, + arg_names, + )? 
+ } else { + return plan_err!( + "Aggregate function '{}' does not support named arguments", + fm.name() + ); + } + } else { + args + }; + let mut aggregate_expr = RawAggregateExpr { func: fm, - args, + args: resolved_args, distinct, filter, order_by, diff --git a/datafusion/sqllogictest/test_files/named_arguments.slt b/datafusion/sqllogictest/test_files/named_arguments.slt index c93da7e7a8f9e..4eab799fd261a 100644 --- a/datafusion/sqllogictest/test_files/named_arguments.slt +++ b/datafusion/sqllogictest/test_files/named_arguments.slt @@ -137,3 +137,135 @@ SELECT substr(str => 'hello world', start_pos => 7, length => 5); # Reset to default dialect statement ok set datafusion.sql_parser.dialect = 'Generic'; + +############# +## Aggregate UDF Tests - using corr(y, x) function +############# + +# Setup test data +statement ok +CREATE TABLE correlation_test(col1 DOUBLE, col2 DOUBLE) AS VALUES + (1.0, 2.0), + (2.0, 4.0), + (3.0, 6.0), + (4.0, 8.0); + +# Test positional arguments (baseline) +query R +SELECT corr(col1, col2) FROM correlation_test; +---- +1 + +# Test named arguments out of order (proves named args work for aggregates) +query R +SELECT corr(x => col2, y => col1) FROM correlation_test; +---- +1 + +# Error: function doesn't support named arguments (count has no parameter names) +query error DataFusion error: Error during planning: Aggregate function 'count' does not support named arguments +SELECT count(value => col1) FROM correlation_test; + +# Cleanup +statement ok +DROP TABLE correlation_test; + +############# +## Aggregate UDF with WITHIN GROUP Tests - using percentile_cont(expression, percentile) +## This tests the special handling where WITHIN GROUP ORDER BY expressions are prepended to args +############# + +# Setup test data +statement ok +CREATE TABLE percentile_test(salary DOUBLE) AS VALUES + (50000.0), + (60000.0), + (70000.0), + (80000.0), + (90000.0); + +# Test positional arguments (baseline) - standard call without WITHIN GROUP +query R +SELECT percentile_cont(salary, 0.5) FROM percentile_test; +---- +70000 + +# Test WITHIN GROUP with positional argument +query R +SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY salary) FROM percentile_test; +---- +70000 + +# Test WITHIN GROUP with named argument for percentile +# The ORDER BY expression (salary) is prepended internally, becoming: percentile_cont(salary, 0.5) +# We use named argument for percentile, which should work correctly +query R +SELECT percentile_cont(percentile => 0.5) WITHIN GROUP (ORDER BY salary) FROM percentile_test; +---- +70000 + +# Verify the WITHIN GROUP prepending logic with different percentile value +query R +SELECT percentile_cont(percentile => 0.25) WITHIN GROUP (ORDER BY salary) FROM percentile_test; +---- +60000 + +# Cleanup +statement ok +DROP TABLE percentile_test; + +############# +## Window UDF Tests - using lead(expression, offset, default) function +############# + +# Setup test data +statement ok +CREATE TABLE window_test(id INT, value INT) AS VALUES + (1, 10), + (2, 20), + (3, 30), + (4, 40); + +# Test positional arguments (baseline) +query II +SELECT id, lead(value, 1, 0) OVER (ORDER BY id) FROM window_test ORDER BY id; +---- +1 20 +2 30 +3 40 +4 0 + +# Test named arguments out of order (proves named args work for window functions) +query II +SELECT id, lead(default => 0, offset => 1, expr => value) OVER (ORDER BY id) FROM window_test ORDER BY id; +---- +1 20 +2 30 +3 40 +4 0 + +# Test with 1 argument (offset and default use defaults) +query II +SELECT id, lead(expr => value) OVER 
(ORDER BY id) FROM window_test ORDER BY id; +---- +1 20 +2 30 +3 40 +4 NULL + +# Test with 2 arguments (default uses default) +query II +SELECT id, lead(expr => value, offset => 2) OVER (ORDER BY id) FROM window_test ORDER BY id; +---- +1 30 +2 40 +3 NULL +4 NULL + +# Error: function doesn't support named arguments (row_number has no parameter names) +query error DataFusion error: Error during planning: Window function 'row_number' does not support named arguments +SELECT row_number(value => 1) OVER (ORDER BY id) FROM window_test; + +# Cleanup +statement ok +DROP TABLE window_test; diff --git a/docs/source/library-user-guide/functions/adding-udfs.md b/docs/source/library-user-guide/functions/adding-udfs.md index 7581d8b6505ea..e56790a4b7d83 100644 --- a/docs/source/library-user-guide/functions/adding-udfs.md +++ b/docs/source/library-user-guide/functions/adding-udfs.md @@ -588,10 +588,17 @@ For async UDF implementation details, see [`async_udf.rs`](https://github.com/ap ## Named Arguments -DataFusion supports PostgreSQL-style named arguments for scalar functions, allowing you to pass arguments by parameter name: +DataFusion supports named arguments for Scalar, Window, and Aggregate UDFs, allowing you to pass arguments by parameter name: ```sql +-- Scalar function SELECT substr(str => 'hello', start_pos => 2, length => 3); + +-- Window function +SELECT lead(expr => value, offset => 1) OVER (ORDER BY id) FROM table; + +-- Aggregate function +SELECT corr(y => col1, x => col2) FROM table; ``` Named arguments can be mixed with positional arguments, but positional arguments must come first: @@ -602,38 +609,7 @@ SELECT substr('hello', start_pos => 2, length => 3); -- Valid ### Implementing Functions with Named Arguments -To support named arguments in your UDF, add parameter names to your function's signature using `.with_parameter_names()`: - -```rust -# use arrow::datatypes::DataType; -# use datafusion_expr::{Signature, Volatility}; -# -# #[derive(Debug)] -# struct MyFunction { -# signature: Signature, -# } -# -impl MyFunction { - fn new() -> Self { - Self { - signature: Signature::uniform( - 2, - vec![DataType::Float64], - Volatility::Immutable - ) - .with_parameter_names(vec![ - "base".to_string(), - "exponent".to_string() - ]) - .expect("valid parameter names"), - } - } -} -``` - -The parameter names should match the order of arguments in your function's signature. DataFusion automatically resolves named arguments to the correct positional order before invoking your function. - -### Example +To support named arguments in your UDF, add parameter names to your function's signature using `.with_parameter_names()`. This works the same way for Scalar, Window, and Aggregate UDFs: ```rust # use std::sync::Arc; @@ -681,10 +657,14 @@ impl ScalarUDFImpl for PowerFunction { } ``` -Once registered, users can call your function with named arguments: +The parameter names should match the order of arguments in your function's signature. DataFusion automatically resolves named arguments to the correct positional order before invoking your function. 
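+
+For aggregate and window UDFs the wiring is identical; as a minimal sketch (mirroring how the built-in `corr` aggregate declares its signature), only the `Signature` needs the parameter names:
+
+```rust
+# use arrow::datatypes::DataType;
+# use datafusion_expr::{Signature, Volatility};
+#
+# #[derive(Debug)]
+# struct Correlation {
+#     signature: Signature,
+# }
+#
+impl Correlation {
+    fn new() -> Self {
+        Self {
+            // Positional order is (y, x); a call like `corr(x => .., y => ..)`
+            // is reordered to this order before the accumulator is invoked.
+            signature: Signature::exact(
+                vec![DataType::Float64, DataType::Float64],
+                Volatility::Immutable,
+            )
+            .with_parameter_names(vec!["y".to_string(), "x".to_string()])
+            .expect("valid parameter names for corr"),
+        }
+    }
+}
+```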
+ +Once registered, users can call your functions with named arguments in any order: ```sql +-- All equivalent SELECT power(base => 2.0, exponent => 3.0); +SELECT power(exponent => 3.0, base => 2.0); SELECT power(2.0, exponent => 3.0); ``` From b52a81db70002489c20bc78dceb33aea2edf44b7 Mon Sep 17 00:00:00 2001 From: Vegard Stikbakke Date: Wed, 5 Nov 2025 02:49:25 +0100 Subject: [PATCH 0004/1589] Support reverse for ListView (#18424) ## Which issue does this PR close? - Closes #18350. ## Rationale for this change We want to be able to reverse a ListView. ## What changes are included in this PR? - Downcast `&dyn Array` to `ListView`: `as_list_view_array` - Downcast `&dyn Array` to `LargeListView`: `as_large_list_view_array` - Branches in `array_reverse_inner` to reverse `ListView` and `LargeListView` - Main logic in `list_view_reverse` which materializes a new values array using `take` ## Are these changes tested? Yes --- datafusion/common/src/cast.rs | 14 +- datafusion/functions-nested/src/reverse.rs | 254 ++++++++++++++++++- datafusion/sqllogictest/test_files/array.slt | 7 + 3 files changed, 268 insertions(+), 7 deletions(-) diff --git a/datafusion/common/src/cast.rs b/datafusion/common/src/cast.rs index e6eda3c585e89..b95167ca13908 100644 --- a/datafusion/common/src/cast.rs +++ b/datafusion/common/src/cast.rs @@ -24,8 +24,8 @@ use crate::{downcast_value, Result}; use arrow::array::{ BinaryViewArray, Decimal32Array, Decimal64Array, DurationMicrosecondArray, DurationMillisecondArray, DurationNanosecondArray, DurationSecondArray, Float16Array, - Int16Array, Int8Array, LargeBinaryArray, LargeStringArray, StringViewArray, - UInt16Array, + Int16Array, Int8Array, LargeBinaryArray, LargeListViewArray, LargeStringArray, + ListViewArray, StringViewArray, UInt16Array, }; use arrow::{ array::{ @@ -324,3 +324,13 @@ pub fn as_generic_string_array( ) -> Result<&GenericStringArray> { Ok(downcast_value!(array, GenericStringArray, T)) } + +// Downcast Array to ListViewArray +pub fn as_list_view_array(array: &dyn Array) -> Result<&ListViewArray> { + Ok(downcast_value!(array, ListViewArray)) +} + +// Downcast Array to LargeListViewArray +pub fn as_large_list_view_array(array: &dyn Array) -> Result<&LargeListViewArray> { + Ok(downcast_value!(array, LargeListViewArray)) +} diff --git a/datafusion/functions-nested/src/reverse.rs b/datafusion/functions-nested/src/reverse.rs index 870e54f590009..635f23967a198 100644 --- a/datafusion/functions-nested/src/reverse.rs +++ b/datafusion/functions-nested/src/reverse.rs @@ -19,14 +19,18 @@ use crate::utils::make_scalar_function; use arrow::array::{ - Array, ArrayRef, Capacities, FixedSizeListArray, GenericListArray, MutableArrayData, - OffsetSizeTrait, + Array, ArrayRef, Capacities, FixedSizeListArray, GenericListArray, + GenericListViewArray, MutableArrayData, OffsetSizeTrait, UInt32Array, +}; +use arrow::buffer::{OffsetBuffer, ScalarBuffer}; +use arrow::compute::take; +use arrow::datatypes::DataType::{ + FixedSizeList, LargeList, LargeListView, List, ListView, Null, }; -use arrow::buffer::OffsetBuffer; -use arrow::datatypes::DataType::{FixedSizeList, LargeList, List, Null}; use arrow::datatypes::{DataType, FieldRef}; use datafusion_common::cast::{ - as_fixed_size_list_array, as_large_list_array, as_list_array, + as_fixed_size_list_array, as_large_list_array, as_large_list_view_array, + as_list_array, as_list_view_array, }; use datafusion_common::{exec_err, utils::take_function_args, Result}; use datafusion_expr::{ @@ -134,6 +138,14 @@ pub fn array_reverse_inner(arg: 
&[ArrayRef]) -> Result { fixed_size_array_reverse(array, field) } Null => Ok(Arc::clone(input_array)), + ListView(field) => { + let array = as_list_view_array(input_array)?; + list_view_reverse::(array, field) + } + LargeListView(field) => { + let array = as_large_list_view_array(input_array)?; + list_view_reverse::(array, field) + } array_type => exec_err!("array_reverse does not support type '{array_type}'."), } } @@ -175,6 +187,75 @@ fn general_array_reverse( )?)) } +/// Reverses a list view array. +/// +/// Construct indices, sizes and offsets for the reversed array by iterating over +/// the list view array in the logical order, and reversing the order of the elements. +/// We end up with a list view array where the elements are in order, +/// even if the original array had elements out of order. +fn list_view_reverse( + array: &GenericListViewArray, + field: &FieldRef, +) -> Result { + let offsets = array.offsets(); + let values = array.values(); + let sizes = array.sizes(); + + let mut new_offsets: Vec = Vec::with_capacity(offsets.len()); + let mut indices: Vec = Vec::with_capacity(values.len()); + let mut new_sizes = Vec::with_capacity(sizes.len()); + + let mut current_offset = O::zero(); + for (row_index, offset) in offsets.iter().enumerate() { + new_offsets.push(current_offset); + + // If this array is null, we set its size to 0 and continue + if array.is_null(row_index) { + new_sizes.push(O::zero()); + continue; + } + let size = sizes[row_index]; + new_sizes.push(size); + + // Each array is located at [offset, offset + size), collect indices in the reverse order + let array_start = *offset; + let array_end = array_start + size; + let mut idx = array_end - O::one(); + while idx >= array_start { + indices.push(idx); + idx = idx - O::one(); + } + + current_offset += size; + } + + // Materialize values from underlying array with take + let indices_array: ArrayRef = if O::IS_LARGE { + Arc::new(arrow::array::UInt64Array::from( + indices + .iter() + .map(|i| i.as_usize() as u64) + .collect::>(), + )) + } else { + Arc::new(UInt32Array::from( + indices + .iter() + .map(|i| i.as_usize() as u32) + .collect::>(), + )) + }; + let values_reversed = take(&values, &indices_array, None)?; + + Ok(Arc::new(GenericListViewArray::::try_new( + Arc::clone(field), + ScalarBuffer::from(new_offsets), + ScalarBuffer::from(new_sizes), + values_reversed, + array.nulls().cloned(), + )?)) +} + fn fixed_size_array_reverse( array: &FixedSizeListArray, field: &FieldRef, @@ -207,3 +288,166 @@ fn fixed_size_array_reverse( array.nulls().cloned(), )?)) } + +#[cfg(test)] +mod tests { + use crate::reverse::list_view_reverse; + use arrow::{ + array::{ + AsArray, GenericListViewArray, Int32Array, LargeListViewArray, ListViewArray, + OffsetSizeTrait, + }, + buffer::{NullBuffer, ScalarBuffer}, + datatypes::{DataType, Field, Int32Type}, + }; + use datafusion_common::Result; + use std::sync::Arc; + + fn list_view_values( + array: &GenericListViewArray, + ) -> Vec>> { + array + .iter() + .map(|x| x.map(|x| x.as_primitive::().values().to_vec())) + .collect() + } + + #[test] + fn test_reverse_list_view() -> Result<()> { + let field = Arc::new(Field::new("a", DataType::Int32, false)); + let offsets = ScalarBuffer::from(vec![0, 1, 6, 6]); + let sizes = ScalarBuffer::from(vec![1, 5, 0, 3]); + let values = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9])); + let nulls = Some(NullBuffer::from(vec![true, true, false, true])); + let list_view = ListViewArray::new(field, offsets, sizes, values, nulls); + let result = 
list_view_reverse( + &list_view, + &Arc::new(Field::new("test", DataType::Int32, true)), + )?; + let reversed = list_view_values(result.as_list_view::()); + let expected = vec![ + Some(vec![1]), + Some(vec![6, 5, 4, 3, 2]), + None, + Some(vec![9, 8, 7]), + ]; + assert_eq!(expected, reversed); + Ok(()) + } + + #[test] + fn test_reverse_large_list_view() -> Result<()> { + let field = Arc::new(Field::new("a", DataType::Int32, false)); + let offsets = ScalarBuffer::from(vec![0, 1, 6, 6]); + let sizes = ScalarBuffer::from(vec![1, 5, 0, 3]); + let values = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9])); + let nulls = Some(NullBuffer::from(vec![true, true, false, true])); + let list_view = LargeListViewArray::new(field, offsets, sizes, values, nulls); + let result = list_view_reverse( + &list_view, + &Arc::new(Field::new("test", DataType::Int32, true)), + )?; + let reversed = list_view_values(result.as_list_view::()); + let expected = vec![ + Some(vec![1]), + Some(vec![6, 5, 4, 3, 2]), + None, + Some(vec![9, 8, 7]), + ]; + assert_eq!(expected, reversed); + Ok(()) + } + + #[test] + fn test_reverse_list_view_out_of_order() -> Result<()> { + let field = Arc::new(Field::new("a", DataType::Int32, false)); + let offsets = ScalarBuffer::from(vec![6, 1, 6, 0]); // out of order + let sizes = ScalarBuffer::from(vec![3, 5, 0, 1]); + let values = Arc::new(Int32Array::from(vec![ + 1, // fourth array: offset 0, size 1 + 2, 3, 4, 5, 6, // second array: offset 1, size 5 + // third array: offset 6, size 0 (and null) + 7, 8, 9, // first array: offset 6, size 3 + ])); + let nulls = Some(NullBuffer::from(vec![true, true, false, true])); + let list_view = ListViewArray::new(field, offsets, sizes, values, nulls); + let result = list_view_reverse( + &list_view, + &Arc::new(Field::new("test", DataType::Int32, true)), + )?; + let reversed = list_view_values(result.as_list_view::()); + let expected = vec![ + Some(vec![9, 8, 7]), + Some(vec![6, 5, 4, 3, 2]), + None, + Some(vec![1]), + ]; + assert_eq!(expected, reversed); + Ok(()) + } + + #[test] + fn test_reverse_list_view_with_nulls() -> Result<()> { + let field = Arc::new(Field::new("a", DataType::Int32, false)); + let offsets = ScalarBuffer::from(vec![16, 1, 6, 0]); // out of order + let sizes = ScalarBuffer::from(vec![3, 5, 10, 1]); + let values = Arc::new(Int32Array::from(vec![ + 1, // fourth array: offset 0, size 1 + 2, 3, 4, 5, 6, // second array: offset 1, size 5 + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, // third array: offset 6, size 10 + 7, 8, 9, // first array: offset 6, size 3 + ])); + let nulls = Some(NullBuffer::from(vec![true, true, false, true])); + let list_view = ListViewArray::new(field, offsets, sizes, values, nulls); + let result = list_view_reverse( + &list_view, + &Arc::new(Field::new("test", DataType::Int32, true)), + )?; + let reversed = list_view_values(result.as_list_view::()); + let expected = vec![ + Some(vec![9, 8, 7]), + Some(vec![6, 5, 4, 3, 2]), + None, + Some(vec![1]), + ]; + assert_eq!(expected, reversed); + Ok(()) + } + + #[test] + fn test_reverse_list_view_empty() -> Result<()> { + let field = Arc::new(Field::new("a", DataType::Int32, false)); + let offsets = ScalarBuffer::from(vec![]); + let sizes = ScalarBuffer::from(vec![]); + let empty_array: Vec = vec![]; + let values = Arc::new(Int32Array::from(empty_array)); + let nulls = None; + let list_view = ListViewArray::new(field, offsets, sizes, values, nulls); + let result = list_view_reverse( + &list_view, + &Arc::new(Field::new("test", DataType::Int32, true)), + )?; + let reversed = 
list_view_values(result.as_list_view::()); + let expected: Vec>> = vec![]; + assert_eq!(expected, reversed); + Ok(()) + } + + #[test] + fn test_reverse_list_view_all_nulls() -> Result<()> { + let field = Arc::new(Field::new("a", DataType::Int32, false)); + let offsets = ScalarBuffer::from(vec![0, 1, 2, 3]); + let sizes = ScalarBuffer::from(vec![0, 1, 1, 1]); + let values = Arc::new(Int32Array::from(vec![1, 2, 3, 4])); + let nulls = Some(NullBuffer::from(vec![false, false, false, false])); + let list_view = ListViewArray::new(field, offsets, sizes, values, nulls); + let result = list_view_reverse( + &list_view, + &Arc::new(Field::new("test", DataType::Int32, true)), + )?; + let reversed = list_view_values(result.as_list_view::()); + let expected: Vec>> = vec![None, None, None, None]; + assert_eq!(expected, reversed); + Ok(()) + } +} diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index 38bdd7f3e3eb3..00629c392df48 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -8384,6 +8384,13 @@ select array_contains(a, b) from array_has order by 1 nulls last; true NULL +# TODO: Enable once arrow_cast supports ListView types. +# Expected output (once supported): +# ---- +# [5, 4, 3, 2, 1] +query error +select array_reverse(arrow_cast(make_array(1, 2, 3, 4, 5), 'ListView(Int64)')); + ### Delete tables statement ok From f7a9f2449f7e45bf4dbff49979089e5c619b1faf Mon Sep 17 00:00:00 2001 From: jizezhang Date: Wed, 5 Nov 2025 00:06:31 -0800 Subject: [PATCH 0005/1589] fix: spark array return type mismatch when inner data type is LargeList (#18485) ## Which issue does this PR close? - This PR came up as part of #17964. ## Rationale for this change This PR is intended to fix return type mismatch of spark `array` when inner data type is `LargeList`, e.g. ``` query error SELECT array(arrow_cast(array(1), 'LargeList(Int64)')) ---- DataFusion error: Internal error: Function 'array' returned value of type 'LargeList(Field { name: "element", data_type: LargeList(Field { data_type: Int64, nullable: true }), nullable: true })' while the following type was promised at planning time and expected: 'List(Field { name: "element", data_type: LargeList(Field { data_type: Int64, nullable: true }), nullable: true })'. This issue was likely caused by a bug in DataFusion's code. Please help us to resolve this by filing a bug report in our issue tracker: https://github.com/apache/datafusion/issues ``` ## What changes are included in this PR? - Return `List` regardless of whether inner data type is `LargeList` or not. This aligns with the behavior of datafusion `make_array` function. - Remove `return_field_from_args` as `return_type` is already defined and is invoked internally. ## Are these changes tested? Yes ## Are there any user-facing changes? No. 
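
For reference, the sqllogictest coverage added in this PR illustrates the fixed behavior; one representative case, with the query and expected output as written in the test file (the inner type stays `LargeList`, while the outer type is now consistently a `List`):

```
query ?
SELECT array(arrow_cast(array(1), 'LargeList(Int64)'));
----
[[1]]
```
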
--------- Co-authored-by: Michael Kleen Co-authored-by: Sergey Zhukov <62326549+cj-zhukov@users.noreply.github.com> Co-authored-by: Sergey Zhukov Co-authored-by: Andrew Lamb Co-authored-by: bubulalabu Co-authored-by: Vegard Stikbakke --- .../spark/src/function/array/spark_array.rs | 31 ++++++++++--------- .../test_files/spark/array/array.slt | 15 +++++++++ 2 files changed, 31 insertions(+), 15 deletions(-) diff --git a/datafusion/spark/src/function/array/spark_array.rs b/datafusion/spark/src/function/array/spark_array.rs index bf5842cb5a5a6..bb9665613de9b 100644 --- a/datafusion/spark/src/function/array/spark_array.rs +++ b/datafusion/spark/src/function/array/spark_array.rs @@ -24,7 +24,7 @@ use arrow::array::{ use arrow::buffer::OffsetBuffer; use arrow::datatypes::{DataType, Field, FieldRef}; use datafusion_common::utils::SingleRowListArrayBuilder; -use datafusion_common::{plan_datafusion_err, plan_err, Result}; +use datafusion_common::{internal_err, plan_datafusion_err, plan_err, Result}; use datafusion_expr::type_coercion::binary::comparison_coercion; use datafusion_expr::{ ColumnarValue, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl, Signature, @@ -72,9 +72,20 @@ impl ScalarUDFImpl for SparkArray { &self.signature } - fn return_type(&self, arg_types: &[DataType]) -> Result { + fn return_type(&self, _arg_types: &[DataType]) -> Result { + internal_err!("return_field_from_args should be used instead") + } + + fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result { + let data_types = args + .arg_fields + .iter() + .map(|f| f.data_type()) + .cloned() + .collect::>(); + let mut expr_type = DataType::Null; - for arg_type in arg_types { + for arg_type in &data_types { if !arg_type.equals_datatype(&DataType::Null) { expr_type = arg_type.clone(); break; @@ -85,21 +96,12 @@ impl ScalarUDFImpl for SparkArray { expr_type = DataType::Int32; } - Ok(DataType::List(Arc::new(Field::new( + let return_type = DataType::List(Arc::new(Field::new( ARRAY_FIELD_DEFAULT_NAME, expr_type, true, - )))) - } + ))); - fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result { - let data_types = args - .arg_fields - .iter() - .map(|f| f.data_type()) - .cloned() - .collect::>(); - let return_type = self.return_type(&data_types)?; Ok(Arc::new(Field::new( "this_field_name_is_irrelevant", return_type, @@ -166,7 +168,6 @@ pub fn make_array_inner(arrays: &[ArrayRef]) -> Result { .build_list_array(), )) } - DataType::LargeList(..) => array_array::(arrays, data_type), _ => array_array::(arrays, data_type), } } diff --git a/datafusion/sqllogictest/test_files/spark/array/array.slt b/datafusion/sqllogictest/test_files/spark/array/array.slt index 09821e6d582d2..79dca1c10a7d0 100644 --- a/datafusion/sqllogictest/test_files/spark/array/array.slt +++ b/datafusion/sqllogictest/test_files/spark/array/array.slt @@ -70,3 +70,18 @@ query ? SELECT array(array(1,2)); ---- [[1, 2]] + +query ? +SELECT array(arrow_cast(array(1), 'LargeList(Int64)')); +---- +[[1]] + +query ? +SELECT array(arrow_cast(array(1), 'LargeList(Int64)'), arrow_cast(array(), 'LargeList(Int64)')); +---- +[[1], []] + +query ? +SELECT array(arrow_cast(array(1,2), 'LargeList(Int64)'), array(3)); +---- +[[1, 2], [3]] From 8363c8974befac9760f3036d10c09ae6cb28b604 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Wed, 5 Nov 2025 00:09:49 -0800 Subject: [PATCH 0006/1589] feat: Add selectivity metric to NestedLoopJoinExec for EXPLAIN ANALYZE (#18481) ## Which issue does this PR close? 
- Closes #18407 ## Rationale for this change This new metric will give the user better visibility to see what portion of the possibilities is actually being matched. ## What changes are included in this PR? Add selectivity metric to NestedLoopJoinExec for EXPLAIN ANALYZE ## Are these changes tested? Added tests ## Are there any user-facing changes? Yes, new metric in explain analyze --------- Co-authored-by: Yongting You <2010youy01@gmail.com> --- datafusion/core/tests/sql/explain_analyze.rs | 30 +++++++ .../src/joins/nested_loop_join.rs | 78 +++++++++++++------ 2 files changed, 86 insertions(+), 22 deletions(-) diff --git a/datafusion/core/tests/sql/explain_analyze.rs b/datafusion/core/tests/sql/explain_analyze.rs index 8d98b91547fe7..26b71b5496f29 100644 --- a/datafusion/core/tests/sql/explain_analyze.rs +++ b/datafusion/core/tests/sql/explain_analyze.rs @@ -1110,3 +1110,33 @@ async fn csv_explain_analyze_with_statistics() { ", statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:)]]" ); } + +#[tokio::test] +async fn nested_loop_join_selectivity() { + for (join_type, expected_selectivity) in [ + ("INNER", "1% (1/100)"), + ("LEFT", "10% (10/100)"), + ("RIGHT", "10% (10/100)"), + // 1 match + 9 left + 9 right = 19 + ("FULL", "19% (19/100)"), + ] { + let ctx = SessionContext::new(); + let sql = format!( + "EXPLAIN ANALYZE SELECT * \ + FROM generate_series(1, 10) as t1(a) \ + {join_type} JOIN generate_series(1, 10) as t2(b) \ + ON (t1.a + t2.b) = 20" + ); + + let actual = execute_to_batches(&ctx, sql.as_str()).await; + let formatted = arrow::util::pretty::pretty_format_batches(&actual) + .unwrap() + .to_string(); + + assert_metrics!( + &formatted, + "NestedLoopJoinExec", + &format!("selectivity={expected_selectivity}") + ); + } +} diff --git a/datafusion/physical-plan/src/joins/nested_loop_join.rs b/datafusion/physical-plan/src/joins/nested_loop_join.rs index 7ae09a42de880..1f0cdf391c1f9 100644 --- a/datafusion/physical-plan/src/joins/nested_loop_join.rs +++ b/datafusion/physical-plan/src/joins/nested_loop_join.rs @@ -36,7 +36,9 @@ use crate::joins::utils::{ OnceAsync, OnceFut, }; use crate::joins::SharedBitmapBuilder; -use crate::metrics::{Count, ExecutionPlanMetricsSet, MetricsSet}; +use crate::metrics::{ + Count, ExecutionPlanMetricsSet, MetricBuilder, MetricType, MetricsSet, RatioMetrics, +}; use crate::projection::{ try_embed_projection, try_pushdown_through_join, EmbeddedProjection, JoinData, ProjectionExec, @@ -496,7 +498,7 @@ impl ExecutionPlan for NestedLoopJoinExec { ); } - let join_metrics = BuildProbeJoinMetrics::new(partition, &self.metrics); + let metrics = NestedLoopJoinMetrics::new(&self.metrics, partition); // Initialization reservation for load of inner table let load_reservation = @@ -508,7 +510,7 @@ impl ExecutionPlan for NestedLoopJoinExec { Ok(collect_left_input( stream, - join_metrics.clone(), + metrics.join_metrics.clone(), load_reservation, need_produce_result_in_final(self.join_type), self.right().output_partitioning().partition_count(), @@ -535,7 +537,7 @@ impl ExecutionPlan for NestedLoopJoinExec { probe_side_data, build_side_data, column_indices_after_projection, - join_metrics, + metrics, batch_size, ))) } @@ -749,7 +751,7 @@ pub(crate) struct NestedLoopJoinStream { /// the join filter (e.g., `JOIN ON (b+c)>0`). 
pub(crate) column_indices: Vec, /// Join execution metrics - pub(crate) join_metrics: BuildProbeJoinMetrics, + pub(crate) metrics: NestedLoopJoinMetrics, /// `batch_size` from configuration batch_size: usize, @@ -794,6 +796,24 @@ pub(crate) struct NestedLoopJoinStream { current_right_batch_matched: Option, } +pub(crate) struct NestedLoopJoinMetrics { + /// Join execution metrics + pub(crate) join_metrics: BuildProbeJoinMetrics, + /// Selectivity of the join: output_rows / (left_rows * right_rows) + pub(crate) selectivity: RatioMetrics, +} + +impl NestedLoopJoinMetrics { + pub fn new(metrics: &ExecutionPlanMetricsSet, partition: usize) -> Self { + Self { + join_metrics: BuildProbeJoinMetrics::new(partition, metrics), + selectivity: MetricBuilder::new(metrics) + .with_type(MetricType::SUMMARY) + .ratio_metrics("selectivity", partition), + } + } +} + impl Stream for NestedLoopJoinStream { type Item = Result; @@ -844,7 +864,7 @@ impl Stream for NestedLoopJoinStream { // -side batches), related metrics except build time will be // updated. // stop on drop - let build_metric = self.join_metrics.build_time.clone(); + let build_metric = self.metrics.join_metrics.build_time.clone(); let _build_timer = build_metric.timer(); match self.handle_buffering_left(cx) { @@ -878,7 +898,7 @@ impl Stream for NestedLoopJoinStream { NLJState::FetchingRight => { debug!("[NLJState] Entering: {:?}", self.state); // stop on drop - let join_metric = self.join_metrics.join_time.clone(); + let join_metric = self.metrics.join_metrics.join_time.clone(); let _join_timer = join_metric.timer(); match self.handle_fetching_right(cx) { @@ -905,13 +925,13 @@ impl Stream for NestedLoopJoinStream { debug!("[NLJState] Entering: {:?}", self.state); // stop on drop - let join_metric = self.join_metrics.join_time.clone(); + let join_metric = self.metrics.join_metrics.join_time.clone(); let _join_timer = join_metric.timer(); match self.handle_probe_right() { ControlFlow::Continue(()) => continue, ControlFlow::Break(poll) => { - return self.join_metrics.baseline.record_poll(poll) + return self.metrics.join_metrics.baseline.record_poll(poll) } } } @@ -926,13 +946,13 @@ impl Stream for NestedLoopJoinStream { debug!("[NLJState] Entering: {:?}", self.state); // stop on drop - let join_metric = self.join_metrics.join_time.clone(); + let join_metric = self.metrics.join_metrics.join_time.clone(); let _join_timer = join_metric.timer(); match self.handle_emit_right_unmatched() { ControlFlow::Continue(()) => continue, ControlFlow::Break(poll) => { - return self.join_metrics.baseline.record_poll(poll) + return self.metrics.join_metrics.baseline.record_poll(poll) } } } @@ -956,13 +976,13 @@ impl Stream for NestedLoopJoinStream { debug!("[NLJState] Entering: {:?}", self.state); // stop on drop - let join_metric = self.join_metrics.join_time.clone(); + let join_metric = self.metrics.join_metrics.join_time.clone(); let _join_timer = join_metric.timer(); match self.handle_emit_left_unmatched() { ControlFlow::Continue(()) => continue, ControlFlow::Break(poll) => { - return self.join_metrics.baseline.record_poll(poll) + return self.metrics.join_metrics.baseline.record_poll(poll) } } } @@ -972,13 +992,13 @@ impl Stream for NestedLoopJoinStream { debug!("[NLJState] Entering: {:?}", self.state); // stop on drop - let join_metric = self.join_metrics.join_time.clone(); + let join_metric = self.metrics.join_metrics.join_time.clone(); let _join_timer = join_metric.timer(); // counting it in join timer due to there might be some // final resout batches to 
output in this state let poll = self.handle_done(); - return self.join_metrics.baseline.record_poll(poll); + return self.metrics.join_metrics.baseline.record_poll(poll); } } } @@ -1000,7 +1020,7 @@ impl NestedLoopJoinStream { right_data: SendableRecordBatchStream, left_data: OnceFut, column_indices: Vec, - join_metrics: BuildProbeJoinMetrics, + metrics: NestedLoopJoinMetrics, batch_size: usize, ) -> Self { Self { @@ -1010,7 +1030,7 @@ impl NestedLoopJoinStream { right_data, column_indices, left_data, - join_metrics, + metrics, buffered_left_data: None, output_buffer: Box::new(BatchCoalescer::new(schema, batch_size)), batch_size, @@ -1057,8 +1077,8 @@ impl NestedLoopJoinStream { Some(Ok(right_batch)) => { // Update metrics let right_batch_size = right_batch.num_rows(); - self.join_metrics.input_rows.add(right_batch_size); - self.join_metrics.input_batches.add(1); + self.metrics.join_metrics.input_rows.add(right_batch_size); + self.metrics.join_metrics.input_batches.add(1); // Skip the empty batch if right_batch_size == 0 { @@ -1108,6 +1128,17 @@ impl NestedLoopJoinStream { Ok(false) => { // Left exhausted, transition to FetchingRight self.left_probe_idx = 0; + + // Selectivity Metric: Update total possibilities for the batch (left_rows * right_rows) + // If memory-limited execution is implemented, this logic must be updated accordingly. + if let (Ok(left_data), Some(right_batch)) = + (self.get_left_data(), self.current_right_batch.as_ref()) + { + let left_rows = left_data.batch().num_rows(); + let right_rows = right_batch.num_rows(); + self.metrics.selectivity.add_total(left_rows * right_rows); + } + if self.should_track_unmatched_right { debug_assert!( self.current_right_batch_matched.is_some(), @@ -1138,7 +1169,6 @@ impl NestedLoopJoinStream { && self.current_right_batch.is_some(), "This state is yielding output for unmatched rows in the current right batch, so both the right batch and the bitmap must be present" ); - // Construct the result batch for unmatched right rows using a utility function match self.process_right_unmatched() { Ok(Some(batch)) => { @@ -1205,7 +1235,7 @@ impl NestedLoopJoinStream { // should be with the expected schema for this operator if !self.handled_empty_output { let zero_count = Count::new(); - if *self.join_metrics.baseline.output_rows() == zero_count { + if *self.metrics.join_metrics.baseline.output_rows() == zero_count { let empty_batch = RecordBatch::new_empty(Arc::clone(&self.output_schema)); self.handled_empty_output = true; return Poll::Ready(Some(Ok(empty_batch))); @@ -1455,7 +1485,11 @@ impl NestedLoopJoinStream { if let Some(batch) = self.output_buffer.next_completed_batch() { // HACK: this is not part of `BaselineMetrics` yet, so update it // manually - self.join_metrics.output_batches.add(1); + self.metrics.join_metrics.output_batches.add(1); + + // Update output rows for selectivity metric + let output_rows = batch.num_rows(); + self.metrics.selectivity.add_part(output_rows); return Some(Poll::Ready(Some(Ok(batch)))); } From acdd263812eb449c856d74e1d0395dd1705f0cd7 Mon Sep 17 00:00:00 2001 From: Dmitrii Blaginin Date: Wed, 5 Nov 2025 10:08:40 +0000 Subject: [PATCH 0007/1589] Complete migrating `enforce_distrubution` tests to insta (#18185) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Closes https://github.com/apache/datafusion/issues/15791 - Closes https://github.com/apache/datafusion/issues/15178 🥳 - Surpasses part of https://github.com/apache/datafusion/pull/16978 --------- Co-authored-by: Claude 
Co-authored-by: Andrew Lamb --- .../enforce_distribution.rs | 634 ++++++++---------- 1 file changed, 282 insertions(+), 352 deletions(-) diff --git a/datafusion/core/tests/physical_optimizer/enforce_distribution.rs b/datafusion/core/tests/physical_optimizer/enforce_distribution.rs index db011c4be43ab..5b7d9ac8fbe99 100644 --- a/datafusion/core/tests/physical_optimizer/enforce_distribution.rs +++ b/datafusion/core/tests/physical_optimizer/enforce_distribution.rs @@ -66,8 +66,8 @@ use datafusion_physical_plan::projection::{ProjectionExec, ProjectionExpr}; use datafusion_physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; use datafusion_physical_plan::union::UnionExec; use datafusion_physical_plan::{ - displayable, get_plan_string, DisplayAs, DisplayFormatType, ExecutionPlanProperties, - PlanProperties, Statistics, + displayable, DisplayAs, DisplayFormatType, ExecutionPlanProperties, PlanProperties, + Statistics, }; use insta::Settings; @@ -469,83 +469,6 @@ impl TestConfig { self } - // This be deleted in https://github.com/apache/datafusion/pull/18185 - /// Perform a series of runs using the current [`TestConfig`], - /// assert the expected plan result, - /// and return the result plan (for potential subsequent runs). - fn run( - &self, - expected_lines: &[&str], - plan: Arc, - optimizers_to_run: &[Run], - ) -> Result> { - let expected_lines: Vec<&str> = expected_lines.to_vec(); - - // Add the ancillary output requirements operator at the start: - let optimizer = OutputRequirements::new_add_mode(); - let mut optimized = optimizer.optimize(plan.clone(), &self.config)?; - - // This file has 2 rules that use tree node, apply these rules to original plan consecutively - // After these operations tree nodes should be in a consistent state. - // This code block makes sure that these rules doesn't violate tree node integrity. - { - let adjusted = if self.config.optimizer.top_down_join_key_reordering { - // Run adjust_input_keys_ordering rule - let plan_requirements = - PlanWithKeyRequirements::new_default(plan.clone()); - let adjusted = plan_requirements - .transform_down(adjust_input_keys_ordering) - .data() - .and_then(check_integrity)?; - // TODO: End state payloads will be checked here. - adjusted.plan - } else { - // Run reorder_join_keys_to_inputs rule - plan.clone() - .transform_up(|plan| { - Ok(Transformed::yes(reorder_join_keys_to_inputs(plan)?)) - }) - .data()? - }; - - // Then run ensure_distribution rule - DistributionContext::new_default(adjusted) - .transform_up(|distribution_context| { - ensure_distribution(distribution_context, &self.config) - }) - .data() - .and_then(check_integrity)?; - // TODO: End state payloads will be checked here. - } - - for run in optimizers_to_run { - optimized = match run { - Run::Distribution => { - let optimizer = EnforceDistribution::new(); - optimizer.optimize(optimized, &self.config)? - } - Run::Sorting => { - let optimizer = EnforceSorting::new(); - optimizer.optimize(optimized, &self.config)? 
- } - }; - } - - // Remove the ancillary output requirements operator when done: - let optimizer = OutputRequirements::new_remove_mode(); - let optimized = optimizer.optimize(optimized, &self.config)?; - - // Now format correctly - let actual_lines = get_plan_string(&optimized); - - assert_eq!( - &expected_lines, &actual_lines, - "\n\nexpected:\n\n{expected_lines:#?}\nactual:\n\n{actual_lines:#?}\n\n" - ); - - Ok(optimized) - } - /// Perform a series of runs using the current [`TestConfig`], /// assert the expected plan result, /// and return the result plan (for potential subsequent runs). @@ -1503,15 +1426,6 @@ fn multi_smj_joins() -> Result<()> { for join_type in join_types { let join = sort_merge_join_exec(left.clone(), right.clone(), &join_on, &join_type); - let join_plan = |shift| -> String { - format!( - "{}SortMergeJoin: join_type={join_type}, on=[(a@0, b1@1)]", - " ".repeat(shift) - ) - }; - let join_plan_indent2 = join_plan(2); - let join_plan_indent6 = join_plan(6); - let join_plan_indent10 = join_plan(10); // Top join on (a == c) let top_join_on = vec![( @@ -1520,235 +1434,246 @@ fn multi_smj_joins() -> Result<()> { )]; let top_join = sort_merge_join_exec(join.clone(), parquet_exec(), &top_join_on, &join_type); - let top_join_plan = - format!("SortMergeJoin: join_type={join_type}, on=[(a@0, c@2)]"); - - let expected = match join_type { - // Should include 6 RepartitionExecs (3 hash, 3 round-robin), 3 SortExecs - JoinType::Inner | JoinType::Left | JoinType::LeftSemi | JoinType::LeftAnti => - vec![ - top_join_plan.as_str(), - &join_plan_indent2, - " SortExec: expr=[a@0 ASC], preserve_partitioning=[true]", - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " SortExec: expr=[b1@1 ASC], preserve_partitioning=[true]", - " RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " SortExec: expr=[c@2 ASC], preserve_partitioning=[true]", - " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ], - // Should include 7 RepartitionExecs (4 hash, 3 round-robin), 4 SortExecs - // Since ordering of the left child is not preserved after SortMergeJoin - // when mode is Right, RightSemi, RightAnti, Full - // - We need to add one additional SortExec after SortMergeJoin in contrast the test cases - // when mode is Inner, Left, LeftSemi, LeftAnti - // Similarly, since partitioning of the left side is not preserved - // when mode is Right, RightSemi, RightAnti, Full - // - We need to add one additional Hash Repartition after SortMergeJoin in contrast the test - // cases when mode is Inner, Left, LeftSemi, LeftAnti - _ => vec![ - top_join_plan.as_str(), - // Below 2 operators are differences introduced, when join mode is changed - " SortExec: expr=[a@0 ASC], preserve_partitioning=[true]", - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - &join_plan_indent6, - " SortExec: expr=[a@0 ASC], preserve_partitioning=[true]", - " 
RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " SortExec: expr=[b1@1 ASC], preserve_partitioning=[true]", - " RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " SortExec: expr=[c@2 ASC], preserve_partitioning=[true]", - " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ], - }; - // TODO(wiedld): show different test result if enforce sorting first. - test_config.run(&expected, top_join.clone(), &DISTRIB_DISTRIB_SORT)?; - - let expected_first_sort_enforcement = match join_type { - // Should include 6 RepartitionExecs (3 hash, 3 round-robin), 3 SortExecs - JoinType::Inner | JoinType::Left | JoinType::LeftSemi | JoinType::LeftAnti => - vec![ - top_join_plan.as_str(), - &join_plan_indent2, - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[a@0 ASC], preserve_partitioning=[false]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10, preserve_order=true, sort_exprs=b1@1 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[b1@1 ASC], preserve_partitioning=[false]", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10, preserve_order=true, sort_exprs=c@2 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ], - // Should include 8 RepartitionExecs (4 hash, 8 round-robin), 4 SortExecs - // Since ordering of the left child is not preserved after SortMergeJoin - // when mode is Right, RightSemi, RightAnti, Full - // - We need to add one additional SortExec after SortMergeJoin in contrast the test cases - // when mode is Inner, Left, LeftSemi, LeftAnti - // Similarly, since partitioning of the left side is not preserved - // when mode is Right, RightSemi, RightAnti, Full - // - We need to add one additional Hash Repartition and Roundrobin repartition after - // SortMergeJoin in contrast the test cases when mode is Inner, Left, LeftSemi, LeftAnti - _ => vec![ - top_join_plan.as_str(), - // Below 4 operators are differences introduced, when join mode is changed - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[a@0 ASC], preserve_partitioning=[false]", - " CoalescePartitionsExec", - 
&join_plan_indent10, - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[a@0 ASC], preserve_partitioning=[false]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10, preserve_order=true, sort_exprs=b1@1 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[b1@1 ASC], preserve_partitioning=[false]", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10, preserve_order=true, sort_exprs=c@2 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ], - }; - // TODO(wiedld): show different test result if enforce distribution first. - test_config.run( - &expected_first_sort_enforcement, - top_join, - &SORT_DISTRIB_DISTRIB, - )?; - match join_type { - JoinType::Inner | JoinType::Left | JoinType::Right | JoinType::Full => { - // This time we use (b1 == c) for top join - // Join on (b1 == c) - let top_join_on = vec![( - Arc::new(Column::new_with_schema("b1", &join.schema()).unwrap()) as _, - Arc::new(Column::new_with_schema("c", &schema()).unwrap()) as _, - )]; - let top_join = - sort_merge_join_exec(join, parquet_exec(), &top_join_on, &join_type); - let top_join_plan = - format!("SortMergeJoin: join_type={join_type}, on=[(b1@6, c@2)]"); - - let expected = match join_type { - // Should include 6 RepartitionExecs(3 hash, 3 round-robin) and 3 SortExecs - JoinType::Inner | JoinType::Right => vec![ - top_join_plan.as_str(), - &join_plan_indent2, - " SortExec: expr=[a@0 ASC], preserve_partitioning=[true]", - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " SortExec: expr=[b1@1 ASC], preserve_partitioning=[true]", - " RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " SortExec: expr=[c@2 ASC], preserve_partitioning=[true]", - " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ], - // Should include 7 RepartitionExecs (4 hash, 3 round-robin) and 4 SortExecs - JoinType::Left | JoinType::Full => vec![ - top_join_plan.as_str(), - " SortExec: expr=[b1@6 ASC], preserve_partitioning=[true]", - " RepartitionExec: partitioning=Hash([b1@6], 10), input_partitions=10", - &join_plan_indent6, - " SortExec: expr=[a@0 ASC], preserve_partitioning=[true]", - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " RepartitionExec: 
partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " SortExec: expr=[b1@1 ASC], preserve_partitioning=[true]", - " RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " SortExec: expr=[c@2 ASC], preserve_partitioning=[true]", - " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ], - // this match arm cannot be reached - _ => unreachable!() - }; - // TODO(wiedld): show different test result if enforce sorting first. - test_config.run(&expected, top_join.clone(), &DISTRIB_DISTRIB_SORT)?; - - let expected_first_sort_enforcement = match join_type { - // Should include 6 RepartitionExecs (3 of them preserves order) and 3 SortExecs - JoinType::Inner | JoinType::Right => vec![ - top_join_plan.as_str(), - &join_plan_indent2, - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[a@0 ASC], preserve_partitioning=[false]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10, preserve_order=true, sort_exprs=b1@1 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[b1@1 ASC], preserve_partitioning=[false]", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10, preserve_order=true, sort_exprs=c@2 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ], - // Should include 8 RepartitionExecs (4 of them preserves order) and 4 SortExecs - JoinType::Left | JoinType::Full => vec![ - top_join_plan.as_str(), - " RepartitionExec: partitioning=Hash([b1@6], 10), input_partitions=10, preserve_order=true, sort_exprs=b1@6 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[b1@6 ASC], preserve_partitioning=[false]", - " CoalescePartitionsExec", - &join_plan_indent10, - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[a@0 ASC], preserve_partitioning=[false]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10, preserve_order=true, sort_exprs=b1@1 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[b1@1 ASC], preserve_partitioning=[false]", - " ProjectionExec: expr=[a@0 as a1, b@1 as 
b1, c@2 as c1, d@3 as d1, e@4 as e1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10, preserve_order=true, sort_exprs=c@2 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ], - // this match arm cannot be reached - _ => unreachable!() - }; + let mut settings = Settings::clone_current(); + settings.add_filter(&format!("join_type={join_type}"), "join_type=..."); - // TODO(wiedld): show different test result if enforce distribution first. - test_config.run( - &expected_first_sort_enforcement, - top_join, - &SORT_DISTRIB_DISTRIB, - )?; - } - _ => {} + #[rustfmt::skip] + insta::allow_duplicates! { + settings.bind(|| { + let plan_distrib = test_config.to_plan(top_join.clone(), &DISTRIB_DISTRIB_SORT); + + match join_type { + // Should include 6 RepartitionExecs (3 hash, 3 round-robin), 3 SortExecs + JoinType::Inner | JoinType::Left | JoinType::LeftSemi | JoinType::LeftAnti => { + assert_plan!(plan_distrib, @r" +SortMergeJoin: join_type=..., on=[(a@0, c@2)] + SortMergeJoin: join_type=..., on=[(a@0, b1@1)] + SortExec: expr=[a@0 ASC], preserve_partitioning=[true] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + SortExec: expr=[b1@1 ASC], preserve_partitioning=[true] + RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + SortExec: expr=[c@2 ASC], preserve_partitioning=[true] + RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + } + // Should include 7 RepartitionExecs (4 hash, 3 round-robin), 4 SortExecs + // Since ordering of the left child is not preserved after SortMergeJoin + // when mode is Right, RightSemi, RightAnti, Full + // - We need to add one additional SortExec after SortMergeJoin in contrast the test cases + // when mode is Inner, Left, LeftSemi, LeftAnti + // Similarly, since partitioning of the left side is not preserved + // when mode is Right, RightSemi, RightAnti, Full + // - We need to add one additional Hash Repartition after SortMergeJoin in contrast the test + // cases when mode is Inner, Left, LeftSemi, LeftAnti + _ => { + assert_plan!(plan_distrib, @r" +SortMergeJoin: join_type=..., on=[(a@0, c@2)] + SortExec: expr=[a@0 ASC], preserve_partitioning=[true] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 + SortMergeJoin: join_type=..., on=[(a@0, b1@1)] + SortExec: expr=[a@0 ASC], preserve_partitioning=[true] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + SortExec: expr=[b1@1 ASC], preserve_partitioning=[true] + RepartitionExec: 
partitioning=Hash([b1@1], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + SortExec: expr=[c@2 ASC], preserve_partitioning=[true] + RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + } + } + + let plan_sort = test_config.to_plan(top_join.clone(), &SORT_DISTRIB_DISTRIB); + + match join_type { + // Should include 6 RepartitionExecs (3 hash, 3 round-robin), 3 SortExecs + JoinType::Inner | JoinType::Left | JoinType::LeftSemi | JoinType::LeftAnti => { + // TODO(wiedld): show different test result if enforce distribution first. + assert_plan!(plan_sort, @r" +SortMergeJoin: join_type=..., on=[(a@0, c@2)] + SortMergeJoin: join_type=..., on=[(a@0, b1@1)] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + SortExec: expr=[a@0 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10, preserve_order=true, sort_exprs=b1@1 ASC + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + SortExec: expr=[b1@1 ASC], preserve_partitioning=[false] + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10, preserve_order=true, sort_exprs=c@2 ASC + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + } + // Should include 8 RepartitionExecs (4 hash, 8 round-robin), 4 SortExecs + // Since ordering of the left child is not preserved after SortMergeJoin + // when mode is Right, RightSemi, RightAnti, Full + // - We need to add one additional SortExec after SortMergeJoin in contrast the test cases + // when mode is Inner, Left, LeftSemi, LeftAnti + // Similarly, since partitioning of the left side is not preserved + // when mode is Right, RightSemi, RightAnti, Full + // - We need to add one additional Hash Repartition and Roundrobin repartition after + // SortMergeJoin in contrast the test cases when mode is Inner, Left, LeftSemi, LeftAnti + _ => { + // TODO(wiedld): show different test result if enforce distribution first. 
+ assert_plan!(plan_sort, @r" +SortMergeJoin: join_type=..., on=[(a@0, c@2)] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + SortExec: expr=[a@0 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + SortMergeJoin: join_type=..., on=[(a@0, b1@1)] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + SortExec: expr=[a@0 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10, preserve_order=true, sort_exprs=b1@1 ASC + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + SortExec: expr=[b1@1 ASC], preserve_partitioning=[false] + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10, preserve_order=true, sort_exprs=c@2 ASC + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + } + } + + match join_type { + JoinType::Inner | JoinType::Left | JoinType::Right | JoinType::Full => { + // This time we use (b1 == c) for top join + // Join on (b1 == c) + let top_join_on = vec![( + Arc::new(Column::new_with_schema("b1", &join.schema()).unwrap()) as _, + Arc::new(Column::new_with_schema("c", &schema()).unwrap()) as _, + )]; + let top_join = sort_merge_join_exec(join, parquet_exec(), &top_join_on, &join_type); + + let plan_distrib = test_config.to_plan(top_join.clone(), &DISTRIB_DISTRIB_SORT); + + match join_type { + // Should include 6 RepartitionExecs(3 hash, 3 round-robin) and 3 SortExecs + JoinType::Inner | JoinType::Right => { + // TODO(wiedld): show different test result if enforce sorting first. + assert_plan!(plan_distrib, @r" +SortMergeJoin: join_type=..., on=[(b1@6, c@2)] + SortMergeJoin: join_type=..., on=[(a@0, b1@1)] + SortExec: expr=[a@0 ASC], preserve_partitioning=[true] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + SortExec: expr=[b1@1 ASC], preserve_partitioning=[true] + RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + SortExec: expr=[c@2 ASC], preserve_partitioning=[true] + RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + } + // Should include 7 RepartitionExecs (4 hash, 3 round-robin) and 4 SortExecs + JoinType::Left | JoinType::Full => { + // TODO(wiedld): show different test result if enforce sorting first. 
+ assert_plan!(plan_distrib, @r" +SortMergeJoin: join_type=..., on=[(b1@6, c@2)] + SortExec: expr=[b1@6 ASC], preserve_partitioning=[true] + RepartitionExec: partitioning=Hash([b1@6], 10), input_partitions=10 + SortMergeJoin: join_type=..., on=[(a@0, b1@1)] + SortExec: expr=[a@0 ASC], preserve_partitioning=[true] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + SortExec: expr=[b1@1 ASC], preserve_partitioning=[true] + RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + SortExec: expr=[c@2 ASC], preserve_partitioning=[true] + RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + } + // this match arm cannot be reached + _ => unreachable!() + } + + let plan_sort = test_config.to_plan(top_join, &SORT_DISTRIB_DISTRIB); + + match join_type { + // Should include 6 RepartitionExecs (3 of them preserves order) and 3 SortExecs + JoinType::Inner | JoinType::Right => { + // TODO(wiedld): show different test result if enforce distribution first. + assert_plan!(plan_sort, @r" +SortMergeJoin: join_type=..., on=[(b1@6, c@2)] + SortMergeJoin: join_type=..., on=[(a@0, b1@1)] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + SortExec: expr=[a@0 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10, preserve_order=true, sort_exprs=b1@1 ASC + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + SortExec: expr=[b1@1 ASC], preserve_partitioning=[false] + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10, preserve_order=true, sort_exprs=c@2 ASC + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + } + // Should include 8 RepartitionExecs (4 of them preserves order) and 4 SortExecs + JoinType::Left | JoinType::Full => { + // TODO(wiedld): show different test result if enforce distribution first. 
+ assert_plan!(plan_sort, @r" +SortMergeJoin: join_type=..., on=[(b1@6, c@2)] + RepartitionExec: partitioning=Hash([b1@6], 10), input_partitions=10, preserve_order=true, sort_exprs=b1@6 ASC + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + SortExec: expr=[b1@6 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + SortMergeJoin: join_type=..., on=[(a@0, b1@1)] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + SortExec: expr=[a@0 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10, preserve_order=true, sort_exprs=b1@1 ASC + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + SortExec: expr=[b1@1 ASC], preserve_partitioning=[false] + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10, preserve_order=true, sort_exprs=c@2 ASC + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + } + // this match arm cannot be reached + _ => unreachable!() + } + } + _ => {} + } + }); } } - Ok(()) } @@ -2667,46 +2592,51 @@ fn parallelization_compressed_csv() -> Result<()> { FileCompressionType::UNCOMPRESSED, ]; - let expected_not_partitioned = [ - "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2", - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - ]; - - let expected_partitioned = [ - "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2", - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - " DataSourceExec: file_groups={2 groups: [[x:0..50], [x:50..100]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - ]; + #[rustfmt::skip] + insta::allow_duplicates! 
{ + for compression_type in compression_types { + let plan = aggregate_exec_with_alias( + DataSourceExec::from_data_source( + FileScanConfigBuilder::new( + ObjectStoreUrl::parse("test:///").unwrap(), + schema(), + Arc::new(CsvSource::new(false, b',', b'"')), + ) + .with_file(PartitionedFile::new("x".to_string(), 100)) + .with_file_compression_type(compression_type) + .build(), + ), + vec![("a".to_string(), "a".to_string())], + ); + let test_config = TestConfig::default() + .with_query_execution_partitions(2) + .with_prefer_repartition_file_scans(10); + + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + if compression_type.is_compressed() { + // Compressed files cannot be partitioned + assert_plan!(plan_distrib, + @r" +AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false +"); + } else { + // Uncompressed files can be partitioned + assert_plan!(plan_distrib, + @r" +AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + DataSourceExec: file_groups={2 groups: [[x:0..50], [x:50..100]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false +"); + } - for compression_type in compression_types { - let expected = if compression_type.is_compressed() { - &expected_not_partitioned[..] - } else { - &expected_partitioned[..] - }; - - let plan = aggregate_exec_with_alias( - DataSourceExec::from_data_source( - FileScanConfigBuilder::new( - ObjectStoreUrl::parse("test:///").unwrap(), - schema(), - Arc::new(CsvSource::new(false, b',', b'"')), - ) - .with_file(PartitionedFile::new("x".to_string(), 100)) - .with_file_compression_type(compression_type) - .build(), - ), - vec![("a".to_string(), "a".to_string())], - ); - let test_config = TestConfig::default() - .with_query_execution_partitions(2) - .with_prefer_repartition_file_scans(10); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); + } } Ok(()) } From 7b5685baa7043f8a99ee6613050eeaaa575a50dc Mon Sep 17 00:00:00 2001 From: Vegard Stikbakke Date: Wed, 5 Nov 2025 12:27:04 +0100 Subject: [PATCH 0008/1589] Add benchmark for array_reverse (#18425) There's no benchmarks for `array_reverse`. I used this while working on #18424 to confirm `take` was faster than MutableData for ListView. That might be the case for other List types as well, which are currently using `MutableData`. The benchmark can be run with `cargo bench --bench array_reverse`. 
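As a rough, self-contained illustration of the `take` idea mentioned above (plain Rust vectors stand in for the arrow-rs arrays and the real `take` kernel, so the function and variable names here are purely illustrative): reversing every list in a list array can be expressed as one gather over the flat child values, instead of copying element ranges through a mutable builder.

```rust
// Toy model: a "list array" as offsets into a flat values buffer.
// Reversing each list becomes a single gather ("take") with per-list reversed indices.
fn reverse_lists_by_take(offsets: &[usize], values: &[i32]) -> Vec<i32> {
    let mut indices = Vec::with_capacity(values.len());
    for w in offsets.windows(2) {
        let (start, end) = (w[0], w[1]);
        // Within each list, emit the indices back to front.
        indices.extend((start..end).rev());
    }
    // The gather step: pick values at the computed indices.
    indices.into_iter().map(|i| values[i]).collect()
}

fn main() {
    // Two lists: [1, 2, 3] and [4, 5]
    let offsets = [0, 3, 5];
    let values = [1, 2, 3, 4, 5];
    assert_eq!(reverse_lists_by_take(&offsets, &values), vec![3, 2, 1, 5, 4]);
}
```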
--- datafusion/functions-nested/Cargo.toml | 4 + .../functions-nested/benches/array_reverse.rs | 78 +++++++++++++++++++ 2 files changed, 82 insertions(+) create mode 100644 datafusion/functions-nested/benches/array_reverse.rs diff --git a/datafusion/functions-nested/Cargo.toml b/datafusion/functions-nested/Cargo.toml index 9c0b7a16f9a9b..6e0d1048f9697 100644 --- a/datafusion/functions-nested/Cargo.toml +++ b/datafusion/functions-nested/Cargo.toml @@ -66,6 +66,10 @@ rand = { workspace = true } harness = false name = "array_expression" +[[bench]] +harness = false +name = "array_reverse" + [[bench]] harness = false name = "map" diff --git a/datafusion/functions-nested/benches/array_reverse.rs b/datafusion/functions-nested/benches/array_reverse.rs new file mode 100644 index 0000000000000..d4a63e36403af --- /dev/null +++ b/datafusion/functions-nested/benches/array_reverse.rs @@ -0,0 +1,78 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#[macro_use] +extern crate criterion; +extern crate arrow; + +use std::{hint::black_box, sync::Arc}; + +use crate::criterion::Criterion; +use arrow::{ + array::{ArrayRef, FixedSizeListArray, Int32Array, ListArray, ListViewArray}, + buffer::{OffsetBuffer, ScalarBuffer}, + datatypes::{DataType, Field}, +}; +use datafusion_functions_nested::reverse::array_reverse_inner; + +fn array_reverse(array: &ArrayRef) -> ArrayRef { + black_box(array_reverse_inner(std::slice::from_ref(array)).unwrap()) +} + +fn criterion_benchmark(c: &mut Criterion) { + // Construct large arrays for benchmarking + let array_len = 100000; + let step_size: usize = 1000; + let offsets: Vec = (0..array_len as i32).step_by(step_size).collect(); + let offsets = ScalarBuffer::from(offsets); + let sizes: Vec = vec![step_size as i32; array_len / step_size]; + let values = (0..array_len as i32).collect::>(); + let list_array: ArrayRef = Arc::new(ListArray::new( + Arc::new(Field::new("a", DataType::Int32, false)), + OffsetBuffer::new(offsets.clone()), + Arc::new(Int32Array::from(values.clone())), + None, + )); + let fixed_size_list_array: ArrayRef = Arc::new(FixedSizeListArray::new( + Arc::new(Field::new("a", DataType::Int32, false)), + step_size as i32, + Arc::new(Int32Array::from(values.clone())), + None, + )); + let list_view_array: ArrayRef = Arc::new(ListViewArray::new( + Arc::new(Field::new("a", DataType::Int32, false)), + offsets, + ScalarBuffer::from(sizes), + Arc::new(Int32Array::from(values)), + None, + )); + + c.bench_function("array_reverse_list", |b| { + b.iter(|| array_reverse(&list_array)) + }); + + c.bench_function("array_reverse_fixed_size_list", |b| { + b.iter(|| array_reverse(&fixed_size_list_array)) + }); + + c.bench_function("array_reverse_list_view", |b| { + b.iter(|| array_reverse(&list_view_array)) + }); +} + 
+criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); From 6852d502a8fac35a21efcb5a59b398208379fe0a Mon Sep 17 00:00:00 2001 From: Chen Chongchen Date: Thu, 6 Nov 2025 00:11:43 +0800 Subject: [PATCH 0009/1589] chore: simplify map const (#18440) ## Which issue does this PR close? ## Rationale for this change map const wont be simplified, the comment says "TODO: support the optimization for `Map` type after support impl hash for it", but it seems that hash is already supported for map. ## What changes are included in this PR? remove the todo ## Are these changes tested? UT ## Are there any user-facing changes? No --- .../simplify_expressions/expr_simplifier.rs | 27 ++----------------- .../sqllogictest/test_files/simplify_expr.slt | 11 ++++++++ 2 files changed, 13 insertions(+), 25 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 56fe95fffd150..05b8c28fadd6c 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -722,35 +722,12 @@ impl<'a> ConstEvaluator<'a> { } else { // Non-ListArray match ScalarValue::try_from_array(&a, 0) { - Ok(s) => { - // TODO: support the optimization for `Map` type after support impl hash for it - if matches!(&s, ScalarValue::Map(_)) { - ConstSimplifyResult::SimplifyRuntimeError( - DataFusionError::NotImplemented("Const evaluate for Map type is still not supported".to_string()), - expr, - ) - } else { - ConstSimplifyResult::Simplified(s, metadata) - } - } + Ok(s) => ConstSimplifyResult::Simplified(s, metadata), Err(err) => ConstSimplifyResult::SimplifyRuntimeError(err, expr), } } } - ColumnarValue::Scalar(s) => { - // TODO: support the optimization for `Map` type after support impl hash for it - if matches!(&s, ScalarValue::Map(_)) { - ConstSimplifyResult::SimplifyRuntimeError( - DataFusionError::NotImplemented( - "Const evaluate for Map type is still not supported" - .to_string(), - ), - expr, - ) - } else { - ConstSimplifyResult::Simplified(s, metadata) - } - } + ColumnarValue::Scalar(s) => ConstSimplifyResult::Simplified(s, metadata), } } } diff --git a/datafusion/sqllogictest/test_files/simplify_expr.slt b/datafusion/sqllogictest/test_files/simplify_expr.slt index c77163dc996dc..2387385369cb2 100644 --- a/datafusion/sqllogictest/test_files/simplify_expr.slt +++ b/datafusion/sqllogictest/test_files/simplify_expr.slt @@ -107,3 +107,14 @@ query B SELECT a / NULL::DECIMAL(4,3) > 1.2::decimal(2,1) FROM VALUES (1) AS t(a); ---- NULL + +query TT +explain SELECT CASE WHEN 1 > 0 THEN MAP {'x': 100} ELSE MAP {'y': 200} END AS a; +---- +logical_plan +01)Projection: Map([{"x":"100"}]) AS a +02)--EmptyRelation: rows=1 +physical_plan +01)ProjectionExec: expr=[[{x:100}] as a] +02)--PlaceholderRowExec + From 32d26187a358aaa0803867a7963cd149ec1fcd1e Mon Sep 17 00:00:00 2001 From: Pepijn Van Eeckhoudt Date: Wed, 5 Nov 2025 17:32:26 +0100 Subject: [PATCH 0010/1589] Avoid scatter operation in `ExpressionOrExpression` case evaluation method (#18444) ## Which issue does this PR close? - Part of #18075. ## Rationale for this change The `ExpressionOrExpression` case evaluation method currently uses `zip` to combine the `then` and `else` results for a batch. This requires a scatter operation to ensure the partial results are correctly lined up for the `zip` algorithm. By using a custom `merge` algorithm, this scatter step can be avoided. 
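To make the rationale concrete, here is a small self-contained sketch of merge-by-mask using plain Rust vectors; it is only a toy model of the approach, not the arrow-rs `zip` kernel nor the actual `merge` function added by this PR:

```rust
// `then_values` / `else_values` hold the results of evaluating each branch only on
// its own filtered rows, so they are already compacted (no scatter back to batch
// length). The merge walks the selection mask once and pulls from whichever side
// owns the next row.
fn merge_by_mask(
    mask: &[bool],
    then_values: &[Option<i32>],
    else_values: &[Option<i32>],
) -> Vec<Option<i32>> {
    let mut then_iter = then_values.iter();
    let mut else_iter = else_values.iter();
    mask.iter()
        .map(|&is_then| {
            if is_then {
                *then_iter.next().expect("a then value for every true slot")
            } else {
                *else_iter.next().expect("an else value for every false slot")
            }
        })
        .collect()
}

fn main() {
    // Rows 0 and 2 took the THEN branch, row 1 took the ELSE branch.
    let mask = [true, false, true];
    let then_values = [Some(1), Some(3)];
    let else_values = [Some(20)];
    assert_eq!(
        merge_by_mask(&mask, &then_values, &else_values),
        vec![Some(1), Some(20), Some(3)]
    );
}
```

With a `zip`-based combination, both compacted partial results would first have to be scattered back to full batch length so that they line up with the mask; the merge avoids that intermediate materialization.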
## What changes are included in this PR? - Introduce a zip variant that does not require prealigning truthy and falsy result values with the mask array ## Are these changes tested? Covered by existing case tests ## Are there any user-facing changes? No --- .../physical-expr/src/expressions/case.rs | 259 +++++++++++++----- 1 file changed, 191 insertions(+), 68 deletions(-) diff --git a/datafusion/physical-expr/src/expressions/case.rs b/datafusion/physical-expr/src/expressions/case.rs index 010df564a948a..7a33aa95c56b5 100644 --- a/datafusion/physical-expr/src/expressions/case.rs +++ b/datafusion/physical-expr/src/expressions/case.rs @@ -23,8 +23,9 @@ use arrow::array::*; use arrow::compute::kernels::zip::zip; use arrow::compute::{ is_not_null, not, nullif, prep_null_mask_filter, FilterBuilder, FilterPredicate, + SlicesIterator, }; -use arrow::datatypes::{DataType, Schema, UInt32Type}; +use arrow::datatypes::{DataType, Schema, UInt32Type, UnionMode}; use arrow::error::ArrowError; use datafusion_common::cast::as_boolean_array; use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRecursion}; @@ -246,13 +247,26 @@ fn is_cheap_and_infallible(expr: &Arc) -> bool { } /// Creates a [FilterPredicate] from a boolean array. -fn create_filter(predicate: &BooleanArray) -> FilterPredicate { +fn create_filter(predicate: &BooleanArray, optimize: bool) -> FilterPredicate { let mut filter_builder = FilterBuilder::new(predicate); - // Always optimize the filter since we use them multiple times. - filter_builder = filter_builder.optimize(); + if optimize { + // Always optimize the filter since we use them multiple times. + filter_builder = filter_builder.optimize(); + } filter_builder.build() } +fn multiple_arrays(data_type: &DataType) -> bool { + match data_type { + DataType::Struct(fields) => { + fields.len() > 1 + || fields.len() == 1 && multiple_arrays(fields[0].data_type()) + } + DataType::Union(fields, UnionMode::Sparse) => !fields.is_empty(), + _ => false, + } +} + // This should be removed when https://github.com/apache/arrow-rs/pull/8693 // is merged and becomes available. fn filter_record_batch( @@ -290,6 +304,84 @@ fn filter_array( filter.filter(array) } +fn merge( + mask: &BooleanArray, + truthy: ColumnarValue, + falsy: ColumnarValue, +) -> std::result::Result { + let (truthy, truthy_is_scalar) = match truthy { + ColumnarValue::Array(a) => (a, false), + ColumnarValue::Scalar(s) => (s.to_array()?, true), + }; + let (falsy, falsy_is_scalar) = match falsy { + ColumnarValue::Array(a) => (a, false), + ColumnarValue::Scalar(s) => (s.to_array()?, true), + }; + + if truthy_is_scalar && falsy_is_scalar { + return zip(mask, &Scalar::new(truthy), &Scalar::new(falsy)); + } + + let falsy = falsy.to_data(); + let truthy = truthy.to_data(); + + let mut mutable = MutableArrayData::new(vec![&truthy, &falsy], false, truthy.len()); + + // the SlicesIterator slices only the true values. So the gaps left by this iterator we need to + // fill with falsy values + + // keep track of how much is filled + let mut filled = 0; + let mut falsy_offset = 0; + let mut truthy_offset = 0; + + SlicesIterator::new(mask).for_each(|(start, end)| { + // the gap needs to be filled with falsy values + if start > filled { + if falsy_is_scalar { + for _ in filled..start { + // Copy the first item from the 'falsy' array into the output buffer. 
+ mutable.extend(1, 0, 1); + } + } else { + let falsy_length = start - filled; + let falsy_end = falsy_offset + falsy_length; + mutable.extend(1, falsy_offset, falsy_end); + falsy_offset = falsy_end; + } + } + // fill with truthy values + if truthy_is_scalar { + for _ in start..end { + // Copy the first item from the 'truthy' array into the output buffer. + mutable.extend(0, 0, 1); + } + } else { + let truthy_length = end - start; + let truthy_end = truthy_offset + truthy_length; + mutable.extend(0, truthy_offset, truthy_end); + truthy_offset = truthy_end; + } + filled = end; + }); + // the remaining part is falsy + if filled < mask.len() { + if falsy_is_scalar { + for _ in filled..mask.len() { + // Copy the first item from the 'falsy' array into the output buffer. + mutable.extend(1, 0, 1); + } + } else { + let falsy_length = mask.len() - filled; + let falsy_end = falsy_offset + falsy_length; + mutable.extend(1, falsy_offset, falsy_end); + } + } + + let data = mutable.freeze(); + Ok(make_array(data)) +} + /// Merges elements by index from a list of [`ArrayData`], creating a new [`ColumnarValue`] from /// those values. /// @@ -342,7 +434,7 @@ fn filter_array( /// └───────────┘ └─────────┘ └─────────┘ /// values indices result /// ``` -fn merge(values: &[ArrayData], indices: &[PartialResultIndex]) -> Result { +fn merge_n(values: &[ArrayData], indices: &[PartialResultIndex]) -> Result { #[cfg(debug_assertions)] for ix in indices { if let Some(index) = ix.index() { @@ -647,7 +739,7 @@ impl ResultBuilder { } Partial { arrays, indices } => { // Merge partial results into a single array. - Ok(ColumnarValue::Array(merge(&arrays, &indices)?)) + Ok(ColumnarValue::Array(merge_n(&arrays, &indices)?)) } Complete(v) => { // If we have a complete result, we can just return it. @@ -723,6 +815,26 @@ impl CaseExpr { } impl CaseBody { + fn data_type(&self, input_schema: &Schema) -> Result { + // since all then results have the same data type, we can choose any one as the + // return data type except for the null. + let mut data_type = DataType::Null; + for i in 0..self.when_then_expr.len() { + data_type = self.when_then_expr[i].1.data_type(input_schema)?; + if !data_type.equals_datatype(&DataType::Null) { + break; + } + } + // if all then results are null, we use data type of else expr instead if possible. + if data_type.equals_datatype(&DataType::Null) { + if let Some(e) = &self.else_expr { + data_type = e.data_type(input_schema)?; + } + } + + Ok(data_type) + } + /// See [CaseExpr::case_when_with_expr]. 
fn case_when_with_expr( &self, @@ -767,7 +879,7 @@ impl CaseBody { result_builder.add_branch_result(&remainder_rows, nulls_value)?; } else { // Filter out the null rows and evaluate the else expression for those - let nulls_filter = create_filter(¬(&base_not_nulls)?); + let nulls_filter = create_filter(¬(&base_not_nulls)?, true); let nulls_batch = filter_record_batch(&remainder_batch, &nulls_filter)?; let nulls_rows = filter_array(&remainder_rows, &nulls_filter)?; @@ -782,7 +894,7 @@ impl CaseBody { } // Remove the null rows from the remainder batch - let not_null_filter = create_filter(&base_not_nulls); + let not_null_filter = create_filter(&base_not_nulls, true); remainder_batch = Cow::Owned(filter_record_batch(&remainder_batch, ¬_null_filter)?); remainder_rows = filter_array(&remainder_rows, ¬_null_filter)?; @@ -802,8 +914,7 @@ impl CaseBody { compare_with_eq(&a, &base_values, base_value_is_nested) } ColumnarValue::Scalar(s) => { - let scalar = Scalar::new(s.to_array()?); - compare_with_eq(&scalar, &base_values, base_value_is_nested) + compare_with_eq(&s.to_scalar()?, &base_values, base_value_is_nested) } }?; @@ -829,7 +940,7 @@ impl CaseBody { // for the current branch // Still no need to call `prep_null_mask_filter` since `create_filter` will already do // this unconditionally. - let then_filter = create_filter(&when_value); + let then_filter = create_filter(&when_value, true); let then_batch = filter_record_batch(&remainder_batch, &then_filter)?; let then_rows = filter_array(&remainder_rows, &then_filter)?; @@ -852,7 +963,7 @@ impl CaseBody { not(&prep_null_mask_filter(&when_value)) } }?; - let next_filter = create_filter(&next_selection); + let next_filter = create_filter(&next_selection, true); remainder_batch = Cow::Owned(filter_record_batch(&remainder_batch, &next_filter)?); remainder_rows = filter_array(&remainder_rows, &next_filter)?; @@ -918,7 +1029,7 @@ impl CaseBody { // for the current branch // Still no need to call `prep_null_mask_filter` since `create_filter` will already do // this unconditionally. - let then_filter = create_filter(when_value); + let then_filter = create_filter(when_value, true); let then_batch = filter_record_batch(&remainder_batch, &then_filter)?; let then_rows = filter_array(&remainder_rows, &then_filter)?; @@ -941,7 +1052,7 @@ impl CaseBody { not(&prep_null_mask_filter(when_value)) } }?; - let next_filter = create_filter(&next_selection); + let next_filter = create_filter(&next_selection, true); remainder_batch = Cow::Owned(filter_record_batch(&remainder_batch, &next_filter)?); remainder_rows = filter_array(&remainder_rows, &next_filter)?; @@ -964,24 +1075,39 @@ impl CaseBody { &self, batch: &RecordBatch, when_value: &BooleanArray, - return_type: &DataType, ) -> Result { - let then_value = self.when_then_expr[0] - .1 - .evaluate_selection(batch, when_value)? 
- .into_array(batch.num_rows())?; + let when_value = match when_value.null_count() { + 0 => Cow::Borrowed(when_value), + _ => { + // `prep_null_mask_filter` is required to ensure null is treated as false + Cow::Owned(prep_null_mask_filter(when_value)) + } + }; + + let optimize_filter = batch.num_columns() > 1 + || (batch.num_columns() == 1 && multiple_arrays(batch.column(0).data_type())); + + let when_filter = create_filter(&when_value, optimize_filter); + let then_batch = filter_record_batch(batch, &when_filter)?; + let then_value = self.when_then_expr[0].1.evaluate(&then_batch)?; + + let else_selection = not(&when_value)?; + let else_filter = create_filter(&else_selection, optimize_filter); + let else_batch = filter_record_batch(batch, &else_filter)?; - // evaluate else expression on the values not covered by when_value - let remainder = not(when_value)?; - let e = self.else_expr.as_ref().unwrap(); // keep `else_expr`'s data type and return type consistent - let expr = try_cast(Arc::clone(e), &batch.schema(), return_type.clone()) + let e = self.else_expr.as_ref().unwrap(); + let return_type = self.data_type(&batch.schema())?; + let else_expr = try_cast(Arc::clone(e), &batch.schema(), return_type.clone()) .unwrap_or_else(|_| Arc::clone(e)); - let else_ = expr - .evaluate_selection(batch, &remainder)? - .into_array(batch.num_rows())?; - Ok(ColumnarValue::Array(zip(&remainder, &else_, &then_value)?)) + let else_value = else_expr.evaluate(&else_batch)?; + + Ok(ColumnarValue::Array(merge( + &when_value, + then_value, + else_value, + )?)) } } @@ -1113,11 +1239,12 @@ impl CaseExpr { batch: &RecordBatch, projected: &ProjectedCaseBody, ) -> Result { - let return_type = self.data_type(&batch.schema())?; - // evaluate when condition on batch let when_value = self.body.when_then_expr[0].0.evaluate(batch)?; - let when_value = when_value.into_array(batch.num_rows())?; + // `num_rows == 1` is intentional to avoid expanding scalars. + // If the `when_value` is effectively a scalar, the 'all true' and 'all false' checks + // below will avoid incorrectly using the scalar as a merge/zip mask. + let when_value = when_value.into_array(1)?; let when_value = as_boolean_array(&when_value).map_err(|e| { DataFusionError::Context( "WHEN expression did not return a BooleanArray".to_string(), @@ -1125,29 +1252,21 @@ impl CaseExpr { ) })?; - // For the true and false/null selection vectors, bypass `evaluate_selection` and merging - // results. This avoids materializing the array for the other branch which we will discard - // entirely anyway. let true_count = when_value.true_count(); - if true_count == batch.num_rows() { - return self.body.when_then_expr[0].1.evaluate(batch); + if true_count == when_value.len() { + // All input rows are true, just call the 'then' expression + self.body.when_then_expr[0].1.evaluate(batch) } else if true_count == 0 { - return self.body.else_expr.as_ref().unwrap().evaluate(batch); - } - - // Treat 'NULL' as false value - let when_value = match when_value.null_count() { - 0 => Cow::Borrowed(when_value), - _ => Cow::Owned(prep_null_mask_filter(when_value)), - }; - - if projected.projection.len() < batch.num_columns() { + // All input rows are false/null, just call the 'else' expression + self.body.else_expr.as_ref().unwrap().evaluate(batch) + } else if projected.projection.len() < batch.num_columns() { + // The case expressions do not use all the columns of the input batch. + // Project first to reduce time spent filtering. 
let projected_batch = batch.project(&projected.projection)?; - projected - .body - .expr_or_expr(&projected_batch, &when_value, &return_type) + projected.body.expr_or_expr(&projected_batch, when_value) } else { - self.body.expr_or_expr(batch, &when_value, &return_type) + // All columns are used in the case expressions, so there is no need to project. + self.body.expr_or_expr(batch, when_value) } } } @@ -1159,23 +1278,7 @@ impl PhysicalExpr for CaseExpr { } fn data_type(&self, input_schema: &Schema) -> Result { - // since all then results have the same data type, we can choose any one as the - // return data type except for the null. - let mut data_type = DataType::Null; - for i in 0..self.body.when_then_expr.len() { - data_type = self.body.when_then_expr[i].1.data_type(input_schema)?; - if !data_type.equals_datatype(&DataType::Null) { - break; - } - } - // if all then results are null, we use data type of else expr instead if possible. - if data_type.equals_datatype(&DataType::Null) { - if let Some(e) = &self.body.else_expr { - data_type = e.data_type(input_schema)?; - } - } - - Ok(data_type) + self.body.data_type(input_schema) } fn nullable(&self, input_schema: &Schema) -> Result { @@ -2140,7 +2243,7 @@ mod tests { } #[test] - fn test_merge() { + fn test_merge_n() { let a1 = StringArray::from(vec![Some("A")]).to_data(); let a2 = StringArray::from(vec![Some("B")]).to_data(); let a3 = StringArray::from(vec![Some("C"), Some("D")]).to_data(); @@ -2154,7 +2257,7 @@ mod tests { PartialResultIndex::try_new(2).unwrap(), ]; - let merged = merge(&[a1, a2, a3], &indices).unwrap(); + let merged = merge_n(&[a1, a2, a3], &indices).unwrap(); let merged = merged.as_string::(); assert_eq!(merged.len(), indices.len()); @@ -2169,4 +2272,24 @@ mod tests { assert!(merged.is_valid(5)); assert_eq!(merged.value(5), "D"); } + + #[test] + fn test_merge() { + let a1 = Arc::new(StringArray::from(vec![Some("A"), Some("C")])); + let a2 = Arc::new(StringArray::from(vec![Some("B")])); + + let mask = BooleanArray::from(vec![true, false, true]); + + let merged = + merge(&mask, ColumnarValue::Array(a1), ColumnarValue::Array(a2)).unwrap(); + let merged = merged.as_string::(); + + assert_eq!(merged.len(), mask.len()); + assert!(merged.is_valid(0)); + assert_eq!(merged.value(0), "A"); + assert!(merged.is_valid(1)); + assert_eq!(merged.value(1), "B"); + assert!(merged.is_valid(2)); + assert_eq!(merged.value(2), "C"); + } } From d679b3be4f7a0943dde5e58aa191b1d72fad648c Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Wed, 5 Nov 2025 20:09:24 +0000 Subject: [PATCH 0011/1589] Fix an out of date comment for `snapshot_physical_expr` (#18498) ## Which issue does this PR close? - Closes https://github.com/apache/datafusion/issues/18497 ## Rationale for this change Better docs ## What changes are included in this PR? Docs clarifications ## Are these changes tested? N/A ## Are there any user-facing changes? No --- datafusion/physical-expr-common/src/physical_expr.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/datafusion/physical-expr-common/src/physical_expr.rs b/datafusion/physical-expr-common/src/physical_expr.rs index 492383663d455..e8280409c0279 100644 --- a/datafusion/physical-expr-common/src/physical_expr.rs +++ b/datafusion/physical-expr-common/src/physical_expr.rs @@ -565,9 +565,8 @@ pub fn fmt_sql(expr: &dyn PhysicalExpr) -> impl Display + '_ { /// /// # Returns /// -/// Returns an `Option>` which is the snapshot of the -/// `PhysicalExpr` if it is dynamic. 
If the `PhysicalExpr` does not have -/// any dynamic references or state, it returns `None`. +/// Returns a snapshot of the `PhysicalExpr` if it is dynamic, otherwise +/// returns itself. pub fn snapshot_physical_expr( expr: Arc, ) -> Result> { From f2437d13c08283cb1dd1241b716c4e3a4e027648 Mon Sep 17 00:00:00 2001 From: Qi Zhu Date: Thu, 6 Nov 2025 11:17:54 +0800 Subject: [PATCH 0012/1589] Disable `parquet_encryption` by default in datafusion-sqllogictests (#18492) ## Which issue does this PR close? - Closes [#18490](https://github.com/apache/datafusion/issues/18490) ## Rationale for this change When using DataFusion in our internal project, we found that Parquet encryption was enabled by default, even with the latest DataFusion. Problem analysis: in `datafusion/sqllogictest/Cargo.toml`: ```toml datafusion = { workspace = true, default-features = true, features = ["avro", "parquet_encryption"] } ``` The problem: `datafusion-sqllogictest` depends on `datafusion` with `default-features = true` and enables `parquet_encryption`. Since `sqllogictest` is a member of the `datafusion` workspace, encryption ends up enabled by default when we use DataFusion: ```toml [workspace] members = [ "datafusion/common", "datafusion/common-runtime", "datafusion/catalog", "datafusion/catalog-listing", "datafusion/datasource", "datafusion/datasource-avro", "datafusion/datasource-csv", "datafusion/datasource-json", "datafusion/datasource-parquet", "datafusion/core", "datafusion/expr", "datafusion/expr-common", "datafusion/execution", "datafusion/ffi", "datafusion/functions", "datafusion/functions-aggregate", "datafusion/functions-aggregate-common", "datafusion/functions-table", "datafusion/functions-nested", "datafusion/functions-window", "datafusion/functions-window-common", "datafusion/optimizer", "datafusion/physical-expr", "datafusion/physical-expr-adapter", "datafusion/physical-expr-common", "datafusion/physical-optimizer", "datafusion/pruning", "datafusion/physical-plan", "datafusion/proto", "datafusion/proto/gen", "datafusion/proto-common", "datafusion/proto-common/gen", "datafusion/session", "datafusion/spark", "datafusion/sql", "datafusion/sqllogictest", "datafusion/substrait", "datafusion-cli", "datafusion-examples", "datafusion-examples/examples/ffi/ffi_example_table_provider", "datafusion-examples/examples/ffi/ffi_module_interface", "datafusion-examples/examples/ffi/ffi_module_loader", "test-utils", "benchmarks", "datafusion/macros", "datafusion/doc", ] exclude = ["dev/depcheck"] resolver = "2" ``` ## What changes are included in this PR? Fixes the issue described above. ## Are these changes tested? Yes ## Are there any user-facing changes? Parquet encryption is now an opt-in feature instead of being enabled by default.
--- .github/workflows/extended.yml | 2 +- .github/workflows/rust.yml | 2 +- datafusion/sqllogictest/Cargo.toml | 5 ++++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/extended.yml b/.github/workflows/extended.yml index 2472d2e0424fd..19d3e723c64cd 100644 --- a/.github/workflows/extended.yml +++ b/.github/workflows/extended.yml @@ -169,7 +169,7 @@ jobs: rust-version: stable - name: Run sqllogictest run: | - cargo test --features backtrace --profile release-nonlto --test sqllogictests -- --include-sqlite + cargo test --features backtrace,parquet_encryption --profile release-nonlto --test sqllogictests -- --include-sqlite cargo clean diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 88d9f4e13378c..4b3c31e6b3b0c 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -475,7 +475,7 @@ jobs: export RUST_MIN_STACK=20971520 export TPCH_DATA=`realpath datafusion/sqllogictest/test_files/tpch/data` cargo test plan_q --package datafusion-benchmarks --profile ci --features=ci -- --test-threads=1 - INCLUDE_TPCH=true cargo test --features backtrace --profile ci --package datafusion-sqllogictest --test sqllogictests + INCLUDE_TPCH=true cargo test --features backtrace,parquet_encryption --profile ci --package datafusion-sqllogictest --test sqllogictests - name: Verify Working Directory Clean run: git diff --exit-code diff --git a/datafusion/sqllogictest/Cargo.toml b/datafusion/sqllogictest/Cargo.toml index e719a8851df7c..9cf397270100f 100644 --- a/datafusion/sqllogictest/Cargo.toml +++ b/datafusion/sqllogictest/Cargo.toml @@ -43,7 +43,7 @@ bigdecimal = { workspace = true } bytes = { workspace = true, optional = true } chrono = { workspace = true, optional = true } clap = { version = "4.5.50", features = ["derive", "env"] } -datafusion = { workspace = true, default-features = true, features = ["avro", "parquet_encryption"] } +datafusion = { workspace = true, default-features = true, features = ["avro"] } datafusion-spark = { workspace = true, default-features = true } datafusion-substrait = { workspace = true, default-features = true } futures = { workspace = true } @@ -78,6 +78,9 @@ postgres = [ "testcontainers-modules", "tokio-postgres", ] +parquet_encryption = [ + "datafusion/parquet_encryption", +] [dev-dependencies] env_logger = { workspace = true } From d3a25fe82a1f57b009b8992d527761d86043b054 Mon Sep 17 00:00:00 2001 From: Qi Zhu Date: Thu, 6 Nov 2025 16:17:26 +0800 Subject: [PATCH 0013/1589] Make extended test to use optional parquet_encryption feature (#18507) ## Which issue does this PR close? Now we make parquet_encryption optional feature - Closes https://github.com/apache/datafusion/pull/18492#issuecomment-3495097113 ## Rationale for this change Make the extended test to add this feature. ## What changes are included in this PR? ## Are these changes tested? ## Are there any user-facing changes? 
--- .github/workflows/extended.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/extended.yml b/.github/workflows/extended.yml index 19d3e723c64cd..85e40731a9592 100644 --- a/.github/workflows/extended.yml +++ b/.github/workflows/extended.yml @@ -124,7 +124,7 @@ jobs: --lib \ --tests \ --bins \ - --features avro,json,backtrace,extended_tests,recursive_protection + --features avro,json,backtrace,extended_tests,recursive_protection,parquet_encryption - name: Verify Working Directory Clean run: git diff --exit-code - name: Cleanup From 1ac6625c4f3c8c07028d28479a186953a9d06d4c Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 6 Nov 2025 06:02:30 -0500 Subject: [PATCH 0014/1589] Update roadmap links for DataFusion Q1 2026 (#18495) ## Which issue does this PR close? N/A ## Rationale for this change I started a new roadmap discussion, so let's also link it in the docs ## What changes are included in this PR? Add a link to - https://github.com/apache/datafusion/issues/18494 ## Are these changes tested? by C I ## Are there any user-facing changes? new link in docs --- docs/source/contributor-guide/roadmap.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/source/contributor-guide/roadmap.md b/docs/source/contributor-guide/roadmap.md index 073682008047d..aac0710dadf77 100644 --- a/docs/source/contributor-guide/roadmap.md +++ b/docs/source/contributor-guide/roadmap.md @@ -55,8 +55,9 @@ discussion. For more information: 1. [Search for issues labeled `roadmap`](https://github.com/apache/datafusion/issues?q=is%3Aissue%20%20%20roadmap) -2. [DataFusion Road Map: Q3-Q4 2025](https://github.com/apache/datafusion/issues/15878) -3. [2024 Q4 / 2025 Q1 Roadmap](https://github.com/apache/datafusion/issues/13274) +2. [DataFusion Road Map: Q1 2026](https://github.com/apache/datafusion/issues/18494) +3. [DataFusion Road Map: Q3-Q4 2025](https://github.com/apache/datafusion/issues/15878) +4. [2024 Q4 / 2025 Q1 Roadmap](https://github.com/apache/datafusion/issues/13274) ## Improvement Proposals From a5eb9121ccf802dda547897155403b08a4fbf774 Mon Sep 17 00:00:00 2001 From: Sergey Zhukov <62326549+cj-zhukov@users.noreply.github.com> Date: Thu, 6 Nov 2025 15:33:00 +0300 Subject: [PATCH 0015/1589] Consolidate udf examples (#18142) (#18493) ## Which issue does this PR close? - part of #https://github.com/apache/datafusion/issues/18142. ## Rationale for this change This PR is for consolidating all the `udf` examples into a single example binary. We are agreed on the pattern and we can apply it to the remaining examples ## What changes are included in this PR? ## Are these changes tested? ## Are there any user-facing changes? 
--------- Co-authored-by: Sergey Zhukov --- datafusion-examples/README.md | 15 +- .../examples/{ => udf}/advanced_udaf.rs | 5 +- .../examples/{ => udf}/advanced_udf.rs | 55 ++++---- .../examples/{ => udf}/advanced_udwf.rs | 5 +- .../examples/{ => udf}/async_udf.rs | 5 +- datafusion-examples/examples/udf/main.rs | 133 ++++++++++++++++++ .../examples/{ => udf}/simple_udaf.rs | 5 +- .../examples/{ => udf}/simple_udf.rs | 3 +- .../examples/{ => udf}/simple_udtf.rs | 3 +- .../examples/{ => udf}/simple_udwf.rs | 3 +- 10 files changed, 183 insertions(+), 49 deletions(-) rename datafusion-examples/examples/{ => udf}/advanced_udaf.rs (98%) rename datafusion-examples/examples/{ => udf}/advanced_udf.rs (99%) rename datafusion-examples/examples/{ => udf}/advanced_udwf.rs (98%) rename datafusion-examples/examples/{ => udf}/async_udf.rs (98%) create mode 100644 datafusion-examples/examples/udf/main.rs rename datafusion-examples/examples/{ => udf}/simple_udaf.rs (97%) rename datafusion-examples/examples/{ => udf}/simple_udf.rs (99%) rename datafusion-examples/examples/{ => udf}/simple_udtf.rs (99%) rename datafusion-examples/examples/{ => udf}/simple_udwf.rs (99%) diff --git a/datafusion-examples/README.md b/datafusion-examples/README.md index f6783a643f76e..f87f62e170af0 100644 --- a/datafusion-examples/README.md +++ b/datafusion-examples/README.md @@ -46,11 +46,11 @@ cargo run --example dataframe ## Single Process -- [`advanced_udaf.rs`](examples/advanced_udaf.rs): Define and invoke a more complicated User Defined Aggregate Function (UDAF) -- [`advanced_udf.rs`](examples/advanced_udf.rs): Define and invoke a more complicated User Defined Scalar Function (UDF) -- [`advanced_udwf.rs`](examples/advanced_udwf.rs): Define and invoke a more complicated User Defined Window Function (UDWF) +- [`examples/udf/advanced_udaf.rs`](examples/udf/advanced_udaf.rs): Define and invoke a more complicated User Defined Aggregate Function (UDAF) +- [`examples/udf/advanced_udf.rs`](examples/udf/advanced_udf.rs): Define and invoke a more complicated User Defined Scalar Function (UDF) +- [`examples/udf/advanced_udwf.rs`](examples/udf/advanced_udwf.rs): Define and invoke a more complicated User Defined Window Function (UDWF) - [`advanced_parquet_index.rs`](examples/advanced_parquet_index.rs): Creates a detailed secondary index that covers the contents of several parquet files -- [`async_udf.rs`](examples/async_udf.rs): Define and invoke an asynchronous User Defined Scalar Function (UDF) +- [`examples/udf/async_udf.rs`](examples/udf/async_udf.rs): Define and invoke an asynchronous User Defined Scalar Function (UDF) - [`analyzer_rule.rs`](examples/analyzer_rule.rs): Use a custom AnalyzerRule to change a query's semantics (row level access control) - [`catalog.rs`](examples/catalog.rs): Register the table into a custom catalog - [`composed_extension_codec`](examples/composed_extension_codec.rs): Example of using multiple extension codecs for serialization / deserialization @@ -83,9 +83,10 @@ cargo run --example dataframe - [`query-http-csv.rs`](examples/query-http-csv.rs): Configure `object_store` and run a query against files vi HTTP - [`regexp.rs`](examples/regexp.rs): Examples of using regular expression functions - [`remote_catalog.rs`](examples/regexp.rs): Examples of interfacing with a remote catalog (e.g. 
over a network) -- [`simple_udaf.rs`](examples/simple_udaf.rs): Define and invoke a User Defined Aggregate Function (UDAF) -- [`simple_udf.rs`](examples/simple_udf.rs): Define and invoke a User Defined Scalar Function (UDF) -- [`simple_udfw.rs`](examples/simple_udwf.rs): Define and invoke a User Defined Window Function (UDWF) +- [`examples/udf/simple_udaf.rs`](examples/udf/simple_udaf.rs): Define and invoke a User Defined Aggregate Function (UDAF) +- [`examples/udf/simple_udf.rs`](examples/udf/simple_udf.rs): Define and invoke a User Defined Scalar Function (UDF) +- [`examples/udf/simple_udtf.rs`](examples/udf/simple_udtf.rs): Define and invoke a User Defined Table Function (UDTF) +- [`examples/udf/simple_udfw.rs`](examples/udf/simple_udwf.rs): Define and invoke a User Defined Window Function (UDWF) - [`sql_analysis.rs`](examples/sql_analysis.rs): Analyse SQL queries with DataFusion structures - [`sql_frontend.rs`](examples/sql_frontend.rs): Create LogicalPlans (only) from sql strings - [`sql_dialect.rs`](examples/sql_dialect.rs): Example of implementing a custom SQL dialect on top of `DFParser` diff --git a/datafusion-examples/examples/advanced_udaf.rs b/datafusion-examples/examples/udf/advanced_udaf.rs similarity index 98% rename from datafusion-examples/examples/advanced_udaf.rs rename to datafusion-examples/examples/udf/advanced_udaf.rs index 89f0a470e32e4..81e227bfacee4 100644 --- a/datafusion-examples/examples/advanced_udaf.rs +++ b/datafusion-examples/examples/udf/advanced_udaf.rs @@ -469,8 +469,9 @@ fn create_context() -> Result { Ok(ctx) } -#[tokio::main] -async fn main() -> Result<()> { +/// In this example we register `GeoMeanUdaf` and `SimplifiedGeoMeanUdaf` +/// as user defined aggregate functions and invoke them via the DataFrame API and SQL +pub async fn advanced_udaf() -> Result<()> { let ctx = create_context()?; let geo_mean_udf = AggregateUDF::from(GeoMeanUdaf::new()); diff --git a/datafusion-examples/examples/advanced_udf.rs b/datafusion-examples/examples/udf/advanced_udf.rs similarity index 99% rename from datafusion-examples/examples/advanced_udf.rs rename to datafusion-examples/examples/udf/advanced_udf.rs index 56ae599efa11b..bb5a68e90cbbe 100644 --- a/datafusion-examples/examples/advanced_udf.rs +++ b/datafusion-examples/examples/udf/advanced_udf.rs @@ -245,10 +245,35 @@ fn maybe_pow_in_place(base: f64, exp_array: ArrayRef) -> Result { } } +/// create local execution context with an in-memory table: +/// +/// ```text +/// +-----+-----+ +/// | a | b | +/// +-----+-----+ +/// | 2.1 | 1.0 | +/// | 3.1 | 2.0 | +/// | 4.1 | 3.0 | +/// | 5.1 | 4.0 | +/// +-----+-----+ +/// ``` +fn create_context() -> Result { + // define data. + let a: ArrayRef = Arc::new(Float32Array::from(vec![2.1, 3.1, 4.1, 5.1])); + let b: ArrayRef = Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0])); + let batch = RecordBatch::try_from_iter(vec![("a", a), ("b", b)])?; + + // declare a new context. In Spark API, this corresponds to a new SparkSession + let ctx = SessionContext::new(); + + // declare a table in memory. In Spark API, this corresponds to createDataFrame(...). 
+ ctx.register_batch("t", batch)?; + Ok(ctx) +} + /// In this example we register `PowUdf` as a user defined function /// and invoke it via the DataFrame API and SQL -#[tokio::main] -async fn main() -> Result<()> { +pub async fn advanced_udf() -> Result<()> { let ctx = create_context()?; // create the UDF @@ -295,29 +320,3 @@ async fn main() -> Result<()> { Ok(()) } - -/// create local execution context with an in-memory table: -/// -/// ```text -/// +-----+-----+ -/// | a | b | -/// +-----+-----+ -/// | 2.1 | 1.0 | -/// | 3.1 | 2.0 | -/// | 4.1 | 3.0 | -/// | 5.1 | 4.0 | -/// +-----+-----+ -/// ``` -fn create_context() -> Result { - // define data. - let a: ArrayRef = Arc::new(Float32Array::from(vec![2.1, 3.1, 4.1, 5.1])); - let b: ArrayRef = Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0])); - let batch = RecordBatch::try_from_iter(vec![("a", a), ("b", b)])?; - - // declare a new context. In Spark API, this corresponds to a new SparkSession - let ctx = SessionContext::new(); - - // declare a table in memory. In Spark API, this corresponds to createDataFrame(...). - ctx.register_batch("t", batch)?; - Ok(ctx) -} diff --git a/datafusion-examples/examples/advanced_udwf.rs b/datafusion-examples/examples/udf/advanced_udwf.rs similarity index 98% rename from datafusion-examples/examples/advanced_udwf.rs rename to datafusion-examples/examples/udf/advanced_udwf.rs index ba4c377fd6762..86f215e019c78 100644 --- a/datafusion-examples/examples/advanced_udwf.rs +++ b/datafusion-examples/examples/udf/advanced_udwf.rs @@ -236,8 +236,9 @@ async fn create_context() -> Result { Ok(ctx) } -#[tokio::main] -async fn main() -> Result<()> { +/// In this example we register `SmoothItUdf` as user defined window function +/// and invoke it via the DataFrame API and SQL +pub async fn advanced_udwf() -> Result<()> { let ctx = create_context().await?; let smooth_it = WindowUDF::from(SmoothItUdf::new()); ctx.register_udwf(smooth_it.clone()); diff --git a/datafusion-examples/examples/async_udf.rs b/datafusion-examples/examples/udf/async_udf.rs similarity index 98% rename from datafusion-examples/examples/async_udf.rs rename to datafusion-examples/examples/udf/async_udf.rs index b52ec68ea4422..475775a599f62 100644 --- a/datafusion-examples/examples/async_udf.rs +++ b/datafusion-examples/examples/udf/async_udf.rs @@ -38,8 +38,9 @@ use datafusion::prelude::{SessionConfig, SessionContext}; use std::any::Any; use std::sync::Arc; -#[tokio::main] -async fn main() -> Result<()> { +/// In this example we register `AskLLM` as an asynchronous user defined function +/// and invoke it via the DataFrame API and SQL +pub async fn async_udf() -> Result<()> { // Use a hard coded parallelism level of 4 so the explain plan // is consistent across machines. let config = SessionConfig::new().with_target_partitions(4); diff --git a/datafusion-examples/examples/udf/main.rs b/datafusion-examples/examples/udf/main.rs new file mode 100644 index 0000000000000..ba36dbb15c58b --- /dev/null +++ b/datafusion-examples/examples/udf/main.rs @@ -0,0 +1,133 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! # User-Defined Functions Examples +//! +//! These examples demonstrate user-defined functions in DataFusion. +//! +//! Each subcommand runs a corresponding example: +//! - `adv_udaf` — user defined aggregate function example +//! - `adv_udf` — user defined scalar function example +//! - `adv_udwf` — user defined window function example +//! - `async_udf` — asynchronous user defined function example +//! - `udaf` — simple user defined aggregate function example +//! - `udf` — simple user defined scalar function example +//! - `udtf` — simple user defined table function example +//! - `udwf` — simple user defined window function example + +mod advanced_udaf; +mod advanced_udf; +mod advanced_udwf; +mod async_udf; +mod simple_udaf; +mod simple_udf; +mod simple_udtf; +mod simple_udwf; + +use std::str::FromStr; + +use datafusion::error::{DataFusionError, Result}; + +enum ExampleKind { + AdvUdaf, + AdvUdf, + AdvUdwf, + AsyncUdf, + Udf, + Udaf, + Udwf, + Udtf, +} + +impl AsRef for ExampleKind { + fn as_ref(&self) -> &str { + match self { + Self::AdvUdaf => "adv_udaf", + Self::AdvUdf => "adv_udf", + Self::AdvUdwf => "adv_udwf", + Self::AsyncUdf => "async_udf", + Self::Udf => "udf", + Self::Udaf => "udaf", + Self::Udwf => "udwt", + Self::Udtf => "udtf", + } + } +} + +impl FromStr for ExampleKind { + type Err = DataFusionError; + + fn from_str(s: &str) -> Result { + match s { + "adv_udaf" => Ok(Self::AdvUdaf), + "adv_udf" => Ok(Self::AdvUdf), + "adv_udwf" => Ok(Self::AdvUdwf), + "async_udf" => Ok(Self::AsyncUdf), + "udaf" => Ok(Self::Udaf), + "udf" => Ok(Self::Udf), + "udtf" => Ok(Self::Udtf), + "udwf" => Ok(Self::Udwf), + _ => Err(DataFusionError::Execution(format!("Unknown example: {s}"))), + } + } +} + +impl ExampleKind { + const ALL: [Self; 8] = [ + Self::AdvUdaf, + Self::AdvUdf, + Self::AdvUdwf, + Self::AsyncUdf, + Self::Udaf, + Self::Udf, + Self::Udtf, + Self::Udwf, + ]; + + const EXAMPLE_NAME: &str = "udf"; + + fn variants() -> Vec<&'static str> { + Self::ALL.iter().map(|x| x.as_ref()).collect() + } +} + +#[tokio::main] +async fn main() -> Result<()> { + let usage = format!( + "Usage: cargo run --example {} -- [{}]", + ExampleKind::EXAMPLE_NAME, + ExampleKind::variants().join("|") + ); + + let arg = std::env::args().nth(1).ok_or_else(|| { + eprintln!("{usage}"); + DataFusionError::Execution("Missing argument".to_string()) + })?; + + match arg.parse::()? 
{ + ExampleKind::AdvUdaf => advanced_udaf::advanced_udaf().await?, + ExampleKind::AdvUdf => advanced_udf::advanced_udf().await?, + ExampleKind::AdvUdwf => advanced_udwf::advanced_udwf().await?, + ExampleKind::AsyncUdf => async_udf::async_udf().await?, + ExampleKind::Udaf => simple_udaf::simple_udaf().await?, + ExampleKind::Udf => simple_udf::simple_udf().await?, + ExampleKind::Udtf => simple_udtf::simple_udtf().await?, + ExampleKind::Udwf => simple_udwf::simple_udwf().await?, + } + + Ok(()) +} diff --git a/datafusion-examples/examples/simple_udaf.rs b/datafusion-examples/examples/udf/simple_udaf.rs similarity index 97% rename from datafusion-examples/examples/simple_udaf.rs rename to datafusion-examples/examples/udf/simple_udaf.rs index 82bde7c034a57..e9f905e720997 100644 --- a/datafusion-examples/examples/simple_udaf.rs +++ b/datafusion-examples/examples/udf/simple_udaf.rs @@ -135,8 +135,9 @@ impl Accumulator for GeometricMean { } } -#[tokio::main] -async fn main() -> Result<()> { +/// In this example we register `GeometricMean` +/// as user defined aggregate function and invoke it via the DataFrame API and SQL +pub async fn simple_udaf() -> Result<()> { let ctx = create_context()?; // here is where we define the UDAF. We also declare its signature: diff --git a/datafusion-examples/examples/simple_udf.rs b/datafusion-examples/examples/udf/simple_udf.rs similarity index 99% rename from datafusion-examples/examples/simple_udf.rs rename to datafusion-examples/examples/udf/simple_udf.rs index 5612e0939f709..7d4f3588e313f 100644 --- a/datafusion-examples/examples/simple_udf.rs +++ b/datafusion-examples/examples/udf/simple_udf.rs @@ -57,8 +57,7 @@ fn create_context() -> Result { } /// In this example we will declare a single-type, single return type UDF that exponentiates f64, a^b -#[tokio::main] -async fn main() -> Result<()> { +pub async fn simple_udf() -> Result<()> { let ctx = create_context()?; // First, declare the actual implementation of the calculation diff --git a/datafusion-examples/examples/simple_udtf.rs b/datafusion-examples/examples/udf/simple_udtf.rs similarity index 99% rename from datafusion-examples/examples/simple_udtf.rs rename to datafusion-examples/examples/udf/simple_udtf.rs index b65ffb8d71748..a03b157134aea 100644 --- a/datafusion-examples/examples/simple_udtf.rs +++ b/datafusion-examples/examples/udf/simple_udtf.rs @@ -42,8 +42,7 @@ use std::sync::Arc; // 3. Register the function using [`SessionContext::register_udtf`] /// This example demonstrates how to register a TableFunction -#[tokio::main] -async fn main() -> Result<()> { +pub async fn simple_udtf() -> Result<()> { // create local execution context let ctx = SessionContext::new(); diff --git a/datafusion-examples/examples/simple_udwf.rs b/datafusion-examples/examples/udf/simple_udwf.rs similarity index 99% rename from datafusion-examples/examples/simple_udwf.rs rename to datafusion-examples/examples/udf/simple_udwf.rs index 1736ff00bd700..2cf1df8d8ed86 100644 --- a/datafusion-examples/examples/simple_udwf.rs +++ b/datafusion-examples/examples/udf/simple_udwf.rs @@ -42,8 +42,7 @@ async fn create_context() -> Result { } /// In this example we will declare a user defined window function that computes a moving average and then run it using SQL -#[tokio::main] -async fn main() -> Result<()> { +pub async fn simple_udwf() -> Result<()> { let ctx = create_context().await?; // here is where we define the UDWF. 
We also declare its signature: From 7591919be7e6582ed7f6a8d0b033f7a0a8ad60f7 Mon Sep 17 00:00:00 2001 From: Khanh Duong Date: Fri, 7 Nov 2025 11:22:44 +0900 Subject: [PATCH 0016/1589] test: add prepare alias slt test (#18522) ## Which issue does this PR close? This did not work (https://github.com/apache/datafusion/issues/18102#issuecomment-3412355785): ```sql DataFusion CLI v50.3.0 > prepare myplan as select $1 as one, $2 as two; Schema error: No field named one. ``` But now it does (https://github.com/apache/datafusion/issues/18102#issuecomment-3493941636): ```sql DataFusion CLI v50.3.0 > PREPARE my_plan(INT, INT) AS SELECT $1 AS one, $2 AS two; 0 row(s) fetched. Elapsed 0.004 seconds. > ``` ## Rationale for this change This PR add a testcase for placeholder aliases. ## What changes are included in this PR? Add test case in `prepare.slt` ## Are these changes tested? Yes ## Are there any user-facing changes? No --- .../sqllogictest/test_files/prepare.slt | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/datafusion/sqllogictest/test_files/prepare.slt b/datafusion/sqllogictest/test_files/prepare.slt index d61603ae65588..486baca6f54d6 100644 --- a/datafusion/sqllogictest/test_files/prepare.slt +++ b/datafusion/sqllogictest/test_files/prepare.slt @@ -327,3 +327,35 @@ EXECUTE my_plan('a', 'b'); ---- 1 a 2 b + +statement ok +SET datafusion.explain.logical_plan_only=false; + +statement ok +DEALLOCATE my_plan + +statement ok +SET datafusion.explain.logical_plan_only=true; + +# Prepare with alias +query TT +EXPLAIN PREPARE my_plan(INT, INT) AS SELECT $1 AS one, $2 AS two; +---- +logical_plan +01)Prepare: "my_plan" [Int32, Int32] +02)--Projection: $1 AS one, $2 AS two +03)----EmptyRelation: rows=1 + +statement ok +PREPARE my_plan(INT, INT) AS SELECT $1 AS one, $2 AS two; + +query II +EXECUTE my_plan(1, 2) +---- +1 2 + +statement ok +SET datafusion.explain.logical_plan_only=false; + +statement ok +DEALLOCATE my_plan From f32984b2dbf9e5a193c20643ce624167295fbd61 Mon Sep 17 00:00:00 2001 From: Yongting You <2010youy01@gmail.com> Date: Fri, 7 Nov 2025 18:51:03 +0800 Subject: [PATCH 0017/1589] CI: add `clippy::needless_pass_by_value` rule (#18468) ## Which issue does this PR close? An initial attempt towards https://github.com/apache/datafusion/issues/18467 ## Rationale for this change ### Rationale for the additional lint rule `clippy::needless_pass_by_value` There is a clippy lint rule that is not turned on by the current strictness level in CI: https://rust-lang.github.io/rust-clippy/master/index.html#needless_pass_by_value Note it has the `Clippy` category `pedantic`, and its description is `lints which are rather strict or have occasional false positives` from https://doc.rust-lang.org/nightly/clippy It seems we have been suffering from the excessive copying issue for quite some time, and @alamb is on the front line now https://github.com/apache/datafusion/issues/18413. I think this extra lint rule is able to help. ### Implementation plan This PR only enables this rule in `datafusion-common` package, and apply `#[allow(clippy::needless_pass_by_value)]` for all violations. If this PR makes sense, we can open a tracking issue and roll out this check to the remaining workspace packages. At least this can help prevent new inefficient patterns and identify existing issues that we can fix gradually. ## What changes are included in this PR? ## Are these changes tested? ## Are there any user-facing changes? 
--- ci/scripts/rust_clippy.sh | 2 +- datafusion/common/src/hash_utils.rs | 18 +++++++++--------- datafusion/common/src/lib.rs | 5 +++++ datafusion/common/src/scalar/mod.rs | 11 +++++------ datafusion/common/src/scalar/struct_builder.rs | 1 + datafusion/expr/src/logical_plan/plan.rs | 4 ++-- 6 files changed, 23 insertions(+), 18 deletions(-) diff --git a/ci/scripts/rust_clippy.sh b/ci/scripts/rust_clippy.sh index 1557bd56eab4a..6a00ad8109561 100755 --- a/ci/scripts/rust_clippy.sh +++ b/ci/scripts/rust_clippy.sh @@ -18,4 +18,4 @@ # under the License. set -ex -cargo clippy --all-targets --workspace --features avro,pyarrow,integration-tests,extended_tests -- -D warnings +cargo clippy --all-targets --workspace --features avro,pyarrow,integration-tests,extended_tests -- -D warnings \ No newline at end of file diff --git a/datafusion/common/src/hash_utils.rs b/datafusion/common/src/hash_utils.rs index 4b18351f708b7..b4488c770d8d4 100644 --- a/datafusion/common/src/hash_utils.rs +++ b/datafusion/common/src/hash_utils.rs @@ -141,7 +141,7 @@ fn hash_array_primitive( /// with the new hash using `combine_hashes` #[cfg(not(feature = "force_hash_collisions"))] fn hash_array( - array: T, + array: &T, random_state: &RandomState, hashes_buffer: &mut [u64], rehash: bool, @@ -400,16 +400,16 @@ pub fn create_hashes<'a>( downcast_primitive_array! { array => hash_array_primitive(array, random_state, hashes_buffer, rehash), DataType::Null => hash_null(random_state, hashes_buffer, rehash), - DataType::Boolean => hash_array(as_boolean_array(array)?, random_state, hashes_buffer, rehash), - DataType::Utf8 => hash_array(as_string_array(array)?, random_state, hashes_buffer, rehash), - DataType::Utf8View => hash_array(as_string_view_array(array)?, random_state, hashes_buffer, rehash), - DataType::LargeUtf8 => hash_array(as_largestring_array(array), random_state, hashes_buffer, rehash), - DataType::Binary => hash_array(as_generic_binary_array::(array)?, random_state, hashes_buffer, rehash), - DataType::BinaryView => hash_array(as_binary_view_array(array)?, random_state, hashes_buffer, rehash), - DataType::LargeBinary => hash_array(as_generic_binary_array::(array)?, random_state, hashes_buffer, rehash), + DataType::Boolean => hash_array(&as_boolean_array(array)?, random_state, hashes_buffer, rehash), + DataType::Utf8 => hash_array(&as_string_array(array)?, random_state, hashes_buffer, rehash), + DataType::Utf8View => hash_array(&as_string_view_array(array)?, random_state, hashes_buffer, rehash), + DataType::LargeUtf8 => hash_array(&as_largestring_array(array), random_state, hashes_buffer, rehash), + DataType::Binary => hash_array(&as_generic_binary_array::(array)?, random_state, hashes_buffer, rehash), + DataType::BinaryView => hash_array(&as_binary_view_array(array)?, random_state, hashes_buffer, rehash), + DataType::LargeBinary => hash_array(&as_generic_binary_array::(array)?, random_state, hashes_buffer, rehash), DataType::FixedSizeBinary(_) => { let array: &FixedSizeBinaryArray = array.as_any().downcast_ref().unwrap(); - hash_array(array, random_state, hashes_buffer, rehash) + hash_array(&array, random_state, hashes_buffer, rehash) } DataType::Dictionary(_, _) => downcast_dictionary_array! 
{ array => hash_dictionary(array, random_state, hashes_buffer, rehash)?, diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs index 76c7b46e32737..c8d5a30ee3e0b 100644 --- a/datafusion/common/src/lib.rs +++ b/datafusion/common/src/lib.rs @@ -23,6 +23,11 @@ // Make sure fast / cheap clones on Arc are explicit: // https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] +// https://github.com/apache/datafusion/issues/18503 +#![deny(clippy::needless_pass_by_value)] +// This lint rule is enforced in `../Cargo.toml`, but it's okay to skip them in tests +// See details in https://github.com/apache/datafusion/issues/18503 +#![cfg_attr(test, allow(clippy::needless_pass_by_value))] mod column; mod dfschema; diff --git a/datafusion/common/src/scalar/mod.rs b/datafusion/common/src/scalar/mod.rs index 188a169a3dd2f..f2e18f7de8f53 100644 --- a/datafusion/common/src/scalar/mod.rs +++ b/datafusion/common/src/scalar/mod.rs @@ -4648,9 +4648,9 @@ impl fmt::Display for ScalarValue { } None => write!(f, "NULL")?, }, - ScalarValue::List(arr) => fmt_list(arr.to_owned() as ArrayRef, f)?, - ScalarValue::LargeList(arr) => fmt_list(arr.to_owned() as ArrayRef, f)?, - ScalarValue::FixedSizeList(arr) => fmt_list(arr.to_owned() as ArrayRef, f)?, + ScalarValue::List(arr) => fmt_list(arr.as_ref(), f)?, + ScalarValue::LargeList(arr) => fmt_list(arr.as_ref(), f)?, + ScalarValue::FixedSizeList(arr) => fmt_list(arr.as_ref(), f)?, ScalarValue::Date32(e) => format_option!( f, e.map(|v| { @@ -4772,12 +4772,11 @@ impl fmt::Display for ScalarValue { } } -fn fmt_list(arr: ArrayRef, f: &mut fmt::Formatter) -> fmt::Result { +fn fmt_list(arr: &dyn Array, f: &mut fmt::Formatter) -> fmt::Result { // ScalarValue List, LargeList, FixedSizeList should always have a single element assert_eq!(arr.len(), 1); let options = FormatOptions::default().with_display_error(true); - let formatter = - ArrayFormatter::try_new(arr.as_ref() as &dyn Array, &options).unwrap(); + let formatter = ArrayFormatter::try_new(arr, &options).unwrap(); let value_formatter = formatter.value(0); write!(f, "{value_formatter}") } diff --git a/datafusion/common/src/scalar/struct_builder.rs b/datafusion/common/src/scalar/struct_builder.rs index 56daee904514a..045b5778243df 100644 --- a/datafusion/common/src/scalar/struct_builder.rs +++ b/datafusion/common/src/scalar/struct_builder.rs @@ -83,6 +83,7 @@ impl ScalarStructBuilder { } /// Add the specified field and `ScalarValue` to the struct. + #[expect(clippy::needless_pass_by_value)] // Skip for public API's compatibility pub fn with_scalar(self, field: impl IntoFieldRef, value: ScalarValue) -> Self { // valid scalar value should not fail let array = value.to_array().unwrap(); diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 0f0d81186d68f..0b89a5250902e 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -1156,7 +1156,7 @@ impl LogicalPlan { /// Helper for [Self::with_new_exprs] to use when no expressions are expected. 
#[inline] - #[allow(clippy::needless_pass_by_value)] // expr is moved intentionally to ensure it's not used again + #[expect(clippy::needless_pass_by_value)] // expr is moved intentionally to ensure it's not used again fn assert_no_expressions(&self, expr: Vec) -> Result<()> { if !expr.is_empty() { return internal_err!("{self:?} should have no exprs, got {:?}", expr); @@ -1166,7 +1166,7 @@ impl LogicalPlan { /// Helper for [Self::with_new_exprs] to use when no inputs are expected. #[inline] - #[allow(clippy::needless_pass_by_value)] // inputs is moved intentionally to ensure it's not used again + #[expect(clippy::needless_pass_by_value)] // inputs is moved intentionally to ensure it's not used again fn assert_no_inputs(&self, inputs: Vec) -> Result<()> { if !inputs.is_empty() { return internal_err!("{self:?} should have no inputs, got: {:?}", inputs); From a899ca0c8626b7d43d1caff7f5057c7e9e7e7a3e Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Fri, 7 Nov 2025 12:03:28 -0600 Subject: [PATCH 0018/1589] Refactor create_hashes to accept array references (#18448) ## Background This PR is part of an EPIC to push down hash table references from HashJoinExec into scans. The EPIC is tracked in https://github.com/apache/datafusion/issues/17171. A "target state" is tracked in https://github.com/apache/datafusion/pull/18393. There is a series of PRs to get us to this target state in smaller more reviewable changes that are still valuable on their own: - (This PR): https://github.com/apache/datafusion/pull/18448 - https://github.com/apache/datafusion/pull/18449 (depends on https://github.com/apache/datafusion/pull/18448) - https://github.com/apache/datafusion/pull/18451 ## Changes in this PR Change create_hashes and related functions to work with &dyn Array references instead of requiring ArrayRef (Arc-wrapped arrays). This avoids unnecessary Arc::clone() calls and enables calls that only have an &dyn Array to use the hashing utilities. - Add create_hashes_from_arrays(&[&dyn Array]) function - Refactor hash_dictionary, hash_list_array, hash_fixed_list_array to use references instead of cloning - Extract hash_single_array() helper for common logic --------- Co-authored-by: Andrew Lamb --- datafusion/common/src/hash_utils.rs | 278 ++++++++++++------ datafusion/common/src/scalar/mod.rs | 4 +- .../physical-expr-common/src/binary_map.rs | 2 +- .../src/binary_view_map.rs | 5 +- .../physical-plan/src/joins/hash_join/exec.rs | 24 +- 5 files changed, 200 insertions(+), 113 deletions(-) diff --git a/datafusion/common/src/hash_utils.rs b/datafusion/common/src/hash_utils.rs index b4488c770d8d4..d60189fb6fa3f 100644 --- a/datafusion/common/src/hash_utils.rs +++ b/datafusion/common/src/hash_utils.rs @@ -17,9 +17,6 @@ //! Functionality used both on logical and physical plans -#[cfg(not(feature = "force_hash_collisions"))] -use std::sync::Arc; - use ahash::RandomState; use arrow::array::types::{IntervalDayTime, IntervalMonthDayNano}; use arrow::array::*; @@ -215,12 +212,11 @@ fn hash_dictionary( // Hash each dictionary value once, and then use that computed // hash for each key value to avoid a potentially expensive // redundant hashing for large dictionary elements (e.g. 
strings) - let dict_values = Arc::clone(array.values()); + let dict_values = array.values(); let mut dict_hashes = vec![0; dict_values.len()]; - create_hashes(&[dict_values], random_state, &mut dict_hashes)?; + create_hashes([dict_values], random_state, &mut dict_hashes)?; // combine hash for each index in values - let dict_values = array.values(); for (hash, key) in hashes_buffer.iter_mut().zip(array.keys().iter()) { if let Some(key) = key { let idx = key.as_usize(); @@ -308,11 +304,11 @@ fn hash_list_array( where OffsetSize: OffsetSizeTrait, { - let values = Arc::clone(array.values()); + let values = array.values(); let offsets = array.value_offsets(); let nulls = array.nulls(); let mut values_hashes = vec![0u64; values.len()]; - create_hashes(&[values], random_state, &mut values_hashes)?; + create_hashes([values], random_state, &mut values_hashes)?; if let Some(nulls) = nulls { for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() { if nulls.is_valid(i) { @@ -339,11 +335,11 @@ fn hash_fixed_list_array( random_state: &RandomState, hashes_buffer: &mut [u64], ) -> Result<()> { - let values = Arc::clone(array.values()); + let values = array.values(); let value_length = array.value_length() as usize; let nulls = array.nulls(); let mut values_hashes = vec![0u64; values.len()]; - create_hashes(&[values], random_state, &mut values_hashes)?; + create_hashes([values], random_state, &mut values_hashes)?; if let Some(nulls) = nulls { for i in 0..array.len() { if nulls.is_valid(i) { @@ -366,83 +362,132 @@ fn hash_fixed_list_array( Ok(()) } -/// Test version of `create_hashes` that produces the same value for -/// all hashes (to test collisions) -/// -/// See comments on `hashes_buffer` for more details +/// Internal helper function that hashes a single array and either initializes or combines +/// the hash values in the buffer. +#[cfg(not(feature = "force_hash_collisions"))] +fn hash_single_array( + array: &dyn Array, + random_state: &RandomState, + hashes_buffer: &mut [u64], + rehash: bool, +) -> Result<()> { + downcast_primitive_array! { + array => hash_array_primitive(array, random_state, hashes_buffer, rehash), + DataType::Null => hash_null(random_state, hashes_buffer, rehash), + DataType::Boolean => hash_array(&as_boolean_array(array)?, random_state, hashes_buffer, rehash), + DataType::Utf8 => hash_array(&as_string_array(array)?, random_state, hashes_buffer, rehash), + DataType::Utf8View => hash_array(&as_string_view_array(array)?, random_state, hashes_buffer, rehash), + DataType::LargeUtf8 => hash_array(&as_largestring_array(array), random_state, hashes_buffer, rehash), + DataType::Binary => hash_array(&as_generic_binary_array::(array)?, random_state, hashes_buffer, rehash), + DataType::BinaryView => hash_array(&as_binary_view_array(array)?, random_state, hashes_buffer, rehash), + DataType::LargeBinary => hash_array(&as_generic_binary_array::(array)?, random_state, hashes_buffer, rehash), + DataType::FixedSizeBinary(_) => { + let array: &FixedSizeBinaryArray = array.as_any().downcast_ref().unwrap(); + hash_array(&array, random_state, hashes_buffer, rehash) + } + DataType::Dictionary(_, _) => downcast_dictionary_array! 
{ + array => hash_dictionary(array, random_state, hashes_buffer, rehash)?, + _ => unreachable!() + } + DataType::Struct(_) => { + let array = as_struct_array(array)?; + hash_struct_array(array, random_state, hashes_buffer)?; + } + DataType::List(_) => { + let array = as_list_array(array)?; + hash_list_array(array, random_state, hashes_buffer)?; + } + DataType::LargeList(_) => { + let array = as_large_list_array(array)?; + hash_list_array(array, random_state, hashes_buffer)?; + } + DataType::Map(_, _) => { + let array = as_map_array(array)?; + hash_map_array(array, random_state, hashes_buffer)?; + } + DataType::FixedSizeList(_,_) => { + let array = as_fixed_size_list_array(array)?; + hash_fixed_list_array(array, random_state, hashes_buffer)?; + } + _ => { + // This is internal because we should have caught this before. + return _internal_err!( + "Unsupported data type in hasher: {}", + array.data_type() + ); + } + } + Ok(()) +} + +/// Test version of `hash_single_array` that forces all hashes to collide to zero. #[cfg(feature = "force_hash_collisions")] -pub fn create_hashes<'a>( - _arrays: &[ArrayRef], +fn hash_single_array( + _array: &dyn Array, _random_state: &RandomState, - hashes_buffer: &'a mut Vec, -) -> Result<&'a mut Vec> { + hashes_buffer: &mut [u64], + _rehash: bool, +) -> Result<()> { for hash in hashes_buffer.iter_mut() { *hash = 0 } - Ok(hashes_buffer) + Ok(()) +} + +/// Something that can be returned as a `&dyn Array`. +/// +/// We want `create_hashes` to accept either `&dyn Array` or `ArrayRef`, +/// and this seems the best way to do so. +/// +/// We tried having it accept `AsRef` +/// but that is not implemented for and cannot be implemented for +/// `&dyn Array` so callers that have the latter would not be able +/// to call `create_hashes` directly. This shim trait makes it possible. +pub trait AsDynArray { + fn as_dyn_array(&self) -> &dyn Array; +} + +impl AsDynArray for dyn Array { + fn as_dyn_array(&self) -> &dyn Array { + self + } +} + +impl AsDynArray for &dyn Array { + fn as_dyn_array(&self) -> &dyn Array { + *self + } +} + +impl AsDynArray for ArrayRef { + fn as_dyn_array(&self) -> &dyn Array { + self.as_ref() + } } -/// Creates hash values for every row, based on the values in the -/// columns. +impl AsDynArray for &ArrayRef { + fn as_dyn_array(&self) -> &dyn Array { + self.as_ref() + } +} + +/// Creates hash values for every row, based on the values in the columns. /// /// The number of rows to hash is determined by `hashes_buffer.len()`. -/// `hashes_buffer` should be pre-sized appropriately -#[cfg(not(feature = "force_hash_collisions"))] -pub fn create_hashes<'a>( - arrays: &[ArrayRef], +/// `hashes_buffer` should be pre-sized appropriately. +pub fn create_hashes<'a, I, T>( + arrays: I, random_state: &RandomState, hashes_buffer: &'a mut Vec, -) -> Result<&'a mut Vec> { - for (i, col) in arrays.iter().enumerate() { - let array = col.as_ref(); +) -> Result<&'a mut Vec> +where + I: IntoIterator, + T: AsDynArray, +{ + for (i, array) in arrays.into_iter().enumerate() { // combine hashes with `combine_hashes` for all columns besides the first let rehash = i >= 1; - downcast_primitive_array! 
{ - array => hash_array_primitive(array, random_state, hashes_buffer, rehash), - DataType::Null => hash_null(random_state, hashes_buffer, rehash), - DataType::Boolean => hash_array(&as_boolean_array(array)?, random_state, hashes_buffer, rehash), - DataType::Utf8 => hash_array(&as_string_array(array)?, random_state, hashes_buffer, rehash), - DataType::Utf8View => hash_array(&as_string_view_array(array)?, random_state, hashes_buffer, rehash), - DataType::LargeUtf8 => hash_array(&as_largestring_array(array), random_state, hashes_buffer, rehash), - DataType::Binary => hash_array(&as_generic_binary_array::(array)?, random_state, hashes_buffer, rehash), - DataType::BinaryView => hash_array(&as_binary_view_array(array)?, random_state, hashes_buffer, rehash), - DataType::LargeBinary => hash_array(&as_generic_binary_array::(array)?, random_state, hashes_buffer, rehash), - DataType::FixedSizeBinary(_) => { - let array: &FixedSizeBinaryArray = array.as_any().downcast_ref().unwrap(); - hash_array(&array, random_state, hashes_buffer, rehash) - } - DataType::Dictionary(_, _) => downcast_dictionary_array! { - array => hash_dictionary(array, random_state, hashes_buffer, rehash)?, - _ => unreachable!() - } - DataType::Struct(_) => { - let array = as_struct_array(array)?; - hash_struct_array(array, random_state, hashes_buffer)?; - } - DataType::List(_) => { - let array = as_list_array(array)?; - hash_list_array(array, random_state, hashes_buffer)?; - } - DataType::LargeList(_) => { - let array = as_large_list_array(array)?; - hash_list_array(array, random_state, hashes_buffer)?; - } - DataType::Map(_, _) => { - let array = as_map_array(array)?; - hash_map_array(array, random_state, hashes_buffer)?; - } - DataType::FixedSizeList(_,_) => { - let array = as_fixed_size_list_array(array)?; - hash_fixed_list_array(array, random_state, hashes_buffer)?; - } - _ => { - // This is internal because we should have caught this before. 
- return _internal_err!( - "Unsupported data type in hasher: {}", - col.data_type() - ); - } - } + hash_single_array(array.as_dyn_array(), random_state, hashes_buffer, rehash)?; } Ok(hashes_buffer) } @@ -465,7 +510,7 @@ mod tests { .collect::() .with_precision_and_scale(20, 3) .unwrap(); - let array_ref = Arc::new(array); + let array_ref: ArrayRef = Arc::new(array); let random_state = RandomState::with_seeds(0, 0, 0, 0); let hashes_buff = &mut vec![0; array_ref.len()]; let hashes = create_hashes(&[array_ref], &random_state, hashes_buff)?; @@ -478,15 +523,21 @@ mod tests { let empty_array = FixedSizeListBuilder::new(StringBuilder::new(), 1).finish(); let random_state = RandomState::with_seeds(0, 0, 0, 0); let hashes_buff = &mut vec![0; 0]; - let hashes = create_hashes(&[Arc::new(empty_array)], &random_state, hashes_buff)?; + let hashes = create_hashes( + &[Arc::new(empty_array) as ArrayRef], + &random_state, + hashes_buff, + )?; assert_eq!(hashes, &Vec::::new()); Ok(()) } #[test] fn create_hashes_for_float_arrays() -> Result<()> { - let f32_arr = Arc::new(Float32Array::from(vec![0.12, 0.5, 1f32, 444.7])); - let f64_arr = Arc::new(Float64Array::from(vec![0.12, 0.5, 1f64, 444.7])); + let f32_arr: ArrayRef = + Arc::new(Float32Array::from(vec![0.12, 0.5, 1f32, 444.7])); + let f64_arr: ArrayRef = + Arc::new(Float64Array::from(vec![0.12, 0.5, 1f64, 444.7])); let random_state = RandomState::with_seeds(0, 0, 0, 0); let hashes_buff = &mut vec![0; f32_arr.len()]; @@ -514,8 +565,10 @@ mod tests { Some(b"Longer than 12 bytes string"), ]; - let binary_array = Arc::new(binary.iter().cloned().collect::<$ARRAY>()); - let ref_array = Arc::new(binary.iter().cloned().collect::()); + let binary_array: ArrayRef = + Arc::new(binary.iter().cloned().collect::<$ARRAY>()); + let ref_array: ArrayRef = + Arc::new(binary.iter().cloned().collect::()); let random_state = RandomState::with_seeds(0, 0, 0, 0); @@ -553,7 +606,7 @@ mod tests { #[test] fn create_hashes_fixed_size_binary() -> Result<()> { let input_arg = vec![vec![1, 2], vec![5, 6], vec![5, 6]]; - let fixed_size_binary_array = + let fixed_size_binary_array: ArrayRef = Arc::new(FixedSizeBinaryArray::try_from_iter(input_arg.into_iter()).unwrap()); let random_state = RandomState::with_seeds(0, 0, 0, 0); @@ -580,8 +633,9 @@ mod tests { Some("Longer than 12 bytes string"), ]; - let string_array = Arc::new(strings.iter().cloned().collect::<$ARRAY>()); - let dict_array = Arc::new( + let string_array: ArrayRef = + Arc::new(strings.iter().cloned().collect::<$ARRAY>()); + let dict_array: ArrayRef = Arc::new( strings .iter() .cloned() @@ -629,8 +683,9 @@ mod tests { fn create_hashes_for_dict_arrays() { let strings = [Some("foo"), None, Some("bar"), Some("foo"), None]; - let string_array = Arc::new(strings.iter().cloned().collect::()); - let dict_array = Arc::new( + let string_array: ArrayRef = + Arc::new(strings.iter().cloned().collect::()); + let dict_array: ArrayRef = Arc::new( strings .iter() .cloned() @@ -865,8 +920,9 @@ mod tests { let strings1 = [Some("foo"), None, Some("bar")]; let strings2 = [Some("blarg"), Some("blah"), None]; - let string_array = Arc::new(strings1.iter().cloned().collect::()); - let dict_array = Arc::new( + let string_array: ArrayRef = + Arc::new(strings1.iter().cloned().collect::()); + let dict_array: ArrayRef = Arc::new( strings2 .iter() .cloned() @@ -896,4 +952,52 @@ mod tests { assert_ne!(one_col_hashes, two_col_hashes); } + + #[test] + fn test_create_hashes_from_arrays() { + let int_array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 
4])); + let float_array: ArrayRef = + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0])); + + let random_state = RandomState::with_seeds(0, 0, 0, 0); + let hashes_buff = &mut vec![0; int_array.len()]; + let hashes = + create_hashes(&[int_array, float_array], &random_state, hashes_buff).unwrap(); + assert_eq!(hashes.len(), 4,); + } + + #[test] + fn test_create_hashes_from_dyn_arrays() { + let int_array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4])); + let float_array: ArrayRef = + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0])); + + // Verify that we can call create_hashes with only &dyn Array + fn test(arr1: &dyn Array, arr2: &dyn Array) { + let random_state = RandomState::with_seeds(0, 0, 0, 0); + let hashes_buff = &mut vec![0; arr1.len()]; + let hashes = create_hashes([arr1, arr2], &random_state, hashes_buff).unwrap(); + assert_eq!(hashes.len(), 4,); + } + test(&*int_array, &*float_array); + } + + #[test] + fn test_create_hashes_equivalence() { + let array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4])); + let random_state = RandomState::with_seeds(0, 0, 0, 0); + + let mut hashes1 = vec![0; array.len()]; + create_hashes( + &[Arc::clone(&array) as ArrayRef], + &random_state, + &mut hashes1, + ) + .unwrap(); + + let mut hashes2 = vec![0; array.len()]; + create_hashes([array], &random_state, &mut hashes2).unwrap(); + + assert_eq!(hashes1, hashes2); + } } diff --git a/datafusion/common/src/scalar/mod.rs b/datafusion/common/src/scalar/mod.rs index f2e18f7de8f53..52e0159111249 100644 --- a/datafusion/common/src/scalar/mod.rs +++ b/datafusion/common/src/scalar/mod.rs @@ -878,10 +878,10 @@ impl Hash for ScalarValue { fn hash_nested_array(arr: ArrayRef, state: &mut H) { let len = arr.len(); - let arrays = vec![arr]; let hashes_buffer = &mut vec![0; len]; let random_state = ahash::RandomState::with_seeds(0, 0, 0, 0); - let hashes = create_hashes(&arrays, &random_state, hashes_buffer).unwrap(); + let hashes = create_hashes(&[arr], &random_state, hashes_buffer) + .expect("hash_nested_array: failed to create row hashes"); // Hash back to std::hash::Hasher hashes.hash(state); } diff --git a/datafusion/physical-expr-common/src/binary_map.rs b/datafusion/physical-expr-common/src/binary_map.rs index b37d9a7773eeb..24bc430630598 100644 --- a/datafusion/physical-expr-common/src/binary_map.rs +++ b/datafusion/physical-expr-common/src/binary_map.rs @@ -349,7 +349,7 @@ where let batch_hashes = &mut self.hashes_buffer; batch_hashes.clear(); batch_hashes.resize(values.len(), 0); - create_hashes(&[Arc::clone(values)], &self.random_state, batch_hashes) + create_hashes([values], &self.random_state, batch_hashes) // hash is supported for all types and create_hashes only // returns errors for unsupported types .unwrap(); diff --git a/datafusion/physical-expr-common/src/binary_view_map.rs b/datafusion/physical-expr-common/src/binary_view_map.rs index 7ce943030a453..2de563472c789 100644 --- a/datafusion/physical-expr-common/src/binary_view_map.rs +++ b/datafusion/physical-expr-common/src/binary_view_map.rs @@ -19,6 +19,7 @@ //! `StringViewArray`/`BinaryViewArray`. //! Much of the code is from `binary_map.rs`, but with simpler implementation because we directly use the //! [`GenericByteViewBuilder`]. 
+use crate::binary_map::OutputType; use ahash::RandomState; use arrow::array::cast::AsArray; use arrow::array::{Array, ArrayBuilder, ArrayRef, GenericByteViewBuilder}; @@ -28,8 +29,6 @@ use datafusion_common::utils::proxy::{HashTableAllocExt, VecAllocExt}; use std::fmt::Debug; use std::sync::Arc; -use crate::binary_map::OutputType; - /// HashSet optimized for storing string or binary values that can produce that /// the final set as a `GenericBinaryViewArray` with minimal copies. #[derive(Debug)] @@ -243,7 +242,7 @@ where let batch_hashes = &mut self.hashes_buffer; batch_hashes.clear(); batch_hashes.resize(values.len(), 0); - create_hashes(&[Arc::clone(values)], &self.random_state, batch_hashes) + create_hashes([values], &self.random_state, batch_hashes) // hash is supported for all types and create_hashes only // returns errors for unsupported types .unwrap(); diff --git a/datafusion/physical-plan/src/joins/hash_join/exec.rs b/datafusion/physical-plan/src/joins/hash_join/exec.rs index 0a582bd911cb0..c552e6954c8f9 100644 --- a/datafusion/physical-plan/src/joins/hash_join/exec.rs +++ b/datafusion/physical-plan/src/joins/hash_join/exec.rs @@ -3452,11 +3452,7 @@ mod tests { let random_state = RandomState::with_seeds(0, 0, 0, 0); let hashes_buff = &mut vec![0; left.num_rows()]; - let hashes = create_hashes( - &[Arc::clone(&left.columns()[0])], - &random_state, - hashes_buff, - )?; + let hashes = create_hashes([&left.columns()[0]], &random_state, hashes_buff)?; // Maps both values to both indices (1 and 2, representing input 0 and 1) // 0 -> (0, 1) @@ -3485,11 +3481,7 @@ mod tests { let right_keys_values = key_column.evaluate(&right)?.into_array(right.num_rows())?; let mut hashes_buffer = vec![0; right.num_rows()]; - create_hashes( - &[Arc::clone(&right_keys_values)], - &random_state, - &mut hashes_buffer, - )?; + create_hashes([&right_keys_values], &random_state, &mut hashes_buffer)?; let (l, r, _) = lookup_join_hashmap( &join_hash_map, @@ -3523,11 +3515,7 @@ mod tests { let random_state = RandomState::with_seeds(0, 0, 0, 0); let hashes_buff = &mut vec![0; left.num_rows()]; - let hashes = create_hashes( - &[Arc::clone(&left.columns()[0])], - &random_state, - hashes_buff, - )?; + let hashes = create_hashes([&left.columns()[0]], &random_state, hashes_buff)?; hashmap_left.insert_unique(hashes[0], (hashes[0], 1u32), |(h, _)| *h); hashmap_left.insert_unique(hashes[0], (hashes[0], 2u32), |(h, _)| *h); @@ -3550,11 +3538,7 @@ mod tests { let right_keys_values = key_column.evaluate(&right)?.into_array(right.num_rows())?; let mut hashes_buffer = vec![0; right.num_rows()]; - create_hashes( - &[Arc::clone(&right_keys_values)], - &random_state, - &mut hashes_buffer, - )?; + create_hashes([&right_keys_values], &random_state, &mut hashes_buffer)?; let (l, r, _) = lookup_join_hashmap( &join_hash_map, From d02642ee0230bd0cf851a6505926c27cb4d28e79 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Fri, 7 Nov 2025 15:38:46 -0600 Subject: [PATCH 0019/1589] Add a SpillingPool to manage collections of spill files (#18207) Addresses https://github.com/apache/datafusion/pull/18014#discussion_r2422164629, potentially paves the path to solve https://github.com/apache/datafusion/issues/18011 for other operators as well --------- Co-authored-by: Yongting You <2010youy01@gmail.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- datafusion/common/src/config.rs | 17 + datafusion/execution/src/disk_manager.rs | 253 ++- 
.../physical-plan/src/repartition/mod.rs | 893 ++++++++--- .../src/spill/in_progress_spill_file.rs | 6 + datafusion/physical-plan/src/spill/mod.rs | 5 + .../physical-plan/src/spill/spill_manager.rs | 5 + .../physical-plan/src/spill/spill_pool.rs | 1425 +++++++++++++++++ .../test_files/information_schema.slt | 2 + docs/source/user-guide/configs.md | 1 + 9 files changed, 2352 insertions(+), 255 deletions(-) create mode 100644 datafusion/physical-plan/src/spill/spill_pool.rs diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index f4afdf7002078..0ed499da04757 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -517,6 +517,23 @@ config_namespace! { /// batches and merged. pub sort_in_place_threshold_bytes: usize, default = 1024 * 1024 + /// Maximum size in bytes for individual spill files before rotating to a new file. + /// + /// When operators spill data to disk (e.g., RepartitionExec), they write + /// multiple batches to the same file until this size limit is reached, then rotate + /// to a new file. This reduces syscall overhead compared to one-file-per-batch + /// while preventing files from growing too large. + /// + /// A larger value reduces file creation overhead but may hold more disk space. + /// A smaller value creates more files but allows finer-grained space reclamation + /// as files can be deleted once fully consumed. + /// + /// Now only `RepartitionExec` supports this spill file rotation feature, other spilling operators + /// may create spill files larger than the limit. + /// + /// Default: 128 MB + pub max_spill_file_size_bytes: usize, default = 128 * 1024 * 1024 + /// Number of files to read in parallel when inferring schema and statistics pub meta_fetch_concurrency: usize, default = 32 diff --git a/datafusion/execution/src/disk_manager.rs b/datafusion/execution/src/disk_manager.rs index c3aa1bfa2958c..c2923d6112a6c 100644 --- a/datafusion/execution/src/disk_manager.rs +++ b/datafusion/execution/src/disk_manager.rs @@ -283,11 +283,13 @@ impl DiskManager { let dir_index = rng().random_range(0..local_dirs.len()); Ok(RefCountedTempFile { - _parent_temp_dir: Arc::clone(&local_dirs[dir_index]), - tempfile: Builder::new() - .tempfile_in(local_dirs[dir_index].as_ref()) - .map_err(DataFusionError::IoError)?, - current_file_disk_usage: 0, + parent_temp_dir: Arc::clone(&local_dirs[dir_index]), + tempfile: Arc::new( + Builder::new() + .tempfile_in(local_dirs[dir_index].as_ref()) + .map_err(DataFusionError::IoError)?, + ), + current_file_disk_usage: Arc::new(AtomicU64::new(0)), disk_manager: Arc::clone(self), }) } @@ -301,26 +303,50 @@ impl DiskManager { /// must invoke [`Self::update_disk_usage`] to update the global disk usage counter. /// This ensures the disk manager can properly enforce usage limits configured by /// [`DiskManager::with_max_temp_directory_size`]. +/// +/// This type is Clone-able, allowing multiple references to the same underlying file. +/// The file is deleted only when the last reference is dropped. +/// +/// The parent temporary directory is also kept alive as long as any reference to +/// this file exists, preventing premature cleanup of the directory. +/// +/// Once all references to this file are dropped, the file is deleted, and the +/// disk usage is subtracted from the disk manager's total. 
#[derive(Debug)] pub struct RefCountedTempFile { /// The reference to the directory in which temporary files are created to ensure /// it is not cleaned up prior to the NamedTempFile - _parent_temp_dir: Arc, - tempfile: NamedTempFile, + parent_temp_dir: Arc, + /// The underlying temporary file, wrapped in Arc to allow cloning + tempfile: Arc, /// Tracks the current disk usage of this temporary file. See /// [`Self::update_disk_usage`] for more details. - current_file_disk_usage: u64, + /// + /// This is wrapped in `Arc` so that all clones share the same + /// disk usage tracking, preventing incorrect accounting when clones are dropped. + current_file_disk_usage: Arc, /// The disk manager that created and manages this temporary file disk_manager: Arc, } +impl Clone for RefCountedTempFile { + fn clone(&self) -> Self { + Self { + parent_temp_dir: Arc::clone(&self.parent_temp_dir), + tempfile: Arc::clone(&self.tempfile), + current_file_disk_usage: Arc::clone(&self.current_file_disk_usage), + disk_manager: Arc::clone(&self.disk_manager), + } + } +} + impl RefCountedTempFile { pub fn path(&self) -> &Path { self.tempfile.path() } pub fn inner(&self) -> &NamedTempFile { - &self.tempfile + self.tempfile.as_ref() } /// Updates the global disk usage counter after modifications to the underlying file. @@ -332,11 +358,14 @@ impl RefCountedTempFile { let metadata = self.tempfile.as_file().metadata()?; let new_disk_usage = metadata.len(); + // Get the old disk usage + let old_disk_usage = self.current_file_disk_usage.load(Ordering::Relaxed); + // Update the global disk usage by: // 1. Subtracting the old file size from the global counter self.disk_manager .used_disk_space - .fetch_sub(self.current_file_disk_usage, Ordering::Relaxed); + .fetch_sub(old_disk_usage, Ordering::Relaxed); // 2. Adding the new file size to the global counter self.disk_manager .used_disk_space @@ -352,23 +381,29 @@ impl RefCountedTempFile { } // 4. 
Update the local file size tracking - self.current_file_disk_usage = new_disk_usage; + self.current_file_disk_usage + .store(new_disk_usage, Ordering::Relaxed); Ok(()) } pub fn current_disk_usage(&self) -> u64 { - self.current_file_disk_usage + self.current_file_disk_usage.load(Ordering::Relaxed) } } /// When the temporary file is dropped, subtract its disk usage from the disk manager's total impl Drop for RefCountedTempFile { fn drop(&mut self) { - // Subtract the current file's disk usage from the global counter - self.disk_manager - .used_disk_space - .fetch_sub(self.current_file_disk_usage, Ordering::Relaxed); + // Only subtract disk usage when this is the last reference to the file + // Check if we're the last one by seeing if there's only one strong reference + // left to the underlying tempfile (the one we're holding) + if Arc::strong_count(&self.tempfile) == 1 { + let current_usage = self.current_file_disk_usage.load(Ordering::Relaxed); + self.disk_manager + .used_disk_space + .fetch_sub(current_usage, Ordering::Relaxed); + } } } @@ -523,4 +558,190 @@ mod tests { Ok(()) } + + #[test] + fn test_disk_usage_basic() -> Result<()> { + use std::io::Write; + + let dm = Arc::new(DiskManagerBuilder::default().build()?); + let mut temp_file = dm.create_tmp_file("Testing")?; + + // Initially, disk usage should be 0 + assert_eq!(dm.used_disk_space(), 0); + assert_eq!(temp_file.current_disk_usage(), 0); + + // Write some data to the file + temp_file.inner().as_file().write_all(b"hello world")?; + temp_file.update_disk_usage()?; + + // Disk usage should now reflect the written data + let expected_usage = temp_file.current_disk_usage(); + assert!(expected_usage > 0); + assert_eq!(dm.used_disk_space(), expected_usage); + + // Write more data + temp_file.inner().as_file().write_all(b" more data")?; + temp_file.update_disk_usage()?; + + // Disk usage should increase + let new_usage = temp_file.current_disk_usage(); + assert!(new_usage > expected_usage); + assert_eq!(dm.used_disk_space(), new_usage); + + // Drop the file + drop(temp_file); + + // Disk usage should return to 0 + assert_eq!(dm.used_disk_space(), 0); + + Ok(()) + } + + #[test] + fn test_disk_usage_with_clones() -> Result<()> { + use std::io::Write; + + let dm = Arc::new(DiskManagerBuilder::default().build()?); + let mut temp_file = dm.create_tmp_file("Testing")?; + + // Write some data + temp_file.inner().as_file().write_all(b"test data")?; + temp_file.update_disk_usage()?; + + let usage_after_write = temp_file.current_disk_usage(); + assert!(usage_after_write > 0); + assert_eq!(dm.used_disk_space(), usage_after_write); + + // Clone the file + let clone1 = temp_file.clone(); + let clone2 = temp_file.clone(); + + // All clones should see the same disk usage + assert_eq!(clone1.current_disk_usage(), usage_after_write); + assert_eq!(clone2.current_disk_usage(), usage_after_write); + + // Global disk usage should still be the same (not multiplied by number of clones) + assert_eq!(dm.used_disk_space(), usage_after_write); + + // Write more data through one clone + clone1.inner().as_file().write_all(b" more data")?; + let mut mutable_clone1 = clone1; + mutable_clone1.update_disk_usage()?; + + let new_usage = mutable_clone1.current_disk_usage(); + assert!(new_usage > usage_after_write); + + // All clones should see the updated disk usage + assert_eq!(temp_file.current_disk_usage(), new_usage); + assert_eq!(clone2.current_disk_usage(), new_usage); + assert_eq!(mutable_clone1.current_disk_usage(), new_usage); + + // Global disk usage should 
reflect the new size (not multiplied) + assert_eq!(dm.used_disk_space(), new_usage); + + // Drop one clone + drop(mutable_clone1); + + // Disk usage should NOT change (other clones still exist) + assert_eq!(dm.used_disk_space(), new_usage); + assert_eq!(temp_file.current_disk_usage(), new_usage); + assert_eq!(clone2.current_disk_usage(), new_usage); + + // Drop another clone + drop(clone2); + + // Disk usage should still NOT change (original still exists) + assert_eq!(dm.used_disk_space(), new_usage); + assert_eq!(temp_file.current_disk_usage(), new_usage); + + // Drop the original + drop(temp_file); + + // Now disk usage should return to 0 (last reference dropped) + assert_eq!(dm.used_disk_space(), 0); + + Ok(()) + } + + #[test] + fn test_disk_usage_clones_dropped_out_of_order() -> Result<()> { + use std::io::Write; + + let dm = Arc::new(DiskManagerBuilder::default().build()?); + let mut temp_file = dm.create_tmp_file("Testing")?; + + // Write data + temp_file.inner().as_file().write_all(b"test")?; + temp_file.update_disk_usage()?; + + let usage = temp_file.current_disk_usage(); + assert_eq!(dm.used_disk_space(), usage); + + // Create multiple clones + let clone1 = temp_file.clone(); + let clone2 = temp_file.clone(); + let clone3 = temp_file.clone(); + + // Drop the original first (out of order) + drop(temp_file); + + // Disk usage should still be tracked (clones exist) + assert_eq!(dm.used_disk_space(), usage); + assert_eq!(clone1.current_disk_usage(), usage); + + // Drop clones in different order + drop(clone2); + assert_eq!(dm.used_disk_space(), usage); + + drop(clone1); + assert_eq!(dm.used_disk_space(), usage); + + // Drop the last clone + drop(clone3); + + // Now disk usage should be 0 + assert_eq!(dm.used_disk_space(), 0); + + Ok(()) + } + + #[test] + fn test_disk_usage_multiple_files() -> Result<()> { + use std::io::Write; + + let dm = Arc::new(DiskManagerBuilder::default().build()?); + + // Create multiple temp files + let mut file1 = dm.create_tmp_file("Testing1")?; + let mut file2 = dm.create_tmp_file("Testing2")?; + + // Write to first file + file1.inner().as_file().write_all(b"file1")?; + file1.update_disk_usage()?; + let usage1 = file1.current_disk_usage(); + + assert_eq!(dm.used_disk_space(), usage1); + + // Write to second file + file2.inner().as_file().write_all(b"file2 data")?; + file2.update_disk_usage()?; + let usage2 = file2.current_disk_usage(); + + // Global usage should be sum of both files + assert_eq!(dm.used_disk_space(), usage1 + usage2); + + // Drop first file + drop(file1); + + // Usage should only reflect second file + assert_eq!(dm.used_disk_space(), usage2); + + // Drop second file + drop(file2); + + // Usage should be 0 + assert_eq!(dm.used_disk_space(), 0); + + Ok(()) + } } diff --git a/datafusion/physical-plan/src/repartition/mod.rs b/datafusion/physical-plan/src/repartition/mod.rs index 08fac9fc69b39..74cf798895998 100644 --- a/datafusion/physical-plan/src/repartition/mod.rs +++ b/datafusion/physical-plan/src/repartition/mod.rs @@ -34,11 +34,9 @@ use crate::execution_plan::{CardinalityEffect, EvaluationType, SchedulingType}; use crate::hash_utils::create_hashes; use crate::metrics::{BaselineMetrics, SpillMetrics}; use crate::projection::{all_columns, make_with_child, update_expr, ProjectionExec}; -use crate::repartition::distributor_channels::{ - channels, partition_aware_channels, DistributionReceiver, DistributionSender, -}; use crate::sorts::streaming_merge::StreamingMergeBuilder; use crate::spill::spill_manager::SpillManager; +use 
crate::spill::spill_pool::{self, SpillPoolWriter}; use crate::stream::RecordBatchStreamAdapter; use crate::{DisplayFormatType, ExecutionPlan, Partitioning, PlanProperties, Statistics}; @@ -51,7 +49,6 @@ use datafusion_common::utils::transpose; use datafusion_common::{internal_err, ColumnStatistics, HashMap}; use datafusion_common::{not_impl_err, DataFusionError, Result}; use datafusion_common_runtime::SpawnedTask; -use datafusion_execution::disk_manager::RefCountedTempFile; use datafusion_execution::memory_pool::MemoryConsumer; use datafusion_execution::TaskContext; use datafusion_physical_expr::{EquivalenceProperties, PhysicalExpr}; @@ -67,27 +64,101 @@ use log::trace; use parking_lot::Mutex; mod distributor_channels; +use distributor_channels::{ + channels, partition_aware_channels, DistributionReceiver, DistributionSender, +}; -/// A batch in the repartition queue - either in memory or spilled to disk +/// A batch in the repartition queue - either in memory or spilled to disk. +/// +/// This enum represents the two states a batch can be in during repartitioning. +/// The decision to spill is made based on memory availability when sending a batch +/// to an output partition. +/// +/// # Batch Flow with Spilling +/// +/// ```text +/// Input Stream ──▶ Partition Logic ──▶ try_grow() +/// │ +/// ┌───────────────┴────────────────┐ +/// │ │ +/// ▼ ▼ +/// try_grow() succeeds try_grow() fails +/// (Memory Available) (Memory Pressure) +/// │ │ +/// ▼ ▼ +/// RepartitionBatch::Memory spill_writer.push_batch() +/// (batch held in memory) (batch written to disk) +/// │ │ +/// │ ▼ +/// │ RepartitionBatch::Spilled +/// │ (marker - no batch data) +/// │ │ +/// └────────┬───────────────────────┘ +/// │ +/// ▼ +/// Send to channel +/// │ +/// ▼ +/// Output Stream (poll) +/// │ +/// ┌──────────────┴─────────────┐ +/// │ │ +/// ▼ ▼ +/// RepartitionBatch::Memory RepartitionBatch::Spilled +/// Return batch immediately Poll spill_stream (blocks) +/// │ │ +/// └────────┬───────────────────┘ +/// │ +/// ▼ +/// Return batch +/// (FIFO order preserved) +/// ``` +/// +/// See [`RepartitionExec`] for overall architecture and [`StreamState`] for +/// the state machine that handles reading these batches. #[derive(Debug)] enum RepartitionBatch { /// Batch held in memory (counts against memory reservation) Memory(RecordBatch), - /// Batch spilled to disk (one file per batch for queue semantics) - /// File automatically deleted when dropped via reference counting - /// The size field stores the original batch size for validation when reading back - Spilled { - spill_file: RefCountedTempFile, - size: usize, - }, + /// Marker indicating a batch was spilled to the partition's SpillPool. + /// The actual batch can be retrieved by reading from the SpillPoolStream. + /// This variant contains no data itself - it's just a signal to the reader + /// to fetch the next batch from the spill stream. + Spilled, } type MaybeBatch = Option>; type InputPartitionsToCurrentPartitionSender = Vec>; type InputPartitionsToCurrentPartitionReceiver = Vec>; -/// Channels and resources for a single output partition -#[derive(Debug)] +/// Output channel with its associated memory reservation and spill writer +struct OutputChannel { + sender: DistributionSender, + reservation: SharedMemoryReservation, + spill_writer: SpillPoolWriter, +} + +/// Channels and resources for a single output partition. +/// +/// Each output partition has channels to receive data from all input partitions. 
+/// To handle memory pressure, each (input, output) pair gets its own +/// [`SpillPool`](crate::spill::spill_pool) channel via [`spill_pool::channel`]. +/// +/// # Structure +/// +/// For an output partition receiving from N input partitions: +/// - `tx`: N senders (one per input) for sending batches to this output +/// - `rx`: N receivers (one per input) for receiving batches at this output +/// - `spill_writers`: N spill writers (one per input) for writing spilled data +/// - `spill_readers`: N spill readers (one per input) for reading spilled data +/// +/// This 1:1 mapping between input partitions and spill channels ensures that +/// batches from each input are processed in FIFO order, even when some batches +/// are spilled to disk and others remain in memory. +/// +/// See [`RepartitionExec`] for the overall N×M architecture. +/// +/// [`spill_pool::channel`]: crate::spill::spill_pool::channel struct PartitionChannels { /// Senders for each input partition to send data to this output partition tx: InputPartitionsToCurrentPartitionSender, @@ -95,20 +166,32 @@ struct PartitionChannels { rx: InputPartitionsToCurrentPartitionReceiver, /// Memory reservation for this output partition reservation: SharedMemoryReservation, - /// Spill manager for handling disk spills for this output partition - spill_manager: Arc, + /// Spill writers for writing spilled data. + /// SpillPoolWriter is Clone, so multiple writers can share state in non-preserve-order mode. + spill_writers: Vec, + /// Spill readers for reading spilled data - one per input partition (FIFO semantics). + /// Each (input, output) pair gets its own reader to maintain proper ordering. + spill_readers: Vec, } -#[derive(Debug)] struct ConsumingInputStreamsState { /// Channels for sending batches from input partitions to output partitions. /// Key is the partition number. channels: HashMap, - /// Helper that ensures that that background job is killed once it is no longer needed. + /// Helper that ensures that background jobs are killed once they are no longer needed. abort_helper: Arc>>, } +impl Debug for ConsumingInputStreamsState { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ConsumingInputStreamsState") + .field("num_channels", &self.channels.len()) + .field("abort_helper", &self.abort_helper) + .finish() + } +} + /// Inner state of [`RepartitionExec`]. #[derive(Default)] enum RepartitionExecState { @@ -167,6 +250,7 @@ impl RepartitionExecState { Ok(()) } + #[expect(clippy::too_many_arguments)] fn consume_input_streams( &mut self, input: Arc, @@ -175,6 +259,7 @@ impl RepartitionExecState { preserve_order: bool, name: String, context: Arc, + spill_manager: SpillManager, ) -> Result<&mut ConsumingInputStreamsState> { let streams_and_metrics = match self { RepartitionExecState::NotInitialized => { @@ -198,17 +283,19 @@ impl RepartitionExecState { let num_input_partitions = streams_and_metrics.len(); let num_output_partitions = partitioning.partition_count(); + let spill_manager = Arc::new(spill_manager); + let (txs, rxs) = if preserve_order { - let (txs, rxs) = + // Create partition-aware channels with one channel per (input, output) pair + // This provides backpressure while maintaining proper ordering + let (txs_all, rxs_all) = partition_aware_channels(num_input_partitions, num_output_partitions); // Take transpose of senders and receivers. 
`state.channels` keeps track of entries per output partition - let txs = transpose(txs); - let rxs = transpose(rxs); + let txs = transpose(txs_all); + let rxs = transpose(rxs_all); (txs, rxs) } else { - // create one channel per *output* partition - // note we use a custom channel that ensures there is always data for each receiver - // but limits the amount of buffering if required. + // Create one channel per *output* partition with backpressure let (txs, rxs) = channels(num_output_partitions); // Clone sender for each input partitions let txs = txs @@ -226,19 +313,34 @@ impl RepartitionExecState { .with_can_spill(true) .register(context.memory_pool()), )); - let spill_metrics = SpillMetrics::new(&metrics, partition); - let spill_manager = Arc::new(SpillManager::new( - Arc::clone(&context.runtime_env()), - spill_metrics, - input.schema(), - )); + + // Create spill channels based on mode: + // - preserve_order: one spill channel per (input, output) pair for proper FIFO ordering + // - non-preserve-order: one shared spill channel per output partition since all inputs + // share the same receiver + let max_file_size = context + .session_config() + .options() + .execution + .max_spill_file_size_bytes; + let num_spill_channels = if preserve_order { + num_input_partitions + } else { + 1 + }; + let (spill_writers, spill_readers): (Vec<_>, Vec<_>) = (0 + ..num_spill_channels) + .map(|_| spill_pool::channel(max_file_size, Arc::clone(&spill_manager))) + .unzip(); + channels.insert( partition, PartitionChannels { tx, rx, reservation, - spill_manager, + spill_readers, + spill_writers, }, ); } @@ -251,34 +353,38 @@ impl RepartitionExecState { let txs: HashMap<_, _> = channels .iter() .map(|(partition, channels)| { + // In preserve_order mode: each input gets its own spill writer (index i) + // In non-preserve-order mode: all inputs share spill writer 0 via clone + let spill_writer_idx = if preserve_order { i } else { 0 }; ( *partition, - ( - channels.tx[i].clone(), - Arc::clone(&channels.reservation), - Arc::clone(&channels.spill_manager), - ), + OutputChannel { + sender: channels.tx[i].clone(), + reservation: Arc::clone(&channels.reservation), + spill_writer: channels.spill_writers[spill_writer_idx] + .clone(), + }, ) }) .collect(); + // Extract senders for wait_for_task before moving txs + let senders: HashMap<_, _> = txs + .iter() + .map(|(partition, channel)| (*partition, channel.sender.clone())) + .collect(); + let input_task = SpawnedTask::spawn(RepartitionExec::pull_from_input( stream, - txs.clone(), + txs, partitioning.clone(), metrics, )); // In a separate task, wait for each input to be done // (and pass along any errors, including panic!s) - let wait_for_task = SpawnedTask::spawn(RepartitionExec::wait_for_task( - input_task, - txs.into_iter() - .map(|(partition, (tx, _reservation, _spill_manager))| { - (partition, tx) - }) - .collect(), - )); + let wait_for_task = + SpawnedTask::spawn(RepartitionExec::wait_for_task(input_task, senders)); spawned_tasks.push(wait_for_task); } *self = Self::ConsumingInputStreams(ConsumingInputStreamsState { @@ -507,6 +613,38 @@ impl BatchPartitioner { /// arbitrary interleaving (and thus unordered) unless /// [`Self::with_preserve_order`] specifies otherwise. /// +/// # Spilling Architecture +/// +/// RepartitionExec uses [`SpillPool`](crate::spill::spill_pool) channels to handle +/// memory pressure during repartitioning. Each (input partition, output partition) +/// pair gets its own SpillPool channel for FIFO ordering. 
+/// +/// ```text +/// Input Partitions (N) Output Partitions (M) +/// ──────────────────── ───────────────────── +/// +/// Input 0 ──┐ ┌──▶ Output 0 +/// │ ┌──────────────┐ │ +/// ├─▶│ SpillPool │────┤ +/// │ │ [In0→Out0] │ │ +/// Input 1 ──┤ └──────────────┘ ├──▶ Output 1 +/// │ │ +/// │ ┌──────────────┐ │ +/// ├─▶│ SpillPool │────┤ +/// │ │ [In1→Out0] │ │ +/// Input 2 ──┤ └──────────────┘ ├──▶ Output 2 +/// │ │ +/// │ ... (N×M SpillPools total) +/// │ │ +/// │ ┌──────────────┐ │ +/// └─▶│ SpillPool │────┘ +/// │ [InN→OutM] │ +/// └──────────────┘ +/// +/// Each SpillPool maintains FIFO order for its (input, output) pair. +/// See `RepartitionBatch` for details on the memory/spill decision logic. +/// ``` +/// /// # Footnote /// /// The "Exchange Operator" was first described in the 1989 paper @@ -586,7 +724,7 @@ impl RepartitionExec { &self.cache.partitioning } - /// Get preserve_order flag of the RepartitionExecutor + /// Get preserve_order flag of the RepartitionExec /// `true` means `SortPreservingRepartitionExec`, `false` means `RepartitionExec` pub fn preserve_order(&self) -> bool { self.preserve_order @@ -692,6 +830,8 @@ impl ExecutionPlan for RepartitionExec { partition ); + let spill_metrics = SpillMetrics::new(&self.metrics, partition); + let input = Arc::clone(&self.input); let partitioning = self.partitioning().clone(); let metrics = self.metrics.clone(); @@ -700,6 +840,12 @@ impl ExecutionPlan for RepartitionExec { let schema = self.schema(); let schema_captured = Arc::clone(&schema); + let spill_manager = SpillManager::new( + Arc::clone(&context.runtime_env()), + spill_metrics, + input.schema(), + ); + // Get existing ordering to use for merging let sort_exprs = self.sort_exprs().cloned(); @@ -713,11 +859,11 @@ impl ExecutionPlan for RepartitionExec { )?; } - let stream = futures::stream::once(async move { - let num_input_partitions = input.output_partitioning().partition_count(); + let num_input_partitions = input.output_partitioning().partition_count(); + let stream = futures::stream::once(async move { // lock scope - let (mut rx, reservation, spill_manager, abort_helper) = { + let (rx, reservation, spill_readers, abort_helper) = { // lock mutexes let mut state = state.lock(); let state = state.consume_input_streams( @@ -727,6 +873,7 @@ impl ExecutionPlan for RepartitionExec { preserve_order, name.clone(), Arc::clone(&context), + spill_manager.clone(), )?; // now return stream for the specified *output* partition which will @@ -734,7 +881,7 @@ impl ExecutionPlan for RepartitionExec { let PartitionChannels { rx, reservation, - spill_manager, + spill_readers, .. 
} = state .channels @@ -744,7 +891,7 @@ impl ExecutionPlan for RepartitionExec { ( rx, reservation, - spill_manager, + spill_readers, Arc::clone(&state.abort_helper), ) }; @@ -755,17 +902,20 @@ impl ExecutionPlan for RepartitionExec { if preserve_order { // Store streams from all the input partitions: + // Each input partition gets its own spill reader to maintain proper FIFO ordering let input_streams = rx .into_iter() - .map(|receiver| { - Box::pin(PerPartitionStream { - schema: Arc::clone(&schema_captured), + .zip(spill_readers) + .map(|(receiver, spill_stream)| { + // In preserve_order mode, each receiver corresponds to exactly one input partition + Box::pin(PerPartitionStream::new( + Arc::clone(&schema_captured), receiver, - _drop_helper: Arc::clone(&abort_helper), - reservation: Arc::clone(&reservation), - spill_manager: Arc::clone(&spill_manager), - state: RepartitionStreamState::ReceivingFromChannel, - }) as SendableRecordBatchStream + Arc::clone(&abort_helper), + Arc::clone(&reservation), + spill_stream, + 1, // Each receiver handles one input partition + )) as SendableRecordBatchStream }) .collect::>(); // Note that receiver size (`rx.len()`) and `num_input_partitions` are same. @@ -784,18 +934,25 @@ impl ExecutionPlan for RepartitionExec { .with_batch_size(context.session_config().batch_size()) .with_fetch(fetch) .with_reservation(merge_reservation) + .with_spill_manager(spill_manager) .build() } else { - Ok(Box::pin(RepartitionStream { - num_input_partitions, - num_input_partitions_processed: 0, - schema: input.schema(), - input: rx.swap_remove(0), - _drop_helper: abort_helper, + // Non-preserve-order case: single input stream, so use the first spill reader + let spill_stream = spill_readers + .into_iter() + .next() + .expect("at least one spill reader should exist"); + + Ok(Box::pin(PerPartitionStream::new( + schema_captured, + rx.into_iter() + .next() + .expect("at least one receiver should exist"), + abort_helper, reservation, - spill_manager, - state: RepartitionStreamState::ReceivingFromChannel, - }) as SendableRecordBatchStream) + spill_stream, + num_input_partitions, + )) as SendableRecordBatchStream) } }) .try_flatten(); @@ -1030,17 +1187,10 @@ impl RepartitionExec { /// Pulls data from the specified input plan, feeding it to the /// output partitions based on the desired partitioning /// - /// txs hold the output sending channels for each output partition + /// `output_channels` holds the output sending channels for each output partition async fn pull_from_input( mut stream: SendableRecordBatchStream, - mut output_channels: HashMap< - usize, - ( - DistributionSender, - SharedMemoryReservation, - Arc, - ), - >, + mut output_channels: HashMap, partitioning: Partitioning, metrics: RepartitionMetrics, ) -> Result<()> { @@ -1072,37 +1222,27 @@ impl RepartitionExec { let timer = metrics.send_time[partition].timer(); // if there is still a receiver, send to it - if let Some((tx, reservation, spill_manager)) = - output_channels.get_mut(&partition) - { + if let Some(channel) = output_channels.get_mut(&partition) { let (batch_to_send, is_memory_batch) = - match reservation.lock().try_grow(size) { + match channel.reservation.lock().try_grow(size) { Ok(_) => { // Memory available - send in-memory batch (RepartitionBatch::Memory(batch), true) } Err(_) => { - // We're memory limited - spill this single batch to its own file - let spill_file = spill_manager - .spill_record_batch_and_finish( - &[batch], - &format!( - "RepartitionExec spill partition {partition}" - ), - )? 
- // Note that we handled empty batch above, so this is safe - .expect("non-empty batch should produce spill file"); - - // Store size for validation when reading back - (RepartitionBatch::Spilled { spill_file, size }, false) + // We're memory limited - spill to SpillPool + // SpillPool handles file handle reuse and rotation + channel.spill_writer.push_batch(&batch)?; + // Send marker indicating batch was spilled + (RepartitionBatch::Spilled, false) } }; - if tx.send(Some(Ok(batch_to_send))).await.is_err() { + if channel.sender.send(Some(Ok(batch_to_send))).await.is_err() { // If the other end has hung up, it was an early shutdown (e.g. LIMIT) // Only shrink memory if it was a memory batch if is_memory_batch { - reservation.lock().shrink(size); + channel.reservation.lock().shrink(size); } output_channels.remove(&partition); } @@ -1134,6 +1274,8 @@ impl RepartitionExec { } } + // Spill writers will auto-finalize when dropped + // No need for explicit flush Ok(()) } @@ -1176,7 +1318,7 @@ impl RepartitionExec { // Input task completed successfully Ok(Ok(())) => { // notify each output partition that this input partition has no more data - for (_, tx) in txs { + for (_partition, tx) in txs { tx.send(None).await.ok(); } } @@ -1184,118 +1326,55 @@ impl RepartitionExec { } } -enum RepartitionStreamState { - /// Waiting for next item from channel - ReceivingFromChannel, - /// Reading a spilled batch from disk (stream reads via tokio::fs) - ReadingSpilledBatch(SendableRecordBatchStream), -} - -struct RepartitionStream { - /// Number of input partitions that will be sending batches to this output channel - num_input_partitions: usize, - - /// Number of input partitions that have finished sending batches to this output channel - num_input_partitions_processed: usize, - - /// Schema wrapped by Arc - schema: SchemaRef, - - /// channel containing the repartitioned batches - input: DistributionReceiver, - - /// Handle to ensure background tasks are killed when no longer needed. - _drop_helper: Arc>>, - - /// Memory reservation. 
- reservation: SharedMemoryReservation, - - /// Spill manager for reading spilled batches - spill_manager: Arc, - - /// Current state of the stream - state: RepartitionStreamState, -} - -impl Stream for RepartitionStream { - type Item = Result; - - fn poll_next( - mut self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll> { - loop { - match &mut self.state { - RepartitionStreamState::ReceivingFromChannel => { - let value = futures::ready!(self.input.recv().poll_unpin(cx)); - match value { - Some(Some(v)) => match v { - Ok(RepartitionBatch::Memory(batch)) => { - // Release memory and return - self.reservation - .lock() - .shrink(batch.get_array_memory_size()); - return Poll::Ready(Some(Ok(batch))); - } - Ok(RepartitionBatch::Spilled { spill_file, size }) => { - // Read from disk - SpillReaderStream uses tokio::fs internally - // Pass the original size for validation - let stream = self - .spill_manager - .read_spill_as_stream(spill_file, Some(size))?; - self.state = - RepartitionStreamState::ReadingSpilledBatch(stream); - // Continue loop to poll the stream immediately - } - Err(e) => { - return Poll::Ready(Some(Err(e))); - } - }, - Some(None) => { - self.num_input_partitions_processed += 1; - - if self.num_input_partitions - == self.num_input_partitions_processed - { - // all input partitions have finished sending batches - return Poll::Ready(None); - } else { - // other partitions still have data to send - continue; - } - } - None => { - return Poll::Ready(None); - } - } - } - RepartitionStreamState::ReadingSpilledBatch(stream) => { - match futures::ready!(stream.poll_next_unpin(cx)) { - Some(Ok(batch)) => { - // Return batch and stay in ReadingSpilledBatch state to read more batches - return Poll::Ready(Some(Ok(batch))); - } - Some(Err(e)) => { - self.state = RepartitionStreamState::ReceivingFromChannel; - return Poll::Ready(Some(Err(e))); - } - None => { - // Spill stream ended - go back to receiving from channel - self.state = RepartitionStreamState::ReceivingFromChannel; - continue; - } - } - } - } - } - } -} - -impl RecordBatchStream for RepartitionStream { - /// Get the schema - fn schema(&self) -> SchemaRef { - Arc::clone(&self.schema) - } +/// State for tracking whether we're reading from memory channel or spill stream. +/// +/// This state machine ensures proper ordering when batches are mixed between memory +/// and spilled storage. When a [`RepartitionBatch::Spilled`] marker is received, +/// the stream must block on the spill stream until the corresponding batch arrives. +/// +/// # State Machine +/// +/// ```text +/// ┌─────────────────┐ +/// ┌───▶│ ReadingMemory │◀───┐ +/// │ └────────┬────────┘ │ +/// │ │ │ +/// │ Poll channel │ +/// │ │ │ +/// │ ┌──────────┼─────────────┐ +/// │ │ │ │ +/// │ ▼ ▼ │ +/// │ Memory Spilled │ +/// Got batch │ batch marker │ +/// from spill │ │ │ │ +/// │ │ ▼ │ +/// │ │ ┌──────────────────┐ │ +/// │ │ │ ReadingSpilled │ │ +/// │ │ └────────┬─────────┘ │ +/// │ │ │ │ +/// │ │ Poll spill_stream │ +/// │ │ │ │ +/// │ │ ▼ │ +/// │ │ Get batch │ +/// │ │ │ │ +/// └──┴───────────┴────────────┘ +/// │ +/// ▼ +/// Return batch +/// (Order preserved within +/// (input, output) pair) +/// ``` +/// +/// The transition to `ReadingSpilled` blocks further channel polling to maintain +/// FIFO ordering - we cannot read the next item from the channel until the spill +/// stream provides the current batch. 
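To make the ordering invariant described above concrete, here is a minimal, synchronous sketch of the marker protocol. `Msg` and `drain` are hypothetical names for illustration only, not DataFusion types; the real, asynchronous implementation is `PerPartitionStream` below. The key point is that on seeing a `Spilled` marker the consumer must take exactly one value from the spill FIFO before touching the channel again:

```rust
use std::collections::VecDeque;

enum Msg {
    Memory(u32),
    Spilled, // marker: the actual value lives in the spill FIFO
}

fn drain(channel: VecDeque<Msg>, mut spill_fifo: VecDeque<u32>) -> Vec<u32> {
    let mut out = Vec::new();
    for msg in channel {
        match msg {
            Msg::Memory(v) => out.push(v),
            // Mirrors StreamState::ReadingSpilled: block on the spill FIFO for one
            // value before polling the channel again, preserving FIFO order.
            Msg::Spilled => out.push(spill_fifo.pop_front().expect("spilled value")),
        }
    }
    out
}

fn main() {
    // Values 0..6 were produced in order; 1, 2 and 4 were spilled under memory pressure.
    let channel: VecDeque<Msg> = vec![
        Msg::Memory(0),
        Msg::Spilled,
        Msg::Spilled,
        Msg::Memory(3),
        Msg::Spilled,
        Msg::Memory(5),
    ]
    .into();
    let spill_fifo: VecDeque<u32> = vec![1, 2, 4].into();
    assert_eq!(drain(channel, spill_fifo), vec![0, 1, 2, 3, 4, 5]);
}
```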
+#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum StreamState { + /// Reading from the memory channel (normal operation) + ReadingMemory, + /// Waiting for a spilled batch from the spill stream. + /// Must not poll channel until spilled batch is received to preserve ordering. + ReadingSpilled, } /// This struct converts a receiver to a stream. @@ -1313,11 +1392,37 @@ struct PerPartitionStream { /// Memory reservation. reservation: SharedMemoryReservation, - /// Spill manager for reading spilled batches - spill_manager: Arc, + /// Infinite stream for reading from the spill pool + spill_stream: SendableRecordBatchStream, - /// Current state of the stream - state: RepartitionStreamState, + /// Internal state indicating if we are reading from memory or spill stream + state: StreamState, + + /// Number of input partitions that have not yet finished. + /// In non-preserve-order mode, multiple input partitions send to the same channel, + /// each sending None when complete. We must wait for all of them. + remaining_partitions: usize, +} + +impl PerPartitionStream { + fn new( + schema: SchemaRef, + receiver: DistributionReceiver, + drop_helper: Arc>>, + reservation: SharedMemoryReservation, + spill_stream: SendableRecordBatchStream, + num_input_partitions: usize, + ) -> Self { + Self { + schema, + receiver, + _drop_helper: drop_helper, + reservation, + spill_stream, + state: StreamState::ReadingMemory, + remaining_partitions: num_input_partitions, + } + } } impl Stream for PerPartitionStream { @@ -1327,55 +1432,74 @@ impl Stream for PerPartitionStream { mut self: Pin<&mut Self>, cx: &mut Context<'_>, ) -> Poll> { + use futures::StreamExt; + loop { - match &mut self.state { - RepartitionStreamState::ReceivingFromChannel => { - let value = futures::ready!(self.receiver.recv().poll_unpin(cx)); + match self.state { + StreamState::ReadingMemory => { + // Poll the memory channel for next message + let value = match self.receiver.recv().poll_unpin(cx) { + Poll::Ready(v) => v, + Poll::Pending => { + // Nothing from channel, wait + return Poll::Pending; + } + }; + match value { Some(Some(v)) => match v { Ok(RepartitionBatch::Memory(batch)) => { - // Release memory and return + // Release memory and return batch self.reservation .lock() .shrink(batch.get_array_memory_size()); return Poll::Ready(Some(Ok(batch))); } - Ok(RepartitionBatch::Spilled { spill_file, size }) => { - // Read from disk - SpillReaderStream uses tokio::fs internally - // Pass the original size for validation - let stream = self - .spill_manager - .read_spill_as_stream(spill_file, Some(size))?; - self.state = - RepartitionStreamState::ReadingSpilledBatch(stream); - // Continue loop to poll the stream immediately + Ok(RepartitionBatch::Spilled) => { + // Batch was spilled, transition to reading from spill stream + // We must block on spill stream until we get the batch + // to preserve ordering + self.state = StreamState::ReadingSpilled; + continue; } Err(e) => { return Poll::Ready(Some(Err(e))); } }, Some(None) => { - // Input partition has finished sending batches + // One input partition finished + self.remaining_partitions -= 1; + if self.remaining_partitions == 0 { + // All input partitions finished + return Poll::Ready(None); + } + // Continue to poll for more data from other partitions + continue; + } + None => { + // Channel closed unexpectedly return Poll::Ready(None); } - None => return Poll::Ready(None), } } - - RepartitionStreamState::ReadingSpilledBatch(stream) => { - match futures::ready!(stream.poll_next_unpin(cx)) { - 
Some(Ok(batch)) => { - // Return batch and stay in ReadingSpilledBatch state to read more batches + StreamState::ReadingSpilled => { + // Poll spill stream for the spilled batch + match self.spill_stream.poll_next_unpin(cx) { + Poll::Ready(Some(Ok(batch))) => { + self.state = StreamState::ReadingMemory; return Poll::Ready(Some(Ok(batch))); } - Some(Err(e)) => { - self.state = RepartitionStreamState::ReceivingFromChannel; + Poll::Ready(Some(Err(e))) => { return Poll::Ready(Some(Err(e))); } - None => { - // Spill stream ended - go back to receiving from channel - self.state = RepartitionStreamState::ReceivingFromChannel; - continue; + Poll::Ready(None) => { + // Spill stream ended, keep draining the memory channel + self.state = StreamState::ReadingMemory; + } + Poll::Pending => { + // Spilled batch not ready yet, must wait + // This preserves ordering by blocking until spill data arrives + return Poll::Pending; } } } @@ -2136,12 +2260,105 @@ mod tests { ) .unwrap() } + + /// Create batches with sequential values for ordering tests + fn create_ordered_batches(num_batches: usize) -> Vec { + let schema = test_schema(); + (0..num_batches) + .map(|i| { + let start = (i * 8) as u32; + RecordBatch::try_new( + Arc::clone(&schema), + vec![Arc::new(UInt32Array::from( + (start..start + 8).collect::>(), + ))], + ) + .unwrap() + }) + .collect() + } + + #[tokio::test] + async fn test_repartition_ordering_with_spilling() -> Result<()> { + // Test that repartition preserves ordering when spilling occurs + // This tests the state machine fix where we must block on spill_stream + // when a Spilled marker is received, rather than continuing to poll the channel + + let schema = test_schema(); + // Create batches with sequential values: batch 0 has [0,1,2,3,4,5,6,7], + // batch 1 has [8,9,10,11,12,13,14,15], etc. 
+ let partition = create_ordered_batches(20); + let input_partitions = vec![partition]; + + // Use RoundRobinBatch to ensure predictable ordering + let partitioning = Partitioning::RoundRobinBatch(2); + + // Set up context with very tight memory limit to force spilling + let runtime = RuntimeEnvBuilder::default() + .with_memory_limit(1, 1.0) + .build_arc()?; + + let task_ctx = TaskContext::default().with_runtime(runtime); + let task_ctx = Arc::new(task_ctx); + + // create physical plan + let exec = + TestMemoryExec::try_new_exec(&input_partitions, Arc::clone(&schema), None)?; + let exec = RepartitionExec::try_new(exec, partitioning)?; + + // Collect all output partitions + let mut all_batches = Vec::new(); + for i in 0..exec.partitioning().partition_count() { + let mut partition_batches = Vec::new(); + let mut stream = exec.execute(i, Arc::clone(&task_ctx))?; + while let Some(result) = stream.next().await { + let batch = result?; + partition_batches.push(batch); + } + all_batches.push(partition_batches); + } + + // Verify spilling occurred + let metrics = exec.metrics().unwrap(); + assert!( + metrics.spill_count().unwrap() > 0, + "Expected spilling to occur, but spill_count = 0" + ); + + // Verify ordering is preserved within each partition + // With RoundRobinBatch, even batches go to partition 0, odd batches to partition 1 + for (partition_idx, batches) in all_batches.iter().enumerate() { + let mut last_value = None; + for batch in batches { + let array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + + for i in 0..array.len() { + let value = array.value(i); + if let Some(last) = last_value { + assert!( + value > last, + "Ordering violated in partition {partition_idx}: {value} is not greater than {last}" + ); + } + last_value = Some(value); + } + } + } + + Ok(()) + } } #[cfg(test)] mod test { + use arrow::array::record_batch; use arrow::compute::SortOptions; use arrow::datatypes::{DataType, Field, Schema}; + use datafusion_common::assert_batches_eq; use super::*; use crate::test::TestMemoryExec; @@ -2224,6 +2441,204 @@ mod test { Ok(()) } + #[tokio::test] + async fn test_preserve_order_with_spilling() -> Result<()> { + use datafusion_execution::runtime_env::RuntimeEnvBuilder; + use datafusion_execution::TaskContext; + + // Create sorted input data across multiple partitions + // Partition1: [1,3], [5,7], [9,11] + // Partition2: [2,4], [6,8], [10,12] + let batch1 = record_batch!(("c0", UInt32, [1, 3])).unwrap(); + let batch2 = record_batch!(("c0", UInt32, [2, 4])).unwrap(); + let batch3 = record_batch!(("c0", UInt32, [5, 7])).unwrap(); + let batch4 = record_batch!(("c0", UInt32, [6, 8])).unwrap(); + let batch5 = record_batch!(("c0", UInt32, [9, 11])).unwrap(); + let batch6 = record_batch!(("c0", UInt32, [10, 12])).unwrap(); + let schema = batch1.schema(); + let sort_exprs = LexOrdering::new([PhysicalSortExpr { + expr: col("c0", &schema).unwrap(), + options: SortOptions::default().asc(), + }]) + .unwrap(); + let partition1 = vec![batch1.clone(), batch3.clone(), batch5.clone()]; + let partition2 = vec![batch2.clone(), batch4.clone(), batch6.clone()]; + let input_partitions = vec![partition1, partition2]; + + // Set up context with tight memory limit to force spilling + // Sorting needs some non-spillable memory, so 64 bytes should force spilling while still allowing the query to complete + let runtime = RuntimeEnvBuilder::default() + .with_memory_limit(64, 1.0) + .build_arc()?; + + let task_ctx = TaskContext::default().with_runtime(runtime); + let task_ctx = 
Arc::new(task_ctx); + + // Create physical plan with order preservation + let exec = TestMemoryExec::try_new(&input_partitions, Arc::clone(&schema), None)? + .try_with_sort_information(vec![sort_exprs.clone(), sort_exprs])?; + let exec = Arc::new(TestMemoryExec::update_cache(Arc::new(exec))); + // Repartition into 3 partitions with order preservation + // We expect 1 batch per output partition after repartitioning + let exec = RepartitionExec::try_new(exec, Partitioning::RoundRobinBatch(3))? + .with_preserve_order(); + + let mut batches = vec![]; + + // Collect all partitions - should succeed by spilling to disk + for i in 0..exec.partitioning().partition_count() { + let mut stream = exec.execute(i, Arc::clone(&task_ctx))?; + while let Some(result) = stream.next().await { + let batch = result?; + batches.push(batch); + } + } + + #[rustfmt::skip] + let expected = [ + [ + "+----+", + "| c0 |", + "+----+", + "| 1 |", + "| 2 |", + "| 3 |", + "| 4 |", + "+----+", + ], + [ + "+----+", + "| c0 |", + "+----+", + "| 5 |", + "| 6 |", + "| 7 |", + "| 8 |", + "+----+", + ], + [ + "+----+", + "| c0 |", + "+----+", + "| 9 |", + "| 10 |", + "| 11 |", + "| 12 |", + "+----+", + ], + ]; + + for (batch, expected) in batches.iter().zip(expected.iter()) { + assert_batches_eq!(expected, std::slice::from_ref(batch)); + } + + // We should have spilled ~ all of the data. + // - We spill data during the repartitioning phase + // - We may also spill during the final merge sort + let all_batches = [batch1, batch2, batch3, batch4, batch5, batch6]; + let metrics = exec.metrics().unwrap(); + assert!( + metrics.spill_count().unwrap() > input_partitions.len(), + "Expected spill_count > {} for order-preserving repartition, but got {:?}", + input_partitions.len(), + metrics.spill_count() + ); + assert!( + metrics.spilled_bytes().unwrap() + > all_batches + .iter() + .map(|b| b.get_array_memory_size()) + .sum::(), + "Expected spilled_bytes > {} for order-preserving repartition, got {}", + all_batches + .iter() + .map(|b| b.get_array_memory_size()) + .sum::(), + metrics.spilled_bytes().unwrap() + ); + assert!( + metrics.spilled_rows().unwrap() + >= all_batches.iter().map(|b| b.num_rows()).sum::(), + "Expected spilled_rows > {} for order-preserving repartition, got {}", + all_batches.iter().map(|b| b.num_rows()).sum::(), + metrics.spilled_rows().unwrap() + ); + + Ok(()) + } + + #[tokio::test] + async fn test_hash_partitioning_with_spilling() -> Result<()> { + use datafusion_execution::runtime_env::RuntimeEnvBuilder; + use datafusion_execution::TaskContext; + + // Create input data similar to the round-robin test + let batch1 = record_batch!(("c0", UInt32, [1, 3])).unwrap(); + let batch2 = record_batch!(("c0", UInt32, [2, 4])).unwrap(); + let batch3 = record_batch!(("c0", UInt32, [5, 7])).unwrap(); + let batch4 = record_batch!(("c0", UInt32, [6, 8])).unwrap(); + let schema = batch1.schema(); + + let partition1 = vec![batch1.clone(), batch3.clone()]; + let partition2 = vec![batch2.clone(), batch4.clone()]; + let input_partitions = vec![partition1, partition2]; + + // Set up context with memory limit to test hash partitioning with spilling infrastructure + let runtime = RuntimeEnvBuilder::default() + .with_memory_limit(1, 1.0) + .build_arc()?; + + let task_ctx = TaskContext::default().with_runtime(runtime); + let task_ctx = Arc::new(task_ctx); + + // Create physical plan with hash partitioning + let exec = TestMemoryExec::try_new(&input_partitions, Arc::clone(&schema), None)?; + let exec = 
Arc::new(TestMemoryExec::update_cache(Arc::new(exec))); + // Hash partition into 2 partitions by column c0 + let hash_expr = col("c0", &schema)?; + let exec = + RepartitionExec::try_new(exec, Partitioning::Hash(vec![hash_expr], 2))?; + + // Collect all partitions concurrently using JoinSet - this prevents deadlock + // where the distribution channel gate closes when all output channels are full + let mut join_set = tokio::task::JoinSet::new(); + for i in 0..exec.partitioning().partition_count() { + let stream = exec.execute(i, Arc::clone(&task_ctx))?; + join_set.spawn(async move { + let mut count = 0; + futures::pin_mut!(stream); + while let Some(result) = stream.next().await { + let batch = result?; + count += batch.num_rows(); + } + Ok::(count) + }); + } + + // Wait for all partitions and sum the rows + let mut total_rows = 0; + while let Some(result) = join_set.join_next().await { + total_rows += result.unwrap()?; + } + + // Verify we got all rows back + let all_batches = [batch1, batch2, batch3, batch4]; + let expected_rows: usize = all_batches.iter().map(|b| b.num_rows()).sum(); + assert_eq!(total_rows, expected_rows); + + // Verify metrics are available + let metrics = exec.metrics().unwrap(); + // Just verify the metrics can be retrieved (spilling may or may not occur) + let spill_count = metrics.spill_count().unwrap_or(0); + assert!(spill_count > 0); + let spilled_bytes = metrics.spilled_bytes().unwrap_or(0); + assert!(spilled_bytes > 0); + let spilled_rows = metrics.spilled_rows().unwrap_or(0); + assert!(spilled_rows > 0); + + Ok(()) + } + #[tokio::test] async fn test_repartition() -> Result<()> { let schema = test_schema(); diff --git a/datafusion/physical-plan/src/spill/in_progress_spill_file.rs b/datafusion/physical-plan/src/spill/in_progress_spill_file.rs index 14917e23b7921..e7f354a73b4cd 100644 --- a/datafusion/physical-plan/src/spill/in_progress_spill_file.rs +++ b/datafusion/physical-plan/src/spill/in_progress_spill_file.rs @@ -88,6 +88,12 @@ impl InProgressSpillFile { Ok(()) } + /// Returns a reference to the in-progress file, if it exists. + /// This can be used to get the file path for creating readers before the file is finished. + pub fn file(&self) -> Option<&RefCountedTempFile> { + self.in_progress_file.as_ref() + } + /// Finalizes the file, returning the completed file reference. /// If there are no batches spilled before, it returns `None`. pub fn finish(&mut self) -> Result> { diff --git a/datafusion/physical-plan/src/spill/mod.rs b/datafusion/physical-plan/src/spill/mod.rs index 5b9a91e781b16..58fd016a63dd7 100644 --- a/datafusion/physical-plan/src/spill/mod.rs +++ b/datafusion/physical-plan/src/spill/mod.rs @@ -19,6 +19,11 @@ pub(crate) mod in_progress_spill_file; pub(crate) mod spill_manager; +pub mod spill_pool; + +// Re-export SpillManager for doctests only (hidden from public docs) +#[doc(hidden)] +pub use spill_manager::SpillManager; use std::fs::File; use std::io::BufReader; diff --git a/datafusion/physical-plan/src/spill/spill_manager.rs b/datafusion/physical-plan/src/spill/spill_manager.rs index cc39102d89819..6fd97a8e2e6a0 100644 --- a/datafusion/physical-plan/src/spill/spill_manager.rs +++ b/datafusion/physical-plan/src/spill/spill_manager.rs @@ -72,6 +72,11 @@ impl SpillManager { self } + /// Returns the schema for batches managed by this SpillManager + pub fn schema(&self) -> &SchemaRef { + &self.schema + } + /// Creates a temporary file for in-progress operations, returning an error /// message if file creation fails. 
The file can be used to append batches /// incrementally and then finish the file when done. diff --git a/datafusion/physical-plan/src/spill/spill_pool.rs b/datafusion/physical-plan/src/spill/spill_pool.rs new file mode 100644 index 0000000000000..bbe54ca45caa3 --- /dev/null +++ b/datafusion/physical-plan/src/spill/spill_pool.rs @@ -0,0 +1,1425 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use futures::{Stream, StreamExt}; +use std::collections::VecDeque; +use std::sync::Arc; +use std::task::Waker; + +use parking_lot::Mutex; + +use arrow::datatypes::SchemaRef; +use arrow::record_batch::RecordBatch; +use datafusion_common::Result; +use datafusion_execution::disk_manager::RefCountedTempFile; +use datafusion_execution::{RecordBatchStream, SendableRecordBatchStream}; + +use super::in_progress_spill_file::InProgressSpillFile; +use super::spill_manager::SpillManager; + +/// Shared state between the writer and readers of a spill pool. +/// This contains the queue of files and coordination state. +/// +/// # Locking Design +/// +/// This struct uses **fine-grained locking** with nested `Arc>`: +/// - `SpillPoolShared` is wrapped in `Arc>` (outer lock) +/// - Each `ActiveSpillFileShared` is wrapped in `Arc>` (inner lock) +/// +/// This enables: +/// 1. **Short critical sections**: The outer lock is held only for queue operations +/// 2. **I/O outside locks**: Disk I/O happens while holding only the file-specific lock +/// 3. **Concurrent operations**: Reader can access the queue while writer does I/O +/// +/// **Lock ordering discipline**: Never hold both locks simultaneously to prevent deadlock. +/// Always: acquire outer lock → release outer lock → acquire inner lock (if needed). +struct SpillPoolShared { + /// Queue of ALL files (including the current write file if it exists). + /// Readers always read from the front of this queue (FIFO). + /// Each file has its own lock to enable concurrent reader/writer access. + files: VecDeque>>, + /// SpillManager for creating files and tracking metrics + spill_manager: Arc, + /// Pool-level waker to notify when new files are available (single reader) + waker: Option, + /// Whether the writer has been dropped (no more files will be added) + writer_dropped: bool, + /// Writer's reference to the current file (shared by all cloned writers). + /// Has its own lock to allow I/O without blocking queue access. 
+ current_write_file: Option>>, +} + +impl SpillPoolShared { + /// Creates a new shared pool state + fn new(spill_manager: Arc) -> Self { + Self { + files: VecDeque::new(), + spill_manager, + waker: None, + writer_dropped: false, + current_write_file: None, + } + } + + /// Registers a waker to be notified when new data is available (pool-level) + fn register_waker(&mut self, waker: Waker) { + self.waker = Some(waker); + } + + /// Wakes the pool-level reader + fn wake(&mut self) { + if let Some(waker) = self.waker.take() { + waker.wake(); + } + } +} + +/// Writer for a spill pool. Provides coordinated write access with FIFO semantics. +/// +/// Created by [`channel`]. See that function for architecture diagrams and usage examples. +/// +/// The writer is `Clone`, allowing multiple writers to coordinate on the same pool. +/// All clones share the same current write file and coordinate file rotation. +/// The writer automatically manages file rotation based on the `max_file_size_bytes` +/// configured in [`channel`]. When the last writer clone is dropped, it finalizes the +/// current file so readers can access all written data. +#[derive(Clone)] +pub struct SpillPoolWriter { + /// Maximum size in bytes before rotating to a new file. + /// Typically set from configuration `datafusion.execution.max_spill_file_size_bytes`. + max_file_size_bytes: usize, + /// Shared state with readers (includes current_write_file for coordination) + shared: Arc>, +} + +impl SpillPoolWriter { + /// Spills a batch to the pool, rotating files when necessary. + /// + /// If the current file would exceed `max_file_size_bytes` after adding + /// this batch, the file is finalized and a new one is started. + /// + /// See [`channel`] for overall architecture and examples. + /// + /// # File Rotation Logic + /// + /// ```text + /// push_batch() + /// │ + /// ▼ + /// Current file exists? + /// │ + /// ├─ No ──▶ Create new file ──▶ Add to shared queue + /// │ Wake readers + /// ▼ + /// Write batch to current file + /// │ + /// ▼ + /// estimated_size > max_file_size_bytes? + /// │ + /// ├─ No ──▶ Keep current file for next batch + /// │ + /// ▼ + /// Yes: finish() current file + /// Mark writer_finished = true + /// Wake readers + /// │ + /// ▼ + /// Next push_batch() creates new file + /// ``` + /// + /// # Errors + /// + /// Returns an error if disk I/O fails or disk quota is exceeded. 
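The rotation logic documented above can be exercised directly. The sketch below is illustrative only: it reuses the same setup as the `channel` doctest further down, and deliberately passes a 1-byte `max_file_size_bytes` so every `push_batch` call finishes the current file and the next call starts a new one; the reader then drains the rotated files in FIFO order. The early `break` avoids relying on end-of-stream behavior, and the whole flow assumes the pool reader advances across finished files as described:

```rust
use std::sync::Arc;

use arrow::array::{Array, ArrayRef, Int32Array};
use arrow::datatypes::{DataType, Field, Schema};
use arrow::record_batch::RecordBatch;
use datafusion_execution::runtime_env::RuntimeEnv;
use futures::StreamExt;

use datafusion_physical_plan::metrics::{ExecutionPlanMetricsSet, SpillMetrics};
use datafusion_physical_plan::spill::spill_pool;
use datafusion_physical_plan::spill::SpillManager; // doc-hidden re-export used by the doctest below

#[tokio::main]
async fn main() -> datafusion_common::Result<()> {
    let env = Arc::new(RuntimeEnv::default());
    let metrics = SpillMetrics::new(&ExecutionPlanMetricsSet::new(), 0);
    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
    let spill_manager = Arc::new(SpillManager::new(env, metrics, Arc::clone(&schema)));

    // A 1-byte limit means every batch exceeds the threshold, so each push
    // finishes the current file and the next push rotates to a fresh one.
    let (writer, mut reader) = spill_pool::channel(1, spill_manager);

    for i in 0..3 {
        let array: ArrayRef = Arc::new(Int32Array::from(vec![i; 8]));
        let batch = RecordBatch::try_new(Arc::clone(&schema), vec![array])?;
        writer.push_batch(&batch)?;
    }
    drop(writer); // finalize the last file so the reader sees all data

    // Batches come back in FIFO order across the rotated files.
    let mut seen = Vec::new();
    while let Some(batch) = reader.next().await.transpose()? {
        let col = batch.column(0);
        let col = col.as_any().downcast_ref::<Int32Array>().unwrap();
        seen.push(col.value(0));
        if seen.len() == 3 {
            break; // all expected batches received
        }
    }
    assert_eq!(seen, vec![0, 1, 2]);
    Ok(())
}
```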
+ pub fn push_batch(&self, batch: &RecordBatch) -> Result<()> { + if batch.num_rows() == 0 { + // Skip empty batches + return Ok(()); + } + + let batch_size = batch.get_array_memory_size(); + + // Fine-grained locking: Lock shared state briefly for queue access + let mut shared = self.shared.lock(); + + // Create new file if we don't have one yet + if shared.current_write_file.is_none() { + let spill_manager = Arc::clone(&shared.spill_manager); + // Release shared lock before disk I/O (fine-grained locking) + drop(shared); + + let writer = spill_manager.create_in_progress_file("SpillPool")?; + // Clone the file so readers can access it immediately + let file = writer.file().expect("InProgressSpillFile should always have a file when it is first created").clone(); + + let file_shared = Arc::new(Mutex::new(ActiveSpillFileShared { + writer: Some(writer), + file: Some(file), // Set immediately so readers can access it + batches_written: 0, + estimated_size: 0, + writer_finished: false, + waker: None, + })); + + // Re-acquire lock and push to shared queue + shared = self.shared.lock(); + shared.files.push_back(Arc::clone(&file_shared)); + shared.current_write_file = Some(file_shared); + shared.wake(); // Wake readers waiting for new files + } + + let current_write_file = shared.current_write_file.take(); + // Release shared lock before file I/O (fine-grained locking) + // This allows readers to access the queue while we do disk I/O + drop(shared); + + // Write batch to current file - lock only the specific file + if let Some(current_file) = current_write_file { + // Now lock just this file for I/O (separate from shared lock) + let mut file_shared = current_file.lock(); + + // Append the batch + if let Some(ref mut writer) = file_shared.writer { + writer.append_batch(batch)?; + file_shared.batches_written += 1; + file_shared.estimated_size += batch_size; + } + + // Wake reader waiting on this specific file + file_shared.wake(); + + // Check if we need to rotate + let needs_rotation = file_shared.estimated_size > self.max_file_size_bytes; + + if needs_rotation { + // Finish the IPC writer + if let Some(mut writer) = file_shared.writer.take() { + writer.finish()?; + } + // Mark as finished so readers know not to wait for more data + file_shared.writer_finished = true; + // Wake reader waiting on this file (it's now finished) + file_shared.wake(); + // Don't put back current_write_file - let it rotate + } else { + // Release file lock + drop(file_shared); + // Put back the current file for further writing + let mut shared = self.shared.lock(); + shared.current_write_file = Some(current_file); + } + } + + Ok(()) + } +} + +impl Drop for SpillPoolWriter { + fn drop(&mut self) { + let mut shared = self.shared.lock(); + + // Finalize the current file when the last writer is dropped + if let Some(current_file) = shared.current_write_file.take() { + // Release shared lock before locking file + drop(shared); + + let mut file_shared = current_file.lock(); + + // Finish the current writer if it exists + if let Some(mut writer) = file_shared.writer.take() { + // Ignore errors on drop - we're in destructor + let _ = writer.finish(); + } + + // Mark as finished so readers know not to wait for more data + file_shared.writer_finished = true; + + // Wake reader waiting on this file (it's now finished) + file_shared.wake(); + + drop(file_shared); + shared = self.shared.lock(); + } + + // Mark writer as dropped and wake pool-level readers + shared.writer_dropped = true; + shared.wake(); + } +} + +/// Creates a paired 
writer and reader for a spill pool with MPSC (multi-producer, single-consumer) +/// semantics. +/// +/// This is the recommended way to create a spill pool. The writer is `Clone`, allowing +/// multiple producers to coordinate writes to the same pool. The reader can consume batches +/// in FIFO order. The reader can start reading immediately after a writer appends a batch +/// to the spill file, without waiting for the file to be sealed, while writers continue to +/// write more data. +/// +/// Internally this coordinates rotating spill files based on size limits, and +/// handles asynchronous notification between the writer and reader using wakers. +/// This ensures that we manage disk usage efficiently while allowing concurrent +/// I/O between the writer and reader. +/// +/// # Data Flow Overview +/// +/// 1. Writer write batch `B0` to F1 +/// 2. Writer write batch `B1` to F1, notices the size limit exceeded, finishes F1. +/// 3. Reader read `B0` from F1 +/// 4. Reader read `B1`, no more batch to read -> wait on the waker +/// 5. Writer write batch `B2` to a new file `F2`, wake up the waiting reader. +/// 6. Reader read `B2` from F2. +/// 7. Repeat until writer is dropped. +/// +/// # Architecture +/// +/// ```text +/// ┌─────────────────────────────────────────────────────────────────────────┐ +/// │ SpillPool │ +/// │ │ +/// │ Writer Side Shared State Reader Side │ +/// │ ─────────── ──────────── ─────────── │ +/// │ │ +/// │ SpillPoolWriter ┌────────────────────┐ SpillPoolReader │ +/// │ │ │ VecDeque │ │ │ +/// │ │ │ ┌────┐┌────┐ │ │ │ +/// │ push_batch() │ │ F1 ││ F2 │ ... │ next().await │ +/// │ │ │ └────┘└────┘ │ │ │ +/// │ ▼ │ (FIFO order) │ ▼ │ +/// │ ┌─────────┐ │ │ ┌──────────┐ │ +/// │ │Current │───────▶│ Coordination: │◀───│ Current │ │ +/// │ │Write │ │ - Wakers │ │ Read │ │ +/// │ │File │ │ - Batch counts │ │ File │ │ +/// │ └─────────┘ │ - Writer status │ └──────────┘ │ +/// │ │ └────────────────────┘ │ │ +/// │ │ │ │ +/// │ Size > limit? Read all batches? │ +/// │ │ │ │ +/// │ ▼ ▼ │ +/// │ Rotate to new file Pop from queue │ +/// └─────────────────────────────────────────────────────────────────────────┘ +/// +/// Writer produces → Shared FIFO queue → Reader consumes +/// ``` +/// +/// # File State Machine +/// +/// Each file in the pool coordinates between writer and reader: +/// +/// ```text +/// Writer View Reader View +/// ─────────── ─────────── +/// +/// Created writer: Some(..) batches_read: 0 +/// batches_written: 0 (waiting for data) +/// │ +/// ▼ +/// Writing append_batch() Can read if: +/// batches_written++ batches_read < batches_written +/// wake readers +/// │ │ +/// │ ▼ +/// ┌──────┴──────┐ poll_next() → batch +/// │ │ batches_read++ +/// ▼ ▼ +/// Size > limit? More data? +/// │ │ +/// │ └─▶ Yes ──▶ Continue writing +/// ▼ +/// finish() Reader catches up: +/// writer_finished = true batches_read == batches_written +/// wake readers │ +/// │ ▼ +/// └─────────────────────▶ Returns Poll::Ready(None) +/// File complete, pop from queue +/// ``` +/// +/// # Arguments +/// +/// * `max_file_size_bytes` - Maximum size per file before rotation. When a file +/// exceeds this size, the writer automatically rotates to a new file. +/// * `spill_manager` - Manager for file creation and metrics tracking +/// +/// # Returns +/// +/// A tuple of `(SpillPoolWriter, SendableRecordBatchStream)` that share the same +/// underlying pool. The reader is returned as a stream for immediate use with +/// async stream combinators. 
+///
+/// # Example
+///
+/// ```
+/// use std::sync::Arc;
+/// use arrow::array::{ArrayRef, Int32Array};
+/// use arrow::datatypes::{DataType, Field, Schema};
+/// use arrow::record_batch::RecordBatch;
+/// use datafusion_execution::runtime_env::RuntimeEnv;
+/// use futures::StreamExt;
+///
+/// # use datafusion_physical_plan::spill::spill_pool;
+/// # use datafusion_physical_plan::spill::SpillManager; // Re-exported for doctests
+/// # use datafusion_physical_plan::metrics::{ExecutionPlanMetricsSet, SpillMetrics};
+/// #
+/// # #[tokio::main]
+/// # async fn main() -> datafusion_common::Result<()> {
+/// # // Setup for the example (typically comes from TaskContext in production)
+/// # let env = Arc::new(RuntimeEnv::default());
+/// # let metrics = SpillMetrics::new(&ExecutionPlanMetricsSet::new(), 0);
+/// # let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
+/// # let spill_manager = Arc::new(SpillManager::new(env, metrics, schema.clone()));
+/// #
+/// // Create channel with 1MB file size limit
+/// let (writer, mut reader) = spill_pool::channel(1024 * 1024, spill_manager);
+///
+/// // Spawn writer task to produce batches
+/// let write_handle = tokio::spawn(async move {
+///     for i in 0..5 {
+///         let array: ArrayRef = Arc::new(Int32Array::from(vec![i; 100]));
+///         let batch = RecordBatch::try_new(schema.clone(), vec![array]).unwrap();
+///         writer.push_batch(&batch).unwrap();
+///     }
+///     // Writer dropped here, finalizing current file
+/// });
+///
+/// // Reader consumes batches in FIFO order (can run concurrently with writer)
+/// let mut batches_read = 0;
+/// while let Some(result) = reader.next().await {
+///     let batch = result?;
+///     batches_read += 1;
+///     // Process batch...
+///     if batches_read == 5 {
+///         break; // Got all expected batches
+///     }
+/// }
+///
+/// write_handle.await.unwrap();
+/// assert_eq!(batches_read, 5);
+/// # Ok(())
+/// # }
+/// ```
+///
+/// # Why rotate files?
+///
+/// File rotation ensures we don't end up with unreferenced disk usage.
+/// If we used a single file for all spilled data, the beginning of the file would
+/// accumulate data that has already been read by readers but cannot be reclaimed,
+/// because a file cannot be truncated from the start.
+///
+/// Consider the case of a query like `SELECT * FROM large_table WHERE false`.
+/// Obviously this query produces no output rows, but if we had a spilling operator
+/// in the middle of this query between the scan and the filter, it would see the entire
+/// `large_table` flow through it and thus would spill all of that data to disk.
+/// So we'd end up using up to `size(large_table)` bytes of disk space.
+/// If instead we use file rotation, and as long as the readers can keep up with the writer,
+/// then we can ensure that once a file is fully read by all readers it can be deleted,
+/// thus bounding the maximum disk usage to roughly `max_file_size_bytes`.
+pub fn channel(
+    max_file_size_bytes: usize,
+    spill_manager: Arc<SpillManager>,
+) -> (SpillPoolWriter, SendableRecordBatchStream) {
+    let schema = Arc::clone(spill_manager.schema());
+    let shared = Arc::new(Mutex::new(SpillPoolShared::new(spill_manager)));
+
+    let writer = SpillPoolWriter {
+        max_file_size_bytes,
+        shared: Arc::clone(&shared),
+    };
+
+    let reader = SpillPoolReader::new(shared, schema);
+
+    (writer, Box::pin(reader))
+}
+
+/// Shared state between writer and readers for an active spill file.
+/// Protected by a Mutex to coordinate between concurrent readers and the writer.
+struct ActiveSpillFileShared {
+    /// Writer handle - taken (set to None) when finish() is called
+    writer: Option<InProgressSpillFile>,
+    /// The spill file, set when the writer finishes.
+    /// Taken by the reader when creating a stream (the file stays open via file handles).
+    file: Option<RefCountedTempFile>,
+    /// Total number of batches written to this file
+    batches_written: usize,
+    /// Estimated size in bytes of data written to this file
+    estimated_size: usize,
+    /// Whether the writer has finished writing to this file
+    writer_finished: bool,
+    /// Waker for reader waiting on this specific file (SPSC: only one reader)
+    waker: Option<Waker>,
+}
+
+impl ActiveSpillFileShared {
+    /// Registers a waker to be notified when new data is written to this file
+    fn register_waker(&mut self, waker: Waker) {
+        self.waker = Some(waker);
+    }
+
+    /// Wakes the reader waiting on this file
+    fn wake(&mut self) {
+        if let Some(waker) = self.waker.take() {
+            waker.wake();
+        }
+    }
+}
+
+/// Reader state for a SpillFile (owned by individual SpillFile instances).
+/// This is kept separate from the shared state to avoid holding locks during I/O.
+struct SpillFileReader {
+    /// The actual stream reading from disk
+    stream: SendableRecordBatchStream,
+    /// Number of batches this reader has consumed
+    batches_read: usize,
+}
+
+struct SpillFile {
+    /// Shared coordination state (contains writer and batch counts)
+    shared: Arc<Mutex<ActiveSpillFileShared>>,
+    /// Reader state (lazy-initialized, owned by this SpillFile)
+    reader: Option<SpillFileReader>,
+    /// Spill manager for creating readers
+    spill_manager: Arc<SpillManager>,
+}
+
+impl Stream for SpillFile {
+    type Item = Result<RecordBatch>;
+
+    fn poll_next(
+        mut self: std::pin::Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+    ) -> std::task::Poll<Option<Self::Item>> {
+        use std::task::Poll;
+
+        // Step 1: Lock shared state and check coordination
+        let (should_read, file) = {
+            let mut shared = self.shared.lock();
+
+            // Determine if we can read
+            let batches_read = self.reader.as_ref().map_or(0, |r| r.batches_read);
+
+            if batches_read < shared.batches_written {
+                // More data available to read - take the file if we don't have a reader yet
+                let file = if self.reader.is_none() {
+                    shared.file.take()
+                } else {
+                    None
+                };
+                (true, file)
+            } else if shared.writer_finished {
+                // No more data and writer is done - EOF
+                return Poll::Ready(None);
+            } else {
+                // Caught up to writer, but writer still active - register waker and wait
+                shared.register_waker(cx.waker().clone());
+                return Poll::Pending;
+            }
+        }; // Lock released here
+
+        // Step 2: Lazy-create reader stream if needed
+        if self.reader.is_none() && should_read {
+            if let Some(file) = file {
+                match self.spill_manager.read_spill_as_stream(file, None) {
+                    Ok(stream) => {
+                        self.reader = Some(SpillFileReader {
+                            stream,
+                            batches_read: 0,
+                        });
+                    }
+                    Err(e) => return Poll::Ready(Some(Err(e))),
+                }
+            } else {
+                // File not available yet (writer hasn't finished or already taken)
+                // Register waker and wait for file to be ready
+                let mut shared = self.shared.lock();
+                shared.register_waker(cx.waker().clone());
+                return Poll::Pending;
+            }
+        }
+
+        // Step 3: Poll the reader stream (no lock held)
+        if let Some(reader) = &mut self.reader {
+            match reader.stream.poll_next_unpin(cx) {
+                Poll::Ready(Some(Ok(batch))) => {
+                    // Successfully read a batch - increment counter
+                    reader.batches_read += 1;
+                    Poll::Ready(Some(Ok(batch)))
+                }
+                Poll::Ready(Some(Err(e))) => Poll::Ready(Some(Err(e))),
+                Poll::Ready(None) => {
+                    // Stream exhausted unexpectedly
+                    // This shouldn't happen if coordination is correct, but handle gracefully
+                    Poll::Ready(None)
+                }
+                Poll::Pending => Poll::Pending,
+            }
+        } else {
+            // Should not reach here, but handle gracefully
+            Poll::Ready(None)
+        }
+    }
+}
+
+/// A stream that reads from a SpillPool in FIFO order.
+///
+/// Created by [`channel`]. See that function for architecture diagrams and usage examples.
+///
+/// The stream automatically handles file rotation and reads from completed files.
+/// When no data is available, it returns `Poll::Pending` and registers a waker to
+/// be notified when the writer produces more data.
+///
+/// # Infinite Stream Semantics
+///
+/// This stream never returns `None` (`Poll::Ready(None)`) on its own - it will keep
+/// waiting for the writer to produce more data. The stream ends only when:
+/// - The reader is dropped
+/// - The writer is dropped AND all queued data has been consumed
+///
+/// This makes it suitable for continuous streaming scenarios where the writer may
+/// produce data intermittently.
+pub struct SpillPoolReader {
+    /// Shared reference to the spill pool
+    shared: Arc<Mutex<SpillPoolShared>>,
+    /// Current SpillFile we're reading from
+    current_file: Option<SpillFile>,
+    /// Schema of the spilled data
+    schema: SchemaRef,
+}
+
+impl SpillPoolReader {
+    /// Creates a new reader from shared pool state.
+    ///
+    /// This is private - use the `channel()` function to create a reader/writer pair.
+    ///
+    /// # Arguments
+    ///
+    /// * `shared` - Shared reference to the pool state
+    fn new(shared: Arc<Mutex<SpillPoolShared>>, schema: SchemaRef) -> Self {
+        Self {
+            shared,
+            current_file: None,
+            schema,
+        }
+    }
+}
+
+impl Stream for SpillPoolReader {
+    type Item = Result<RecordBatch>;
+
+    fn poll_next(
+        mut self: std::pin::Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+    ) -> std::task::Poll<Option<Self::Item>> {
+        use std::task::Poll;
+
+        loop {
+            // If we have a current file, try to read from it
+            if let Some(ref mut file) = self.current_file {
+                match file.poll_next_unpin(cx) {
+                    Poll::Ready(Some(Ok(batch))) => {
+                        // Got a batch, return it
+                        return Poll::Ready(Some(Ok(batch)));
+                    }
+                    Poll::Ready(Some(Err(e))) => {
+                        // Error reading batch
+                        return Poll::Ready(Some(Err(e)));
+                    }
+                    Poll::Ready(None) => {
+                        // Current file stream exhausted
+                        // Check if this file is marked as writer_finished
+                        let writer_finished = { file.shared.lock().writer_finished };
+
+                        if writer_finished {
+                            // File is complete, pop it from the queue and move to next
+                            let mut shared = self.shared.lock();
+                            shared.files.pop_front();
+                            drop(shared); // Release lock
+
+                            // Clear current file and continue loop to get next file
+                            self.current_file = None;
+                            continue;
+                        } else {
+                            // Stream exhausted but writer not finished - unexpected
+                            // This shouldn't happen with proper coordination
+                            return Poll::Ready(None);
+                        }
+                    }
+                    Poll::Pending => {
+                        // File not ready yet (waiting for writer)
+                        // Register waker so we get notified when writer adds more batches
+                        let mut shared = self.shared.lock();
+                        shared.register_waker(cx.waker().clone());
+                        return Poll::Pending;
+                    }
+                }
+            }
+
+            // No current file, need to get the next one
+            let mut shared = self.shared.lock();
+
+            // Peek at the front of the queue (don't pop yet)
+            if let Some(file_shared) = shared.files.front() {
+                // Create a SpillFile from the shared state
+                let spill_manager = Arc::clone(&shared.spill_manager);
+                let file_shared = Arc::clone(file_shared);
+                drop(shared); // Release lock before creating SpillFile
+
+                self.current_file = Some(SpillFile {
+                    shared: file_shared,
+                    reader: None,
+                    spill_manager,
+                });
+
+                // Continue loop to poll the new file
+                continue;
+            }
+
+            // No files in queue -
check if writer is done + if shared.writer_dropped { + // Writer is done and no more files will be added - EOF + return Poll::Ready(None); + } + + // Writer still active, register waker that will get notified when new files are added + shared.register_waker(cx.waker().clone()); + return Poll::Pending; + } + } +} + +impl RecordBatchStream for SpillPoolReader { + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::metrics::{ExecutionPlanMetricsSet, SpillMetrics}; + use arrow::array::{ArrayRef, Int32Array}; + use arrow::datatypes::{DataType, Field, Schema}; + use datafusion_common_runtime::SpawnedTask; + use datafusion_execution::runtime_env::RuntimeEnv; + use futures::StreamExt; + + fn create_test_schema() -> SchemaRef { + Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])) + } + + fn create_test_batch(start: i32, count: usize) -> RecordBatch { + let schema = create_test_schema(); + let a: ArrayRef = Arc::new(Int32Array::from( + (start..start + count as i32).collect::>(), + )); + RecordBatch::try_new(schema, vec![a]).unwrap() + } + + fn create_spill_channel( + max_file_size: usize, + ) -> (SpillPoolWriter, SendableRecordBatchStream) { + let env = Arc::new(RuntimeEnv::default()); + let metrics = SpillMetrics::new(&ExecutionPlanMetricsSet::new(), 0); + let schema = create_test_schema(); + let spill_manager = Arc::new(SpillManager::new(env, metrics, schema)); + + channel(max_file_size, spill_manager) + } + + fn create_spill_channel_with_metrics( + max_file_size: usize, + ) -> (SpillPoolWriter, SendableRecordBatchStream, SpillMetrics) { + let env = Arc::new(RuntimeEnv::default()); + let metrics = SpillMetrics::new(&ExecutionPlanMetricsSet::new(), 0); + let schema = create_test_schema(); + let spill_manager = Arc::new(SpillManager::new(env, metrics.clone(), schema)); + + let (writer, reader) = channel(max_file_size, spill_manager); + (writer, reader, metrics) + } + + #[tokio::test] + async fn test_basic_write_and_read() -> Result<()> { + let (writer, mut reader) = create_spill_channel(1024 * 1024); + + // Write one batch + let batch1 = create_test_batch(0, 10); + writer.push_batch(&batch1)?; + + // Read the batch + let result = reader.next().await.unwrap()?; + assert_eq!(result.num_rows(), 10); + + // Write another batch + let batch2 = create_test_batch(10, 5); + writer.push_batch(&batch2)?; + // Read the second batch + let result = reader.next().await.unwrap()?; + assert_eq!(result.num_rows(), 5); + + Ok(()) + } + + #[tokio::test] + async fn test_single_batch_write_read() -> Result<()> { + let (writer, mut reader) = create_spill_channel(1024 * 1024); + + // Write one batch + let batch = create_test_batch(0, 5); + writer.push_batch(&batch)?; + + // Read it back + let result = reader.next().await.unwrap()?; + assert_eq!(result.num_rows(), 5); + + // Verify the actual data + let col = result + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(col.value(0), 0); + assert_eq!(col.value(4), 4); + + Ok(()) + } + + #[tokio::test] + async fn test_multiple_batches_sequential() -> Result<()> { + let (writer, mut reader) = create_spill_channel(1024 * 1024); + + // Write multiple batches + for i in 0..5 { + let batch = create_test_batch(i * 10, 10); + writer.push_batch(&batch)?; + } + + // Read all batches and verify FIFO order + for i in 0..5 { + let result = reader.next().await.unwrap()?; + assert_eq!(result.num_rows(), 10); + + let col = result + .column(0) + .as_any() + .downcast_ref::() + 
.unwrap(); + assert_eq!(col.value(0), i * 10, "Batch {i} not in FIFO order"); + } + + Ok(()) + } + + #[tokio::test] + async fn test_empty_writer() -> Result<()> { + let (_writer, reader) = create_spill_channel(1024 * 1024); + + // Reader should pend since no batches were written + let mut reader = reader; + let result = + tokio::time::timeout(std::time::Duration::from_millis(100), reader.next()) + .await; + + assert!(result.is_err(), "Reader should timeout on empty writer"); + + Ok(()) + } + + #[tokio::test] + async fn test_empty_batch_skipping() -> Result<()> { + let (writer, mut reader) = create_spill_channel(1024 * 1024); + + // Write empty batch + let empty_batch = create_test_batch(0, 0); + writer.push_batch(&empty_batch)?; + + // Write non-empty batch + let batch = create_test_batch(0, 5); + writer.push_batch(&batch)?; + + // Should only read the non-empty batch + let result = reader.next().await.unwrap()?; + assert_eq!(result.num_rows(), 5); + + Ok(()) + } + + #[tokio::test] + async fn test_rotation_triggered_by_size() -> Result<()> { + // Set a small max_file_size to trigger rotation after one batch + let batch1 = create_test_batch(0, 10); + let batch_size = batch1.get_array_memory_size() + 1; + + let (writer, mut reader, metrics) = create_spill_channel_with_metrics(batch_size); + + // Write first batch (should fit in first file) + writer.push_batch(&batch1)?; + + // Check metrics after first batch - file created but not finalized yet + assert_eq!( + metrics.spill_file_count.value(), + 1, + "Should have created 1 file after first batch" + ); + assert_eq!( + metrics.spilled_bytes.value(), + 0, + "Spilled bytes should be 0 before file finalization" + ); + assert_eq!( + metrics.spilled_rows.value(), + 10, + "Should have spilled 10 rows from first batch" + ); + + // Write second batch (should trigger rotation - finalize first file) + let batch2 = create_test_batch(10, 10); + assert!( + batch2.get_array_memory_size() <= batch_size, + "batch2 size {} exceeds limit {batch_size}", + batch2.get_array_memory_size(), + ); + assert!( + batch1.get_array_memory_size() + batch2.get_array_memory_size() > batch_size, + "Combined size {} does not exceed limit to trigger rotation", + batch1.get_array_memory_size() + batch2.get_array_memory_size() + ); + writer.push_batch(&batch2)?; + + // Check metrics after rotation - first file finalized, but second file not created yet + // (new file created lazily on next push_batch call) + assert_eq!( + metrics.spill_file_count.value(), + 1, + "Should still have 1 file (second file not created until next write)" + ); + assert!( + metrics.spilled_bytes.value() > 0, + "Spilled bytes should be > 0 after first file finalized (got {})", + metrics.spilled_bytes.value() + ); + assert_eq!( + metrics.spilled_rows.value(), + 20, + "Should have spilled 20 total rows (10 + 10)" + ); + + // Write a third batch to confirm rotation occurred (creates second file) + let batch3 = create_test_batch(20, 5); + writer.push_batch(&batch3)?; + + // Now check that second file was created + assert_eq!( + metrics.spill_file_count.value(), + 2, + "Should have created 2 files after writing to new file" + ); + assert_eq!( + metrics.spilled_rows.value(), + 25, + "Should have spilled 25 total rows (10 + 10 + 5)" + ); + + // Read all three batches + let result1 = reader.next().await.unwrap()?; + assert_eq!(result1.num_rows(), 10); + + let result2 = reader.next().await.unwrap()?; + assert_eq!(result2.num_rows(), 10); + + let result3 = reader.next().await.unwrap()?; + 
assert_eq!(result3.num_rows(), 5); + + Ok(()) + } + + #[tokio::test] + async fn test_multiple_rotations() -> Result<()> { + let batches = (0..10) + .map(|i| create_test_batch(i * 10, 10)) + .collect::>(); + + let batch_size = batches[0].get_array_memory_size() * 2 + 1; + + // Very small max_file_size to force frequent rotations + let (writer, mut reader, metrics) = create_spill_channel_with_metrics(batch_size); + + // Write many batches to cause multiple rotations + for i in 0..10 { + let batch = create_test_batch(i * 10, 10); + writer.push_batch(&batch)?; + } + + // Check metrics after all writes - should have multiple files due to rotations + // With batch_size = 2 * one_batch + 1, each file fits ~2 batches before rotating + // 10 batches should create multiple files (exact count depends on rotation timing) + let file_count = metrics.spill_file_count.value(); + assert!( + file_count >= 4, + "Should have created at least 4 files with multiple rotations (got {file_count})" + ); + assert!( + metrics.spilled_bytes.value() > 0, + "Spilled bytes should be > 0 after rotations (got {})", + metrics.spilled_bytes.value() + ); + assert_eq!( + metrics.spilled_rows.value(), + 100, + "Should have spilled 100 total rows (10 batches * 10 rows)" + ); + + // Read all batches and verify order + for i in 0..10 { + let result = reader.next().await.unwrap()?; + assert_eq!(result.num_rows(), 10); + + let col = result + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!( + col.value(0), + i * 10, + "Batch {i} not in correct order after rotations" + ); + } + + Ok(()) + } + + #[tokio::test] + async fn test_single_batch_larger_than_limit() -> Result<()> { + // Very small limit + let (writer, mut reader, metrics) = create_spill_channel_with_metrics(100); + + // Write a batch that exceeds the limit + let large_batch = create_test_batch(0, 100); + writer.push_batch(&large_batch)?; + + // Check metrics after large batch - should trigger rotation immediately + assert_eq!( + metrics.spill_file_count.value(), + 1, + "Should have created 1 file for large batch" + ); + assert_eq!( + metrics.spilled_rows.value(), + 100, + "Should have spilled 100 rows from large batch" + ); + + // Should still write and read successfully + let result = reader.next().await.unwrap()?; + assert_eq!(result.num_rows(), 100); + + // Next batch should go to a new file + let batch2 = create_test_batch(100, 10); + writer.push_batch(&batch2)?; + + // Check metrics after second batch - should have rotated to a new file + assert_eq!( + metrics.spill_file_count.value(), + 2, + "Should have created 2 files after rotation" + ); + assert_eq!( + metrics.spilled_rows.value(), + 110, + "Should have spilled 110 total rows (100 + 10)" + ); + + let result2 = reader.next().await.unwrap()?; + assert_eq!(result2.num_rows(), 10); + + Ok(()) + } + + #[tokio::test] + async fn test_very_small_max_file_size() -> Result<()> { + // Test with just 1 byte max (extreme case) + let (writer, mut reader) = create_spill_channel(1); + + // Any batch will exceed this limit + let batch = create_test_batch(0, 5); + writer.push_batch(&batch)?; + + // Should still work + let result = reader.next().await.unwrap()?; + assert_eq!(result.num_rows(), 5); + + Ok(()) + } + + #[tokio::test] + async fn test_exact_size_boundary() -> Result<()> { + // Create a batch and measure its approximate size + let batch = create_test_batch(0, 10); + let batch_size = batch.get_array_memory_size(); + + // Set max_file_size to exactly the batch size + let (writer, mut reader, metrics) = 
create_spill_channel_with_metrics(batch_size); + + // Write first batch (exactly at the size limit) + writer.push_batch(&batch)?; + + // Check metrics after first batch - should NOT rotate yet (size == limit, not >) + assert_eq!( + metrics.spill_file_count.value(), + 1, + "Should have created 1 file after first batch at exact boundary" + ); + assert_eq!( + metrics.spilled_rows.value(), + 10, + "Should have spilled 10 rows from first batch" + ); + + // Write second batch (exceeds the limit, should trigger rotation) + let batch2 = create_test_batch(10, 10); + writer.push_batch(&batch2)?; + + // Check metrics after second batch - rotation triggered, first file finalized + // Note: second file not created yet (lazy creation on next write) + assert_eq!( + metrics.spill_file_count.value(), + 1, + "Should still have 1 file after rotation (second file created lazily)" + ); + assert_eq!( + metrics.spilled_rows.value(), + 20, + "Should have spilled 20 total rows (10 + 10)" + ); + // Verify first file was finalized by checking spilled_bytes + assert!( + metrics.spilled_bytes.value() > 0, + "Spilled bytes should be > 0 after file finalization (got {})", + metrics.spilled_bytes.value() + ); + + // Both should be readable + let result1 = reader.next().await.unwrap()?; + assert_eq!(result1.num_rows(), 10); + + let result2 = reader.next().await.unwrap()?; + assert_eq!(result2.num_rows(), 10); + + // Spill another batch, now we should see the second file created + let batch3 = create_test_batch(20, 5); + writer.push_batch(&batch3)?; + assert_eq!( + metrics.spill_file_count.value(), + 2, + "Should have created 2 files after writing to new file" + ); + + Ok(()) + } + + #[tokio::test] + async fn test_concurrent_reader_writer() -> Result<()> { + let (writer, mut reader) = create_spill_channel(1024 * 1024); + + // Spawn writer task + let writer_handle = SpawnedTask::spawn(async move { + for i in 0..10 { + let batch = create_test_batch(i * 10, 10); + writer.push_batch(&batch).unwrap(); + // Small delay to simulate real concurrent work + tokio::time::sleep(std::time::Duration::from_millis(5)).await; + } + }); + + // Reader task (runs concurrently) + let reader_handle = SpawnedTask::spawn(async move { + let mut count = 0; + for i in 0..10 { + let result = reader.next().await.unwrap().unwrap(); + assert_eq!(result.num_rows(), 10); + + let col = result + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(col.value(0), i * 10); + count += 1; + } + count + }); + + // Wait for both to complete + writer_handle.await.unwrap(); + let batches_read = reader_handle.await.unwrap(); + assert_eq!(batches_read, 10); + + Ok(()) + } + + #[tokio::test] + async fn test_reader_catches_up_to_writer() -> Result<()> { + let (writer, mut reader) = create_spill_channel(1024 * 1024); + + #[derive(Clone, Copy, Debug, PartialEq, Eq)] + enum ReadWriteEvent { + ReadStart, + Read(usize), + Write(usize), + } + + let events = Arc::new(Mutex::new(vec![])); + // Start reader first (will pend) + let reader_events = Arc::clone(&events); + let reader_handle = SpawnedTask::spawn(async move { + reader_events.lock().push(ReadWriteEvent::ReadStart); + let result = reader.next().await.unwrap().unwrap(); + reader_events + .lock() + .push(ReadWriteEvent::Read(result.num_rows())); + let result = reader.next().await.unwrap().unwrap(); + reader_events + .lock() + .push(ReadWriteEvent::Read(result.num_rows())); + }); + + // Give reader time to start pending + tokio::time::sleep(std::time::Duration::from_millis(5)).await; + + // Now write a 
batch (should wake the reader) + let batch = create_test_batch(0, 5); + events.lock().push(ReadWriteEvent::Write(batch.num_rows())); + writer.push_batch(&batch)?; + + // Wait for the reader to process + let processed = async { + loop { + if events.lock().len() >= 3 { + break; + } + tokio::time::sleep(std::time::Duration::from_micros(500)).await; + } + }; + tokio::time::timeout(std::time::Duration::from_secs(1), processed) + .await + .unwrap(); + + // Write another batch + let batch = create_test_batch(5, 10); + events.lock().push(ReadWriteEvent::Write(batch.num_rows())); + writer.push_batch(&batch)?; + + // Reader should complete + reader_handle.await.unwrap(); + let events = events.lock().clone(); + assert_eq!( + events, + vec![ + ReadWriteEvent::ReadStart, + ReadWriteEvent::Write(5), + ReadWriteEvent::Read(5), + ReadWriteEvent::Write(10), + ReadWriteEvent::Read(10) + ] + ); + + Ok(()) + } + + #[tokio::test] + async fn test_reader_starts_after_writer_finishes() -> Result<()> { + let (writer, reader) = create_spill_channel(128); + + // Writer writes all data + for i in 0..5 { + let batch = create_test_batch(i * 10, 10); + writer.push_batch(&batch)?; + } + + drop(writer); + + // Now start reader + let mut reader = reader; + let mut count = 0; + for i in 0..5 { + let result = reader.next().await.unwrap()?; + assert_eq!(result.num_rows(), 10); + + let col = result + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(col.value(0), i * 10); + count += 1; + } + + assert_eq!(count, 5, "Should read all batches after writer finishes"); + + Ok(()) + } + + #[tokio::test] + async fn test_writer_drop_finalizes_file() -> Result<()> { + let env = Arc::new(RuntimeEnv::default()); + let metrics = SpillMetrics::new(&ExecutionPlanMetricsSet::new(), 0); + let schema = create_test_schema(); + let spill_manager = + Arc::new(SpillManager::new(Arc::clone(&env), metrics.clone(), schema)); + + let (writer, mut reader) = channel(1024 * 1024, spill_manager); + + // Write some batches + for i in 0..5 { + let batch = create_test_batch(i * 10, 10); + writer.push_batch(&batch)?; + } + + // Check metrics before drop - spilled_bytes should be 0 since file isn't finalized yet + let spilled_bytes_before = metrics.spilled_bytes.value(); + assert_eq!( + spilled_bytes_before, 0, + "Spilled bytes should be 0 before writer is dropped" + ); + + // Explicitly drop the writer - this should finalize the current file + drop(writer); + + // Check metrics after drop - spilled_bytes should be > 0 now + let spilled_bytes_after = metrics.spilled_bytes.value(); + assert!( + spilled_bytes_after > 0, + "Spilled bytes should be > 0 after writer is dropped (got {spilled_bytes_after})" + ); + + // Verify reader can still read all batches + let mut count = 0; + for i in 0..5 { + let result = reader.next().await.unwrap()?; + assert_eq!(result.num_rows(), 10); + + let col = result + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(col.value(0), i * 10); + count += 1; + } + + assert_eq!(count, 5, "Should read all batches after writer is dropped"); + + Ok(()) + } + + #[tokio::test] + async fn test_disk_usage_decreases_as_files_consumed() -> Result<()> { + use datafusion_execution::runtime_env::RuntimeEnvBuilder; + + // Test configuration + const NUM_BATCHES: usize = 3; + const ROWS_PER_BATCH: usize = 100; + + // Step 1: Create a test batch and measure its size + let batch = create_test_batch(0, ROWS_PER_BATCH); + let batch_size = batch.get_array_memory_size(); + + // Step 2: Configure file rotation to 
approximately 1 batch per file + // Create a custom RuntimeEnv so we can access the DiskManager + let runtime = Arc::new(RuntimeEnvBuilder::default().build()?); + let disk_manager = Arc::clone(&runtime.disk_manager); + + let metrics = SpillMetrics::new(&ExecutionPlanMetricsSet::new(), 0); + let schema = create_test_schema(); + let spill_manager = Arc::new(SpillManager::new(runtime, metrics.clone(), schema)); + + let (writer, mut reader) = channel(batch_size, spill_manager); + + // Step 3: Write NUM_BATCHES batches to create approximately NUM_BATCHES files + for i in 0..NUM_BATCHES { + let start = (i * ROWS_PER_BATCH) as i32; + writer.push_batch(&create_test_batch(start, ROWS_PER_BATCH))?; + } + + // Check how many files were created (should be at least a few due to file rotation) + let file_count = metrics.spill_file_count.value(); + assert_eq!( + file_count, + NUM_BATCHES - 1, + "Expected at {} files with rotation, got {file_count}", + NUM_BATCHES - 1 + ); + + // Step 4: Verify initial disk usage reflects all files + let initial_disk_usage = disk_manager.used_disk_space(); + assert!( + initial_disk_usage > 0, + "Expected disk usage > 0 after writing batches, got {initial_disk_usage}" + ); + + // Step 5: Read NUM_BATCHES - 1 batches (all but 1) + // As each file is fully consumed, it should be dropped and disk usage should decrease + for i in 0..(NUM_BATCHES - 1) { + let result = reader.next().await.unwrap()?; + assert_eq!(result.num_rows(), ROWS_PER_BATCH); + + let col = result + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(col.value(0), (i * ROWS_PER_BATCH) as i32); + } + + // Step 6: Verify disk usage decreased but is not zero (at least 1 batch remains) + let partial_disk_usage = disk_manager.used_disk_space(); + assert!( + partial_disk_usage > 0 + && partial_disk_usage < (batch_size * NUM_BATCHES * 2) as u64, + "Disk usage should be > 0 with remaining batches" + ); + assert!( + partial_disk_usage < initial_disk_usage, + "Disk usage should have decreased after reading most batches: initial={initial_disk_usage}, partial={partial_disk_usage}" + ); + + // Step 7: Read the final batch + let result = reader.next().await.unwrap()?; + assert_eq!(result.num_rows(), ROWS_PER_BATCH); + + // Step 8: Drop writer first to signal no more data will be written + // The reader has infinite stream semantics and will wait for the writer + // to be dropped before returning None + drop(writer); + + // Verify we've read all batches - now the reader should return None + assert!( + reader.next().await.is_none(), + "Should have no more batches to read" + ); + + // Step 9: Drop reader to release all references + drop(reader); + + // Step 10: Verify complete cleanup - disk usage should be 0 + let final_disk_usage = disk_manager.used_disk_space(); + assert_eq!( + final_disk_usage, 0, + "Disk usage should be 0 after all files dropped, got {final_disk_usage}" + ); + + Ok(()) + } +} diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index 7009d976d646f..7a34b240bd7c7 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -223,6 +223,7 @@ datafusion.execution.keep_partition_by_columns false datafusion.execution.listing_table_factory_infer_partitions true datafusion.execution.listing_table_ignore_subdirectory true datafusion.execution.max_buffered_batches_per_output_file 2 +datafusion.execution.max_spill_file_size_bytes 134217728 
datafusion.execution.meta_fetch_concurrency 32 datafusion.execution.minimum_parallel_output_files 4 datafusion.execution.objectstore_writer_buffer_size 10485760 @@ -343,6 +344,7 @@ datafusion.execution.keep_partition_by_columns false Should DataFusion keep the datafusion.execution.listing_table_factory_infer_partitions true Should a `ListingTable` created through the `ListingTableFactory` infer table partitions from Hive compliant directories. Defaults to true (partition columns are inferred and will be represented in the table schema). datafusion.execution.listing_table_ignore_subdirectory true Should sub directories be ignored when scanning directories for data files. Defaults to true (ignores subdirectories), consistent with Hive. Note that this setting does not affect reading partitioned tables (e.g. `/table/year=2021/month=01/data.parquet`). datafusion.execution.max_buffered_batches_per_output_file 2 This is the maximum number of RecordBatches buffered for each output file being worked. Higher values can potentially give faster write performance at the cost of higher peak memory consumption +datafusion.execution.max_spill_file_size_bytes 134217728 Maximum size in bytes for individual spill files before rotating to a new file. When operators spill data to disk (e.g., RepartitionExec), they write multiple batches to the same file until this size limit is reached, then rotate to a new file. This reduces syscall overhead compared to one-file-per-batch while preventing files from growing too large. A larger value reduces file creation overhead but may hold more disk space. A smaller value creates more files but allows finer-grained space reclamation as files can be deleted once fully consumed. Now only `RepartitionExec` supports this spill file rotation feature, other spilling operators may create spill files larger than the limit. Default: 128 MB datafusion.execution.meta_fetch_concurrency 32 Number of files to read in parallel when inferring schema and statistics datafusion.execution.minimum_parallel_output_files 4 Guarantees a minimum level of output files running in parallel. RecordBatches will be distributed in round robin fashion to each parallel writer. Each writer is closed and a new file opened once soft_max_rows_per_output_file is reached. datafusion.execution.objectstore_writer_buffer_size 10485760 Size (bytes) of data buffer DataFusion uses when writing output files. This affects the size of the data chunks that are uploaded to remote object stores (e.g. AWS S3). If very large (>= 100 GiB) output files are being written, it may be necessary to increase this size to avoid errors from the remote end point. diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 5950a4fa9a6a9..9f2a3c6085083 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -114,6 +114,7 @@ The following configuration settings are available: | datafusion.execution.spill_compression | uncompressed | Sets the compression codec used when spilling data to disk. Since datafusion writes spill files using the Arrow IPC Stream format, only codecs supported by the Arrow IPC Stream Writer are allowed. Valid values are: uncompressed, lz4_frame, zstd. Note: lz4_frame offers faster (de)compression, but typically results in larger spill files. In contrast, zstd achieves higher compression ratios at the cost of slower (de)compression speed. 
| | datafusion.execution.sort_spill_reservation_bytes | 10485760 | Specifies the reserved memory for each spillable sort operation to facilitate an in-memory merge. When a sort operation spills to disk, the in-memory data must be sorted and merged before being written to a file. This setting reserves a specific amount of memory for that in-memory sort/merge process. Note: This setting is irrelevant if the sort operation cannot spill (i.e., if there's no `DiskManager` configured). | | datafusion.execution.sort_in_place_threshold_bytes | 1048576 | When sorting, below what size should data be concatenated and sorted in a single RecordBatch rather than sorted in batches and merged. | +| datafusion.execution.max_spill_file_size_bytes | 134217728 | Maximum size in bytes for individual spill files before rotating to a new file. When operators spill data to disk (e.g., RepartitionExec), they write multiple batches to the same file until this size limit is reached, then rotate to a new file. This reduces syscall overhead compared to one-file-per-batch while preventing files from growing too large. A larger value reduces file creation overhead but may hold more disk space. A smaller value creates more files but allows finer-grained space reclamation as files can be deleted once fully consumed. Now only `RepartitionExec` supports this spill file rotation feature, other spilling operators may create spill files larger than the limit. Default: 128 MB | | datafusion.execution.meta_fetch_concurrency | 32 | Number of files to read in parallel when inferring schema and statistics | | datafusion.execution.minimum_parallel_output_files | 4 | Guarantees a minimum level of output files running in parallel. RecordBatches will be distributed in round robin fashion to each parallel writer. Each writer is closed and a new file opened once soft_max_rows_per_output_file is reached. | | datafusion.execution.soft_max_rows_per_output_file | 50000000 | Target number of rows in output files when writing multiple. This is a soft max, so it can be exceeded slightly. There also will be one file smaller than the limit if the total number of rows written is not roughly divisible by the soft max | From 969fc138dd7c109f7348b0fd0830239f8819d858 Mon Sep 17 00:00:00 2001 From: Yu-Chuan Hung <86523891+CuteChuanChuan@users.noreply.github.com> Date: Sat, 8 Nov 2025 06:37:36 +0800 Subject: [PATCH 0020/1589] chore: Format examples in doc strings - spark, sql, sqllogictest, sibstrait (#18443) ## Which issue does this PR close? Part of #16915 ## Rationale for this change Format code examples in documentation comments to improve readability and maintain consistent code style across the codebase. This is part of a multi-PR effort to format all doc comment examples and eventually enable CI checks to enforce this formatting. ## What changes are included in this PR? Run `cargo fmt -p -- --config format_code_in_doc_comments=true` for the following datasource-related crates: - `datafusion-spark` - `datafusion-sql` - `datafusion-sqllogictest` - `datafusion-substrait` - `datafusion-cli` - `datafusion-examples` ## Are these changes tested? No testing needed - this is purely a formatting change with no functional modifications. ## Are there any user-facing changes? No - this only affects documentation formatting. 
--------- Co-authored-by: Andrew Lamb --- .../examples/advanced_parquet_index.rs | 1 - .../external_dependency/query-aws-s3.rs | 1 - .../examples/flight/sql_server.rs | 1 - datafusion-examples/examples/parquet_index.rs | 1 - datafusion-examples/examples/sql_query.rs | 1 - datafusion-examples/examples/thread_pools.rs | 2 +- .../spark/src/function/bitwise/bit_shift.rs | 3 --- .../spark/src/function/url/parse_url.rs | 2 -- datafusion/spark/src/lib.rs | 2 +- datafusion/sql/src/parser.rs | 13 +++++---- datafusion/sql/src/planner.rs | 1 - datafusion/sql/src/resolve.rs | 12 ++++----- datafusion/sql/src/unparser/expr.rs | 5 ++-- datafusion/sql/src/unparser/plan.rs | 8 ++++-- datafusion/sql/src/unparser/rewrite.rs | 1 - datafusion/sql/src/utils.rs | 1 - datafusion/sql/tests/cases/diagnostic.rs | 10 ++++--- .../engines/datafusion_engine/normalize.rs | 1 - datafusion/substrait/src/lib.rs | 27 +++++++++++-------- .../consumer/substrait_consumer.rs | 1 - 20 files changed, 44 insertions(+), 50 deletions(-) diff --git a/datafusion-examples/examples/advanced_parquet_index.rs b/datafusion-examples/examples/advanced_parquet_index.rs index 1c560be6d08a6..371c18de354ce 100644 --- a/datafusion-examples/examples/advanced_parquet_index.rs +++ b/datafusion-examples/examples/advanced_parquet_index.rs @@ -121,7 +121,6 @@ use url::Url; /// │ ╚═══════════════════╝ │ 1. With cached ParquetMetadata, so /// └───────────────────────┘ the ParquetSource does not re-read / /// Parquet File decode the thrift footer -/// /// ``` /// /// Within a Row Group, Column Chunks store data in DataPages. This example also diff --git a/datafusion-examples/examples/external_dependency/query-aws-s3.rs b/datafusion-examples/examples/external_dependency/query-aws-s3.rs index da2d7e4879f99..cd0b4562d5f2d 100644 --- a/datafusion-examples/examples/external_dependency/query-aws-s3.rs +++ b/datafusion-examples/examples/external_dependency/query-aws-s3.rs @@ -28,7 +28,6 @@ use url::Url; /// /// - AWS_ACCESS_KEY_ID /// - AWS_SECRET_ACCESS_KEY -/// #[tokio::main] async fn main() -> Result<()> { let ctx = SessionContext::new(); diff --git a/datafusion-examples/examples/flight/sql_server.rs b/datafusion-examples/examples/flight/sql_server.rs index fc7d0817bd5fa..d86860f9d4364 100644 --- a/datafusion-examples/examples/flight/sql_server.rs +++ b/datafusion-examples/examples/flight/sql_server.rs @@ -68,7 +68,6 @@ macro_rules! 
status { /// /// Based heavily on Ballista's implementation: https://github.com/apache/datafusion-ballista/blob/main/ballista/scheduler/src/flight_sql.rs /// and the example in arrow-rs: https://github.com/apache/arrow-rs/blob/master/arrow-flight/examples/flight_sql_server.rs -/// pub async fn sql_server() -> Result<(), Box> { env_logger::init(); let addr = "0.0.0.0:50051".parse()?; diff --git a/datafusion-examples/examples/parquet_index.rs b/datafusion-examples/examples/parquet_index.rs index 127c55da982c8..a1dd1f1ffd10d 100644 --- a/datafusion-examples/examples/parquet_index.rs +++ b/datafusion-examples/examples/parquet_index.rs @@ -99,7 +99,6 @@ use url::Url; /// Thus some parquet files are │ │ /// "pruned" and thus are not └─────────────┘ /// scanned at all Parquet Files -/// /// ``` /// /// [`ListingTable`]: datafusion::datasource::listing::ListingTable diff --git a/datafusion-examples/examples/sql_query.rs b/datafusion-examples/examples/sql_query.rs index 0ac203cfb7e74..4da07d33d03d4 100644 --- a/datafusion-examples/examples/sql_query.rs +++ b/datafusion-examples/examples/sql_query.rs @@ -32,7 +32,6 @@ use std::sync::Arc; /// /// [`query_memtable`]: a simple query against a [`MemTable`] /// [`query_parquet`]: a simple query against a directory with multiple Parquet files -/// #[tokio::main] async fn main() -> Result<()> { query_memtable().await?; diff --git a/datafusion-examples/examples/thread_pools.rs b/datafusion-examples/examples/thread_pools.rs index bba56b2932abc..9842cccfbfe83 100644 --- a/datafusion-examples/examples/thread_pools.rs +++ b/datafusion-examples/examples/thread_pools.rs @@ -342,7 +342,7 @@ impl CpuRuntime { /// message such as: /// /// ```text - ///A Tokio 1.x context was found, but IO is disabled. + /// A Tokio 1.x context was found, but IO is disabled. /// ``` pub fn handle(&self) -> &Handle { &self.handle diff --git a/datafusion/spark/src/function/bitwise/bit_shift.rs b/datafusion/spark/src/function/bitwise/bit_shift.rs index bb645b7660584..68911b0492c56 100644 --- a/datafusion/spark/src/function/bitwise/bit_shift.rs +++ b/datafusion/spark/src/function/bitwise/bit_shift.rs @@ -42,7 +42,6 @@ use crate::function::error_utils::{ /// /// # Returns /// A new array with the shifted values. -/// fn shift_left( value: &PrimitiveArray, shift: &PrimitiveArray, @@ -71,7 +70,6 @@ where /// /// # Returns /// A new array with the shifted values. -/// fn shift_right( value: &PrimitiveArray, shift: &PrimitiveArray, @@ -132,7 +130,6 @@ impl UShr for i64 { /// /// # Returns /// A new array with the shifted values. 
-/// fn shift_right_unsigned( value: &PrimitiveArray, shift: &PrimitiveArray, diff --git a/datafusion/spark/src/function/url/parse_url.rs b/datafusion/spark/src/function/url/parse_url.rs index d93c260b4f340..a8afa1d9639f5 100644 --- a/datafusion/spark/src/function/url/parse_url.rs +++ b/datafusion/spark/src/function/url/parse_url.rs @@ -80,7 +80,6 @@ impl ParseUrl { /// * `Ok(Some(String))` - The extracted URL component as a string /// * `Ok(None)` - If the requested component doesn't exist or is empty /// * `Err(DataFusionError)` - If the URL is malformed and cannot be parsed - /// fn parse(value: &str, part: &str, key: Option<&str>) -> Result> { let url: std::result::Result = Url::parse(value); if let Err(ParseError::RelativeUrlWithoutBase) = url { @@ -168,7 +167,6 @@ impl ScalarUDFImpl for ParseUrl { /// - A string array with extracted URL components /// - `None` values where extraction failed or component doesn't exist /// - The output array type (StringArray or LargeStringArray) is determined by input types -/// fn spark_parse_url(args: &[ArrayRef]) -> Result { spark_handled_parse_url(args, |x| x) } diff --git a/datafusion/spark/src/lib.rs b/datafusion/spark/src/lib.rs index 4d45f3c482af3..5b1fa06cb2c7c 100644 --- a/datafusion/spark/src/lib.rs +++ b/datafusion/spark/src/lib.rs @@ -88,7 +88,7 @@ //! use datafusion_spark::expr_fn::sha2; //! // Create the expression `sha2(my_data, 256)` //! let expr = sha2(col("my_data"), lit(256)); -//!``` +//! ``` //! //![`Expr`]: datafusion_expr::Expr diff --git a/datafusion/sql/src/parser.rs b/datafusion/sql/src/parser.rs index 1f1ef2a672abc..99d7467e1b7ca 100644 --- a/datafusion/sql/src/parser.rs +++ b/datafusion/sql/src/parser.rs @@ -58,7 +58,7 @@ fn parse_file_type(s: &str) -> Result { /// Syntax: /// ```sql /// EXPLAIN [FORMAT format] statement -///``` +/// ``` #[derive(Debug, Clone, PartialEq, Eq)] pub struct ExplainStatement { /// `EXPLAIN ANALYZE ..` @@ -320,8 +320,7 @@ const DEFAULT_DIALECT: GenericDialect = GenericDialect {}; /// # use datafusion_sql::parser::DFParserBuilder; /// # use datafusion_common::Result; /// # fn test() -> Result<()> { -/// let mut parser = DFParserBuilder::new("SELECT * FROM foo; SELECT 1 + 2") -/// .build()?; +/// let mut parser = DFParserBuilder::new("SELECT * FROM foo; SELECT 1 + 2").build()?; /// // parse the SQL into DFStatements /// let statements = parser.parse_statements()?; /// assert_eq!(statements.len(), 2); @@ -336,13 +335,13 @@ const DEFAULT_DIALECT: GenericDialect = GenericDialect {}; /// # use datafusion_sql::sqlparser::dialect::MySqlDialect; /// # use datafusion_sql::sqlparser::ast::Expr; /// # fn test() -> Result<()> { -/// let dialect = MySqlDialect{}; // Parse using MySQL dialect +/// let dialect = MySqlDialect {}; // Parse using MySQL dialect /// let mut parser = DFParserBuilder::new("1 + 2") -/// .with_dialect(&dialect) -/// .build()?; +/// .with_dialect(&dialect) +/// .build()?; /// // parse 1+2 into an sqlparser::ast::Expr /// let res = parser.parse_expr()?; -/// assert!(matches!(res.expr, Expr::BinaryOp {..})); +/// assert!(matches!(res.expr, Expr::BinaryOp { .. })); /// # Ok(()) /// # } /// ``` diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index 7bac0337672dc..eb1e711eb4fd8 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -251,7 +251,6 @@ impl IdentNormalizer { /// This helps resolve scoping issues of CTEs. 
/// By using cloning, a subquery can inherit CTEs from the outer query /// and can also define its own private CTEs without affecting the outer query. -/// #[derive(Debug, Clone)] pub struct PlannerContext { /// Data types for numbered parameters ($1, $2, etc), if supplied diff --git a/datafusion/sql/src/resolve.rs b/datafusion/sql/src/resolve.rs index 9e909f66fa97a..db5ddd5115194 100644 --- a/datafusion/sql/src/resolve.rs +++ b/datafusion/sql/src/resolve.rs @@ -175,14 +175,14 @@ fn visit_statement(statement: &DFStatement, visitor: &mut RelationVisitor) { /// ## Example with CTEs /// /// ``` -/// # use datafusion_sql::parser::DFParser; +/// # use datafusion_sql::parser::DFParser; /// # use datafusion_sql::resolve::resolve_table_references; -/// let query = "with my_cte as (values (1), (2)) SELECT * from my_cte;"; -/// let statement = DFParser::parse_sql(query).unwrap().pop_back().unwrap(); -/// let (table_refs, ctes) = resolve_table_references(&statement, true).unwrap(); +/// let query = "with my_cte as (values (1), (2)) SELECT * from my_cte;"; +/// let statement = DFParser::parse_sql(query).unwrap().pop_back().unwrap(); +/// let (table_refs, ctes) = resolve_table_references(&statement, true).unwrap(); /// assert_eq!(table_refs.len(), 0); -/// assert_eq!(ctes.len(), 1); -/// assert_eq!(ctes[0].to_string(), "my_cte"); +/// assert_eq!(ctes.len(), 1); +/// assert_eq!(ctes[0].to_string(), "my_cte"); /// ``` pub fn resolve_table_references( statement: &crate::parser::Statement, diff --git a/datafusion/sql/src/unparser/expr.rs b/datafusion/sql/src/unparser/expr.rs index 97f2b58bf8402..8dc3092e9ce0a 100644 --- a/datafusion/sql/src/unparser/expr.rs +++ b/datafusion/sql/src/unparser/expr.rs @@ -70,9 +70,8 @@ use sqlparser::tokenizer::Span; /// use datafusion_expr::{col, lit}; /// use datafusion_sql::unparser::expr_to_sql; /// let expr = col("a").gt(lit(4)); // form an expression `a > 4` -/// let sql = expr_to_sql(&expr).unwrap(); // convert to ast::Expr -/// // use the Display impl to convert to SQL text -/// assert_eq!(sql.to_string(), "(a > 4)") +/// let sql = expr_to_sql(&expr).unwrap(); // convert to ast::Expr, using +/// assert_eq!(sql.to_string(), "(a > 4)"); // use Display impl for SQL text /// ``` /// /// [`SqlToRel::sql_to_expr`]: crate::planner::SqlToRel::sql_to_expr diff --git a/datafusion/sql/src/unparser/plan.rs b/datafusion/sql/src/unparser/plan.rs index e7535338b7677..68b42ba05af5f 100644 --- a/datafusion/sql/src/unparser/plan.rs +++ b/datafusion/sql/src/unparser/plan.rs @@ -81,9 +81,13 @@ use std::{sync::Arc, vec}; /// .unwrap() /// .build() /// .unwrap(); -/// let sql = plan_to_sql(&plan).unwrap(); // convert to AST +/// // convert to AST +/// let sql = plan_to_sql(&plan).unwrap(); /// // use the Display impl to convert to SQL text -/// assert_eq!(sql.to_string(), "SELECT \"table\".id, \"table\".\"value\" FROM \"table\"") +/// assert_eq!( +/// sql.to_string(), +/// "SELECT \"table\".id, \"table\".\"value\" FROM \"table\"" +/// ) /// ``` /// /// [`SqlToRel::sql_statement_to_plan`]: crate::planner::SqlToRel::sql_statement_to_plan diff --git a/datafusion/sql/src/unparser/rewrite.rs b/datafusion/sql/src/unparser/rewrite.rs index c961f1d6f1f0c..1b6c3433f79f4 100644 --- a/datafusion/sql/src/unparser/rewrite.rs +++ b/datafusion/sql/src/unparser/rewrite.rs @@ -119,7 +119,6 @@ fn rewrite_sort_expr_for_union(exprs: Vec) -> Result> { /// Projection: table.column1, table.column2 /// Window: window_function /// TableScan: table -/// pub(super) fn rewrite_qualify(plan: LogicalPlan) -> Result { 
let transformed_plan = plan.transform_up(|plan| match plan { // Check if the filter's input is a Window plan diff --git a/datafusion/sql/src/utils.rs b/datafusion/sql/src/utils.rs index 3c86d2d04905f..042ee53730937 100644 --- a/datafusion/sql/src/utils.rs +++ b/datafusion/sql/src/utils.rs @@ -531,7 +531,6 @@ impl TreeNodeRewriter for RecursiveUnnestRewriter<'_> { /// / / /// column2 /// ``` - /// fn f_up(&mut self, expr: Expr) -> Result> { if let Expr::Unnest(ref traversing_unnest) = expr { if traversing_unnest == self.top_most_unnest.as_ref().unwrap() { diff --git a/datafusion/sql/tests/cases/diagnostic.rs b/datafusion/sql/tests/cases/diagnostic.rs index 8648dffb50046..7ae839851d04f 100644 --- a/datafusion/sql/tests/cases/diagnostic.rs +++ b/datafusion/sql/tests/cases/diagnostic.rs @@ -69,10 +69,12 @@ fn do_query(sql: &'static str) -> Diagnostic { /// ## Example /// /// ```rust -/// let spans = get_spans("SELECT /*whole+left*/speed/*left*/ + /*right*/10/*right+whole*/ FROM cars"); -/// // whole is ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -/// // left is ^^^^^ -/// // right is ^^ +/// let spans = get_spans( +/// "SELECT /*whole+left*/speed/*left*/ + /*right*/10/*right+whole*/ FROM cars", +/// // whole is ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +/// // left is ^^^^^ +/// // right is ^^ +/// ); /// dbg!(&spans["whole"]); /// dbg!(&spans["left"]); /// dbg!(&spans["right"]); diff --git a/datafusion/sqllogictest/src/engines/datafusion_engine/normalize.rs b/datafusion/sqllogictest/src/engines/datafusion_engine/normalize.rs index 87108b67424b2..cb6410d857a87 100644 --- a/datafusion/sqllogictest/src/engines/datafusion_engine/normalize.rs +++ b/datafusion/sqllogictest/src/engines/datafusion_engine/normalize.rs @@ -185,7 +185,6 @@ macro_rules! get_row_value { /// [NULL Values and empty strings]: https://duckdb.org/dev/sqllogictest/result_verification#null-values-and-empty-strings /// /// Floating numbers are rounded to have a consistent representation with the Postgres runner. -/// pub fn cell_to_string(col: &ArrayRef, row: usize, is_spark_path: bool) -> Result { if !col.is_valid(row) { // represent any null value with the string "NULL" diff --git a/datafusion/substrait/src/lib.rs b/datafusion/substrait/src/lib.rs index 9a4f44e81df23..8bc31569f294e 100644 --- a/datafusion/substrait/src/lib.rs +++ b/datafusion/substrait/src/lib.rs @@ -66,19 +66,24 @@ //! # use datafusion::arrow::array::{Int32Array, RecordBatch}; //! # use datafusion_substrait::logical_plan; //! // Create a plan that scans table 't' -//! let ctx = SessionContext::new(); -//! let batch = RecordBatch::try_from_iter(vec![("x", Arc::new(Int32Array::from(vec![42])) as _)])?; -//! ctx.register_batch("t", batch)?; -//! let df = ctx.sql("SELECT x from t").await?; -//! let plan = df.into_optimized_plan()?; +//! let ctx = SessionContext::new(); +//! let batch = RecordBatch::try_from_iter(vec![( +//! "x", +//! Arc::new(Int32Array::from(vec![42])) as _, +//! )])?; +//! ctx.register_batch("t", batch)?; +//! let df = ctx.sql("SELECT x from t").await?; +//! let plan = df.into_optimized_plan()?; //! -//! // Convert the plan into a substrait (protobuf) Plan -//! let substrait_plan = logical_plan::producer::to_substrait_plan(&plan, &ctx.state())?; +//! // Convert the plan into a substrait (protobuf) Plan +//! let substrait_plan = logical_plan::producer::to_substrait_plan(&plan, &ctx.state())?; //! -//! // Receive a substrait protobuf from somewhere, and turn it into a LogicalPlan -//! 
let logical_round_trip = logical_plan::consumer::from_substrait_plan(&ctx.state(), &substrait_plan).await?; -//! let logical_round_trip = ctx.state().optimize(&logical_round_trip)?; -//! assert_eq!(format!("{:?}", plan), format!("{:?}", logical_round_trip)); +//! // Receive a substrait protobuf from somewhere, and turn it into a LogicalPlan +//! let logical_round_trip = +//! logical_plan::consumer::from_substrait_plan(&ctx.state(), &substrait_plan) +//! .await?; +//! let logical_round_trip = ctx.state().optimize(&logical_round_trip)?; +//! assert_eq!(format!("{:?}", plan), format!("{:?}", logical_round_trip)); //! # Ok(()) //! # } //! ``` diff --git a/datafusion/substrait/src/logical_plan/consumer/substrait_consumer.rs b/datafusion/substrait/src/logical_plan/consumer/substrait_consumer.rs index 5392dd77b576b..c734b9eb7a541 100644 --- a/datafusion/substrait/src/logical_plan/consumer/substrait_consumer.rs +++ b/datafusion/substrait/src/logical_plan/consumer/substrait_consumer.rs @@ -150,7 +150,6 @@ use substrait::proto::{ /// } /// } /// ``` -/// pub trait SubstraitConsumer: Send + Sync + Sized { async fn resolve_table_ref( &self, From 92727b52fd0f9d3896be0bb8b09ab670c7d31d4d Mon Sep 17 00:00:00 2001 From: Chen Chongchen Date: Sat, 8 Nov 2025 08:17:33 +0800 Subject: [PATCH 0021/1589] fix: shuffle seed (#18518) ## Which issue does this PR close? - Closes #18476. ## Rationale for this change shuffle test sometimes fails ## What changes are included in this PR? add seed to shuffle, make sure slt won't fail. ## Are these changes tested? UT ## Are there any user-facing changes? No --- .../spark/src/function/array/shuffle.rs | 102 +++++++++++++++--- .../test_files/spark/array/shuffle.slt | 46 +++----- 2 files changed, 103 insertions(+), 45 deletions(-) diff --git a/datafusion/spark/src/function/array/shuffle.rs b/datafusion/spark/src/function/array/shuffle.rs index abeafd3a93660..9f345b53b89a7 100644 --- a/datafusion/spark/src/function/array/shuffle.rs +++ b/datafusion/spark/src/function/array/shuffle.rs @@ -15,21 +15,25 @@ // specific language governing permissions and limitations // under the License. 
-use crate::function::functions_nested_utils::make_scalar_function; use arrow::array::{ Array, ArrayRef, Capacities, FixedSizeListArray, GenericListArray, MutableArrayData, OffsetSizeTrait, }; use arrow::buffer::OffsetBuffer; +use arrow::datatypes::DataType; use arrow::datatypes::DataType::{FixedSizeList, LargeList, List, Null}; -use arrow::datatypes::{DataType, FieldRef}; +use arrow::datatypes::FieldRef; use datafusion_common::cast::{ as_fixed_size_list_array, as_large_list_array, as_list_array, }; -use datafusion_common::{exec_err, utils::take_function_args, Result}; -use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; +use datafusion_common::{exec_err, utils::take_function_args, Result, ScalarValue}; +use datafusion_expr::{ + ArrayFunctionArgument, ArrayFunctionSignature, ColumnarValue, ScalarUDFImpl, + Signature, TypeSignature, Volatility, +}; use rand::rng; -use rand::seq::SliceRandom; +use rand::rngs::StdRng; +use rand::{seq::SliceRandom, Rng, SeedableRng}; use std::any::Any; use std::sync::Arc; @@ -47,7 +51,25 @@ impl Default for SparkShuffle { impl SparkShuffle { pub fn new() -> Self { Self { - signature: Signature::arrays(1, None, Volatility::Volatile), + signature: Signature { + type_signature: TypeSignature::OneOf(vec![ + // Only array argument + TypeSignature::ArraySignature(ArrayFunctionSignature::Array { + arguments: vec![ArrayFunctionArgument::Array], + array_coercion: None, + }), + // Array + Index (seed) argument + TypeSignature::ArraySignature(ArrayFunctionSignature::Array { + arguments: vec![ + ArrayFunctionArgument::Array, + ArrayFunctionArgument::Index, + ], + array_coercion: None, + }), + ]), + volatility: Volatility::Volatile, + parameter_names: None, + }, } } } @@ -73,25 +95,63 @@ impl ScalarUDFImpl for SparkShuffle { &self, args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - make_scalar_function(array_shuffle_inner)(&args.args) + if args.args.is_empty() { + return exec_err!("shuffle expects at least 1 argument"); + } + if args.args.len() > 2 { + return exec_err!("shuffle expects at most 2 arguments"); + } + + // Extract seed from second argument if present + let seed = if args.args.len() == 2 { + extract_seed(&args.args[1])? 
+ } else { + None + }; + + // Convert arguments to arrays + let arrays = ColumnarValue::values_to_arrays(&args.args[..1])?; + array_shuffle_with_seed(&arrays, seed).map(ColumnarValue::Array) + } +} + +/// Extract seed value from ColumnarValue +fn extract_seed(seed_arg: &ColumnarValue) -> Result> { + match seed_arg { + ColumnarValue::Scalar(scalar) => { + let seed = match scalar { + ScalarValue::Int64(Some(v)) => Some(*v as u64), + ScalarValue::Null => None, + _ => { + return exec_err!( + "shuffle seed must be Int64 type, got '{}'", + scalar.data_type() + ); + } + }; + Ok(seed) + } + ColumnarValue::Array(_) => { + exec_err!("shuffle seed must be a scalar value, not an array") + } } } -/// array_shuffle SQL function -pub fn array_shuffle_inner(arg: &[ArrayRef]) -> Result { +/// array_shuffle SQL function with optional seed +fn array_shuffle_with_seed(arg: &[ArrayRef], seed: Option) -> Result { let [input_array] = take_function_args("shuffle", arg)?; match &input_array.data_type() { List(field) => { let array = as_list_array(input_array)?; - general_array_shuffle::(array, field) + general_array_shuffle::(array, field, seed) } LargeList(field) => { let array = as_large_list_array(input_array)?; - general_array_shuffle::(array, field) + general_array_shuffle::(array, field, seed) } FixedSizeList(field, _) => { let array = as_fixed_size_list_array(input_array)?; - fixed_size_array_shuffle(array, field) + fixed_size_array_shuffle(array, field, seed) } Null => Ok(Arc::clone(input_array)), array_type => exec_err!("shuffle does not support type '{array_type}'."), @@ -101,6 +161,7 @@ pub fn array_shuffle_inner(arg: &[ArrayRef]) -> Result { fn general_array_shuffle( array: &GenericListArray, field: &FieldRef, + seed: Option, ) -> Result { let values = array.values(); let original_data = values.to_data(); @@ -109,7 +170,13 @@ fn general_array_shuffle( let mut nulls = vec![]; let mut mutable = MutableArrayData::with_capacities(vec![&original_data], false, capacity); - let mut rng = rng(); + let mut rng = if let Some(s) = seed { + StdRng::seed_from_u64(s) + } else { + // Use a random seed from the thread-local RNG + let seed = rng().random::(); + StdRng::seed_from_u64(seed) + }; for (row_index, offset_window) in array.offsets().windows(2).enumerate() { // skip the null value @@ -149,6 +216,7 @@ fn general_array_shuffle( fn fixed_size_array_shuffle( array: &FixedSizeListArray, field: &FieldRef, + seed: Option, ) -> Result { let values = array.values(); let original_data = values.to_data(); @@ -157,7 +225,13 @@ fn fixed_size_array_shuffle( let mut mutable = MutableArrayData::with_capacities(vec![&original_data], false, capacity); let value_length = array.value_length() as usize; - let mut rng = rng(); + let mut rng = if let Some(s) = seed { + StdRng::seed_from_u64(s) + } else { + // Use a random seed from the thread-local RNG + let seed = rng().random::(); + StdRng::seed_from_u64(seed) + }; for row_index in 0..array.len() { // skip the null value diff --git a/datafusion/sqllogictest/test_files/spark/array/shuffle.slt b/datafusion/sqllogictest/test_files/spark/array/shuffle.slt index 7614caef666bb..35aad58144c91 100644 --- a/datafusion/sqllogictest/test_files/spark/array/shuffle.slt +++ b/datafusion/sqllogictest/test_files/spark/array/shuffle.slt @@ -16,27 +16,16 @@ # under the License. # Test shuffle function with simple arrays -query B -SELECT array_sort(shuffle([1, 2, 3, 4, 5, NULL])) = [NULL,1, 2, 3, 4, 5]; ----- -true - -query B -SELECT shuffle([1, 2, 3, 4, 5, NULL]) != [1, 2, 3, 4, 5, NULL]; +query ? 
+SELECT shuffle([1, 2, 3, 4, 5, NULL], 1); ---- -true +[1, 4, NULL, 2, 5, 3] # Test shuffle function with string arrays - -query B -SELECT array_sort(shuffle(['a', 'b', 'c', 'd', 'e', 'f'])) = ['a', 'b', 'c', 'd', 'e', 'f']; ----- -true - -query B -SELECT shuffle(['a', 'b', 'c', 'd', 'e', 'f']) != ['a', 'b', 'c', 'd', 'e', 'f'];; +query ? +SELECT shuffle(['a', 'b', 'c', 'd', 'e', 'f'], 1); ---- -true +[a, d, f, b, e, c] # Test shuffle function with empty array query ? @@ -57,15 +46,10 @@ SELECT shuffle(NULL); NULL # Test shuffle function with fixed size list arrays -query B -SELECT array_sort(shuffle(arrow_cast([1, 2, NULL, 3, 4, 5], 'FixedSizeList(6, Int64)'))) = [NULL, 1, 2, 3, 4, 5]; ----- -true - -query B -SELECT shuffle(arrow_cast([1, 2, NULL, 3, 4, 5], 'FixedSizeList(6, Int64)')) != [1, 2, NULL, 3, 4, 5]; +query ? +SELECT shuffle(arrow_cast([1, 2, NULL, 3, 4, 5], 'FixedSizeList(6, Int64)'), 1); ---- -true +[1, 3, 5, 2, 4, NULL] # Test shuffle on table data with different list types statement ok @@ -78,10 +62,10 @@ CREATE TABLE test_shuffle_list_types AS VALUES # Test shuffle with large list from table query ? -SELECT array_sort(shuffle(column1)) FROM test_shuffle_list_types; +SELECT shuffle(column1, 1) FROM test_shuffle_list_types; ---- -[1, 2, 3, 4] -[5, 6, 7, 8, 9] +[1, 4, 3, 2] +[8, 9, 6, 5, 7] [10] NULL [] @@ -96,11 +80,11 @@ CREATE TABLE test_shuffle_fixed_size AS VALUES # Test shuffle with fixed size list from table query ? -SELECT array_sort(shuffle(column1)) FROM test_shuffle_fixed_size; +SELECT shuffle(column1, 1) FROM test_shuffle_fixed_size; ---- [1, 2, 3] -[4, 5, 6] -[NULL, 8, 9] +[4, 6, 5] +[9, NULL, 8] NULL # Clean up From 8259b354c74c3222a051f174ea4b6722c70f0639 Mon Sep 17 00:00:00 2001 From: Jeffrey Vo Date: Sat, 8 Nov 2025 11:19:03 +1100 Subject: [PATCH 0022/1589] refactor: simplify `calculate_binary_math` in datafusion-functions (#18525) Ensure the `right` value gets casted if it is a scalar (currently we only cast if it is an array). Remove unused arm for casting left value. --- datafusion/functions/src/utils.rs | 43 +++++++++++-------------------- 1 file changed, 15 insertions(+), 28 deletions(-) diff --git a/datafusion/functions/src/utils.rs b/datafusion/functions/src/utils.rs index 932d61e8007cd..ffa238162b1ba 100644 --- a/datafusion/functions/src/utils.rs +++ b/datafusion/functions/src/utils.rs @@ -140,38 +140,25 @@ where F: Fn(L::Native, R::Native) -> Result, R::Native: TryFrom, { - Ok(match right { + let left = left.as_primitive::(); + let right = right.cast_to(&R::DATA_TYPE, None)?; + let result = match right { ColumnarValue::Scalar(scalar) => { - let right_value: R::Native = - R::Native::try_from(scalar.clone()).map_err(|_| { - DataFusionError::NotImplemented(format!( - "Cannot convert scalar value {} to {}", - &scalar, - R::DATA_TYPE - )) - })?; - let left_array = left.as_primitive::(); - // Bind right value - let result = - left_array.try_unary::<_, O, _>(|lvalue| fun(lvalue, right_value))?; - Arc::new(result) as _ + let right = R::Native::try_from(scalar.clone()).map_err(|_| { + DataFusionError::NotImplemented(format!( + "Cannot convert scalar value {} to {}", + &scalar, + R::DATA_TYPE + )) + })?; + left.try_unary::<_, O, _>(|lvalue| fun(lvalue, right))? 
} ColumnarValue::Array(right) => { - let right_casted = arrow::compute::cast(&right, &R::DATA_TYPE)?; - let right_array = right_casted.as_primitive::(); - - // Types are compatible even they are decimals with different scale or precision - let result = if PrimitiveArray::::is_compatible(&L::DATA_TYPE) { - let left_array = left.as_primitive::(); - try_binary::<_, _, _, O>(left_array, right_array, &fun)? - } else { - let left_casted = arrow::compute::cast(left, &L::DATA_TYPE)?; - let left_array = left_casted.as_primitive::(); - try_binary::<_, _, _, O>(left_array, right_array, &fun)? - }; - Arc::new(result) as _ + let right = right.as_primitive::(); + try_binary::<_, _, _, O>(left, right, &fun)? } - }) + }; + Ok(Arc::new(result) as _) } /// Converts Decimal128 components (value and scale) to an unscaled i128 From 3b4bcca7d14f16495173636282b845a775fe51b7 Mon Sep 17 00:00:00 2001 From: jizezhang Date: Fri, 7 Nov 2025 18:18:31 -0800 Subject: [PATCH 0023/1589] ci: enforce needless_pass_by_value for datafusion-optimzer (#18533) ## Which issue does this PR close? - Closes https://github.com/apache/datafusion/issues/18505. ## Rationale for this change Enforce clippy `needless_pass_by_value`. ## What changes are included in this PR? ## Are these changes tested? Yes ## Are there any user-facing changes? No --- .../src/analyzer/resolve_grouping_function.rs | 6 ++-- .../src/decorrelate_predicate_subquery.rs | 32 +++++++++---------- datafusion/optimizer/src/lib.rs | 3 ++ .../simplify_expressions/expr_simplifier.rs | 4 +-- 4 files changed, 24 insertions(+), 21 deletions(-) diff --git a/datafusion/optimizer/src/analyzer/resolve_grouping_function.rs b/datafusion/optimizer/src/analyzer/resolve_grouping_function.rs index fa7ff1b8b19d6..6381db63122dd 100644 --- a/datafusion/optimizer/src/analyzer/resolve_grouping_function.rs +++ b/datafusion/optimizer/src/analyzer/resolve_grouping_function.rs @@ -28,7 +28,7 @@ use arrow::datatypes::DataType; use datafusion_common::config::ConfigOptions; use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; use datafusion_common::{ - internal_datafusion_err, plan_err, Column, DFSchemaRef, Result, ScalarValue, + internal_datafusion_err, plan_err, Column, DFSchema, Result, ScalarValue, }; use datafusion_expr::expr::{AggregateFunction, Alias}; use datafusion_expr::logical_plan::LogicalPlan; @@ -74,7 +74,7 @@ fn group_expr_to_bitmap_index(group_expr: &[Expr]) -> Result, - schema: DFSchemaRef, + schema: &DFSchema, group_expr: Vec, aggr_expr: Vec, ) -> Result { @@ -139,7 +139,7 @@ fn analyze_internal(plan: LogicalPlan) -> Result> { schema, .. }) if contains_grouping_function(&aggr_expr) => Ok(Transformed::yes( - replace_grouping_exprs(input, schema, group_expr, aggr_expr)?, + replace_grouping_exprs(input, schema.as_ref(), group_expr, aggr_expr)?, )), _ => Ok(Transformed::no(plan)), })?; diff --git a/datafusion/optimizer/src/decorrelate_predicate_subquery.rs b/datafusion/optimizer/src/decorrelate_predicate_subquery.rs index ccf90f91e68f9..9e4e44b00770a 100644 --- a/datafusion/optimizer/src/decorrelate_predicate_subquery.rs +++ b/datafusion/optimizer/src/decorrelate_predicate_subquery.rs @@ -136,7 +136,7 @@ fn rewrite_inner_subqueries( Expr::Exists(Exists { subquery: Subquery { subquery, .. }, negated, - }) => match mark_join(&cur_input, Arc::clone(&subquery), None, negated, alias)? { + }) => match mark_join(&cur_input, &subquery, None, negated, alias)? 
{ Some((plan, exists_expr)) => { cur_input = plan; Ok(Transformed::yes(exists_expr)) @@ -154,13 +154,7 @@ fn rewrite_inner_subqueries( .map_or(plan_err!("single expression required."), |output_expr| { Ok(Expr::eq(*expr.clone(), output_expr)) })?; - match mark_join( - &cur_input, - Arc::clone(&subquery), - Some(in_predicate), - negated, - alias, - )? { + match mark_join(&cur_input, &subquery, Some(&in_predicate), negated, alias)? { Some((plan, exists_expr)) => { cur_input = plan; Ok(Transformed::yes(exists_expr)) @@ -275,7 +269,13 @@ fn build_join_top( }; let subquery = query_info.query.subquery.as_ref(); let subquery_alias = alias.next("__correlated_sq"); - build_join(left, subquery, in_predicate_opt, join_type, subquery_alias) + build_join( + left, + subquery, + in_predicate_opt.as_ref(), + join_type, + subquery_alias, + ) } /// This is used to handle the case when the subquery is embedded in a more complex boolean @@ -295,8 +295,8 @@ fn build_join_top( /// TableScan: t2 fn mark_join( left: &LogicalPlan, - subquery: Arc, - in_predicate_opt: Option, + subquery: &LogicalPlan, + in_predicate_opt: Option<&Expr>, negated: bool, alias_generator: &Arc, ) -> Result> { @@ -306,7 +306,7 @@ fn mark_join( let exists_expr = if negated { !exists_col } else { exists_col }; Ok( - build_join(left, &subquery, in_predicate_opt, JoinType::LeftMark, alias)? + build_join(left, subquery, in_predicate_opt, JoinType::LeftMark, alias)? .map(|plan| (plan, exists_expr)), ) } @@ -314,12 +314,12 @@ fn mark_join( fn build_join( left: &LogicalPlan, subquery: &LogicalPlan, - in_predicate_opt: Option, + in_predicate_opt: Option<&Expr>, join_type: JoinType, alias: String, ) -> Result> { let mut pull_up = PullUpCorrelatedExpr::new() - .with_in_predicate_opt(in_predicate_opt.clone()) + .with_in_predicate_opt(in_predicate_opt.cloned()) .with_exists_sub_query(in_predicate_opt.is_none()); let new_plan = subquery.clone().rewrite(&mut pull_up).data()?; @@ -342,7 +342,7 @@ fn build_join( replace_qualified_name(filter, &all_correlated_cols, &alias).map(Some) })?; - let join_filter = match (join_filter_opt, in_predicate_opt.clone()) { + let join_filter = match (join_filter_opt, in_predicate_opt.cloned()) { ( Some(join_filter), Some(Expr::BinaryExpr(BinaryExpr { @@ -378,7 +378,7 @@ fn build_join( // Gather all columns needed for the join filter + predicates let mut needed = std::collections::HashSet::new(); expr_to_columns(&join_filter, &mut needed)?; - if let Some(ref in_pred) = in_predicate_opt { + if let Some(in_pred) = in_predicate_opt { expr_to_columns(in_pred, &mut needed)?; } diff --git a/datafusion/optimizer/src/lib.rs b/datafusion/optimizer/src/lib.rs index 85fa9493f449d..07ef2a46cba99 100644 --- a/datafusion/optimizer/src/lib.rs +++ b/datafusion/optimizer/src/lib.rs @@ -23,6 +23,9 @@ // Make sure fast / cheap clones on Arc are explicit: // https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] +// https://github.com/apache/datafusion/issues/18503 +#![deny(clippy::needless_pass_by_value)] +#![cfg_attr(test, allow(clippy::needless_pass_by_value))] //! # DataFusion Optimizer //! 
diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 05b8c28fadd6c..c7912bbf70b05 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -1659,7 +1659,7 @@ impl TreeNodeRewriter for Simplifier<'_, S> { .to_string(); Transformed::yes(Expr::Like(Like { pattern: Box::new(to_string_scalar( - data_type, + &data_type, Some(simplified_pattern), )), ..like @@ -1971,7 +1971,7 @@ fn as_string_scalar(expr: &Expr) -> Option<(DataType, &Option)> { } } -fn to_string_scalar(data_type: DataType, value: Option) -> Expr { +fn to_string_scalar(data_type: &DataType, value: Option) -> Expr { match data_type { DataType::Utf8 => Expr::Literal(ScalarValue::Utf8(value), None), DataType::LargeUtf8 => Expr::Literal(ScalarValue::LargeUtf8(value), None), From fe24690997597bb8402acd456b45a9075683e178 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 7 Nov 2025 21:37:30 -0500 Subject: [PATCH 0024/1589] Add comments to Cargo.toml about workspace overrides (#18526) ## Which issue does this PR close? - Follow on to https://github.com/apache/datafusion/pull/18468 ## Rationale for this change We missed the fact that you couldn't yet add new linter rules to subcrates via Cargo.toml overrides. Thankfully @Jefffrey sorted is out. Let's try and avoid that again by leaving a comment ## What changes are included in this PR? Add comments to help our future selves remember to add new lints to lib.rs rather than Cargo.toml for subcrates ## Are these changes tested? ## Are there any user-facing changes? --- benchmarks/Cargo.toml | 3 +++ datafusion-examples/Cargo.toml | 3 +++ datafusion/catalog-listing/Cargo.toml | 3 +++ datafusion/catalog/Cargo.toml | 3 +++ datafusion/common-runtime/Cargo.toml | 3 +++ datafusion/common/Cargo.toml | 3 +++ datafusion/core/Cargo.toml | 3 +++ datafusion/datasource-arrow/Cargo.toml | 3 +++ datafusion/datasource-avro/Cargo.toml | 3 +++ datafusion/datasource-csv/Cargo.toml | 3 +++ datafusion/datasource-json/Cargo.toml | 3 +++ datafusion/datasource-parquet/Cargo.toml | 3 +++ datafusion/datasource/Cargo.toml | 3 +++ datafusion/doc/Cargo.toml | 3 +++ datafusion/execution/Cargo.toml | 3 +++ datafusion/expr-common/Cargo.toml | 3 +++ datafusion/expr/Cargo.toml | 3 +++ datafusion/ffi/Cargo.toml | 3 +++ datafusion/functions-aggregate-common/Cargo.toml | 3 +++ datafusion/functions-aggregate/Cargo.toml | 3 +++ datafusion/functions-nested/Cargo.toml | 3 +++ datafusion/functions-table/Cargo.toml | 3 +++ datafusion/functions-window-common/Cargo.toml | 3 +++ datafusion/functions-window/Cargo.toml | 3 +++ datafusion/functions/Cargo.toml | 3 +++ datafusion/macros/Cargo.toml | 3 +++ datafusion/optimizer/Cargo.toml | 3 +++ datafusion/physical-expr-common/Cargo.toml | 3 +++ datafusion/physical-expr/Cargo.toml | 3 +++ datafusion/physical-optimizer/Cargo.toml | 3 +++ datafusion/physical-plan/Cargo.toml | 3 +++ datafusion/proto-common/gen/Cargo.toml | 3 +++ datafusion/proto/gen/Cargo.toml | 3 +++ datafusion/pruning/Cargo.toml | 3 +++ datafusion/session/Cargo.toml | 3 +++ datafusion/spark/Cargo.toml | 3 +++ datafusion/sql/Cargo.toml | 3 +++ datafusion/sqllogictest/Cargo.toml | 3 +++ datafusion/substrait/Cargo.toml | 3 +++ datafusion/wasmtest/Cargo.toml | 3 +++ test-utils/Cargo.toml | 3 +++ 41 files changed, 123 insertions(+) diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index b3fd520814dbc..870c826f55810 100644 --- 
a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -26,6 +26,9 @@ repository = { workspace = true } license = { workspace = true } rust-version = { workspace = true } +# Note: add additional linter rules in lib.rs. +# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml index 0ec410ecc6b29..38f1f8b0e0cad 100644 --- a/datafusion-examples/Cargo.toml +++ b/datafusion-examples/Cargo.toml @@ -29,6 +29,9 @@ license = { workspace = true } authors = { workspace = true } rust-version = { workspace = true } +# Note: add additional linter rules in lib.rs. +# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/datafusion/catalog-listing/Cargo.toml b/datafusion/catalog-listing/Cargo.toml index 4eaeed675a206..4b802c0067e59 100644 --- a/datafusion/catalog-listing/Cargo.toml +++ b/datafusion/catalog-listing/Cargo.toml @@ -51,6 +51,9 @@ tokio = { workspace = true } [dev-dependencies] datafusion-datasource-parquet = { workspace = true } +# Note: add additional linter rules in lib.rs. +# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/datafusion/catalog/Cargo.toml b/datafusion/catalog/Cargo.toml index a1db45654be01..1009e9aee477b 100644 --- a/datafusion/catalog/Cargo.toml +++ b/datafusion/catalog/Cargo.toml @@ -49,5 +49,8 @@ object_store = { workspace = true } parking_lot = { workspace = true } tokio = { workspace = true } +# Note: add additional linter rules in lib.rs. +# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/datafusion/common-runtime/Cargo.toml b/datafusion/common-runtime/Cargo.toml index e53d97b41360a..fd9a818bcb1d0 100644 --- a/datafusion/common-runtime/Cargo.toml +++ b/datafusion/common-runtime/Cargo.toml @@ -31,6 +31,9 @@ rust-version = { workspace = true } [package.metadata.docs.rs] all-features = true +# Note: add additional linter rules in lib.rs. +# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index a9eb0f2220c69..b222ae12b92f5 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -31,6 +31,9 @@ rust-version = { workspace = true } [package.metadata.docs.rs] all-features = true +# Note: add additional linter rules in lib.rs. +# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index f672e3a946816..67a73ac6f6693 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -32,6 +32,9 @@ rust-version = { workspace = true } [package.metadata.docs.rs] all-features = true +# Note: add additional linter rules in lib.rs. 
+# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/datafusion/datasource-arrow/Cargo.toml b/datafusion/datasource-arrow/Cargo.toml index b3d1e3f2accc9..fbadc8708ca69 100644 --- a/datafusion/datasource-arrow/Cargo.toml +++ b/datafusion/datasource-arrow/Cargo.toml @@ -51,6 +51,9 @@ tokio = { workspace = true } [dev-dependencies] chrono = { workspace = true } +# Note: add additional linter rules in lib.rs. +# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/datafusion/datasource-avro/Cargo.toml b/datafusion/datasource-avro/Cargo.toml index 6bab899e7f976..c9299aeb101da 100644 --- a/datafusion/datasource-avro/Cargo.toml +++ b/datafusion/datasource-avro/Cargo.toml @@ -47,6 +47,9 @@ object_store = { workspace = true } [dev-dependencies] serde_json = { workspace = true } +# Note: add additional linter rules in lib.rs. +# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/datafusion/datasource-csv/Cargo.toml b/datafusion/datasource-csv/Cargo.toml index 209cea403896b..c9e138759ef4a 100644 --- a/datafusion/datasource-csv/Cargo.toml +++ b/datafusion/datasource-csv/Cargo.toml @@ -47,6 +47,9 @@ object_store = { workspace = true } regex = { workspace = true } tokio = { workspace = true } +# Note: add additional linter rules in lib.rs. +# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/datafusion/datasource-json/Cargo.toml b/datafusion/datasource-json/Cargo.toml index 987ab60c70b7c..37fa8d43a0816 100644 --- a/datafusion/datasource-json/Cargo.toml +++ b/datafusion/datasource-json/Cargo.toml @@ -46,6 +46,9 @@ futures = { workspace = true } object_store = { workspace = true } tokio = { workspace = true } +# Note: add additional linter rules in lib.rs. +# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/datafusion/datasource-parquet/Cargo.toml b/datafusion/datasource-parquet/Cargo.toml index 1f866ffd6cc2f..a5f6f56ac6f33 100644 --- a/datafusion/datasource-parquet/Cargo.toml +++ b/datafusion/datasource-parquet/Cargo.toml @@ -57,6 +57,9 @@ tokio = { workspace = true } [dev-dependencies] chrono = { workspace = true } +# Note: add additional linter rules in lib.rs. +# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/datafusion/datasource/Cargo.toml b/datafusion/datasource/Cargo.toml index 8e0738448a75e..19b247829dbd2 100644 --- a/datafusion/datasource/Cargo.toml +++ b/datafusion/datasource/Cargo.toml @@ -74,6 +74,9 @@ zstd = { version = "0.13", optional = true, default-features = false } criterion = { workspace = true } tempfile = { workspace = true } +# Note: add additional linter rules in lib.rs. 
+# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/datafusion/doc/Cargo.toml b/datafusion/doc/Cargo.toml index b8324565a0c67..c1368c1531533 100644 --- a/datafusion/doc/Cargo.toml +++ b/datafusion/doc/Cargo.toml @@ -31,6 +31,9 @@ rust-version = { workspace = true } [package.metadata.docs.rs] all-features = true +# Note: add additional linter rules in lib.rs. +# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/datafusion/execution/Cargo.toml b/datafusion/execution/Cargo.toml index 67a37a86c7066..f9f7a1bc63cc5 100644 --- a/datafusion/execution/Cargo.toml +++ b/datafusion/execution/Cargo.toml @@ -31,6 +31,9 @@ rust-version = { workspace = true } [package.metadata.docs.rs] all-features = true +# Note: add additional linter rules in lib.rs. +# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/datafusion/expr-common/Cargo.toml b/datafusion/expr-common/Cargo.toml index db85f32079214..0c4fa2c211cf1 100644 --- a/datafusion/expr-common/Cargo.toml +++ b/datafusion/expr-common/Cargo.toml @@ -31,6 +31,9 @@ rust-version = { workspace = true } [package.metadata.docs.rs] all-features = true +# Note: add additional linter rules in lib.rs. +# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/datafusion/expr/Cargo.toml b/datafusion/expr/Cargo.toml index e6b2734cfff34..11d6ca1533db3 100644 --- a/datafusion/expr/Cargo.toml +++ b/datafusion/expr/Cargo.toml @@ -31,6 +31,9 @@ rust-version = { workspace = true } [package.metadata.docs.rs] all-features = true +# Note: add additional linter rules in lib.rs. +# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/datafusion/ffi/Cargo.toml b/datafusion/ffi/Cargo.toml index babfe28ad5576..3ac08180fb68c 100644 --- a/datafusion/ffi/Cargo.toml +++ b/datafusion/ffi/Cargo.toml @@ -30,6 +30,9 @@ rust-version = { workspace = true } [package.metadata.docs.rs] all-features = true +# Note: add additional linter rules in lib.rs. +# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/datafusion/functions-aggregate-common/Cargo.toml b/datafusion/functions-aggregate-common/Cargo.toml index a6e0a1fc2f8bb..1d4fb29d9c674 100644 --- a/datafusion/functions-aggregate-common/Cargo.toml +++ b/datafusion/functions-aggregate-common/Cargo.toml @@ -31,6 +31,9 @@ rust-version = { workspace = true } [package.metadata.docs.rs] all-features = true +# Note: add additional linter rules in lib.rs. +# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/datafusion/functions-aggregate/Cargo.toml b/datafusion/functions-aggregate/Cargo.toml index ffc6f3bb7a10a..428855a61698c 100644 --- a/datafusion/functions-aggregate/Cargo.toml +++ b/datafusion/functions-aggregate/Cargo.toml @@ -31,6 +31,9 @@ rust-version = { workspace = true } [package.metadata.docs.rs] all-features = true +# Note: add additional linter rules in lib.rs. 
+# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/datafusion/functions-nested/Cargo.toml b/datafusion/functions-nested/Cargo.toml index 6e0d1048f9697..8e4801ba2729d 100644 --- a/datafusion/functions-nested/Cargo.toml +++ b/datafusion/functions-nested/Cargo.toml @@ -31,6 +31,9 @@ rust-version = { workspace = true } [package.metadata.docs.rs] all-features = true +# Note: add additional linter rules in lib.rs. +# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/datafusion/functions-table/Cargo.toml b/datafusion/functions-table/Cargo.toml index 78d59257dd480..a5f50c072d1c5 100644 --- a/datafusion/functions-table/Cargo.toml +++ b/datafusion/functions-table/Cargo.toml @@ -31,6 +31,9 @@ rust-version = { workspace = true } [package.metadata.docs.rs] all-features = true +# Note: add additional linter rules in lib.rs. +# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/datafusion/functions-window-common/Cargo.toml b/datafusion/functions-window-common/Cargo.toml index 466e7bc68b486..6af668c1459e8 100644 --- a/datafusion/functions-window-common/Cargo.toml +++ b/datafusion/functions-window-common/Cargo.toml @@ -31,6 +31,9 @@ version = { workspace = true } [package.metadata.docs.rs] all-features = true +# Note: add additional linter rules in lib.rs. +# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/datafusion/functions-window/Cargo.toml b/datafusion/functions-window/Cargo.toml index 23ee608a82675..7036bbec9d2cb 100644 --- a/datafusion/functions-window/Cargo.toml +++ b/datafusion/functions-window/Cargo.toml @@ -31,6 +31,9 @@ rust-version = { workspace = true } [package.metadata.docs.rs] all-features = true +# Note: add additional linter rules in lib.rs. +# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index 1dbeee7159fd5..ad52a551a7c17 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -31,6 +31,9 @@ rust-version = { workspace = true } [package.metadata.docs.rs] all-features = true +# Note: add additional linter rules in lib.rs. +# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/datafusion/macros/Cargo.toml b/datafusion/macros/Cargo.toml index 64781ddeaf421..35714bfe960ba 100644 --- a/datafusion/macros/Cargo.toml +++ b/datafusion/macros/Cargo.toml @@ -31,6 +31,9 @@ rust-version = { workspace = true } [package.metadata.docs.rs] all-features = true +# Note: add additional linter rules in lib.rs. 
+# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/datafusion/optimizer/Cargo.toml b/datafusion/optimizer/Cargo.toml index f10510e0973c3..15d3261ca5132 100644 --- a/datafusion/optimizer/Cargo.toml +++ b/datafusion/optimizer/Cargo.toml @@ -31,6 +31,9 @@ rust-version = { workspace = true } [package.metadata.docs.rs] all-features = true +# Note: add additional linter rules in lib.rs. +# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/datafusion/physical-expr-common/Cargo.toml b/datafusion/physical-expr-common/Cargo.toml index 58dc767dbad2a..4602e59c422c3 100644 --- a/datafusion/physical-expr-common/Cargo.toml +++ b/datafusion/physical-expr-common/Cargo.toml @@ -31,6 +31,9 @@ rust-version = { workspace = true } [package.metadata.docs.rs] all-features = true +# Note: add additional linter rules in lib.rs. +# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml index b7654a0f6f603..953a46929c394 100644 --- a/datafusion/physical-expr/Cargo.toml +++ b/datafusion/physical-expr/Cargo.toml @@ -31,6 +31,9 @@ rust-version = { workspace = true } [package.metadata.docs.rs] all-features = true +# Note: add additional linter rules in lib.rs. +# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/datafusion/physical-optimizer/Cargo.toml b/datafusion/physical-optimizer/Cargo.toml index 4df011fc0a05e..395da10d629ba 100644 --- a/datafusion/physical-optimizer/Cargo.toml +++ b/datafusion/physical-optimizer/Cargo.toml @@ -31,6 +31,9 @@ rust-version = { workspace = true } [package.metadata.docs.rs] all-features = true +# Note: add additional linter rules in lib.rs. +# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/datafusion/physical-plan/Cargo.toml b/datafusion/physical-plan/Cargo.toml index 607224782fc46..5858deb83c83c 100644 --- a/datafusion/physical-plan/Cargo.toml +++ b/datafusion/physical-plan/Cargo.toml @@ -31,6 +31,9 @@ rust-version = { workspace = true } [package.metadata.docs.rs] all-features = true +# Note: add additional linter rules in lib.rs. +# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/datafusion/proto-common/gen/Cargo.toml b/datafusion/proto-common/gen/Cargo.toml index ef56d2697d818..2d2557811d0df 100644 --- a/datafusion/proto-common/gen/Cargo.toml +++ b/datafusion/proto-common/gen/Cargo.toml @@ -29,6 +29,9 @@ publish = false # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +# Note: add additional linter rules in lib.rs. 
+# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/datafusion/proto/gen/Cargo.toml b/datafusion/proto/gen/Cargo.toml index c2096b6011123..d446ab0d89741 100644 --- a/datafusion/proto/gen/Cargo.toml +++ b/datafusion/proto/gen/Cargo.toml @@ -29,6 +29,9 @@ publish = false # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +# Note: add additional linter rules in lib.rs. +# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/datafusion/pruning/Cargo.toml b/datafusion/pruning/Cargo.toml index 2429123bdf966..bd898cba202ba 100644 --- a/datafusion/pruning/Cargo.toml +++ b/datafusion/pruning/Cargo.toml @@ -9,6 +9,9 @@ repository = { workspace = true } license = { workspace = true } authors = { workspace = true } +# Note: add additional linter rules in lib.rs. +# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/datafusion/session/Cargo.toml b/datafusion/session/Cargo.toml index 0489da61eed86..230e26d1fc9fc 100644 --- a/datafusion/session/Cargo.toml +++ b/datafusion/session/Cargo.toml @@ -38,5 +38,8 @@ datafusion-expr = { workspace = true } datafusion-physical-plan = { workspace = true } parking_lot = { workspace = true } +# Note: add additional linter rules in lib.rs. +# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/datafusion/spark/Cargo.toml b/datafusion/spark/Cargo.toml index 7f6210fb32bf6..279c88b525d3c 100644 --- a/datafusion/spark/Cargo.toml +++ b/datafusion/spark/Cargo.toml @@ -29,6 +29,9 @@ edition = { workspace = true } [package.metadata.docs.rs] all-features = true +# Note: add additional linter rules in lib.rs. +# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/datafusion/sql/Cargo.toml b/datafusion/sql/Cargo.toml index ea2cd6dfcc7d8..5e107814176f0 100644 --- a/datafusion/sql/Cargo.toml +++ b/datafusion/sql/Cargo.toml @@ -31,6 +31,9 @@ rust-version = { workspace = true } [package.metadata.docs.rs] all-features = true +# Note: add additional linter rules in lib.rs. +# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/datafusion/sqllogictest/Cargo.toml b/datafusion/sqllogictest/Cargo.toml index 9cf397270100f..177761e4af54e 100644 --- a/datafusion/sqllogictest/Cargo.toml +++ b/datafusion/sqllogictest/Cargo.toml @@ -30,6 +30,9 @@ version = { workspace = true } [package.metadata.docs.rs] all-features = true +# Note: add additional linter rules in lib.rs. +# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/datafusion/substrait/Cargo.toml b/datafusion/substrait/Cargo.toml index bff9a07a13de2..0d7e34881c9cb 100644 --- a/datafusion/substrait/Cargo.toml +++ b/datafusion/substrait/Cargo.toml @@ -27,6 +27,9 @@ license = { workspace = true } authors = { workspace = true } rust-version = { workspace = true } +# Note: add additional linter rules in lib.rs. 
+# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/datafusion/wasmtest/Cargo.toml b/datafusion/wasmtest/Cargo.toml index a1e344979ad01..d8b042cbb76c0 100644 --- a/datafusion/wasmtest/Cargo.toml +++ b/datafusion/wasmtest/Cargo.toml @@ -30,6 +30,9 @@ rust-version = { workspace = true } [package.metadata.docs.rs] all-features = true +# Note: add additional linter rules in lib.rs. +# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/test-utils/Cargo.toml b/test-utils/Cargo.toml index 3a161d5f4d645..cdaee6f442bf2 100644 --- a/test-utils/Cargo.toml +++ b/test-utils/Cargo.toml @@ -22,6 +22,9 @@ edition = { workspace = true } # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +# Note: add additional linter rules in lib.rs. +# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true From 2233796e789bf2a230067f915ec952da8eeec988 Mon Sep 17 00:00:00 2001 From: Yongting You <2010youy01@gmail.com> Date: Sat, 8 Nov 2025 13:26:58 +0800 Subject: [PATCH 0025/1589] minor: Remove inconsistent comment (#18539) ## Which issue does this PR close? - Closes #. ## Rationale for this change In https://github.com/apache/datafusion/pull/18468, there is a inconsistent comment I forget to remove. ## What changes are included in this PR? ## Are these changes tested? ## Are there any user-facing changes? --- datafusion/common/src/lib.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs index c8d5a30ee3e0b..549c265024f91 100644 --- a/datafusion/common/src/lib.rs +++ b/datafusion/common/src/lib.rs @@ -25,8 +25,6 @@ #![deny(clippy::clone_on_ref_ptr)] // https://github.com/apache/datafusion/issues/18503 #![deny(clippy::needless_pass_by_value)] -// This lint rule is enforced in `../Cargo.toml`, but it's okay to skip them in tests -// See details in https://github.com/apache/datafusion/issues/18503 #![cfg_attr(test, allow(clippy::needless_pass_by_value))] mod column; From c826009586b55a1688dd0c119974d4913072f342 Mon Sep 17 00:00:00 2001 From: Jeffrey Vo Date: Sat, 8 Nov 2025 16:37:24 +1100 Subject: [PATCH 0026/1589] Refactor `log()` signature to use coercion API + fixes (#18519) ## Which issue does this PR close? Part of #14763 and #14760 ## Rationale for this change Current `log()` signature has some drawbacks: https://github.com/apache/datafusion/blob/a5eb9121ccf802dda547897155403b08a4fbf774/datafusion/functions/src/math/log.rs#L78-L105 - A bit nasty to look at: mixes numeric with exact float/int with exact decimal (of exact precision and scale) - Can't accommodate arbitrary decimals of any precision/scale (this is true for other functions too) Aim of this PR is to refactor it to use the coercion API, uplifting the API where necessary to make this possible. This simplifies the signature in code, whilst not losing flexibility. Also other minor refactors are included to log. ## What changes are included in this PR? New `TypeSignatureClass` variants: Float, Decimal & Numeric Refactor `log()` signature to be more in line with it's supported implementations. Fix issue in `log()` where `ColumnarValue::Scalar`s were being lost as `ColumnarValue::Array`s for the base. Support null propagation in `simplify()` for `log()`. 
~~Fix issue with `calculate_binary_math` where it wasn't casting scalars.~~ ## Are these changes tested? Added new tests. - Tests for float16, decimal32, decimal64, decimals with different scales/precisions - Test for null propagation (ensure use array input to avoid function inlining) ## Are there any user-facing changes? No. --- datafusion/common/src/scalar/mod.rs | 6 +- datafusion/common/src/types/native.rs | 27 ++- datafusion/expr-common/src/signature.rs | 26 ++- datafusion/functions/src/math/log.rs | 180 ++++++++++-------- .../sqllogictest/test_files/decimal.slt | 40 +++- datafusion/sqllogictest/test_files/math.slt | 27 +++ datafusion/sqllogictest/test_files/order.slt | 32 ++-- 7 files changed, 216 insertions(+), 122 deletions(-) diff --git a/datafusion/common/src/scalar/mod.rs b/datafusion/common/src/scalar/mod.rs index 52e0159111249..fadd2e41eaba4 100644 --- a/datafusion/common/src/scalar/mod.rs +++ b/datafusion/common/src/scalar/mod.rs @@ -1734,7 +1734,7 @@ impl ScalarValue { ) { return _internal_err!("Invalid precision and scale {err}"); } - if *scale <= 0 { + if *scale < 0 { return _internal_err!("Negative scale is not supported"); } match 10_i32.checked_pow((*scale + 1) as u32) { @@ -1750,7 +1750,7 @@ impl ScalarValue { ) { return _internal_err!("Invalid precision and scale {err}"); } - if *scale <= 0 { + if *scale < 0 { return _internal_err!("Negative scale is not supported"); } match i64::from(10).checked_pow((*scale + 1) as u32) { @@ -4407,6 +4407,7 @@ macro_rules! impl_scalar { impl_scalar!(f64, Float64); impl_scalar!(f32, Float32); +impl_scalar!(f16, Float16); impl_scalar!(i8, Int8); impl_scalar!(i16, Int16); impl_scalar!(i32, Int32); @@ -4563,6 +4564,7 @@ impl_try_from!(UInt8, u8); impl_try_from!(UInt16, u16); impl_try_from!(UInt32, u32); impl_try_from!(UInt64, u64); +impl_try_from!(Float16, f16); impl_try_from!(Float32, f32); impl_try_from!(Float64, f64); impl_try_from!(Boolean, bool); diff --git a/datafusion/common/src/types/native.rs b/datafusion/common/src/types/native.rs index 8c41701ae5768..a1495b779ac97 100644 --- a/datafusion/common/src/types/native.rs +++ b/datafusion/common/src/types/native.rs @@ -430,22 +430,7 @@ impl From for NativeType { impl NativeType { #[inline] pub fn is_numeric(&self) -> bool { - use NativeType::*; - matches!( - self, - UInt8 - | UInt16 - | UInt32 - | UInt64 - | Int8 - | Int16 - | Int32 - | Int64 - | Float16 - | Float32 - | Float64 - | Decimal(_, _) - ) + self.is_integer() || self.is_float() || self.is_decimal() } #[inline] @@ -491,4 +476,14 @@ impl NativeType { pub fn is_null(&self) -> bool { matches!(self, NativeType::Null) } + + #[inline] + pub fn is_decimal(&self) -> bool { + matches!(self, Self::Decimal(_, _)) + } + + #[inline] + pub fn is_float(&self) -> bool { + matches!(self, Self::Float16 | Self::Float32 | Self::Float64) + } } diff --git a/datafusion/expr-common/src/signature.rs b/datafusion/expr-common/src/signature.rs index 2bf7092dd2224..6ee1c4a2a40c6 100644 --- a/datafusion/expr-common/src/signature.rs +++ b/datafusion/expr-common/src/signature.rs @@ -21,7 +21,7 @@ use std::fmt::Display; use std::hash::Hash; use crate::type_coercion::aggregates::NUMERICS; -use arrow::datatypes::{DataType, IntervalUnit, TimeUnit}; +use arrow::datatypes::{DataType, Decimal128Type, DecimalType, IntervalUnit, TimeUnit}; use datafusion_common::types::{LogicalType, LogicalTypeRef, NativeType}; use datafusion_common::utils::ListCoercion; use datafusion_common::{internal_err, plan_err, Result}; @@ -333,9 +333,10 @@ pub enum TypeSignatureClass { 
Interval, Duration, Native(LogicalTypeRef), - // TODO: - // Numeric Integer, + Float, + Decimal, + Numeric, /// Encompasses both the native Binary as well as arbitrarily sized FixedSizeBinary types Binary, } @@ -378,6 +379,13 @@ impl TypeSignatureClass { TypeSignatureClass::Binary => { vec![DataType::Binary] } + TypeSignatureClass::Decimal => vec![Decimal128Type::DEFAULT_TYPE], + TypeSignatureClass::Float => vec![DataType::Float64], + TypeSignatureClass::Numeric => vec![ + DataType::Float64, + DataType::Int64, + Decimal128Type::DEFAULT_TYPE, + ], } } @@ -395,6 +403,9 @@ impl TypeSignatureClass { TypeSignatureClass::Duration if logical_type.is_duration() => true, TypeSignatureClass::Integer if logical_type.is_integer() => true, TypeSignatureClass::Binary if logical_type.is_binary() => true, + TypeSignatureClass::Decimal if logical_type.is_decimal() => true, + TypeSignatureClass::Float if logical_type.is_float() => true, + TypeSignatureClass::Numeric if logical_type.is_numeric() => true, _ => false, } } @@ -428,6 +439,15 @@ impl TypeSignatureClass { TypeSignatureClass::Binary if native_type.is_binary() => { Ok(origin_type.to_owned()) } + TypeSignatureClass::Decimal if native_type.is_decimal() => { + Ok(origin_type.to_owned()) + } + TypeSignatureClass::Float if native_type.is_float() => { + Ok(origin_type.to_owned()) + } + TypeSignatureClass::Numeric if native_type.is_numeric() => { + Ok(origin_type.to_owned()) + } _ if native_type.is_null() => Ok(origin_type.to_owned()), _ => internal_err!("May miss the matching logic in `matches_native_type`"), } diff --git a/datafusion/functions/src/math/log.rs b/datafusion/functions/src/math/log.rs index f66f6fcfc1f88..24000a3876bd2 100644 --- a/datafusion/functions/src/math/log.rs +++ b/datafusion/functions/src/math/log.rs @@ -18,18 +18,18 @@ //! Math function: `log()`. 
use std::any::Any; -use std::sync::Arc; use super::power::PowerFunc; use crate::utils::{calculate_binary_math, decimal128_to_i128}; use arrow::array::{Array, ArrayRef}; +use arrow::compute::kernels::cast; use arrow::datatypes::{ - DataType, Decimal128Type, Decimal256Type, Float32Type, Float64Type, Int32Type, - Int64Type, DECIMAL128_MAX_PRECISION, DECIMAL256_MAX_PRECISION, + DataType, Decimal128Type, Decimal256Type, Float16Type, Float32Type, Float64Type, }; use arrow::error::ArrowError; use arrow_buffer::i256; +use datafusion_common::types::NativeType; use datafusion_common::{ exec_err, internal_err, plan_datafusion_err, plan_err, Result, ScalarValue, }; @@ -37,11 +37,12 @@ use datafusion_expr::expr::ScalarFunction; use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo}; use datafusion_expr::sort_properties::{ExprProperties, SortProperties}; use datafusion_expr::{ - lit, ColumnarValue, Documentation, Expr, ScalarFunctionArgs, ScalarUDF, - TypeSignature::*, + lit, Coercion, ColumnarValue, Documentation, Expr, ScalarFunctionArgs, ScalarUDF, + TypeSignature, TypeSignatureClass, }; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use datafusion_macros::user_doc; +use num_traits::Float; #[user_doc( doc_section(label = "Math Functions"), @@ -72,37 +73,28 @@ impl Default for LogFunc { impl LogFunc { pub fn new() -> Self { + // Converts decimals & integers to float64, accepting other floats as is + let as_float = Coercion::new_implicit( + TypeSignatureClass::Float, + vec![TypeSignatureClass::Numeric], + NativeType::Float64, + ); Self { signature: Signature::one_of( + // Ensure decimals have precedence over floats since we have + // a native decimal implementation for log vec![ - Numeric(1), - Numeric(2), - Exact(vec![DataType::Float32, DataType::Float32]), - Exact(vec![DataType::Float64, DataType::Float64]), - Exact(vec![ - DataType::Int64, - DataType::Decimal128(DECIMAL128_MAX_PRECISION, 0), - ]), - Exact(vec![ - DataType::Float32, - DataType::Decimal128(DECIMAL128_MAX_PRECISION, 0), - ]), - Exact(vec![ - DataType::Float64, - DataType::Decimal128(DECIMAL128_MAX_PRECISION, 0), - ]), - Exact(vec![ - DataType::Int64, - DataType::Decimal256(DECIMAL256_MAX_PRECISION, 0), - ]), - Exact(vec![ - DataType::Float32, - DataType::Decimal256(DECIMAL256_MAX_PRECISION, 0), - ]), - Exact(vec![ - DataType::Float64, - DataType::Decimal256(DECIMAL256_MAX_PRECISION, 0), + // log(value) + TypeSignature::Coercible(vec![Coercion::new_exact( + TypeSignatureClass::Decimal, + )]), + TypeSignature::Coercible(vec![as_float.clone()]), + // log(base, value) + TypeSignature::Coercible(vec![ + as_float.clone(), + Coercion::new_exact(TypeSignatureClass::Decimal), ]), + TypeSignature::Coercible(vec![as_float.clone(), as_float.clone()]), ], Volatility::Immutable, ), @@ -160,6 +152,7 @@ impl ScalarUDFImpl for LogFunc { fn return_type(&self, arg_types: &[DataType]) -> Result { // Check last argument (value) match &arg_types.last().ok_or(plan_datafusion_err!("No args"))? 
{ + DataType::Float16 => Ok(DataType::Float16), DataType::Float32 => Ok(DataType::Float32), _ => Ok(DataType::Float64), } @@ -192,68 +185,67 @@ impl ScalarUDFImpl for LogFunc { // Support overloaded log(base, x) and log(x) which defaults to log(10, x) fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { - let args = ColumnarValue::values_to_arrays(&args.args)?; + if args.arg_fields.iter().any(|a| a.data_type().is_null()) { + return ColumnarValue::Scalar(ScalarValue::Null) + .cast_to(args.return_type(), None); + } - let (base, value) = if args.len() == 2 { - // note in f64::log params order is different than in sql. e.g in sql log(base, x) == f64::log(x, base) - (ColumnarValue::Array(Arc::clone(&args[0])), &args[1]) + let (base, value) = if args.args.len() == 2 { + (args.args[0].clone(), &args.args[1]) } else { - // log(num) - assume base is 10 - let ret_type = if args[0].data_type().is_null() { - &DataType::Float64 - } else { - args[0].data_type() - }; + // no base specified, default to 10 ( - ColumnarValue::Array( - ScalarValue::new_ten(ret_type)?.to_array_of_size(args[0].len())?, - ), - &args[0], + ColumnarValue::Scalar(ScalarValue::new_ten(args.return_type())?), + &args.args[0], ) }; + let value = value.to_array(args.number_rows)?; - // All log functors have format 'log(value, base)' - // Therefore, for `calculate_binary_math` the first type means a type of main array - // The second type is the type of the base array (even if derived from main) - let arr: ArrayRef = match value.data_type() { - DataType::Float32 => calculate_binary_math::< - Float32Type, - Float32Type, - Float32Type, - _, - >(value, &base, |x, b| Ok(f32::log(x, b)))?, - DataType::Float64 => calculate_binary_math::< - Float64Type, - Float64Type, - Float64Type, - _, - >(value, &base, |x, b| Ok(f64::log(x, b)))?, - DataType::Int32 => { - calculate_binary_math::( - value, + let output: ArrayRef = match value.data_type() { + DataType::Float16 => { + calculate_binary_math::( + &value, + &base, + |value, base| Ok(value.log(base)), + )? + } + DataType::Float32 => { + calculate_binary_math::( + &value, &base, - |x, b| Ok(f64::log(x as f64, b)), + |value, base| Ok(value.log(base)), )? } - DataType::Int64 => { - calculate_binary_math::( - value, + DataType::Float64 => { + calculate_binary_math::( + &value, &base, - |x, b| Ok(f64::log(x as f64, b)), + |value, base| Ok(value.log(base)), )? } - DataType::Decimal128(_precision, scale) => { + // TODO: native log support for decimal 32 & 64; right now upcast + // to decimal128 to calculate + // https://github.com/apache/datafusion/issues/17555 + DataType::Decimal32(precision, scale) + | DataType::Decimal64(precision, scale) => { calculate_binary_math::( - value, + &cast(&value, &DataType::Decimal128(*precision, *scale))?, &base, - |x, b| log_decimal128(x, *scale, b), + |value, base| log_decimal128(value, *scale, base), )? } - DataType::Decimal256(_precision, scale) => { + DataType::Decimal128(_, scale) => { + calculate_binary_math::( + &value, + &base, + |value, base| log_decimal128(value, *scale, base), + )? + } + DataType::Decimal256(_, scale) => { calculate_binary_math::( - value, + &value, &base, - |x, b| log_decimal256(x, *scale, b), + |value, base| log_decimal256(value, *scale, base), )? 
} other => { @@ -261,7 +253,7 @@ impl ScalarUDFImpl for LogFunc { } }; - Ok(ColumnarValue::Array(arr)) + Ok(ColumnarValue::Array(output)) } fn documentation(&self) -> Option<&Documentation> { @@ -277,17 +269,28 @@ impl ScalarUDFImpl for LogFunc { mut args: Vec, info: &dyn SimplifyInfo, ) -> Result { + let mut arg_types = args + .iter() + .map(|arg| info.get_data_type(arg)) + .collect::>>()?; + let return_type = self.return_type(&arg_types)?; + + // Null propagation + if arg_types.iter().any(|dt| dt.is_null()) { + return Ok(ExprSimplifyResult::Simplified(lit( + ScalarValue::Null.cast_to(&return_type)? + ))); + } + // Args are either // log(number) // log(base, number) let num_args = args.len(); - if num_args > 2 { + if num_args != 1 && num_args != 2 { return plan_err!("Expected log to have 1 or 2 arguments, got {num_args}"); } - let number = args.pop().ok_or_else(|| { - plan_datafusion_err!("Expected log to have 1 or 2 arguments, got 0") - })?; - let number_datatype = info.get_data_type(&number)?; + let number = args.pop().unwrap(); + let number_datatype = arg_types.pop().unwrap(); // default to base 10 let base = if let Some(base) = args.pop() { base @@ -339,6 +342,7 @@ fn is_pow(func: &ScalarUDF) -> bool { #[cfg(test)] mod tests { use std::collections::HashMap; + use std::sync::Arc; use super::*; @@ -353,6 +357,18 @@ mod tests { use datafusion_expr::execution_props::ExecutionProps; use datafusion_expr::simplify::SimplifyContext; + #[test] + fn test_log_decimal_native() { + let value = 10_i128.pow(35); + assert_eq!((value as f64).log2(), 116.26748332105768); + assert_eq!( + log_decimal128(value, 0, 2.0).unwrap(), + // TODO: see we're losing our decimal points compared to above + // https://github.com/apache/datafusion/issues/18524 + 116.0 + ); + } + #[test] fn test_log_invalid_base_type() { let arg_fields = vec![ diff --git a/datafusion/sqllogictest/test_files/decimal.slt b/datafusion/sqllogictest/test_files/decimal.slt index 502821fcc3043..f350d9b3bfe1f 100644 --- a/datafusion/sqllogictest/test_files/decimal.slt +++ b/datafusion/sqllogictest/test_files/decimal.slt @@ -794,15 +794,47 @@ select 100000000000000000000000000000000000::decimal(38,0), arrow_typeof(1000000 ---- 100000000000000000000000000000000000 Decimal128(38, 0) +# log for small decimal32 +query R +select log(arrow_cast(100, 'Decimal32(9, 0)')); +---- +2 + +query R +select log(arrow_cast(100, 'Decimal32(9, 2)')); +---- +2 + +# log for small decimal64 +query R +select log(arrow_cast(100, 'Decimal64(18, 0)')); +---- +2 + +query R +select log(arrow_cast(100, 'Decimal64(18, 2)')); +---- +2 + # log for small decimal128 query R -select log(100::decimal(38,0)); +select log(arrow_cast(100, 'Decimal128(38, 0)')); +---- +2 + +query R +select log(arrow_cast(100, 'Decimal128(38, 2)')); ---- 2 # log for small decimal256 query R -select log(100::decimal(76,0)); +select log(arrow_cast(100, 'Decimal256(76, 0)')); +---- +2 + +query R +select log(arrow_cast(100, 'Decimal256(76, 2)')); ---- 2 @@ -858,10 +890,12 @@ select log(2, 100000000000000000000000000000000000::decimal(38,0)); 116 # log(10^35) for decimal128 with another base +# TODO: this should be 116.267483321058, error with native decimal log impl +# https://github.com/apache/datafusion/issues/18524 query R select log(2.0, 100000000000000000000000000000000000::decimal(38,0)); ---- -116.267483321058 +116 # null cases query R diff --git a/datafusion/sqllogictest/test_files/math.slt b/datafusion/sqllogictest/test_files/math.slt index 1cb68b85b2bce..edba5354e001d 100644 --- 
a/datafusion/sqllogictest/test_files/math.slt +++ b/datafusion/sqllogictest/test_files/math.slt @@ -705,3 +705,30 @@ select FACTORIAL(350943270); statement ok drop table signed_integers + +# Null propagation for log +query TT +EXPLAIN SELECT log(NULL, c2) from aggregate_simple; +---- +logical_plan +01)Projection: Float64(NULL) AS log(NULL,aggregate_simple.c2) +02)--TableScan: aggregate_simple projection=[] +physical_plan +01)ProjectionExec: expr=[NULL as log(NULL,aggregate_simple.c2)] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/aggregate_simple.csv]]}, file_type=csv, has_header=true + +# Float 16/32/64 for log +query RT +SELECT log(2.5, arrow_cast(10.9, 'Float16')), arrow_typeof(log(2.5, arrow_cast(10.9, 'Float16'))); +---- +2.6074219 Float16 + +query RT +SELECT log(2.5, 10.9::float), arrow_typeof(log(2.5, 10.9::float)); +---- +2.606992 Float32 + +query RT +SELECT log(2.5, 10.9::double), arrow_typeof(log(2.5, 10.9::double)); +---- +2.606992198152 Float64 diff --git a/datafusion/sqllogictest/test_files/order.slt b/datafusion/sqllogictest/test_files/order.slt index 04a7615c764b8..a73f56079e3fa 100644 --- a/datafusion/sqllogictest/test_files/order.slt +++ b/datafusion/sqllogictest/test_files/order.slt @@ -675,11 +675,11 @@ query TT ---- logical_plan 01)Sort: log_c11_base_c12 ASC NULLS LAST -02)--Projection: log(aggregate_test_100.c12, CAST(aggregate_test_100.c11 AS Float64)) AS log_c11_base_c12 +02)--Projection: log(aggregate_test_100.c12, aggregate_test_100.c11) AS log_c11_base_c12 03)----TableScan: aggregate_test_100 projection=[c11, c12] physical_plan 01)SortPreservingMergeExec: [log_c11_base_c12@0 ASC NULLS LAST] -02)--ProjectionExec: expr=[log(c12@1, CAST(c11@0 AS Float64)) as log_c11_base_c12] +02)--ProjectionExec: expr=[log(c12@1, c11@0) as log_c11_base_c12] 03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c11, c12], output_orderings=[[c11@0 ASC NULLS LAST], [c12@1 DESC NULLS LAST]], file_type=csv, has_header=true @@ -690,11 +690,11 @@ ORDER BY log_c12_base_c11 DESC NULLS LAST; ---- logical_plan 01)Sort: log_c12_base_c11 DESC NULLS LAST -02)--Projection: log(CAST(aggregate_test_100.c11 AS Float64), aggregate_test_100.c12) AS log_c12_base_c11 +02)--Projection: log(aggregate_test_100.c11, aggregate_test_100.c12) AS log_c12_base_c11 03)----TableScan: aggregate_test_100 projection=[c11, c12] physical_plan 01)SortPreservingMergeExec: [log_c12_base_c11@0 DESC NULLS LAST] -02)--ProjectionExec: expr=[log(CAST(c11@0 AS Float64), c12@1) as log_c12_base_c11] +02)--ProjectionExec: expr=[log(c11@0, c12@1) as log_c12_base_c11] 03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c11, c12], output_orderings=[[c11@0 ASC NULLS LAST], [c12@1 DESC NULLS LAST]], file_type=csv, has_header=true @@ -1024,10 +1024,10 @@ ORDER BY SUM(column1) # ORDER BY with a GROUP BY clause query I -SELECT SUM(column1) - FROM foo -GROUP BY column2 -ORDER BY SUM(column1) +SELECT SUM(column1) + FROM foo +GROUP BY column2 +ORDER BY SUM(column1) ---- 0 2 @@ -1039,12 +1039,12 @@ ORDER BY SUM(column1) # ORDER BY with a GROUP BY clause and a HAVING clause query I -SELECT - SUM(column1) -FROM foo -GROUP BY column2 -HAVING SUM(column1) < 3 -ORDER BY SUM(column1) +SELECT + SUM(column1) +FROM foo +GROUP BY 
column2 +HAVING SUM(column1) < 3 +ORDER BY SUM(column1) ---- 0 2 @@ -1179,7 +1179,7 @@ physical_plan 04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[c], output_ordering=[c@0 ASC NULLS LAST], file_type=csv, has_header=true statement ok -drop table ordered_table; +drop table ordered_table; # ABS(x) breaks the ordering if x's range contains both negative and positive values. @@ -1215,7 +1215,7 @@ physical_plan 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[c], output_ordering=[c@0 ASC NULLS LAST], file_type=csv, has_header=true statement ok -drop table ordered_table; +drop table ordered_table; # ABS(x) preserves the ordering if x's range falls into positive values. # Since x is defined as INT UNSIGNED, its range is assumed to be from 0 to INF. From f470914c9e780a02e97039640da349e4f2333e74 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 8 Nov 2025 04:50:22 -0500 Subject: [PATCH 0027/1589] chore(deps): bump taiki-e/install-action from 2.62.46 to 2.62.47 (#18508) Bumps [taiki-e/install-action](https://github.com/taiki-e/install-action) from 2.62.46 to 2.62.47.
Release notes

Sourced from taiki-e/install-action's releases.

2.62.47

- Update vacuum@latest to 0.20.0.
- Update cargo-nextest@latest to 0.9.111.
- Update cargo-shear@latest to 1.6.2.

Changelog

Sourced from taiki-e/install-action's changelog.

All notable changes to this project will be documented in this file.
This project adheres to Semantic Versioning.

[Unreleased]

- Update cargo-udeps@latest to 0.1.60.
- Update zizmor@latest to 1.16.3.

[2.62.47] - 2025-11-05

- Update vacuum@latest to 0.20.0.
- Update cargo-nextest@latest to 0.9.111.
- Update cargo-shear@latest to 1.6.2.

[2.62.46] - 2025-11-04

- Update vacuum@latest to 0.19.5.
- Update syft@latest to 1.37.0.
- Update mise@latest to 2025.11.2.
- Update knope@latest to 0.21.5.

[2.62.45] - 2025-11-02

- Update zizmor@latest to 1.16.2.
- Update cargo-binstall@latest to 1.15.10.
- Update ubi@latest to 0.8.4.
- Update mise@latest to 2025.11.1.
- Update cargo-semver-checks@latest to 0.45.0.

[2.62.44] - 2025-11-01

- Update mise@latest to 2025.11.0.

... (truncated)

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=taiki-e/install-action&package-manager=github_actions&previous-version=2.62.46&new-version=2.62.47)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Oleks V --- .github/workflows/audit.yml | 2 +- .github/workflows/rust.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/audit.yml b/.github/workflows/audit.yml index f0a03d9841a9d..f269331e83ca7 100644 --- a/.github/workflows/audit.yml +++ b/.github/workflows/audit.yml @@ -42,7 +42,7 @@ jobs: steps: - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Install cargo-audit - uses: taiki-e/install-action@f535147c22906d77695e11cb199e764aa610a4fc # v2.62.46 + uses: taiki-e/install-action@6f9c7cc51aa54b13cbcbd12f8bbf69d8ba405b4b # v2.62.47 with: tool: cargo-audit - name: Run audit check diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 4b3c31e6b3b0c..c57300eec0e4d 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -434,7 +434,7 @@ jobs: sudo apt-get update -qq sudo apt-get install -y -qq clang - name: Setup wasm-pack - uses: taiki-e/install-action@f535147c22906d77695e11cb199e764aa610a4fc # v2.62.46 + uses: taiki-e/install-action@6f9c7cc51aa54b13cbcbd12f8bbf69d8ba405b4b # v2.62.47 with: tool: wasm-pack - name: Run tests with headless mode @@ -761,7 +761,7 @@ jobs: - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - name: Install cargo-msrv - uses: taiki-e/install-action@f535147c22906d77695e11cb199e764aa610a4fc # v2.62.46 + uses: taiki-e/install-action@6f9c7cc51aa54b13cbcbd12f8bbf69d8ba405b4b # v2.62.47 with: tool: cargo-msrv From 3f2b6ebd2599daf2a98dd19a2b4be3358023e81d Mon Sep 17 00:00:00 2001 From: Alex Huang Date: Sat, 8 Nov 2025 12:08:42 +0200 Subject: [PATCH 0028/1589] feat: Enhance `array_slice` functionality to support `ListView` and `LargeListView` types (#18432) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Which issue does this PR close? - Closes #18351 ## Rationale for this change `array_slice` accepts `ListView` / `LargeListView` inputs. ## What changes are included in this PR? - Extend array_slice_inner to handle `ListView`/`LargeListView` arrays directly. - Share the stride/bounds logic between list and list‑view implementations via a new `SlicePlan`. ## Are these changes tested? Yes ## Are there any user-facing changes? Yes. `array_slice` now accepts `ListView` and `LargeListView` arrays without requiring an explicit cast. --- datafusion/functions-nested/src/extract.rs | 598 ++++++++++++++----- datafusion/sqllogictest/test_files/array.slt | 29 + 2 files changed, 491 insertions(+), 136 deletions(-) diff --git a/datafusion/functions-nested/src/extract.rs b/datafusion/functions-nested/src/extract.rs index a46c9c75094c6..57505c59493af 100644 --- a/datafusion/functions-nested/src/extract.rs +++ b/datafusion/functions-nested/src/extract.rs @@ -18,18 +18,21 @@ //! [`ScalarUDFImpl`] definitions for array_element, array_slice, array_pop_front, array_pop_back, and array_any_value functions. 
use arrow::array::{ - Array, ArrayRef, ArrowNativeTypeOp, Capacities, GenericListArray, Int64Array, + Array, ArrayRef, Capacities, GenericListArray, GenericListViewArray, Int64Array, MutableArrayData, NullArray, NullBufferBuilder, OffsetSizeTrait, }; -use arrow::buffer::OffsetBuffer; +use arrow::buffer::{OffsetBuffer, ScalarBuffer}; use arrow::datatypes::DataType; use arrow::datatypes::{ - DataType::{FixedSizeList, LargeList, List, Null}, + DataType::{FixedSizeList, LargeList, LargeListView, List, ListView, Null}, Field, }; -use datafusion_common::cast::as_int64_array; use datafusion_common::cast::as_large_list_array; use datafusion_common::cast::as_list_array; +use datafusion_common::cast::{ + as_int64_array, as_large_list_view_array, as_list_view_array, +}; +use datafusion_common::internal_err; use datafusion_common::utils::ListCoercion; use datafusion_common::{ exec_datafusion_err, exec_err, internal_datafusion_err, plan_err, @@ -449,10 +452,162 @@ fn array_slice_inner(args: &[ArrayRef]) -> Result { let array = as_large_list_array(&args[0])?; general_array_slice::(array, from_array, to_array, stride) } + ListView(_) => { + let array = as_list_view_array(&args[0])?; + general_list_view_array_slice::(array, from_array, to_array, stride) + } + LargeListView(_) => { + let array = as_large_list_view_array(&args[0])?; + general_list_view_array_slice::(array, from_array, to_array, stride) + } _ => exec_err!("array_slice does not support type: {}", array_data_type), } } +fn adjusted_from_index(index: i64, len: O) -> Result> +where + i64: TryInto, +{ + // 0 ~ len - 1 + let adjusted_zero_index = if index < 0 { + if let Ok(index) = index.try_into() { + // When index < 0 and -index > length, index is clamped to the beginning of the list. + // Otherwise, when index < 0, the index is counted from the end of the list. + // + // Note, we actually test the contrapositive, index < -length, because negating a + // negative will panic if the negative is equal to the smallest representable value + // while negating a positive is always safe. 
+ if index < (O::zero() - O::one()) * len { + O::zero() + } else { + index + len + } + } else { + return exec_err!("array_slice got invalid index: {}", index); + } + } else { + // array_slice(arr, 1, to) is the same as array_slice(arr, 0, to) + if let Ok(index) = index.try_into() { + std::cmp::max(index - O::usize_as(1), O::usize_as(0)) + } else { + return exec_err!("array_slice got invalid index: {}", index); + } + }; + + if O::usize_as(0) <= adjusted_zero_index && adjusted_zero_index < len { + Ok(Some(adjusted_zero_index)) + } else { + // Out of bounds + Ok(None) + } +} + +fn adjusted_to_index(index: i64, len: O) -> Result> +where + i64: TryInto, +{ + // 0 ~ len - 1 + let adjusted_zero_index = if index < 0 { + // array_slice in duckdb with negative to_index is python-like, so index itself is exclusive + if let Ok(index) = index.try_into() { + index + len + } else { + return exec_err!("array_slice got invalid index: {}", index); + } + } else { + // array_slice(arr, from, len + 1) is the same as array_slice(arr, from, len) + if let Ok(index) = index.try_into() { + std::cmp::min(index - O::usize_as(1), len - O::usize_as(1)) + } else { + return exec_err!("array_slice got invalid index: {}", index); + } + }; + + if O::usize_as(0) <= adjusted_zero_index && adjusted_zero_index < len { + Ok(Some(adjusted_zero_index)) + } else { + // Out of bounds + Ok(None) + } +} + +/// Internal plan describing how to materialize a single row's slice after +/// the slice bounds/stride have been normalized. Both list layouts consume +/// this to drive their copy logic. +enum SlicePlan { + /// No values should be produced. + Empty, + /// A contiguous run starting at `start` (relative to the row) with `len` + /// elements can be copied in one go. + Contiguous { start: O, len: O }, + /// Arbitrary positions (already relative to the row) must be copied in + /// sequence. + Indices(Vec), +} + +/// Produces a [`SlicePlan`] for the given logical slice parameters. +fn compute_slice_plan( + len: O, + from_raw: i64, + to_raw: i64, + stride_raw: Option, +) -> Result> +where + i64: TryInto, +{ + if len == O::usize_as(0) { + return Ok(SlicePlan::Empty); + } + + let from_index = adjusted_from_index::(from_raw, len)?; + let to_index = adjusted_to_index::(to_raw, len)?; + + let (Some(from), Some(to)) = (from_index, to_index) else { + return Ok(SlicePlan::Empty); + }; + + let stride_value = stride_raw.unwrap_or(1); + if stride_value == 0 { + return exec_err!( + "array_slice got invalid stride: {:?}, it cannot be 0", + stride_value + ); + } + + if (from < to && stride_value.is_negative()) + || (from > to && stride_value.is_positive()) + { + return Ok(SlicePlan::Empty); + } + + let stride: O = stride_value.try_into().map_err(|_| { + internal_datafusion_err!("array_slice got invalid stride: {}", stride_value) + })?; + + if from <= to && stride_value.is_positive() { + if stride_value == 1 { + let len = to - from + O::usize_as(1); + Ok(SlicePlan::Contiguous { start: from, len }) + } else { + let mut indices = Vec::new(); + let mut index = from; + while index <= to { + indices.push(index); + index += stride; + } + Ok(SlicePlan::Indices(indices)) + } + } else { + let mut indices = Vec::new(); + let mut index = from; + while index >= to { + indices.push(index); + index += stride; + } + Ok(SlicePlan::Indices(indices)) + } +} + fn general_array_slice( array: &GenericListArray, from_array: &Int64Array, @@ -472,73 +627,6 @@ where // We have the slice syntax compatible with DuckDB v0.8.1. 
// The rule `adjusted_from_index` and `adjusted_to_index` follows the rule of array_slice in duckdb. - fn adjusted_from_index(index: i64, len: O) -> Result> - where - i64: TryInto, - { - // 0 ~ len - 1 - let adjusted_zero_index = if index < 0 { - if let Ok(index) = index.try_into() { - // When index < 0 and -index > length, index is clamped to the beginning of the list. - // Otherwise, when index < 0, the index is counted from the end of the list. - // - // Note, we actually test the contrapositive, index < -length, because negating a - // negative will panic if the negative is equal to the smallest representable value - // while negating a positive is always safe. - if index < (O::zero() - O::one()) * len { - O::zero() - } else { - index + len - } - } else { - return exec_err!("array_slice got invalid index: {}", index); - } - } else { - // array_slice(arr, 1, to) is the same as array_slice(arr, 0, to) - if let Ok(index) = index.try_into() { - std::cmp::max(index - O::usize_as(1), O::usize_as(0)) - } else { - return exec_err!("array_slice got invalid index: {}", index); - } - }; - - if O::usize_as(0) <= adjusted_zero_index && adjusted_zero_index < len { - Ok(Some(adjusted_zero_index)) - } else { - // Out of bounds - Ok(None) - } - } - - fn adjusted_to_index(index: i64, len: O) -> Result> - where - i64: TryInto, - { - // 0 ~ len - 1 - let adjusted_zero_index = if index < 0 { - // array_slice in duckdb with negative to_index is python-like, so index itself is exclusive - if let Ok(index) = index.try_into() { - index + len - } else { - return exec_err!("array_slice got invalid index: {}", index); - } - } else { - // array_slice(arr, from, len + 1) is the same as array_slice(arr, from, len) - if let Ok(index) = index.try_into() { - std::cmp::min(index - O::usize_as(1), len - O::usize_as(1)) - } else { - return exec_err!("array_slice got invalid index: {}", index); - } - }; - - if O::usize_as(0) <= adjusted_zero_index && adjusted_zero_index < len { - Ok(Some(adjusted_zero_index)) - } else { - // Out of bounds - Ok(None) - } - } - let mut offsets = vec![O::usize_as(0)]; let mut null_builder = NullBufferBuilder::new(array.len()); @@ -551,6 +639,7 @@ where if array.is_null(row_index) || from_array.is_null(row_index) || to_array.is_null(row_index) + || stride.is_some_and(|s| s.is_null(row_index)) { mutable.extend_nulls(1); offsets.push(offsets[row_index] + O::usize_as(1)); @@ -565,72 +654,32 @@ where continue; } - let from_index = adjusted_from_index::(from_array.value(row_index), len)?; - let to_index = adjusted_to_index::(to_array.value(row_index), len)?; - - if let (Some(from), Some(to)) = (from_index, to_index) { - let stride = stride.map(|s| s.value(row_index)); - // Default stride is 1 if not provided - let stride = stride.unwrap_or(1); - if stride.is_zero() { - return exec_err!( - "array_slice got invalid stride: {:?}, it cannot be 0", - stride - ); - } else if (from < to && stride.is_negative()) - || (from > to && stride.is_positive()) - { - // return empty array - offsets.push(offsets[row_index]); - continue; + let slice_plan = compute_slice_plan::( + len, + from_array.value(row_index), + to_array.value(row_index), + stride.map(|s| s.value(row_index)), + )?; + + match slice_plan { + SlicePlan::Empty => offsets.push(offsets[row_index]), + SlicePlan::Contiguous { + start: rel_start, + len: slice_len, + } => { + let start_index = (start + rel_start).to_usize().unwrap(); + let end_index = (start + rel_start + slice_len).to_usize().unwrap(); + mutable.extend(0, start_index, end_index); + 
offsets.push(offsets[row_index] + slice_len); } - - let stride: O = stride.try_into().map_err(|_| { - internal_datafusion_err!("array_slice got invalid stride: {}", stride) - })?; - - if from <= to && stride > O::zero() { - assert!(start + to <= end); - if stride.eq(&O::one()) { - // stride is default to 1 - mutable.extend( - 0, - (start + from).to_usize().unwrap(), - (start + to + O::usize_as(1)).to_usize().unwrap(), - ); - offsets.push(offsets[row_index] + (to - from + O::usize_as(1))); - continue; - } - let mut index = start + from; - let mut cnt = 0; - while index <= start + to { - mutable.extend( - 0, - index.to_usize().unwrap(), - index.to_usize().unwrap() + 1, - ); - index += stride; - cnt += 1; + SlicePlan::Indices(indices) => { + let count = indices.len(); + for rel_index in indices { + let absolute_index = (start + rel_index).to_usize().unwrap(); + mutable.extend(0, absolute_index, absolute_index + 1); } - offsets.push(offsets[row_index] + O::usize_as(cnt)); - } else { - let mut index = start + from; - let mut cnt = 0; - while index >= start + to { - mutable.extend( - 0, - index.to_usize().unwrap(), - index.to_usize().unwrap() + 1, - ); - index += stride; - cnt += 1; - } - // invalid range, return empty array - offsets.push(offsets[row_index] + O::usize_as(cnt)); + offsets.push(offsets[row_index] + O::usize_as(count)); } - } else { - // invalid range, return empty array - offsets.push(offsets[row_index]); } } @@ -644,6 +693,107 @@ where )?)) } +fn general_list_view_array_slice( + array: &GenericListViewArray, + from_array: &Int64Array, + to_array: &Int64Array, + stride: Option<&Int64Array>, +) -> Result +where + i64: TryInto, +{ + let values = array.values(); + let original_data = values.to_data(); + let capacity = Capacities::Array(original_data.len()); + let field = match array.data_type() { + ListView(field) | LargeListView(field) => Arc::clone(field), + other => { + return internal_err!("array_slice got unexpected data type: {}", other); + } + }; + + let mut mutable = + MutableArrayData::with_capacities(vec![&original_data], true, capacity); + + // We must build `offsets` and `sizes` buffers manually as ListView does not enforce + // monotonically increasing offsets. + let mut offsets = Vec::with_capacity(array.len()); + let mut sizes = Vec::with_capacity(array.len()); + let mut current_offset = O::usize_as(0); + let mut null_builder = NullBufferBuilder::new(array.len()); + + for row_index in 0..array.len() { + // Propagate NULL semantics: any NULL input yields a NULL output slot. + if array.is_null(row_index) + || from_array.is_null(row_index) + || to_array.is_null(row_index) + || stride.is_some_and(|s| s.is_null(row_index)) + { + null_builder.append_null(); + offsets.push(current_offset); + sizes.push(O::usize_as(0)); + continue; + } + null_builder.append_non_null(); + + let len = array.value_size(row_index); + + // Empty arrays always return an empty array. 
+ if len == O::usize_as(0) { + offsets.push(current_offset); + sizes.push(O::usize_as(0)); + continue; + } + + let slice_plan = compute_slice_plan::( + len, + from_array.value(row_index), + to_array.value(row_index), + stride.map(|s| s.value(row_index)), + )?; + + let start = array.value_offset(row_index); + match slice_plan { + SlicePlan::Empty => { + offsets.push(current_offset); + sizes.push(O::usize_as(0)); + } + SlicePlan::Contiguous { + start: rel_start, + len: slice_len, + } => { + let start_index = (start + rel_start).to_usize().unwrap(); + let end_index = (start + rel_start + slice_len).to_usize().unwrap(); + mutable.extend(0, start_index, end_index); + offsets.push(current_offset); + sizes.push(slice_len); + current_offset += slice_len; + } + SlicePlan::Indices(indices) => { + let count = indices.len(); + for rel_index in indices { + let absolute_index = (start + rel_index).to_usize().unwrap(); + mutable.extend(0, absolute_index, absolute_index + 1); + } + let length = O::usize_as(count); + offsets.push(current_offset); + sizes.push(length); + current_offset += length; + } + } + } + + let data = mutable.freeze(); + + Ok(Arc::new(GenericListViewArray::::try_new( + field, + ScalarBuffer::from(offsets), + ScalarBuffer::from(sizes), + arrow::array::make_array(data), + null_builder.finish(), + )?)) +} + #[user_doc( doc_section(label = "Array Functions"), description = "Returns the array without the first element.", @@ -977,12 +1127,28 @@ where #[cfg(test)] mod tests { - use super::array_element_udf; + use super::{array_element_udf, general_list_view_array_slice}; + use arrow::array::{ + cast::AsArray, Array, ArrayRef, GenericListViewArray, Int32Array, Int64Array, + ListViewArray, + }; + use arrow::buffer::ScalarBuffer; use arrow::datatypes::{DataType, Field}; - use datafusion_common::{Column, DFSchema}; + use datafusion_common::{Column, DFSchema, Result}; use datafusion_expr::expr::ScalarFunction; use datafusion_expr::{Expr, ExprSchemable}; use std::collections::HashMap; + use std::sync::Arc; + + fn list_view_values(array: &GenericListViewArray) -> Vec> { + (0..array.len()) + .map(|i| { + let child = array.value(i); + let values = child.as_any().downcast_ref::().unwrap(); + values.iter().map(|v| v.unwrap()).collect() + }) + .collect() + } // Regression test for https://github.com/apache/datafusion/issues/13755 #[test] @@ -1028,4 +1194,164 @@ mod tests { fixed_size_list_type ); } + + #[test] + fn test_array_slice_list_view_basic() -> Result<()> { + let values: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])); + let offsets = ScalarBuffer::from(vec![0, 3]); + let sizes = ScalarBuffer::from(vec![3, 2]); + let field = Arc::new(Field::new("item", DataType::Int32, true)); + let array = ListViewArray::new(field, offsets, sizes, values, None); + + let from = Int64Array::from(vec![2, 1]); + let to = Int64Array::from(vec![3, 2]); + + let result = general_list_view_array_slice::( + &array, + &from, + &to, + None::<&Int64Array>, + )?; + let result = result.as_ref().as_list_view::(); + + assert_eq!(list_view_values(result), vec![vec![2, 3], vec![4, 5]]); + Ok(()) + } + + #[test] + fn test_array_slice_list_view_non_monotonic_offsets() -> Result<()> { + // First list references the tail of the values buffer, second list references the head. 
+ let values: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])); + let offsets = ScalarBuffer::from(vec![3, 0]); + let sizes = ScalarBuffer::from(vec![2, 3]); + let field = Arc::new(Field::new("item", DataType::Int32, true)); + let array = ListViewArray::new(field, offsets, sizes, values, None); + + let from = Int64Array::from(vec![1, 1]); + let to = Int64Array::from(vec![2, 2]); + + let result = general_list_view_array_slice::( + &array, + &from, + &to, + None::<&Int64Array>, + )?; + let result = result.as_ref().as_list_view::(); + + assert_eq!(list_view_values(result), vec![vec![4, 5], vec![1, 2]]); + Ok(()) + } + + #[test] + fn test_array_slice_list_view_negative_stride() -> Result<()> { + let values: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])); + let offsets = ScalarBuffer::from(vec![0, 3]); + let sizes = ScalarBuffer::from(vec![3, 2]); + let field = Arc::new(Field::new("item", DataType::Int32, true)); + let array = ListViewArray::new(field, offsets, sizes, values, None); + + let from = Int64Array::from(vec![3, 2]); + let to = Int64Array::from(vec![1, 1]); + let stride = Int64Array::from(vec![-1, -1]); + + let result = + general_list_view_array_slice::(&array, &from, &to, Some(&stride))?; + let result = result.as_ref().as_list_view::(); + + assert_eq!(list_view_values(result), vec![vec![3, 2, 1], vec![5, 4]]); + Ok(()) + } + + #[test] + fn test_array_slice_list_view_out_of_order() -> Result<()> { + let values: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])); + let offsets = ScalarBuffer::from(vec![3, 1, 0]); + let sizes = ScalarBuffer::from(vec![2, 2, 1]); + let field = Arc::new(Field::new("item", DataType::Int32, true)); + let array = ListViewArray::new(field, offsets, sizes, values, None); + assert_eq!( + list_view_values(&array), + vec![vec![4, 5], vec![2, 3], vec![1]] + ); + + let from = Int64Array::from(vec![2, 2, 2]); + let to = Int64Array::from(vec![1, 1, 1]); + let stride = Int64Array::from(vec![-1, -1, -1]); + + let result = + general_list_view_array_slice::(&array, &from, &to, Some(&stride))?; + let result = result.as_ref().as_list_view::(); + + assert_eq!( + list_view_values(result), + vec![vec![5, 4], vec![3, 2], vec![]] + ); + Ok(()) + } + + #[test] + fn test_array_slice_list_view_with_nulls() -> Result<()> { + let values: ArrayRef = Arc::new(Int32Array::from(vec![ + Some(1), + None, + Some(3), + Some(4), + Some(5), + ])); + let offsets = ScalarBuffer::from(vec![0, 2, 5]); + let sizes = ScalarBuffer::from(vec![2, 3, 0]); + let field = Arc::new(Field::new("item", DataType::Int32, true)); + let array = ListViewArray::new(field, offsets, sizes, values, None); + + let from = Int64Array::from(vec![1, 1, 1]); + let to = Int64Array::from(vec![2, 2, 1]); + + let result = general_list_view_array_slice::(&array, &from, &to, None)?; + let result = result.as_ref().as_list_view::(); + + let actual: Vec>> = (0..result.len()) + .map(|i| { + result + .value(i) + .as_any() + .downcast_ref::() + .unwrap() + .iter() + .collect() + }) + .collect(); + + assert_eq!( + actual, + vec![vec![Some(1), None], vec![Some(3), Some(4)], Vec::new(),] + ); + + // Test with NULL stride - should return NULL for rows with NULL stride + let stride_with_null = Int64Array::from(vec![Some(1), None, Some(1)]); + let result = general_list_view_array_slice::( + &array, + &from, + &to, + Some(&stride_with_null), + )?; + let result = result.as_ref().as_list_view::(); + + // First row: stride = 1, should return [1, None] + // Second row: stride = NULL, should return NULL + // Third 
row: stride = 1, empty array should return empty + assert!(!result.is_null(0)); // First row should not be null + assert!(result.is_null(1)); // Second row should be null (stride is NULL) + assert!(!result.is_null(2)); // Third row should not be null + + let first_row: Vec> = result + .value(0) + .as_any() + .downcast_ref::() + .unwrap() + .iter() + .collect(); + assert_eq!(first_row, vec![Some(1), None]); + + Ok(()) + } } diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index 00629c392df48..7aa267a4dc6d7 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -1943,6 +1943,19 @@ select array_slice(make_array(1, 2, 3, 4, 5), 5, 1, -2), array_slice(make_array( ---- [5, 3, 1] [o, l, h] +# Test NULL stride +query ?? +select array_slice(make_array(1, 2, 3, 4, 5), 1, 5, NULL), array_slice(make_array('h', 'e', 'l', 'l', 'o'), 1, 5, NULL); +---- +NULL NULL + +# Test NULL stride +query ?? +select array_slice(arrow_cast(make_array(1, 2, 3, 4, 5), 'LargeList(Int64)'), 1, 5, NULL), + array_slice(arrow_cast(make_array('h', 'e', 'l', 'l', 'o'), 'LargeList(Utf8)'), 1, 5, NULL); +---- +NULL NULL + query ?? select array_slice(arrow_cast(make_array(1, 2, 3, 4, 5), 'LargeList(Int64)'), 2, 4), array_slice(arrow_cast(make_array('h', 'e', 'l', 'l', 'o'), 'LargeList(Utf8)'), 1, 2); ---- @@ -1965,6 +1978,14 @@ select array_slice(arrow_cast(make_array(1, 2, 3, 4, 5), 'LargeList(Int64)'), 0, ---- [1, 2, 3, 4, 5] [h, e, l, l, o] +# TODO: Enable once arrow_cast supports ListView types. +# Expected output (once supported): +# ---- +# [1, 2, 3, 4, 5] [h, e, l, l, o] +query error DataFusion error: Execution error: Unsupported type 'ListView\(Int64\)'. Must be a supported arrow type name such as 'Int32' or 'Timestamp\(ns\)'. Error unknown token: ListView +select array_slice(arrow_cast(make_array(1, 2, 3, 4, 5), 'ListView(Int64)'), 0, 6), + array_slice(arrow_cast(make_array('h', 'e', 'l', 'l', 'o'), 'ListView(Utf8)'), 0, 5); + query ?? select array_slice(arrow_cast(make_array(1, 2, 3, 4, 5), 'FixedSizeList(5, Int64)'), 0, 6), array_slice(arrow_cast(make_array('h', 'e', 'l', 'l', 'o'), 'FixedSizeList(5, Utf8)'), 0, 5); @@ -2004,6 +2025,14 @@ select array_slice(arrow_cast(make_array(1, 2, 3, 4, 5), 'LargeList(Int64)'), 2, ---- [2, 3, 4, 5] [l, l, o] +# TODO: Enable once arrow_cast supports LargeListView types. +# Expected output (once supported): +# ---- +# [2, 3, 4, 5] [l, l, o] +query error DataFusion error: Execution error: Unsupported type 'LargeListView\(Int64\)'. Must be a supported arrow type name such as 'Int32' or 'Timestamp\(ns\)'. Error unknown token: LargeListView +select array_slice(arrow_cast(make_array(1, 2, 3, 4, 5), 'LargeListView(Int64)'), 2, 6), + array_slice(arrow_cast(make_array('h', 'e', 'l', 'l', 'o'), 'LargeListView(Utf8)'), 3, 7); + # array_slice scalar function #6 (with positive indexes; nested array) query ? select array_slice(make_array(make_array(1, 2, 3, 4, 5), make_array(6, 7, 8, 9, 10)), 1, 1); From 3de195a6d4b4f32a857513ee7107b7e3b07734bc Mon Sep 17 00:00:00 2001 From: Sergey Zhukov <62326549+cj-zhukov@users.noreply.github.com> Date: Sat, 8 Nov 2025 13:20:22 +0300 Subject: [PATCH 0029/1589] Consolidate builtin functions examples (#18142) (#18523) ## Which issue does this PR close? - part of #https://github.com/apache/datafusion/issues/18142. ## Rationale for this change This PR is for consolidating all the `builtin-functions` examples into a single example binary. 
We are agreed on the pattern and we can apply it to the remaining examples ## What changes are included in this PR? ## Are these changes tested? ## Are there any user-facing changes? --------- Co-authored-by: Sergey Zhukov --- datafusion-examples/README.md | 6 +- .../date_time.rs} | 16 +++- .../function_factory.rs | 3 +- .../examples/builtin_functions/main.rs | 94 +++++++++++++++++++ .../{ => builtin_functions}/regexp.rs | 5 +- 5 files changed, 114 insertions(+), 10 deletions(-) rename datafusion-examples/examples/{date_time_functions.rs => builtin_functions/date_time.rs} (97%) rename datafusion-examples/examples/{ => builtin_functions}/function_factory.rs (99%) create mode 100644 datafusion-examples/examples/builtin_functions/main.rs rename datafusion-examples/examples/{ => builtin_functions}/regexp.rs (99%) diff --git a/datafusion-examples/README.md b/datafusion-examples/README.md index f87f62e170af0..1befba6be66fd 100644 --- a/datafusion-examples/README.md +++ b/datafusion-examples/README.md @@ -61,12 +61,13 @@ cargo run --example dataframe - [`custom_file_format.rs`](examples/custom_file_format.rs): Write data to a custom file format - [`dataframe-to-s3.rs`](examples/external_dependency/dataframe-to-s3.rs): Run a query using a DataFrame against a parquet file from s3 and writing back to s3 - [`dataframe.rs`](examples/dataframe.rs): Run a query using a DataFrame API against parquet files, csv files, and in-memory data, including multiple subqueries. Also demonstrates the various methods to write out a DataFrame to a table, parquet file, csv file, and json file. +- [`examples/builtin_functions/date_time`](examples/builtin_functions/date_time.rs): Examples of date-time related functions and queries - [`default_column_values.rs`](examples/default_column_values.rs): Implement custom default value handling for missing columns using field metadata and PhysicalExprAdapter - [`deserialize_to_struct.rs`](examples/deserialize_to_struct.rs): Convert query results (Arrow ArrayRefs) into Rust structs - [`expr_api.rs`](examples/expr_api.rs): Create, execute, simplify, analyze and coerce `Expr`s - [`file_stream_provider.rs`](examples/file_stream_provider.rs): Run a query on `FileStreamProvider` which implements `StreamProvider` for reading and writing to arbitrary stream sources / sinks. - [`flight/sql_server.rs`](examples/flight/sql_server.rs): Run DataFusion as a standalone process and execute SQL queries from Flight and and FlightSQL (e.g. 
JDBC) clients -- [`function_factory.rs`](examples/function_factory.rs): Register `CREATE FUNCTION` handler to implement SQL macros +- [`examples/builtin_functions/function_factory.rs`](examples/builtin_functions/function_factory.rs): Register `CREATE FUNCTION` handler to implement SQL macros - [`memory_pool_tracking.rs`](examples/memory_pool_tracking.rs): Demonstrates TrackConsumersPool for memory tracking and debugging with enhanced error messages - [`memory_pool_execution_plan.rs`](examples/memory_pool_execution_plan.rs): Shows how to implement memory-aware ExecutionPlan with memory reservation and spilling - [`optimizer_rule.rs`](examples/optimizer_rule.rs): Use a custom OptimizerRule to replace certain predicates @@ -81,7 +82,7 @@ cargo run --example dataframe - [`pruning.rs`](examples/pruning.rs): Use pruning to rule out files based on statistics - [`query-aws-s3.rs`](examples/external_dependency/query-aws-s3.rs): Configure `object_store` and run a query against files stored in AWS S3 - [`query-http-csv.rs`](examples/query-http-csv.rs): Configure `object_store` and run a query against files vi HTTP -- [`regexp.rs`](examples/regexp.rs): Examples of using regular expression functions +- [`examples/builtin_functions/regexp.rs`](examples/builtin_functions/regexp.rs): Examples of using regular expression functions - [`remote_catalog.rs`](examples/regexp.rs): Examples of interfacing with a remote catalog (e.g. over a network) - [`examples/udf/simple_udaf.rs`](examples/udf/simple_udaf.rs): Define and invoke a User Defined Aggregate Function (UDAF) - [`examples/udf/simple_udf.rs`](examples/udf/simple_udf.rs): Define and invoke a User Defined Scalar Function (UDF) @@ -91,7 +92,6 @@ cargo run --example dataframe - [`sql_frontend.rs`](examples/sql_frontend.rs): Create LogicalPlans (only) from sql strings - [`sql_dialect.rs`](examples/sql_dialect.rs): Example of implementing a custom SQL dialect on top of `DFParser` - [`sql_query.rs`](examples/memtable.rs): Query data using SQL (in memory `RecordBatches`, local Parquet files) -- [`date_time_function.rs`](examples/date_time_function.rs): Examples of date-time related functions and queries. ## Distributed diff --git a/datafusion-examples/examples/date_time_functions.rs b/datafusion-examples/examples/builtin_functions/date_time.rs similarity index 97% rename from datafusion-examples/examples/date_time_functions.rs rename to datafusion-examples/examples/builtin_functions/date_time.rs index 2628319ae31f0..178cba979cb95 100644 --- a/datafusion-examples/examples/date_time_functions.rs +++ b/datafusion-examples/examples/builtin_functions/date_time.rs @@ -26,8 +26,20 @@ use datafusion::common::assert_contains; use datafusion::error::Result; use datafusion::prelude::*; -#[tokio::main] -async fn main() -> Result<()> { +/// Example: Working with Date and Time Functions +/// +/// This example demonstrates how to work with various date and time +/// functions in DataFusion using both the DataFrame API and SQL queries. +/// +/// It includes: +/// - `make_date`: building `DATE` values from year, month, and day columns +/// - `to_date`: converting string expressions into `DATE` values +/// - `to_timestamp`: parsing strings or numeric values into `TIMESTAMP`s +/// - `to_char`: formatting dates, timestamps, and durations as strings +/// +/// Together, these examples show how to create, convert, and format temporal +/// data using DataFusion’s built-in functions. 
+pub async fn date_time() -> Result<()> { query_make_date().await?; query_to_date().await?; query_to_timestamp().await?; diff --git a/datafusion-examples/examples/function_factory.rs b/datafusion-examples/examples/builtin_functions/function_factory.rs similarity index 99% rename from datafusion-examples/examples/function_factory.rs rename to datafusion-examples/examples/builtin_functions/function_factory.rs index d4312ae594091..5d41e7a260713 100644 --- a/datafusion-examples/examples/function_factory.rs +++ b/datafusion-examples/examples/builtin_functions/function_factory.rs @@ -42,8 +42,7 @@ use std::sync::Arc; /// /// This example is rather simple and does not cover all cases required for a /// real implementation. -#[tokio::main] -async fn main() -> Result<()> { +pub async fn function_factory() -> Result<()> { // First we must configure the SessionContext with our function factory let ctx = SessionContext::new() // register custom function factory diff --git a/datafusion-examples/examples/builtin_functions/main.rs b/datafusion-examples/examples/builtin_functions/main.rs new file mode 100644 index 0000000000000..3399c395bfd62 --- /dev/null +++ b/datafusion-examples/examples/builtin_functions/main.rs @@ -0,0 +1,94 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! # These are miscellaneous function-related examples +//! +//! These examples demonstrate miscellaneous function-related features. +//! +//! Each subcommand runs a corresponding example: +//! - `date_time` — examples of date-time related functions and queries +//! - `function_factory` — register `CREATE FUNCTION` handler to implement SQL macros +//! 
- `regexp` — examples of using regular expression functions + +mod date_time; +mod function_factory; +mod regexp; + +use std::str::FromStr; + +use datafusion::error::{DataFusionError, Result}; + +enum ExampleKind { + DateTime, + FunctionFactory, + Regexp, +} + +impl AsRef for ExampleKind { + fn as_ref(&self) -> &str { + match self { + Self::DateTime => "date_time", + Self::FunctionFactory => "function_factory", + Self::Regexp => "regexp", + } + } +} + +impl FromStr for ExampleKind { + type Err = DataFusionError; + + fn from_str(s: &str) -> Result { + match s { + "date_time" => Ok(Self::DateTime), + "function_factory" => Ok(Self::FunctionFactory), + "regexp" => Ok(Self::Regexp), + _ => Err(DataFusionError::Execution(format!("Unknown example: {s}"))), + } + } +} + +impl ExampleKind { + const ALL: [Self; 3] = [Self::DateTime, Self::FunctionFactory, Self::Regexp]; + + const EXAMPLE_NAME: &str = "builtin_functions"; + + fn variants() -> Vec<&'static str> { + Self::ALL.iter().map(|x| x.as_ref()).collect() + } +} + +#[tokio::main] +async fn main() -> Result<()> { + let usage = format!( + "Usage: cargo run --example {} -- [{}]", + ExampleKind::EXAMPLE_NAME, + ExampleKind::variants().join("|") + ); + + let arg = std::env::args().nth(1).ok_or_else(|| { + eprintln!("{usage}"); + DataFusionError::Execution("Missing argument".to_string()) + })?; + + match arg.parse::()? { + ExampleKind::DateTime => date_time::date_time().await?, + ExampleKind::FunctionFactory => function_factory::function_factory().await?, + ExampleKind::Regexp => regexp::regexp().await?, + } + + Ok(()) +} diff --git a/datafusion-examples/examples/regexp.rs b/datafusion-examples/examples/builtin_functions/regexp.rs similarity index 99% rename from datafusion-examples/examples/regexp.rs rename to datafusion-examples/examples/builtin_functions/regexp.rs index 12d115b9b502c..13c0786930283 100644 --- a/datafusion-examples/examples/regexp.rs +++ b/datafusion-examples/examples/builtin_functions/regexp.rs @@ -28,12 +28,11 @@ use datafusion::prelude::*; /// /// Supported flags can be found at /// https://docs.rs/regex/latest/regex/#grouping-and-flags -#[tokio::main] -async fn main() -> Result<()> { +pub async fn regexp() -> Result<()> { let ctx = SessionContext::new(); ctx.register_csv( "examples", - "../../datafusion/physical-expr/tests/data/regex.csv", + "datafusion/physical-expr/tests/data/regex.csv", CsvReadOptions::new(), ) .await?; From 8f9081483fc57d060d125109d85018ae9ddf46b8 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Sat, 8 Nov 2025 23:55:41 +0800 Subject: [PATCH 0030/1589] refactor: update cmp and nested data in binary operator (#18256) ## Which issue does this PR close? - Related #18210 ## Rationale for this change To keep logic clear in binary operator and make it possible to use binary operators for nested data structures in coming changes. ## What changes are included in this PR? Another housekeeping refactor for binary operators. - Keep the API from datum module consistent by using `Operator` instead of kernel function - Move nested data structure check into cmp operators. This allows us to implement binary operators for `List`, `Struct` and etc. ## Are these changes tested? Unit tests ## Are there any user-facing changes? 
N/A --- datafusion/physical-expr-common/src/datum.rs | 52 ++++++++++++++++--- .../physical-expr/src/expressions/binary.rs | 41 ++++++--------- .../physical-expr/src/expressions/like.rs | 11 ++-- 3 files changed, 66 insertions(+), 38 deletions(-) diff --git a/datafusion/physical-expr-common/src/datum.rs b/datafusion/physical-expr-common/src/datum.rs index 7084bc440e86b..56ef54a1d4509 100644 --- a/datafusion/physical-expr-common/src/datum.rs +++ b/datafusion/physical-expr-common/src/datum.rs @@ -18,7 +18,10 @@ use arrow::array::BooleanArray; use arrow::array::{make_comparator, ArrayRef, Datum}; use arrow::buffer::NullBuffer; -use arrow::compute::SortOptions; +use arrow::compute::kernels::cmp::{ + distinct, eq, gt, gt_eq, lt, lt_eq, neq, not_distinct, +}; +use arrow::compute::{ilike, like, nilike, nlike, SortOptions}; use arrow::error::ArrowError; use datafusion_common::DataFusionError; use datafusion_common::{arrow_datafusion_err, internal_err}; @@ -53,22 +56,49 @@ pub fn apply( } } -/// Applies a binary [`Datum`] comparison kernel `f` to `lhs` and `rhs` +/// Applies a binary [`Datum`] comparison operator `op` to `lhs` and `rhs` pub fn apply_cmp( + op: Operator, lhs: &ColumnarValue, rhs: &ColumnarValue, - f: impl Fn(&dyn Datum, &dyn Datum) -> Result, ) -> Result { - apply(lhs, rhs, |l, r| Ok(Arc::new(f(l, r)?))) + if lhs.data_type().is_nested() { + apply_cmp_for_nested(op, lhs, rhs) + } else { + let f = match op { + Operator::Eq => eq, + Operator::NotEq => neq, + Operator::Lt => lt, + Operator::LtEq => lt_eq, + Operator::Gt => gt, + Operator::GtEq => gt_eq, + Operator::IsDistinctFrom => distinct, + Operator::IsNotDistinctFrom => not_distinct, + + Operator::LikeMatch => like, + Operator::ILikeMatch => ilike, + Operator::NotLikeMatch => nlike, + Operator::NotILikeMatch => nilike, + + _ => { + return internal_err!("Invalid compare operator: {}", op); + } + }; + + apply(lhs, rhs, |l, r| Ok(Arc::new(f(l, r)?))) + } } -/// Applies a binary [`Datum`] comparison kernel `f` to `lhs` and `rhs` for nested type like +/// Applies a binary [`Datum`] comparison operator `op` to `lhs` and `rhs` for nested type like /// List, FixedSizeList, LargeList, Struct, Union, Map, or a dictionary of a nested type pub fn apply_cmp_for_nested( op: Operator, lhs: &ColumnarValue, rhs: &ColumnarValue, ) -> Result { + let left_data_type = lhs.data_type(); + let right_data_type = rhs.data_type(); + if matches!( op, Operator::Eq @@ -79,12 +109,18 @@ pub fn apply_cmp_for_nested( | Operator::GtEq | Operator::IsDistinctFrom | Operator::IsNotDistinctFrom - ) { + ) && left_data_type.equals_datatype(&right_data_type) + { apply(lhs, rhs, |l, r| { Ok(Arc::new(compare_op_for_nested(op, l, r)?)) }) } else { - internal_err!("invalid operator for nested") + internal_err!( + "invalid operator or data type mismatch for nested data, op {} left {}, right {}", + op, + left_data_type, + right_data_type + ) } } @@ -97,7 +133,7 @@ pub fn compare_with_eq( if is_nested { compare_op_for_nested(Operator::Eq, lhs, rhs) } else { - arrow::compute::kernels::cmp::eq(lhs, rhs).map_err(|e| arrow_datafusion_err!(e)) + eq(lhs, rhs).map_err(|e| arrow_datafusion_err!(e)) } } diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index ce3d4ced4e3a2..b09d57f02d582 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -24,11 +24,8 @@ use std::{any::Any, sync::Arc}; use arrow::array::*; use 
arrow::compute::kernels::boolean::{and_kleene, or_kleene}; -use arrow::compute::kernels::cmp::*; use arrow::compute::kernels::concat_elements::concat_elements_utf8; -use arrow::compute::{ - cast, filter_record_batch, ilike, like, nilike, nlike, SlicesIterator, -}; +use arrow::compute::{cast, filter_record_batch, SlicesIterator}; use arrow::datatypes::*; use arrow::error::ArrowError; use datafusion_common::cast::as_boolean_array; @@ -42,7 +39,7 @@ use datafusion_expr::statistics::{ new_generic_from_binary_op, Distribution, }; use datafusion_expr::{ColumnarValue, Operator}; -use datafusion_physical_expr_common::datum::{apply, apply_cmp, apply_cmp_for_nested}; +use datafusion_physical_expr_common::datum::{apply, apply_cmp}; use kernels::{ bitwise_and_dyn, bitwise_and_dyn_scalar, bitwise_or_dyn, bitwise_or_dyn_scalar, @@ -251,13 +248,6 @@ impl PhysicalExpr for BinaryExpr { let schema = batch.schema(); let input_schema = schema.as_ref(); - if left_data_type.is_nested() { - if !left_data_type.equals_datatype(&right_data_type) { - return internal_err!("Cannot evaluate binary expression because of type mismatch: left {}, right {} ", left_data_type, right_data_type); - } - return apply_cmp_for_nested(self.op, &lhs, &rhs); - } - match self.op { Operator::Plus if self.fail_on_overflow => return apply(&lhs, &rhs, add), Operator::Plus => return apply(&lhs, &rhs, add_wrapping), @@ -267,18 +257,21 @@ impl PhysicalExpr for BinaryExpr { Operator::Multiply => return apply(&lhs, &rhs, mul_wrapping), Operator::Divide => return apply(&lhs, &rhs, div), Operator::Modulo => return apply(&lhs, &rhs, rem), - Operator::Eq => return apply_cmp(&lhs, &rhs, eq), - Operator::NotEq => return apply_cmp(&lhs, &rhs, neq), - Operator::Lt => return apply_cmp(&lhs, &rhs, lt), - Operator::Gt => return apply_cmp(&lhs, &rhs, gt), - Operator::LtEq => return apply_cmp(&lhs, &rhs, lt_eq), - Operator::GtEq => return apply_cmp(&lhs, &rhs, gt_eq), - Operator::IsDistinctFrom => return apply_cmp(&lhs, &rhs, distinct), - Operator::IsNotDistinctFrom => return apply_cmp(&lhs, &rhs, not_distinct), - Operator::LikeMatch => return apply_cmp(&lhs, &rhs, like), - Operator::ILikeMatch => return apply_cmp(&lhs, &rhs, ilike), - Operator::NotLikeMatch => return apply_cmp(&lhs, &rhs, nlike), - Operator::NotILikeMatch => return apply_cmp(&lhs, &rhs, nilike), + + Operator::Eq + | Operator::NotEq + | Operator::Lt + | Operator::Gt + | Operator::LtEq + | Operator::GtEq + | Operator::IsDistinctFrom + | Operator::IsNotDistinctFrom + | Operator::LikeMatch + | Operator::ILikeMatch + | Operator::NotLikeMatch + | Operator::NotILikeMatch => { + return apply_cmp(self.op, &lhs, &rhs); + } _ => {} } diff --git a/datafusion/physical-expr/src/expressions/like.rs b/datafusion/physical-expr/src/expressions/like.rs index e86c778d51619..1c9ae530f500d 100644 --- a/datafusion/physical-expr/src/expressions/like.rs +++ b/datafusion/physical-expr/src/expressions/like.rs @@ -19,7 +19,7 @@ use crate::PhysicalExpr; use arrow::datatypes::{DataType, Schema}; use arrow::record_batch::RecordBatch; use datafusion_common::{internal_err, Result}; -use datafusion_expr::ColumnarValue; +use datafusion_expr::{ColumnarValue, Operator}; use datafusion_physical_expr_common::datum::apply_cmp; use std::hash::Hash; use std::{any::Any, sync::Arc}; @@ -118,14 +118,13 @@ impl PhysicalExpr for LikeExpr { } fn evaluate(&self, batch: &RecordBatch) -> Result { - use arrow::compute::*; let lhs = self.expr.evaluate(batch)?; let rhs = self.pattern.evaluate(batch)?; match (self.negated, 
self.case_insensitive) { - (false, false) => apply_cmp(&lhs, &rhs, like), - (false, true) => apply_cmp(&lhs, &rhs, ilike), - (true, false) => apply_cmp(&lhs, &rhs, nlike), - (true, true) => apply_cmp(&lhs, &rhs, nilike), + (false, false) => apply_cmp(Operator::LikeMatch, &lhs, &rhs), + (false, true) => apply_cmp(Operator::ILikeMatch, &lhs, &rhs), + (true, false) => apply_cmp(Operator::NotLikeMatch, &lhs, &rhs), + (true, true) => apply_cmp(Operator::NotILikeMatch, &lhs, &rhs), } } From c1965b66260e533a5d6b79a7399db3e68ab37383 Mon Sep 17 00:00:00 2001 From: Randy <155058195@qq.com> Date: Sun, 9 Nov 2025 07:59:44 +0800 Subject: [PATCH 0031/1589] Fix: topk_aggregate benchmark failing (#18502) ## Which issue does this PR close? - Closes #18431 ## Rationale for this change -The trace_id in the result is depended on a random number. I think it's better to remove it from the sql to get a stable result ## What changes are included in this PR? Remove the trace_id from the sql and the assert result ## Are these changes tested? N/A ## Are there any user-facing changes? No --- datafusion/core/benches/topk_aggregate.rs | 30 +++++++++++------------ 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/datafusion/core/benches/topk_aggregate.rs b/datafusion/core/benches/topk_aggregate.rs index 9a5fb7163be5c..7971293c9ce21 100644 --- a/datafusion/core/benches/topk_aggregate.rs +++ b/datafusion/core/benches/topk_aggregate.rs @@ -46,7 +46,7 @@ async fn create_context( opts.optimizer.enable_topk_aggregation = use_topk; let ctx = SessionContext::new_with_config(cfg); let _ = ctx.register_table("traces", mem_table)?; - let sql = format!("select trace_id, max(timestamp_ms) from traces group by trace_id order by max(timestamp_ms) desc limit {limit};"); + let sql = format!("select max(timestamp_ms) from traces group by trace_id order by max(timestamp_ms) desc limit {limit};"); let df = ctx.sql(sql.as_str()).await?; let physical_plan = df.create_physical_plan().await?; let actual_phys_plan = displayable(physical_plan.as_ref()).indent(true).to_string(); @@ -75,20 +75,20 @@ async fn aggregate( let actual = format!("{}", pretty_format_batches(&batches)?).to_lowercase(); let expected_asc = r#" -+----------------------------------+--------------------------+ -| trace_id | max(traces.timestamp_ms) | -+----------------------------------+--------------------------+ -| 5868861a23ed31355efc5200eb80fe74 | 16909009999999 | -| 4040e64656804c3d77320d7a0e7eb1f0 | 16909009999998 | -| 02801bbe533190a9f8713d75222f445d | 16909009999997 | -| 9e31b3b5a620de32b68fefa5aeea57f1 | 16909009999996 | -| 2d88a860e9bd1cfaa632d8e7caeaa934 | 16909009999995 | -| a47edcef8364ab6f191dd9103e51c171 | 16909009999994 | -| 36a3fa2ccfbf8e00337f0b1254384db6 | 16909009999993 | -| 0756be84f57369012e10de18b57d8a2f | 16909009999992 | -| d4d6bf9845fa5897710e3a8db81d5907 | 16909009999991 | -| 3c2cc1abe728a66b61e14880b53482a0 | 16909009999990 | -+----------------------------------+--------------------------+ ++--------------------------+ +| max(traces.timestamp_ms) | ++--------------------------+ +| 16909009999999 | +| 16909009999998 | +| 16909009999997 | +| 16909009999996 | +| 16909009999995 | +| 16909009999994 | +| 16909009999993 | +| 16909009999992 | +| 16909009999991 | +| 16909009999990 | ++--------------------------+ "# .trim(); if asc { From c728d54819ef33c9a3c5f0279cdcc4dd3d8b8661 Mon Sep 17 00:00:00 2001 From: Yongting You <2010youy01@gmail.com> Date: Sun, 9 Nov 2025 11:15:56 +0800 Subject: [PATCH 0032/1589] refactor: Add `assert_or_internal_err!` 
macro for more ergonomic internal invariant checks (#18511) ## Which issue does this PR close? - Closes https://github.com/apache/datafusion/issues/15492 ## Rationale for this change See issue for the rationale and example. This PR introduces the following macros to make invariant checks and throwing internal errors easier, and also let the error message include more assertion details if it failed (what's the expected/actual value), to make debugging easier. - `assert_or_internal_err!()` - `assert_eq_or_internal_err!()` - `assert_ne_or_internal_err!()` ```rust // before if field.name() != expected.name() { return internal_err!( "Field name mismatch at index {}: expected '{}', found '{}'", idx, expected.name(), field.name() ); } // after assert_eq_or_internal_err!( field.name(), expected.name(), "Field name mismatch at index {}", idx ); ``` If the assertion fails, the error now reads: ``` Internal error: Assertion failed: field.name() == expected.name() (left: "foo", right: "bar"): Field name mismatch at index 3. ``` ## What changes are included in this PR? 1. Add macros and UTs to test 2. Updated a few internal error patterns that are applicable for this macro ## Are these changes tested? UTs ## Are there any user-facing changes? No --------- Co-authored-by: Alex Huang --- datafusion/common/src/error.rs | 219 ++++++++++++++++++++++++ datafusion/core/src/physical_planner.rs | 35 ++-- 2 files changed, 237 insertions(+), 17 deletions(-) diff --git a/datafusion/common/src/error.rs b/datafusion/common/src/error.rs index fde52944d0497..4fa6d28e73245 100644 --- a/datafusion/common/src/error.rs +++ b/datafusion/common/src/error.rs @@ -758,6 +758,116 @@ macro_rules! unwrap_or_internal_err { }; } +/// Assert a condition, returning `DataFusionError::Internal` on failure. +/// +/// # Examples +/// +/// ```text +/// assert_or_internal_err!(predicate); +/// assert_or_internal_err!(predicate, "human readable message"); +/// assert_or_internal_err!(predicate, format!("details: {}", value)); +/// ``` +#[macro_export] +macro_rules! assert_or_internal_err { + ($cond:expr) => { + if !$cond { + return Err(DataFusionError::Internal(format!( + "Assertion failed: {}", + stringify!($cond) + ))); + } + }; + ($cond:expr, $($arg:tt)+) => { + if !$cond { + return Err(DataFusionError::Internal(format!( + "Assertion failed: {}: {}", + stringify!($cond), + format!($($arg)+) + ))); + } + }; +} + +/// Assert equality, returning `DataFusionError::Internal` on failure. +/// +/// # Examples +/// +/// ```text +/// assert_eq_or_internal_err!(actual, expected); +/// assert_eq_or_internal_err!(left_expr, right_expr, "values must match"); +/// assert_eq_or_internal_err!(lhs, rhs, "metadata: {}", extra); +/// ``` +#[macro_export] +macro_rules! assert_eq_or_internal_err { + ($left:expr, $right:expr $(,)?) => {{ + let left_val = &$left; + let right_val = &$right; + if left_val != right_val { + return Err(DataFusionError::Internal(format!( + "Assertion failed: {} == {} (left: {:?}, right: {:?})", + stringify!($left), + stringify!($right), + left_val, + right_val + ))); + } + }}; + ($left:expr, $right:expr, $($arg:tt)+) => {{ + let left_val = &$left; + let right_val = &$right; + if left_val != right_val { + return Err(DataFusionError::Internal(format!( + "Assertion failed: {} == {} (left: {:?}, right: {:?}): {}", + stringify!($left), + stringify!($right), + left_val, + right_val, + format!($($arg)+) + ))); + } + }}; +} + +/// Assert inequality, returning `DataFusionError::Internal` on failure. 
+/// +/// # Examples +/// +/// ```text +/// assert_ne_or_internal_err!(left, right); +/// assert_ne_or_internal_err!(lhs_expr, rhs_expr, "values must differ"); +/// assert_ne_or_internal_err!(a, b, "context {}", info); +/// ``` +#[macro_export] +macro_rules! assert_ne_or_internal_err { + ($left:expr, $right:expr $(,)?) => {{ + let left_val = &$left; + let right_val = &$right; + if left_val == right_val { + return Err(DataFusionError::Internal(format!( + "Assertion failed: {} != {} (left: {:?}, right: {:?})", + stringify!($left), + stringify!($right), + left_val, + right_val + ))); + } + }}; + ($left:expr, $right:expr, $($arg:tt)+) => {{ + let left_val = &$left; + let right_val = &$right; + if left_val == right_val { + return Err(DataFusionError::Internal(format!( + "Assertion failed: {} != {} (left: {:?}, right: {:?}): {}", + stringify!($left), + stringify!($right), + left_val, + right_val, + format!($($arg)+) + ))); + } + }}; +} + /// Add a macros for concise DataFusionError::* errors declaration /// supports placeholders the same way as `format!` /// Examples: @@ -974,6 +1084,115 @@ mod test { use std::sync::Arc; use arrow::error::ArrowError; + use insta::assert_snapshot; + + fn ok_result() -> Result<()> { + Ok(()) + } + + #[test] + fn test_assert_eq_or_internal_err_passes() -> Result<()> { + assert_eq_or_internal_err!(1, 1); + ok_result() + } + + #[test] + fn test_assert_eq_or_internal_err_fails() { + fn check() -> Result<()> { + assert_eq_or_internal_err!(1, 2, "expected equality"); + ok_result() + } + + let err = check().unwrap_err(); + assert_snapshot!( + err.to_string(), + @r" + Internal error: Assertion failed: 1 == 2 (left: 1, right: 2): expected equality. + This issue was likely caused by a bug in DataFusion's code. Please help us to resolve this by filing a bug report in our issue tracker: https://github.com/apache/datafusion/issues + " + ); + } + + #[test] + fn test_assert_ne_or_internal_err_passes() -> Result<()> { + assert_ne_or_internal_err!(1, 2); + ok_result() + } + + #[test] + fn test_assert_ne_or_internal_err_fails() { + fn check() -> Result<()> { + assert_ne_or_internal_err!(3, 3, "values must differ"); + ok_result() + } + + let err = check().unwrap_err(); + assert_snapshot!( + err.to_string(), + @r" + Internal error: Assertion failed: 3 != 3 (left: 3, right: 3): values must differ. + This issue was likely caused by a bug in DataFusion's code. Please help us to resolve this by filing a bug report in our issue tracker: https://github.com/apache/datafusion/issues + " + ); + } + + #[test] + fn test_assert_or_internal_err_passes() -> Result<()> { + assert_or_internal_err!(true); + assert_or_internal_err!(true, "message"); + ok_result() + } + + #[test] + fn test_assert_or_internal_err_fails_default() { + fn check() -> Result<()> { + assert_or_internal_err!(false); + ok_result() + } + + let err = check().unwrap_err(); + assert_snapshot!( + err.to_string(), + @r" + Internal error: Assertion failed: false. + This issue was likely caused by a bug in DataFusion's code. Please help us to resolve this by filing a bug report in our issue tracker: https://github.com/apache/datafusion/issues + " + ); + } + + #[test] + fn test_assert_or_internal_err_fails_with_message() { + fn check() -> Result<()> { + assert_or_internal_err!(false, "custom message"); + ok_result() + } + + let err = check().unwrap_err(); + assert_snapshot!( + err.to_string(), + @r" + Internal error: Assertion failed: false: custom message. + This issue was likely caused by a bug in DataFusion's code. 
Please help us to resolve this by filing a bug report in our issue tracker: https://github.com/apache/datafusion/issues + " + ); + } + + #[test] + fn test_assert_or_internal_err_with_format_arguments() { + fn check() -> Result<()> { + assert_or_internal_err!(false, "custom {}", 42); + ok_result() + } + + let err = check().unwrap_err(); + assert_snapshot!( + err.to_string(), + @r" + Internal error: Assertion failed: false: custom 42. + This issue was likely caused by a bug in DataFusion's code. Please help us to resolve this by filing a bug report in our issue tracker: https://github.com/apache/datafusion/issues + " + ); + } #[test] fn test_error_size() { diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index c280b50a9f07a..6a75485c62849 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -64,7 +64,9 @@ use datafusion_catalog::ScanArgs; use datafusion_common::display::ToStringifiedPlan; use datafusion_common::format::ExplainAnalyzeLevel; use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion, TreeNodeVisitor}; -use datafusion_common::TableReference; +use datafusion_common::{ + assert_eq_or_internal_err, assert_or_internal_err, TableReference, +}; use datafusion_common::{ exec_err, internal_datafusion_err, internal_err, not_impl_err, plan_err, DFSchema, ScalarValue, @@ -347,11 +349,11 @@ impl DefaultPhysicalPlanner { .flatten() .collect::>(); // Ideally this never happens if we have a valid LogicalPlan tree - if outputs.len() != 1 { - return internal_err!( - "Failed to convert LogicalPlan to ExecutionPlan: More than one root detected" - ); - } + assert_eq_or_internal_err!( + outputs.len(), + 1, + "Failed to convert LogicalPlan to ExecutionPlan: More than one root detected" + ); let plan = outputs.pop().unwrap(); Ok(plan) } @@ -588,9 +590,10 @@ impl DefaultPhysicalPlanner { } } LogicalPlan::Window(Window { window_expr, .. }) => { - if window_expr.is_empty() { - return internal_err!("Impossibly got empty window expression"); - } + assert_or_internal_err!( + !window_expr.is_empty(), + "Impossibly got empty window expression" + ); let input_exec = children.one()?; @@ -1764,14 +1767,12 @@ fn qualify_join_schema_sides( .zip(left_fields.iter().chain(right_fields.iter())) .enumerate() { - if field.name() != expected.name() { - return internal_err!( - "Field name mismatch at index {}: expected '{}', found '{}'", - i, - expected.name(), - field.name() - ); - } + assert_eq_or_internal_err!( + field.name(), + expected.name(), + "Field name mismatch at index {}", + i + ); } // qualify sides From 8e3f157cead22773c7ec96c04ae39cd58fb172d9 Mon Sep 17 00:00:00 2001 From: Dhanush Date: Sun, 9 Nov 2025 11:38:27 +0530 Subject: [PATCH 0033/1589] chore: enforce clippy lint needless_pass_by_value to datafusion-physical-optimizer (#18555) ## Which issue does this PR close? - Closes #18547. ## What changes are included in this PR? enforce clippy lint `needless_pass_by_value` to `datafusion-physical-optimizer` ## Are these changes tested? yes ## Are there any user-facing changes? 
no --- .../src/enforce_sorting/sort_pushdown.rs | 4 ++-- datafusion/physical-optimizer/src/filter_pushdown.rs | 11 ++++++----- datafusion/physical-optimizer/src/lib.rs | 3 +++ .../physical-optimizer/src/projection_pushdown.rs | 4 ++-- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/datafusion/physical-optimizer/src/enforce_sorting/sort_pushdown.rs b/datafusion/physical-optimizer/src/enforce_sorting/sort_pushdown.rs index 6e4e784866129..7c02b901169a7 100644 --- a/datafusion/physical-optimizer/src/enforce_sorting/sort_pushdown.rs +++ b/datafusion/physical-optimizer/src/enforce_sorting/sort_pushdown.rs @@ -383,7 +383,7 @@ fn pushdown_requirement_to_children( } else if let Some(hash_join) = plan.as_any().downcast_ref::() { handle_hash_join(hash_join, parent_required) } else { - handle_custom_pushdown(plan, parent_required, maintains_input_order) + handle_custom_pushdown(plan, parent_required, &maintains_input_order) } // TODO: Add support for Projection push down } @@ -604,7 +604,7 @@ fn expr_source_side( fn handle_custom_pushdown( plan: &Arc, parent_required: OrderingRequirements, - maintains_input_order: Vec, + maintains_input_order: &[bool], ) -> Result>>> { // If the plan has no children, return early: if plan.children().is_empty() { diff --git a/datafusion/physical-optimizer/src/filter_pushdown.rs b/datafusion/physical-optimizer/src/filter_pushdown.rs index 5ee7023ff6ee2..df44225159e3a 100644 --- a/datafusion/physical-optimizer/src/filter_pushdown.rs +++ b/datafusion/physical-optimizer/src/filter_pushdown.rs @@ -422,7 +422,7 @@ impl PhysicalOptimizerRule for FilterPushdown { config: &ConfigOptions, ) -> Result> { Ok( - push_down_filters(Arc::clone(&plan), vec![], config, self.phase)? + push_down_filters(&Arc::clone(&plan), vec![], config, self.phase)? .updated_node .unwrap_or(plan), ) @@ -438,7 +438,7 @@ impl PhysicalOptimizerRule for FilterPushdown { } fn push_down_filters( - node: Arc, + node: &Arc, parent_predicates: Vec>, config: &ConfigOptions, phase: FilterPushdownPhase, @@ -510,7 +510,8 @@ fn push_down_filters( let num_parent_filters = all_predicates.len() - num_self_filters; // Any filters that could not be pushed down to a child are marked as not-supported to our parents - let result = push_down_filters(Arc::clone(child), all_predicates, config, phase)?; + let result = + push_down_filters(&Arc::clone(child), all_predicates, config, phase)?; if let Some(new_child) = result.updated_node { // If we have a filter pushdown result, we need to update our children @@ -571,7 +572,7 @@ fn push_down_filters( } // Re-create this node with new children - let updated_node = with_new_children_if_necessary(Arc::clone(&node), new_children)?; + let updated_node = with_new_children_if_necessary(Arc::clone(node), new_children)?; // TODO: by calling `handle_child_pushdown_result` we are assuming that the // `ExecutionPlan` implementation will not change the plan itself. @@ -596,7 +597,7 @@ fn push_down_filters( )?; // Compare pointers for new_node and node, if they are different we must replace // ourselves because of changes in our children. 
- if res.updated_node.is_none() && !Arc::ptr_eq(&updated_node, &node) { + if res.updated_node.is_none() && !Arc::ptr_eq(&updated_node, node) { res.updated_node = Some(updated_node) } Ok(res) diff --git a/datafusion/physical-optimizer/src/lib.rs b/datafusion/physical-optimizer/src/lib.rs index 79db43c1cbe94..d238a4264ff02 100644 --- a/datafusion/physical-optimizer/src/lib.rs +++ b/datafusion/physical-optimizer/src/lib.rs @@ -23,6 +23,9 @@ // Make sure fast / cheap clones on Arc are explicit: // https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] +// https://github.com/apache/datafusion/issues/18503 +#![deny(clippy::needless_pass_by_value)] +#![cfg_attr(test, allow(clippy::needless_pass_by_value))] pub mod aggregate_statistics; pub mod coalesce_async_exec_input; diff --git a/datafusion/physical-optimizer/src/projection_pushdown.rs b/datafusion/physical-optimizer/src/projection_pushdown.rs index 987e3cb6f713e..b5e002b51f921 100644 --- a/datafusion/physical-optimizer/src/projection_pushdown.rs +++ b/datafusion/physical-optimizer/src/projection_pushdown.rs @@ -129,7 +129,7 @@ fn try_push_down_join_filter( let join_filter = minimize_join_filter( Arc::clone(rhs_rewrite.data.1.expression()), - rhs_rewrite.data.1.column_indices().to_vec(), + rhs_rewrite.data.1.column_indices(), lhs_rewrite.data.0.schema().as_ref(), rhs_rewrite.data.0.schema().as_ref(), ); @@ -238,7 +238,7 @@ fn try_push_down_projection( /// columns are not needed anymore. fn minimize_join_filter( expr: Arc, - old_column_indices: Vec, + old_column_indices: &[ColumnIndex], lhs_schema: &Schema, rhs_schema: &Schema, ) -> JoinFilter { From 2ae3818f35143459541ca588aedd4b16c86088a1 Mon Sep 17 00:00:00 2001 From: Dhanush Date: Sun, 9 Nov 2025 11:40:03 +0530 Subject: [PATCH 0034/1589] chore: enforce clippy lint needless_pass_by_value for datafusion-sql (#18554) ## Which issue does this PR close? - Closes #18546. ## Rationale for this change enforce clippy lint `needless_pass_by_value` ## Are these changes tested? yes ## Are there any user-facing changes? no --- datafusion/sql/src/cte.rs | 12 ++++----- datafusion/sql/src/expr/binary_op.rs | 4 +-- datafusion/sql/src/expr/mod.rs | 16 +++++++----- datafusion/sql/src/expr/subquery.rs | 8 +++--- datafusion/sql/src/lib.rs | 3 +++ datafusion/sql/src/parser.rs | 38 ++++++++++++++-------------- datafusion/sql/src/statement.rs | 12 ++++----- 7 files changed, 49 insertions(+), 44 deletions(-) diff --git a/datafusion/sql/src/cte.rs b/datafusion/sql/src/cte.rs index aceec676761cb..8ccab9dd9a0b6 100644 --- a/datafusion/sql/src/cte.rs +++ b/datafusion/sql/src/cte.rs @@ -46,7 +46,7 @@ impl SqlToRel<'_, S> { // Create a logical plan for the CTE let cte_plan = if is_recursive { - self.recursive_cte(cte_name.clone(), *cte.query, planner_context)? + self.recursive_cte(&cte_name, *cte.query, planner_context)? } else { self.non_recursive_cte(*cte.query, planner_context)? 
}; @@ -70,7 +70,7 @@ impl SqlToRel<'_, S> { fn recursive_cte( &self, - cte_name: String, + cte_name: &str, mut cte_query: Query, planner_context: &mut PlannerContext, ) -> Result { @@ -136,7 +136,7 @@ impl SqlToRel<'_, S> { // Step 2.1: Create a table source for the temporary relation let work_table_source = self .context_provider - .create_cte_work_table(&cte_name, Arc::clone(static_plan.schema().inner()))?; + .create_cte_work_table(cte_name, Arc::clone(static_plan.schema().inner()))?; // Step 2.2: Create a temporary relation logical plan that will be used // as the input to the recursive term @@ -147,14 +147,14 @@ impl SqlToRel<'_, S> { )? .build()?; - let name = cte_name.clone(); + let name = cte_name.to_string(); // Step 2.3: Register the temporary relation in the planning context // For all the self references in the variadic term, we'll replace it // with the temporary relation we created above by temporarily registering // it as a CTE. This temporary relation in the planning context will be // replaced by the actual CTE plan once we're done with the planning. - planner_context.insert_cte(cte_name.clone(), work_table_plan); + planner_context.insert_cte(cte_name.to_string(), work_table_plan); // ---------- Step 3: Compile the recursive term ------------------ // this uses the named_relation we inserted above to resolve the @@ -166,7 +166,7 @@ impl SqlToRel<'_, S> { // if not, it is a non-recursive CTE if !has_work_table_reference(&recursive_plan, &work_table_source) { // Remove the work table plan from the context - planner_context.remove_cte(&cte_name); + planner_context.remove_cte(cte_name); // Compile it as a non-recursive CTE return self.set_operation_to_plan( SetOperator::Union, diff --git a/datafusion/sql/src/expr/binary_op.rs b/datafusion/sql/src/expr/binary_op.rs index 1c06f5ee926f9..f0ca54161782d 100644 --- a/datafusion/sql/src/expr/binary_op.rs +++ b/datafusion/sql/src/expr/binary_op.rs @@ -21,8 +21,8 @@ use datafusion_expr::Operator; use sqlparser::ast::BinaryOperator; impl SqlToRel<'_, S> { - pub(crate) fn parse_sql_binary_op(&self, op: BinaryOperator) -> Result { - match op { + pub(crate) fn parse_sql_binary_op(&self, op: &BinaryOperator) -> Result { + match *op { BinaryOperator::Gt => Ok(Operator::Gt), BinaryOperator::GtEq => Ok(Operator::GtEq), BinaryOperator::Lt => Ok(Operator::Lt), diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs index 715a02db8b027..9725025d599fe 100644 --- a/datafusion/sql/src/expr/mod.rs +++ b/datafusion/sql/src/expr/mod.rs @@ -140,7 +140,7 @@ impl SqlToRel<'_, S> { let RawBinaryExpr { op, left, right } = binary_expr; Ok(Expr::BinaryExpr(BinaryExpr::new( Box::new(left), - self.parse_sql_binary_op(op)?, + self.parse_sql_binary_op(&op)?, Box::new(right), ))) } @@ -270,7 +270,9 @@ impl SqlToRel<'_, S> { expr, data_type, format, - } => self.sql_cast_to_expr(*expr, data_type, format, schema, planner_context), + } => { + self.sql_cast_to_expr(*expr, &data_type, format, schema, planner_context) + } SQLExpr::Cast { kind: CastKind::TryCast | CastKind::SafeCast, @@ -553,7 +555,7 @@ impl SqlToRel<'_, S> { } SQLExpr::Struct { values, fields } => { - self.parse_struct(schema, planner_context, values, fields) + self.parse_struct(schema, planner_context, values, &fields) } SQLExpr::Position { expr, r#in } => { self.sql_position_to_expr(*expr, *r#in, schema, planner_context) @@ -639,7 +641,7 @@ impl SqlToRel<'_, S> { schema: &DFSchema, planner_context: &mut PlannerContext, values: Vec, - fields: Vec, + fields: &[StructField], ) -> 
Result { if !fields.is_empty() { return not_impl_err!("Struct fields are not supported yet"); @@ -673,7 +675,7 @@ impl SqlToRel<'_, S> { Some(SQLExpr::Identifier(_)) | Some(SQLExpr::Value(_)) | Some(SQLExpr::CompoundIdentifier(_)) => { - self.parse_struct(schema, planner_context, values, vec![]) + self.parse_struct(schema, planner_context, values, &[]) } None => not_impl_err!("Empty tuple not supported yet"), _ => { @@ -979,7 +981,7 @@ impl SqlToRel<'_, S> { fn sql_cast_to_expr( &self, expr: SQLExpr, - data_type: SQLDataType, + data_type: &SQLDataType, format: Option, schema: &DFSchema, planner_context: &mut PlannerContext, @@ -988,7 +990,7 @@ impl SqlToRel<'_, S> { return not_impl_err!("CAST with format is not supported: {format}"); } - let dt = self.convert_data_type_to_field(&data_type)?; + let dt = self.convert_data_type_to_field(data_type)?; let expr = self.sql_expr_to_logical_expr(expr, schema, planner_context)?; // numeric constants are treated as seconds (rather as nanoseconds) diff --git a/datafusion/sql/src/expr/subquery.rs b/datafusion/sql/src/expr/subquery.rs index 24bb813634cc1..4bca6f7e49ba0 100644 --- a/datafusion/sql/src/expr/subquery.rs +++ b/datafusion/sql/src/expr/subquery.rs @@ -74,7 +74,7 @@ impl SqlToRel<'_, S> { self.validate_single_column( &sub_plan, - spans.clone(), + &spans, "Too many columns! The subquery should only return one column", "Select only one column in the subquery", )?; @@ -116,7 +116,7 @@ impl SqlToRel<'_, S> { self.validate_single_column( &sub_plan, - spans.clone(), + &spans, "Too many columns! The subquery should only return one column", "Select only one column in the subquery", )?; @@ -131,7 +131,7 @@ impl SqlToRel<'_, S> { fn validate_single_column( &self, sub_plan: &LogicalPlan, - spans: Spans, + spans: &Spans, error_message: &str, help_message: &str, ) -> Result<()> { @@ -148,7 +148,7 @@ impl SqlToRel<'_, S> { fn build_multi_column_diagnostic( &self, - spans: Spans, + spans: &Spans, error_message: &str, help_message: &str, ) -> Diagnostic { diff --git a/datafusion/sql/src/lib.rs b/datafusion/sql/src/lib.rs index da15b90d22a84..9f8105e9a85b1 100644 --- a/datafusion/sql/src/lib.rs +++ b/datafusion/sql/src/lib.rs @@ -23,6 +23,9 @@ // Make sure fast / cheap clones on Arc are explicit: // https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] +// https://github.com/apache/datafusion/issues/18503 +#![deny(clippy::needless_pass_by_value)] +#![cfg_attr(test, allow(clippy::needless_pass_by_value))] //! This crate provides: //! 
diff --git a/datafusion/sql/src/parser.rs b/datafusion/sql/src/parser.rs index 99d7467e1b7ca..05dd878907634 100644 --- a/datafusion/sql/src/parser.rs +++ b/datafusion/sql/src/parser.rs @@ -456,7 +456,7 @@ impl<'a> DFParser<'a> { break; } if expecting_statement_delimiter { - return self.expected("end of statement", self.parser.peek_token()); + return self.expected("end of statement", &self.parser.peek_token()); } let statement = self.parse_statement()?; @@ -470,7 +470,7 @@ impl<'a> DFParser<'a> { fn expected( &self, expected: &str, - found: TokenWithSpan, + found: &TokenWithSpan, ) -> Result { let sql_parser_span = found.span; let span = Span::try_from_sqlparser_span(sql_parser_span); @@ -488,11 +488,11 @@ impl<'a> DFParser<'a> { fn expect_token( &mut self, expected: &str, - token: Token, + token: &Token, ) -> Result<(), DataFusionError> { let next_token = self.parser.peek_token_ref(); - if next_token.token != token { - self.expected(expected, next_token.clone()) + if next_token.token != *token { + self.expected(expected, next_token) } else { Ok(()) } @@ -553,7 +553,7 @@ impl<'a> DFParser<'a> { /// contains any trailing, unparsed tokens. pub fn parse_into_expr(&mut self) -> Result { let expr = self.parse_expr()?; - self.expect_token("end of expression", Token::EOF)?; + self.expect_token("end of expression", &Token::EOF)?; Ok(expr) } @@ -638,7 +638,7 @@ impl<'a> DFParser<'a> { if token == Token::EOF || token == Token::SemiColon { break; } else { - return self.expected("end of statement or ;", token)?; + return self.expected("end of statement or ;", &token)?; } } } @@ -675,7 +675,7 @@ impl<'a> DFParser<'a> { // Unquoted namespaced keys have to conform to the syntax // "[\.]*". If we have a key that breaks this // pattern, error out: - return self.expected("key name", next_token); + return self.expected("key name", &next_token); } } Ok(parts.join(".")) @@ -683,7 +683,7 @@ impl<'a> DFParser<'a> { Token::SingleQuotedString(s) => Ok(s), Token::DoubleQuotedString(s) => Ok(s), Token::EscapedStringLiteral(s) => Ok(s), - _ => self.expected("key name", next_token), + _ => self.expected("key name", &next_token), } } @@ -702,7 +702,7 @@ impl<'a> DFParser<'a> { Token::DoubleQuotedString(s) => Ok(Value::DoubleQuotedString(s)), Token::EscapedStringLiteral(s) => Ok(Value::EscapedStringLiteral(s)), Token::Number(n, l) => Ok(Value::Number(n, l)), - _ => self.expected("string or numeric value", next_token), + _ => self.expected("string or numeric value", &next_token), } } @@ -732,7 +732,7 @@ impl<'a> DFParser<'a> { Token::Word(w) => Ok(w.value), Token::SingleQuotedString(w) => Ok(w), Token::DoubleQuotedString(w) => Ok(w), - _ => self.expected("an explain format such as TREE", next_token), + _ => self.expected("an explain format such as TREE", &next_token), }?; Ok(Some(format)) } @@ -777,7 +777,7 @@ impl<'a> DFParser<'a> { let identifier = self.parser.parse_identifier()?; partitions.push(identifier.to_string()); } else { - return self.expected("partition name", self.parser.peek_token()); + return self.expected("partition name", &self.parser.peek_token()); } let comma = self.parser.consume_token(&Token::Comma); if self.parser.consume_token(&Token::RParen) { @@ -786,7 +786,7 @@ impl<'a> DFParser<'a> { } else if !comma { return self.expected( "',' or ')' after partition definition", - self.parser.peek_token(), + &self.parser.peek_token(), ); } } @@ -857,7 +857,7 @@ impl<'a> DFParser<'a> { } else { return self.expected( "column name or constraint definition", - self.parser.peek_token(), + &self.parser.peek_token(), 
); } let comma = self.parser.consume_token(&Token::Comma); @@ -867,7 +867,7 @@ impl<'a> DFParser<'a> { } else if !comma { return self.expected( "',' or ')' after column definition", - self.parser.peek_token(), + &self.parser.peek_token(), ); } } @@ -887,7 +887,7 @@ impl<'a> DFParser<'a> { } else { return self.expected( "constraint details after CONSTRAINT ", - self.parser.peek_token(), + &self.parser.peek_token(), ); } } else if let Some(option) = self.parser.parse_optional_column_option()? { @@ -1012,7 +1012,7 @@ impl<'a> DFParser<'a> { if token == Token::EOF || token == Token::SemiColon { break; } else { - return self.expected("end of statement or ;", token)?; + return self.expected("end of statement or ;", &token)?; } } } @@ -1051,7 +1051,7 @@ impl<'a> DFParser<'a> { let token = self.parser.next_token(); match &token.token { Token::Word(w) => parse_file_type(&w.value), - _ => self.expected("one of ARROW, PARQUET, NDJSON, or CSV", token), + _ => self.expected("one of ARROW, PARQUET, NDJSON, or CSV", &token), } } @@ -1074,7 +1074,7 @@ impl<'a> DFParser<'a> { } else if !comma { return self.expected( "',' or ')' after option definition", - self.parser.peek_token(), + &self.parser.peek_token(), ); } } diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs index 81381bf49fc5b..d09923690f868 100644 --- a/datafusion/sql/src/statement.rs +++ b/datafusion/sql/src/statement.rs @@ -1037,7 +1037,7 @@ impl SqlToRel<'_, S> { if limit.is_some() { return not_impl_err!("Update-limit clause not supported")?; } - self.update_to_plan(table, assignments, update_from, selection) + self.update_to_plan(table, &assignments, update_from, selection) } Statement::Delete(Delete { @@ -1070,7 +1070,7 @@ impl SqlToRel<'_, S> { } let table_name = self.get_delete_target(from)?; - self.delete_to_plan(table_name, selection) + self.delete_to_plan(&table_name, selection) } Statement::StartTransaction { @@ -1100,7 +1100,7 @@ impl SqlToRel<'_, S> { if has_end_keyword { return not_impl_err!("Transaction with END keyword not supported"); } - self.validate_transaction_kind(transaction)?; + self.validate_transaction_kind(transaction.as_ref())?; let isolation_level: ast::TransactionIsolationLevel = modes .iter() .filter_map(|m: &TransactionMode| match m { @@ -1903,7 +1903,7 @@ impl SqlToRel<'_, S> { fn delete_to_plan( &self, - table_name: ObjectName, + table_name: &ObjectName, predicate_expr: Option, ) -> Result { // Do a table lookup to verify the table exists @@ -1947,7 +1947,7 @@ impl SqlToRel<'_, S> { fn update_to_plan( &self, table: TableWithJoins, - assignments: Vec, + assignments: &[Assignment], from: Option, predicate_expr: Option, ) -> Result { @@ -2353,7 +2353,7 @@ ON p.function_name = r.routine_name fn validate_transaction_kind( &self, - kind: Option, + kind: Option<&BeginTransactionKind>, ) -> Result<()> { match kind { // BEGIN From f10fcbeccb5fa630305070d3a7bb3af532a2389e Mon Sep 17 00:00:00 2001 From: Dhanush Date: Sun, 9 Nov 2025 11:40:46 +0530 Subject: [PATCH 0035/1589] chore: enforce clippy lint needless_pass_by_value to physical-expr-common (#18556) ## Which issue does this PR close? - Closes #18543 ## What changes are included in this PR? enforce clippy lint `needless_pass_by_value` to `datafusion-physical-expr-common` ## Are these changes tested? yes ## Are there any user-facing changes? 
no --- datafusion/physical-expr-common/src/lib.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/datafusion/physical-expr-common/src/lib.rs b/datafusion/physical-expr-common/src/lib.rs index e21206d906422..cac863ee69fb4 100644 --- a/datafusion/physical-expr-common/src/lib.rs +++ b/datafusion/physical-expr-common/src/lib.rs @@ -23,6 +23,9 @@ // Make sure fast / cheap clones on Arc are explicit: // https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] +// https://github.com/apache/datafusion/issues/18503 +#![deny(clippy::needless_pass_by_value)] +#![cfg_attr(test, allow(clippy::needless_pass_by_value))] //! Physical Expr Common packages for [DataFusion] //! This package contains high level PhysicalExpr trait From a216d4aeedbe7522e2e31fa273c5546a0a23ea71 Mon Sep 17 00:00:00 2001 From: Cora Sutton Date: Sun, 9 Nov 2025 00:12:42 -0600 Subject: [PATCH 0036/1589] chore: Enforce lint rule `clippy::needless_pass_by_value` to `datafusion-physical-expr` (#18557) ## Which issue does this PR close? - Closes #18544. ## Rationale for this change See https://github.com/apache/datafusion/issues/18503 for details. ## What changes are included in this PR? I enabled the clippy lint rule and then fixed nearly all instances. ## Are these changes tested? As part of the normal test suite, yes. ## Are there any user-facing changes? The following `pub (crate)` APIs were changed: - `regex_match_dyn` in `datafusion/physical-expr/src/expressions/binary/kernels.rs` - `regex_match_dyn_scalar` in `datafusion/physical-expr/src/expressions/binary/kernels.rs` But no fully `pub` functions were changed. --- datafusion/physical-expr/src/analysis.rs | 6 +++--- .../src/equivalence/properties/mod.rs | 2 +- .../physical-expr/src/expressions/binary.rs | 20 +++++++++---------- .../src/expressions/binary/kernels.rs | 6 +++--- .../physical-expr/src/expressions/in_list.rs | 14 ++++++------- .../physical-expr/src/expressions/literal.rs | 1 + datafusion/physical-expr/src/lib.rs | 3 +++ .../physical-expr/src/utils/guarantee.rs | 4 ++-- 8 files changed, 30 insertions(+), 26 deletions(-) diff --git a/datafusion/physical-expr/src/analysis.rs b/datafusion/physical-expr/src/analysis.rs index 1d59dab8fd6dd..f34dfb4ae1b4a 100644 --- a/datafusion/physical-expr/src/analysis.rs +++ b/datafusion/physical-expr/src/analysis.rs @@ -218,7 +218,7 @@ pub fn analyze( .update_ranges(&mut target_indices_and_boundaries, Interval::CERTAINLY_TRUE)? { PropagationResult::Success => { - shrink_boundaries(graph, target_boundaries, target_expr_and_indices) + shrink_boundaries(&graph, target_boundaries, &target_expr_and_indices) } PropagationResult::Infeasible => { // If the propagation result is infeasible, set intervals to None @@ -239,9 +239,9 @@ pub fn analyze( /// Following this, it constructs and returns a new `AnalysisContext` with the /// updated parameters. 
fn shrink_boundaries( - graph: ExprIntervalGraph, + graph: &ExprIntervalGraph, mut target_boundaries: Vec, - target_expr_and_indices: Vec<(Arc, usize)>, + target_expr_and_indices: &[(Arc, usize)], ) -> Result { let initial_boundaries = target_boundaries.clone(); target_expr_and_indices.iter().for_each(|(expr, i)| { diff --git a/datafusion/physical-expr/src/equivalence/properties/mod.rs b/datafusion/physical-expr/src/equivalence/properties/mod.rs index 4d919d623bf9b..c13618feb8aa2 100644 --- a/datafusion/physical-expr/src/equivalence/properties/mod.rs +++ b/datafusion/physical-expr/src/equivalence/properties/mod.rs @@ -380,7 +380,7 @@ impl EquivalenceProperties { right: Arc, ) -> Result<()> { // Add equal expressions to the state: - if self.eq_group.add_equal_conditions(Arc::clone(&left), right) { + if self.eq_group.add_equal_conditions(left, right) { self.update_oeq_cache()?; } self.update_oeq_cache()?; diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index b09d57f02d582..f3a71cbea480b 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -573,10 +573,10 @@ impl BinaryExpr { ) -> Result>> { use Operator::*; let scalar_result = match &self.op { - RegexMatch => regex_match_dyn_scalar(array, scalar, false, false), - RegexIMatch => regex_match_dyn_scalar(array, scalar, false, true), - RegexNotMatch => regex_match_dyn_scalar(array, scalar, true, false), - RegexNotIMatch => regex_match_dyn_scalar(array, scalar, true, true), + RegexMatch => regex_match_dyn_scalar(array, &scalar, false, false), + RegexIMatch => regex_match_dyn_scalar(array, &scalar, false, true), + RegexNotMatch => regex_match_dyn_scalar(array, &scalar, true, false), + RegexNotIMatch => regex_match_dyn_scalar(array, &scalar, true, true), BitwiseAnd => bitwise_and_dyn_scalar(array, scalar), BitwiseOr => bitwise_or_dyn_scalar(array, scalar), BitwiseXor => bitwise_xor_dyn_scalar(array, scalar), @@ -625,16 +625,16 @@ impl BinaryExpr { ) } } - RegexMatch => regex_match_dyn(left, right, false, false), - RegexIMatch => regex_match_dyn(left, right, false, true), - RegexNotMatch => regex_match_dyn(left, right, true, false), - RegexNotIMatch => regex_match_dyn(left, right, true, true), + RegexMatch => regex_match_dyn(&left, &right, false, false), + RegexIMatch => regex_match_dyn(&left, &right, false, true), + RegexNotMatch => regex_match_dyn(&left, &right, true, false), + RegexNotIMatch => regex_match_dyn(&left, &right, true, true), BitwiseAnd => bitwise_and_dyn(left, right), BitwiseOr => bitwise_or_dyn(left, right), BitwiseXor => bitwise_xor_dyn(left, right), BitwiseShiftRight => bitwise_shift_right_dyn(left, right), BitwiseShiftLeft => bitwise_shift_left_dyn(left, right), - StringConcat => concat_elements(left, right), + StringConcat => concat_elements(&left, &right), AtArrow | ArrowAt | Arrow | LongArrow | HashArrow | HashLongArrow | AtAt | HashMinus | AtQuestion | Question | QuestionAnd | QuestionPipe | IntegerDivide => { @@ -854,7 +854,7 @@ fn pre_selection_scatter( Ok(ColumnarValue::Array(Arc::new(boolean_result))) } -fn concat_elements(left: Arc, right: Arc) -> Result { +fn concat_elements(left: &ArrayRef, right: &ArrayRef) -> Result { Ok(match left.data_type() { DataType::Utf8 => Arc::new(concat_elements_utf8( left.as_string::(), diff --git a/datafusion/physical-expr/src/expressions/binary/kernels.rs b/datafusion/physical-expr/src/expressions/binary/kernels.rs index 6c96975ed6446..ad44b00212039 
100644 --- a/datafusion/physical-expr/src/expressions/binary/kernels.rs +++ b/datafusion/physical-expr/src/expressions/binary/kernels.rs @@ -207,8 +207,8 @@ macro_rules! regexp_is_match_flag { } pub(crate) fn regex_match_dyn( - left: ArrayRef, - right: ArrayRef, + left: &ArrayRef, + right: &ArrayRef, not_match: bool, flag: bool, ) -> Result { @@ -259,7 +259,7 @@ macro_rules! regexp_is_match_flag_scalar { pub(crate) fn regex_match_dyn_scalar( left: &dyn Array, - right: ScalarValue, + right: &ScalarValue, not_match: bool, flag: bool, ) -> Option> { diff --git a/datafusion/physical-expr/src/expressions/in_list.rs b/datafusion/physical-expr/src/expressions/in_list.rs index fa91635d9bfd9..eeac986beec0c 100644 --- a/datafusion/physical-expr/src/expressions/in_list.rs +++ b/datafusion/physical-expr/src/expressions/in_list.rs @@ -149,7 +149,7 @@ where /// /// Note: This is split into a separate function as higher-rank trait bounds currently /// cause type inference to misbehave -fn make_hash_set(array: T) -> ArrayHashSet +fn make_hash_set(array: &T) -> ArrayHashSet where T: ArrayAccessor, T::Item: IsEqual, @@ -183,26 +183,26 @@ where /// Creates a `Box` for the given list of `IN` expressions and `batch` fn make_set(array: &dyn Array) -> Result> { Ok(downcast_primitive_array! { - array => Arc::new(ArraySet::new(array, make_hash_set(array))), + array => Arc::new(ArraySet::new(array, make_hash_set(&array))), DataType::Boolean => { let array = as_boolean_array(array)?; - Arc::new(ArraySet::new(array, make_hash_set(array))) + Arc::new(ArraySet::new(array, make_hash_set(&array))) }, DataType::Utf8 => { let array = as_string_array(array)?; - Arc::new(ArraySet::new(array, make_hash_set(array))) + Arc::new(ArraySet::new(array, make_hash_set(&array))) } DataType::LargeUtf8 => { let array = as_largestring_array(array); - Arc::new(ArraySet::new(array, make_hash_set(array))) + Arc::new(ArraySet::new(array, make_hash_set(&array))) } DataType::Binary => { let array = as_generic_binary_array::(array)?; - Arc::new(ArraySet::new(array, make_hash_set(array))) + Arc::new(ArraySet::new(array, make_hash_set(&array))) } DataType::LargeBinary => { let array = as_generic_binary_array::(array)?; - Arc::new(ArraySet::new(array, make_hash_set(array))) + Arc::new(ArraySet::new(array, make_hash_set(&array))) } DataType::Dictionary(_, _) => unreachable!("dictionary should have been flattened"), d => return not_impl_err!("DataType::{d} not supported in InList") diff --git a/datafusion/physical-expr/src/expressions/literal.rs b/datafusion/physical-expr/src/expressions/literal.rs index 94e91d43a1c48..359bfcefdbb5f 100644 --- a/datafusion/physical-expr/src/expressions/literal.rs +++ b/datafusion/physical-expr/src/expressions/literal.rs @@ -137,6 +137,7 @@ impl PhysicalExpr for Literal { } /// Create a literal expression +#[allow(clippy::needless_pass_by_value)] pub fn lit(value: T) -> Arc { match value.lit() { Expr::Literal(v, _) => Arc::new(Literal::new(v)), diff --git a/datafusion/physical-expr/src/lib.rs b/datafusion/physical-expr/src/lib.rs index aa8c9e50fd71e..f59582f405064 100644 --- a/datafusion/physical-expr/src/lib.rs +++ b/datafusion/physical-expr/src/lib.rs @@ -23,6 +23,9 @@ // Make sure fast / cheap clones on Arc are explicit: // https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] +// https://github.com/apache/datafusion/issues/18503 +#![deny(clippy::needless_pass_by_value)] +#![cfg_attr(test, allow(clippy::needless_pass_by_value))] // Backward compatibility pub mod aggregate; diff --git 
a/datafusion/physical-expr/src/utils/guarantee.rs b/datafusion/physical-expr/src/utils/guarantee.rs index 8a57cc7b7c154..d63a9590c3f66 100644 --- a/datafusion/physical-expr/src/utils/guarantee.rs +++ b/datafusion/physical-expr/src/utils/guarantee.rs @@ -124,7 +124,7 @@ impl LiteralGuarantee { // for an `AND` conjunction to be true, all terms individually must be true .fold(GuaranteeBuilder::new(), |builder, expr| { if let Some(cel) = ColOpLit::try_new(expr) { - builder.aggregate_conjunct(cel) + builder.aggregate_conjunct(&cel) } else if let Some(inlist) = expr .as_any() .downcast_ref::() @@ -292,7 +292,7 @@ impl<'a> GuaranteeBuilder<'a> { /// # Examples /// * `AND (a = 1)`: `a` is guaranteed to be 1 /// * `AND (a != 1)`: a is guaranteed to not be 1 - fn aggregate_conjunct(self, col_op_lit: ColOpLit<'a>) -> Self { + fn aggregate_conjunct(self, col_op_lit: &ColOpLit<'a>) -> Self { self.aggregate_multi_conjunct( col_op_lit.col, col_op_lit.guarantee, From 6ab4d216b768c9327982e59376a62a29c69ca436 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Sun, 9 Nov 2025 01:15:14 -0600 Subject: [PATCH 0037/1589] Force `FileSource` to be constructed with a `Schema` (#18386) Most of these file source implementations cannot operate without schema, they all have `.expect("schema must be set")`s that violate using the language to enforce correctness. This is an attempt to rework that by making it so you have to pass in a schema to construct them. That said there are downsides: 1. More boilerplate. 2. Requires that the schema passed into `FileScanConfig` and `FileSource` match. I feel like there's another twist to this needed... maybe moving the schema out of `FileScanConfig`? That's not currently possible, it's used in both places. Maybe having a `FileScan` and a `FileScanConfig` and having construction be `FileScan::new(FileSource::new(config), config)`? 
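For illustration, a minimal before/after sketch of the new construction pattern, abbreviated from the example diffs below (variables such as `object_store_url`, `schema`, `predicate`, and `partitioned_file` are assumed to already be in scope; this is not the exact code from any one call site):

```rust
// before: the source carried no schema, and FileScanConfigBuilder took one separately
let source = Arc::new(ParquetSource::default().with_predicate(predicate));
let config = FileScanConfigBuilder::new(object_store_url, schema, source)
    .with_file(partitioned_file)
    .build();

// after: the schema is required to construct the source, and the builder
// no longer takes a separate schema argument
let source = Arc::new(ParquetSource::new(schema.clone()).with_predicate(predicate));
let config = FileScanConfigBuilder::new(object_store_url, source)
    .with_file(partitioned_file)
    .build();
```

The same pattern applies to the other sources touched here (`CsvSource`, `JsonSource`, `AvroSource`), which now take a schema or `TableSchema` in their constructors, as the diffs below show.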
--- .../examples/advanced_parquet_index.rs | 13 +- .../examples/csv_json_opener.rs | 22 ++- .../examples/custom_file_format.rs | 5 +- .../examples/default_column_values.rs | 3 +- .../examples/parquet_embedded_index.rs | 6 +- datafusion-examples/examples/parquet_index.rs | 5 +- datafusion/catalog-listing/src/table.rs | 38 ++-- .../core/src/datasource/file_format/mod.rs | 6 +- datafusion/core/src/datasource/mod.rs | 13 +- .../core/src/datasource/physical_plan/avro.rs | 29 ++- .../core/src/datasource/physical_plan/csv.rs | 166 +++++++++++------- .../core/src/datasource/physical_plan/json.rs | 16 +- .../src/datasource/physical_plan/parquet.rs | 61 +++---- datafusion/core/src/test/mod.rs | 19 +- datafusion/core/src/test_util/parquet.rs | 37 ++-- datafusion/core/tests/fuzz_cases/pruning.rs | 5 +- .../core/tests/parquet/custom_reader.rs | 3 +- .../tests/parquet/external_access_plan.rs | 6 +- datafusion/core/tests/parquet/page_pruning.rs | 4 +- .../core/tests/parquet/schema_adapter.rs | 22 +-- .../core/tests/parquet/schema_coercion.rs | 15 +- .../enforce_distribution.rs | 66 ++++--- .../physical_optimizer/enforce_sorting.rs | 11 +- .../physical_optimizer/filter_pushdown/mod.rs | 25 ++- .../filter_pushdown/util.rs | 82 ++++----- .../physical_optimizer/projection_pushdown.rs | 59 ++++--- .../tests/physical_optimizer/test_utils.rs | 9 +- .../schema_adapter_integration_tests.rs | 40 +++-- .../datasource-arrow/src/file_format.rs | 11 +- datafusion/datasource-arrow/src/source.rs | 23 ++- datafusion/datasource-avro/src/file_format.rs | 10 +- datafusion/datasource-avro/src/source.rs | 31 ++-- datafusion/datasource-csv/src/file_format.rs | 36 ++-- datafusion/datasource-csv/src/mod.rs | 4 +- datafusion/datasource-csv/src/source.rs | 128 +++++++------- datafusion/datasource-json/src/file_format.rs | 11 +- datafusion/datasource-json/src/source.rs | 23 ++- .../datasource-parquet/src/file_format.rs | 15 +- datafusion/datasource-parquet/src/opener.rs | 2 +- datafusion/datasource-parquet/src/source.rs | 77 ++++---- datafusion/datasource/src/file.rs | 7 +- datafusion/datasource/src/file_format.rs | 5 +- datafusion/datasource/src/file_scan_config.rs | 157 ++++++++--------- datafusion/datasource/src/file_stream.rs | 4 +- datafusion/datasource/src/table_schema.rs | 6 + datafusion/datasource/src/test_util.rs | 37 +++- .../proto/src/physical_plan/from_proto.rs | 58 +++--- datafusion/proto/src/physical_plan/mod.rs | 41 +++-- .../tests/cases/roundtrip_physical_plan.rs | 130 +++++++------- .../substrait/src/physical_plan/consumer.rs | 4 +- .../tests/cases/roundtrip_physical_plan.rs | 34 ++-- docs/source/library-user-guide/upgrading.md | 89 +++++++++- 52 files changed, 1006 insertions(+), 723 deletions(-) diff --git a/datafusion-examples/examples/advanced_parquet_index.rs b/datafusion-examples/examples/advanced_parquet_index.rs index 371c18de354ce..67bfc5b1bcf56 100644 --- a/datafusion-examples/examples/advanced_parquet_index.rs +++ b/datafusion-examples/examples/advanced_parquet_index.rs @@ -491,19 +491,18 @@ impl TableProvider for IndexTableProvider { .with_file(indexed_file); let file_source = Arc::new( - ParquetSource::default() + ParquetSource::new(schema.clone()) // provide the predicate so the DataSourceExec can try and prune // row groups internally .with_predicate(predicate) // provide the factory to create parquet reader without re-reading metadata .with_parquet_file_reader_factory(Arc::new(reader_factory)), ); - let file_scan_config = - FileScanConfigBuilder::new(object_store_url, schema, file_source) - 
.with_limit(limit) - .with_projection_indices(projection.cloned()) - .with_file(partitioned_file) - .build(); + let file_scan_config = FileScanConfigBuilder::new(object_store_url, file_source) + .with_limit(limit) + .with_projection_indices(projection.cloned()) + .with_file(partitioned_file) + .build(); // Finally, put it all together into a DataSourceExec Ok(DataSourceExec::from_data_source(file_scan_config)) diff --git a/datafusion-examples/examples/csv_json_opener.rs b/datafusion-examples/examples/csv_json_opener.rs index ef2a3eaca0c88..6d0e4f4a3da7a 100644 --- a/datafusion-examples/examples/csv_json_opener.rs +++ b/datafusion-examples/examples/csv_json_opener.rs @@ -18,6 +18,7 @@ use std::sync::Arc; use arrow::datatypes::{DataType, Field, Schema}; +use datafusion::common::config::CsvOptions; use datafusion::{ assert_batches_eq, datasource::{ @@ -31,9 +32,7 @@ use datafusion::{ test_util::aggr_test_schema, }; -use datafusion::datasource::{ - physical_plan::FileScanConfigBuilder, table_schema::TableSchema, -}; +use datafusion::datasource::physical_plan::FileScanConfigBuilder; use futures::StreamExt; use object_store::{local::LocalFileSystem, memory::InMemory, ObjectStore}; @@ -57,19 +56,25 @@ async fn csv_opener() -> Result<()> { let path = std::path::Path::new(&path).canonicalize()?; + let options = CsvOptions { + has_header: Some(true), + delimiter: b',', + quote: b'"', + ..Default::default() + }; + let scan_config = FileScanConfigBuilder::new( ObjectStoreUrl::local_filesystem(), - Arc::clone(&schema), - Arc::new(CsvSource::default()), + Arc::new(CsvSource::new(Arc::clone(&schema)).with_csv_options(options.clone())), ) .with_projection_indices(Some(vec![12, 0])) .with_limit(Some(5)) .with_file(PartitionedFile::new(path.display().to_string(), 10)) .build(); - let config = CsvSource::new(true, b',', b'"') + let config = CsvSource::new(Arc::clone(&schema)) + .with_csv_options(options) .with_comment(Some(b'#')) - .with_schema(TableSchema::from_file_schema(schema)) .with_batch_size(8192) .with_projection(&scan_config); @@ -125,8 +130,7 @@ async fn json_opener() -> Result<()> { let scan_config = FileScanConfigBuilder::new( ObjectStoreUrl::local_filesystem(), - schema, - Arc::new(JsonSource::default()), + Arc::new(JsonSource::new(schema)), ) .with_projection_indices(Some(vec![1, 0])) .with_limit(Some(5)) diff --git a/datafusion-examples/examples/custom_file_format.rs b/datafusion-examples/examples/custom_file_format.rs index 67fe642fd46ee..3505651eb183c 100644 --- a/datafusion-examples/examples/custom_file_format.rs +++ b/datafusion-examples/examples/custom_file_format.rs @@ -30,6 +30,7 @@ use datafusion::{ FileFormat, FileFormatFactory, }, physical_plan::{FileScanConfig, FileSinkConfig, FileSource}, + table_schema::TableSchema, MemTable, }, error::Result, @@ -128,8 +129,8 @@ impl FileFormat for TSVFileFormat { .await } - fn file_source(&self) -> Arc { - self.csv_file_format.file_source() + fn file_source(&self, table_schema: TableSchema) -> Arc { + self.csv_file_format.file_source(table_schema) } } diff --git a/datafusion-examples/examples/default_column_values.rs b/datafusion-examples/examples/default_column_values.rs index d3a7d2ec67f3c..bfc60519f26e4 100644 --- a/datafusion-examples/examples/default_column_values.rs +++ b/datafusion-examples/examples/default_column_values.rs @@ -235,7 +235,7 @@ impl TableProvider for DefaultValueTableProvider { &df_schema, )?; - let parquet_source = ParquetSource::default() + let parquet_source = ParquetSource::new(schema.clone()) .with_predicate(filter) 
.with_pushdown_filters(true); @@ -257,7 +257,6 @@ impl TableProvider for DefaultValueTableProvider { let file_scan_config = FileScanConfigBuilder::new( ObjectStoreUrl::parse("memory://")?, - self.schema.clone(), Arc::new(parquet_source), ) .with_projection_indices(projection.cloned()) diff --git a/datafusion-examples/examples/parquet_embedded_index.rs b/datafusion-examples/examples/parquet_embedded_index.rs index 3cbe189147752..bc0e5a072caac 100644 --- a/datafusion-examples/examples/parquet_embedded_index.rs +++ b/datafusion-examples/examples/parquet_embedded_index.rs @@ -426,8 +426,10 @@ impl TableProvider for DistinctIndexTable { // Build ParquetSource to actually read the files let url = ObjectStoreUrl::parse("file://")?; - let source = Arc::new(ParquetSource::default().with_enable_page_index(true)); - let mut builder = FileScanConfigBuilder::new(url, self.schema.clone(), source); + let source = Arc::new( + ParquetSource::new(self.schema.clone()).with_enable_page_index(true), + ); + let mut builder = FileScanConfigBuilder::new(url, source); for file in files_to_scan { let path = self.dir.join(file); let len = std::fs::metadata(&path)?.len(); diff --git a/datafusion-examples/examples/parquet_index.rs b/datafusion-examples/examples/parquet_index.rs index a1dd1f1ffd10d..bc9e2a9226d0b 100644 --- a/datafusion-examples/examples/parquet_index.rs +++ b/datafusion-examples/examples/parquet_index.rs @@ -242,9 +242,10 @@ impl TableProvider for IndexTableProvider { let files = self.index.get_files(predicate.clone())?; let object_store_url = ObjectStoreUrl::parse("file://")?; - let source = Arc::new(ParquetSource::default().with_predicate(predicate)); + let source = + Arc::new(ParquetSource::new(self.schema()).with_predicate(predicate)); let mut file_scan_config_builder = - FileScanConfigBuilder::new(object_store_url, self.schema(), source) + FileScanConfigBuilder::new(object_store_url, source) .with_projection_indices(projection.cloned()) .with_limit(limit); diff --git a/datafusion/catalog-listing/src/table.rs b/datafusion/catalog-listing/src/table.rs index 95f9523d4401c..33d5c86bf88dc 100644 --- a/datafusion/catalog-listing/src/table.rs +++ b/datafusion/catalog-listing/src/table.rs @@ -34,7 +34,7 @@ use datafusion_datasource::schema_adapter::{ DefaultSchemaAdapterFactory, SchemaAdapter, SchemaAdapterFactory, }; use datafusion_datasource::{ - compute_all_files_statistics, ListingTableUrl, PartitionedFile, + compute_all_files_statistics, ListingTableUrl, PartitionedFile, TableSchema, }; use datafusion_execution::cache::cache_manager::FileStatisticsCache; use datafusion_execution::cache::cache_unit::DefaultFileStatisticsCache; @@ -338,7 +338,16 @@ impl ListingTable { fn create_file_source_with_schema_adapter( &self, ) -> datafusion_common::Result> { - let mut source = self.options.format.file_source(); + let table_schema = TableSchema::new( + Arc::clone(&self.file_schema), + self.options + .table_partition_cols + .iter() + .map(|(col, field)| Arc::new(Field::new(col, field.clone(), false))) + .collect(), + ); + + let mut source = self.options.format.file_source(table_schema); // Apply schema adapter to source if available // // The source will use this SchemaAdapter to adapt data batches as they flow up the plan. 
@@ -418,7 +427,7 @@ impl TableProvider for ListingTable { .options .table_partition_cols .iter() - .map(|col| Ok(self.table_schema.field_with_name(&col.0)?.clone())) + .map(|col| Ok(Arc::new(self.table_schema.field_with_name(&col.0)?.clone()))) .collect::>>()?; let table_partition_col_names = table_partition_cols @@ -491,20 +500,15 @@ impl TableProvider for ListingTable { .format .create_physical_plan( state, - FileScanConfigBuilder::new( - object_store_url, - Arc::clone(&self.file_schema), - file_source, - ) - .with_file_groups(partitioned_file_lists) - .with_constraints(self.constraints.clone()) - .with_statistics(statistics) - .with_projection_indices(projection) - .with_limit(limit) - .with_output_ordering(output_ordering) - .with_table_partition_cols(table_partition_cols) - .with_expr_adapter(self.expr_adapter_factory.clone()) - .build(), + FileScanConfigBuilder::new(object_store_url, file_source) + .with_file_groups(partitioned_file_lists) + .with_constraints(self.constraints.clone()) + .with_statistics(statistics) + .with_projection_indices(projection) + .with_limit(limit) + .with_output_ordering(output_ordering) + .with_expr_adapter(self.expr_adapter_factory.clone()) + .build(), ) .await?; diff --git a/datafusion/core/src/datasource/file_format/mod.rs b/datafusion/core/src/datasource/file_format/mod.rs index 4881783eeba69..7c55d452c4e12 100644 --- a/datafusion/core/src/datasource/file_format/mod.rs +++ b/datafusion/core/src/datasource/file_format/mod.rs @@ -40,6 +40,7 @@ pub(crate) mod test_util { use datafusion_catalog::Session; use datafusion_common::Result; use datafusion_datasource::file_scan_config::FileScanConfigBuilder; + use datafusion_datasource::TableSchema; use datafusion_datasource::{file_format::FileFormat, PartitionedFile}; use datafusion_execution::object_store::ObjectStoreUrl; use std::sync::Arc; @@ -66,6 +67,8 @@ pub(crate) mod test_util { .await? 
}; + let table_schema = TableSchema::new(file_schema.clone(), vec![]); + let statistics = format .infer_stats(state, &store, file_schema.clone(), &meta) .await?; @@ -85,8 +88,7 @@ pub(crate) mod test_util { state, FileScanConfigBuilder::new( ObjectStoreUrl::local_filesystem(), - file_schema, - format.file_source(), + format.file_source(table_schema), ) .with_file_groups(file_groups) .with_statistics(statistics) diff --git a/datafusion/core/src/datasource/mod.rs b/datafusion/core/src/datasource/mod.rs index 37b9663111a53..620e389a0fb85 100644 --- a/datafusion/core/src/datasource/mod.rs +++ b/datafusion/core/src/datasource/mod.rs @@ -124,16 +124,13 @@ mod tests { let f2 = Field::new("extra_column", DataType::Utf8, true); let schema = Arc::new(Schema::new(vec![f1.clone(), f2.clone()])); - let source = ParquetSource::default() + let source = ParquetSource::new(Arc::clone(&schema)) .with_schema_adapter_factory(Arc::new(TestSchemaAdapterFactory {})) .unwrap(); - let base_conf = FileScanConfigBuilder::new( - ObjectStoreUrl::local_filesystem(), - schema, - source, - ) - .with_file(partitioned_file) - .build(); + let base_conf = + FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), source) + .with_file(partitioned_file) + .build(); let parquet_exec = DataSourceExec::from_data_source(base_conf); diff --git a/datafusion/core/src/datasource/physical_plan/avro.rs b/datafusion/core/src/datasource/physical_plan/avro.rs index 9068c9758179d..1cf8c573acd95 100644 --- a/datafusion/core/src/datasource/physical_plan/avro.rs +++ b/datafusion/core/src/datasource/physical_plan/avro.rs @@ -34,7 +34,7 @@ mod tests { use datafusion_common::{test_util, Result, ScalarValue}; use datafusion_datasource::file_format::FileFormat; use datafusion_datasource::file_scan_config::FileScanConfigBuilder; - use datafusion_datasource::PartitionedFile; + use datafusion_datasource::{PartitionedFile, TableSchema}; use datafusion_datasource_avro::source::AvroSource; use datafusion_datasource_avro::AvroFormat; use datafusion_execution::object_store::ObjectStoreUrl; @@ -81,15 +81,11 @@ mod tests { .infer_schema(&state, &store, std::slice::from_ref(&meta)) .await?; - let source = Arc::new(AvroSource::new()); - let conf = FileScanConfigBuilder::new( - ObjectStoreUrl::local_filesystem(), - file_schema, - source, - ) - .with_file(meta.into()) - .with_projection_indices(Some(vec![0, 1, 2])) - .build(); + let source = Arc::new(AvroSource::new(Arc::clone(&file_schema))); + let conf = FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), source) + .with_file(meta.into()) + .with_projection_indices(Some(vec![0, 1, 2])) + .build(); let source_exec = DataSourceExec::from_data_source(conf); assert_eq!( @@ -157,8 +153,8 @@ mod tests { // Include the missing column in the projection let projection = Some(vec![0, 1, 2, actual_schema.fields().len()]); - let source = Arc::new(AvroSource::new()); - let conf = FileScanConfigBuilder::new(object_store_url, file_schema, source) + let source = Arc::new(AvroSource::new(Arc::clone(&file_schema))); + let conf = FileScanConfigBuilder::new(object_store_url, source) .with_file(meta.into()) .with_projection_indices(projection) .build(); @@ -227,13 +223,16 @@ mod tests { partitioned_file.partition_values = vec![ScalarValue::from("2021-10-26")]; let projection = Some(vec![0, 1, file_schema.fields().len(), 2]); - let source = Arc::new(AvroSource::new()); - let conf = FileScanConfigBuilder::new(object_store_url, file_schema, source) + let table_schema = TableSchema::new( + file_schema.clone(), + 
vec![Arc::new(Field::new("date", DataType::Utf8, false))], + ); + let source = Arc::new(AvroSource::new(table_schema.clone())); + let conf = FileScanConfigBuilder::new(object_store_url, source) // select specific columns of the files as well as the partitioning // column which is supposed to be the last column in the table schema. .with_projection_indices(projection) .with_file(partitioned_file) - .with_table_partition_cols(vec![Field::new("date", DataType::Utf8, false)]) .build(); let source_exec = DataSourceExec::from_data_source(conf); diff --git a/datafusion/core/src/datasource/physical_plan/csv.rs b/datafusion/core/src/datasource/physical_plan/csv.rs index 4f46a57d8b137..ac5df24d49990 100644 --- a/datafusion/core/src/datasource/physical_plan/csv.rs +++ b/datafusion/core/src/datasource/physical_plan/csv.rs @@ -29,12 +29,14 @@ mod tests { use std::io::Write; use std::sync::Arc; + use datafusion_datasource::TableSchema; use datafusion_datasource_csv::CsvFormat; use object_store::ObjectStore; use crate::prelude::CsvReadOptions; use crate::prelude::SessionContext; use crate::test::partitioned_file_groups; + use datafusion_common::config::CsvOptions; use datafusion_common::test_util::arrow_test_data; use datafusion_common::test_util::batches_to_string; use datafusion_common::{assert_batches_eq, Result}; @@ -94,6 +96,8 @@ mod tests { async fn csv_exec_with_projection( file_compression_type: FileCompressionType, ) -> Result<()> { + use datafusion_datasource::TableSchema; + let session_ctx = SessionContext::new(); let task_ctx = session_ctx.task_ctx(); let file_schema = aggr_test_schema(); @@ -110,16 +114,21 @@ mod tests { tmp_dir.path(), )?; - let source = Arc::new(CsvSource::new(true, b',', b'"')); - let config = FileScanConfigBuilder::from(partitioned_csv_config( - file_schema, - file_groups, - source, - )) - .with_file_compression_type(file_compression_type) - .with_newlines_in_values(false) - .with_projection_indices(Some(vec![0, 2, 4])) - .build(); + let options = CsvOptions { + has_header: Some(true), + delimiter: b',', + quote: b'"', + ..Default::default() + }; + let table_schema = TableSchema::from_file_schema(Arc::clone(&file_schema)); + let source = + Arc::new(CsvSource::new(table_schema.clone()).with_csv_options(options)); + let config = + FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)) + .with_file_compression_type(file_compression_type) + .with_newlines_in_values(false) + .with_projection_indices(Some(vec![0, 2, 4])) + .build(); assert_eq!(13, config.file_schema().fields().len()); let csv = DataSourceExec::from_data_source(config); @@ -158,6 +167,8 @@ mod tests { async fn csv_exec_with_mixed_order_projection( file_compression_type: FileCompressionType, ) -> Result<()> { + use datafusion_datasource::TableSchema; + let cfg = SessionConfig::new().set_str("datafusion.catalog.has_header", "true"); let session_ctx = SessionContext::new_with_config(cfg); let task_ctx = session_ctx.task_ctx(); @@ -175,16 +186,21 @@ mod tests { tmp_dir.path(), )?; - let source = Arc::new(CsvSource::new(true, b',', b'"')); - let config = FileScanConfigBuilder::from(partitioned_csv_config( - file_schema, - file_groups, - source, - )) - .with_newlines_in_values(false) - .with_file_compression_type(file_compression_type.to_owned()) - .with_projection_indices(Some(vec![4, 0, 2])) - .build(); + let options = CsvOptions { + has_header: Some(true), + delimiter: b',', + quote: b'"', + ..Default::default() + }; + let table_schema = TableSchema::from_file_schema(Arc::clone(&file_schema)); + 
let source = + Arc::new(CsvSource::new(table_schema.clone()).with_csv_options(options)); + let config = + FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)) + .with_newlines_in_values(false) + .with_file_compression_type(file_compression_type.to_owned()) + .with_projection_indices(Some(vec![4, 0, 2])) + .build(); assert_eq!(13, config.file_schema().fields().len()); let csv = DataSourceExec::from_data_source(config); assert_eq!(3, csv.schema().fields().len()); @@ -221,6 +237,7 @@ mod tests { async fn csv_exec_with_limit( file_compression_type: FileCompressionType, ) -> Result<()> { + use datafusion_datasource::TableSchema; use futures::StreamExt; let cfg = SessionConfig::new().set_str("datafusion.catalog.has_header", "true"); @@ -240,16 +257,21 @@ mod tests { tmp_dir.path(), )?; - let source = Arc::new(CsvSource::new(true, b',', b'"')); - let config = FileScanConfigBuilder::from(partitioned_csv_config( - file_schema, - file_groups, - source, - )) - .with_newlines_in_values(false) - .with_file_compression_type(file_compression_type.to_owned()) - .with_limit(Some(5)) - .build(); + let options = CsvOptions { + has_header: Some(true), + delimiter: b',', + quote: b'"', + ..Default::default() + }; + let table_schema = TableSchema::from_file_schema(Arc::clone(&file_schema)); + let source = + Arc::new(CsvSource::new(table_schema.clone()).with_csv_options(options)); + let config = + FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)) + .with_newlines_in_values(false) + .with_file_compression_type(file_compression_type.to_owned()) + .with_limit(Some(5)) + .build(); assert_eq!(13, config.file_schema().fields().len()); let csv = DataSourceExec::from_data_source(config); assert_eq!(13, csv.schema().fields().len()); @@ -287,6 +309,8 @@ mod tests { async fn csv_exec_with_missing_column( file_compression_type: FileCompressionType, ) -> Result<()> { + use datafusion_datasource::TableSchema; + let session_ctx = SessionContext::new(); let task_ctx = session_ctx.task_ctx(); let file_schema = aggr_test_schema_with_missing_col(); @@ -303,16 +327,21 @@ mod tests { tmp_dir.path(), )?; - let source = Arc::new(CsvSource::new(true, b',', b'"')); - let config = FileScanConfigBuilder::from(partitioned_csv_config( - file_schema, - file_groups, - source, - )) - .with_newlines_in_values(false) - .with_file_compression_type(file_compression_type.to_owned()) - .with_limit(Some(5)) - .build(); + let options = CsvOptions { + has_header: Some(true), + delimiter: b',', + quote: b'"', + ..Default::default() + }; + let table_schema = TableSchema::from_file_schema(Arc::clone(&file_schema)); + let source = + Arc::new(CsvSource::new(table_schema.clone()).with_csv_options(options)); + let config = + FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)) + .with_newlines_in_values(false) + .with_file_compression_type(file_compression_type.to_owned()) + .with_limit(Some(5)) + .build(); assert_eq!(14, config.file_schema().fields().len()); let csv = DataSourceExec::from_data_source(config); assert_eq!(14, csv.schema().fields().len()); @@ -341,6 +370,7 @@ mod tests { file_compression_type: FileCompressionType, ) -> Result<()> { use datafusion_common::ScalarValue; + use datafusion_datasource::TableSchema; let session_ctx = SessionContext::new(); let task_ctx = session_ctx.task_ctx(); @@ -362,19 +392,26 @@ mod tests { let num_file_schema_fields = file_schema.fields().len(); - let source = Arc::new(CsvSource::new(true, b',', b'"')); - let config = 
FileScanConfigBuilder::from(partitioned_csv_config( - file_schema, - file_groups, - source, - )) - .with_newlines_in_values(false) - .with_file_compression_type(file_compression_type.to_owned()) - .with_table_partition_cols(vec![Field::new("date", DataType::Utf8, false)]) - // We should be able to project on the partition column - // Which is supposed to be after the file fields - .with_projection_indices(Some(vec![0, num_file_schema_fields])) - .build(); + let options = CsvOptions { + has_header: Some(true), + delimiter: b',', + quote: b'"', + ..Default::default() + }; + let table_schema = TableSchema::new( + Arc::clone(&file_schema), + vec![Arc::new(Field::new("date", DataType::Utf8, false))], + ); + let source = + Arc::new(CsvSource::new(table_schema.clone()).with_csv_options(options)); + let config = + FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)) + .with_newlines_in_values(false) + .with_file_compression_type(file_compression_type.to_owned()) + // We should be able to project on the partition column + // Which is supposed to be after the file fields + .with_projection_indices(Some(vec![0, num_file_schema_fields])) + .build(); // we don't have `/date=xx/` in the path but that is ok because // partitions are resolved during scan anyway @@ -463,15 +500,20 @@ mod tests { ) .unwrap(); - let source = Arc::new(CsvSource::new(true, b',', b'"')); - let config = FileScanConfigBuilder::from(partitioned_csv_config( - file_schema, - file_groups, - source, - )) - .with_newlines_in_values(false) - .with_file_compression_type(file_compression_type.to_owned()) - .build(); + let options = CsvOptions { + has_header: Some(true), + delimiter: b',', + quote: b'"', + ..Default::default() + }; + let table_schema = TableSchema::from_file_schema(Arc::clone(&file_schema)); + let source = + Arc::new(CsvSource::new(table_schema.clone()).with_csv_options(options)); + let config = + FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)) + .with_newlines_in_values(false) + .with_file_compression_type(file_compression_type.to_owned()) + .build(); let csv = DataSourceExec::from_data_source(config); let it = csv.execute(0, task_ctx).unwrap(); diff --git a/datafusion/core/src/datasource/physical_plan/json.rs b/datafusion/core/src/datasource/physical_plan/json.rs index f7d5c710bf48a..de7e87d25c848 100644 --- a/datafusion/core/src/datasource/physical_plan/json.rs +++ b/datafusion/core/src/datasource/physical_plan/json.rs @@ -176,8 +176,8 @@ mod tests { let (object_store_url, file_groups, file_schema) = prepare_store(&state, file_compression_type.to_owned(), tmp_dir.path()).await; - let source = Arc::new(JsonSource::new()); - let conf = FileScanConfigBuilder::new(object_store_url, file_schema, source) + let source = Arc::new(JsonSource::new(Arc::clone(&file_schema))); + let conf = FileScanConfigBuilder::new(object_store_url, source) .with_file_groups(file_groups) .with_limit(Some(3)) .with_file_compression_type(file_compression_type.to_owned()) @@ -251,8 +251,8 @@ mod tests { let file_schema = Arc::new(builder.finish()); let missing_field_idx = file_schema.fields.len() - 1; - let source = Arc::new(JsonSource::new()); - let conf = FileScanConfigBuilder::new(object_store_url, file_schema, source) + let source = Arc::new(JsonSource::new(Arc::clone(&file_schema))); + let conf = FileScanConfigBuilder::new(object_store_url, source) .with_file_groups(file_groups) .with_limit(Some(3)) .with_file_compression_type(file_compression_type.to_owned()) @@ -294,8 +294,8 @@ mod tests { let 
(object_store_url, file_groups, file_schema) = prepare_store(&state, file_compression_type.to_owned(), tmp_dir.path()).await; - let source = Arc::new(JsonSource::new()); - let conf = FileScanConfigBuilder::new(object_store_url, file_schema, source) + let source = Arc::new(JsonSource::new(Arc::clone(&file_schema))); + let conf = FileScanConfigBuilder::new(object_store_url, source) .with_file_groups(file_groups) .with_projection_indices(Some(vec![0, 2])) .with_file_compression_type(file_compression_type.to_owned()) @@ -342,8 +342,8 @@ mod tests { let (object_store_url, file_groups, file_schema) = prepare_store(&state, file_compression_type.to_owned(), tmp_dir.path()).await; - let source = Arc::new(JsonSource::new()); - let conf = FileScanConfigBuilder::new(object_store_url, file_schema, source) + let source = Arc::new(JsonSource::new(Arc::clone(&file_schema))); + let conf = FileScanConfigBuilder::new(object_store_url, source) .with_file_groups(file_groups) .with_projection_indices(Some(vec![3, 0, 2])) .with_file_compression_type(file_compression_type.to_owned()) diff --git a/datafusion/core/src/datasource/physical_plan/parquet.rs b/datafusion/core/src/datasource/physical_plan/parquet.rs index 0ffb252a66052..b27dcf56e33cb 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet.rs @@ -161,7 +161,7 @@ mod tests { .as_ref() .map(|p| logical2physical(p, &table_schema)); - let mut source = ParquetSource::default(); + let mut source = ParquetSource::new(table_schema); if let Some(predicate) = predicate { source = source.with_predicate(predicate); } @@ -186,23 +186,19 @@ mod tests { source = source.with_bloom_filter_on_read(false); } - source.with_schema(TableSchema::new(Arc::clone(&table_schema), vec![])) + Arc::new(source) } fn build_parquet_exec( &self, - file_schema: SchemaRef, file_group: FileGroup, source: Arc, ) -> Arc { - let base_config = FileScanConfigBuilder::new( - ObjectStoreUrl::local_filesystem(), - file_schema, - source, - ) - .with_file_group(file_group) - .with_projection_indices(self.projection.clone()) - .build(); + let base_config = + FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), source) + .with_file_group(file_group) + .with_projection_indices(self.projection.clone()) + .build(); DataSourceExec::from_data_source(base_config) } @@ -231,11 +227,8 @@ mod tests { // build a ParquetExec to return the results let parquet_source = self.build_file_source(Arc::clone(table_schema)); - let parquet_exec = self.build_parquet_exec( - Arc::clone(table_schema), - file_group.clone(), - Arc::clone(&parquet_source), - ); + let parquet_exec = + self.build_parquet_exec(file_group.clone(), Arc::clone(&parquet_source)); let analyze_exec = Arc::new(AnalyzeExec::new( false, @@ -243,7 +236,6 @@ mod tests { vec![MetricType::SUMMARY, MetricType::DEV], // use a new ParquetSource to avoid sharing execution metrics self.build_parquet_exec( - Arc::clone(table_schema), file_group.clone(), self.build_file_source(Arc::clone(table_schema)), ), @@ -1550,8 +1542,7 @@ mod tests { ) -> Result<()> { let config = FileScanConfigBuilder::new( ObjectStoreUrl::local_filesystem(), - file_schema, - Arc::new(ParquetSource::default()), + Arc::new(ParquetSource::new(file_schema)), ) .with_file_groups(file_groups) .build(); @@ -1653,23 +1644,26 @@ mod tests { ), ]); - let source = Arc::new(ParquetSource::default()); - let config = FileScanConfigBuilder::new(object_store_url, schema.clone(), source) - .with_file(partitioned_file) - // file 
has 10 cols so index 12 should be month and 13 should be day - .with_projection_indices(Some(vec![0, 1, 2, 12, 13])) - .with_table_partition_cols(vec![ - Field::new("year", DataType::Utf8, false), - Field::new("month", DataType::UInt8, false), - Field::new( + let table_schema = TableSchema::new( + Arc::clone(&schema), + vec![ + Arc::new(Field::new("year", DataType::Utf8, false)), + Arc::new(Field::new("month", DataType::UInt8, false)), + Arc::new(Field::new( "day", DataType::Dictionary( Box::new(DataType::UInt16), Box::new(DataType::Utf8), ), false, - ), - ]) + )), + ], + ); + let source = Arc::new(ParquetSource::new(table_schema.clone())); + let config = FileScanConfigBuilder::new(object_store_url, source) + .with_file(partitioned_file) + // file has 10 cols so index 12 should be month and 13 should be day + .with_projection_indices(Some(vec![0, 1, 2, 12, 13])) .build(); let parquet_exec = DataSourceExec::from_data_source(config); @@ -1731,8 +1725,7 @@ mod tests { let file_schema = Arc::new(Schema::empty()); let config = FileScanConfigBuilder::new( ObjectStoreUrl::local_filesystem(), - file_schema, - Arc::new(ParquetSource::default()), + Arc::new(ParquetSource::new(file_schema)), ) .with_file(partitioned_file) .build(); @@ -2279,11 +2272,11 @@ mod tests { let size_hint_calls = reader_factory.metadata_size_hint_calls.clone(); let source = Arc::new( - ParquetSource::default() + ParquetSource::new(Arc::clone(&schema)) .with_parquet_file_reader_factory(reader_factory) .with_metadata_size_hint(456), ); - let config = FileScanConfigBuilder::new(store_url, schema, source) + let config = FileScanConfigBuilder::new(store_url, source) .with_file( PartitionedFile { object_meta: ObjectMeta { diff --git a/datafusion/core/src/test/mod.rs b/datafusion/core/src/test/mod.rs index 68f83e7f1f115..bbc85af7d8749 100644 --- a/datafusion/core/src/test/mod.rs +++ b/datafusion/core/src/test/mod.rs @@ -35,12 +35,15 @@ use crate::error::Result; use crate::logical_expr::LogicalPlan; use crate::test_util::{aggr_test_schema, arrow_test_data}; +use datafusion_common::config::CsvOptions; + use arrow::array::{self, Array, ArrayRef, Decimal128Builder, Int32Array}; use arrow::datatypes::{DataType, Field, Schema}; use arrow::record_batch::RecordBatch; #[cfg(feature = "compression")] use datafusion_common::DataFusionError; use datafusion_datasource::source::DataSourceExec; +use datafusion_datasource::TableSchema; #[cfg(feature = "compression")] use bzip2::write::BzEncoder; @@ -92,11 +95,17 @@ pub fn scan_partitioned_csv( FileCompressionType::UNCOMPRESSED, work_dir, )?; - let source = Arc::new(CsvSource::new(true, b'"', b'"')); - let config = - FileScanConfigBuilder::from(partitioned_csv_config(schema, file_groups, source)) - .with_file_compression_type(FileCompressionType::UNCOMPRESSED) - .build(); + let options = CsvOptions { + has_header: Some(true), + delimiter: b',', + quote: b'"', + ..Default::default() + }; + let table_schema = TableSchema::from_file_schema(schema); + let source = Arc::new(CsvSource::new(table_schema.clone()).with_csv_options(options)); + let config = FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)) + .with_file_compression_type(FileCompressionType::UNCOMPRESSED) + .build(); Ok(DataSourceExec::from_data_source(config)) } diff --git a/datafusion/core/src/test_util/parquet.rs b/datafusion/core/src/test_util/parquet.rs index 203d9e97d2a8c..b5213cee3f2df 100644 --- a/datafusion/core/src/test_util/parquet.rs +++ b/datafusion/core/src/test_util/parquet.rs @@ -37,10 +37,8 @@ use 
crate::physical_plan::metrics::MetricsSet; use crate::physical_plan::ExecutionPlan; use crate::prelude::{Expr, SessionConfig, SessionContext}; -use datafusion_datasource::file::FileSource; use datafusion_datasource::file_scan_config::FileScanConfigBuilder; use datafusion_datasource::source::DataSourceExec; -use datafusion_datasource::TableSchema; use object_store::path::Path; use object_store::ObjectMeta; use parquet::arrow::ArrowWriter; @@ -157,20 +155,21 @@ impl TestParquetFile { maybe_filter: Option, ) -> Result> { let parquet_options = ctx.copied_table_options().parquet; - let source = Arc::new(ParquetSource::new(parquet_options.clone())); - let scan_config_builder = FileScanConfigBuilder::new( - self.object_store_url.clone(), - Arc::clone(&self.schema), - source, - ) - .with_file(PartitionedFile { - object_meta: self.object_meta.clone(), - partition_values: vec![], - range: None, - statistics: None, - extensions: None, - metadata_size_hint: None, - }); + let source = Arc::new( + ParquetSource::new(Arc::clone(&self.schema)) + .with_table_parquet_options(parquet_options.clone()), + ); + let scan_config_builder = + FileScanConfigBuilder::new(self.object_store_url.clone(), source).with_file( + PartitionedFile { + object_meta: self.object_meta.clone(), + partition_values: vec![], + range: None, + statistics: None, + extensions: None, + metadata_size_hint: None, + }, + ); let df_schema = Arc::clone(&self.schema).to_dfschema_ref()?; @@ -184,10 +183,10 @@ impl TestParquetFile { create_physical_expr(&filter, &df_schema, &ExecutionProps::default())?; let source = Arc::new( - ParquetSource::new(parquet_options) + ParquetSource::new(Arc::clone(&self.schema)) + .with_table_parquet_options(parquet_options) .with_predicate(Arc::clone(&physical_filter_expr)), - ) - .with_schema(TableSchema::from_file_schema(Arc::clone(&self.schema))); + ); let config = scan_config_builder.with_source(source).build(); let parquet_exec = DataSourceExec::from_data_source(config); diff --git a/datafusion/core/tests/fuzz_cases/pruning.rs b/datafusion/core/tests/fuzz_cases/pruning.rs index f8bd4dbc1a768..51ec8f03e5d21 100644 --- a/datafusion/core/tests/fuzz_cases/pruning.rs +++ b/datafusion/core/tests/fuzz_cases/pruning.rs @@ -276,13 +276,12 @@ async fn execute_with_predicate( ctx: &SessionContext, ) -> Vec { let parquet_source = if prune_stats { - ParquetSource::default().with_predicate(predicate.clone()) + ParquetSource::new(schema.clone()).with_predicate(predicate.clone()) } else { - ParquetSource::default() + ParquetSource::new(schema.clone()) }; let config = FileScanConfigBuilder::new( ObjectStoreUrl::parse("memory://").unwrap(), - schema.clone(), Arc::new(parquet_source), ) .with_file_group( diff --git a/datafusion/core/tests/parquet/custom_reader.rs b/datafusion/core/tests/parquet/custom_reader.rs index 3a1f06656236c..0a147d15a6fd1 100644 --- a/datafusion/core/tests/parquet/custom_reader.rs +++ b/datafusion/core/tests/parquet/custom_reader.rs @@ -80,7 +80,7 @@ async fn route_data_access_ops_to_parquet_file_reader_factory() { .collect(); let source = Arc::new( - ParquetSource::default() + ParquetSource::new(file_schema.clone()) // prepare the scan .with_parquet_file_reader_factory(Arc::new( InMemoryParquetFileReaderFactory(Arc::clone(&in_memory_object_store)), @@ -89,7 +89,6 @@ async fn route_data_access_ops_to_parquet_file_reader_factory() { let base_config = FileScanConfigBuilder::new( // just any url that doesn't point to in memory object store ObjectStoreUrl::local_filesystem(), - file_schema, source, ) 
.with_file_group(file_group) diff --git a/datafusion/core/tests/parquet/external_access_plan.rs b/datafusion/core/tests/parquet/external_access_plan.rs index 5135f956852c3..b35cb6e09cfb8 100644 --- a/datafusion/core/tests/parquet/external_access_plan.rs +++ b/datafusion/core/tests/parquet/external_access_plan.rs @@ -355,11 +355,11 @@ impl TestFull { let source = if let Some(predicate) = predicate { let df_schema = DFSchema::try_from(schema.clone())?; let predicate = ctx.create_physical_expr(predicate, &df_schema)?; - Arc::new(ParquetSource::default().with_predicate(predicate)) + Arc::new(ParquetSource::new(schema.clone()).with_predicate(predicate)) } else { - Arc::new(ParquetSource::default()) + Arc::new(ParquetSource::new(schema.clone())) }; - let config = FileScanConfigBuilder::new(object_store_url, schema.clone(), source) + let config = FileScanConfigBuilder::new(object_store_url, source) .with_file(partitioned_file) .build(); diff --git a/datafusion/core/tests/parquet/page_pruning.rs b/datafusion/core/tests/parquet/page_pruning.rs index 27bee10234b57..fb2a196b0aa65 100644 --- a/datafusion/core/tests/parquet/page_pruning.rs +++ b/datafusion/core/tests/parquet/page_pruning.rs @@ -81,12 +81,12 @@ async fn get_parquet_exec( let predicate = create_physical_expr(&filter, &df_schema, &execution_props).unwrap(); let source = Arc::new( - ParquetSource::default() + ParquetSource::new(schema.clone()) .with_predicate(predicate) .with_enable_page_index(true) .with_pushdown_filters(pushdown_filters), ); - let base_config = FileScanConfigBuilder::new(object_store_url, schema, source) + let base_config = FileScanConfigBuilder::new(object_store_url, source) .with_file(partitioned_file) .build(); diff --git a/datafusion/core/tests/parquet/schema_adapter.rs b/datafusion/core/tests/parquet/schema_adapter.rs index 40fc6176e212b..0e76d626aac5b 100644 --- a/datafusion/core/tests/parquet/schema_adapter.rs +++ b/datafusion/core/tests/parquet/schema_adapter.rs @@ -482,7 +482,7 @@ fn test_apply_schema_adapter_with_factory() { ])); // Create a parquet source - let source = ParquetSource::default(); + let source = ParquetSource::new(schema.clone()); // Create a file scan config with source that has a schema adapter factory let factory = Arc::new(PrefixAdapterFactory { @@ -491,12 +491,9 @@ fn test_apply_schema_adapter_with_factory() { let file_source = source.clone().with_schema_adapter_factory(factory).unwrap(); - let config = FileScanConfigBuilder::new( - ObjectStoreUrl::local_filesystem(), - schema.clone(), - file_source, - ) - .build(); + let config = + FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), file_source) + .build(); // Apply schema adapter to a new source let result_source = source.apply_schema_adapter(&config).unwrap(); @@ -532,18 +529,15 @@ fn test_apply_schema_adapter_without_factory() { ])); // Create a parquet source - let source = ParquetSource::default(); + let source = ParquetSource::new(schema.clone()); // Convert to Arc let file_source: Arc = Arc::new(source.clone()); // Create a file scan config without a schema adapter factory - let config = FileScanConfigBuilder::new( - ObjectStoreUrl::local_filesystem(), - schema.clone(), - file_source, - ) - .build(); + let config = + FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), file_source) + .build(); // Apply schema adapter function - should pass through the source unchanged let result_source = source.apply_schema_adapter(&config).unwrap(); diff --git a/datafusion/core/tests/parquet/schema_coercion.rs 
b/datafusion/core/tests/parquet/schema_coercion.rs index 9be391a9108e6..51e5242cbafd7 100644 --- a/datafusion/core/tests/parquet/schema_coercion.rs +++ b/datafusion/core/tests/parquet/schema_coercion.rs @@ -62,14 +62,10 @@ async fn multi_parquet_coercion() { Field::new("c2", DataType::Int32, true), Field::new("c3", DataType::Float64, true), ])); - let source = Arc::new(ParquetSource::default()); - let conf = FileScanConfigBuilder::new( - ObjectStoreUrl::local_filesystem(), - file_schema, - source, - ) - .with_file_group(file_group) - .build(); + let source = Arc::new(ParquetSource::new(file_schema.clone())); + let conf = FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), source) + .with_file_group(file_group) + .build(); let parquet_exec = DataSourceExec::from_data_source(conf); @@ -122,8 +118,7 @@ async fn multi_parquet_coercion_projection() { ])); let config = FileScanConfigBuilder::new( ObjectStoreUrl::local_filesystem(), - file_schema, - Arc::new(ParquetSource::default()), + Arc::new(ParquetSource::new(file_schema)), ) .with_file_group(file_group) .with_projection_indices(Some(vec![1, 0, 2])) diff --git a/datafusion/core/tests/physical_optimizer/enforce_distribution.rs b/datafusion/core/tests/physical_optimizer/enforce_distribution.rs index 5b7d9ac8fbe99..f0f610dfba4fd 100644 --- a/datafusion/core/tests/physical_optimizer/enforce_distribution.rs +++ b/datafusion/core/tests/physical_optimizer/enforce_distribution.rs @@ -37,6 +37,7 @@ use datafusion::datasource::physical_plan::{CsvSource, ParquetSource}; use datafusion::datasource::source::DataSourceExec; use datafusion::datasource::MemTable; use datafusion::prelude::{SessionConfig, SessionContext}; +use datafusion_common::config::CsvOptions; use datafusion_common::error::Result; use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; use datafusion_common::ScalarValue; @@ -229,8 +230,7 @@ fn parquet_exec_multiple_sorted( ) -> Arc { let config = FileScanConfigBuilder::new( ObjectStoreUrl::parse("test:///").unwrap(), - schema(), - Arc::new(ParquetSource::default()), + Arc::new(ParquetSource::new(schema())), ) .with_file_groups(vec![ FileGroup::new(vec![PartitionedFile::new("x".to_string(), 100)]), @@ -247,14 +247,19 @@ fn csv_exec() -> Arc { } fn csv_exec_with_sort(output_ordering: Vec) -> Arc { - let config = FileScanConfigBuilder::new( - ObjectStoreUrl::parse("test:///").unwrap(), - schema(), - Arc::new(CsvSource::new(false, b',', b'"')), - ) - .with_file(PartitionedFile::new("x".to_string(), 100)) - .with_output_ordering(output_ordering) - .build(); + let config = + FileScanConfigBuilder::new(ObjectStoreUrl::parse("test:///").unwrap(), { + let options = CsvOptions { + has_header: Some(false), + delimiter: b',', + quote: b'"', + ..Default::default() + }; + Arc::new(CsvSource::new(schema()).with_csv_options(options)) + }) + .with_file(PartitionedFile::new("x".to_string(), 100)) + .with_output_ordering(output_ordering) + .build(); DataSourceExec::from_data_source(config) } @@ -265,17 +270,22 @@ fn csv_exec_multiple() -> Arc { // Created a sorted parquet exec with multiple files fn csv_exec_multiple_sorted(output_ordering: Vec) -> Arc { - let config = FileScanConfigBuilder::new( - ObjectStoreUrl::parse("test:///").unwrap(), - schema(), - Arc::new(CsvSource::new(false, b',', b'"')), - ) - .with_file_groups(vec![ - FileGroup::new(vec![PartitionedFile::new("x".to_string(), 100)]), - FileGroup::new(vec![PartitionedFile::new("y".to_string(), 100)]), - ]) - .with_output_ordering(output_ordering) - .build(); + 
let config = + FileScanConfigBuilder::new(ObjectStoreUrl::parse("test:///").unwrap(), { + let options = CsvOptions { + has_header: Some(false), + delimiter: b',', + quote: b'"', + ..Default::default() + }; + Arc::new(CsvSource::new(schema()).with_csv_options(options)) + }) + .with_file_groups(vec![ + FileGroup::new(vec![PartitionedFile::new("x".to_string(), 100)]), + FileGroup::new(vec![PartitionedFile::new("y".to_string(), 100)]), + ]) + .with_output_ordering(output_ordering) + .build(); DataSourceExec::from_data_source(config) } @@ -2597,11 +2607,15 @@ fn parallelization_compressed_csv() -> Result<()> { for compression_type in compression_types { let plan = aggregate_exec_with_alias( DataSourceExec::from_data_source( - FileScanConfigBuilder::new( - ObjectStoreUrl::parse("test:///").unwrap(), - schema(), - Arc::new(CsvSource::new(false, b',', b'"')), - ) + FileScanConfigBuilder::new(ObjectStoreUrl::parse("test:///").unwrap(), { + let options = CsvOptions { + has_header: Some(false), + delimiter: b',', + quote: b'"', + ..Default::default() + }; + Arc::new(CsvSource::new(schema()).with_csv_options(options)) + }) .with_file(PartitionedFile::new("x".to_string(), 100)) .with_file_compression_type(compression_type) .build(), diff --git a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs index e3a0eb7e1aa6f..c0cfa46733f18 100644 --- a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs +++ b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs @@ -31,7 +31,7 @@ use crate::physical_optimizer::test_utils::{ use arrow::compute::SortOptions; use arrow::datatypes::{DataType, SchemaRef}; -use datafusion_common::config::ConfigOptions; +use datafusion_common::config::{ConfigOptions, CsvOptions}; use datafusion_common::tree_node::{TreeNode, TransformedResult}; use datafusion_common::{Result, TableReference}; use datafusion_datasource::file_scan_config::FileScanConfigBuilder; @@ -72,10 +72,15 @@ fn csv_exec_sorted( schema: &SchemaRef, sort_exprs: impl IntoIterator, ) -> Arc { + let options = CsvOptions { + has_header: Some(false), + delimiter: 0, + quote: 0, + ..Default::default() + }; let mut builder = FileScanConfigBuilder::new( ObjectStoreUrl::parse("test:///").unwrap(), - schema.clone(), - Arc::new(CsvSource::new(false, 0, 0)), + Arc::new(CsvSource::new(schema.clone()).with_csv_options(options)), ) .with_file(PartitionedFile::new("x".to_string(), 100)); if let Some(ordering) = LexOrdering::new(sort_exprs) { diff --git a/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs b/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs index de61149508904..31909415a286d 100644 --- a/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs +++ b/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs @@ -859,20 +859,17 @@ async fn test_topk_filter_passes_through_coalesce_partitions() { ]; // Create a source that supports all batches - let source = Arc::new(TestSource::new(true, batches)); - - let base_config = FileScanConfigBuilder::new( - ObjectStoreUrl::parse("test://").unwrap(), - Arc::clone(&schema()), - source, - ) - .with_file_groups(vec![ - // Partition 0 - FileGroup::new(vec![PartitionedFile::new("test1.parquet", 123)]), - // Partition 1 - FileGroup::new(vec![PartitionedFile::new("test2.parquet", 123)]), - ]) - .build(); + let source = Arc::new(TestSource::new(schema(), true, batches)); + + let base_config = + FileScanConfigBuilder::new(ObjectStoreUrl::parse("test://").unwrap(), 
source) + .with_file_groups(vec![ + // Partition 0 + FileGroup::new(vec![PartitionedFile::new("test1.parquet", 123)]), + // Partition 1 + FileGroup::new(vec![PartitionedFile::new("test2.parquet", 123)]), + ]) + .build(); let scan = DataSourceExec::from_data_source(base_config); diff --git a/datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs b/datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs index 7d8a9c7c2125c..2bd70221f41e1 100644 --- a/datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs +++ b/datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs @@ -24,7 +24,6 @@ use datafusion_datasource::{ file_scan_config::FileScanConfigBuilder, file_stream::FileOpenFuture, file_stream::FileOpener, schema_adapter::DefaultSchemaAdapterFactory, schema_adapter::SchemaAdapterFactory, source::DataSourceExec, PartitionedFile, - TableSchema, }; use datafusion_physical_expr_common::physical_expr::fmt_sql; use datafusion_physical_optimizer::PhysicalOptimizerRule; @@ -53,7 +52,7 @@ use std::{ pub struct TestOpener { batches: Vec, batch_size: Option, - schema: Option, + schema: SchemaRef, projection: Option>, predicate: Option>, } @@ -71,23 +70,23 @@ impl FileOpener for TestOpener { } batches = new_batches.into_iter().collect(); } - if let Some(schema) = &self.schema { - let factory = DefaultSchemaAdapterFactory::from_schema(Arc::clone(schema)); - let (mapper, projection) = factory.map_schema(&batches[0].schema()).unwrap(); - let mut new_batches = Vec::new(); - for batch in batches { - let batch = if let Some(predicate) = &self.predicate { - batch_filter(&batch, predicate)? - } else { - batch - }; - let batch = batch.project(&projection).unwrap(); - let batch = mapper.map_batch(batch).unwrap(); - new_batches.push(batch); - } - batches = new_batches; + let factory = DefaultSchemaAdapterFactory::from_schema(Arc::clone(&self.schema)); + let (mapper, projection) = factory.map_schema(&batches[0].schema()).unwrap(); + let mut new_batches = Vec::new(); + for batch in batches { + let batch = if let Some(predicate) = &self.predicate { + batch_filter(&batch, predicate)? 
+ } else { + batch + }; + + let batch = batch.project(&projection).unwrap(); + let batch = mapper.map_batch(batch).unwrap(); + new_batches.push(batch); } + batches = new_batches; + if let Some(projection) = &self.projection { batches = batches .into_iter() @@ -102,26 +101,35 @@ impl FileOpener for TestOpener { } /// A placeholder data source that accepts filter pushdown -#[derive(Clone, Default)] +#[derive(Clone)] pub struct TestSource { support: bool, predicate: Option>, statistics: Option, batch_size: Option, batches: Vec, - schema: Option, + schema: SchemaRef, metrics: ExecutionPlanMetricsSet, projection: Option>, schema_adapter_factory: Option>, + table_schema: datafusion_datasource::TableSchema, } impl TestSource { - pub fn new(support: bool, batches: Vec) -> Self { + pub fn new(schema: SchemaRef, support: bool, batches: Vec) -> Self { + let table_schema = + datafusion_datasource::TableSchema::new(Arc::clone(&schema), vec![]); Self { + schema, support, metrics: ExecutionPlanMetricsSet::new(), batches, - ..Default::default() + predicate: None, + statistics: None, + batch_size: None, + projection: None, + schema_adapter_factory: None, + table_schema, } } } @@ -136,7 +144,7 @@ impl FileSource for TestSource { Arc::new(TestOpener { batches: self.batches.clone(), batch_size: self.batch_size, - schema: self.schema.clone(), + schema: Arc::clone(&self.schema), projection: self.projection.clone(), predicate: self.predicate.clone(), }) @@ -157,17 +165,6 @@ impl FileSource for TestSource { }) } - fn with_schema(&self, schema: TableSchema) -> Arc { - assert!( - schema.table_partition_cols().is_empty(), - "TestSource does not support partition columns" - ); - Arc::new(TestSource { - schema: Some(schema.file_schema().clone()), - ..self.clone() - }) - } - fn with_projection(&self, config: &FileScanConfig) -> Arc { Arc::new(TestSource { projection: config.projection_exprs.as_ref().map(|p| p.column_indices()), @@ -260,6 +257,10 @@ impl FileSource for TestSource { fn schema_adapter_factory(&self) -> Option> { self.schema_adapter_factory.clone() } + + fn table_schema(&self) -> &datafusion_datasource::TableSchema { + &self.table_schema + } } #[derive(Debug, Clone)] @@ -289,14 +290,15 @@ impl TestScanBuilder { } pub fn build(self) -> Arc { - let source = Arc::new(TestSource::new(self.support, self.batches)); - let base_config = FileScanConfigBuilder::new( - ObjectStoreUrl::parse("test://").unwrap(), + let source = Arc::new(TestSource::new( Arc::clone(&self.schema), - source, - ) - .with_file(PartitionedFile::new("test.parquet", 123)) - .build(); + self.support, + self.batches, + )); + let base_config = + FileScanConfigBuilder::new(ObjectStoreUrl::parse("test://").unwrap(), source) + .with_file(PartitionedFile::new("test.parquet", 123)) + .build(); DataSourceExec::from_data_source(base_config) } } diff --git a/datafusion/core/tests/physical_optimizer/projection_pushdown.rs b/datafusion/core/tests/physical_optimizer/projection_pushdown.rs index 8631613c3925e..9d39a80fb9df6 100644 --- a/datafusion/core/tests/physical_optimizer/projection_pushdown.rs +++ b/datafusion/core/tests/physical_optimizer/projection_pushdown.rs @@ -24,9 +24,10 @@ use datafusion::datasource::listing::PartitionedFile; use datafusion::datasource::memory::MemorySourceConfig; use datafusion::datasource::physical_plan::CsvSource; use datafusion::datasource::source::DataSourceExec; -use datafusion_common::config::ConfigOptions; +use datafusion_common::config::{ConfigOptions, CsvOptions}; use datafusion_common::{JoinSide, JoinType, 
NullEquality, Result, ScalarValue}; use datafusion_datasource::file_scan_config::FileScanConfigBuilder; +use datafusion_datasource::TableSchema; use datafusion_execution::object_store::ObjectStoreUrl; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use datafusion_expr::{ @@ -384,14 +385,19 @@ fn create_simple_csv_exec() -> Arc { Field::new("d", DataType::Int32, true), Field::new("e", DataType::Int32, true), ])); - let config = FileScanConfigBuilder::new( - ObjectStoreUrl::parse("test:///").unwrap(), - schema, - Arc::new(CsvSource::new(false, 0, 0)), - ) - .with_file(PartitionedFile::new("x".to_string(), 100)) - .with_projection_indices(Some(vec![0, 1, 2, 3, 4])) - .build(); + let config = + FileScanConfigBuilder::new(ObjectStoreUrl::parse("test:///").unwrap(), { + let options = CsvOptions { + has_header: Some(false), + delimiter: 0, + quote: 0, + ..Default::default() + }; + Arc::new(CsvSource::new(schema.clone()).with_csv_options(options)) + }) + .with_file(PartitionedFile::new("x".to_string(), 100)) + .with_projection_indices(Some(vec![0, 1, 2, 3, 4])) + .build(); DataSourceExec::from_data_source(config) } @@ -403,14 +409,19 @@ fn create_projecting_csv_exec() -> Arc { Field::new("c", DataType::Int32, true), Field::new("d", DataType::Int32, true), ])); - let config = FileScanConfigBuilder::new( - ObjectStoreUrl::parse("test:///").unwrap(), - schema, - Arc::new(CsvSource::new(false, 0, 0)), - ) - .with_file(PartitionedFile::new("x".to_string(), 100)) - .with_projection_indices(Some(vec![3, 2, 1])) - .build(); + let config = + FileScanConfigBuilder::new(ObjectStoreUrl::parse("test:///").unwrap(), { + let options = CsvOptions { + has_header: Some(false), + delimiter: 0, + quote: 0, + ..Default::default() + }; + Arc::new(CsvSource::new(schema.clone()).with_csv_options(options)) + }) + .with_file(PartitionedFile::new("x".to_string(), 100)) + .with_projection_indices(Some(vec![3, 2, 1])) + .build(); DataSourceExec::from_data_source(config) } @@ -1589,13 +1600,21 @@ fn partitioned_data_source() -> Arc { Field::new("string_col", DataType::Utf8, true), ])); + let options = CsvOptions { + has_header: Some(false), + delimiter: b',', + quote: b'"', + ..Default::default() + }; + let table_schema = TableSchema::new( + Arc::clone(&file_schema), + vec![Arc::new(Field::new("partition_col", DataType::Utf8, true))], + ); let config = FileScanConfigBuilder::new( ObjectStoreUrl::parse("test:///").unwrap(), - file_schema.clone(), - Arc::new(CsvSource::default()), + Arc::new(CsvSource::new(table_schema).with_csv_options(options)), ) .with_file(PartitionedFile::new("x".to_string(), 100)) - .with_table_partition_cols(vec![Field::new("partition_col", DataType::Utf8, true)]) .with_projection_indices(Some(vec![0, 1, 2])) .build(); diff --git a/datafusion/core/tests/physical_optimizer/test_utils.rs b/datafusion/core/tests/physical_optimizer/test_utils.rs index 8ca33f3d4abb9..60fec2243621d 100644 --- a/datafusion/core/tests/physical_optimizer/test_utils.rs +++ b/datafusion/core/tests/physical_optimizer/test_utils.rs @@ -73,8 +73,7 @@ use datafusion_physical_plan::{ pub fn parquet_exec(schema: SchemaRef) -> Arc { let config = FileScanConfigBuilder::new( ObjectStoreUrl::parse("test:///").unwrap(), - schema, - Arc::new(ParquetSource::default()), + Arc::new(ParquetSource::new(schema)), ) .with_file(PartitionedFile::new("x".to_string(), 100)) .build(); @@ -89,8 +88,7 @@ pub(crate) fn parquet_exec_with_sort( ) -> Arc { let config = FileScanConfigBuilder::new( ObjectStoreUrl::parse("test:///").unwrap(), - 
schema, - Arc::new(ParquetSource::default()), + Arc::new(ParquetSource::new(schema)), ) .with_file(PartitionedFile::new("x".to_string(), 100)) .with_output_ordering(output_ordering) @@ -127,8 +125,7 @@ pub(crate) fn parquet_exec_with_stats(file_size: u64) -> Arc { let config = FileScanConfigBuilder::new( ObjectStoreUrl::parse("test:///").unwrap(), - schema(), - Arc::new(ParquetSource::new(Default::default())), + Arc::new(ParquetSource::new(schema())), ) .with_file(PartitionedFile::new("x".to_string(), file_size)) .with_statistics(statistics) diff --git a/datafusion/core/tests/schema_adapter/schema_adapter_integration_tests.rs b/datafusion/core/tests/schema_adapter/schema_adapter_integration_tests.rs index c3c92a9028d67..0b093485c1ce1 100644 --- a/datafusion/core/tests/schema_adapter/schema_adapter_integration_tests.rs +++ b/datafusion/core/tests/schema_adapter/schema_adapter_integration_tests.rs @@ -27,12 +27,14 @@ use datafusion::datasource::physical_plan::{ }; use datafusion::physical_plan::ExecutionPlan; use datafusion::prelude::SessionContext; +use datafusion_common::config::CsvOptions; use datafusion_common::ColumnStatistics; use datafusion_datasource::file_scan_config::FileScanConfigBuilder; use datafusion_datasource::schema_adapter::{ SchemaAdapter, SchemaAdapterFactory, SchemaMapper, }; use datafusion_datasource::source::DataSourceExec; +use datafusion_datasource::TableSchema; use datafusion_execution::object_store::ObjectStoreUrl; use object_store::{memory::InMemory, path::Path, ObjectStore}; use parquet::arrow::ArrowWriter; @@ -182,17 +184,17 @@ async fn test_parquet_integration_with_schema_adapter() -> Result<()> { let ctx = SessionContext::new(); ctx.register_object_store(store_url.as_ref(), Arc::clone(&store)); - // Create a ParquetSource with the adapter factory - let file_source = ParquetSource::default() - .with_schema_adapter_factory(Arc::new(UppercaseAdapterFactory {}))?; - // Create a table schema with uppercase column names let table_schema = Arc::new(Schema::new(vec![ Field::new("ID", DataType::Int32, false), Field::new("NAME", DataType::Utf8, true), ])); - let config = FileScanConfigBuilder::new(store_url, table_schema.clone(), file_source) + // Create a ParquetSource with the adapter factory + let file_source = ParquetSource::new(table_schema.clone()) + .with_schema_adapter_factory(Arc::new(UppercaseAdapterFactory {}))?; + + let config = FileScanConfigBuilder::new(store_url, file_source) .with_file(PartitionedFile::new(path, file_size)) .build(); @@ -245,10 +247,10 @@ async fn test_parquet_integration_with_schema_adapter_and_expression_rewriter( ctx.register_object_store(store_url.as_ref(), Arc::clone(&store)); // Create a ParquetSource with the adapter factory - let file_source = ParquetSource::default() + let file_source = ParquetSource::new(batch.schema()) .with_schema_adapter_factory(Arc::new(UppercaseAdapterFactory {}))?; - let config = FileScanConfigBuilder::new(store_url, batch.schema(), file_source) + let config = FileScanConfigBuilder::new(store_url, file_source) .with_file(PartitionedFile::new(path, file_size)) .build(); @@ -284,7 +286,10 @@ async fn test_multi_source_schema_adapter_reuse() -> Result<()> { // Test ArrowSource { - let source = ArrowSource::default(); + let schema = + Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); + let table_schema = TableSchema::new(schema, vec![]); + let source = ArrowSource::new(table_schema); let source_with_adapter = source .clone() .with_schema_adapter_factory(factory.clone()) @@ -304,7 +309,9 
@@ async fn test_multi_source_schema_adapter_reuse() -> Result<()> { // Test ParquetSource #[cfg(feature = "parquet")] { - let source = ParquetSource::default(); + let schema = + Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); + let source = ParquetSource::new(schema); let source_with_adapter = source .clone() .with_schema_adapter_factory(factory.clone()) @@ -323,7 +330,15 @@ async fn test_multi_source_schema_adapter_reuse() -> Result<()> { // Test CsvSource { - let source = CsvSource::default(); + let schema = + Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); + let options = CsvOptions { + has_header: Some(true), + delimiter: b',', + quote: b'"', + ..Default::default() + }; + let source = CsvSource::new(schema).with_csv_options(options); let source_with_adapter = source .clone() .with_schema_adapter_factory(factory.clone()) @@ -342,7 +357,10 @@ async fn test_multi_source_schema_adapter_reuse() -> Result<()> { // Test JsonSource { - let source = JsonSource::default(); + let schema = + Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); + let table_schema = TableSchema::new(schema, vec![]); + let source = JsonSource::new(table_schema); let source_with_adapter = source .clone() .with_schema_adapter_factory(factory.clone()) diff --git a/datafusion/datasource-arrow/src/file_format.rs b/datafusion/datasource-arrow/src/file_format.rs index 3b85640804219..dc1f5cf72da7f 100644 --- a/datafusion/datasource-arrow/src/file_format.rs +++ b/datafusion/datasource-arrow/src/file_format.rs @@ -45,6 +45,7 @@ use datafusion_datasource::sink::{DataSink, DataSinkExec}; use datafusion_datasource::write::{ get_writer_schema, ObjectWriterBuilder, SharedBuffer, }; +use datafusion_datasource::TableSchema; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use datafusion_expr::dml::InsertOp; use datafusion_physical_expr_common::sort_expr::LexRequirement; @@ -178,7 +179,11 @@ impl FileFormat for ArrowFormat { _state: &dyn Session, conf: FileScanConfig, ) -> Result> { - let source = Arc::new(ArrowSource::default()); + let table_schema = TableSchema::new( + Arc::clone(conf.file_schema()), + conf.table_partition_cols().clone(), + ); + let source = Arc::new(ArrowSource::new(table_schema)); let config = FileScanConfigBuilder::from(conf) .with_source(source) .build(); @@ -202,8 +207,8 @@ impl FileFormat for ArrowFormat { Ok(Arc::new(DataSinkExec::new(input, sink, order_requirements)) as _) } - fn file_source(&self) -> Arc { - Arc::new(ArrowSource::default()) + fn file_source(&self, table_schema: TableSchema) -> Arc { + Arc::new(ArrowSource::new(table_schema)) } } diff --git a/datafusion/datasource-arrow/src/source.rs b/datafusion/datasource-arrow/src/source.rs index f254b7e3ff30f..b3253d43f49a8 100644 --- a/datafusion/datasource-arrow/src/source.rs +++ b/datafusion/datasource-arrow/src/source.rs @@ -20,7 +20,6 @@ use std::sync::Arc; use datafusion_datasource::as_file_source; use datafusion_datasource::schema_adapter::SchemaAdapterFactory; -use datafusion_datasource::TableSchema; use arrow::buffer::Buffer; use arrow_ipc::reader::FileDecoder; @@ -39,13 +38,26 @@ use object_store::{GetOptions, GetRange, GetResultPayload, ObjectStore}; /// Arrow configuration struct that is given to DataSourceExec /// Does not hold anything special, since [`FileScanConfig`] is sufficient for arrow -#[derive(Clone, Default)] +#[derive(Clone)] pub struct ArrowSource { + table_schema: datafusion_datasource::TableSchema, metrics: ExecutionPlanMetricsSet, 
projected_statistics: Option, schema_adapter_factory: Option>, } +impl ArrowSource { + /// Initialize an ArrowSource with the provided schema + pub fn new(table_schema: impl Into) -> Self { + Self { + table_schema: table_schema.into(), + metrics: ExecutionPlanMetricsSet::new(), + projected_statistics: None, + schema_adapter_factory: None, + } + } +} + impl From for Arc { fn from(source: ArrowSource) -> Self { as_file_source(source) @@ -69,13 +81,14 @@ impl FileSource for ArrowSource { self } - fn with_batch_size(&self, _batch_size: usize) -> Arc { - Arc::new(Self { ..self.clone() }) + fn table_schema(&self) -> &datafusion_datasource::TableSchema { + &self.table_schema } - fn with_schema(&self, _schema: TableSchema) -> Arc { + fn with_batch_size(&self, _batch_size: usize) -> Arc { Arc::new(Self { ..self.clone() }) } + fn with_statistics(&self, statistics: Statistics) -> Arc { let mut conf = self.clone(); conf.projected_statistics = Some(statistics); diff --git a/datafusion/datasource-avro/src/file_format.rs b/datafusion/datasource-avro/src/file_format.rs index 60c361b42e771..50aecf97b299f 100644 --- a/datafusion/datasource-avro/src/file_format.rs +++ b/datafusion/datasource-avro/src/file_format.rs @@ -154,13 +154,17 @@ impl FileFormat for AvroFormat { _state: &dyn Session, conf: FileScanConfig, ) -> Result> { + let file_schema = Arc::clone(conf.file_schema()); let config = FileScanConfigBuilder::from(conf) - .with_source(self.file_source()) + .with_source(Arc::new(AvroSource::new(file_schema))) .build(); Ok(DataSourceExec::from_data_source(config)) } - fn file_source(&self) -> Arc { - Arc::new(AvroSource::new()) + fn file_source( + &self, + table_schema: datafusion_datasource::TableSchema, + ) -> Arc { + Arc::new(AvroSource::new(table_schema)) } } diff --git a/datafusion/datasource-avro/src/source.rs b/datafusion/datasource-avro/src/source.rs index 1ff73d2c3cc39..9859e11e25d22 100644 --- a/datafusion/datasource-avro/src/source.rs +++ b/datafusion/datasource-avro/src/source.rs @@ -22,7 +22,6 @@ use std::sync::Arc; use crate::avro_to_arrow::Reader as AvroReader; -use arrow::datatypes::SchemaRef; use datafusion_common::error::Result; use datafusion_common::Statistics; use datafusion_datasource::file::FileSource; @@ -36,9 +35,9 @@ use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; use object_store::ObjectStore; /// AvroSource holds the extra configuration that is necessary for opening avro files -#[derive(Clone, Default)] +#[derive(Clone)] pub struct AvroSource { - schema: Option, + table_schema: TableSchema, batch_size: Option, projection: Option>, metrics: ExecutionPlanMetricsSet, @@ -47,15 +46,22 @@ pub struct AvroSource { } impl AvroSource { - /// Initialize an AvroSource with default values - pub fn new() -> Self { - Self::default() + /// Initialize an AvroSource with the provided schema + pub fn new(table_schema: impl Into) -> Self { + Self { + table_schema: table_schema.into(), + batch_size: None, + projection: None, + metrics: ExecutionPlanMetricsSet::new(), + projected_statistics: None, + schema_adapter_factory: None, + } } fn open(&self, reader: R) -> Result> { AvroReader::try_new( reader, - Arc::clone(self.schema.as_ref().expect("Schema must set before open")), + Arc::clone(self.table_schema.file_schema()), self.batch_size.expect("Batch size must set before open"), self.projection.clone(), ) @@ -79,16 +85,13 @@ impl FileSource for AvroSource { self } - fn with_batch_size(&self, batch_size: usize) -> Arc { - let mut conf = self.clone(); - conf.batch_size = 
Some(batch_size); - Arc::new(conf) + fn table_schema(&self) -> &TableSchema { + &self.table_schema } - fn with_schema(&self, schema: TableSchema) -> Arc { + fn with_batch_size(&self, batch_size: usize) -> Arc { let mut conf = self.clone(); - // TableSchema may have partition columns, but AvroSource does not use partition columns or values atm - conf.schema = Some(Arc::clone(schema.file_schema())); + conf.batch_size = Some(batch_size); Arc::new(conf) } diff --git a/datafusion/datasource-csv/src/file_format.rs b/datafusion/datasource-csv/src/file_format.rs index 1c39893b23c85..6b27687a56f7b 100644 --- a/datafusion/datasource-csv/src/file_format.rs +++ b/datafusion/datasource-csv/src/file_format.rs @@ -48,6 +48,7 @@ use datafusion_datasource::sink::{DataSink, DataSinkExec}; use datafusion_datasource::write::demux::DemuxedStreamReceiver; use datafusion_datasource::write::orchestration::spawn_writer_tasks_and_join; use datafusion_datasource::write::BatchSerializer; +use datafusion_datasource::TableSchema; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use datafusion_expr::dml::InsertOp; use datafusion_physical_expr_common::sort_expr::LexRequirement; @@ -434,20 +435,23 @@ impl FileFormat for CsvFormat { .newlines_in_values .unwrap_or_else(|| state.config_options().catalog.newlines_in_values); - let conf_builder = FileScanConfigBuilder::from(conf) - .with_file_compression_type(self.options.compression.into()) - .with_newlines_in_values(newlines_in_values); + let mut csv_options = self.options.clone(); + csv_options.has_header = Some(has_header); - let truncated_rows = self.options.truncated_rows.unwrap_or(false); - let source = Arc::new( - CsvSource::new(has_header, self.options.delimiter, self.options.quote) - .with_escape(self.options.escape) - .with_terminator(self.options.terminator) - .with_comment(self.options.comment) - .with_truncate_rows(truncated_rows), - ); + // Get the existing CsvSource and update its options + // We need to preserve the table_schema from the original source (which includes partition columns) + let csv_source = conf + .file_source + .as_any() + .downcast_ref::() + .expect("file_source should be a CsvSource"); + let source = Arc::new(csv_source.clone().with_csv_options(csv_options)); - let config = conf_builder.with_source(source).build(); + let config = FileScanConfigBuilder::from(conf) + .with_file_compression_type(self.options.compression.into()) + .with_newlines_in_values(newlines_in_values) + .with_source(source) + .build(); Ok(DataSourceExec::from_data_source(config)) } @@ -489,8 +493,12 @@ impl FileFormat for CsvFormat { Ok(Arc::new(DataSinkExec::new(input, sink, order_requirements)) as _) } - fn file_source(&self) -> Arc { - Arc::new(CsvSource::default()) + fn file_source(&self, table_schema: TableSchema) -> Arc { + let mut csv_options = self.options.clone(); + if csv_options.has_header.is_none() { + csv_options.has_header = Some(true); + } + Arc::new(CsvSource::new(table_schema).with_csv_options(csv_options)) } } diff --git a/datafusion/datasource-csv/src/mod.rs b/datafusion/datasource-csv/src/mod.rs index 90538d0808b1a..78a916912c93f 100644 --- a/datafusion/datasource-csv/src/mod.rs +++ b/datafusion/datasource-csv/src/mod.rs @@ -24,7 +24,6 @@ pub mod source; use std::sync::Arc; -use arrow::datatypes::SchemaRef; use datafusion_datasource::file_groups::FileGroup; use datafusion_datasource::file_scan_config::FileScanConfigBuilder; use datafusion_datasource::{file::FileSource, file_scan_config::FileScanConfig}; @@ -33,11 +32,10 @@ pub use 
file_format::*; /// Returns a [`FileScanConfig`] for given `file_groups` pub fn partitioned_csv_config( - schema: SchemaRef, file_groups: Vec, file_source: Arc, ) -> FileScanConfig { - FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), schema, file_source) + FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), file_source) .with_file_groups(file_groups) .build() } diff --git a/datafusion/datasource-csv/src/source.rs b/datafusion/datasource-csv/src/source.rs index 0b18571e58bd7..94c6b3810ae21 100644 --- a/datafusion/datasource-csv/src/source.rs +++ b/datafusion/datasource-csv/src/source.rs @@ -33,7 +33,7 @@ use datafusion_datasource::{ }; use arrow::csv; -use arrow::datatypes::SchemaRef; +use datafusion_common::config::CsvOptions; use datafusion_common::{DataFusionError, Result, Statistics}; use datafusion_common_runtime::JoinSet; use datafusion_datasource::file::FileSource; @@ -61,111 +61,118 @@ use tokio::io::AsyncWriteExt; /// # use datafusion_datasource_csv::source::CsvSource; /// # use datafusion_execution::object_store::ObjectStoreUrl; /// # use datafusion_datasource::source::DataSourceExec; +/// # use datafusion_common::config::CsvOptions; /// /// # let object_store_url = ObjectStoreUrl::local_filesystem(); /// # let file_schema = Arc::new(Schema::empty()); /// -/// let source = Arc::new(CsvSource::new( -/// true, -/// b',', -/// b'"', -/// ) -/// .with_terminator(Some(b'#') -/// )); +/// let options = CsvOptions { +/// has_header: Some(true), +/// delimiter: b',', +/// quote: b'"', +/// ..Default::default() +/// }; +/// let source = Arc::new(CsvSource::new(file_schema.clone()) +/// .with_csv_options(options) +/// .with_terminator(Some(b'#')) +/// ); /// // Create a DataSourceExec for reading the first 100MB of `file1.csv` -/// let config = FileScanConfigBuilder::new(object_store_url, file_schema, source) +/// let config = FileScanConfigBuilder::new(object_store_url, source) /// .with_file(PartitionedFile::new("file1.csv", 100*1024*1024)) /// .with_newlines_in_values(true) // The file contains newlines in values; /// .build(); /// let exec = (DataSourceExec::from_data_source(config)); /// ``` -#[derive(Debug, Clone, Default)] +#[derive(Debug, Clone)] pub struct CsvSource { + options: CsvOptions, batch_size: Option, - file_schema: Option, + table_schema: TableSchema, file_projection: Option>, - pub(crate) has_header: bool, - delimiter: u8, - quote: u8, - terminator: Option, - escape: Option, - comment: Option, metrics: ExecutionPlanMetricsSet, projected_statistics: Option, schema_adapter_factory: Option>, - truncate_rows: bool, } impl CsvSource { /// Returns a [`CsvSource`] - pub fn new(has_header: bool, delimiter: u8, quote: u8) -> Self { + pub fn new(table_schema: impl Into) -> Self { Self { - has_header, - delimiter, - quote, - ..Self::default() + options: CsvOptions::default(), + table_schema: table_schema.into(), + batch_size: None, + file_projection: None, + metrics: ExecutionPlanMetricsSet::new(), + projected_statistics: None, + schema_adapter_factory: None, } } + /// Sets the CSV options + pub fn with_csv_options(mut self, options: CsvOptions) -> Self { + self.options = options; + self + } + /// true if the first line of each file is a header pub fn has_header(&self) -> bool { - self.has_header + self.options.has_header.unwrap_or(true) } // true if rows length support truncate pub fn truncate_rows(&self) -> bool { - self.truncate_rows + self.options.truncated_rows.unwrap_or(false) } /// A column delimiter pub fn delimiter(&self) -> u8 { - self.delimiter + 
self.options.delimiter } /// The quote character pub fn quote(&self) -> u8 { - self.quote + self.options.quote } /// The line terminator pub fn terminator(&self) -> Option { - self.terminator + self.options.terminator } /// Lines beginning with this byte are ignored. pub fn comment(&self) -> Option { - self.comment + self.options.comment } /// The escape character pub fn escape(&self) -> Option { - self.escape + self.options.escape } /// Initialize a CsvSource with escape pub fn with_escape(&self, escape: Option) -> Self { let mut conf = self.clone(); - conf.escape = escape; + conf.options.escape = escape; conf } /// Initialize a CsvSource with terminator pub fn with_terminator(&self, terminator: Option) -> Self { let mut conf = self.clone(); - conf.terminator = terminator; + conf.options.terminator = terminator; conf } /// Initialize a CsvSource with comment pub fn with_comment(&self, comment: Option) -> Self { let mut conf = self.clone(); - conf.comment = comment; + conf.options.comment = comment; conf } /// Whether to support truncate rows when read csv file pub fn with_truncate_rows(&self, truncate_rows: bool) -> Self { let mut conf = self.clone(); - conf.truncate_rows = truncate_rows; + conf.options.truncated_rows = Some(truncate_rows); conf } } @@ -176,29 +183,26 @@ impl CsvSource { } fn builder(&self) -> csv::ReaderBuilder { - let mut builder = csv::ReaderBuilder::new(Arc::clone( - self.file_schema - .as_ref() - .expect("Schema must be set before initializing builder"), - )) - .with_delimiter(self.delimiter) - .with_batch_size( - self.batch_size - .expect("Batch size must be set before initializing builder"), - ) - .with_header(self.has_header) - .with_quote(self.quote) - .with_truncated_rows(self.truncate_rows); - if let Some(terminator) = self.terminator { + let mut builder = + csv::ReaderBuilder::new(Arc::clone(self.table_schema.file_schema())) + .with_delimiter(self.delimiter()) + .with_batch_size( + self.batch_size + .expect("Batch size must be set before initializing builder"), + ) + .with_header(self.has_header()) + .with_quote(self.quote()) + .with_truncated_rows(self.truncate_rows()); + if let Some(terminator) = self.terminator() { builder = builder.with_terminator(terminator); } if let Some(proj) = &self.file_projection { builder = builder.with_projection(proj.clone()); } - if let Some(escape) = self.escape { + if let Some(escape) = self.escape() { builder = builder.with_escape(escape) } - if let Some(comment) = self.comment { + if let Some(comment) = self.comment() { builder = builder.with_comment(comment); } @@ -252,15 +256,13 @@ impl FileSource for CsvSource { self } - fn with_batch_size(&self, batch_size: usize) -> Arc { - let mut conf = self.clone(); - conf.batch_size = Some(batch_size); - Arc::new(conf) + fn table_schema(&self) -> &TableSchema { + &self.table_schema } - fn with_schema(&self, schema: TableSchema) -> Arc { + fn with_batch_size(&self, batch_size: usize) -> Arc { let mut conf = self.clone(); - conf.file_schema = Some(Arc::clone(schema.file_schema())); + conf.batch_size = Some(batch_size); Arc::new(conf) } @@ -291,7 +293,7 @@ impl FileSource for CsvSource { fn fmt_extra(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { match t { DisplayFormatType::Default | DisplayFormatType::Verbose => { - write!(f, ", has_header={}", self.has_header) + write!(f, ", has_header={}", self.has_header()) } DisplayFormatType::TreeRender => Ok(()), } @@ -340,18 +342,16 @@ impl FileOpener for CsvOpener { // `self.config.has_header` controls whether to skip 
reading the 1st line header // If the .csv file is read in parallel and this `CsvOpener` is only reading some middle // partition, then don't skip first line - let mut csv_has_header = self.config.has_header; + let mut csv_has_header = self.config.has_header(); if let Some(FileRange { start, .. }) = partitioned_file.range { if start != 0 { csv_has_header = false; } } - let config = CsvSource { - has_header: csv_has_header, - truncate_rows: self.config.truncate_rows, - ..(*self.config).clone() - }; + let mut config = (*self.config).clone(); + config.options.has_header = Some(csv_has_header); + config.options.truncated_rows = Some(config.truncate_rows()); let file_compression_type = self.file_compression_type.to_owned(); @@ -363,7 +363,7 @@ impl FileOpener for CsvOpener { } let store = Arc::clone(&self.object_store); - let terminator = self.config.terminator; + let terminator = self.config.terminator(); Ok(Box::pin(async move { // Current partition contains bytes [start_byte, end_byte) (might contain incomplete lines at boundaries) diff --git a/datafusion/datasource-json/src/file_format.rs b/datafusion/datasource-json/src/file_format.rs index 51f4bd7e963e0..afb12e5262718 100644 --- a/datafusion/datasource-json/src/file_format.rs +++ b/datafusion/datasource-json/src/file_format.rs @@ -50,6 +50,7 @@ use datafusion_datasource::sink::{DataSink, DataSinkExec}; use datafusion_datasource::write::demux::DemuxedStreamReceiver; use datafusion_datasource::write::orchestration::spawn_writer_tasks_and_join; use datafusion_datasource::write::BatchSerializer; +use datafusion_datasource::TableSchema; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use datafusion_expr::dml::InsertOp; use datafusion_physical_expr_common::sort_expr::LexRequirement; @@ -253,7 +254,11 @@ impl FileFormat for JsonFormat { _state: &dyn Session, conf: FileScanConfig, ) -> Result> { - let source = Arc::new(JsonSource::new()); + let table_schema = TableSchema::new( + Arc::clone(conf.file_schema()), + conf.table_partition_cols().clone(), + ); + let source = Arc::new(JsonSource::new(table_schema)); let conf = FileScanConfigBuilder::from(conf) .with_file_compression_type(FileCompressionType::from( self.options.compression, @@ -281,8 +286,8 @@ impl FileFormat for JsonFormat { Ok(Arc::new(DataSinkExec::new(input, sink, order_requirements)) as _) } - fn file_source(&self) -> Arc { - Arc::new(JsonSource::default()) + fn file_source(&self, table_schema: TableSchema) -> Arc { + Arc::new(JsonSource::new(table_schema)) } } diff --git a/datafusion/datasource-json/src/source.rs b/datafusion/datasource-json/src/source.rs index 52ed0def03f18..44b71ce680fd9 100644 --- a/datafusion/datasource-json/src/source.rs +++ b/datafusion/datasource-json/src/source.rs @@ -32,7 +32,6 @@ use datafusion_datasource::file_stream::{FileOpenFuture, FileOpener}; use datafusion_datasource::schema_adapter::SchemaAdapterFactory; use datafusion_datasource::{ as_file_source, calculate_range, ListingTableUrl, PartitionedFile, RangeCalculation, - TableSchema, }; use datafusion_physical_plan::{ExecutionPlan, ExecutionPlanProperties}; @@ -75,8 +74,9 @@ impl JsonOpener { } /// JsonSource holds the extra configuration that is necessary for [`JsonOpener`] -#[derive(Clone, Default)] +#[derive(Clone)] pub struct JsonSource { + table_schema: datafusion_datasource::TableSchema, batch_size: Option, metrics: ExecutionPlanMetricsSet, projected_statistics: Option, @@ -84,9 +84,15 @@ pub struct JsonSource { } impl JsonSource { - /// Initialize a JsonSource with default 
values - pub fn new() -> Self { - Self::default() + /// Initialize a JsonSource with the provided schema + pub fn new(table_schema: impl Into) -> Self { + Self { + table_schema: table_schema.into(), + batch_size: None, + metrics: ExecutionPlanMetricsSet::new(), + projected_statistics: None, + schema_adapter_factory: None, + } } } @@ -117,15 +123,16 @@ impl FileSource for JsonSource { self } + fn table_schema(&self) -> &datafusion_datasource::TableSchema { + &self.table_schema + } + fn with_batch_size(&self, batch_size: usize) -> Arc { let mut conf = self.clone(); conf.batch_size = Some(batch_size); Arc::new(conf) } - fn with_schema(&self, _schema: TableSchema) -> Arc { - Arc::new(Self { ..self.clone() }) - } fn with_statistics(&self, statistics: Statistics) -> Arc { let mut conf = self.clone(); conf.projected_statistics = Some(statistics); diff --git a/datafusion/datasource-parquet/src/file_format.rs b/datafusion/datasource-parquet/src/file_format.rs index f27bda387fda5..1e86d4192774d 100644 --- a/datafusion/datasource-parquet/src/file_format.rs +++ b/datafusion/datasource-parquet/src/file_format.rs @@ -32,6 +32,7 @@ use datafusion_datasource::file_sink_config::{FileSink, FileSinkConfig}; use datafusion_datasource::write::{ get_writer_schema, ObjectWriterBuilder, SharedBuffer, }; +use datafusion_datasource::TableSchema; use datafusion_datasource::file_format::{FileFormat, FileFormatFactory}; use datafusion_datasource::write::demux::DemuxedStreamReceiver; @@ -459,7 +460,12 @@ impl FileFormat for ParquetFormat { metadata_size_hint = Some(metadata); } - let mut source = ParquetSource::new(self.options.clone()); + let table_schema = TableSchema::new( + Arc::clone(conf.file_schema()), + conf.table_partition_cols().clone(), + ); + let mut source = ParquetSource::new(table_schema) + .with_table_parquet_options(self.options.clone()); // Use the CachedParquetFileReaderFactory let metadata_cache = state.runtime_env().cache_manager.get_file_metadata_cache(); @@ -501,8 +507,11 @@ impl FileFormat for ParquetFormat { Ok(Arc::new(DataSinkExec::new(input, sink, order_requirements)) as _) } - fn file_source(&self) -> Arc { - Arc::new(ParquetSource::default()) + fn file_source(&self, table_schema: TableSchema) -> Arc { + Arc::new( + ParquetSource::new(table_schema) + .with_table_parquet_options(self.options.clone()), + ) } } diff --git a/datafusion/datasource-parquet/src/opener.rs b/datafusion/datasource-parquet/src/opener.rs index 2815b82f1d455..3c905d950a962 100644 --- a/datafusion/datasource-parquet/src/opener.rs +++ b/datafusion/datasource-parquet/src/opener.rs @@ -270,7 +270,7 @@ impl FileOpener for ParquetOpener { let partition_values = partition_fields .iter() .cloned() - .zip(partitioned_file.partition_values) + .zip(partitioned_file.partition_values.clone()) .collect_vec(); let expr = expr_adapter_factory .create( diff --git a/datafusion/datasource-parquet/src/source.rs b/datafusion/datasource-parquet/src/source.rs index 450ccc5d0620e..27640f37cee4b 100644 --- a/datafusion/datasource-parquet/src/source.rs +++ b/datafusion/datasource-parquet/src/source.rs @@ -104,11 +104,11 @@ use parquet::encryption::decrypt::FileDecryptionProperties; /// # let object_store_url = ObjectStoreUrl::local_filesystem(); /// # let predicate = lit(true); /// let source = Arc::new( -/// ParquetSource::default() -/// .with_predicate(predicate) +/// ParquetSource::new(Arc::clone(&file_schema)) +/// .with_predicate(predicate) /// ); /// // Create a DataSourceExec for reading `file1.parquet` with a file size of 100MB -/// let 
config = FileScanConfigBuilder::new(object_store_url, file_schema, source) +/// let config = FileScanConfigBuilder::new(object_store_url, source) /// .with_file(PartitionedFile::new("file1.parquet", 100*1024*1024)).build(); /// let exec = DataSourceExec::from_data_source(config); /// ``` @@ -231,7 +231,7 @@ use parquet::encryption::decrypt::FileDecryptionProperties; /// let partitioned_file = PartitionedFile::new("my_file.parquet", 1234) /// .with_extensions(Arc::new(access_plan)); /// // create a FileScanConfig to scan this file -/// let config = FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), schema(), Arc::new(ParquetSource::default())) +/// let config = FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), Arc::new(ParquetSource::new(schema()))) /// .with_file(partitioned_file).build(); /// // this parquet DataSourceExec will not even try to read row groups 2 and 4. Additional /// // pruning based on predicates may also happen @@ -266,7 +266,7 @@ use parquet::encryption::decrypt::FileDecryptionProperties; /// [`RecordBatch`]: arrow::record_batch::RecordBatch /// [`SchemaAdapter`]: datafusion_datasource::schema_adapter::SchemaAdapter /// [`ParquetMetadata`]: parquet::file::metadata::ParquetMetaData -#[derive(Clone, Default, Debug)] +#[derive(Clone, Debug)] pub struct ParquetSource { /// Options for reading Parquet files pub(crate) table_parquet_options: TableParquetOptions, @@ -275,7 +275,7 @@ pub struct ParquetSource { /// The schema of the file. /// In particular, this is the schema of the table without partition columns, /// *not* the physical schema of the file. - pub(crate) table_schema: Option, + pub(crate) table_schema: TableSchema, /// Optional predicate for row filtering during parquet scan pub(crate) predicate: Option>, /// Optional user defined parquet file reader factory @@ -293,15 +293,35 @@ pub struct ParquetSource { impl ParquetSource { /// Create a new ParquetSource to read the data specified in the file scan - /// configuration with the provided `TableParquetOptions`. - /// if default values are going to be used, use `ParguetConfig::default()` instead - pub fn new(table_parquet_options: TableParquetOptions) -> Self { + /// configuration with the provided schema. + /// + /// Uses default `TableParquetOptions`. + /// To set custom options, use [ParquetSource::with_table_parquet_options`]. + pub fn new(table_schema: impl Into) -> Self { Self { - table_parquet_options, - ..Self::default() + table_schema: table_schema.into(), + table_parquet_options: TableParquetOptions::default(), + metrics: ExecutionPlanMetricsSet::new(), + predicate: None, + parquet_file_reader_factory: None, + schema_adapter_factory: None, + batch_size: None, + metadata_size_hint: None, + projected_statistics: None, + #[cfg(feature = "parquet_encryption")] + encryption_factory: None, } } + /// Set the `TableParquetOptions` for this ParquetSource. 
+ pub fn with_table_parquet_options( + mut self, + table_parquet_options: TableParquetOptions, + ) -> Self { + self.table_parquet_options = table_parquet_options; + self + } + /// Set the metadata size hint /// /// This value determines how many bytes at the end of the file the default @@ -590,6 +610,10 @@ impl FileSource for ParquetSource { self } + fn table_schema(&self) -> &TableSchema { + &self.table_schema + } + fn filter(&self) -> Option> { self.predicate.clone() } @@ -600,13 +624,6 @@ impl FileSource for ParquetSource { Arc::new(conf) } - fn with_schema(&self, schema: TableSchema) -> Arc { - Arc::new(Self { - table_schema: Some(schema), - ..self.clone() - }) - } - fn with_statistics(&self, statistics: Statistics) -> Arc { let mut conf = self.clone(); conf.projected_statistics = Some(statistics); @@ -660,14 +677,11 @@ impl FileSource for ParquetSource { // the actual predicates are built in reference to the physical schema of // each file, which we do not have at this point and hence cannot use. // Instead we use the logical schema of the file (the table schema without partition columns). - if let (Some(file_schema), Some(predicate)) = ( - &self.table_schema.as_ref().map(|ts| ts.file_schema()), - &self.predicate, - ) { + if let Some(predicate) = &self.predicate { let predicate_creation_errors = Count::new(); if let (Some(pruning_predicate), _) = build_pruning_predicates( Some(predicate), - file_schema, + self.table_schema.table_schema(), &predicate_creation_errors, ) { let mut guarantees = pruning_predicate @@ -700,16 +714,7 @@ impl FileSource for ParquetSource { filters: Vec>, config: &ConfigOptions, ) -> datafusion_common::Result>> { - let Some(table_schema) = self - .table_schema - .as_ref() - .map(|ts| ts.table_schema()) - .cloned() - else { - return Ok(FilterPushdownPropagation::with_parent_pushdown_result( - vec![PushedDown::No; filters.len()], - )); - }; + let table_schema = self.table_schema.table_schema(); // Determine if based on configs we should push filters down. // If either the table / scan itself or the config has pushdown enabled, // we will push down the filters. @@ -725,7 +730,7 @@ impl FileSource for ParquetSource { let filters: Vec = filters .into_iter() .map(|filter| { - if can_expr_be_pushed_down_with_schemas(&filter, &table_schema) { + if can_expr_be_pushed_down_with_schemas(&filter, table_schema) { PushedDownPredicate::supported(filter) } else { PushedDownPredicate::unsupported(filter) @@ -790,6 +795,7 @@ impl FileSource for ParquetSource { #[cfg(test)] mod tests { use super::*; + use arrow::datatypes::Schema; use datafusion_physical_expr::expressions::lit; #[test] @@ -797,7 +803,8 @@ mod tests { fn test_parquet_source_predicate_same_as_filter() { let predicate = lit(true); - let parquet_source = ParquetSource::default().with_predicate(predicate); + let parquet_source = + ParquetSource::new(Arc::new(Schema::empty())).with_predicate(predicate); // same value. 
but filter() call Arc::clone internally assert_eq!(parquet_source.predicate(), parquet_source.filter().as_ref()); } diff --git a/datafusion/datasource/src/file.rs b/datafusion/datasource/src/file.rs index d6ade3b8b2107..9245f60e2306c 100644 --- a/datafusion/datasource/src/file.rs +++ b/datafusion/datasource/src/file.rs @@ -26,7 +26,6 @@ use crate::file_groups::FileGroupPartitioner; use crate::file_scan_config::FileScanConfig; use crate::file_stream::FileOpener; use crate::schema_adapter::SchemaAdapterFactory; -use crate::TableSchema; use datafusion_common::config::ConfigOptions; use datafusion_common::{not_impl_err, Result, Statistics}; use datafusion_physical_expr::{LexOrdering, PhysicalExpr}; @@ -61,10 +60,12 @@ pub trait FileSource: Send + Sync { ) -> Arc; /// Any fn as_any(&self) -> &dyn Any; + /// Returns the table schema for this file source. + /// + /// This always returns the unprojected schema (the full schema of the data). + fn table_schema(&self) -> &crate::table_schema::TableSchema; /// Initialize new type with batch size configuration fn with_batch_size(&self, batch_size: usize) -> Arc; - /// Initialize new instance with a new schema - fn with_schema(&self, schema: TableSchema) -> Arc; /// Initialize new instance with projection information fn with_projection(&self, config: &FileScanConfig) -> Arc; /// Initialize new instance with projected statistics diff --git a/datafusion/datasource/src/file_format.rs b/datafusion/datasource/src/file_format.rs index 23f68636c156e..bb4ffded8086a 100644 --- a/datafusion/datasource/src/file_format.rs +++ b/datafusion/datasource/src/file_format.rs @@ -111,7 +111,10 @@ pub trait FileFormat: Send + Sync + fmt::Debug { } /// Return the related FileSource such as `CsvSource`, `JsonSource`, etc. - fn file_source(&self) -> Arc; + /// + /// # Arguments + /// * `table_schema` - The table schema to use for the FileSource (includes partition columns) + fn file_source(&self, table_schema: crate::TableSchema) -> Arc; } /// Factory for creating [`FileFormat`] instances based on session and command level options diff --git a/datafusion/datasource/src/file_scan_config.rs b/datafusion/datasource/src/file_scan_config.rs index 5847a8cf5e11f..82052ee4c39c3 100644 --- a/datafusion/datasource/src/file_scan_config.rs +++ b/datafusion/datasource/src/file_scan_config.rs @@ -24,7 +24,7 @@ use crate::schema_adapter::SchemaAdapterFactory; use crate::{ display::FileGroupsDisplay, file::FileSource, file_compression_type::FileCompressionType, file_stream::FileStream, - source::DataSource, statistics::MinMaxStatistics, PartitionedFile, TableSchema, + source::DataSource, statistics::MinMaxStatistics, PartitionedFile, }; use arrow::datatypes::FieldRef; use arrow::{ @@ -33,7 +33,7 @@ use arrow::{ RecordBatchOptions, }, buffer::Buffer, - datatypes::{ArrowNativeType, DataType, Field, Schema, SchemaRef, UInt16Type}, + datatypes::{ArrowNativeType, DataType, Schema, SchemaRef, UInt16Type}, }; use datafusion_common::config::ConfigOptions; use datafusion_common::{ @@ -103,29 +103,30 @@ use log::{debug, warn}; /// # // Note: crate mock ParquetSource, as ParquetSource is not in the datasource crate /// #[derive(Clone)] /// # struct ParquetSource { +/// # table_schema: TableSchema, /// # projected_statistics: Option, /// # schema_adapter_factory: Option> /// # }; /// # impl FileSource for ParquetSource { /// # fn create_file_opener(&self, _: Arc, _: &FileScanConfig, _: usize) -> Arc { unimplemented!() } /// # fn as_any(&self) -> &dyn Any { self } +/// # fn table_schema(&self) -> 
&TableSchema { &self.table_schema } /// # fn with_batch_size(&self, _: usize) -> Arc { unimplemented!() } -/// # fn with_schema(&self, _: TableSchema) -> Arc { Arc::new(self.clone()) as Arc } /// # fn with_projection(&self, _: &FileScanConfig) -> Arc { unimplemented!() } -/// # fn with_statistics(&self, statistics: Statistics) -> Arc { Arc::new(Self {projected_statistics: Some(statistics), schema_adapter_factory: self.schema_adapter_factory.clone()} ) } +/// # fn with_statistics(&self, statistics: Statistics) -> Arc { Arc::new(Self {table_schema: self.table_schema.clone(), projected_statistics: Some(statistics), schema_adapter_factory: self.schema_adapter_factory.clone()} ) } /// # fn metrics(&self) -> &ExecutionPlanMetricsSet { unimplemented!() } /// # fn statistics(&self) -> Result { Ok(self.projected_statistics.clone().expect("projected_statistics should be set")) } /// # fn file_type(&self) -> &str { "parquet" } -/// # fn with_schema_adapter_factory(&self, factory: Arc) -> Result> { Ok(Arc::new(Self {projected_statistics: self.projected_statistics.clone(), schema_adapter_factory: Some(factory)} )) } +/// # fn with_schema_adapter_factory(&self, factory: Arc) -> Result> { Ok(Arc::new(Self {table_schema: self.table_schema.clone(), projected_statistics: self.projected_statistics.clone(), schema_adapter_factory: Some(factory)} )) } /// # fn schema_adapter_factory(&self) -> Option> { self.schema_adapter_factory.clone() } /// # } /// # impl ParquetSource { -/// # fn new() -> Self { Self {projected_statistics: None, schema_adapter_factory: None} } +/// # fn new(table_schema: impl Into) -> Self { Self {table_schema: table_schema.into(), projected_statistics: None, schema_adapter_factory: None} } /// # } /// // create FileScan config for reading parquet files from file:// /// let object_store_url = ObjectStoreUrl::local_filesystem(); -/// let file_source = Arc::new(ParquetSource::new()); -/// let config = FileScanConfigBuilder::new(object_store_url, file_schema, file_source) +/// let file_source = Arc::new(ParquetSource::new(file_schema.clone())); +/// let config = FileScanConfigBuilder::new(object_store_url, file_source) /// .with_limit(Some(1000)) // read only the first 1000 records /// .with_projection_indices(Some(vec![2, 3])) // project columns 2 and 3 /// // Read /tmp/file1.parquet with known size of 1234 bytes in a single group @@ -156,16 +157,6 @@ pub struct FileScanConfig { /// [`RuntimeEnv::register_object_store`]: datafusion_execution::runtime_env::RuntimeEnv::register_object_store /// [`RuntimeEnv::object_store`]: datafusion_execution::runtime_env::RuntimeEnv::object_store pub object_store_url: ObjectStoreUrl, - /// Schema information including the file schema, table partition columns, - /// and the combined table schema. - /// - /// The table schema (file schema + partition columns) is the schema exposed - /// upstream of [`FileScanConfig`] (e.g. in [`DataSourceExec`]). - /// - /// See [`TableSchema`] for more information. - /// - /// [`DataSourceExec`]: crate::source::DataSourceExec - pub table_schema: TableSchema, /// List of files to be processed, grouped into partitions /// /// Each file must have a schema of `file_schema` or a subset. 
If @@ -214,6 +205,7 @@ pub struct FileScanConfig { /// # use datafusion_datasource::file_compression_type::FileCompressionType; /// # use datafusion_datasource::file_groups::FileGroup; /// # use datafusion_datasource::PartitionedFile; +/// # use datafusion_datasource::table_schema::TableSchema; /// # use datafusion_execution::object_store::ObjectStoreUrl; /// # use datafusion_common::Statistics; /// # use datafusion_datasource::file::FileSource; @@ -221,25 +213,28 @@ pub struct FileScanConfig { /// # fn main() { /// # fn with_source(file_source: Arc) { /// // Create a schema for our Parquet files -/// let schema = Arc::new(Schema::new(vec![ +/// let file_schema = Arc::new(Schema::new(vec![ /// Field::new("id", DataType::Int32, false), /// Field::new("value", DataType::Utf8, false), /// ])); /// +/// // Create partition columns +/// let partition_cols = vec![ +/// Arc::new(Field::new("date", DataType::Utf8, false)), +/// ]; +/// +/// // Create table schema with file schema and partition columns +/// let table_schema = TableSchema::new(file_schema, partition_cols); +/// /// // Create a builder for scanning Parquet files from a local filesystem /// let config = FileScanConfigBuilder::new( /// ObjectStoreUrl::local_filesystem(), -/// schema, /// file_source, /// ) /// // Set a limit of 1000 rows /// .with_limit(Some(1000)) /// // Project only the first column /// .with_projection_indices(Some(vec![0])) -/// // Add partition columns -/// .with_table_partition_cols(vec![ -/// Field::new("date", DataType::Utf8, false), -/// ]) /// // Add a file group with two files /// .with_file_group(FileGroup::new(vec![ /// PartitionedFile::new("data/date=2024-01-01/file1.parquet", 1024), @@ -255,16 +250,6 @@ pub struct FileScanConfig { #[derive(Clone)] pub struct FileScanConfigBuilder { object_store_url: ObjectStoreUrl, - /// Schema information including the file schema, table partition columns, - /// and the combined table schema. - /// - /// This schema is used to read the files, but the file schema is **not** necessarily - /// the schema of the physical files. Rather this is the schema that the - /// physical file schema will be mapped onto, and the schema that the - /// [`DataSourceExec`] will return. - /// - /// [`DataSourceExec`]: crate::source::DataSourceExec - table_schema: TableSchema, file_source: Arc, limit: Option, projection_indices: Option>, @@ -283,16 +268,14 @@ impl FileScanConfigBuilder { /// /// # Parameters: /// * `object_store_url`: See [`FileScanConfig::object_store_url`] - /// * `file_schema`: See [`FileScanConfig::file_schema`] - /// * `file_source`: See [`FileScanConfig::file_source`] + /// * `file_source`: See [`FileScanConfig::file_source`]. The file source must have + /// a schema set via its constructor. pub fn new( object_store_url: ObjectStoreUrl, - file_schema: SchemaRef, file_source: Arc, ) -> Self { Self { object_store_url, - table_schema: TableSchema::from_file_schema(file_schema), file_source, file_groups: vec![], statistics: None, @@ -324,7 +307,7 @@ impl FileScanConfigBuilder { } pub fn table_schema(&self) -> &SchemaRef { - self.table_schema.table_schema() + self.file_source.table_schema().table_schema() } /// Set the columns on which to project the data. 
Indexes that are higher than the @@ -345,18 +328,6 @@ impl FileScanConfigBuilder { self } - /// Set the partitioning columns - pub fn with_table_partition_cols(mut self, table_partition_cols: Vec) -> Self { - let table_partition_cols: Vec = table_partition_cols - .into_iter() - .map(|f| Arc::new(f) as FieldRef) - .collect(); - self.table_schema = self - .table_schema - .with_table_partition_cols(table_partition_cols); - self - } - /// Set the table constraints pub fn with_constraints(mut self, constraints: Constraints) -> Self { self.constraints = Some(constraints); @@ -451,7 +422,6 @@ impl FileScanConfigBuilder { pub fn build(self) -> FileScanConfig { let Self { object_store_url, - table_schema, file_source, limit, projection_indices, @@ -466,12 +436,11 @@ impl FileScanConfigBuilder { } = self; let constraints = constraints.unwrap_or_default(); - let statistics = statistics - .unwrap_or_else(|| Statistics::new_unknown(table_schema.file_schema())); + let statistics = statistics.unwrap_or_else(|| { + Statistics::new_unknown(file_source.table_schema().file_schema()) + }); - let file_source = file_source - .with_statistics(statistics.clone()) - .with_schema(table_schema.clone()); + let file_source = file_source.with_statistics(statistics.clone()); let file_compression_type = file_compression_type.unwrap_or(FileCompressionType::UNCOMPRESSED); let new_lines_in_values = new_lines_in_values.unwrap_or(false); @@ -479,12 +448,14 @@ impl FileScanConfigBuilder { // Convert projection indices to ProjectionExprs using the final table schema // (which now includes partition columns if they were added) let projection_exprs = projection_indices.map(|indices| { - ProjectionExprs::from_indices(&indices, table_schema.table_schema()) + ProjectionExprs::from_indices( + &indices, + file_source.table_schema().table_schema(), + ) }); FileScanConfig { object_store_url, - table_schema, file_source, limit, projection_exprs, @@ -503,7 +474,6 @@ impl From for FileScanConfigBuilder { fn from(config: FileScanConfig) -> Self { Self { object_store_url: config.object_store_url, - table_schema: config.table_schema, file_source: Arc::::clone(&config.file_source), file_groups: config.file_groups, statistics: config.file_source.statistics().ok(), @@ -748,12 +718,12 @@ impl DataSource for FileScanConfig { impl FileScanConfig { /// Get the file schema (schema of the files without partition columns) pub fn file_schema(&self) -> &SchemaRef { - self.table_schema.file_schema() + self.file_source.table_schema().file_schema() } /// Get the table partition columns pub fn table_partition_cols(&self) -> &Vec { - self.table_schema.table_partition_cols() + self.file_source.table_schema().table_partition_cols() } fn projection_indices(&self) -> Vec { @@ -1509,12 +1479,14 @@ pub fn wrap_partition_value_in_dict(val: ScalarValue) -> ScalarValue { mod tests { use super::*; use crate::test_util::col; + use crate::TableSchema; use crate::{ generate_test_files, test_util::MockSource, tests::aggr_test_schema, verify_sort_integrity, }; use arrow::array::{Int32Array, RecordBatch}; + use arrow::datatypes::Field; use datafusion_common::stats::Precision; use datafusion_common::{assert_batches_eq, internal_err}; use datafusion_expr::{Operator, SortExpr}; @@ -2178,14 +2150,16 @@ mod tests { statistics: Statistics, table_partition_cols: Vec, ) -> FileScanConfig { + let table_schema = TableSchema::new( + file_schema, + table_partition_cols.into_iter().map(Arc::new).collect(), + ); FileScanConfigBuilder::new( ObjectStoreUrl::parse("test:///").unwrap(), - 
file_schema, - Arc::new(MockSource::default()), + Arc::new(MockSource::new(table_schema.clone())), ) .with_projection_indices(projection) .with_statistics(statistics) - .with_table_partition_cols(table_partition_cols) .build() } @@ -2224,12 +2198,22 @@ mod tests { fn test_file_scan_config_builder() { let file_schema = aggr_test_schema(); let object_store_url = ObjectStoreUrl::parse("test:///").unwrap(); - let file_source: Arc = Arc::new(MockSource::default()); + + let table_schema = TableSchema::new( + Arc::clone(&file_schema), + vec![Arc::new(Field::new( + "date", + wrap_partition_type_in_dict(DataType::Utf8), + false, + ))], + ); + + let file_source: Arc = + Arc::new(MockSource::new(table_schema.clone())); // Create a builder with required parameters let builder = FileScanConfigBuilder::new( object_store_url.clone(), - Arc::clone(&file_schema), Arc::clone(&file_source), ); @@ -2237,11 +2221,6 @@ mod tests { let config = builder .with_limit(Some(1000)) .with_projection_indices(Some(vec![0, 1])) - .with_table_partition_cols(vec![Field::new( - "date", - wrap_partition_type_in_dict(DataType::Utf8), - false, - )]) .with_statistics(Statistics::new_unknown(&file_schema)) .with_file_groups(vec![FileGroup::new(vec![PartitionedFile::new( "test.parquet".to_string(), @@ -2283,17 +2262,20 @@ mod tests { fn equivalence_properties_after_schema_change() { let file_schema = aggr_test_schema(); let object_store_url = ObjectStoreUrl::parse("test:///").unwrap(); + + let table_schema = TableSchema::new(Arc::clone(&file_schema), vec![]); + // Create a file source with a filter - let file_source: Arc = - Arc::new(MockSource::default().with_filter(Arc::new(BinaryExpr::new( + let file_source: Arc = Arc::new( + MockSource::new(table_schema.clone()).with_filter(Arc::new(BinaryExpr::new( col("c2", &file_schema).unwrap(), Operator::Eq, Arc::new(Literal::new(ScalarValue::Int32(Some(10)))), - )))); + ))), + ); let config = FileScanConfigBuilder::new( object_store_url.clone(), - Arc::clone(&file_schema), Arc::clone(&file_source), ) .with_projection_indices(Some(vec![0, 1, 2])) @@ -2331,12 +2313,15 @@ mod tests { fn test_file_scan_config_builder_defaults() { let file_schema = aggr_test_schema(); let object_store_url = ObjectStoreUrl::parse("test:///").unwrap(); - let file_source: Arc = Arc::new(MockSource::default()); + + let table_schema = TableSchema::new(Arc::clone(&file_schema), vec![]); + + let file_source: Arc = + Arc::new(MockSource::new(table_schema.clone())); // Create a builder with only required parameters and build without any additional configurations let config = FileScanConfigBuilder::new( object_store_url.clone(), - Arc::clone(&file_schema), Arc::clone(&file_source), ) .build(); @@ -2389,7 +2374,6 @@ mod tests { fn test_file_scan_config_builder_new_from() { let schema = aggr_test_schema(); let object_store_url = ObjectStoreUrl::parse("test:///").unwrap(); - let file_source: Arc = Arc::new(MockSource::default()); let partition_cols = vec![Field::new( "date", wrap_partition_type_in_dict(DataType::Utf8), @@ -2397,15 +2381,21 @@ mod tests { )]; let file = PartitionedFile::new("test_file.parquet", 100); + let table_schema = TableSchema::new( + Arc::clone(&schema), + partition_cols.iter().map(|f| Arc::new(f.clone())).collect(), + ); + + let file_source: Arc = + Arc::new(MockSource::new(table_schema.clone())); + // Create a config with non-default values let original_config = FileScanConfigBuilder::new( object_store_url.clone(), - Arc::clone(&schema), Arc::clone(&file_source), ) 
.with_projection_indices(Some(vec![0, 2])) .with_limit(Some(10)) - .with_table_partition_cols(partition_cols.clone()) .with_file(file.clone()) .with_constraints(Constraints::default()) .with_newlines_in_values(true) @@ -2640,11 +2630,12 @@ mod tests { let file_group = FileGroup::new(vec![PartitionedFile::new("test.parquet", 1024)]) .with_statistics(Arc::new(file_group_stats)); + let table_schema = TableSchema::new(Arc::clone(&schema), vec![]); + // Create a FileScanConfig with projection: only keep columns 0 and 2 let config = FileScanConfigBuilder::new( ObjectStoreUrl::parse("test:///").unwrap(), - Arc::clone(&schema), - Arc::new(MockSource::default()), + Arc::new(MockSource::new(table_schema.clone())), ) .with_projection_indices(Some(vec![0, 2])) // Only project columns 0 and 2 .with_file_groups(vec![file_group]) diff --git a/datafusion/datasource/src/file_stream.rs b/datafusion/datasource/src/file_stream.rs index a4a43ca9aeab3..0568b4cc4e5f9 100644 --- a/datafusion/datasource/src/file_stream.rs +++ b/datafusion/datasource/src/file_stream.rs @@ -639,10 +639,10 @@ mod tests { let on_error = self.on_error; + let table_schema = crate::table_schema::TableSchema::new(file_schema, vec![]); let config = FileScanConfigBuilder::new( ObjectStoreUrl::parse("test:///").unwrap(), - file_schema, - Arc::new(MockSource::default()), + Arc::new(MockSource::new(table_schema)), ) .with_file_group(file_group) .with_limit(self.limit) diff --git a/datafusion/datasource/src/table_schema.rs b/datafusion/datasource/src/table_schema.rs index 8002df4a99dfc..ff0e788018875 100644 --- a/datafusion/datasource/src/table_schema.rs +++ b/datafusion/datasource/src/table_schema.rs @@ -170,3 +170,9 @@ impl TableSchema { &self.table_schema } } + +impl From for TableSchema { + fn from(schema: SchemaRef) -> Self { + Self::from_file_schema(schema) + } +} diff --git a/datafusion/datasource/src/test_util.rs b/datafusion/datasource/src/test_util.rs index feb704af9913e..78ba593f22ec8 100644 --- a/datafusion/datasource/src/test_util.rs +++ b/datafusion/datasource/src/test_util.rs @@ -22,7 +22,6 @@ use crate::{ use std::sync::Arc; -use crate::TableSchema; use arrow::datatypes::Schema; use datafusion_common::{Result, Statistics}; use datafusion_physical_expr::{expressions::Column, PhysicalExpr}; @@ -30,15 +29,41 @@ use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; use object_store::ObjectStore; /// Minimal [`crate::file::FileSource`] implementation for use in tests. 
-#[derive(Clone, Default)] +#[derive(Clone)] pub(crate) struct MockSource { metrics: ExecutionPlanMetricsSet, projected_statistics: Option, schema_adapter_factory: Option>, filter: Option>, + table_schema: crate::table_schema::TableSchema, +} + +impl Default for MockSource { + fn default() -> Self { + Self { + metrics: ExecutionPlanMetricsSet::new(), + projected_statistics: None, + schema_adapter_factory: None, + filter: None, + table_schema: crate::table_schema::TableSchema::new( + Arc::new(Schema::empty()), + vec![], + ), + } + } } impl MockSource { + pub fn new(table_schema: impl Into) -> Self { + Self { + metrics: ExecutionPlanMetricsSet::new(), + projected_statistics: None, + schema_adapter_factory: None, + filter: None, + table_schema: table_schema.into(), + } + } + pub fn with_filter(mut self, filter: Arc) -> Self { self.filter = Some(filter); self @@ -67,10 +92,6 @@ impl FileSource for MockSource { Arc::new(Self { ..self.clone() }) } - fn with_schema(&self, _schema: TableSchema) -> Arc { - Arc::new(Self { ..self.clone() }) - } - fn with_projection(&self, _config: &FileScanConfig) -> Arc { Arc::new(Self { ..self.clone() }) } @@ -110,6 +131,10 @@ impl FileSource for MockSource { fn schema_adapter_factory(&self) -> Option> { self.schema_adapter_factory.clone() } + + fn table_schema(&self) -> &crate::table_schema::TableSchema { + &self.table_schema + } } /// Create a column expression diff --git a/datafusion/proto/src/physical_plan/from_proto.rs b/datafusion/proto/src/physical_plan/from_proto.rs index 349ed79ddb4ad..f1a9abe6ea7b1 100644 --- a/datafusion/proto/src/physical_plan/from_proto.rs +++ b/datafusion/proto/src/physical_plan/from_proto.rs @@ -34,7 +34,7 @@ use datafusion_datasource::file::FileSource; use datafusion_datasource::file_groups::FileGroup; use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder}; use datafusion_datasource::file_sink_config::FileSinkConfig; -use datafusion_datasource::{FileRange, ListingTableUrl, PartitionedFile}; +use datafusion_datasource::{FileRange, ListingTableUrl, PartitionedFile, TableSchema}; use datafusion_datasource_csv::file_format::CsvSink; use datafusion_datasource_json::file_format::JsonSink; #[cfg(feature = "parquet")] @@ -481,6 +481,37 @@ pub fn parse_protobuf_file_scan_schema( Ok(Arc::new(convert_required!(proto.schema)?)) } +/// Parses a TableSchema from protobuf, extracting the file schema and partition columns +pub fn parse_table_schema_from_proto( + proto: &protobuf::FileScanExecConf, +) -> Result { + let schema: Arc = parse_protobuf_file_scan_schema(proto)?; + + // Reacquire the partition column types from the schema before removing them below. + let table_partition_cols = proto + .table_partition_cols + .iter() + .map(|col| Ok(Arc::new(schema.field_with_name(col)?.clone()))) + .collect::>>()?; + + // Remove partition columns from the schema after recreating table_partition_cols + // because the partition columns are not in the file. They are present to allow + // the partition column types to be reconstructed after serde. 
+ let file_schema = Arc::new( + Schema::new( + schema + .fields() + .iter() + .filter(|field| !table_partition_cols.contains(field)) + .cloned() + .collect::>(), + ) + .with_metadata(schema.metadata.clone()), + ); + + Ok(TableSchema::new(file_schema, table_partition_cols)) +} + pub fn parse_protobuf_file_scan_config( proto: &protobuf::FileScanExecConf, ctx: &TaskContext, @@ -508,28 +539,6 @@ pub fn parse_protobuf_file_scan_config( true => ObjectStoreUrl::local_filesystem(), }; - // Reacquire the partition column types from the schema before removing them below. - let table_partition_cols = proto - .table_partition_cols - .iter() - .map(|col| Ok(schema.field_with_name(col)?.clone())) - .collect::>>()?; - - // Remove partition columns from the schema after recreating table_partition_cols - // because the partition columns are not in the file. They are present to allow - // the partition column types to be reconstructed after serde. - let file_schema = Arc::new( - Schema::new( - schema - .fields() - .iter() - .filter(|field| !table_partition_cols.contains(field)) - .cloned() - .collect::>(), - ) - .with_metadata(schema.metadata.clone()), - ); - let mut output_ordering = vec![]; for node_collection in &proto.output_ordering { let sort_exprs = parse_physical_sort_exprs( @@ -541,13 +550,12 @@ pub fn parse_protobuf_file_scan_config( output_ordering.extend(LexOrdering::new(sort_exprs)); } - let config = FileScanConfigBuilder::new(object_store_url, file_schema, file_source) + let config = FileScanConfigBuilder::new(object_store_url, file_source) .with_file_groups(file_groups) .with_constraints(constraints) .with_statistics(statistics) .with_projection_indices(Some(projection)) .with_limit(proto.limit.as_ref().map(|sl| sl.limit as usize)) - .with_table_partition_cols(table_partition_cols) .with_output_ordering(output_ordering) .with_batch_size(proto.batch_size.map(|s| s as usize)) .build(); diff --git a/datafusion/proto/src/physical_plan/mod.rs b/datafusion/proto/src/physical_plan/mod.rs index e5f4a1f7d0267..fc7818fe461a6 100644 --- a/datafusion/proto/src/physical_plan/mod.rs +++ b/datafusion/proto/src/physical_plan/mod.rs @@ -24,6 +24,7 @@ use crate::common::{byte_to_string, str_to_byte}; use crate::physical_plan::from_proto::{ parse_physical_expr, parse_physical_sort_expr, parse_physical_sort_exprs, parse_physical_window_expr, parse_protobuf_file_scan_config, parse_record_batches, + parse_table_schema_from_proto, }; use crate::physical_plan::to_proto::{ serialize_file_scan_config, serialize_maybe_filter, serialize_physical_aggr_expr, @@ -42,6 +43,7 @@ use crate::{convert_required, into_required}; use arrow::compute::SortOptions; use arrow::datatypes::{IntervalMonthDayNanoType, SchemaRef}; use datafusion_catalog::memory::MemorySourceConfig; +use datafusion_common::config::CsvOptions; use datafusion_common::{ internal_datafusion_err, internal_err, not_impl_err, DataFusionError, Result, }; @@ -612,14 +614,21 @@ impl protobuf::PhysicalPlanNode { None }; + // Parse table schema with partition columns + let table_schema = + parse_table_schema_from_proto(scan.base_conf.as_ref().unwrap())?; + + let csv_options = CsvOptions { + has_header: Some(scan.has_header), + delimiter: str_to_byte(&scan.delimiter, "delimiter")?, + quote: str_to_byte(&scan.quote, "quote")?, + ..Default::default() + }; let source = Arc::new( - CsvSource::new( - scan.has_header, - str_to_byte(&scan.delimiter, "delimiter")?, - 0, - ) - .with_escape(escape) - .with_comment(comment), + CsvSource::new(table_schema) + 
.with_csv_options(csv_options) + .with_escape(escape) + .with_comment(comment), ); let conf = FileScanConfigBuilder::from(parse_protobuf_file_scan_config( @@ -641,11 +650,13 @@ impl protobuf::PhysicalPlanNode { extension_codec: &dyn PhysicalExtensionCodec, ) -> Result> { + let base_conf = scan.base_conf.as_ref().unwrap(); + let table_schema = parse_table_schema_from_proto(base_conf)?; let scan_conf = parse_protobuf_file_scan_config( - scan.base_conf.as_ref().unwrap(), + base_conf, ctx, extension_codec, - Arc::new(JsonSource::new()), + Arc::new(JsonSource::new(table_schema)), )?; Ok(DataSourceExec::from_data_source(scan_conf)) } @@ -695,7 +706,12 @@ impl protobuf::PhysicalPlanNode { if let Some(table_options) = scan.parquet_options.as_ref() { options = table_options.try_into()?; } - let mut source = ParquetSource::new(options); + + // Parse table schema with partition columns + let table_schema = parse_table_schema_from_proto(base_conf)?; + + let mut source = + ParquetSource::new(table_schema).with_table_parquet_options(options); if let Some(predicate) = predicate { source = source.with_predicate(predicate); @@ -717,16 +733,17 @@ impl protobuf::PhysicalPlanNode { &self, scan: &protobuf::AvroScanExecNode, ctx: &TaskContext, - extension_codec: &dyn PhysicalExtensionCodec, ) -> Result> { #[cfg(feature = "avro")] { + let table_schema = + parse_table_schema_from_proto(scan.base_conf.as_ref().unwrap())?; let conf = parse_protobuf_file_scan_config( scan.base_conf.as_ref().unwrap(), ctx, extension_codec, - Arc::new(AvroSource::new()), + Arc::new(AvroSource::new(table_schema)), )?; Ok(DataSourceExec::from_data_source(conf)) } diff --git a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs index c8b2bc02e447b..73f39eaa7bf95 100644 --- a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs @@ -33,6 +33,7 @@ use arrow::datatypes::{Fields, TimeUnit}; use datafusion::physical_expr::aggregate::AggregateExprBuilder; use datafusion::physical_plan::coalesce_batches::CoalesceBatchesExec; use datafusion::physical_plan::metrics::MetricType; +use datafusion_datasource::TableSchema; use datafusion_expr::dml::InsertOp; use datafusion_functions_aggregate::approx_percentile_cont::approx_percentile_cont_udaf; use datafusion_functions_aggregate::array_agg::array_agg_udaf; @@ -883,25 +884,26 @@ fn roundtrip_parquet_exec_with_pruning_predicate() -> Result<()> { let mut options = TableParquetOptions::new(); options.global.pushdown_filters = true; - let file_source = Arc::new(ParquetSource::new(options).with_predicate(predicate)); + let file_source = Arc::new( + ParquetSource::new(Arc::clone(&file_schema)) + .with_table_parquet_options(options) + .with_predicate(predicate), + ); - let scan_config = FileScanConfigBuilder::new( - ObjectStoreUrl::local_filesystem(), - file_schema, - file_source, - ) - .with_file_groups(vec![FileGroup::new(vec![PartitionedFile::new( - "/path/to/file.parquet".to_string(), - 1024, - )])]) - .with_statistics(Statistics { - num_rows: Precision::Inexact(100), - total_byte_size: Precision::Inexact(1024), - column_statistics: Statistics::unknown_column(&Arc::new(Schema::new(vec![ - Field::new("col", DataType::Utf8, false), - ]))), - }) - .build(); + let scan_config = + FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), file_source) + .with_file_groups(vec![FileGroup::new(vec![PartitionedFile::new( + "/path/to/file.parquet".to_string(), + 1024, + )])]) + 
.with_statistics(Statistics { + num_rows: Precision::Inexact(100), + total_byte_size: Precision::Inexact(1024), + column_statistics: Statistics::unknown_column(&Arc::new(Schema::new( + vec![Field::new("col", DataType::Utf8, false)], + ))), + }) + .build(); roundtrip_test(DataSourceExec::from_data_source(scan_config)) } @@ -914,21 +916,22 @@ async fn roundtrip_parquet_exec_with_table_partition_cols() -> Result<()> { vec![wrap_partition_value_in_dict(ScalarValue::Int64(Some(0)))]; let schema = Arc::new(Schema::new(vec![Field::new("col", DataType::Utf8, false)])); - let file_source = Arc::new(ParquetSource::default()); - let scan_config = FileScanConfigBuilder::new( - ObjectStoreUrl::local_filesystem(), - schema, - file_source, - ) - .with_projection_indices(Some(vec![0, 1])) - .with_file_group(FileGroup::new(vec![file_group])) - .with_table_partition_cols(vec![Field::new( - "part".to_string(), - wrap_partition_type_in_dict(DataType::Int16), - false, - )]) - .with_newlines_in_values(false) - .build(); + let table_schema = TableSchema::new( + schema.clone(), + vec![Arc::new(Field::new( + "part".to_string(), + wrap_partition_type_in_dict(DataType::Int16), + false, + ))], + ); + + let file_source = Arc::new(ParquetSource::new(table_schema.clone())); + let scan_config = + FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), file_source) + .with_projection_indices(Some(vec![0, 1])) + .with_file_group(FileGroup::new(vec![file_group])) + .with_newlines_in_values(false) + .build(); roundtrip_test(DataSourceExec::from_data_source(scan_config)) } @@ -942,26 +945,25 @@ fn roundtrip_parquet_exec_with_custom_predicate_expr() -> Result<()> { inner: Arc::new(Column::new("col", 1)), }); - let file_source = - Arc::new(ParquetSource::default().with_predicate(custom_predicate_expr)); + let file_source = Arc::new( + ParquetSource::new(Arc::clone(&file_schema)) + .with_predicate(custom_predicate_expr), + ); - let scan_config = FileScanConfigBuilder::new( - ObjectStoreUrl::local_filesystem(), - file_schema, - file_source, - ) - .with_file_groups(vec![FileGroup::new(vec![PartitionedFile::new( - "/path/to/file.parquet".to_string(), - 1024, - )])]) - .with_statistics(Statistics { - num_rows: Precision::Inexact(100), - total_byte_size: Precision::Inexact(1024), - column_statistics: Statistics::unknown_column(&Arc::new(Schema::new(vec![ - Field::new("col", DataType::Utf8, false), - ]))), - }) - .build(); + let scan_config = + FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), file_source) + .with_file_groups(vec![FileGroup::new(vec![PartitionedFile::new( + "/path/to/file.parquet".to_string(), + 1024, + )])]) + .with_statistics(Statistics { + num_rows: Precision::Inexact(100), + total_byte_size: Precision::Inexact(1024), + column_statistics: Statistics::unknown_column(&Arc::new(Schema::new( + vec![Field::new("col", DataType::Utf8, false)], + ))), + }) + .build(); #[derive(Debug, Clone, Eq)] struct CustomPredicateExpr { @@ -1803,19 +1805,17 @@ async fn roundtrip_projection_source() -> Result<()> { let statistics = Statistics::new_unknown(&schema); - let file_source = ParquetSource::default().with_statistics(statistics.clone()); - let scan_config = FileScanConfigBuilder::new( - ObjectStoreUrl::local_filesystem(), - schema.clone(), - file_source, - ) - .with_file_groups(vec![FileGroup::new(vec![PartitionedFile::new( - "/path/to/file.parquet".to_string(), - 1024, - )])]) - .with_statistics(statistics) - .with_projection_indices(Some(vec![0, 1, 2])) - .build(); + let file_source = + 
ParquetSource::new(Arc::clone(&schema)).with_statistics(statistics.clone()); + let scan_config = + FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), file_source) + .with_file_groups(vec![FileGroup::new(vec![PartitionedFile::new( + "/path/to/file.parquet".to_string(), + 1024, + )])]) + .with_statistics(statistics) + .with_projection_indices(Some(vec![0, 1, 2])) + .build(); let filter = Arc::new( FilterExec::try_new( diff --git a/datafusion/substrait/src/physical_plan/consumer.rs b/datafusion/substrait/src/physical_plan/consumer.rs index 45a19cea80cfc..8ce71acecca3e 100644 --- a/datafusion/substrait/src/physical_plan/consumer.rs +++ b/datafusion/substrait/src/physical_plan/consumer.rs @@ -53,7 +53,6 @@ pub async fn from_substrait_rel( ) -> Result> { let mut base_config_builder; - let source = Arc::new(ParquetSource::default()); match &rel.rel_type { Some(RelType::Read(read)) => { if read.filter.is_some() || read.best_effort_filter.is_some() { @@ -80,9 +79,10 @@ pub async fn from_substrait_rel( .collect::>>() { Ok(fields) => { + let schema = Arc::new(Schema::new(fields)); + let source = Arc::new(ParquetSource::new(Arc::clone(&schema))); base_config_builder = FileScanConfigBuilder::new( ObjectStoreUrl::local_filesystem(), - Arc::new(Schema::new(fields)), source, ); } diff --git a/datafusion/substrait/tests/cases/roundtrip_physical_plan.rs b/datafusion/substrait/tests/cases/roundtrip_physical_plan.rs index 64599465f96f7..bafaffa8285b7 100644 --- a/datafusion/substrait/tests/cases/roundtrip_physical_plan.rs +++ b/datafusion/substrait/tests/cases/roundtrip_physical_plan.rs @@ -35,24 +35,22 @@ use substrait::proto::extensions; #[tokio::test] async fn parquet_exec() -> Result<()> { - let source = Arc::new(ParquetSource::default()); - - let scan_config = FileScanConfigBuilder::new( - ObjectStoreUrl::local_filesystem(), - Arc::new(Schema::empty()), - source, - ) - .with_file_groups(vec![ - FileGroup::new(vec![PartitionedFile::new( - "file://foo/part-0.parquet".to_string(), - 123, - )]), - FileGroup::new(vec![PartitionedFile::new( - "file://foo/part-1.parquet".to_string(), - 123, - )]), - ]) - .build(); + let schema = Arc::new(Schema::empty()); + let source = Arc::new(ParquetSource::new(schema.clone())); + + let scan_config = + FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), source) + .with_file_groups(vec![ + FileGroup::new(vec![PartitionedFile::new( + "file://foo/part-0.parquet".to_string(), + 123, + )]), + FileGroup::new(vec![PartitionedFile::new( + "file://foo/part-1.parquet".to_string(), + 123, + )]), + ]) + .build(); let parquet_exec: Arc = DataSourceExec::from_data_source(scan_config); diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index 0b227000f73d9..f08e2c383a176 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -150,7 +150,7 @@ let projection_exprs = config.projection_exprs; The `FileScanConfigBuilder::with_projection()` method has been deprecated in favor of `with_projection_indices()`: ```diff -let config = FileScanConfigBuilder::new(url, schema, file_source) +let config = FileScanConfigBuilder::new(url, file_source) - .with_projection(Some(vec![0, 2, 3])) + .with_projection_indices(Some(vec![0, 2, 3])) .build(); @@ -190,6 +190,91 @@ TIMEZONE = '+00:00'; This change was made to better support using the default timezone in scalar UDF functions such as `now`, `current_date`, `current_time`, and `to_timestamp` among others. 
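For orientation, the migration notes in the section that follows reduce to the construction pattern sketched below. This is a minimal illustrative example assembled from the APIs shown in the hunks above, not an excerpt from the patch itself; the import paths are best-effort assumptions.

```rust
use std::sync::Arc;
use arrow::datatypes::{DataType, Field, Schema};
use datafusion_datasource::file_scan_config::FileScanConfigBuilder;
use datafusion_datasource::{PartitionedFile, TableSchema};
use datafusion_datasource_parquet::ParquetSource;
use datafusion_execution::object_store::ObjectStoreUrl;

fn main() {
    // Schema of the columns physically stored in the Parquet files
    let file_schema = Arc::new(Schema::new(vec![
        Field::new("id", DataType::Int32, false),
        Field::new("value", DataType::Utf8, false),
    ]));

    // Partition columns now travel with the schema instead of being passed
    // to the builder via `with_table_partition_cols`
    let table_schema = TableSchema::new(
        file_schema,
        vec![Arc::new(Field::new("date", DataType::Utf8, false))],
    );

    // The file source owns the schema; FileScanConfigBuilder no longer takes one
    let source = Arc::new(ParquetSource::new(table_schema));
    let _config =
        FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), source)
            .with_file(PartitionedFile::new(
                "data/date=2024-01-01/file1.parquet",
                1024,
            ))
            .build();
}
```

The `CsvSource`, `JsonSource`, and `AvroSource` constructors follow the same shape; the diff-style examples in the section below show the per-format details.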
+### Refactoring of `FileSource` constructors and `FileScanConfigBuilder` to accept schemas upfront + +The way schemas are passed to file sources and scan configurations has been significantly refactored. File sources now require the schema (including partition columns) to be provided at construction time, and `FileScanConfigBuilder` no longer takes a separate schema parameter. + +**Who is affected:** + +- Users who create `FileScanConfig` or file sources (`ParquetSource`, `CsvSource`, `JsonSource`, `AvroSource`) directly +- Users who implement custom `FileFormat` implementations + +**Key changes:** + +1. **FileSource constructors now require TableSchema**: All built-in file sources now take the schema in their constructor: + + ```diff + - let source = ParquetSource::default(); + + let source = ParquetSource::new(table_schema); + ``` + +2. **FileScanConfigBuilder no longer takes schema as a parameter**: The schema is now passed via the FileSource: + + ```diff + - FileScanConfigBuilder::new(url, schema, source) + + FileScanConfigBuilder::new(url, source) + ``` + +3. **Partition columns are now part of TableSchema**: The `with_table_partition_cols()` method has been removed from `FileScanConfigBuilder`. Partition columns are now passed as part of the `TableSchema` to the FileSource constructor: + + ```diff + + let table_schema = TableSchema::new( + + file_schema, + + vec![Arc::new(Field::new("date", DataType::Utf8, false))], + + ); + + let source = ParquetSource::new(table_schema); + let config = FileScanConfigBuilder::new(url, source) + - .with_table_partition_cols(vec![Field::new("date", DataType::Utf8, false)]) + .with_file(partitioned_file) + .build(); + ``` + +4. **FileFormat::file_source() now takes TableSchema parameter**: Custom `FileFormat` implementations must be updated: + ```diff + impl FileFormat for MyFileFormat { + - fn file_source(&self) -> Arc { + + fn file_source(&self, table_schema: TableSchema) -> Arc { + - Arc::new(MyFileSource::default()) + + Arc::new(MyFileSource::new(table_schema)) + } + } + ``` + +**Migration examples:** + +For Parquet files: + +```diff +- let source = Arc::new(ParquetSource::default()); +- let config = FileScanConfigBuilder::new(url, schema, source) ++ let table_schema = TableSchema::new(schema, vec![]); ++ let source = Arc::new(ParquetSource::new(table_schema)); ++ let config = FileScanConfigBuilder::new(url, source) + .with_file(partitioned_file) + .build(); +``` + +For CSV files with partition columns: + +```diff +- let source = Arc::new(CsvSource::new(true, b',', b'"')); +- let config = FileScanConfigBuilder::new(url, file_schema, source) +- .with_table_partition_cols(vec![Field::new("year", DataType::Int32, false)]) ++ let options = CsvOptions { ++ has_header: Some(true), ++ delimiter: b',', ++ quote: b'"', ++ ..Default::default() ++ }; ++ let table_schema = TableSchema::new( ++ file_schema, ++ vec![Arc::new(Field::new("year", DataType::Int32, false))], ++ ); ++ let source = Arc::new(CsvSource::new(table_schema).with_csv_options(options)); ++ let config = FileScanConfigBuilder::new(url, source) + .build(); +``` + ### Introduction of `TableSchema` and changes to `FileSource::with_schema()` method A new `TableSchema` struct has been introduced in the `datafusion-datasource` crate to better manage table schemas with partition columns. 
This struct helps distinguish between: @@ -1137,7 +1222,7 @@ Pattern in DataFusion `47.0.0`: ```rust # /* comment to avoid running -let config = FileScanConfigBuilder::new(url, schema, Arc::new(file_source)) +let config = FileScanConfigBuilder::new(url, Arc::new(file_source)) .with_statistics(stats) ... .build(); From 49782435f2f6a77121b36da29eed1d9fb4ffbc4a Mon Sep 17 00:00:00 2001 From: theirix Date: Sun, 9 Nov 2025 11:10:21 +0000 Subject: [PATCH 0038/1589] Fix out-of-bounds access in SLT runner (#18562) ## Which issue does this PR close? ## Rationale for this change A small fix for a rare case in SLT runner when it panics instead of printing result. ## What changes are included in this PR? Code change in sqllogictest ## Are these changes tested? Manual test ## Are there any user-facing changes? No --- datafusion/sqllogictest/src/util.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/sqllogictest/src/util.rs b/datafusion/sqllogictest/src/util.rs index 695fe463fa676..2c3bd12d897da 100644 --- a/datafusion/sqllogictest/src/util.rs +++ b/datafusion/sqllogictest/src/util.rs @@ -95,7 +95,7 @@ pub fn df_value_validator( warn!("[{i}] {}", normalized_actual[i]); warn!( "[{i}] {}", - if normalized_expected.len() >= i { + if normalized_expected.len() > i { &normalized_expected[i] } else { "No more results" From a6fab6bbe7ca5a6a1b9a3030c81d432c086b54b7 Mon Sep 17 00:00:00 2001 From: Chen Chongchen Date: Sun, 9 Nov 2025 19:11:24 +0800 Subject: [PATCH 0039/1589] feat: support complex expr for prepared statement argument (#18383) ## Which issue does this PR close? ## Rationale for this change complex expr is not supported in prepared statement argument. ## What changes are included in this PR? simplify arguments of prepared statement first. ## Are these changes tested? UT ## Are there any user-facing changes? No --------- Co-authored-by: Andrew Lamb --- datafusion/core/src/execution/context/mod.rs | 10 ++++++++-- datafusion/sqllogictest/test_files/prepare.slt | 8 +++++--- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index 687779787ab50..c732c2c92f642 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -76,6 +76,7 @@ pub use datafusion_execution::config::SessionConfig; use datafusion_execution::registry::SerializerRegistry; pub use datafusion_execution::TaskContext; pub use datafusion_expr::execution_props::ExecutionProps; +use datafusion_expr::simplify::SimplifyContext; use datafusion_expr::{ expr_rewriter::FunctionRewrite, logical_plan::{DdlStatement, Statement}, @@ -83,6 +84,7 @@ use datafusion_expr::{ Expr, UserDefinedLogicalNode, WindowUDF, }; use datafusion_optimizer::analyzer::type_coercion::TypeCoercion; +use datafusion_optimizer::simplify_expressions::ExprSimplifier; use datafusion_optimizer::Analyzer; use datafusion_optimizer::{AnalyzerRule, OptimizerRule}; use datafusion_session::SessionStore; @@ -1269,14 +1271,18 @@ impl SessionContext { exec_datafusion_err!("Prepared statement '{}' does not exist", name) })?; + let state = self.state.read(); + let context = SimplifyContext::new(state.execution_props()); + let simplifier = ExprSimplifier::new(context); + // Only allow literals as parameters for now. let mut params: Vec = parameters .into_iter() - .map(|e| match e { + .map(|e| match simplifier.simplify(e)? 
{ Expr::Literal(scalar, metadata) => { Ok(ScalarAndMetadata::new(scalar, metadata)) } - _ => not_impl_err!("Unsupported parameter type: {}", e), + e => not_impl_err!("Unsupported parameter type: {e}"), }) .collect::>()?; diff --git a/datafusion/sqllogictest/test_files/prepare.slt b/datafusion/sqllogictest/test_files/prepare.slt index 486baca6f54d6..650527cef620a 100644 --- a/datafusion/sqllogictest/test_files/prepare.slt +++ b/datafusion/sqllogictest/test_files/prepare.slt @@ -204,9 +204,11 @@ EXECUTE my_plan6(20.0); statement error Cast error: Cannot cast string 'foo' to value of Int32 type EXECUTE my_plan6('foo'); -# TODO: support non-literal expressions -statement error Unsupported parameter type -EXECUTE my_plan6(10 + 20); +# support non-literal expressions +query II +EXECUTE my_plan6(10 + 10); +---- +1 20 statement ok DEALLOCATE my_plan6; From 0e6be30454f9b74bf6839f3fe93ad98b9fba8530 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Sun, 9 Nov 2025 04:55:11 -0800 Subject: [PATCH 0040/1589] feat: Implement `SessionState::create_logical_expr_from_sql_expr` (#18423) ## Which issue does this PR close? - Closes #18278 ## Rationale for this change Convenience method for when parsing has already been done, and we want to start from a an expr object instead of SQL string. ## What changes are included in this PR? ## Are these changes tested? Added test ## Are there any user-facing changes? Yes, new public api. --- .../core/src/execution/session_state.rs | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index c15b7eae08432..d7a66db28ac47 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -547,6 +547,16 @@ impl SessionState { let sql_expr = self.sql_to_expr_with_alias(sql, &dialect)?; + self.create_logical_expr_from_sql_expr(sql_expr, df_schema) + } + + /// Creates a datafusion style AST [`Expr`] from a SQL expression. 
+ #[cfg(feature = "sql")] + pub fn create_logical_expr_from_sql_expr( + &self, + sql_expr: SQLExprWithAlias, + df_schema: &DFSchema, + ) -> datafusion_common::Result { let provider = SessionContextProvider { state: self, tables: HashMap::new(), @@ -2097,6 +2107,36 @@ mod tests { assert!(sql_to_expr(&state).is_err()) } + #[test] + #[cfg(feature = "sql")] + fn test_create_logical_expr_from_sql_expr() { + let state = SessionStateBuilder::new().with_default_features().build(); + + let provider = SessionContextProvider { + state: &state, + tables: HashMap::new(), + }; + + let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]); + let df_schema = DFSchema::try_from(schema).unwrap(); + let dialect = state.config.options().sql_parser.dialect; + let query = SqlToRel::new_with_options(&provider, state.get_parser_options()); + + for sql in ["[1,2,3]", "a > 10", "SUM(a)"] { + let sql_expr = state.sql_to_expr(sql, &dialect).unwrap(); + let from_str = query + .sql_to_expr(sql_expr, &df_schema, &mut PlannerContext::new()) + .unwrap(); + + let sql_expr_with_alias = + state.sql_to_expr_with_alias(sql, &dialect).unwrap(); + let from_expr = state + .create_logical_expr_from_sql_expr(sql_expr_with_alias, &df_schema) + .unwrap(); + assert_eq!(from_str, from_expr); + } + } + #[test] fn test_from_existing() -> Result<()> { fn employee_batch() -> RecordBatch { From 1d8bc9b0e0cda4e7baee3e4ce032ad9c3f4d627b Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sun, 9 Nov 2025 07:56:52 -0500 Subject: [PATCH 0041/1589] [main] Update version to 51.0.0, add Changelog (#18551) (#18565) ## Which issue does this PR close? - part of https://github.com/apache/datafusion/issues/17558 - port of https://github.com/apache/datafusion/pull/18551 ## Rationale for this change Let's update the version numbers! ## What changes are included in this PR? - forward port the change from https://github.com/apache/datafusion/pull/18551 to main ## Are these changes tested? by CI ## Are there any user-facing changes? 
New version --- Cargo.lock | 84 ++-- Cargo.toml | 76 ++-- dev/changelog/51.0.0.md | 713 ++++++++++++++++++++++++++++++ docs/source/user-guide/configs.md | 2 +- 4 files changed, 794 insertions(+), 81 deletions(-) create mode 100644 dev/changelog/51.0.0.md diff --git a/Cargo.lock b/Cargo.lock index f500265108ff5..d712eecfcc72e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1832,7 +1832,7 @@ dependencies = [ [[package]] name = "datafusion" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "arrow-schema", @@ -1904,7 +1904,7 @@ dependencies = [ [[package]] name = "datafusion-benchmarks" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "datafusion", @@ -1929,7 +1929,7 @@ dependencies = [ [[package]] name = "datafusion-catalog" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "async-trait", @@ -1952,7 +1952,7 @@ dependencies = [ [[package]] name = "datafusion-catalog-listing" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "async-trait", @@ -1975,7 +1975,7 @@ dependencies = [ [[package]] name = "datafusion-cli" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "async-trait", @@ -2007,7 +2007,7 @@ dependencies = [ [[package]] name = "datafusion-common" -version = "50.3.0" +version = "51.0.0" dependencies = [ "ahash 0.8.12", "apache-avro", @@ -2034,7 +2034,7 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" -version = "50.3.0" +version = "51.0.0" dependencies = [ "futures", "log", @@ -2043,7 +2043,7 @@ dependencies = [ [[package]] name = "datafusion-datasource" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "async-compression", @@ -2078,7 +2078,7 @@ dependencies = [ [[package]] name = "datafusion-datasource-arrow" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "arrow-ipc", @@ -2101,7 +2101,7 @@ dependencies = [ [[package]] name = "datafusion-datasource-avro" -version = "50.3.0" +version = "51.0.0" dependencies = [ "apache-avro", "arrow", @@ -2120,7 +2120,7 @@ dependencies = [ [[package]] name = "datafusion-datasource-csv" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "async-trait", @@ -2141,7 +2141,7 @@ dependencies = [ [[package]] name = "datafusion-datasource-json" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "async-trait", @@ -2161,7 +2161,7 @@ dependencies = [ [[package]] name = "datafusion-datasource-parquet" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "async-trait", @@ -2190,11 +2190,11 @@ dependencies = [ [[package]] name = "datafusion-doc" -version = "50.3.0" +version = "51.0.0" [[package]] name = "datafusion-examples" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "arrow-flight", @@ -2228,7 +2228,7 @@ dependencies = [ [[package]] name = "datafusion-execution" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "async-trait", @@ -2249,7 +2249,7 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "async-trait", @@ -2273,7 +2273,7 @@ dependencies = [ [[package]] name = "datafusion-expr-common" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "datafusion-common", @@ -2284,7 +2284,7 @@ dependencies = [ [[package]] name = "datafusion-ffi" -version = "50.3.0" +version = "51.0.0" dependencies = [ "abi_stable", "arrow", @@ -2306,7 +2306,7 @@ dependencies = [ [[package]] name = "datafusion-functions" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "arrow-buffer", @@ 
-2338,7 +2338,7 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" -version = "50.3.0" +version = "51.0.0" dependencies = [ "ahash 0.8.12", "arrow", @@ -2359,7 +2359,7 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate-common" -version = "50.3.0" +version = "51.0.0" dependencies = [ "ahash 0.8.12", "arrow", @@ -2372,7 +2372,7 @@ dependencies = [ [[package]] name = "datafusion-functions-nested" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "arrow-ord", @@ -2395,7 +2395,7 @@ dependencies = [ [[package]] name = "datafusion-functions-table" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "async-trait", @@ -2409,7 +2409,7 @@ dependencies = [ [[package]] name = "datafusion-functions-window" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "datafusion-common", @@ -2425,7 +2425,7 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" -version = "50.3.0" +version = "51.0.0" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", @@ -2433,7 +2433,7 @@ dependencies = [ [[package]] name = "datafusion-macros" -version = "50.3.0" +version = "51.0.0" dependencies = [ "datafusion-doc", "quote", @@ -2442,7 +2442,7 @@ dependencies = [ [[package]] name = "datafusion-optimizer" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "async-trait", @@ -2469,7 +2469,7 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "50.3.0" +version = "51.0.0" dependencies = [ "ahash 0.8.12", "arrow", @@ -2494,7 +2494,7 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-adapter" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "datafusion-common", @@ -2507,7 +2507,7 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" -version = "50.3.0" +version = "51.0.0" dependencies = [ "ahash 0.8.12", "arrow", @@ -2519,7 +2519,7 @@ dependencies = [ [[package]] name = "datafusion-physical-optimizer" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "datafusion-common", @@ -2539,7 +2539,7 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" -version = "50.3.0" +version = "51.0.0" dependencies = [ "ahash 0.8.12", "arrow", @@ -2575,7 +2575,7 @@ dependencies = [ [[package]] name = "datafusion-proto" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "chrono", @@ -2611,7 +2611,7 @@ dependencies = [ [[package]] name = "datafusion-proto-common" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "datafusion-common", @@ -2623,7 +2623,7 @@ dependencies = [ [[package]] name = "datafusion-pruning" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "datafusion-common", @@ -2641,7 +2641,7 @@ dependencies = [ [[package]] name = "datafusion-session" -version = "50.3.0" +version = "51.0.0" dependencies = [ "async-trait", "datafusion-common", @@ -2653,7 +2653,7 @@ dependencies = [ [[package]] name = "datafusion-spark" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "bigdecimal", @@ -2673,7 +2673,7 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "bigdecimal", @@ -2699,7 +2699,7 @@ dependencies = [ [[package]] name = "datafusion-sqllogictest" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "async-trait", @@ -2733,7 +2733,7 @@ dependencies = [ [[package]] name = "datafusion-substrait" -version = "50.3.0" +version = "51.0.0" dependencies = [ "async-recursion", 
"async-trait", @@ -2755,7 +2755,7 @@ dependencies = [ [[package]] name = "datafusion-wasmtest" -version = "50.3.0" +version = "51.0.0" dependencies = [ "chrono", "console_error_panic_hook", diff --git a/Cargo.toml b/Cargo.toml index f15929b4c2b00..36198430e40b1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -79,7 +79,7 @@ repository = "https://github.com/apache/datafusion" # Define Minimum Supported Rust Version (MSRV) rust-version = "1.88.0" # Define DataFusion version -version = "50.3.0" +version = "51.0.0" [workspace.dependencies] # We turn off default-features for some dependencies here so the workspaces which inherit them can @@ -111,43 +111,43 @@ chrono = { version = "0.4.42", default-features = false } criterion = "0.7" ctor = "0.6.1" dashmap = "6.0.1" -datafusion = { path = "datafusion/core", version = "50.3.0", default-features = false } -datafusion-catalog = { path = "datafusion/catalog", version = "50.3.0" } -datafusion-catalog-listing = { path = "datafusion/catalog-listing", version = "50.3.0" } -datafusion-common = { path = "datafusion/common", version = "50.3.0", default-features = false } -datafusion-common-runtime = { path = "datafusion/common-runtime", version = "50.3.0" } -datafusion-datasource = { path = "datafusion/datasource", version = "50.3.0", default-features = false } -datafusion-datasource-arrow = { path = "datafusion/datasource-arrow", version = "50.3.0", default-features = false } -datafusion-datasource-avro = { path = "datafusion/datasource-avro", version = "50.3.0", default-features = false } -datafusion-datasource-csv = { path = "datafusion/datasource-csv", version = "50.3.0", default-features = false } -datafusion-datasource-json = { path = "datafusion/datasource-json", version = "50.3.0", default-features = false } -datafusion-datasource-parquet = { path = "datafusion/datasource-parquet", version = "50.3.0", default-features = false } -datafusion-doc = { path = "datafusion/doc", version = "50.3.0" } -datafusion-execution = { path = "datafusion/execution", version = "50.3.0", default-features = false } -datafusion-expr = { path = "datafusion/expr", version = "50.3.0", default-features = false } -datafusion-expr-common = { path = "datafusion/expr-common", version = "50.3.0" } -datafusion-ffi = { path = "datafusion/ffi", version = "50.3.0" } -datafusion-functions = { path = "datafusion/functions", version = "50.3.0" } -datafusion-functions-aggregate = { path = "datafusion/functions-aggregate", version = "50.3.0" } -datafusion-functions-aggregate-common = { path = "datafusion/functions-aggregate-common", version = "50.3.0" } -datafusion-functions-nested = { path = "datafusion/functions-nested", version = "50.3.0", default-features = false } -datafusion-functions-table = { path = "datafusion/functions-table", version = "50.3.0" } -datafusion-functions-window = { path = "datafusion/functions-window", version = "50.3.0" } -datafusion-functions-window-common = { path = "datafusion/functions-window-common", version = "50.3.0" } -datafusion-macros = { path = "datafusion/macros", version = "50.3.0" } -datafusion-optimizer = { path = "datafusion/optimizer", version = "50.3.0", default-features = false } -datafusion-physical-expr = { path = "datafusion/physical-expr", version = "50.3.0", default-features = false } -datafusion-physical-expr-adapter = { path = "datafusion/physical-expr-adapter", version = "50.3.0", default-features = false } -datafusion-physical-expr-common = { path = "datafusion/physical-expr-common", version = "50.3.0", default-features = false } 
-datafusion-physical-optimizer = { path = "datafusion/physical-optimizer", version = "50.3.0" } -datafusion-physical-plan = { path = "datafusion/physical-plan", version = "50.3.0" } -datafusion-proto = { path = "datafusion/proto", version = "50.3.0" } -datafusion-proto-common = { path = "datafusion/proto-common", version = "50.3.0" } -datafusion-pruning = { path = "datafusion/pruning", version = "50.3.0" } -datafusion-session = { path = "datafusion/session", version = "50.3.0" } -datafusion-spark = { path = "datafusion/spark", version = "50.3.0" } -datafusion-sql = { path = "datafusion/sql", version = "50.3.0" } -datafusion-substrait = { path = "datafusion/substrait", version = "50.3.0" } +datafusion = { path = "datafusion/core", version = "51.0.0", default-features = false } +datafusion-catalog = { path = "datafusion/catalog", version = "51.0.0" } +datafusion-catalog-listing = { path = "datafusion/catalog-listing", version = "51.0.0" } +datafusion-common = { path = "datafusion/common", version = "51.0.0", default-features = false } +datafusion-common-runtime = { path = "datafusion/common-runtime", version = "51.0.0" } +datafusion-datasource = { path = "datafusion/datasource", version = "51.0.0", default-features = false } +datafusion-datasource-arrow = { path = "datafusion/datasource-arrow", version = "51.0.0", default-features = false } +datafusion-datasource-avro = { path = "datafusion/datasource-avro", version = "51.0.0", default-features = false } +datafusion-datasource-csv = { path = "datafusion/datasource-csv", version = "51.0.0", default-features = false } +datafusion-datasource-json = { path = "datafusion/datasource-json", version = "51.0.0", default-features = false } +datafusion-datasource-parquet = { path = "datafusion/datasource-parquet", version = "51.0.0", default-features = false } +datafusion-doc = { path = "datafusion/doc", version = "51.0.0" } +datafusion-execution = { path = "datafusion/execution", version = "51.0.0", default-features = false } +datafusion-expr = { path = "datafusion/expr", version = "51.0.0", default-features = false } +datafusion-expr-common = { path = "datafusion/expr-common", version = "51.0.0" } +datafusion-ffi = { path = "datafusion/ffi", version = "51.0.0" } +datafusion-functions = { path = "datafusion/functions", version = "51.0.0" } +datafusion-functions-aggregate = { path = "datafusion/functions-aggregate", version = "51.0.0" } +datafusion-functions-aggregate-common = { path = "datafusion/functions-aggregate-common", version = "51.0.0" } +datafusion-functions-nested = { path = "datafusion/functions-nested", version = "51.0.0", default-features = false } +datafusion-functions-table = { path = "datafusion/functions-table", version = "51.0.0" } +datafusion-functions-window = { path = "datafusion/functions-window", version = "51.0.0" } +datafusion-functions-window-common = { path = "datafusion/functions-window-common", version = "51.0.0" } +datafusion-macros = { path = "datafusion/macros", version = "51.0.0" } +datafusion-optimizer = { path = "datafusion/optimizer", version = "51.0.0", default-features = false } +datafusion-physical-expr = { path = "datafusion/physical-expr", version = "51.0.0", default-features = false } +datafusion-physical-expr-adapter = { path = "datafusion/physical-expr-adapter", version = "51.0.0", default-features = false } +datafusion-physical-expr-common = { path = "datafusion/physical-expr-common", version = "51.0.0", default-features = false } +datafusion-physical-optimizer = { path = "datafusion/physical-optimizer", 
version = "51.0.0" } +datafusion-physical-plan = { path = "datafusion/physical-plan", version = "51.0.0" } +datafusion-proto = { path = "datafusion/proto", version = "51.0.0" } +datafusion-proto-common = { path = "datafusion/proto-common", version = "51.0.0" } +datafusion-pruning = { path = "datafusion/pruning", version = "51.0.0" } +datafusion-session = { path = "datafusion/session", version = "51.0.0" } +datafusion-spark = { path = "datafusion/spark", version = "51.0.0" } +datafusion-sql = { path = "datafusion/sql", version = "51.0.0" } +datafusion-substrait = { path = "datafusion/substrait", version = "51.0.0" } doc-comment = "0.3" env_logger = "0.11" diff --git a/dev/changelog/51.0.0.md b/dev/changelog/51.0.0.md new file mode 100644 index 0000000000000..7c0b91440a0df --- /dev/null +++ b/dev/changelog/51.0.0.md @@ -0,0 +1,713 @@ + + +# Apache DataFusion 51.0.0 Changelog + +This release consists of 531 commits from 128 contributors. See credits at the end of this changelog for more information. + +See the [upgrade guide](https://datafusion.apache.org/library-user-guide/upgrading.html) for information on how to upgrade from previous versions. + +**Breaking changes:** + +- Introduce `TypeSignatureClass::Binary` to allow accepting arbitrarily sized `FixedSizeBinary` arguments [#17531](https://github.com/apache/datafusion/pull/17531) (Jefffrey) +- feat: change `datafusion-proto` to use `TaskContext` rather than`SessionContext` for physical plan serialization [#17601](https://github.com/apache/datafusion/pull/17601) (milenkovicm) +- chore: refactor usage of `reassign_predicate_columns` [#17703](https://github.com/apache/datafusion/pull/17703) (rkrishn7) +- fix: correct edge case where null haystack returns false instead of null [#17818](https://github.com/apache/datafusion/pull/17818) (Jefffrey) +- clean up duplicate information in FileOpener trait [#17956](https://github.com/apache/datafusion/pull/17956) (adriangb) +- refactor : deprecate `ParquetSource::predicate()` and merge into `FileSource::filter()` [#17971](https://github.com/apache/datafusion/pull/17971) (getChan) +- feat: convert_array_to_scalar_vec respects null elements [#17891](https://github.com/apache/datafusion/pull/17891) (vegarsti) +- make Union::try_new pub [#18125](https://github.com/apache/datafusion/pull/18125) (leoyvens) +- refactor: remove unused `type_coercion/aggregate.rs` functions [#18091](https://github.com/apache/datafusion/pull/18091) (Jefffrey) +- refactor: remove core crate from datafusion-proto [#18123](https://github.com/apache/datafusion/pull/18123) (timsaucer) +- Use TableSchema in FileScanConfig [#18231](https://github.com/apache/datafusion/pull/18231) (adriangb) +- Enable placeholders with extension types [#17986](https://github.com/apache/datafusion/pull/17986) (paleolimbot) +- Implement `DESCRIBE SELECT` to show schema rather than `EXPLAIN` plan [#18238](https://github.com/apache/datafusion/pull/18238) (djanderson) +- Push partition_statistics into DataSource [#18233](https://github.com/apache/datafusion/pull/18233) (adriangb) +- Let `FileScanConfig` own a list of `ProjectionExpr`s [#18253](https://github.com/apache/datafusion/pull/18253) (friendlymatthew) +- Introduce `expr_fields` to `AccumulatorArgs` to hold input argument fields [#18100](https://github.com/apache/datafusion/pull/18100) (Jefffrey) +- Rename `is_ordered_set_aggregate` to `supports_within_group_clause` for UDAFs [#18397](https://github.com/apache/datafusion/pull/18397) (Jefffrey) +- Move generate_series projection logic into 
LazyMemoryStream [#18373](https://github.com/apache/datafusion/pull/18373) (mkleen) + +**Performance related:** + +- Improve `Hash` and `Ord` speed for `dyn LogicalType` [#17437](https://github.com/apache/datafusion/pull/17437) (findepi) +- Faster `&&String::to_string` [#17583](https://github.com/apache/datafusion/pull/17583) (findepi) +- perf: Simplify CASE for any WHEN TRUE [#17602](https://github.com/apache/datafusion/pull/17602) (petern48) +- perf: Improve the performance of WINDOW functions with many partitions [#17528](https://github.com/apache/datafusion/pull/17528) (nuno-faria) +- Avoid redundant Schema clones [#17643](https://github.com/apache/datafusion/pull/17643) (findepi) +- Prevent exponential planning time for Window functions - v2 [#17684](https://github.com/apache/datafusion/pull/17684) (berkaysynnada) +- Add case expr simplifiers for literal comparisons [#17743](https://github.com/apache/datafusion/pull/17743) (jackkleeman) +- Enable Projection Pushdown Optimization for Recursive CTEs [#16696](https://github.com/apache/datafusion/pull/16696) (kosiew) +- perf: Optimize CASE for any WHEN false [#17835](https://github.com/apache/datafusion/pull/17835) (petern48) +- feat: Simplify `NOT(IN ..)` to `NOT IN` and `NOT (EXISTS ..)` to `NOT EXISTS` [#17848](https://github.com/apache/datafusion/pull/17848) (Tpt) +- perf: Faster `string_agg()` aggregate function (1000x speed for no DISTINCT and ORDER case) [#17837](https://github.com/apache/datafusion/pull/17837) (2010YOUY01) +- optimizer: allow projection pushdown through aliased recursive CTE references [#17875](https://github.com/apache/datafusion/pull/17875) (kosiew) +- perf: Implement boolean group values [#17726](https://github.com/apache/datafusion/pull/17726) (ashdnazg) +- #17838 Rewrite `regexp_like` calls as `~` and `*~` operator expressions when possible [#17839](https://github.com/apache/datafusion/pull/17839) (pepijnve) +- perf: add to `aggregate_vectorized` bench benchmark for `PrimitiveGroupValueBuilder` as well [#17930](https://github.com/apache/datafusion/pull/17930) (rluvaton) +- #17972 Restore case expr/expr optimisation while ensuring lazy evaluation [#17973](https://github.com/apache/datafusion/pull/17973) (pepijnve) +- chore: use `NullBuffer::union` for Spark `concat` [#18087](https://github.com/apache/datafusion/pull/18087) (comphead) +- Short circuit complex case evaluation modes as soon as possible [#17898](https://github.com/apache/datafusion/pull/17898) (pepijnve) +- perf: Fix NLJ slow join with condition `array_has` [#18161](https://github.com/apache/datafusion/pull/18161) (2010YOUY01) +- perf: improve `ScalarValue::to_array_of_size` for Boolean and some null values [#18180](https://github.com/apache/datafusion/pull/18180) (rluvaton) +- Allow filter pushdown through AggregateExec [#18404](https://github.com/apache/datafusion/pull/18404) (LiaCastaneda) +- Avoid scatter operation in `ExpressionOrExpression` case evaluation method [#18444](https://github.com/apache/datafusion/pull/18444) (pepijnve) + +**Implemented enhancements:** + +- feat: Implement `DFSchema.print_schema_tree()` method [#17459](https://github.com/apache/datafusion/pull/17459) (comphead) +- feat(spark): implement Spark `length` function [#17475](https://github.com/apache/datafusion/pull/17475) (wForget) +- feat: Add binary to `join_fuzz` testing [#17497](https://github.com/apache/datafusion/pull/17497) (jonathanc-n) +- feat: Support log for Decimal128 and Decimal256 [#17023](https://github.com/apache/datafusion/pull/17023) (theirix) +- 
feat(spark): implement Spark bitwise function shiftleft/shiftright/shiftrightunsighed [#17013](https://github.com/apache/datafusion/pull/17013) (chenkovsky) +- feat: Ensure explain format in config is valid [#17549](https://github.com/apache/datafusion/pull/17549) (Weijun-H) +- feat: Simplify CASE WHEN true THEN expr to expr [#17450](https://github.com/apache/datafusion/pull/17450) (EeshanBembi) +- feat: add `sql` feature to make sql planning optional [#17332](https://github.com/apache/datafusion/pull/17332) (timsaucer) +- feat: Add `OR REPLACE` to creating external tables [#17580](https://github.com/apache/datafusion/pull/17580) (jonathanc-n) +- feat(substrait): add support for RightAnti and RightSemi join types [#17604](https://github.com/apache/datafusion/pull/17604) (bvolpato) +- feat(small): Display `NullEquality` in join executor's `EXPLAIN` output [#17664](https://github.com/apache/datafusion/pull/17664) (2010YOUY01) +- feat(substrait): add time literal support [#17655](https://github.com/apache/datafusion/pull/17655) (bvolpato) +- feat(spark): implement Spark `make_interval` function [#17424](https://github.com/apache/datafusion/pull/17424) (davidlghellin) +- feat: expose `udafs` and `udwfs` methods on `FunctionRegistry` [#17650](https://github.com/apache/datafusion/pull/17650) (milenkovicm) +- feat: Support Seconds and Milliseconds literals in substrait [#17707](https://github.com/apache/datafusion/pull/17707) (petern48) +- feat: support for null, date, and timestamp types in approx_distinct [#17618](https://github.com/apache/datafusion/pull/17618) (killme2008) +- feat: support `Utf8View` for more args of `regexp_replace` [#17195](https://github.com/apache/datafusion/pull/17195) (mbutrovich) +- feat(spark): implement Spark `map` function `map_from_arrays` [#17456](https://github.com/apache/datafusion/pull/17456) (SparkApplicationMaster) +- feat: Display window function's alias name in output column [#17788](https://github.com/apache/datafusion/pull/17788) (devampatel03) +- feat(spark): implement Spark `make_dt_interval` function [#17728](https://github.com/apache/datafusion/pull/17728) (davidlghellin) +- feat: support multi-threaded writing of Parquet files with modular encryption [#16738](https://github.com/apache/datafusion/pull/16738) (rok) +- feat(spark): implement Spark `map` function `map_from_entries` [#17779](https://github.com/apache/datafusion/pull/17779) (SparkApplicationMaster) +- feat: Add Hash Join benchmarks [#17636](https://github.com/apache/datafusion/pull/17636) (jonathanc-n) +- feat: Support swap for `RightMark` Join [#17651](https://github.com/apache/datafusion/pull/17651) (jonathanc-n) +- feat: support spark udf format_string [#17561](https://github.com/apache/datafusion/pull/17561) (chenkovsky) +- feat(spark): implement Spark `try_parse_url` function [#17485](https://github.com/apache/datafusion/pull/17485) (rafafrdz) +- feat: Support reading CSV files with inconsistent column counts [#17553](https://github.com/apache/datafusion/pull/17553) (EeshanBembi) +- feat: Adds Instrumented Object Store Registry to datafusion-cli [#17953](https://github.com/apache/datafusion/pull/17953) (BlakeOrth) +- feat: add cargo-machete in CI [#18030](https://github.com/apache/datafusion/pull/18030) (Weijun-H) +- feat(spark): implement Spark `elt` function [#17729](https://github.com/apache/datafusion/pull/17729) (davidlghellin) +- feat: support Spark `concat` string function [#18063](https://github.com/apache/datafusion/pull/18063) (comphead) +- feat: support `null_treatment`, 
`distinct`, and `filter` for window functions in proto [#18024](https://github.com/apache/datafusion/pull/18024) (dqkqd) +- feat: Add percentile_cont aggregate function [#17988](https://github.com/apache/datafusion/pull/17988) (adriangb) +- feat: spark udf array shuffle [#17674](https://github.com/apache/datafusion/pull/17674) (chenkovsky) +- feat: Support configurable `EXPLAIN ANALYZE` detail level [#18098](https://github.com/apache/datafusion/pull/18098) (2010YOUY01) +- feat: add fp16 support to Substrait [#18086](https://github.com/apache/datafusion/pull/18086) (westonpace) +- feat: `ClassicJoin` for PWMJ [#17482](https://github.com/apache/datafusion/pull/17482) (jonathanc-n) +- feat(docs): display compatible logo for dark mode [#18197](https://github.com/apache/datafusion/pull/18197) (foskey51) +- feat: Add `deregister_object_store` [#17999](https://github.com/apache/datafusion/pull/17999) (jonathanc-n) +- feat: Add existence join to NestedLoopJoin benchmarks [#18005](https://github.com/apache/datafusion/pull/18005) (jonathanc-n) +- feat(small): Set 'summary' level metrics for `DataSourceExec` with parquet source [#18196](https://github.com/apache/datafusion/pull/18196) (2010YOUY01) +- feat: be indifferent to padding when decoding base64 [#18264](https://github.com/apache/datafusion/pull/18264) (colinmarc) +- feat: Add `output_bytes` to baseline metrics [#18268](https://github.com/apache/datafusion/pull/18268) (2010YOUY01) +- feat: Introduce `PruningMetrics` and use it in parquet file pruning metric [#18297](https://github.com/apache/datafusion/pull/18297) (2010YOUY01) +- feat: Improve metrics for aggregate streams. [#18325](https://github.com/apache/datafusion/pull/18325) (EmilyMatt) +- feat: allow pushdown of dynamic filters having partition cols [#18172](https://github.com/apache/datafusion/pull/18172) (feniljain) +- feat: support temporary views in DataFrameTableProvider [#18158](https://github.com/apache/datafusion/pull/18158) (r1b) +- feat: Better parquet row-group/page pruning metrics display [#18321](https://github.com/apache/datafusion/pull/18321) (2010YOUY01) +- feat: Add Hash trait to StatsType enum [#18382](https://github.com/apache/datafusion/pull/18382) (rluvaton) +- feat: support get_field for map literal [#18371](https://github.com/apache/datafusion/pull/18371) (chenkovsky) +- feat(docs): enable navbar [#18324](https://github.com/apache/datafusion/pull/18324) (foskey51) +- feat: Add `selectivity` metrics to `FilterExec` [#18406](https://github.com/apache/datafusion/pull/18406) (2010YOUY01) +- feat: Add `reduction_factor` metric to `AggregateExec` for EXPLAIN ANALYZE [#18455](https://github.com/apache/datafusion/pull/18455) (petern48) +- feat: support named arguments for aggregate and window udfs [#18389](https://github.com/apache/datafusion/pull/18389) (bubulalabu) +- feat: Add selectivity metric to NestedLoopJoinExec for EXPLAIN ANALYZE [#18481](https://github.com/apache/datafusion/pull/18481) (petern48) +- feat: Enhance `array_slice` functionality to support `ListView` and `LargeListView` types [#18432](https://github.com/apache/datafusion/pull/18432) (Weijun-H) + +**Fixed bugs:** + +- fix: lazy evaluation for coalesce [#17357](https://github.com/apache/datafusion/pull/17357) (chenkovsky) +- fix: Implement AggregateUDFImpl::reverse_expr for StringAgg [#17165](https://github.com/apache/datafusion/pull/17165) (nuno-faria) +- fix: Support aggregate expressions in `QUALIFY` [#17313](https://github.com/apache/datafusion/pull/17313) (rkrishn7) +- fix: synchronize partition 
bounds reporting in HashJoin [#17452](https://github.com/apache/datafusion/pull/17452) (rkrishn7) +- fix: correct typos in `CONTRIBUTING.md` [#17507](https://github.com/apache/datafusion/pull/17507) (Weijun-H) +- fix: Add AWS environment variable checks for S3 tests [#17519](https://github.com/apache/datafusion/pull/17519) (Weijun-H) +- fix: Ensure the CachedParquetFileReader respects the metadata prefetch hint [#17302](https://github.com/apache/datafusion/pull/17302) (nuno-faria) +- fix: prevent UnionExec panic with empty inputs [#17449](https://github.com/apache/datafusion/pull/17449) (EeshanBembi) +- fix: ignore non-existent columns when adding filter equivalence info in `FileScanConfig` [#17546](https://github.com/apache/datafusion/pull/17546) (rkrishn7) +- fix: Prevent duplicate expressions in DynamicPhysicalExpr [#17551](https://github.com/apache/datafusion/pull/17551) (UBarney) +- fix: `SortExec` `TopK` OOM [#17622](https://github.com/apache/datafusion/pull/17622) (nuno-faria) +- fix: Change `OuterReferenceColumn` to contain the entire outer field to prevent metadata loss [#17524](https://github.com/apache/datafusion/pull/17524) (Kontinuation) +- fix: Preserves field metadata when creating logical plan for VALUES expression [#17525](https://github.com/apache/datafusion/pull/17525) (Kontinuation) +- fix: Ignore governance doc from typos [#17678](https://github.com/apache/datafusion/pull/17678) (rkrishn7) +- fix: null padding for `array_reverse` on `FixedSizeList` [#17673](https://github.com/apache/datafusion/pull/17673) (chenkovsky) +- fix: correct statistics for `NestedLoopJoinExec` [#17680](https://github.com/apache/datafusion/pull/17680) (duongcongtoai) +- fix: Partial AggregateMode will generate duplicate field names which will fail DFSchema construct [#17706](https://github.com/apache/datafusion/pull/17706) (zhuqi-lucas) +- fix: Remove parquet encryption feature from root deps [#17700](https://github.com/apache/datafusion/pull/17700) (Vyquos) +- fix: Remove datafusion-macros's dependency on datafusion-expr [#17688](https://github.com/apache/datafusion/pull/17688) (yutannihilation) +- fix: Filter out nulls properly in approx_percentile_cont_with_weight [#17780](https://github.com/apache/datafusion/pull/17780) (Jefffrey) +- fix: ignore `DataType::Null` in possible types during csv type inference [#17796](https://github.com/apache/datafusion/pull/17796) (dqkqd) +- fix: `ParquetSource` - `with_predicate()` don't have to reset metrics [#17858](https://github.com/apache/datafusion/pull/17858) (2010YOUY01) +- fix: optimizer `common_sub_expression_eliminate` fails in a window function [#17852](https://github.com/apache/datafusion/pull/17852) (dqkqd) +- fix: fix failing test compilation on main [#17955](https://github.com/apache/datafusion/pull/17955) (Jefffrey) +- fix: update `PrimitiveGroupValueBuilder` to match NaN correctly in scalar `equal_to` [#17979](https://github.com/apache/datafusion/pull/17979) (rluvaton) +- fix: Add overflow checks to SparkDateAdd/Sub to avoid panics [#18013](https://github.com/apache/datafusion/pull/18013) (andygrove) +- fix: Ensure ListingTable partitions are pruned when filters are not used [#17958](https://github.com/apache/datafusion/pull/17958) (peasee) +- fix: Improve null handling in array_to_string function [#18076](https://github.com/apache/datafusion/pull/18076) (Weijun-H) +- fix: Re-bump latest datafusion-testing module so extended tests succeed [#18110](https://github.com/apache/datafusion/pull/18110) (Jefffrey) +- fix: window unparsing 
[#17367](https://github.com/apache/datafusion/pull/17367) (chenkovsky) +- fix: Add dictionary coercion support for numeric comparison operations [#18099](https://github.com/apache/datafusion/pull/18099) (ahmed-mez) +- fix(substrait): schema errors for Aggregates with no groupings [#17909](https://github.com/apache/datafusion/pull/17909) (vbarua) +- fix: `array_distinct` inner nullability causing type mismatch [#18104](https://github.com/apache/datafusion/pull/18104) (dqkqd) +- fix: improve document ui [#18157](https://github.com/apache/datafusion/pull/18157) (getChan) +- fix(docs): resolve extra outline on tables [#18193](https://github.com/apache/datafusion/pull/18193) (foskey51) +- fix: Use dynamic timezone in now() function for accurate timestamp [#18017](https://github.com/apache/datafusion/pull/18017) (Weijun-H) +- fix: UnnestExec preserves relevant equivalence properties of input [#16985](https://github.com/apache/datafusion/pull/16985) (vegarsti) +- fix: wrong simplification for >= >, <= < [#18222](https://github.com/apache/datafusion/pull/18222) (chenkovsky) +- fix: only fall back to listing prefixes on 404 errors [#18263](https://github.com/apache/datafusion/pull/18263) (colinmarc) +- fix: Support Dictionary[Int32, Binary] for bitmap count spark function [#18273](https://github.com/apache/datafusion/pull/18273) (kazantsev-maksim) +- fix: support float16 for `abs()` [#18304](https://github.com/apache/datafusion/pull/18304) (Jefffrey) +- fix: Add WITH ORDER display in information_schema.views [#18282](https://github.com/apache/datafusion/pull/18282) (gene-bordegaray) +- fix: correct date_trunc for times before the epoch [#18356](https://github.com/apache/datafusion/pull/18356) (mhilton) +- fix: Preserve percent-encoding in `PartitionedFile` paths during deserialization [#18346](https://github.com/apache/datafusion/pull/18346) (lonless9) +- fix: SortPreservingMerge sanity check rejects valid ORDER BY with CASE expression [#18342](https://github.com/apache/datafusion/pull/18342) (watford-ep) +- fix: `DataFrame::select_columns` and `DataFrame::drop_columns` for qualified duplicated field names [#18236](https://github.com/apache/datafusion/pull/18236) (dqkqd) +- fix(docs): remove navbar padding breaking ui on mobile [#18402](https://github.com/apache/datafusion/pull/18402) (foskey51) +- fix: null cast not valid in substrait round trip [#18414](https://github.com/apache/datafusion/pull/18414) (gene-bordegaray) +- fix: map benchmark failing [#18469](https://github.com/apache/datafusion/pull/18469) (randyli) +- fix: eliminate warning when building without sql feature [#18480](https://github.com/apache/datafusion/pull/18480) (corasaurus-hex) +- fix: spark array return type mismatch when inner data type is LargeList [#18485](https://github.com/apache/datafusion/pull/18485) (jizezhang) +- fix: shuffle seed [#18518](https://github.com/apache/datafusion/pull/18518) (chenkovsky) + +**Documentation updates:** + +- Auto detect hive column partitioning with ListingTableFactory / `CREATE EXTERNAL TABLE` [#17232](https://github.com/apache/datafusion/pull/17232) (BlakeOrth) +- Rename Blaze to Auron [#17532](https://github.com/apache/datafusion/pull/17532) (merrily01) +- Revert #17295 (Support from-first SQL syntax) [#17520](https://github.com/apache/datafusion/pull/17520) (adriangb) +- minor: Update doc comments on type signature [#17556](https://github.com/apache/datafusion/pull/17556) (Jefffrey) +- docs: Update documentation on Epics and Supervising Maintainers 
[#17505](https://github.com/apache/datafusion/pull/17505) (alamb) +- docs: Move Google Summer of Code 2025 pages to a section [#17504](https://github.com/apache/datafusion/pull/17504) (alamb) +- Upgrade to arrow 56.1.0 [#17275](https://github.com/apache/datafusion/pull/17275) (alamb) +- docs: add xorq to list of known users [#17668](https://github.com/apache/datafusion/pull/17668) (dlovell) +- docs: deduplicate links in `introduction.md` [#17669](https://github.com/apache/datafusion/pull/17669) (Jefffrey) +- Add explicit PMC/committers list to governance docs page [#17574](https://github.com/apache/datafusion/pull/17574) (alamb) +- chore: Update READMEs of crates to be more consistent [#17691](https://github.com/apache/datafusion/pull/17691) (Jefffrey) +- chore: fix wasm-pack installation link in wasmtest README [#17704](https://github.com/apache/datafusion/pull/17704) (Jefffrey) +- docs: Remove disclaimer that `datafusion` 50.0.0 is not released [#17695](https://github.com/apache/datafusion/pull/17695) (nuno-faria) +- Bump MSRV to 1.87.0 [#17724](https://github.com/apache/datafusion/pull/17724) (findepi) +- docs: Fix 'Clicking a link in optimizer docs downloads the file instead of redirecting to github' [#17723](https://github.com/apache/datafusion/pull/17723) (petern48) +- Move misplaced upgrading entry about MSRV [#17727](https://github.com/apache/datafusion/pull/17727) (findepi) +- Introduce `avg_distinct()` and `sum_distinct()` functions to DataFrame API [#17536](https://github.com/apache/datafusion/pull/17536) (Jefffrey) +- Support `WHERE`, `ORDER BY`, `LIMIT`, `SELECT`, `EXTEND` pipe operators [#17278](https://github.com/apache/datafusion/pull/17278) (simonvandel) +- doc: add missing examples for multiple math functions [#17018](https://github.com/apache/datafusion/pull/17018) (Adez017) +- chore: remove homebrew publish instructions from release steps [#17735](https://github.com/apache/datafusion/pull/17735) (Jefffrey) +- Improve documentation for ordered set aggregate functions [#17744](https://github.com/apache/datafusion/pull/17744) (alamb) +- docs: fix sidebar overlapping table on configuration page on website [#17738](https://github.com/apache/datafusion/pull/17738) (saimahendra282) +- docs: add Ballista link to landing page (#17746) [#17775](https://github.com/apache/datafusion/pull/17775) (Nihallllll) +- [DOCS] Add dbt Fusion engine and R2 Query Engine to "Known Users" [#17793](https://github.com/apache/datafusion/pull/17793) (dataders) +- docs: update wasmtest README with instructions for Apple silicon [#17755](https://github.com/apache/datafusion/pull/17755) (Jefffrey) +- docs: Add SedonaDB as known user of Apache DataFusion [#17806](https://github.com/apache/datafusion/pull/17806) (petern48) +- minor: simplify docs build process & pin pip package versions [#17816](https://github.com/apache/datafusion/pull/17816) (Jefffrey) +- Cleanup user guide known users section [#17834](https://github.com/apache/datafusion/pull/17834) (blaginin) +- Fix the doc about row_groups pruning metrics in explain_usage.md [#17846](https://github.com/apache/datafusion/pull/17846) (xudong963) +- Fix docs.rs build: Replace `auto_doc_cfg` with `doc_cfg` [#17845](https://github.com/apache/datafusion/pull/17845) (mbrobbel) +- docs: Add rerun.io to known users guide [#17825](https://github.com/apache/datafusion/pull/17825) (alamb) +- chore: fix typos & pin action hashes [#17855](https://github.com/apache/datafusion/pull/17855) (Jefffrey) +- Clarify email reply instructions for invitations 
[#17851](https://github.com/apache/datafusion/pull/17851) (rluvaton) +- Add missing parenthesis in features documentation [#17869](https://github.com/apache/datafusion/pull/17869) (Viicos) +- Improve comments for DataSinkExec [#17873](https://github.com/apache/datafusion/pull/17873) (xudong963) +- minor: Make `FunctionRegistry` `udafs` and `udwfs` methods mandatory [#17847](https://github.com/apache/datafusion/pull/17847) (milenkovicm) +- docs: Improve documentation for FunctionFactory / CREATE FUNCTION [#17859](https://github.com/apache/datafusion/pull/17859) (alamb) +- Support `AS`, `UNION`, `INTERSECTION`, `EXCEPT`, `AGGREGATE` pipe operators [#17312](https://github.com/apache/datafusion/pull/17312) (simonvandel) +- [forward port] Change version to 50.1.0 and add changelog (#17748) [#17826](https://github.com/apache/datafusion/pull/17826) (alamb) +- chore(deps): bump maturin from 1.9.4 to 1.9.5 in /docs [#17940](https://github.com/apache/datafusion/pull/17940) (dependabot[bot]) +- docs: `Window::try_new_with_schema` with a descriptive error message [#17926](https://github.com/apache/datafusion/pull/17926) (dqkqd) +- Support `JOIN` pipe operator [#17969](https://github.com/apache/datafusion/pull/17969) (simonvandel) +- Adds Object Store Profiling options/commands to CLI [#18004](https://github.com/apache/datafusion/pull/18004) (BlakeOrth) +- docs: typo in `working-with-exprs.md` [#18033](https://github.com/apache/datafusion/pull/18033) (Weijun-H) +- chore(deps): bump maturin from 1.9.5 to 1.9.6 in /docs [#18039](https://github.com/apache/datafusion/pull/18039) (dependabot[bot]) +- [forward port] Change version to 50.2.0 and add changelog [#18057](https://github.com/apache/datafusion/pull/18057) (xudong963) +- Update committers on governance page [#18015](https://github.com/apache/datafusion/pull/18015) (alamb) +- Feat: Make current_date aware of execution timezone. [#18034](https://github.com/apache/datafusion/pull/18034) (codetyri0n) +- Add independent configs for topk/join dynamic filter [#18090](https://github.com/apache/datafusion/pull/18090) (xudong963) +- Adds Trace and Summary to CLI instrumented stores [#18064](https://github.com/apache/datafusion/pull/18064) (BlakeOrth) +- refactor: add dialect enum [#18043](https://github.com/apache/datafusion/pull/18043) (dariocurr) +- #17982 Make `nvl` a thin wrapper for `coalesce` [#17991](https://github.com/apache/datafusion/pull/17991) (pepijnve) +- minor: fix incorrect deprecation version & window docs [#18093](https://github.com/apache/datafusion/pull/18093) (Jefffrey) +- Adding hiop as known user [#18114](https://github.com/apache/datafusion/pull/18114) (enryls) +- Improve datafusion-cli object store profiling summary display [#18085](https://github.com/apache/datafusion/pull/18085) (alamb) +- Feat: Make current_time aware of execution timezone. [#18040](https://github.com/apache/datafusion/pull/18040) (codetyri0n) +- Docs: Update SQL example for current_time() and current_date(). 
[#18200](https://github.com/apache/datafusion/pull/18200) (codetyri0n) +- doc: Add `Metrics` section to the user-guide [#18216](https://github.com/apache/datafusion/pull/18216) (2010YOUY01) +- docs: Update HOWTOs for adding new functions [#18089](https://github.com/apache/datafusion/pull/18089) (Jefffrey) +- docs: fix trim for `rust,ignore` blocks [#18239](https://github.com/apache/datafusion/pull/18239) (Jefffrey) +- docs: refine `AggregateUDFImpl::is_ordered_set_aggregate` documentation [#17805](https://github.com/apache/datafusion/pull/17805) (Jefffrey) +- docs: fix broken SQL & DataFrame links in root README (#18153) [#18274](https://github.com/apache/datafusion/pull/18274) (manasa-manoj-nbr) +- doc: Contributor guide for AI-generated PRs [#18237](https://github.com/apache/datafusion/pull/18237) (2010YOUY01) +- doc: Add Join Physical Plan documentation, and configuration flag to benchmarks [#18209](https://github.com/apache/datafusion/pull/18209) (jonathanc-n) +- "Gentle Introduction to Arrow / Record Batches" #11336 [#18051](https://github.com/apache/datafusion/pull/18051) (sm4rtm4art) +- Upgrade DataFusion to arrow/parquet 57.0.0 [#17888](https://github.com/apache/datafusion/pull/17888) (alamb) +- Deduplicate range/gen_series nested functions code [#18198](https://github.com/apache/datafusion/pull/18198) (Jefffrey) +- minor: doc fixes for timestamp output format [#18315](https://github.com/apache/datafusion/pull/18315) (Jefffrey) +- Add PostgreSQL-style named arguments support for scalar functions [#18019](https://github.com/apache/datafusion/pull/18019) (bubulalabu) +- Change default prefetch_hint to 512Kb to reduce number of object store requests when reading parquet files [#18160](https://github.com/apache/datafusion/pull/18160) (zhuqi-lucas) +- Bump MSRV to 1.88.0 [#18403](https://github.com/apache/datafusion/pull/18403) (harshasiddartha) +- Change default `time_zone` to `None` (was `"+00:00"`) [#18359](https://github.com/apache/datafusion/pull/18359) (Omega359) +- Fix instances of "the the" to be "the" in comments/docs [#18478](https://github.com/apache/datafusion/pull/18478) (corasaurus-hex) +- Update roadmap links for DataFusion Q1 2026 [#18495](https://github.com/apache/datafusion/pull/18495) (alamb) +- Add a SpillingPool to manage collections of spill files [#18207](https://github.com/apache/datafusion/pull/18207) (adriangb) + +**Other:** + +- Extract complex default impls from AggregateUDFImpl trait [#17391](https://github.com/apache/datafusion/pull/17391) (findepi) +- chore: make `TableFunction` clonable [#17457](https://github.com/apache/datafusion/pull/17457) (sunng87) +- chore(deps): bump wasm-bindgen-test from 0.3.50 to 0.3.51 [#17470](https://github.com/apache/datafusion/pull/17470) (dependabot[bot]) +- chore(deps): bump log from 0.4.27 to 0.4.28 [#17471](https://github.com/apache/datafusion/pull/17471) (dependabot[bot]) +- Support csv truncated rows in datafusion [#17465](https://github.com/apache/datafusion/pull/17465) (zhuqi-lucas) +- chore(deps): bump indexmap from 2.11.0 to 2.11.1 [#17484](https://github.com/apache/datafusion/pull/17484) (dependabot[bot]) +- chore(deps): bump chrono from 0.4.41 to 0.4.42 [#17483](https://github.com/apache/datafusion/pull/17483) (dependabot[bot]) +- Improve `PartialEq`, `Eq` speed for `LexOrdering`, make `PartialEq` and `PartialOrd` consistent [#17442](https://github.com/apache/datafusion/pull/17442) (findepi) +- Fix array types coercion: preserve child element nullability for list types 
[#17306](https://github.com/apache/datafusion/pull/17306) (sgrebnov) +- better preserve statistics when applying limits [#17381](https://github.com/apache/datafusion/pull/17381) (adriangb) +- Refactor HashJoinExec to progressively accumulate dynamic filter bounds instead of computing them after data is accumulated [#17444](https://github.com/apache/datafusion/pull/17444) (adriangb) +- Fix `PartialOrd` for logical plan nodes and expressions [#17438](https://github.com/apache/datafusion/pull/17438) (findepi) +- chore(deps): bump sqllogictest from 0.28.3 to 0.28.4 [#17500](https://github.com/apache/datafusion/pull/17500) (dependabot[bot]) +- chore(deps): bump tempfile from 3.21.0 to 3.22.0 [#17499](https://github.com/apache/datafusion/pull/17499) (dependabot[bot]) +- refactor: Move `SMJ` tests into own file [#17495](https://github.com/apache/datafusion/pull/17495) (jonathanc-n) +- move MinAggregator and MaxAggregator to functions-aggregate-common [#17492](https://github.com/apache/datafusion/pull/17492) (adriangb) +- Update datafusion-testing pin to update expected output for extended tests [#17490](https://github.com/apache/datafusion/pull/17490) (alamb) +- update physical-plan to use datafusion-functions-aggregate-common for Min/MaxAccumulator [#17502](https://github.com/apache/datafusion/pull/17502) (adriangb) +- bug: Always use 'indent' format for explain verbose [#17481](https://github.com/apache/datafusion/pull/17481) (petern48) +- Fix ambiguous column names in substrait conversion as a result of literals having the same name during conversion. [#17299](https://github.com/apache/datafusion/pull/17299) (xanderbailey) +- Fix NULL Arithmetic Handling for Numerical Operators in Type Coercion [#17418](https://github.com/apache/datafusion/pull/17418) (etolbakov) +- Prepare for Merge Queue [#17183](https://github.com/apache/datafusion/pull/17183) (blaginin) +- bug: Support null as argument to to_local_time [#17491](https://github.com/apache/datafusion/pull/17491) (petern48) +- Implement timestamp_cast_dtype for SqliteDialect [#17479](https://github.com/apache/datafusion/pull/17479) (krinart) +- Disable `required_status_checks` for now [#17537](https://github.com/apache/datafusion/pull/17537) (blaginin) +- Update Bug issue template to use Bug issue type [#17540](https://github.com/apache/datafusion/pull/17540) (findepi) +- Fix predicate simplification for incompatible types in push_down_filter [#17521](https://github.com/apache/datafusion/pull/17521) (adriangb) +- Add assertion that ScalarUDFImpl implementation is consistent with declared return type [#17515](https://github.com/apache/datafusion/pull/17515) (findepi) +- Using `encode_arrow_schema` from arrow-rs. [#17543](https://github.com/apache/datafusion/pull/17543) (samueleresca) +- Add test for decimal256 and float math [#17530](https://github.com/apache/datafusion/pull/17530) (Jefffrey) +- Document how schema projection works. 
[#17250](https://github.com/apache/datafusion/pull/17250) (wiedld) +- chore(deps): bump rust_decimal from 1.37.2 to 1.38.0 [#17564](https://github.com/apache/datafusion/pull/17564) (dependabot[bot]) +- chore(deps): bump semver from 1.0.26 to 1.0.27 [#17566](https://github.com/apache/datafusion/pull/17566) (dependabot[bot]) +- Generalize struct-to-struct casting with CastOptions and SchemaAdapter integration [#17468](https://github.com/apache/datafusion/pull/17468) (kosiew) +- Add `TableProvider::scan_with_args` [#17336](https://github.com/apache/datafusion/pull/17336) (adriangb) +- Use taiki-e/install-action and binstall in CI [#17573](https://github.com/apache/datafusion/pull/17573) (AdamGS) +- Trying cargo machete to prune unused deps. [#17545](https://github.com/apache/datafusion/pull/17545) (samueleresca) +- Fix typo in error message in `substring.rs` [#17570](https://github.com/apache/datafusion/pull/17570) (AdamGS) +- chore(deps): bump taiki-e/install-action from 2.61.5 to 2.61.6 [#17586](https://github.com/apache/datafusion/pull/17586) (dependabot[bot]) +- datafusion/substrait: enable `unicode_expressions` in dev-dependencies to fix substring planning test [#17584](https://github.com/apache/datafusion/pull/17584) (kosiew) +- chore: replace deprecated UnionExec API [#17588](https://github.com/apache/datafusion/pull/17588) (etolbakov) +- minor: fix compilation issue for extended tests due to missing parquet encryption flag [#17579](https://github.com/apache/datafusion/pull/17579) (Jefffrey) +- Update release README for new `datafusion/physical-expr-adapter` crate [#17591](https://github.com/apache/datafusion/pull/17591) (xudong963) +- chore(deps): bump indexmap from 2.11.1 to 2.11.3 [#17587](https://github.com/apache/datafusion/pull/17587) (dependabot[bot]) +- chore(deps): bump serde_json from 1.0.143 to 1.0.145 [#17585](https://github.com/apache/datafusion/pull/17585) (dependabot[bot]) +- chore(deps): bump taiki-e/install-action from 2.61.6 to 2.61.8 [#17615](https://github.com/apache/datafusion/pull/17615) (dependabot[bot]) +- Always run CI checks [#17538](https://github.com/apache/datafusion/pull/17538) (blaginin) +- Revert "Always run CI checks" [#17629](https://github.com/apache/datafusion/pull/17629) (blaginin) +- Bump datafusion-testing to latest [#17609](https://github.com/apache/datafusion/pull/17609) (Jefffrey) +- Use `Display` formatting of `DataType`:s in error messages [#17565](https://github.com/apache/datafusion/pull/17565) (emilk) +- `avg(distinct)` support for decimal types [#17560](https://github.com/apache/datafusion/pull/17560) (Jefffrey) +- chore(deps): bump taiki-e/install-action from 2.61.8 to 2.61.9 [#17640](https://github.com/apache/datafusion/pull/17640) (dependabot[bot]) +- chore(deps): bump Swatinem/rust-cache from 2.8.0 to 2.8.1 [#17641](https://github.com/apache/datafusion/pull/17641) (dependabot[bot]) +- Validate the memory consumption in SPM created by multi level merge [#17029](https://github.com/apache/datafusion/pull/17029) (ding-young) +- fix(SubqueryAlias): use maybe_project_redundant_column [#17478](https://github.com/apache/datafusion/pull/17478) (notfilippo) +- minor: Ensure `datafusion-sql` package dependencies have `sql` flag [#17644](https://github.com/apache/datafusion/pull/17644) (Jefffrey) +- optimizer: Rewrite `IS NOT DISTINCT FROM` joins as Hash Joins [#17319](https://github.com/apache/datafusion/pull/17319) (2010YOUY01) +- chore(deps): bump serde from 1.0.223 to 1.0.225 [#17614](https://github.com/apache/datafusion/pull/17614) 
(dependabot[bot]) +- chore: Update dynamic filter formatting [#17647](https://github.com/apache/datafusion/pull/17647) (rkrishn7) +- chore(deps): bump taiki-e/install-action from 2.61.9 to 2.61.10 [#17660](https://github.com/apache/datafusion/pull/17660) (dependabot[bot]) +- proto: don't include parquet feature by default [#17577](https://github.com/apache/datafusion/pull/17577) (jackkleeman) +- minor: Ensure `proto` crate has datetime & unicode expr flags in datafusion dev dependency [#17656](https://github.com/apache/datafusion/pull/17656) (Jefffrey) +- chore(deps): bump indexmap from 2.11.3 to 2.11.4 [#17661](https://github.com/apache/datafusion/pull/17661) (dependabot[bot]) +- Support Decimal32/64 types [#17501](https://github.com/apache/datafusion/pull/17501) (AdamGS) +- minor: Improve hygiene for `datafusion-functions` macros [#17638](https://github.com/apache/datafusion/pull/17638) (Jefffrey) +- [unparser] Custom timestamp format for DuckDB [#17653](https://github.com/apache/datafusion/pull/17653) (krinart) +- Support LargeList for array_sort [#17657](https://github.com/apache/datafusion/pull/17657) (Jefffrey) +- Support FixedSizeList for array_except [#17658](https://github.com/apache/datafusion/pull/17658) (Jefffrey) +- chore: refactor array fn signatures & add more slt tests [#17672](https://github.com/apache/datafusion/pull/17672) (Jefffrey) +- Support FixedSizeList for array_to_string [#17666](https://github.com/apache/datafusion/pull/17666) (Jefffrey) +- minor: add SQLancer fuzzed SLT case for natural joins [#17683](https://github.com/apache/datafusion/pull/17683) (Jefffrey) +- chore: Upgrade Rust version to 1.90.0 [#17677](https://github.com/apache/datafusion/pull/17677) (rkrishn7) +- Support FixedSizeList for array_position [#17659](https://github.com/apache/datafusion/pull/17659) (Jefffrey) +- chore(deps): bump the proto group with 2 updates [#16806](https://github.com/apache/datafusion/pull/16806) (dependabot[bot]) +- chore: update a bunch of dependencies [#17708](https://github.com/apache/datafusion/pull/17708) (Jefffrey) +- Support FixedSizeList for array_slice via coercion to List [#17667](https://github.com/apache/datafusion/pull/17667) (Jefffrey) +- chore(deps): bump taiki-e/install-action from 2.61.10 to 2.62.1 [#17710](https://github.com/apache/datafusion/pull/17710) (dependabot[bot]) +- fix(agg/corr): return NULL when variance is zero or samples < 2 [#17621](https://github.com/apache/datafusion/pull/17621) (killme2008) +- chore(deps): bump taiki-e/install-action from 2.62.1 to 2.62.4 [#17739](https://github.com/apache/datafusion/pull/17739) (dependabot[bot]) +- chore(deps): bump tempfile from 3.22.0 to 3.23.0 [#17741](https://github.com/apache/datafusion/pull/17741) (dependabot[bot]) +- chore: make `LimitPushPastWindows` public [#17736](https://github.com/apache/datafusion/pull/17736) (linhr) +- minor: create `OptimizerContext` with provided `ConfigOptions` [#17742](https://github.com/apache/datafusion/pull/17742) (MichaelScofield) +- Add support for calling async UDF as aggregation expression [#17620](https://github.com/apache/datafusion/pull/17620) (simonvandel) +- chore(deps): bump taiki-e/install-action from 2.62.4 to 2.62.5 [#17750](https://github.com/apache/datafusion/pull/17750) (dependabot[bot]) +- (fix): Lag function creates unwanted projection (#17630) [#17639](https://github.com/apache/datafusion/pull/17639) (renato2099) +- Support `LargeList` in `array_has` simplification to `InList` [#17732](https://github.com/apache/datafusion/pull/17732) (Jefffrey) 
+- chore(deps): bump wasm-bindgen-test from 0.3.51 to 0.3.53 [#17642](https://github.com/apache/datafusion/pull/17642) (dependabot[bot]) +- chore(deps): bump object_store from 0.12.3 to 0.12.4 [#17753](https://github.com/apache/datafusion/pull/17753) (dependabot[bot]) +- Update `arrow` / `parquet` to 56.2.0 [#17631](https://github.com/apache/datafusion/pull/17631) (alamb) +- chore(deps): bump taiki-e/install-action from 2.62.5 to 2.62.6 [#17766](https://github.com/apache/datafusion/pull/17766) (dependabot[bot]) +- Keep aggregate udaf schema names unique when missing an order-by [#17731](https://github.com/apache/datafusion/pull/17731) (wiedld) +- feat : Display function alias in output column name [#17690](https://github.com/apache/datafusion/pull/17690) (devampatel03) +- Support join cardinality estimation less conservatively [#17476](https://github.com/apache/datafusion/pull/17476) (jackkleeman) +- chore(deps): bump libc from 0.2.175 to 0.2.176 [#17767](https://github.com/apache/datafusion/pull/17767) (dependabot[bot]) +- chore(deps): bump postgres-types from 0.2.9 to 0.2.10 [#17768](https://github.com/apache/datafusion/pull/17768) (dependabot[bot]) +- Use `Expr::qualified_name()` and `Column::new()` to extract partition keys from window and aggregate operators [#17757](https://github.com/apache/datafusion/pull/17757) (masonh22) +- chore(deps): bump taiki-e/install-action from 2.62.6 to 2.62.8 [#17781](https://github.com/apache/datafusion/pull/17781) (dependabot[bot]) +- chore(deps): bump wasm-bindgen-test from 0.3.53 to 0.3.54 [#17784](https://github.com/apache/datafusion/pull/17784) (dependabot[bot]) +- chore: Action some old TODOs in github actions [#17694](https://github.com/apache/datafusion/pull/17694) (Jefffrey) +- dev: Add benchmark for compilation profiles [#17754](https://github.com/apache/datafusion/pull/17754) (2010YOUY01) +- chore(deps): bump tokio-postgres from 0.7.13 to 0.7.14 [#17785](https://github.com/apache/datafusion/pull/17785) (dependabot[bot]) +- chore(deps): bump serde from 1.0.226 to 1.0.227 [#17783](https://github.com/apache/datafusion/pull/17783) (dependabot[bot]) +- chore(deps): bump regex from 1.11.2 to 1.11.3 [#17782](https://github.com/apache/datafusion/pull/17782) (dependabot[bot]) +- Test `CAST` from temporal to `Utf8View` [#17535](https://github.com/apache/datafusion/pull/17535) (findepi) +- chore: dependabot to run weekly [#17797](https://github.com/apache/datafusion/pull/17797) (comphead) +- chore(deps): bump sysinfo from 0.37.0 to 0.37.1 [#17800](https://github.com/apache/datafusion/pull/17800) (dependabot[bot]) +- chore(deps): bump taiki-e/install-action from 2.62.8 to 2.62.9 [#17799](https://github.com/apache/datafusion/pull/17799) (dependabot[bot]) +- Fix potential overflow when we print verbose physical plan [#17798](https://github.com/apache/datafusion/pull/17798) (zhuqi-lucas) +- Extend datatype semantic equality check to include timestamps [#17777](https://github.com/apache/datafusion/pull/17777) (shivbhatia10) +- dev: Add Apache license check to the lint script [#17787](https://github.com/apache/datafusion/pull/17787) (2010YOUY01) +- Fix: common_sub_expression_eliminate optimizer rule failed [#16066](https://github.com/apache/datafusion/pull/16066) (Col-Waltz) +- chore: remove dialect fixes in SLT tests that are outdated [#17807](https://github.com/apache/datafusion/pull/17807) (Jefffrey) +- chore(deps): bump thiserror from 2.0.16 to 2.0.17 [#17821](https://github.com/apache/datafusion/pull/17821) (dependabot[bot]) +- chore(deps): bump quote 
from 1.0.40 to 1.0.41 [#17822](https://github.com/apache/datafusion/pull/17822) (dependabot[bot]) +- chore(deps): bump taiki-e/install-action from 2.62.9 to 2.62.12 [#17823](https://github.com/apache/datafusion/pull/17823) (dependabot[bot]) +- chore(deps): bump serde from 1.0.227 to 1.0.228 [#17827](https://github.com/apache/datafusion/pull/17827) (dependabot[bot]) +- Temporarily disable failing `sql_planner` benchmark query [#17809](https://github.com/apache/datafusion/pull/17809) (alamb) +- chore(deps): bump taiki-e/install-action from 2.62.12 to 2.62.13 [#17836](https://github.com/apache/datafusion/pull/17836) (dependabot[bot]) +- More decimal 32/64 support - type coercsion and misc gaps [#17808](https://github.com/apache/datafusion/pull/17808) (AdamGS) +- Implement `AsRef` for `Expr` [#17819](https://github.com/apache/datafusion/pull/17819) (findepi) +- chore(deps): bump taiki-e/install-action from 2.62.13 to 2.62.14 [#17840](https://github.com/apache/datafusion/pull/17840) (dependabot[bot]) +- chore(deps): bump petgraph from 0.8.2 to 0.8.3 [#17842](https://github.com/apache/datafusion/pull/17842) (dependabot[bot]) +- Relax constraint that file sort order must only reference individual columns [#17419](https://github.com/apache/datafusion/pull/17419) (pepijnve) +- minor: Include consumer name in OOM message [#17870](https://github.com/apache/datafusion/pull/17870) (andygrove) +- Implement `partition_statistics` API for `InterleaveExec` [#17051](https://github.com/apache/datafusion/pull/17051) (liamzwbao) +- Add `CastColumnExpr` for struct-aware column casting [#17773](https://github.com/apache/datafusion/pull/17773) (kosiew) +- chore(deps): bump taiki-e/install-action from 2.62.14 to 2.62.16 [#17879](https://github.com/apache/datafusion/pull/17879) (dependabot[bot]) +- chore(deps): bump crate-ci/typos from 1.37.0 to 1.37.1 [#17878](https://github.com/apache/datafusion/pull/17878) (dependabot[bot]) +- Fix failing CI caused by hash collisions [#17886](https://github.com/apache/datafusion/pull/17886) (liamzwbao) +- Minor: reuse test schemas in simplify tests [#17864](https://github.com/apache/datafusion/pull/17864) (alamb) +- Make limit pushdown work for SortPreservingMergeExec [#17893](https://github.com/apache/datafusion/pull/17893) (Dandandan) +- chore(deps): bump taiki-e/install-action from 2.62.16 to 2.62.17 [#17896](https://github.com/apache/datafusion/pull/17896) (dependabot[bot]) +- Consolidate `apply_schema_adapter_tests` [#17905](https://github.com/apache/datafusion/pull/17905) (alamb) +- Improve `InListExpr` plan display [#17884](https://github.com/apache/datafusion/pull/17884) (pepijnve) +- Export JoinSetTracerError from datafusion-common-runtime [#17877](https://github.com/apache/datafusion/pull/17877) (JanKaul) +- Clippy to `extended_tests` [#17922](https://github.com/apache/datafusion/pull/17922) (blaginin) +- chore: rename Schema `print_schema_tree` to `tree_string` [#17919](https://github.com/apache/datafusion/pull/17919) (comphead) +- chore: utilize trait upcasting for AsyncScalarUDF PartialEq & Hash [#17872](https://github.com/apache/datafusion/pull/17872) (Jefffrey) +- Refactor: Update enforce_sorting tests to use insta snapshots for easier updates [#17900](https://github.com/apache/datafusion/pull/17900) (alamb) +- chore(deps): bump flate2 from 1.1.2 to 1.1.4 [#17938](https://github.com/apache/datafusion/pull/17938) (dependabot[bot]) +- chore(deps): bump actions/stale from 10.0.0 to 10.1.0 [#17937](https://github.com/apache/datafusion/pull/17937) (dependabot[bot]) +- 
chore(deps): bump aws-credential-types from 1.2.6 to 1.2.7 [#17936](https://github.com/apache/datafusion/pull/17936) (dependabot[bot]) +- chore(deps): bump rustyline from 17.0.1 to 17.0.2 [#17932](https://github.com/apache/datafusion/pull/17932) (dependabot[bot]) +- chore(deps): bump taiki-e/install-action from 2.62.17 to 2.62.21 [#17934](https://github.com/apache/datafusion/pull/17934) (dependabot[bot]) +- chore(deps): bump crate-ci/typos from 1.37.1 to 1.37.2 [#17935](https://github.com/apache/datafusion/pull/17935) (dependabot[bot]) +- chore: upgrade sqlparser [#17925](https://github.com/apache/datafusion/pull/17925) (chenkovsky) +- minor: impl Clone and Debug on CaseBuilder [#17927](https://github.com/apache/datafusion/pull/17927) (timsaucer) +- chore: Extend backtrace coverage for `Execution` and `Internal` errors [#17921](https://github.com/apache/datafusion/pull/17921) (comphead) +- chore(deps): bump taiki-e/install-action from 2.62.21 to 2.62.22 [#17949](https://github.com/apache/datafusion/pull/17949) (dependabot[bot]) +- chore(deps): bump crate-ci/typos from 1.37.2 to 1.38.0 [#17948](https://github.com/apache/datafusion/pull/17948) (dependabot[bot]) +- Feat: [datafusion-spark] Migrate avg from comet to datafusion-spark and add tests. [#17871](https://github.com/apache/datafusion/pull/17871) (codetyri0n) +- Update tests to use insta / make them easier to update [#17945](https://github.com/apache/datafusion/pull/17945) (alamb) +- Minor Test refactor: avoid creating the same SchemaRef [#17951](https://github.com/apache/datafusion/pull/17951) (alamb) +- Precision::::{add, sub, multiply}: avoid overflows [#17929](https://github.com/apache/datafusion/pull/17929) (Tpt) +- Resolve `ListingScan` projection against table schema including partition columns [#17911](https://github.com/apache/datafusion/pull/17911) (mach-kernel) +- chore(deps): bump crate-ci/typos from 1.38.0 to 1.38.1 [#17960](https://github.com/apache/datafusion/pull/17960) (dependabot[bot]) +- chore(deps): bump taiki-e/install-action from 2.62.22 to 2.62.23 [#17959](https://github.com/apache/datafusion/pull/17959) (dependabot[bot]) +- bench: fix `vectorized_equal_to` bench mutated between iterations [#17968](https://github.com/apache/datafusion/pull/17968) (rluvaton) +- fix docs and broken example from #17956 [#17980](https://github.com/apache/datafusion/pull/17980) (adriangb) +- Refactor: Update `replace_with_order_preserving_variants` tests to use insta snapshots for easier updates [#17962](https://github.com/apache/datafusion/pull/17962) (blaginin) +- Support repartitioned() method in RepartitionExec [#17990](https://github.com/apache/datafusion/pull/17990) (gabotechs) +- Adds Instrumented Object Store to CLI [#17984](https://github.com/apache/datafusion/pull/17984) (BlakeOrth) +- Migrate `join_selection` tests to snapshot-based testing [#17974](https://github.com/apache/datafusion/pull/17974) (blaginin) +- bench: fix actually generate a lot of unique values in benchmark table [#17967](https://github.com/apache/datafusion/pull/17967) (rluvaton) +- Adds Instrument Mode for InstrumentedObjectStore in datafusion-cli [#18000](https://github.com/apache/datafusion/pull/18000) (BlakeOrth) +- minor: refactor Spark ascii function to reuse DataFusion ascii function code [#17965](https://github.com/apache/datafusion/pull/17965) (Jefffrey) +- chore(deps): bump taiki-e/install-action from 2.62.23 to 2.62.24 [#17989](https://github.com/apache/datafusion/pull/17989) (dependabot[bot]) +- chore(deps): bump taiki-e/install-action from 
2.62.24 to 2.62.25 [#18007](https://github.com/apache/datafusion/pull/18007) (dependabot[bot]) +- Clarify documentation that ScalarUDFImpl::simplity must not change the schema [#17981](https://github.com/apache/datafusion/pull/17981) (alamb) +- Expose trace_future and trace_block outside of common-runtime [#17976](https://github.com/apache/datafusion/pull/17976) (AdamGS) +- Adds instrumentation to get requests for datafusion-cli [#18016](https://github.com/apache/datafusion/pull/18016) (BlakeOrth) +- chore(deps): bump half from 2.6.0 to 2.7.0 [#18036](https://github.com/apache/datafusion/pull/18036) (dependabot[bot]) +- chore(deps): bump aws-config from 1.8.6 to 1.8.7 [#18038](https://github.com/apache/datafusion/pull/18038) (dependabot[bot]) +- chore(deps): bump taiki-e/install-action from 2.62.25 to 2.62.28 [#18037](https://github.com/apache/datafusion/pull/18037) (dependabot[bot]) +- refactor: cleanup naming and macro usages for binary operator [#17985](https://github.com/apache/datafusion/pull/17985) (sunng87) +- Impl `gather_filters_for_pushdown` for `CoalescePartitionsExec` [#18046](https://github.com/apache/datafusion/pull/18046) (xudong963) +- Fix bug in LimitPushPastWindows [#18029](https://github.com/apache/datafusion/pull/18029) (avantgardnerio) +- Fix `SortPreservingMergeExec` tree formatting with limit [#18009](https://github.com/apache/datafusion/pull/18009) (AdamGS) +- chore(deps): bump actions/setup-node from 5.0.0 to 6.0.0 [#18049](https://github.com/apache/datafusion/pull/18049) (dependabot[bot]) +- chore(deps): bump sysinfo from 0.37.1 to 0.37.2 [#18035](https://github.com/apache/datafusion/pull/18035) (dependabot[bot]) +- FileScanConfig: Preserve schema metadata across ser/de boundary [#17966](https://github.com/apache/datafusion/pull/17966) (mach-kernel) +- physical-plan: push filters down to UnionExec children [#18054](https://github.com/apache/datafusion/pull/18054) (asubiotto) +- Add `min_max_bytes` benchmark (Reproduce quadratic runtime in min_max_bytes) [#18041](https://github.com/apache/datafusion/pull/18041) (ctsk) +- Adds summary output to CLI instrumented object stores [#18045](https://github.com/apache/datafusion/pull/18045) (BlakeOrth) +- Impl spark bit not function [#18018](https://github.com/apache/datafusion/pull/18018) (kazantsev-maksim) +- chore: revert tests [#18065](https://github.com/apache/datafusion/pull/18065) (comphead) +- chore: Use an enum to express the different kinds of nullability in an array [#18048](https://github.com/apache/datafusion/pull/18048) (martin-g) +- chore(deps): bump taiki-e/install-action from 2.62.28 to 2.62.29 [#18069](https://github.com/apache/datafusion/pull/18069) (dependabot[bot]) +- Split up monster test_window_partial_constant_and_set_monotonicity into smaller functions [#17952](https://github.com/apache/datafusion/pull/17952) (alamb) +- Push Down Filter Subexpressions in Nested Loop Joins as Projections [#17906](https://github.com/apache/datafusion/pull/17906) (tobixdev) +- ci: Use PR description for merge commit body in squash merges [#18027](https://github.com/apache/datafusion/pull/18027) (Weijun-H) +- Fix extended tests on main to get CI green [#18096](https://github.com/apache/datafusion/pull/18096) (alamb) +- chore(deps): bump taiki-e/install-action from 2.62.29 to 2.62.31 [#18094](https://github.com/apache/datafusion/pull/18094) (dependabot[bot]) +- chore: run extended suite on PRs for critical areas [#18088](https://github.com/apache/datafusion/pull/18088) (comphead) +- chore(deps): bump 
taiki-e/install-action from 2.62.31 to 2.62.33 [#18113](https://github.com/apache/datafusion/pull/18113) (dependabot[bot]) +- chore: remove unnecessary `skip_failed_rules` config in slt [#18117](https://github.com/apache/datafusion/pull/18117) (Jefffrey) +- Refactor repartition to use `insta` [#18106](https://github.com/apache/datafusion/pull/18106) (blaginin) +- refactor: move ListingTable over to the catalog-listing-table crate [#18080](https://github.com/apache/datafusion/pull/18080) (timsaucer) +- refactor: move arrow datasource to new `datafusion-datasource-arrow` crate [#18082](https://github.com/apache/datafusion/pull/18082) (timsaucer) +- Adds instrumentation to LIST operations in CLI [#18103](https://github.com/apache/datafusion/pull/18103) (BlakeOrth) +- Add extra case_when benchmarks [#18097](https://github.com/apache/datafusion/pull/18097) (pepijnve) +- Adds instrumentation to delimited LIST operations in CLI [#18134](https://github.com/apache/datafusion/pull/18134) (BlakeOrth) +- test: `to_timestamp(double)` for vectorized input [#18147](https://github.com/apache/datafusion/pull/18147) (dqkqd) +- Fix `concat_elements_utf8view` capacity initialization. [#18003](https://github.com/apache/datafusion/pull/18003) (samueleresca) +- Use < instead of = in case benchmark predicates, use Integers [#18144](https://github.com/apache/datafusion/pull/18144) (pepijnve) +- Adds instrumentation to PUT ops in the CLI [#18139](https://github.com/apache/datafusion/pull/18139) (BlakeOrth) +- [main] chore: Fix `no space left on device` (#18141) [#18151](https://github.com/apache/datafusion/pull/18151) (alamb) +- Fix `DISTINCT ON` for tables with no columns (ReplaceDistinctWithAggregate: do not fail when on input without columns) [#18133](https://github.com/apache/datafusion/pull/18133) (Tpt) +- Fix quadratic runtime in min_max_bytes [#18044](https://github.com/apache/datafusion/pull/18044) (ctsk) +- chore(deps): bump getrandom from 0.3.3 to 0.3.4 [#18163](https://github.com/apache/datafusion/pull/18163) (dependabot[bot]) +- chore(deps): bump tokio from 1.47.1 to 1.48.0 [#18164](https://github.com/apache/datafusion/pull/18164) (dependabot[bot]) +- chore(deps): bump indexmap from 2.11.4 to 2.12.0 [#18162](https://github.com/apache/datafusion/pull/18162) (dependabot[bot]) +- chore(deps): bump bzip2 from 0.6.0 to 0.6.1 [#18165](https://github.com/apache/datafusion/pull/18165) (dependabot[bot]) +- chore(deps): bump taiki-e/install-action from 2.62.33 to 2.62.34 [#18194](https://github.com/apache/datafusion/pull/18194) (dependabot[bot]) +- Fix COPY TO does not produce an output file for the empty set [#18074](https://github.com/apache/datafusion/pull/18074) (bert-beyondloops) +- Add Projection struct w/ helper methods to manipulate projections [#18176](https://github.com/apache/datafusion/pull/18176) (adriangb) +- Add TableSchema helper to encapsulate file schema + partition fields [#18178](https://github.com/apache/datafusion/pull/18178) (adriangb) +- Add spilling to RepartitionExec [#18014](https://github.com/apache/datafusion/pull/18014) (adriangb) +- Adds DELETE and HEAD instrumentation to CLI [#18206](https://github.com/apache/datafusion/pull/18206) (BlakeOrth) +- [branch-50] Prepare 50.3.0 release version number and README (#18173) [#18182](https://github.com/apache/datafusion/pull/18182) (alamb) +- Fix array_has simplification with null argument [#18186](https://github.com/apache/datafusion/pull/18186) (joroKr21) +- chore(deps): bump taiki-e/install-action from 2.62.34 to 2.62.35 
[#18215](https://github.com/apache/datafusion/pull/18215) (dependabot[bot]) +- bench: create benchmark for lookup table like `CASE WHEN` [#18203](https://github.com/apache/datafusion/pull/18203) (rluvaton) +- Adds instrumentation to COPY operations in the CLI [#18227](https://github.com/apache/datafusion/pull/18227) (BlakeOrth) +- Consolidate core_integration/datasource and rename parquet_source --> parquet_integration [#18226](https://github.com/apache/datafusion/pull/18226) (alamb) +- CoalescePartitionsExec fetch is not consistent with one partition and more than one partition [#18245](https://github.com/apache/datafusion/pull/18245) (zhuqi-lucas) +- Migrate core test to insta part 3 [#16978](https://github.com/apache/datafusion/pull/16978) (Chen-Yuan-Lai) +- chore(deps): bump taiki-e/install-action from 2.62.35 to 2.62.36 [#18240](https://github.com/apache/datafusion/pull/18240) (dependabot[bot]) +- Fix: Do not normalize table names when deserializing from protobuf [#18187](https://github.com/apache/datafusion/pull/18187) (drin) +- Revert "chore: revert tests (#18065)" [#18255](https://github.com/apache/datafusion/pull/18255) (dqkqd) +- Refactor `nvl2` Function to Support Lazy Evaluation and Simplification via CASE Expression [#18191](https://github.com/apache/datafusion/pull/18191) (kosiew) +- fix null count stats computation [#18276](https://github.com/apache/datafusion/pull/18276) (adriangb) +- Improve docs and examples for `DataTypeExt` and `FieldExt` [#18271](https://github.com/apache/datafusion/pull/18271) (alamb) +- Easier construction of ScalarAndMetadata [#18272](https://github.com/apache/datafusion/pull/18272) (alamb) +- Add integration test for IO operations for listing tables queries [#18229](https://github.com/apache/datafusion/pull/18229) (alamb) +- Fix: Error rather than silently ignore extra parameter passed to ceil/floor [#18265](https://github.com/apache/datafusion/pull/18265) (toxicteddy00077) +- chore(deps): Update `half` to 2.7.1, ignore `RUSTSEC-2025-0111` [#18287](https://github.com/apache/datafusion/pull/18287) (alamb) +- chore(deps): bump taiki-e/install-action from 2.62.36 to 2.62.38 [#18293](https://github.com/apache/datafusion/pull/18293) (dependabot[bot]) +- chore(deps): bump regex from 1.11.3 to 1.12.2 [#18294](https://github.com/apache/datafusion/pull/18294) (dependabot[bot]) +- chore(deps): bump clap from 4.5.48 to 4.5.50 [#18292](https://github.com/apache/datafusion/pull/18292) (dependabot[bot]) +- chore(deps): bump syn from 2.0.106 to 2.0.108 [#18291](https://github.com/apache/datafusion/pull/18291) (dependabot[bot]) +- Enforce unique names for `is_set` on `first_value` and `last_value` [#18303](https://github.com/apache/datafusion/pull/18303) (marc-pydantic) +- chore(deps): update testcontainers to `0.25.2` and drop ignore of `RUSTSEC-2025-0111` [#18305](https://github.com/apache/datafusion/pull/18305) (DDtKey) +- Using `try_append_value` from arrow-rs 57.0.0 [#18313](https://github.com/apache/datafusion/pull/18313) (samueleresca) +- minor: Add documentation to function `concat_elements_utf8view` [#18316](https://github.com/apache/datafusion/pull/18316) (2010YOUY01) +- chore(deps): bump taiki-e/install-action from 2.62.38 to 2.62.40 [#18318](https://github.com/apache/datafusion/pull/18318) (dependabot[bot]) +- Fix: Add projection to generate_series [#18298](https://github.com/apache/datafusion/pull/18298) (mkleen) +- Do not accept null is_set for first_value/last_value [#18301](https://github.com/apache/datafusion/pull/18301) (marc-pydantic) +- 
Optimize merging of partial case expression results [#18152](https://github.com/apache/datafusion/pull/18152) (pepijnve) +- chore: Format examples in doc strings - execution [#18339](https://github.com/apache/datafusion/pull/18339) (CuteChuanChuan) +- chore: Format examples in doc strings - common [#18336](https://github.com/apache/datafusion/pull/18336) (CuteChuanChuan) +- chore: Format examples in doc strings - crate datafusion [#18333](https://github.com/apache/datafusion/pull/18333) (CuteChuanChuan) +- chore: Format examples in doc strings - expr [#18340](https://github.com/apache/datafusion/pull/18340) (CuteChuanChuan) +- chore: Format examples in doc strings - datasource crates [#18338](https://github.com/apache/datafusion/pull/18338) (CuteChuanChuan) +- Insta for enforce_distrubution (easy ones) [#18248](https://github.com/apache/datafusion/pull/18248) (blaginin) +- chore: Format examples in doc strings - macros and optmizer [#18354](https://github.com/apache/datafusion/pull/18354) (CuteChuanChuan) +- chore: Format examples in doc strings - proto, pruning, and session [#18358](https://github.com/apache/datafusion/pull/18358) (CuteChuanChuan) +- chore: Format examples in doc strings - catalog listing [#18335](https://github.com/apache/datafusion/pull/18335) (CuteChuanChuan) +- ci: fix temporary file creation in tests and tighten CI check [#18374](https://github.com/apache/datafusion/pull/18374) (2010YOUY01) +- Run extended tests when there are changes to datafusion-testing pin [#18310](https://github.com/apache/datafusion/pull/18310) (alamb) +- Add simple unit test for `merge` in case expression [#18369](https://github.com/apache/datafusion/pull/18369) (pepijnve) +- chore(deps): bump taiki-e/install-action from 2.62.40 to 2.62.41 [#18377](https://github.com/apache/datafusion/pull/18377) (dependabot[bot]) +- Refactor `range`/`gen_series` signature away from user defined [#18317](https://github.com/apache/datafusion/pull/18317) (Jefffrey) +- Adds Partitioned CSV test to object store access tests [#18370](https://github.com/apache/datafusion/pull/18370) (BlakeOrth) +- Add reproducer for consecutive RepartitionExec [#18343](https://github.com/apache/datafusion/pull/18343) (NGA-TRAN) +- chore: bump substrait version to `0.60.0` to use substrait spec v0.75.0 [#17866](https://github.com/apache/datafusion/pull/17866) (benbellick) +- Use the upstream arrow-rs coalesce kernel [#17193](https://github.com/apache/datafusion/pull/17193) (zhuqi-lucas) +- Extract out super slow planning benchmark to it's own benchmark [#18388](https://github.com/apache/datafusion/pull/18388) (Omega359) +- minor: Fix parquet pruning metrics display order [#18379](https://github.com/apache/datafusion/pull/18379) (2010YOUY01) +- chore: use enum as `date_trunc` granularity [#18390](https://github.com/apache/datafusion/pull/18390) (comphead) +- chore(deps): bump taiki-e/install-action from 2.62.41 to 2.62.43 [#18398](https://github.com/apache/datafusion/pull/18398) (dependabot[bot]) +- Project record batches to avoid filtering unused columns in `CASE` evaluation [#18329](https://github.com/apache/datafusion/pull/18329) (pepijnve) +- catch errors when simplifying cast(lit(...), ...) 
and bubble those up [#18332](https://github.com/apache/datafusion/pull/18332) (adriangb) +- Align `NowFunc::new()` with canonical `ConfigOptions` timezone and enhance documentation [#18347](https://github.com/apache/datafusion/pull/18347) (kosiew) +- chore: Format examples in doc strings - physical expr, optimizer, and plan [#18357](https://github.com/apache/datafusion/pull/18357) (CuteChuanChuan) +- Fix: spark bit_count function [#18322](https://github.com/apache/datafusion/pull/18322) (kazantsev-maksim) +- chore: bump workspace rust version to 1.91.0 [#18422](https://github.com/apache/datafusion/pull/18422) (randyli) +- Minor: Remove unneccessary vec! in SortMergeJoinStream initialization [#18430](https://github.com/apache/datafusion/pull/18430) (mapleFU) +- minor: refactor array reverse internals [#18445](https://github.com/apache/datafusion/pull/18445) (Jefffrey) +- chore(deps): bump taiki-e/install-action from 2.62.43 to 2.62.45 [#18465](https://github.com/apache/datafusion/pull/18465) (dependabot[bot]) +- chore(deps): bump crate-ci/typos from 1.38.1 to 1.39.0 [#18464](https://github.com/apache/datafusion/pull/18464) (dependabot[bot]) +- chore(deps): bump rstest from 0.25.0 to 0.26.1 [#18463](https://github.com/apache/datafusion/pull/18463) (dependabot[bot]) +- chore(deps): bump wasm-bindgen-test from 0.3.54 to 0.3.55 [#18462](https://github.com/apache/datafusion/pull/18462) (dependabot[bot]) +- chore(deps): bump postgres-types from 0.2.10 to 0.2.11 [#18461](https://github.com/apache/datafusion/pull/18461) (dependabot[bot]) +- chore(deps): bump ctor from 0.4.3 to 0.6.1 [#18460](https://github.com/apache/datafusion/pull/18460) (dependabot[bot]) +- chore(deps): bump libc from 0.2.176 to 0.2.177 [#18459](https://github.com/apache/datafusion/pull/18459) (dependabot[bot]) +- chore: Format examples in doc strings - functions [#18353](https://github.com/apache/datafusion/pull/18353) (CuteChuanChuan) +- Feat: Support array flatten() on `List(LargeList(_))` types [#18363](https://github.com/apache/datafusion/pull/18363) (sdf-jkl) +- Reproducer tests for #18380 (resorting sorted inputs) [#18352](https://github.com/apache/datafusion/pull/18352) (rgehan) +- Update criterion to 0.7.\* [#18472](https://github.com/apache/datafusion/pull/18472) (Omega359) +- chore(deps): bump taiki-e/install-action from 2.62.45 to 2.62.46 [#18484](https://github.com/apache/datafusion/pull/18484) (dependabot[bot]) +- Consolidate flight examples (#18142) [#18442](https://github.com/apache/datafusion/pull/18442) (cj-zhukov) +- Support reverse for ListView [#18424](https://github.com/apache/datafusion/pull/18424) (vegarsti) +- Complete migrating `enforce_distrubution` tests to insta [#18185](https://github.com/apache/datafusion/pull/18185) (blaginin) +- Add benchmark for array_reverse [#18425](https://github.com/apache/datafusion/pull/18425) (vegarsti) +- chore: simplify map const [#18440](https://github.com/apache/datafusion/pull/18440) (chenkovsky) +- Fix an out of date comment for `snapshot_physical_expr` [#18498](https://github.com/apache/datafusion/pull/18498) (AdamGS) +- Disable `parquet_encryption` by default in datafusion-sqllogictests [#18492](https://github.com/apache/datafusion/pull/18492) (zhuqi-lucas) +- Make extended test to use optional parquet_encryption feature [#18507](https://github.com/apache/datafusion/pull/18507) (zhuqi-lucas) +- Consolidate udf examples (#18142) [#18493](https://github.com/apache/datafusion/pull/18493) (cj-zhukov) +- test: add prepare alias slt test 
[#18522](https://github.com/apache/datafusion/pull/18522) (dqkqd) +- CI: add `clippy::needless_pass_by_value` rule [#18468](https://github.com/apache/datafusion/pull/18468) (2010YOUY01) +- Refactor create_hashes to accept array references [#18448](https://github.com/apache/datafusion/pull/18448) (adriangb) +- chore: Format examples in doc strings - spark, sql, sqllogictest, sibstrait [#18443](https://github.com/apache/datafusion/pull/18443) (CuteChuanChuan) +- refactor: simplify `calculate_binary_math` in datafusion-functions [#18525](https://github.com/apache/datafusion/pull/18525) (Jefffrey) +- ci: enforce needless_pass_by_value for datafusion-optimzer [#18533](https://github.com/apache/datafusion/pull/18533) (jizezhang) +- Add comments to Cargo.toml about workspace overrides [#18526](https://github.com/apache/datafusion/pull/18526) (alamb) +- minor: Remove inconsistent comment [#18539](https://github.com/apache/datafusion/pull/18539) (2010YOUY01) +- Refactor `log()` signature to use coercion API + fixes [#18519](https://github.com/apache/datafusion/pull/18519) (Jefffrey) +- chore(deps): bump taiki-e/install-action from 2.62.46 to 2.62.47 [#18508](https://github.com/apache/datafusion/pull/18508) (dependabot[bot]) +- Consolidate builtin functions examples (#18142) [#18523](https://github.com/apache/datafusion/pull/18523) (cj-zhukov) + +## Credits + +Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor. + +``` + 88 dependabot[bot] + 49 Jeffrey Vo + 32 Andrew Lamb + 20 Yongting You + 19 Adrian Garcia Badaracco + 14 Blake Orth + 12 Pepijn Van Eeckhoudt + 12 Piotr Findeisen + 11 Chen Chongchen + 11 Dmitrii Blaginin + 11 Yu-Chuan Hung + 9 Jonathan Chen + 9 Khanh Duong + 9 Oleks V + 9 Peter Nguyen + 8 Alex Huang + 8 Qi Zhu + 8 Raz Luvaton + 7 Adam Gutglick + 7 Rohan Krishnaswamy + 7 kosiew + 6 xudong.w + 5 Nuno Faria + 5 Tim Saucer + 4 Dhanush + 4 Samuele Resca + 4 Simon Vandel Sillesen + 4 Sriram Sundar + 4 Vegard Stikbakke + 3 Bruce Ritchie + 3 David López + 3 EeshanBembi + 3 Jack Kleeman + 3 Kazantsev Maksim + 3 Marko Milenković + 3 Thomas Tanon + 2 Andy Grove + 2 Bruno Volpato + 2 Christian + 2 Colin Marc + 2 Cora Sutton + 2 David Stancu + 2 Devam Patel + 2 Eugene Tolbakov + 2 Evgenii Glotov + 2 Kristin Cowalcijk + 2 Liam Bao + 2 Marc Brinkmann + 2 Michael Kleen + 2 Namgung Chan + 2 Ning Sun + 2 Randy + 2 Sergey Zhukov + 2 Viktor Yershov + 2 bubulalabu + 2 dennis zhuang + 2 jizezhang + 2 wiedld + 1 Ahmed Mezghani + 1 Aldrin M + 1 Alfonso Subiotto Marqués + 1 Anders + 1 Artem Medvedev + 1 Aryamaan Singh + 1 Ben Bellick + 1 Berkay Şahin + 1 Bert Vermeiren + 1 Brent Gardner + 1 Christopher Watford + 1 Dan Lovell + 1 Daniël Heres + 1 Dewey Dunnington + 1 Douglas Anderson + 1 Duong Cong Toai + 1 Emil Ernerfeldt + 1 Emily Matheys + 1 Enrico La Sala + 1 Eshed Schacham + 1 Filippo Rossi + 1 Gabriel + 1 Gene Bordegaray + 1 Georgi Krastev + 1 Heran Lin + 1 Hiroaki Yutani + 1 Ian Lai + 1 Ilya Ostanevich + 1 JanKaul + 1 Kosta Tarasov + 1 LFC + 1 Leonardo Yvens + 1 Lía Adriana + 1 Manasa Manoj + 1 Martin + 1 Martin Grigorov + 1 Martin Hilton + 1 Mason + 1 Matt Butrovich + 1 Matthew Kim + 1 Matthijs Brobbel + 1 Nga Tran + 1 Nihal Rajak + 1 Rafael Fernández + 1 Renan GEHAN + 1 Renato Marroquin + 1 Rok Mihevc + 1 Ruilei Ma + 1 Sai Mahendra + 1 Sergei Grebnov + 1 Shiv Bhatia + 1 Tobias Schwarzinger + 1 UBarney + 1 Victor Barua + 1 Victorien + 1 Vyquos + 1 Weston Pace + 1 XL Liang + 1 Xander + 1 Zhen Wang + 1 aditya singh rathore + 1 dario curreri 
+ 1 ding-young + 1 feniljain + 1 gene-bordegaray + 1 harshasiddartha + 1 mwish + 1 peasee + 1 r1b + 1 theirix +``` + +Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 9f2a3c6085083..6e5e063a12926 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -98,7 +98,7 @@ The following configuration settings are available: | datafusion.execution.parquet.dictionary_page_size_limit | 1048576 | (writing) Sets best effort maximum dictionary page size, in bytes | | datafusion.execution.parquet.statistics_enabled | page | (writing) Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting | | datafusion.execution.parquet.max_row_group_size | 1048576 | (writing) Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read. | -| datafusion.execution.parquet.created_by | datafusion version 50.3.0 | (writing) Sets "created by" property | +| datafusion.execution.parquet.created_by | datafusion version 51.0.0 | (writing) Sets "created by" property | | datafusion.execution.parquet.column_index_truncate_length | 64 | (writing) Sets column index truncate length | | datafusion.execution.parquet.statistics_truncate_length | 64 | (writing) Sets statistics truncate length. If NULL, uses default parquet writer setting | | datafusion.execution.parquet.data_page_row_count_limit | 20000 | (writing) Sets best effort maximum number of rows in data page | From 812bb357c5543475509d9d51e78710dcb642986e Mon Sep 17 00:00:00 2001 From: Vegard Stikbakke Date: Sun, 9 Nov 2025 16:24:08 +0100 Subject: [PATCH 0042/1589] Make array_reverse faster for List and FixedSizeList (#18500) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Rationale for this change Noticed while doing #18424 that the list types `List` and `FixedSizeList` uses `MutableData` to build the reverse array. Using `take` turns out to be a lot faster, ~70% for both `List` and `FixedSizeList`. This PR also reworks the benchmark added in #18425, and these are the results on that compared to the implementation on main: ``` # cargo bench --bench array_reverse Compiling datafusion-functions-nested v50.3.0 (/Users/vegard/dev/datafusion/datafusion/functions-nested) Finished `bench` profile [optimized] target(s) in 42.08s Running benches/array_reverse.rs (target/release/deps/array_reverse-2c473eed34a53d0a) Gnuplot not found, using plotters backend Benchmarking array_reverse_list: Warming up for 3.0000 s Warning: Unable to complete 100 samples in 5.0s. You may wish to increase target time to 6.3s, or reduce sample count to 70. array_reverse_list time: [62.201 ms 62.551 ms 62.946 ms] change: [−70.137% −69.965% −69.785%] (p = 0.00 < 0.05) Performance has improved. Found 8 outliers among 100 measurements (8.00%) 5 (5.00%) high mild 3 (3.00%) high severe Benchmarking array_reverse_list_view: Warming up for 3.0000 s Warning: Unable to complete 100 samples in 5.0s. You may wish to increase target time to 6.3s, or reduce sample count to 70. array_reverse_list_view time: [61.649 ms 61.905 ms 62.185 ms] change: [−16.122% −15.623% −15.087%] (p = 0.00 < 0.05) Performance has improved. 
Found 6 outliers among 100 measurements (6.00%) 5 (5.00%) high mild 1 (1.00%) high severe array_reverse_fixed_size_list time: [4.7936 ms 4.8292 ms 4.8741 ms] change: [−76.435% −76.196% −75.951%] (p = 0.00 < 0.05) Performance has improved. Found 20 outliers among 100 measurements (20.00%) 8 (8.00%) low mild 5 (5.00%) high mild 7 (7.00%) high severe ``` ## Are these changes tested? Covered by existing sqllogic tests, and one new test for `FixedSizeList`. --- .../functions-nested/benches/array_reverse.rs | 76 +++++++++--- datafusion/functions-nested/src/reverse.rs | 117 ++++++++++++------ 2 files changed, 138 insertions(+), 55 deletions(-) diff --git a/datafusion/functions-nested/benches/array_reverse.rs b/datafusion/functions-nested/benches/array_reverse.rs index d4a63e36403af..92a65128fe6ba 100644 --- a/datafusion/functions-nested/benches/array_reverse.rs +++ b/datafusion/functions-nested/benches/array_reverse.rs @@ -24,7 +24,7 @@ use std::{hint::black_box, sync::Arc}; use crate::criterion::Criterion; use arrow::{ array::{ArrayRef, FixedSizeListArray, Int32Array, ListArray, ListViewArray}, - buffer::{OffsetBuffer, ScalarBuffer}, + buffer::{NullBuffer, OffsetBuffer, ScalarBuffer}, datatypes::{DataType, Field}, }; use datafusion_functions_nested::reverse::array_reverse_inner; @@ -34,44 +34,80 @@ fn array_reverse(array: &ArrayRef) -> ArrayRef { } fn criterion_benchmark(c: &mut Criterion) { - // Construct large arrays for benchmarking - let array_len = 100000; - let step_size: usize = 1000; - let offsets: Vec = (0..array_len as i32).step_by(step_size).collect(); + // Create array sizes with step size of 100, starting from 100. + let number_of_arrays = 1000; + let sizes = (0..number_of_arrays) + .map(|i| 100 + i * 100) + .collect::>(); + + // Calculate the total number of values + let total_values = sizes.iter().sum::(); + + // Calculate sizes and offsets from array lengths + let offsets = sizes + .iter() + .scan(0, |acc, &x| { + let offset = *acc; + *acc += x; + Some(offset) + }) + .collect::>(); let offsets = ScalarBuffer::from(offsets); - let sizes: Vec = vec![step_size as i32; array_len / step_size]; - let values = (0..array_len as i32).collect::>(); + // Set every 10th array to null + let nulls = (0..number_of_arrays) + .map(|i| i % 10 != 0) + .collect::>(); + + let values = (0..total_values).collect::>(); + let values = Arc::new(Int32Array::from(values)); + + // Create ListArray and ListViewArray + let nulls_list_array = Some(NullBuffer::from( + nulls[..((number_of_arrays as usize) - 1)].to_vec(), + )); let list_array: ArrayRef = Arc::new(ListArray::new( Arc::new(Field::new("a", DataType::Int32, false)), OffsetBuffer::new(offsets.clone()), - Arc::new(Int32Array::from(values.clone())), - None, + values.clone(), + nulls_list_array, )); - let fixed_size_list_array: ArrayRef = Arc::new(FixedSizeListArray::new( - Arc::new(Field::new("a", DataType::Int32, false)), - step_size as i32, - Arc::new(Int32Array::from(values.clone())), - None, + let nulls_list_view_array = Some(NullBuffer::from( + nulls[..(number_of_arrays as usize)].to_vec(), )); let list_view_array: ArrayRef = Arc::new(ListViewArray::new( Arc::new(Field::new("a", DataType::Int32, false)), offsets, ScalarBuffer::from(sizes), - Arc::new(Int32Array::from(values)), - None, + values.clone(), + nulls_list_view_array, )); c.bench_function("array_reverse_list", |b| { b.iter(|| array_reverse(&list_array)) }); - c.bench_function("array_reverse_fixed_size_list", |b| { - b.iter(|| array_reverse(&fixed_size_list_array)) - }); - 
c.bench_function("array_reverse_list_view", |b| { b.iter(|| array_reverse(&list_view_array)) }); + + // Create FixedSizeListArray + let array_len = 1000; + let num_arrays = 5000; + let total_values = num_arrays * array_len; + let values = (0..total_values).collect::>(); + let values = Arc::new(Int32Array::from(values)); + // Set every 10th array to null + let nulls = (0..num_arrays).map(|i| i % 10 != 0).collect::>(); + let nulls = Some(NullBuffer::from(nulls)); + let fixed_size_list_array: ArrayRef = Arc::new(FixedSizeListArray::new( + Arc::new(Field::new("a", DataType::Int32, false)), + array_len, + values.clone(), + nulls.clone(), + )); + c.bench_function("array_reverse_fixed_size_list", |b| { + b.iter(|| array_reverse(&fixed_size_list_array)) + }); } criterion_group!(benches, criterion_benchmark); diff --git a/datafusion/functions-nested/src/reverse.rs b/datafusion/functions-nested/src/reverse.rs index 635f23967a198..df873ade798d3 100644 --- a/datafusion/functions-nested/src/reverse.rs +++ b/datafusion/functions-nested/src/reverse.rs @@ -19,8 +19,8 @@ use crate::utils::make_scalar_function; use arrow::array::{ - Array, ArrayRef, Capacities, FixedSizeListArray, GenericListArray, - GenericListViewArray, MutableArrayData, OffsetSizeTrait, UInt32Array, + Array, ArrayRef, FixedSizeListArray, GenericListArray, GenericListViewArray, + OffsetSizeTrait, UInt32Array, UInt64Array, }; use arrow::buffer::{OffsetBuffer, ScalarBuffer}; use arrow::compute::take; @@ -155,11 +155,8 @@ fn general_array_reverse( field: &FieldRef, ) -> Result { let values = array.values(); - let original_data = values.to_data(); - let capacity = Capacities::Array(original_data.len()); let mut offsets = vec![O::usize_as(0)]; - let mut mutable = - MutableArrayData::with_capacities(vec![&original_data], false, capacity); + let mut indices: Vec = Vec::with_capacity(values.len()); for (row_index, (&start, &end)) in array.offsets().iter().tuple_windows().enumerate() { @@ -171,18 +168,34 @@ fn general_array_reverse( let mut index = end - O::one(); while index >= start { - mutable.extend(0, index.to_usize().unwrap(), index.to_usize().unwrap() + 1); + indices.push(index); index = index - O::one(); } let size = end - start; offsets.push(offsets[row_index] + size); } - let data = mutable.freeze(); + // Materialize values from underlying array with take + let indices_array: ArrayRef = if O::IS_LARGE { + Arc::new(UInt64Array::from( + indices + .iter() + .map(|i| i.as_usize() as u64) + .collect::>(), + )) + } else { + Arc::new(UInt32Array::from( + indices + .iter() + .map(|i| i.as_usize() as u32) + .collect::>(), + )) + }; + let values = take(&values, &indices_array, None)?; Ok(Arc::new(GenericListArray::::try_new( Arc::clone(field), OffsetBuffer::::new(offsets.into()), - arrow::array::make_array(data), + values, array.nulls().cloned(), )?)) } @@ -231,7 +244,7 @@ fn list_view_reverse( // Materialize values from underlying array with take let indices_array: ArrayRef = if O::IS_LARGE { - Arc::new(arrow::array::UInt64Array::from( + Arc::new(UInt64Array::from( indices .iter() .map(|i| i.as_usize() as u64) @@ -245,13 +258,12 @@ fn list_view_reverse( .collect::>(), )) }; - let values_reversed = take(&values, &indices_array, None)?; - + let values = take(&values, &indices_array, None)?; Ok(Arc::new(GenericListViewArray::::try_new( Arc::clone(field), ScalarBuffer::from(new_offsets), ScalarBuffer::from(new_sizes), - values_reversed, + values, array.nulls().cloned(), )?)) } @@ -260,42 +272,34 @@ fn fixed_size_array_reverse( array: 
&FixedSizeListArray, field: &FieldRef, ) -> Result { - let values = array.values(); - let original_data = values.to_data(); - let capacity = Capacities::Array(original_data.len()); - let mut mutable = - MutableArrayData::with_capacities(vec![&original_data], false, capacity); - let value_length = array.value_length() as usize; + let values: &Arc = array.values(); - for row_index in 0..array.len() { - // skip the null value - if array.is_null(row_index) { - mutable.extend(0, 0, value_length); - continue; - } - let start = row_index * value_length; - let end = start + value_length; - for idx in (start..end).rev() { - mutable.extend(0, idx, idx + 1); - } + // Since each fixed size list in the physical array is the same size and we keep the order + // of the fixed size lists, we can reverse the indices for each fixed size list. + let mut indices: Vec = (0..values.len() as u64).collect(); + for chunk in indices.chunks_mut(array.value_length() as usize) { + chunk.reverse(); } - let data = mutable.freeze(); + // Materialize values from underlying array with take + let indices_array: ArrayRef = Arc::new(UInt64Array::from(indices)); + let values = take(&values, &indices_array, None)?; + Ok(Arc::new(FixedSizeListArray::try_new( Arc::clone(field), array.value_length(), - arrow::array::make_array(data), + values, array.nulls().cloned(), )?)) } #[cfg(test)] mod tests { - use crate::reverse::list_view_reverse; + use crate::reverse::{fixed_size_array_reverse, list_view_reverse}; use arrow::{ array::{ - AsArray, GenericListViewArray, Int32Array, LargeListViewArray, ListViewArray, - OffsetSizeTrait, + AsArray, FixedSizeListArray, GenericListViewArray, Int32Array, + LargeListViewArray, ListViewArray, OffsetSizeTrait, }, buffer::{NullBuffer, ScalarBuffer}, datatypes::{DataType, Field, Int32Type}, @@ -312,6 +316,13 @@ mod tests { .collect() } + fn fixed_size_list_values(array: &FixedSizeListArray) -> Vec>> { + array + .iter() + .map(|x| x.map(|x| x.as_primitive::().values().to_vec())) + .collect() + } + #[test] fn test_reverse_list_view() -> Result<()> { let field = Arc::new(Field::new("a", DataType::Int32, false)); @@ -450,4 +461,40 @@ mod tests { assert_eq!(expected, reversed); Ok(()) } + + #[test] + fn test_reverse_fixed_size_list() -> Result<()> { + let field = Arc::new(Field::new("a", DataType::Int32, false)); + let values = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9])); + let result = fixed_size_array_reverse( + &FixedSizeListArray::new( + field, + 3, + values, + Some(NullBuffer::from(vec![true, false, true])), + ), + &Arc::new(Field::new("test", DataType::Int32, true)), + )?; + let reversed = fixed_size_list_values(result.as_fixed_size_list()); + let expected = vec![Some(vec![3, 2, 1]), None, Some(vec![9, 8, 7])]; + assert_eq!(expected, reversed); + Ok(()) + } + + #[test] + fn test_reverse_fixed_size_list_empty() -> Result<()> { + let field = Arc::new(Field::new("a", DataType::Int32, false)); + let empty_array: Vec = vec![]; + let values = Arc::new(Int32Array::from(empty_array)); + let nulls = None; + let fixed_size_list = FixedSizeListArray::new(field, 3, values, nulls); + let result = fixed_size_array_reverse( + &fixed_size_list, + &Arc::new(Field::new("test", DataType::Int32, true)), + )?; + let reversed = fixed_size_list_values(result.as_fixed_size_list()); + let expected: Vec>> = vec![]; + assert_eq!(expected, reversed); + Ok(()) + } } From b131cac1b0cf6ace6e250a1b32a6f210269dec13 Mon Sep 17 00:00:00 2001 From: Sergey Zhukov <62326549+cj-zhukov@users.noreply.github.com> Date: Sun, 9 
Nov 2025 19:48:35 +0300 Subject: [PATCH 0043/1589] Consolidate custom data source examples (#18142) (#18553) ## Which issue does this PR close? This PR is for consolidating all the `custom_data_source` examples into a single example binary. We are agreed on the pattern and we can apply it to the remaining examples - part of #https://github.com/apache/datafusion/issues/18142. ## Rationale for this change ## What changes are included in this PR? ## Are these changes tested? ## Are there any user-facing changes? --------- Co-authored-by: Sergey Zhukov Co-authored-by: Andrew Lamb --- datafusion-examples/Cargo.toml | 4 - datafusion-examples/README.md | 12 +- .../examples/builtin_functions/main.rs | 5 + .../csv_json_opener.rs | 3 +- .../csv_sql_streaming.rs | 3 +- .../custom_datasource.rs | 3 +- .../custom_file_casts.rs | 4 +- .../custom_file_format.rs | 74 +++++----- .../file_stream_provider.rs | 36 +++-- .../examples/custom_data_source/main.rs | 126 ++++++++++++++++++ datafusion-examples/examples/flight/main.rs | 5 + datafusion-examples/examples/udf/main.rs | 5 + 12 files changed, 210 insertions(+), 70 deletions(-) rename datafusion-examples/examples/{ => custom_data_source}/csv_json_opener.rs (99%) rename datafusion-examples/examples/{ => custom_data_source}/csv_sql_streaming.rs (98%) rename datafusion-examples/examples/{ => custom_data_source}/custom_datasource.rs (99%) rename datafusion-examples/examples/{ => custom_data_source}/custom_file_casts.rs (99%) rename datafusion-examples/examples/{ => custom_data_source}/custom_file_format.rs (97%) rename datafusion-examples/examples/{ => custom_data_source}/file_stream_provider.rs (91%) create mode 100644 datafusion-examples/examples/custom_data_source/main.rs diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml index 38f1f8b0e0cad..61711f8472ebe 100644 --- a/datafusion-examples/Cargo.toml +++ b/datafusion-examples/Cargo.toml @@ -43,10 +43,6 @@ path = "examples/external_dependency/dataframe-to-s3.rs" name = "query_aws_s3" path = "examples/external_dependency/query-aws-s3.rs" -[[example]] -name = "custom_file_casts" -path = "examples/custom_file_casts.rs" - [dev-dependencies] arrow = { workspace = true } # arrow_schema is required for record_batch! 
macro :sad: diff --git a/datafusion-examples/README.md b/datafusion-examples/README.md index 1befba6be66fd..62e51a7900145 100644 --- a/datafusion-examples/README.md +++ b/datafusion-examples/README.md @@ -54,18 +54,18 @@ cargo run --example dataframe - [`analyzer_rule.rs`](examples/analyzer_rule.rs): Use a custom AnalyzerRule to change a query's semantics (row level access control) - [`catalog.rs`](examples/catalog.rs): Register the table into a custom catalog - [`composed_extension_codec`](examples/composed_extension_codec.rs): Example of using multiple extension codecs for serialization / deserialization -- [`csv_sql_streaming.rs`](examples/csv_sql_streaming.rs): Build and run a streaming query plan from a SQL statement against a local CSV file -- [`csv_json_opener.rs`](examples/csv_json_opener.rs): Use low level `FileOpener` APIs to read CSV/JSON into Arrow `RecordBatch`es -- [`custom_datasource.rs`](examples/custom_datasource.rs): Run queries against a custom datasource (TableProvider) -- [`custom_file_casts.rs`](examples/custom_file_casts.rs): Implement custom casting rules to adapt file schemas -- [`custom_file_format.rs`](examples/custom_file_format.rs): Write data to a custom file format +- [`examples/custom_data_source/csv_sql_streaming.rs`](examples/custom_data_source/csv_sql_streaming.rs): Build and run a streaming query plan from a SQL statement against a local CSV file +- [`examples/custom_data_source/csv_json_opener.rs`](examples/custom_data_source/csv_json_opener.rs): Use low level `FileOpener` APIs to read CSV/JSON into Arrow `RecordBatch`es +- [`examples/custom_data_source/custom_datasource.rs`](examples/custom_data_source/custom_datasource.rs): Run queries against a custom datasource (TableProvider) +- [`examples/custom_data_source/custom_file_casts.rs`](examples/custom_data_source/custom_file_casts.rs): Implement custom casting rules to adapt file schemas +- [`examples/custom_data_source/custom_file_format.rs`](examples/custom_data_source/custom_file_format.rs): Write data to a custom file format - [`dataframe-to-s3.rs`](examples/external_dependency/dataframe-to-s3.rs): Run a query using a DataFrame against a parquet file from s3 and writing back to s3 - [`dataframe.rs`](examples/dataframe.rs): Run a query using a DataFrame API against parquet files, csv files, and in-memory data, including multiple subqueries. Also demonstrates the various methods to write out a DataFrame to a table, parquet file, csv file, and json file. - [`examples/builtin_functions/date_time`](examples/builtin_functions/date_time.rs): Examples of date-time related functions and queries - [`default_column_values.rs`](examples/default_column_values.rs): Implement custom default value handling for missing columns using field metadata and PhysicalExprAdapter - [`deserialize_to_struct.rs`](examples/deserialize_to_struct.rs): Convert query results (Arrow ArrayRefs) into Rust structs - [`expr_api.rs`](examples/expr_api.rs): Create, execute, simplify, analyze and coerce `Expr`s -- [`file_stream_provider.rs`](examples/file_stream_provider.rs): Run a query on `FileStreamProvider` which implements `StreamProvider` for reading and writing to arbitrary stream sources / sinks. +- [`examples/custom_data_source/file_stream_provider.rs`](examples/custom_data_source/file_stream_provider.rs): Run a query on `FileStreamProvider` which implements `StreamProvider` for reading and writing to arbitrary stream sources / sinks. 
- [`flight/sql_server.rs`](examples/flight/sql_server.rs): Run DataFusion as a standalone process and execute SQL queries from Flight and and FlightSQL (e.g. JDBC) clients - [`examples/builtin_functions/function_factory.rs`](examples/builtin_functions/function_factory.rs): Register `CREATE FUNCTION` handler to implement SQL macros - [`memory_pool_tracking.rs`](examples/memory_pool_tracking.rs): Demonstrates TrackConsumersPool for memory tracking and debugging with enhanced error messages diff --git a/datafusion-examples/examples/builtin_functions/main.rs b/datafusion-examples/examples/builtin_functions/main.rs index 3399c395bfd62..c307bc9532bff 100644 --- a/datafusion-examples/examples/builtin_functions/main.rs +++ b/datafusion-examples/examples/builtin_functions/main.rs @@ -19,6 +19,11 @@ //! //! These examples demonstrate miscellaneous function-related features. //! +//! ## Usage +//! ```bash +//! cargo run --example builtin_functions -- [date_time|function_factory|regexp] +//! ``` +//! //! Each subcommand runs a corresponding example: //! - `date_time` — examples of date-time related functions and queries //! - `function_factory` — register `CREATE FUNCTION` handler to implement SQL macros diff --git a/datafusion-examples/examples/csv_json_opener.rs b/datafusion-examples/examples/custom_data_source/csv_json_opener.rs similarity index 99% rename from datafusion-examples/examples/csv_json_opener.rs rename to datafusion-examples/examples/custom_data_source/csv_json_opener.rs index 6d0e4f4a3da7a..4205bbcdf86a7 100644 --- a/datafusion-examples/examples/csv_json_opener.rs +++ b/datafusion-examples/examples/custom_data_source/csv_json_opener.rs @@ -40,8 +40,7 @@ use object_store::{local::LocalFileSystem, memory::InMemory, ObjectStore}; /// read data from (CSV/JSON) into Arrow RecordBatches. 
/// /// If you want to query data in CSV or JSON files, see the [`dataframe.rs`] and [`sql_query.rs`] examples -#[tokio::main] -async fn main() -> Result<()> { +pub async fn csv_json_opener() -> Result<()> { csv_opener().await?; json_opener().await?; Ok(()) diff --git a/datafusion-examples/examples/csv_sql_streaming.rs b/datafusion-examples/examples/custom_data_source/csv_sql_streaming.rs similarity index 98% rename from datafusion-examples/examples/csv_sql_streaming.rs rename to datafusion-examples/examples/custom_data_source/csv_sql_streaming.rs index 99264bbcb486d..aca63c4f35c2f 100644 --- a/datafusion-examples/examples/csv_sql_streaming.rs +++ b/datafusion-examples/examples/custom_data_source/csv_sql_streaming.rs @@ -21,8 +21,7 @@ use datafusion::prelude::*; /// This example demonstrates executing a simple query against an Arrow data source (CSV) and /// fetching results with streaming aggregation and streaming window -#[tokio::main] -async fn main() -> Result<()> { +pub async fn csv_sql_streaming() -> Result<()> { // create local execution context let ctx = SessionContext::new(); diff --git a/datafusion-examples/examples/custom_datasource.rs b/datafusion-examples/examples/custom_data_source/custom_datasource.rs similarity index 99% rename from datafusion-examples/examples/custom_datasource.rs rename to datafusion-examples/examples/custom_data_source/custom_datasource.rs index bc865fac5a338..2213d50fccda4 100644 --- a/datafusion-examples/examples/custom_datasource.rs +++ b/datafusion-examples/examples/custom_data_source/custom_datasource.rs @@ -42,8 +42,7 @@ use datafusion::catalog::Session; use tokio::time::timeout; /// This example demonstrates executing a simple query against a custom datasource -#[tokio::main] -async fn main() -> Result<()> { +pub async fn custom_datasource() -> Result<()> { // create our custom datasource and adding some users let db = CustomDataSource::default(); db.populate_users(); diff --git a/datafusion-examples/examples/custom_file_casts.rs b/datafusion-examples/examples/custom_data_source/custom_file_casts.rs similarity index 99% rename from datafusion-examples/examples/custom_file_casts.rs rename to datafusion-examples/examples/custom_data_source/custom_file_casts.rs index 4d97ecd91dc64..31ec2845c6110 100644 --- a/datafusion-examples/examples/custom_file_casts.rs +++ b/datafusion-examples/examples/custom_data_source/custom_file_casts.rs @@ -44,9 +44,7 @@ use object_store::{ObjectStore, PutPayload}; // This example enforces that casts must be strictly widening: if the file type is Int64 and the table type is Int32, it will error // before even reading the data. // Without this custom cast rule DataFusion would happily do the narrowing cast, potentially erroring only if it found a row with data it could not cast. 
- -#[tokio::main] -async fn main() -> Result<()> { +pub async fn custom_file_casts() -> Result<()> { println!("=== Creating example data ==="); // Create a logical / table schema with an Int32 column diff --git a/datafusion-examples/examples/custom_file_format.rs b/datafusion-examples/examples/custom_data_source/custom_file_format.rs similarity index 97% rename from datafusion-examples/examples/custom_file_format.rs rename to datafusion-examples/examples/custom_data_source/custom_file_format.rs index 3505651eb183c..510fa53c593f8 100644 --- a/datafusion-examples/examples/custom_file_format.rs +++ b/datafusion-examples/examples/custom_data_source/custom_file_format.rs @@ -48,6 +48,42 @@ use tempfile::tempdir; /// TSVFileFormatFactory is responsible for creating instances of TSVFileFormat. /// The former, once registered with the SessionState, will then be used /// to facilitate SQL operations on TSV files, such as `COPY TO` shown here. +pub async fn custom_file_format() -> Result<()> { + // Create a new context with the default configuration + let mut state = SessionStateBuilder::new().with_default_features().build(); + + // Register the custom file format + let file_format = Arc::new(TSVFileFactory::new()); + state.register_file_format(file_format, true)?; + + // Create a new context with the custom file format + let ctx = SessionContext::new_with_state(state); + + let mem_table = create_mem_table(); + ctx.register_table("mem_table", mem_table)?; + + let temp_dir = tempdir().unwrap(); + let table_save_path = temp_dir.path().join("mem_table.tsv"); + + let d = ctx + .sql(&format!( + "COPY mem_table TO '{}' STORED AS TSV;", + table_save_path.display(), + )) + .await?; + + let results = d.collect().await?; + println!( + "Number of inserted rows: {:?}", + (results[0] + .column_by_name("count") + .unwrap() + .as_primitive::() + .value(0)) + ); + + Ok(()) +} #[derive(Debug)] /// Custom file format that reads and writes TSV files @@ -181,44 +217,6 @@ impl GetExt for TSVFileFactory { } } -#[tokio::main] -async fn main() -> Result<()> { - // Create a new context with the default configuration - let mut state = SessionStateBuilder::new().with_default_features().build(); - - // Register the custom file format - let file_format = Arc::new(TSVFileFactory::new()); - state.register_file_format(file_format, true).unwrap(); - - // Create a new context with the custom file format - let ctx = SessionContext::new_with_state(state); - - let mem_table = create_mem_table(); - ctx.register_table("mem_table", mem_table).unwrap(); - - let temp_dir = tempdir().unwrap(); - let table_save_path = temp_dir.path().join("mem_table.tsv"); - - let d = ctx - .sql(&format!( - "COPY mem_table TO '{}' STORED AS TSV;", - table_save_path.display(), - )) - .await?; - - let results = d.collect().await?; - println!( - "Number of inserted rows: {:?}", - (results[0] - .column_by_name("count") - .unwrap() - .as_primitive::() - .value(0)) - ); - - Ok(()) -} - // create a simple mem table fn create_mem_table() -> Arc { let fields = vec![ diff --git a/datafusion-examples/examples/file_stream_provider.rs b/datafusion-examples/examples/custom_data_source/file_stream_provider.rs similarity index 91% rename from datafusion-examples/examples/file_stream_provider.rs rename to datafusion-examples/examples/custom_data_source/file_stream_provider.rs index e6c59d57e98de..55d2cc8cc0af2 100644 --- a/datafusion-examples/examples/file_stream_provider.rs +++ b/datafusion-examples/examples/custom_data_source/file_stream_provider.rs @@ -15,6 +15,29 @@ // 
specific language governing permissions and limitations // under the License. +/// Demonstrates how to use [`FileStreamProvider`] and [`StreamTable`] to stream data +/// from a file-like source (FIFO) into DataFusion for continuous querying. +/// +/// On non-Windows systems, this example creates a named pipe (FIFO) and +/// writes rows into it asynchronously while DataFusion reads the data +/// through a `FileStreamProvider`. +/// +/// This illustrates how to integrate dynamically updated data sources +/// with DataFusion without needing to reload the entire dataset each time. +/// +/// This example does not work on Windows. +pub async fn file_stream_provider() -> datafusion::error::Result<()> { + #[cfg(target_os = "windows")] + { + println!("file_stream_provider example does not work on windows"); + Ok(()) + } + #[cfg(not(target_os = "windows"))] + { + non_windows::main().await + } +} + #[cfg(not(target_os = "windows"))] mod non_windows { use datafusion::assert_batches_eq; @@ -186,16 +209,3 @@ mod non_windows { Ok(()) } } - -#[tokio::main] -async fn main() -> datafusion::error::Result<()> { - #[cfg(target_os = "windows")] - { - println!("file_stream_provider example does not work on windows"); - Ok(()) - } - #[cfg(not(target_os = "windows"))] - { - non_windows::main().await - } -} diff --git a/datafusion-examples/examples/custom_data_source/main.rs b/datafusion-examples/examples/custom_data_source/main.rs new file mode 100644 index 0000000000000..ce0585f8c3f7b --- /dev/null +++ b/datafusion-examples/examples/custom_data_source/main.rs @@ -0,0 +1,126 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! # These examples are all related to extending or defining how DataFusion reads data +//! +//! These examples demonstrate how DataFusion reads data. +//! +//! ## Usage +//! ```bash +//! cargo run --example custom_data_source -- [csv_json_opener|csv_sql_streaming|custom_datasource|custom_file_casts|custom_file_format|file_stream_provider] +//! ``` +//! +//! Each subcommand runs a corresponding example: +//! - `csv_json_opener` — use low level FileOpener APIs to read CSV/JSON into Arrow RecordBatches +//! - `csv_sql_streaming` — build and run a streaming query plan from a SQL statement against a local CSV file +//! - `custom_datasource` — run queries against a custom datasource (TableProvider) +//! - `custom_file_casts` — implement custom casting rules to adapt file schemas +//! - `custom_file_format` — write data to a custom file format +//! 
- `file_stream_provider` — run a query on FileStreamProvider which implements StreamProvider for reading and writing to arbitrary stream sources/sinks
+
+mod csv_json_opener;
+mod csv_sql_streaming;
+mod custom_datasource;
+mod custom_file_casts;
+mod custom_file_format;
+mod file_stream_provider;
+
+use std::str::FromStr;
+
+use datafusion::error::{DataFusionError, Result};
+
+enum ExampleKind {
+    CsvJsonOpener,
+    CsvSqlStreaming,
+    CustomDatasource,
+    CustomFileCasts,
+    CustomFileFormat,
+    FileStreamProvider,
+}
+
+impl AsRef for ExampleKind {
+    fn as_ref(&self) -> &str {
+        match self {
+            Self::CsvJsonOpener => "csv_json_opener",
+            Self::CsvSqlStreaming => "csv_sql_streaming",
+            Self::CustomDatasource => "custom_datasource",
+            Self::CustomFileCasts => "custom_file_casts",
+            Self::CustomFileFormat => "custom_file_format",
+            Self::FileStreamProvider => "file_stream_provider",
+        }
+    }
+}
+
+impl FromStr for ExampleKind {
+    type Err = DataFusionError;
+
+    fn from_str(s: &str) -> Result {
+        match s {
+            "csv_json_opener" => Ok(Self::CsvJsonOpener),
+            "csv_sql_streaming" => Ok(Self::CsvSqlStreaming),
+            "custom_datasource" => Ok(Self::CustomDatasource),
+            "custom_file_casts" => Ok(Self::CustomFileCasts),
+            "custom_file_format" => Ok(Self::CustomFileFormat),
+            "file_stream_provider" => Ok(Self::FileStreamProvider),
+            _ => Err(DataFusionError::Execution(format!("Unknown example: {s}"))),
+        }
+    }
+}
+
+impl ExampleKind {
+    const ALL: [Self; 6] = [
+        Self::CsvJsonOpener,
+        Self::CsvSqlStreaming,
+        Self::CustomDatasource,
+        Self::CustomFileCasts,
+        Self::CustomFileFormat,
+        Self::FileStreamProvider,
+    ];
+
+    const EXAMPLE_NAME: &str = "custom_data_source";
+
+    fn variants() -> Vec<&'static str> {
+        Self::ALL.iter().map(|x| x.as_ref()).collect()
+    }
+}
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    let usage = format!(
+        "Usage: cargo run --example {} -- [{}]",
+        ExampleKind::EXAMPLE_NAME,
+        ExampleKind::variants().join("|")
+    );
+
+    let arg = std::env::args().nth(1).ok_or_else(|| {
+        eprintln!("{usage}");
+        DataFusionError::Execution("Missing argument".to_string())
+    })?;
+
+    match arg.parse::()? {
+        ExampleKind::CsvJsonOpener => csv_json_opener::csv_json_opener().await?,
+        ExampleKind::CsvSqlStreaming => csv_sql_streaming::csv_sql_streaming().await?,
+        ExampleKind::CustomDatasource => custom_datasource::custom_datasource().await?,
+        ExampleKind::CustomFileCasts => custom_file_casts::custom_file_casts().await?,
+        ExampleKind::CustomFileFormat => custom_file_format::custom_file_format().await?,
+        ExampleKind::FileStreamProvider => {
+            file_stream_provider::file_stream_provider().await?
+        }
+    }
+
+    Ok(())
+}
diff --git a/datafusion-examples/examples/flight/main.rs b/datafusion-examples/examples/flight/main.rs
index a448789b353b9..a83b19bac42eb 100644
--- a/datafusion-examples/examples/flight/main.rs
+++ b/datafusion-examples/examples/flight/main.rs
@@ -19,6 +19,11 @@
 //!
 //! These examples demonstrate Arrow Flight usage.
 //!
+//! ## Usage
+//! ```bash
+//! cargo run --example flight -- [client|server|sql_server]
+//! ```
+//!
 //! Each subcommand runs a corresponding example:
 //! - `client` — run DataFusion as a standalone process and execute SQL queries from a client using the Flight protocol
 //!
- `server` — run DataFusion as a standalone process and execute SQL queries from a client using the Flight protocol diff --git a/datafusion-examples/examples/udf/main.rs b/datafusion-examples/examples/udf/main.rs index ba36dbb15c58b..104d373937809 100644 --- a/datafusion-examples/examples/udf/main.rs +++ b/datafusion-examples/examples/udf/main.rs @@ -19,6 +19,11 @@ //! //! These examples demonstrate user-defined functions in DataFusion. //! +//! ## Usage +//! ```bash +//! cargo run --example udf -- [adv_udaf|adv_udf|adv_udwf|async_udf|udaf|udf|udtf|udwf] +//! ``` +//! //! Each subcommand runs a corresponding example: //! - `adv_udaf` — user defined aggregate function example //! - `adv_udf` — user defined scalar function example From 28755b1d7eb5222a8f5fb5417134dd6865ac1311 Mon Sep 17 00:00:00 2001 From: Blake Orth Date: Sun, 9 Nov 2025 09:49:20 -0700 Subject: [PATCH 0044/1589] Normalize partitioned and flat object listing (#18146) ## Which issue does this PR close? - https://github.com/apache/datafusion/issues/17211 It's not yet clear to me if this will fully close the above issue, or if it's just the first step. I think there may be more work to do, so I'm not going to have this auto-close the issue. ## Rationale for this change tl;dr of the issue: normalizing the access pattern(s) for objects for partitioned tables should not only reduce the number of requests to a backing object store, but will also allow any existing and/or future caching mechanisms to apply equally to both directory-partitioned and flat tables. List request on `main`: ```sql DataFusion CLI v50.2.0 > \object_store_profiling summary ObjectStore Profile mode set to Summary > CREATE EXTERNAL TABLE overture_partitioned STORED AS PARQUET LOCATION 's3://overturemaps-us-west-2/release/2025-09-24.0/'; 0 row(s) fetched. Elapsed 37.236 seconds. Object Store Profiling Instrumented Object Store: instrument_mode: Summary, inner: AmazonS3(overturemaps-us-west-2) Summaries: +-----------+----------+-----+-----+-----+-----+-------+ | Operation | Metric | min | max | avg | sum | count | +-----------+----------+-----+-----+-----+-----+-------+ | List | duration | | | | | 1 | | List | size | | | | | 1 | +-----------+----------+-----+-----+-----+-----+-------+ Instrumented Object Store: instrument_mode: Summary, inner: AmazonS3(overturemaps-us-west-2) Summaries: +-----------+----------+-----------+-----------+-------------+-------------+-------+ | Operation | Metric | min | max | avg | sum | count | +-----------+----------+-----------+-----------+-------------+-------------+-------+ | Get | duration | 0.044411s | 0.338399s | 0.104535s | 162.133179s | 1551 | | Get | size | 8 B | 1285059 B | 338457.56 B | 524947683 B | 1551 | | List | duration | | | | | 3 | | List | size | | | | | 3 | +-----------+----------+-----------+-----------+-------------+-------------+-------+ > select count(*) from overture_partitioned; +------------+ | count(*) | +------------+ | 4219677254 | +------------+ 1 row(s) fetched. Elapsed 40.061 seconds. 
Object Store Profiling Instrumented Object Store: instrument_mode: Summary, inner: AmazonS3(overturemaps-us-west-2) Summaries: +-----------+----------+-----------+-----------+-------------+-------------+-------+ | Operation | Metric | min | max | avg | sum | count | +-----------+----------+-----------+-----------+-------------+-------------+-------+ | Get | duration | 0.042554s | 0.453125s | 0.103147s | 159.980835s | 1551 | | Get | size | 8 B | 1285059 B | 338457.56 B | 524947683 B | 1551 | | List | duration | 0.043498s | 0.196298s | 0.092462s | 2.034174s | 22 | | List | size | | | | | 22 | +-----------+----------+-----------+-----------+-------------+-------------+-------+ > select count(*) from overture_partitioned; +------------+ | count(*) | +------------+ | 4219677254 | +------------+ 1 row(s) fetched. Elapsed 0.924 seconds. Object Store Profiling Instrumented Object Store: instrument_mode: Summary, inner: AmazonS3(overturemaps-us-west-2) Summaries: +-----------+----------+-----------+-----------+-----------+-----------+-------+ | Operation | Metric | min | max | avg | sum | count | +-----------+----------+-----------+-----------+-----------+-----------+-------+ | List | duration | 0.040526s | 0.161407s | 0.092792s | 2.041431s | 22 | | List | size | | | | | 22 | +-----------+----------+-----------+-----------+-----------+-----------+-------+ > ``` List requests for this PR: ```sql DataFusion CLI v50.2.0 > \object_store_profiling summary ObjectStore Profile mode set to Summary > CREATE EXTERNAL TABLE overture_partitioned STORED AS PARQUET LOCATION 's3://overturemaps-us-west-2/release/2025-09-24.0/'; 0 row(s) fetched. Elapsed 33.962 seconds. Object Store Profiling Instrumented Object Store: instrument_mode: Summary, inner: AmazonS3(overturemaps-us-west-2) Summaries: +-----------+----------+-----+-----+-----+-----+-------+ | Operation | Metric | min | max | avg | sum | count | +-----------+----------+-----+-----+-----+-----+-------+ | List | duration | | | | | 1 | | List | size | | | | | 1 | +-----------+----------+-----+-----+-----+-----+-------+ Instrumented Object Store: instrument_mode: Summary, inner: AmazonS3(overturemaps-us-west-2) Summaries: +-----------+----------+-----------+-----------+-------------+-------------+-------+ | Operation | Metric | min | max | avg | sum | count | +-----------+----------+-----------+-----------+-------------+-------------+-------+ | Get | duration | 0.043832s | 0.342730s | 0.110505s | 171.393509s | 1551 | | Get | size | 8 B | 1285059 B | 338457.56 B | 524947683 B | 1551 | | List | duration | | | | | 3 | | List | size | | | | | 3 | +-----------+----------+-----------+-----------+-------------+-------------+-------+ > select count(*) from overture_partitioned; +------------+ | count(*) | +------------+ | 4219677254 | +------------+ 1 row(s) fetched. Elapsed 38.119 seconds. 
Object Store Profiling Instrumented Object Store: instrument_mode: Summary, inner: AmazonS3(overturemaps-us-west-2) Summaries: +-----------+----------+-----------+-----------+-------------+-------------+-------+ | Operation | Metric | min | max | avg | sum | count | +-----------+----------+-----------+-----------+-------------+-------------+-------+ | Get | duration | 0.043186s | 0.296394s | 0.099681s | 154.605286s | 1551 | | Get | size | 8 B | 1285059 B | 338457.56 B | 524947683 B | 1551 | | List | duration | | | | | 1 | | List | size | | | | | 1 | +-----------+----------+-----------+-----------+-------------+-------------+-------+ > select count(*) from overture_partitioned; +------------+ | count(*) | +------------+ | 4219677254 | +------------+ 1 row(s) fetched. Elapsed 0.815 seconds. Object Store Profiling Instrumented Object Store: instrument_mode: Summary, inner: AmazonS3(overturemaps-us-west-2) Summaries: +-----------+----------+-----+-----+-----+-----+-------+ | Operation | Metric | min | max | avg | sum | count | +-----------+----------+-----+-----+-----+-----+-------+ | List | duration | | | | | 1 | | List | size | | | | | 1 | +-----------+----------+-----+-----+-----+-----+-------+ > ``` List operations | Action | `main` | this PR | | ---- | ---- | ---- | | Create Table | 3 | 3 | | Cold-cache Query | 22 | 1 | | Warm-cache Query | 22 | 1 | ## What changes are included in this PR? - Refactored helpers related to listing, discovering, and pruning objects based on partitions to normalize the strategy between partitioned and flat tables ## Are these changes tested? Yes. The internal methods that have been modified are covered by existing tests. ## Are there any user-facing changes? No ## Additional Notes I want to surface that I believe there is a chance for a performance _regression_ for certain queries against certain tables. One performance related mechanism the existing code implements, but this code currently omits, is (potentially) reducing the number of partitions listed based on query filters. In order for the existing code to exercise this optimization the query filters must contain all the path elements of a subdirectory as column filters. E.g. Given a table with a directory-partitioning structure like: ``` path/to/table/a=1/b=2/c=3/data.parquet ``` This query: ```sql select count(*) from table where a=1 and b=2; ``` Will result in listing the following path: ``` LIST: path/to/table/a=1/b=2/ ``` Whereas this query: ```sql select count(*) from table where b=2; ``` Will result in listing the following path: ``` LIST: path/to/table/ ``` I believe the real-world impact of this omission is likely minimal, at least when using high-latency storage such as S3 or other object stores, especially considering the existing implementation is likely to execute multiple sequential `LIST` operations due to its breadth-first search implementation. The most likely configuration for a table that would be negatively impacted would be a table that holds many thousands of underlying objects (most cloud stores return recursive list requests with page sizes of many hundreds to thousands of objects) with a relatively shallow partition structure. I may be able to find or build a dataset that fulfills these criteria to test this assertion if there's concern about it. I believe we could also augment the existing low-level `object_store` interactions to allow listing a prefix on a table, which would allow the same pruning of list operations with the code in this PR. 
The downside to this approach is it either complicates future caching efforts, or leads to cache fragmentation in a simpler cache implementation. I didn't include these changes in this PR to avoid the change set being too large. ## cc @alamb --------- Co-authored-by: Andrew Lamb --- datafusion/catalog-listing/src/helpers.rs | 553 +++--------------- datafusion/core/tests/catalog_listing/mod.rs | 18 + .../catalog_listing/pruned_partition_list.rs | 251 ++++++++ datafusion/core/tests/core_integration.rs | 3 + .../tests/datasource/object_store_access.rs | 63 +- 5 files changed, 371 insertions(+), 517 deletions(-) create mode 100644 datafusion/core/tests/catalog_listing/mod.rs create mode 100644 datafusion/core/tests/catalog_listing/pruned_partition_list.rs diff --git a/datafusion/catalog-listing/src/helpers.rs b/datafusion/catalog-listing/src/helpers.rs index 82cc36867939e..089457648d21f 100644 --- a/datafusion/catalog-listing/src/helpers.rs +++ b/datafusion/catalog-listing/src/helpers.rs @@ -25,12 +25,11 @@ use datafusion_common::internal_err; use datafusion_common::{HashMap, Result, ScalarValue}; use datafusion_datasource::ListingTableUrl; use datafusion_datasource::PartitionedFile; -use datafusion_expr::{BinaryExpr, Operator}; +use datafusion_expr::{lit, utils, BinaryExpr, Operator}; use arrow::{ - array::{Array, ArrayRef, AsArray, StringBuilder}, - compute::{and, cast, prep_null_mask_filter}, - datatypes::{DataType, Field, Fields, Schema}, + array::AsArray, + datatypes::{DataType, Field}, record_batch::RecordBatch, }; use datafusion_expr::execution_props::ExecutionProps; @@ -39,7 +38,7 @@ use futures::{stream::BoxStream, StreamExt, TryStreamExt}; use log::{debug, trace}; use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion}; -use datafusion_common::{Column, DFSchema, DataFusionError}; +use datafusion_common::{Column, DFSchema}; use datafusion_expr::{Expr, Volatility}; use datafusion_physical_expr::create_physical_expr; use object_store::path::Path; @@ -239,105 +238,6 @@ pub async fn list_partitions( Ok(out) } -async fn prune_partitions( - table_path: &ListingTableUrl, - partitions: Vec, - filters: &[Expr], - partition_cols: &[(String, DataType)], -) -> Result> { - if filters.is_empty() { - // prune partitions which don't contain the partition columns - return Ok(partitions - .into_iter() - .filter(|p| { - let cols = partition_cols.iter().map(|x| x.0.as_str()); - !parse_partitions_for_path(table_path, &p.path, cols) - .unwrap_or_default() - .is_empty() - }) - .collect()); - } - - let mut builders: Vec<_> = (0..partition_cols.len()) - .map(|_| StringBuilder::with_capacity(partitions.len(), partitions.len() * 10)) - .collect(); - - for partition in &partitions { - let cols = partition_cols.iter().map(|x| x.0.as_str()); - let parsed = parse_partitions_for_path(table_path, &partition.path, cols) - .unwrap_or_default(); - - let mut builders = builders.iter_mut(); - for (p, b) in parsed.iter().zip(&mut builders) { - b.append_value(p); - } - builders.for_each(|b| b.append_null()); - } - - let arrays = partition_cols - .iter() - .zip(builders) - .map(|((_, d), mut builder)| { - let array = builder.finish(); - cast(&array, d) - }) - .collect::>()?; - - let fields: Fields = partition_cols - .iter() - .map(|(n, d)| Field::new(n, d.clone(), true)) - .collect(); - let schema = Arc::new(Schema::new(fields)); - - let df_schema = DFSchema::from_unqualified_fields( - partition_cols - .iter() - .map(|(n, d)| Field::new(n, d.clone(), true)) - .collect(), - Default::default(), - )?; - - let batch = 
RecordBatch::try_new(schema, arrays)?; - - // TODO: Plumb this down - let props = ExecutionProps::new(); - - // Applies `filter` to `batch` returning `None` on error - let do_filter = |filter| -> Result { - let expr = create_physical_expr(filter, &df_schema, &props)?; - expr.evaluate(&batch)?.into_array(partitions.len()) - }; - - //.Compute the conjunction of the filters - let mask = filters - .iter() - .map(|f| do_filter(f).map(|a| a.as_boolean().clone())) - .reduce(|a, b| Ok(and(&a?, &b?)?)); - - let mask = match mask { - Some(Ok(mask)) => mask, - Some(Err(err)) => return Err(err), - None => return Ok(partitions), - }; - - // Don't retain partitions that evaluated to null - let prepared = match mask.null_count() { - 0 => mask, - _ => prep_null_mask_filter(&mask), - }; - - // Sanity check - assert_eq!(prepared.len(), partitions.len()); - - let filtered = partitions - .into_iter() - .zip(prepared.values()) - .filter_map(|(p, f)| f.then_some(p)) - .collect(); - - Ok(filtered) -} - #[derive(Debug)] enum PartitionValue { Single(String), @@ -412,6 +312,62 @@ pub fn evaluate_partition_prefix<'a>( } } +fn filter_partitions( + pf: PartitionedFile, + filters: &[Expr], + df_schema: &DFSchema, +) -> Result> { + if pf.partition_values.is_empty() && !filters.is_empty() { + return Ok(None); + } else if filters.is_empty() { + return Ok(Some(pf)); + } + + let arrays = pf + .partition_values + .iter() + .map(|v| v.to_array()) + .collect::>()?; + + let batch = RecordBatch::try_new(Arc::clone(df_schema.inner()), arrays)?; + + let filter = utils::conjunction(filters.iter().cloned()).unwrap_or_else(|| lit(true)); + let props = ExecutionProps::new(); + let expr = create_physical_expr(&filter, df_schema, &props)?; + + // Since we're only operating on a single file, our batch and resulting "array" holds only one + // value indicating if the input file matches the provided filters + let matches = expr.evaluate(&batch)?.into_array(1)?; + if matches.as_boolean().value(0) { + return Ok(Some(pf)); + } + + Ok(None) +} + +fn try_into_partitioned_file( + object_meta: ObjectMeta, + partition_cols: &[(String, DataType)], + table_path: &ListingTableUrl, +) -> Result { + let cols = partition_cols.iter().map(|(name, _)| name.as_str()); + let parsed = parse_partitions_for_path(table_path, &object_meta.location, cols); + + let partition_values = parsed + .into_iter() + .flatten() + .zip(partition_cols) + .map(|(parsed, (_, datatype))| { + ScalarValue::try_from_string(parsed.to_string(), datatype) + }) + .collect::>>()?; + + let mut pf: PartitionedFile = object_meta.into(); + pf.partition_values = partition_values; + + Ok(pf) +} + /// Discover the partitions on the given path and prune out files /// that belong to irrelevant partitions using `filters` expressions. /// `filters` should only contain expressions that can be evaluated @@ -424,7 +380,11 @@ pub async fn pruned_partition_list<'a>( file_extension: &'a str, partition_cols: &'a [(String, DataType)], ) -> Result>> { - // if no partition col => simply list all the files + let objects = table_path + .list_all_files(ctx, store, file_extension) + .await? + .try_filter(|object_meta| futures::future::ready(object_meta.size > 0)); + if partition_cols.is_empty() { if !filters.is_empty() { return internal_err!( @@ -432,72 +392,29 @@ pub async fn pruned_partition_list<'a>( table_path ); } - return Ok(Box::pin( - table_path - .list_all_files(ctx, store, file_extension) - .await? 
- .try_filter(|object_meta| futures::future::ready(object_meta.size > 0)) - .map_ok(|object_meta| object_meta.into()), - )); - } - - let partition_prefix = evaluate_partition_prefix(partition_cols, filters); - - let partitions = - list_partitions(store, table_path, partition_cols.len(), partition_prefix) - .await?; - debug!("Listed {} partitions", partitions.len()); - let pruned = - prune_partitions(table_path, partitions, filters, partition_cols).await?; - - debug!("Pruning yielded {} partitions", pruned.len()); - - let stream = futures::stream::iter(pruned) - .map(move |partition: Partition| async move { - let cols = partition_cols.iter().map(|x| x.0.as_str()); - let parsed = parse_partitions_for_path(table_path, &partition.path, cols); + // if no partition col => simply list all the files + Ok(objects.map_ok(|object_meta| object_meta.into()).boxed()) + } else { + let df_schema = DFSchema::from_unqualified_fields( + partition_cols + .iter() + .map(|(n, d)| Field::new(n, d.clone(), true)) + .collect(), + Default::default(), + )?; - let partition_values = parsed - .into_iter() - .flatten() - .zip(partition_cols) - .map(|(parsed, (_, datatype))| { - ScalarValue::try_from_string(parsed.to_string(), datatype) - }) - .collect::>>()?; - - let files = match partition.files { - Some(files) => files, - None => { - trace!("Recursively listing partition {}", partition.path); - store.list(Some(&partition.path)).try_collect().await? - } - }; - let files = files.into_iter().filter(move |o| { - let extension_match = o.location.as_ref().ends_with(file_extension); - // here need to scan subdirectories(`listing_table_ignore_subdirectory` = false) - let glob_match = table_path.contains(&o.location, false); - extension_match && glob_match - }); - - let stream = futures::stream::iter(files.map(move |object_meta| { - Ok(PartitionedFile { - object_meta, - partition_values: partition_values.clone(), - range: None, - statistics: None, - extensions: None, - metadata_size_hint: None, - }) - })); - - Ok::<_, DataFusionError>(stream) - }) - .buffer_unordered(CONCURRENCY_LIMIT) - .try_flatten() - .boxed(); - Ok(stream) + Ok(objects + .map_ok(|object_meta| { + try_into_partitioned_file(object_meta, partition_cols, table_path) + }) + .try_filter_map(move |pf| { + futures::future::ready( + pf.and_then(|pf| filter_partitions(pf, filters, &df_schema)), + ) + }) + .boxed()) + } } /// Extract the partition values for the given `file_path` (in the given `table_path`) @@ -541,22 +458,11 @@ pub fn describe_partition(partition: &Partition) -> (&str, usize, Vec<&str>) { #[cfg(test)] mod tests { - use async_trait::async_trait; - use datafusion_common::config::TableOptions; use datafusion_datasource::file_groups::FileGroup; - use datafusion_execution::config::SessionConfig; - use datafusion_execution::runtime_env::RuntimeEnv; - use futures::FutureExt; - use object_store::memory::InMemory; - use std::any::Any; use std::ops::Not; use super::*; - use datafusion_expr::{ - case, col, lit, AggregateUDF, Expr, LogicalPlan, ScalarUDF, WindowUDF, - }; - use datafusion_physical_expr_common::physical_expr::PhysicalExpr; - use datafusion_physical_plan::ExecutionPlan; + use datafusion_expr::{case, col, lit, Expr}; #[test] fn test_split_files() { @@ -599,209 +505,6 @@ mod tests { assert_eq!(0, chunks.len()); } - #[tokio::test] - async fn test_pruned_partition_list_empty() { - let (store, state) = make_test_store_and_state(&[ - ("tablepath/mypartition=val1/notparquetfile", 100), - ("tablepath/mypartition=val1/ignoresemptyfile.parquet", 0), - 
("tablepath/file.parquet", 100), - ("tablepath/notapartition/file.parquet", 100), - ("tablepath/notmypartition=val1/file.parquet", 100), - ]); - let filter = Expr::eq(col("mypartition"), lit("val1")); - let pruned = pruned_partition_list( - state.as_ref(), - store.as_ref(), - &ListingTableUrl::parse("file:///tablepath/").unwrap(), - &[filter], - ".parquet", - &[(String::from("mypartition"), DataType::Utf8)], - ) - .await - .expect("partition pruning failed") - .collect::>() - .await; - - assert_eq!(pruned.len(), 0); - } - - #[tokio::test] - async fn test_pruned_partition_list() { - let (store, state) = make_test_store_and_state(&[ - ("tablepath/mypartition=val1/file.parquet", 100), - ("tablepath/mypartition=val2/file.parquet", 100), - ("tablepath/mypartition=val1/ignoresemptyfile.parquet", 0), - ("tablepath/mypartition=val1/other=val3/file.parquet", 100), - ("tablepath/notapartition/file.parquet", 100), - ("tablepath/notmypartition=val1/file.parquet", 100), - ]); - let filter = Expr::eq(col("mypartition"), lit("val1")); - let pruned = pruned_partition_list( - state.as_ref(), - store.as_ref(), - &ListingTableUrl::parse("file:///tablepath/").unwrap(), - &[filter], - ".parquet", - &[(String::from("mypartition"), DataType::Utf8)], - ) - .await - .expect("partition pruning failed") - .try_collect::>() - .await - .unwrap(); - - assert_eq!(pruned.len(), 2); - let f1 = &pruned[0]; - assert_eq!( - f1.object_meta.location.as_ref(), - "tablepath/mypartition=val1/file.parquet" - ); - assert_eq!(&f1.partition_values, &[ScalarValue::from("val1")]); - let f2 = &pruned[1]; - assert_eq!( - f2.object_meta.location.as_ref(), - "tablepath/mypartition=val1/other=val3/file.parquet" - ); - assert_eq!(f2.partition_values, &[ScalarValue::from("val1"),]); - } - - #[tokio::test] - async fn test_pruned_partition_list_multi() { - let (store, state) = make_test_store_and_state(&[ - ("tablepath/part1=p1v1/file.parquet", 100), - ("tablepath/part1=p1v2/part2=p2v1/file1.parquet", 100), - ("tablepath/part1=p1v2/part2=p2v1/file2.parquet", 100), - ("tablepath/part1=p1v3/part2=p2v1/file2.parquet", 100), - ("tablepath/part1=p1v2/part2=p2v2/file2.parquet", 100), - ]); - let filter1 = Expr::eq(col("part1"), lit("p1v2")); - let filter2 = Expr::eq(col("part2"), lit("p2v1")); - let pruned = pruned_partition_list( - state.as_ref(), - store.as_ref(), - &ListingTableUrl::parse("file:///tablepath/").unwrap(), - &[filter1, filter2], - ".parquet", - &[ - (String::from("part1"), DataType::Utf8), - (String::from("part2"), DataType::Utf8), - ], - ) - .await - .expect("partition pruning failed") - .try_collect::>() - .await - .unwrap(); - - assert_eq!(pruned.len(), 2); - let f1 = &pruned[0]; - assert_eq!( - f1.object_meta.location.as_ref(), - "tablepath/part1=p1v2/part2=p2v1/file1.parquet" - ); - assert_eq!( - &f1.partition_values, - &[ScalarValue::from("p1v2"), ScalarValue::from("p2v1"),] - ); - let f2 = &pruned[1]; - assert_eq!( - f2.object_meta.location.as_ref(), - "tablepath/part1=p1v2/part2=p2v1/file2.parquet" - ); - assert_eq!( - &f2.partition_values, - &[ScalarValue::from("p1v2"), ScalarValue::from("p2v1")] - ); - } - - #[tokio::test] - async fn test_list_partition() { - let (store, _) = make_test_store_and_state(&[ - ("tablepath/part1=p1v1/file.parquet", 100), - ("tablepath/part1=p1v2/part2=p2v1/file1.parquet", 100), - ("tablepath/part1=p1v2/part2=p2v1/file2.parquet", 100), - ("tablepath/part1=p1v3/part2=p2v1/file3.parquet", 100), - ("tablepath/part1=p1v2/part2=p2v2/file4.parquet", 100), - 
("tablepath/part1=p1v2/part2=p2v2/empty.parquet", 0), - ]); - - let partitions = list_partitions( - store.as_ref(), - &ListingTableUrl::parse("file:///tablepath/").unwrap(), - 0, - None, - ) - .await - .expect("listing partitions failed"); - - assert_eq!( - &partitions - .iter() - .map(describe_partition) - .collect::>(), - &vec![ - ("tablepath", 0, vec![]), - ("tablepath/part1=p1v1", 1, vec![]), - ("tablepath/part1=p1v2", 1, vec![]), - ("tablepath/part1=p1v3", 1, vec![]), - ] - ); - - let partitions = list_partitions( - store.as_ref(), - &ListingTableUrl::parse("file:///tablepath/").unwrap(), - 1, - None, - ) - .await - .expect("listing partitions failed"); - - assert_eq!( - &partitions - .iter() - .map(describe_partition) - .collect::>(), - &vec![ - ("tablepath", 0, vec![]), - ("tablepath/part1=p1v1", 1, vec!["file.parquet"]), - ("tablepath/part1=p1v2", 1, vec![]), - ("tablepath/part1=p1v2/part2=p2v1", 2, vec![]), - ("tablepath/part1=p1v2/part2=p2v2", 2, vec![]), - ("tablepath/part1=p1v3", 1, vec![]), - ("tablepath/part1=p1v3/part2=p2v1", 2, vec![]), - ] - ); - - let partitions = list_partitions( - store.as_ref(), - &ListingTableUrl::parse("file:///tablepath/").unwrap(), - 2, - None, - ) - .await - .expect("listing partitions failed"); - - assert_eq!( - &partitions - .iter() - .map(describe_partition) - .collect::>(), - &vec![ - ("tablepath", 0, vec![]), - ("tablepath/part1=p1v1", 1, vec!["file.parquet"]), - ("tablepath/part1=p1v2", 1, vec![]), - ("tablepath/part1=p1v3", 1, vec![]), - ( - "tablepath/part1=p1v2/part2=p2v1", - 2, - vec!["file1.parquet", "file2.parquet"] - ), - ("tablepath/part1=p1v2/part2=p2v2", 2, vec!["file4.parquet"]), - ("tablepath/part1=p1v3/part2=p2v1", 2, vec!["file3.parquet"]), - ] - ); - } - #[test] fn test_parse_partitions_for_path() { assert_eq!( @@ -1016,86 +719,4 @@ mod tests { Some(Path::from("a=1970-01-05")), ); } - - pub fn make_test_store_and_state( - files: &[(&str, u64)], - ) -> (Arc, Arc) { - let memory = InMemory::new(); - - for (name, size) in files { - memory - .put(&Path::from(*name), vec![0; *size as usize].into()) - .now_or_never() - .unwrap() - .unwrap(); - } - - (Arc::new(memory), Arc::new(MockSession {})) - } - - struct MockSession {} - - #[async_trait] - impl Session for MockSession { - fn session_id(&self) -> &str { - unimplemented!() - } - - fn config(&self) -> &SessionConfig { - unimplemented!() - } - - async fn create_physical_plan( - &self, - _logical_plan: &LogicalPlan, - ) -> Result> { - unimplemented!() - } - - fn create_physical_expr( - &self, - _expr: Expr, - _df_schema: &DFSchema, - ) -> Result> { - unimplemented!() - } - - fn scalar_functions(&self) -> &std::collections::HashMap> { - unimplemented!() - } - - fn aggregate_functions( - &self, - ) -> &std::collections::HashMap> { - unimplemented!() - } - - fn window_functions(&self) -> &std::collections::HashMap> { - unimplemented!() - } - - fn runtime_env(&self) -> &Arc { - unimplemented!() - } - - fn execution_props(&self) -> &ExecutionProps { - unimplemented!() - } - - fn as_any(&self) -> &dyn Any { - unimplemented!() - } - - fn table_options(&self) -> &TableOptions { - unimplemented!() - } - - fn table_options_mut(&mut self) -> &mut TableOptions { - unimplemented!() - } - - fn task_ctx(&self) -> Arc { - unimplemented!() - } - } } diff --git a/datafusion/core/tests/catalog_listing/mod.rs b/datafusion/core/tests/catalog_listing/mod.rs new file mode 100644 index 0000000000000..cb6cac4fb0672 --- /dev/null +++ b/datafusion/core/tests/catalog_listing/mod.rs @@ -0,0 +1,18 @@ +// Licensed 
to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +mod pruned_partition_list; diff --git a/datafusion/core/tests/catalog_listing/pruned_partition_list.rs b/datafusion/core/tests/catalog_listing/pruned_partition_list.rs new file mode 100644 index 0000000000000..3cdaa3bb9b348 --- /dev/null +++ b/datafusion/core/tests/catalog_listing/pruned_partition_list.rs @@ -0,0 +1,251 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::sync::Arc; + +use arrow_schema::DataType; +use futures::{FutureExt, StreamExt as _, TryStreamExt as _}; +use object_store::{memory::InMemory, path::Path, ObjectStore as _}; + +use datafusion::execution::SessionStateBuilder; +use datafusion_catalog_listing::helpers::{ + describe_partition, list_partitions, pruned_partition_list, +}; +use datafusion_common::ScalarValue; +use datafusion_datasource::ListingTableUrl; +use datafusion_expr::{col, lit, Expr}; +use datafusion_session::Session; + +#[tokio::test] +async fn test_pruned_partition_list_empty() { + let (store, state) = make_test_store_and_state(&[ + ("tablepath/mypartition=val1/notparquetfile", 100), + ("tablepath/mypartition=val1/ignoresemptyfile.parquet", 0), + ("tablepath/file.parquet", 100), + ("tablepath/notapartition/file.parquet", 100), + ("tablepath/notmypartition=val1/file.parquet", 100), + ]); + let filter = Expr::eq(col("mypartition"), lit("val1")); + let pruned = pruned_partition_list( + state.as_ref(), + store.as_ref(), + &ListingTableUrl::parse("file:///tablepath/").unwrap(), + &[filter], + ".parquet", + &[(String::from("mypartition"), DataType::Utf8)], + ) + .await + .expect("partition pruning failed") + .collect::>() + .await; + + assert_eq!(pruned.len(), 0); +} + +#[tokio::test] +async fn test_pruned_partition_list() { + let (store, state) = make_test_store_and_state(&[ + ("tablepath/mypartition=val1/file.parquet", 100), + ("tablepath/mypartition=val2/file.parquet", 100), + ("tablepath/mypartition=val1/ignoresemptyfile.parquet", 0), + ("tablepath/mypartition=val1/other=val3/file.parquet", 100), + ("tablepath/notapartition/file.parquet", 100), + ("tablepath/notmypartition=val1/file.parquet", 100), + ]); + let filter = Expr::eq(col("mypartition"), lit("val1")); + let pruned = pruned_partition_list( + state.as_ref(), + store.as_ref(), + &ListingTableUrl::parse("file:///tablepath/").unwrap(), + &[filter], + ".parquet", + &[(String::from("mypartition"), DataType::Utf8)], + ) + .await + .expect("partition pruning failed") + .try_collect::>() + .await + .unwrap(); + + assert_eq!(pruned.len(), 2); + let f1 = &pruned[0]; + assert_eq!( + f1.object_meta.location.as_ref(), + "tablepath/mypartition=val1/file.parquet" + ); + assert_eq!(&f1.partition_values, &[ScalarValue::from("val1")]); + let f2 = &pruned[1]; + assert_eq!( + f2.object_meta.location.as_ref(), + "tablepath/mypartition=val1/other=val3/file.parquet" + ); + assert_eq!(f2.partition_values, &[ScalarValue::from("val1"),]); +} + +#[tokio::test] +async fn test_pruned_partition_list_multi() { + let (store, state) = make_test_store_and_state(&[ + ("tablepath/part1=p1v1/file.parquet", 100), + ("tablepath/part1=p1v2/part2=p2v1/file1.parquet", 100), + ("tablepath/part1=p1v2/part2=p2v1/file2.parquet", 100), + ("tablepath/part1=p1v3/part2=p2v1/file2.parquet", 100), + ("tablepath/part1=p1v2/part2=p2v2/file2.parquet", 100), + ]); + let filter1 = Expr::eq(col("part1"), lit("p1v2")); + let filter2 = Expr::eq(col("part2"), lit("p2v1")); + let pruned = pruned_partition_list( + state.as_ref(), + store.as_ref(), + &ListingTableUrl::parse("file:///tablepath/").unwrap(), + &[filter1, filter2], + ".parquet", + &[ + (String::from("part1"), DataType::Utf8), + (String::from("part2"), DataType::Utf8), + ], + ) + .await + .expect("partition pruning failed") + .try_collect::>() + .await + .unwrap(); + + assert_eq!(pruned.len(), 2); + let f1 = &pruned[0]; + assert_eq!( + f1.object_meta.location.as_ref(), + "tablepath/part1=p1v2/part2=p2v1/file1.parquet" + ); + assert_eq!( + 
&f1.partition_values, + &[ScalarValue::from("p1v2"), ScalarValue::from("p2v1"),] + ); + let f2 = &pruned[1]; + assert_eq!( + f2.object_meta.location.as_ref(), + "tablepath/part1=p1v2/part2=p2v1/file2.parquet" + ); + assert_eq!( + &f2.partition_values, + &[ScalarValue::from("p1v2"), ScalarValue::from("p2v1")] + ); +} + +#[tokio::test] +async fn test_list_partition() { + let (store, _) = make_test_store_and_state(&[ + ("tablepath/part1=p1v1/file.parquet", 100), + ("tablepath/part1=p1v2/part2=p2v1/file1.parquet", 100), + ("tablepath/part1=p1v2/part2=p2v1/file2.parquet", 100), + ("tablepath/part1=p1v3/part2=p2v1/file3.parquet", 100), + ("tablepath/part1=p1v2/part2=p2v2/file4.parquet", 100), + ("tablepath/part1=p1v2/part2=p2v2/empty.parquet", 0), + ]); + + let partitions = list_partitions( + store.as_ref(), + &ListingTableUrl::parse("file:///tablepath/").unwrap(), + 0, + None, + ) + .await + .expect("listing partitions failed"); + + assert_eq!( + &partitions + .iter() + .map(describe_partition) + .collect::>(), + &vec![ + ("tablepath", 0, vec![]), + ("tablepath/part1=p1v1", 1, vec![]), + ("tablepath/part1=p1v2", 1, vec![]), + ("tablepath/part1=p1v3", 1, vec![]), + ] + ); + + let partitions = list_partitions( + store.as_ref(), + &ListingTableUrl::parse("file:///tablepath/").unwrap(), + 1, + None, + ) + .await + .expect("listing partitions failed"); + + assert_eq!( + &partitions + .iter() + .map(describe_partition) + .collect::>(), + &vec![ + ("tablepath", 0, vec![]), + ("tablepath/part1=p1v1", 1, vec!["file.parquet"]), + ("tablepath/part1=p1v2", 1, vec![]), + ("tablepath/part1=p1v2/part2=p2v1", 2, vec![]), + ("tablepath/part1=p1v2/part2=p2v2", 2, vec![]), + ("tablepath/part1=p1v3", 1, vec![]), + ("tablepath/part1=p1v3/part2=p2v1", 2, vec![]), + ] + ); + + let partitions = list_partitions( + store.as_ref(), + &ListingTableUrl::parse("file:///tablepath/").unwrap(), + 2, + None, + ) + .await + .expect("listing partitions failed"); + + assert_eq!( + &partitions + .iter() + .map(describe_partition) + .collect::>(), + &vec![ + ("tablepath", 0, vec![]), + ("tablepath/part1=p1v1", 1, vec!["file.parquet"]), + ("tablepath/part1=p1v2", 1, vec![]), + ("tablepath/part1=p1v3", 1, vec![]), + ( + "tablepath/part1=p1v2/part2=p2v1", + 2, + vec!["file1.parquet", "file2.parquet"] + ), + ("tablepath/part1=p1v2/part2=p2v2", 2, vec!["file4.parquet"]), + ("tablepath/part1=p1v3/part2=p2v1", 2, vec!["file3.parquet"]), + ] + ); +} + +pub fn make_test_store_and_state( + files: &[(&str, u64)], +) -> (Arc, Arc) { + let memory = InMemory::new(); + + for (name, size) in files { + memory + .put(&Path::from(*name), vec![0; *size as usize].into()) + .now_or_never() + .unwrap() + .unwrap(); + } + + let state = SessionStateBuilder::new().build(); + (Arc::new(memory), Arc::new(state)) +} diff --git a/datafusion/core/tests/core_integration.rs b/datafusion/core/tests/core_integration.rs index edcf039e4e704..cc4dfcf72059f 100644 --- a/datafusion/core/tests/core_integration.rs +++ b/datafusion/core/tests/core_integration.rs @@ -57,6 +57,9 @@ mod serde; /// Run all tests that are found in the `catalog` directory mod catalog; +/// Run all tests that are found in the `catalog_listing` directory +mod catalog_listing; + /// Run all tests that are found in the `tracing` directory mod tracing; diff --git a/datafusion/core/tests/datasource/object_store_access.rs b/datafusion/core/tests/datasource/object_store_access.rs index f89ca9e049147..33129150db588 100644 --- a/datafusion/core/tests/datasource/object_store_access.rs +++ 
b/datafusion/core/tests/datasource/object_store_access.rs @@ -145,17 +145,8 @@ async fn query_partitioned_csv_file() { +---------+-------+-------+---+----+-----+ ------- Object Store Request Summary ------- RequestCountingObjectStore() - Total Requests: 13 - - LIST (with delimiter) prefix=data - - LIST (with delimiter) prefix=data/a=1 - - LIST (with delimiter) prefix=data/a=2 - - LIST (with delimiter) prefix=data/a=3 - - LIST (with delimiter) prefix=data/a=1/b=10 - - LIST (with delimiter) prefix=data/a=2/b=20 - - LIST (with delimiter) prefix=data/a=3/b=30 - - LIST (with delimiter) prefix=data/a=1/b=10/c=100 - - LIST (with delimiter) prefix=data/a=2/b=20/c=200 - - LIST (with delimiter) prefix=data/a=3/b=30/c=300 + Total Requests: 4 + - LIST prefix=data - GET (opts) path=data/a=1/b=10/c=100/file_1.csv - GET (opts) path=data/a=2/b=20/c=200/file_2.csv - GET (opts) path=data/a=3/b=30/c=300/file_3.csv @@ -174,10 +165,8 @@ async fn query_partitioned_csv_file() { +---------+-------+-------+---+----+-----+ ------- Object Store Request Summary ------- RequestCountingObjectStore() - Total Requests: 4 - - LIST (with delimiter) prefix=data/a=2 - - LIST (with delimiter) prefix=data/a=2/b=20 - - LIST (with delimiter) prefix=data/a=2/b=20/c=200 + Total Requests: 2 + - LIST prefix=data - GET (opts) path=data/a=2/b=20/c=200/file_2.csv " ); @@ -194,17 +183,8 @@ async fn query_partitioned_csv_file() { +---------+-------+-------+---+----+-----+ ------- Object Store Request Summary ------- RequestCountingObjectStore() - Total Requests: 11 - - LIST (with delimiter) prefix=data - - LIST (with delimiter) prefix=data/a=1 - - LIST (with delimiter) prefix=data/a=2 - - LIST (with delimiter) prefix=data/a=3 - - LIST (with delimiter) prefix=data/a=1/b=10 - - LIST (with delimiter) prefix=data/a=2/b=20 - - LIST (with delimiter) prefix=data/a=3/b=30 - - LIST (with delimiter) prefix=data/a=1/b=10/c=100 - - LIST (with delimiter) prefix=data/a=2/b=20/c=200 - - LIST (with delimiter) prefix=data/a=3/b=30/c=300 + Total Requests: 2 + - LIST prefix=data - GET (opts) path=data/a=2/b=20/c=200/file_2.csv " ); @@ -221,17 +201,8 @@ async fn query_partitioned_csv_file() { +---------+-------+-------+---+----+-----+ ------- Object Store Request Summary ------- RequestCountingObjectStore() - Total Requests: 11 - - LIST (with delimiter) prefix=data - - LIST (with delimiter) prefix=data/a=1 - - LIST (with delimiter) prefix=data/a=2 - - LIST (with delimiter) prefix=data/a=3 - - LIST (with delimiter) prefix=data/a=1/b=10 - - LIST (with delimiter) prefix=data/a=2/b=20 - - LIST (with delimiter) prefix=data/a=3/b=30 - - LIST (with delimiter) prefix=data/a=1/b=10/c=100 - - LIST (with delimiter) prefix=data/a=2/b=20/c=200 - - LIST (with delimiter) prefix=data/a=3/b=30/c=300 + Total Requests: 2 + - LIST prefix=data - GET (opts) path=data/a=2/b=20/c=200/file_2.csv " ); @@ -248,9 +219,8 @@ async fn query_partitioned_csv_file() { +---------+-------+-------+---+----+-----+ ------- Object Store Request Summary ------- RequestCountingObjectStore() - Total Requests: 3 - - LIST (with delimiter) prefix=data/a=2/b=20 - - LIST (with delimiter) prefix=data/a=2/b=20/c=200 + Total Requests: 2 + - LIST prefix=data - GET (opts) path=data/a=2/b=20/c=200/file_2.csv " ); @@ -267,17 +237,8 @@ async fn query_partitioned_csv_file() { +---------+-------+-------+---+----+-----+ ------- Object Store Request Summary ------- RequestCountingObjectStore() - Total Requests: 11 - - LIST (with delimiter) prefix=data - - LIST (with delimiter) prefix=data/a=1 - - LIST (with 
delimiter) prefix=data/a=2 - - LIST (with delimiter) prefix=data/a=3 - - LIST (with delimiter) prefix=data/a=1/b=10 - - LIST (with delimiter) prefix=data/a=2/b=20 - - LIST (with delimiter) prefix=data/a=3/b=30 - - LIST (with delimiter) prefix=data/a=1/b=10/c=100 - - LIST (with delimiter) prefix=data/a=2/b=20/c=200 - - LIST (with delimiter) prefix=data/a=3/b=30/c=300 + Total Requests: 2 + - LIST prefix=data - GET (opts) path=data/a=1/b=10/c=100/file_1.csv " ); From f162fd325565e14be8e4cace17d8a3a8b2764cc8 Mon Sep 17 00:00:00 2001 From: Suhail <19748270+nmbr7@users.noreply.github.com> Date: Mon, 10 Nov 2025 05:08:13 +0530 Subject: [PATCH 0045/1589] refactor: include metric output_batches into BaselineMetrics (#18491) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Which issue does this PR close? - Closes #17027 ## Rationale for this change `output_batches` should be a common metric in all operators, thus should ideally be added to `BaselineMetrics` ``` > explain analyze select * from generate_series(1, 1000000) as t1(v1) order by v1 desc; +-------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | plan_type | plan | +-------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | Plan with Metrics | SortExec: expr=[v1@0 DESC], preserve_partitioning=[false], metrics=[output_rows=1000000, elapsed_compute=535.320324ms, output_bytes=7.6 MB, output_batches=123, spill_count=0, spilled_bytes=0.0 B, spilled_rows=0, batches_split=0] | | | ProjectionExec: expr=[value@0 as v1], metrics=[output_rows=1000000, elapsed_compute=208.379µs, output_bytes=7.7 MB, output_batches=123] | | | LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=1000000, batch_size=8192], metrics=[output_rows=1000000, elapsed_compute=15.924291ms, output_bytes=7.7 MB, output_batches=123] | | | | +-------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ 1 row(s) fetched. Elapsed 0.492 second ``` ## What changes are included in this PR? - Added `output_batches` into `BaselineMetrics` with `DEV` MetricType - Tracked through `record_poll()` API - Changes are similar to https://github.com/apache/datafusion/pull/18268 - Refactored `assert_metrics` macro to take multiple metrics strings for substring check - Added `output_bytes` and `output_batches` tracking in `TopK` operator - Added `baseline` metrics for `RepartitionExec` ## Are these changes tested? Added UT ## Are there any user-facing changes? 
Changes in the `EXPLAIN ANALYZE` output, `output_batches` will be added to `metrics=[...]` --- datafusion/core/tests/sql/explain_analyze.rs | 102 ++++++++++-------- datafusion/core/tests/sql/mod.rs | 18 ++-- .../physical-plan/src/joins/cross_join.rs | 1 - .../src/joins/hash_join/stream.rs | 4 - .../src/joins/nested_loop_join.rs | 4 - .../src/joins/sort_merge_join/metrics.rs | 8 -- .../src/joins/sort_merge_join/stream.rs | 6 +- .../src/joins/stream_join_utils.rs | 6 -- .../src/joins/symmetric_hash_join.rs | 1 - datafusion/physical-plan/src/joins/utils.rs | 6 -- .../physical-plan/src/metrics/baseline.rs | 14 +++ .../physical-plan/src/metrics/builder.rs | 8 ++ datafusion/physical-plan/src/metrics/mod.rs | 1 + datafusion/physical-plan/src/metrics/value.rs | 43 +++++--- .../physical-plan/src/repartition/mod.rs | 31 ++++-- datafusion/physical-plan/src/sorts/sort.rs | 5 +- datafusion/physical-plan/src/topk/mod.rs | 6 +- datafusion/physical-plan/src/unnest.rs | 9 +- docs/source/user-guide/metrics.md | 1 + 19 files changed, 157 insertions(+), 117 deletions(-) diff --git a/datafusion/core/tests/sql/explain_analyze.rs b/datafusion/core/tests/sql/explain_analyze.rs index 26b71b5496f29..e56d4e6d8b04c 100644 --- a/datafusion/core/tests/sql/explain_analyze.rs +++ b/datafusion/core/tests/sql/explain_analyze.rs @@ -61,12 +61,9 @@ async fn explain_analyze_baseline_metrics() { assert_metrics!( &formatted, "AggregateExec: mode=Partial, gby=[]", - "metrics=[output_rows=3, elapsed_compute=" - ); - assert_metrics!( - &formatted, - "AggregateExec: mode=Partial, gby=[]", - "output_bytes=" + "metrics=[output_rows=3, elapsed_compute=", + "output_bytes=", + "output_batches=3" ); assert_metrics!( @@ -75,59 +72,76 @@ async fn explain_analyze_baseline_metrics() { "reduction_factor=5.1% (5/99)" ); - assert_metrics!( - &formatted, - "AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1]", - "metrics=[output_rows=5, elapsed_compute=" - ); - assert_metrics!( - &formatted, - "AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1]", - "output_bytes=" - ); - assert_metrics!( - &formatted, - "FilterExec: c13@1 != C2GT5KVyOPZpgKVl110TyZO0NcJ434", - "metrics=[output_rows=99, elapsed_compute=" - ); + { + let expected_batch_count_after_repartition = + if cfg!(not(feature = "force_hash_collisions")) { + "output_batches=3" + } else { + "output_batches=1" + }; + + assert_metrics!( + &formatted, + "AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1]", + "metrics=[output_rows=5, elapsed_compute=", + "output_bytes=", + expected_batch_count_after_repartition + ); + + assert_metrics!( + &formatted, + "RepartitionExec: partitioning=Hash([c1@0], 3), input_partitions=3", + "metrics=[output_rows=5, elapsed_compute=", + "output_bytes=", + expected_batch_count_after_repartition + ); + + assert_metrics!( + &formatted, + "ProjectionExec: expr=[]", + "metrics=[output_rows=5, elapsed_compute=", + "output_bytes=", + expected_batch_count_after_repartition + ); + + assert_metrics!( + &formatted, + "CoalesceBatchesExec: target_batch_size=4096", + "metrics=[output_rows=5, elapsed_compute", + "output_bytes=", + expected_batch_count_after_repartition + ); + } + assert_metrics!( &formatted, "FilterExec: c13@1 != C2GT5KVyOPZpgKVl110TyZO0NcJ434", - "output_bytes=" + "metrics=[output_rows=99, elapsed_compute=", + "output_bytes=", + "output_batches=1" ); + assert_metrics!( &formatted, "FilterExec: c13@1 != C2GT5KVyOPZpgKVl110TyZO0NcJ434", "selectivity=99% (99/100)" ); - assert_metrics!( - &formatted, - "ProjectionExec: expr=[]", - "metrics=[output_rows=5, 
elapsed_compute=" - ); - assert_metrics!(&formatted, "ProjectionExec: expr=[]", "output_bytes="); - assert_metrics!( - &formatted, - "CoalesceBatchesExec: target_batch_size=4096", - "metrics=[output_rows=5, elapsed_compute" - ); - assert_metrics!( - &formatted, - "CoalesceBatchesExec: target_batch_size=4096", - "output_bytes=" - ); + assert_metrics!( &formatted, "UnionExec", - "metrics=[output_rows=3, elapsed_compute=" + "metrics=[output_rows=3, elapsed_compute=", + "output_bytes=", + "output_batches=3" ); - assert_metrics!(&formatted, "UnionExec", "output_bytes="); + assert_metrics!( &formatted, "WindowAggExec", - "metrics=[output_rows=1, elapsed_compute=" + "metrics=[output_rows=1, elapsed_compute=", + "output_bytes=", + "output_batches=1" ); - assert_metrics!(&formatted, "WindowAggExec", "output_bytes="); fn expected_to_have_metrics(plan: &dyn ExecutionPlan) -> bool { use datafusion::physical_plan; @@ -228,9 +242,13 @@ async fn explain_analyze_level() { for (level, needle, should_contain) in [ (ExplainAnalyzeLevel::Summary, "spill_count", false), + (ExplainAnalyzeLevel::Summary, "output_batches", false), (ExplainAnalyzeLevel::Summary, "output_rows", true), + (ExplainAnalyzeLevel::Summary, "output_bytes", true), (ExplainAnalyzeLevel::Dev, "spill_count", true), (ExplainAnalyzeLevel::Dev, "output_rows", true), + (ExplainAnalyzeLevel::Dev, "output_bytes", true), + (ExplainAnalyzeLevel::Dev, "output_batches", true), ] { let plan = collect_plan(sql, level).await; assert_eq!( diff --git a/datafusion/core/tests/sql/mod.rs b/datafusion/core/tests/sql/mod.rs index 743c8750b5215..426ec213b3246 100644 --- a/datafusion/core/tests/sql/mod.rs +++ b/datafusion/core/tests/sql/mod.rs @@ -40,18 +40,24 @@ use std::io::Write; use std::path::PathBuf; use tempfile::TempDir; -/// A macro to assert that some particular line contains two substrings +/// A macro to assert that some particular line contains the given substrings /// -/// Usage: `assert_metrics!(actual, operator_name, metrics)` +/// Usage: `assert_metrics!(actual, operator_name, metrics_1, metrics_2, ...)` macro_rules! 
assert_metrics { - ($ACTUAL: expr, $OPERATOR_NAME: expr, $METRICS: expr) => { + ($ACTUAL: expr, $OPERATOR_NAME: expr, $($METRICS: expr),+) => { let found = $ACTUAL .lines() - .any(|line| line.contains($OPERATOR_NAME) && line.contains($METRICS)); + .any(|line| line.contains($OPERATOR_NAME) $( && line.contains($METRICS))+); + + let mut metrics = String::new(); + $(metrics.push_str(format!(" '{}',", $METRICS).as_str());)+ + // remove the last `,` from the string + metrics.pop(); + assert!( found, - "Can not find a line with both '{}' and '{}' in\n\n{}", - $OPERATOR_NAME, $METRICS, $ACTUAL + "Cannot find a line with operator name '{}' and metrics containing values {} in :\n\n{}", + $OPERATOR_NAME, metrics, $ACTUAL ); }; } diff --git a/datafusion/physical-plan/src/joins/cross_join.rs b/datafusion/physical-plan/src/joins/cross_join.rs index fc32bb6fc94c7..2c531786c9c2f 100644 --- a/datafusion/physical-plan/src/joins/cross_join.rs +++ b/datafusion/physical-plan/src/joins/cross_join.rs @@ -650,7 +650,6 @@ impl CrossJoinStream { self.left_index += 1; } - self.join_metrics.output_batches.add(1); return Ok(StatefulStreamResult::Ready(Some(batch))); } } diff --git a/datafusion/physical-plan/src/joins/hash_join/stream.rs b/datafusion/physical-plan/src/joins/hash_join/stream.rs index bb3465365ec96..1f4aeecb29720 100644 --- a/datafusion/physical-plan/src/joins/hash_join/stream.rs +++ b/datafusion/physical-plan/src/joins/hash_join/stream.rs @@ -494,7 +494,6 @@ impl HashJoinStream { &self.column_indices, self.join_type, )?; - self.join_metrics.output_batches.add(1); timer.done(); self.state = HashJoinStreamState::FetchProbeBatch; @@ -597,7 +596,6 @@ impl HashJoinStream { )? }; - self.join_metrics.output_batches.add(1); timer.done(); if next_offset.is_none() { @@ -653,8 +651,6 @@ impl HashJoinStream { if let Ok(ref batch) = result { self.join_metrics.input_batches.add(1); self.join_metrics.input_rows.add(batch.num_rows()); - - self.join_metrics.output_batches.add(1); } timer.done(); diff --git a/datafusion/physical-plan/src/joins/nested_loop_join.rs b/datafusion/physical-plan/src/joins/nested_loop_join.rs index 1f0cdf391c1f9..9377ace33a1bb 100644 --- a/datafusion/physical-plan/src/joins/nested_loop_join.rs +++ b/datafusion/physical-plan/src/joins/nested_loop_join.rs @@ -1483,10 +1483,6 @@ impl NestedLoopJoinStream { fn maybe_flush_ready_batch(&mut self) -> Option>>> { if self.output_buffer.has_completed_batch() { if let Some(batch) = self.output_buffer.next_completed_batch() { - // HACK: this is not part of `BaselineMetrics` yet, so update it - // manually - self.metrics.join_metrics.output_batches.add(1); - // Update output rows for selectivity metric let output_rows = batch.num_rows(); self.metrics.selectivity.add_part(output_rows); diff --git a/datafusion/physical-plan/src/joins/sort_merge_join/metrics.rs b/datafusion/physical-plan/src/joins/sort_merge_join/metrics.rs index 5920cd663a775..ac476853d5d75 100644 --- a/datafusion/physical-plan/src/joins/sort_merge_join/metrics.rs +++ b/datafusion/physical-plan/src/joins/sort_merge_join/metrics.rs @@ -31,8 +31,6 @@ pub(super) struct SortMergeJoinMetrics { input_batches: Count, /// Number of rows consumed by this operator input_rows: Count, - /// Number of batches produced by this operator - output_batches: Count, /// Execution metrics baseline_metrics: BaselineMetrics, /// Peak memory used for buffered data. 
@@ -49,8 +47,6 @@ impl SortMergeJoinMetrics { let input_batches = MetricBuilder::new(metrics).counter("input_batches", partition); let input_rows = MetricBuilder::new(metrics).counter("input_rows", partition); - let output_batches = - MetricBuilder::new(metrics).counter("output_batches", partition); let peak_mem_used = MetricBuilder::new(metrics).gauge("peak_mem_used", partition); let spill_metrics = SpillMetrics::new(metrics, partition); @@ -60,7 +56,6 @@ impl SortMergeJoinMetrics { join_time, input_batches, input_rows, - output_batches, baseline_metrics, peak_mem_used, spill_metrics, @@ -82,9 +77,6 @@ impl SortMergeJoinMetrics { pub fn input_rows(&self) -> Count { self.input_rows.clone() } - pub fn output_batches(&self) -> Count { - self.output_batches.clone() - } pub fn peak_mem_used(&self) -> Gauge { self.peak_mem_used.clone() diff --git a/datafusion/physical-plan/src/joins/sort_merge_join/stream.rs b/datafusion/physical-plan/src/joins/sort_merge_join/stream.rs index 1185866b9f46e..28020450c4277 100644 --- a/datafusion/physical-plan/src/joins/sort_merge_join/stream.rs +++ b/datafusion/physical-plan/src/joins/sort_merge_join/stream.rs @@ -35,6 +35,7 @@ use std::task::{Context, Poll}; use crate::joins::sort_merge_join::metrics::SortMergeJoinMetrics; use crate::joins::utils::{compare_join_arrays, JoinFilter}; +use crate::metrics::RecordOutput; use crate::spill::spill_manager::SpillManager; use crate::{PhysicalExpr, RecordBatchStream, SendableRecordBatchStream}; @@ -1462,10 +1463,7 @@ impl SortMergeJoinStream { fn output_record_batch_and_reset(&mut self) -> Result { let record_batch = concat_batches(&self.schema, &self.staging_output_record_batches.batches)?; - self.join_metrics.output_batches().add(1); - self.join_metrics - .baseline_metrics() - .record_output(record_batch.num_rows()); + (&record_batch).record_output(&self.join_metrics.baseline_metrics()); // If join filter exists, `self.output_size` is not accurate as we don't know the exact // number of rows in the output record batch. If streamed row joined with buffered rows, // once join filter is applied, the number of output rows may be more than 1. 
diff --git a/datafusion/physical-plan/src/joins/stream_join_utils.rs b/datafusion/physical-plan/src/joins/stream_join_utils.rs index 80221a77992ce..f4a3cd92f16da 100644 --- a/datafusion/physical-plan/src/joins/stream_join_utils.rs +++ b/datafusion/physical-plan/src/joins/stream_join_utils.rs @@ -682,8 +682,6 @@ pub struct StreamJoinMetrics { pub(crate) right: StreamJoinSideMetrics, /// Memory used by sides in bytes pub(crate) stream_memory_usage: metrics::Gauge, - /// Number of batches produced by this operator - pub(crate) output_batches: metrics::Count, /// Number of rows produced by this operator pub(crate) baseline_metrics: BaselineMetrics, } @@ -709,13 +707,9 @@ impl StreamJoinMetrics { let stream_memory_usage = MetricBuilder::new(metrics).gauge("stream_memory_usage", partition); - let output_batches = - MetricBuilder::new(metrics).counter("output_batches", partition); - Self { left, right, - output_batches, stream_memory_usage, baseline_metrics: BaselineMetrics::new(metrics, partition), } diff --git a/datafusion/physical-plan/src/joins/symmetric_hash_join.rs b/datafusion/physical-plan/src/joins/symmetric_hash_join.rs index be4646e88bd76..a9a2bbff42c6b 100644 --- a/datafusion/physical-plan/src/joins/symmetric_hash_join.rs +++ b/datafusion/physical-plan/src/joins/symmetric_hash_join.rs @@ -1376,7 +1376,6 @@ impl SymmetricHashJoinStream { } } Some((batch, _)) => { - self.metrics.output_batches.add(1); return self .metrics .baseline_metrics diff --git a/datafusion/physical-plan/src/joins/utils.rs b/datafusion/physical-plan/src/joins/utils.rs index 9b589b674cc5b..6ff8298154517 100644 --- a/datafusion/physical-plan/src/joins/utils.rs +++ b/datafusion/physical-plan/src/joins/utils.rs @@ -1327,8 +1327,6 @@ pub(crate) struct BuildProbeJoinMetrics { pub(crate) input_batches: metrics::Count, /// Number of rows consumed by probe-side this operator pub(crate) input_rows: metrics::Count, - /// Number of batches produced by this operator - pub(crate) output_batches: metrics::Count, } // This Drop implementation updates the elapsed compute part of the metrics. @@ -1372,9 +1370,6 @@ impl BuildProbeJoinMetrics { let input_rows = MetricBuilder::new(metrics).counter("input_rows", partition); - let output_batches = - MetricBuilder::new(metrics).counter("output_batches", partition); - Self { build_time, build_input_batches, @@ -1383,7 +1378,6 @@ impl BuildProbeJoinMetrics { join_time, input_batches, input_rows, - output_batches, baseline, } } diff --git a/datafusion/physical-plan/src/metrics/baseline.rs b/datafusion/physical-plan/src/metrics/baseline.rs index 858773b94664d..8dc2f30d9f799 100644 --- a/datafusion/physical-plan/src/metrics/baseline.rs +++ b/datafusion/physical-plan/src/metrics/baseline.rs @@ -63,6 +63,9 @@ pub struct BaselineMetrics { /// multiple times. 
/// Issue: output_bytes: Count, + + /// output batches: the total output batch count + output_batches: Count, // Remember to update `docs/source/user-guide/metrics.md` when updating comments // or adding new metrics } @@ -86,6 +89,9 @@ impl BaselineMetrics { output_bytes: MetricBuilder::new(metrics) .with_type(super::MetricType::SUMMARY) .output_bytes(partition), + output_batches: MetricBuilder::new(metrics) + .with_type(super::MetricType::DEV) + .output_batches(partition), } } @@ -100,6 +106,7 @@ impl BaselineMetrics { elapsed_compute: self.elapsed_compute.clone(), output_rows: Default::default(), output_bytes: Default::default(), + output_batches: Default::default(), } } @@ -113,6 +120,11 @@ impl BaselineMetrics { &self.output_rows } + /// return the metric for the total number of output batches produced + pub fn output_batches(&self) -> &Count { + &self.output_batches + } + /// Records the fact that this operator's execution is complete /// (recording the `end_time` metric). /// @@ -229,6 +241,7 @@ impl RecordOutput for RecordBatch { bm.record_output(self.num_rows()); let n_bytes = get_record_batch_memory_size(&self); bm.output_bytes.add(n_bytes); + bm.output_batches.add(1); self } } @@ -238,6 +251,7 @@ impl RecordOutput for &RecordBatch { bm.record_output(self.num_rows()); let n_bytes = get_record_batch_memory_size(self); bm.output_bytes.add(n_bytes); + bm.output_batches.add(1); self } } diff --git a/datafusion/physical-plan/src/metrics/builder.rs b/datafusion/physical-plan/src/metrics/builder.rs index 6ea947b6d21b0..91b2440122f0d 100644 --- a/datafusion/physical-plan/src/metrics/builder.rs +++ b/datafusion/physical-plan/src/metrics/builder.rs @@ -161,6 +161,14 @@ impl<'a> MetricBuilder<'a> { count } + /// Consume self and create a new counter for recording total output batches + pub fn output_batches(self, partition: usize) -> Count { + let count = Count::new(); + self.with_partition(partition) + .build(MetricValue::OutputBatches(count.clone())); + count + } + /// Consume self and create a new gauge for reporting current memory usage pub fn mem_used(self, partition: usize) -> Gauge { let gauge = Gauge::new(); diff --git a/datafusion/physical-plan/src/metrics/mod.rs b/datafusion/physical-plan/src/metrics/mod.rs index 4e98af722d4e0..613c031808cb7 100644 --- a/datafusion/physical-plan/src/metrics/mod.rs +++ b/datafusion/physical-plan/src/metrics/mod.rs @@ -299,6 +299,7 @@ impl MetricsSet { MetricValue::SpillCount(_) => false, MetricValue::SpilledBytes(_) => false, MetricValue::OutputBytes(_) => false, + MetricValue::OutputBatches(_) => false, MetricValue::SpilledRows(_) => false, MetricValue::CurrentMemoryUsage(_) => false, MetricValue::Gauge { name, .. 
} => name == metric_name, diff --git a/datafusion/physical-plan/src/metrics/value.rs b/datafusion/physical-plan/src/metrics/value.rs index 298d63e5e216a..7f31f757944d3 100644 --- a/datafusion/physical-plan/src/metrics/value.rs +++ b/datafusion/physical-plan/src/metrics/value.rs @@ -551,6 +551,8 @@ pub enum MetricValue { SpilledBytes(Count), /// Total size of output bytes produced: "output_bytes" metric OutputBytes(Count), + /// Total number of output batches produced: "output_batches" metric + OutputBatches(Count), /// Total size of spilled rows produced: "spilled_rows" metric SpilledRows(Count), /// Current memory used @@ -618,6 +620,9 @@ impl PartialEq for MetricValue { (MetricValue::OutputBytes(count), MetricValue::OutputBytes(other)) => { count == other } + (MetricValue::OutputBatches(count), MetricValue::OutputBatches(other)) => { + count == other + } (MetricValue::SpilledRows(count), MetricValue::SpilledRows(other)) => { count == other } @@ -699,6 +704,7 @@ impl MetricValue { Self::SpillCount(_) => "spill_count", Self::SpilledBytes(_) => "spilled_bytes", Self::OutputBytes(_) => "output_bytes", + Self::OutputBatches(_) => "output_batches", Self::SpilledRows(_) => "spilled_rows", Self::CurrentMemoryUsage(_) => "mem_used", Self::ElapsedCompute(_) => "elapsed_compute", @@ -721,6 +727,7 @@ impl MetricValue { Self::SpillCount(count) => count.value(), Self::SpilledBytes(bytes) => bytes.value(), Self::OutputBytes(bytes) => bytes.value(), + Self::OutputBatches(count) => count.value(), Self::SpilledRows(count) => count.value(), Self::CurrentMemoryUsage(used) => used.value(), Self::ElapsedCompute(time) => time.value(), @@ -755,6 +762,7 @@ impl MetricValue { Self::SpillCount(_) => Self::SpillCount(Count::new()), Self::SpilledBytes(_) => Self::SpilledBytes(Count::new()), Self::OutputBytes(_) => Self::OutputBytes(Count::new()), + Self::OutputBatches(_) => Self::OutputBatches(Count::new()), Self::SpilledRows(_) => Self::SpilledRows(Count::new()), Self::CurrentMemoryUsage(_) => Self::CurrentMemoryUsage(Gauge::new()), Self::ElapsedCompute(_) => Self::ElapsedCompute(Time::new()), @@ -802,6 +810,7 @@ impl MetricValue { | (Self::SpillCount(count), Self::SpillCount(other_count)) | (Self::SpilledBytes(count), Self::SpilledBytes(other_count)) | (Self::OutputBytes(count), Self::OutputBytes(other_count)) + | (Self::OutputBatches(count), Self::OutputBatches(other_count)) | (Self::SpilledRows(count), Self::SpilledRows(other_count)) | ( Self::Count { count, .. }, @@ -879,6 +888,7 @@ impl MetricValue { Self::OutputRows(_) => 0, Self::ElapsedCompute(_) => 1, Self::OutputBytes(_) => 2, + Self::OutputBatches(_) => 3, // Other metrics Self::PruningMetrics { name, .. } => match name.as_ref() { // The following metrics belong to `DataSourceExec` with a Parquet data source. @@ -888,23 +898,23 @@ impl MetricValue { // You may update these metrics as long as their relative order remains unchanged. // // Reference PR: - "files_ranges_pruned_statistics" => 3, - "row_groups_pruned_statistics" => 4, - "row_groups_pruned_bloom_filter" => 5, - "page_index_rows_pruned" => 6, - _ => 7, + "files_ranges_pruned_statistics" => 4, + "row_groups_pruned_statistics" => 5, + "row_groups_pruned_bloom_filter" => 6, + "page_index_rows_pruned" => 7, + _ => 8, }, - Self::SpillCount(_) => 8, - Self::SpilledBytes(_) => 9, - Self::SpilledRows(_) => 10, - Self::CurrentMemoryUsage(_) => 11, - Self::Count { .. } => 12, - Self::Gauge { .. } => 13, - Self::Time { .. } => 14, - Self::Ratio { .. 
} => 15, - Self::StartTimestamp(_) => 16, // show timestamps last - Self::EndTimestamp(_) => 17, - Self::Custom { .. } => 18, + Self::SpillCount(_) => 9, + Self::SpilledBytes(_) => 10, + Self::SpilledRows(_) => 11, + Self::CurrentMemoryUsage(_) => 12, + Self::Count { .. } => 13, + Self::Gauge { .. } => 14, + Self::Time { .. } => 15, + Self::Ratio { .. } => 16, + Self::StartTimestamp(_) => 17, // show timestamps last + Self::EndTimestamp(_) => 18, + Self::Custom { .. } => 19, } } @@ -919,6 +929,7 @@ impl Display for MetricValue { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { match self { Self::OutputRows(count) + | Self::OutputBatches(count) | Self::SpillCount(count) | Self::SpilledRows(count) | Self::Count { count, .. } => { diff --git a/datafusion/physical-plan/src/repartition/mod.rs b/datafusion/physical-plan/src/repartition/mod.rs index 74cf798895998..8f73fe86cfefd 100644 --- a/datafusion/physical-plan/src/repartition/mod.rs +++ b/datafusion/physical-plan/src/repartition/mod.rs @@ -915,6 +915,7 @@ impl ExecutionPlan for RepartitionExec { Arc::clone(&reservation), spill_stream, 1, // Each receiver handles one input partition + BaselineMetrics::new(&metrics, partition), )) as SendableRecordBatchStream }) .collect::>(); @@ -952,6 +953,7 @@ impl ExecutionPlan for RepartitionExec { reservation, spill_stream, num_input_partitions, + BaselineMetrics::new(&metrics, partition), )) as SendableRecordBatchStream) } }) @@ -1402,6 +1404,9 @@ struct PerPartitionStream { /// In non-preserve-order mode, multiple input partitions send to the same channel, /// each sending None when complete. We must wait for all of them. remaining_partitions: usize, + + /// Execution metrics + baseline_metrics: BaselineMetrics, } impl PerPartitionStream { @@ -1412,6 +1417,7 @@ impl PerPartitionStream { reservation: SharedMemoryReservation, spill_stream: SendableRecordBatchStream, num_input_partitions: usize, + baseline_metrics: BaselineMetrics, ) -> Self { Self { schema, @@ -1421,18 +1427,17 @@ impl PerPartitionStream { spill_stream, state: StreamState::ReadingMemory, remaining_partitions: num_input_partitions, + baseline_metrics, } } -} - -impl Stream for PerPartitionStream { - type Item = Result; - fn poll_next( - mut self: Pin<&mut Self>, + fn poll_next_inner( + self: &mut Pin<&mut Self>, cx: &mut Context<'_>, - ) -> Poll> { + ) -> Poll>> { use futures::StreamExt; + let cloned_time = self.baseline_metrics.elapsed_compute().clone(); + let _timer = cloned_time.timer(); loop { match self.state { @@ -1508,6 +1513,18 @@ impl Stream for PerPartitionStream { } } +impl Stream for PerPartitionStream { + type Item = Result; + + fn poll_next( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll> { + let poll = self.poll_next_inner(cx); + self.baseline_metrics.record_poll(poll) + } +} + impl RecordBatchStream for PerPartitionStream { /// Get the schema fn schema(&self) -> SchemaRef { diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index a95fad19f614b..2b31ff3da9f0f 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -34,7 +34,8 @@ use crate::filter_pushdown::{ }; use crate::limit::LimitStream; use crate::metrics::{ - BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet, SpillMetrics, SplitMetrics, + BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet, RecordOutput, SpillMetrics, + SplitMetrics, }; use crate::projection::{make_with_child, update_ordering, ProjectionExec}; use 
crate::sorts::streaming_merge::{SortedSpillFile, StreamingMergeBuilder}; @@ -738,7 +739,7 @@ impl ExternalSorter { let sorted = sort_batch(&batch, &expressions, None)?; - metrics.record_output(sorted.num_rows()); + (&sorted).record_output(&metrics); drop(batch); drop(reservation); Ok(sorted) diff --git a/datafusion/physical-plan/src/topk/mod.rs b/datafusion/physical-plan/src/topk/mod.rs index 9435de1cc4488..0b5ab784df67c 100644 --- a/datafusion/physical-plan/src/topk/mod.rs +++ b/datafusion/physical-plan/src/topk/mod.rs @@ -26,7 +26,9 @@ use datafusion_expr::{ColumnarValue, Operator}; use std::mem::size_of; use std::{cmp::Ordering, collections::BinaryHeap, sync::Arc}; -use super::metrics::{BaselineMetrics, Count, ExecutionPlanMetricsSet, MetricBuilder}; +use super::metrics::{ + BaselineMetrics, Count, ExecutionPlanMetricsSet, MetricBuilder, RecordOutput, +}; use crate::spill::get_record_batch_memory_size; use crate::{stream::RecordBatchStreamAdapter, SendableRecordBatchStream}; @@ -596,7 +598,7 @@ impl TopK { // break into record batches as needed let mut batches = vec![]; if let Some(mut batch) = heap.emit()? { - metrics.baseline.output_rows().add(batch.num_rows()); + (&batch).record_output(&metrics.baseline); loop { if batch.num_rows() <= batch_size { diff --git a/datafusion/physical-plan/src/unnest.rs b/datafusion/physical-plan/src/unnest.rs index 7212c764130e0..22132f2f86392 100644 --- a/datafusion/physical-plan/src/unnest.rs +++ b/datafusion/physical-plan/src/unnest.rs @@ -277,8 +277,6 @@ struct UnnestMetrics { input_batches: metrics::Count, /// Number of rows consumed input_rows: metrics::Count, - /// Number of batches produced - output_batches: metrics::Count, } impl UnnestMetrics { @@ -288,14 +286,10 @@ impl UnnestMetrics { let input_rows = MetricBuilder::new(metrics).counter("input_rows", partition); - let output_batches = - MetricBuilder::new(metrics).counter("output_batches", partition); - Self { baseline_metrics: BaselineMetrics::new(metrics, partition), input_batches, input_rows, - output_batches, } } } @@ -361,7 +355,6 @@ impl UnnestStream { let Some(result_batch) = result else { continue; }; - self.metrics.output_batches.add(1); (&result_batch).record_output(&self.metrics.baseline_metrics); // Empty record batches should not be emitted. @@ -375,7 +368,7 @@ impl UnnestStream { produced {} output batches containing {} rows in {}", self.metrics.input_batches, self.metrics.input_rows, - self.metrics.output_batches, + self.metrics.baseline_metrics.output_batches(), self.metrics.baseline_metrics.output_rows(), self.metrics.baseline_metrics.elapsed_compute(), ); diff --git a/docs/source/user-guide/metrics.md b/docs/source/user-guide/metrics.md index 1fb2f4a5c7700..43bfcd2afec2a 100644 --- a/docs/source/user-guide/metrics.md +++ b/docs/source/user-guide/metrics.md @@ -32,6 +32,7 @@ DataFusion operators expose runtime metrics so you can understand where time is | elapsed_compute | CPU time the operator actively spends processing work. | | output_rows | Total number of rows the operator produces. | | output_bytes | Memory usage of all output batches. Note: This value may be overestimated. If multiple output `RecordBatch` instances share underlying memory buffers, their sizes will be counted multiple times. | +| output_batches | Total number of output batches the operator produces. 
| ## Operator-specific Metrics From 1586cabebb3557704f4b0a59f3122fbdd9953c44 Mon Sep 17 00:00:00 2001 From: Gohlub <62673775+Gohlub@users.noreply.github.com> Date: Sun, 9 Nov 2025 20:29:04 -0500 Subject: [PATCH 0046/1589] feat: added clippy::needless_pass_by_value lint rule to datafusion/expr (#18532) ## Which issue does this PR close? - Closes #18504. ## Rationale for this change Followed suggestions to not update any public-facing APIs and put the lint rule in the appropriate spot. ## What changes are included in this PR? * Add `#![deny(clippy::needless_pass_by_value)]` and `#![cfg_attr(test, allow(clippy::needless_pass_by_value))]` to `lib.rs`. * Add `#[allow(clippy::needless_pass_by_value)]` to public functions * fix `rewrite_in_terms_of_projection()` and `get_exprs_except_skipped()` to use references per the lint suggestion ## Are these changes tested? Yes, though the same test failed even without changes to the public APIs: `test expr_rewriter::order_by::test::rewrite_sort_cols_by_agg_alias ... FAILED` I'll append the logs for your convenience: ``` failures: ---- expr_rewriter::order_by::test::rewrite_sort_cols_by_agg_alias stdout ---- running: 'c1 --> c1 -- column *named* c1 that came out of the projection, (not t.c1)' running: 'min(c2) --> "min(c2)" -- (column *named* "min(t.c2)"!)' thread 'expr_rewriter::order_by::test::rewrite_sort_cols_by_agg_alias' (27524241) panicked at datafusion/expr/src/expr_rewriter/order_by.rs:308:13: assertion `left == right` failed: input:Sort { expr: AggregateFunction(AggregateFunction { func: AggregateUDF { inner: Min { name: "min", signature: Signature { type_signature: VariadicAny, volatility: Immutable, parameter_names: None } } }, params: AggregateFunctionParams { args: [Column(Column { relation: None, name: "c2" })], distinct: false, filter: None, order_by: [], null_treatment: None } }), asc: true, nulls_first: true } rewritten:Sort { expr: Column(Column { relation: None, name: "min(t.c2)" }), asc: true, nulls_first: true } expected:Sort { expr: Column(Column { relation: Some(Bare { table: "min(t" }), name: "c2)" }), asc: true, nulls_first: true } left: Sort { expr: Column(Column { relation: None, name: "min(t.c2)" }), asc: true, nulls_first: true } right: Sort { expr: Column(Column { relation: Some(Bare { table: "min(t" }), name: "c2)" }), asc: true, nulls_first: true } note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace failures: expr_rewriter::order_by::test::rewrite_sort_cols_by_agg_alias ``` ## Are there any user-facing changes? No, all modification were constrained to internal APIs. 
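
For context, here is a minimal, hypothetical sketch of the pattern this lint targets (the function names below are illustrative and not from the DataFusion codebase). It mirrors the `rewrite_in_terms_of_projection` and `get_exprs_except_skipped` signature changes in this patch, which switch owned parameters to borrowed slices or references so callers keep ownership and avoid needless clones:

```rust
// Hypothetical example (not part of this patch). With the lint enabled,
// e.g. via `#![deny(clippy::needless_pass_by_value)]`, taking ownership of
// a Vec that is only read is flagged, because it forces callers to move or
// clone their data.
fn sum_lengths_owned(items: Vec<String>) -> usize {
    items.iter().map(|s| s.len()).sum()
}

// The form the lint suggests: borrow a slice instead, so callers retain
// ownership and no clone is needed at the call site.
fn sum_lengths(items: &[String]) -> usize {
    items.iter().map(|s| s.len()).sum()
}

fn main() {
    let names = vec!["a".to_string(), "bb".to_string()];
    // `names` is still usable after the borrowing call.
    assert_eq!(sum_lengths(&names), 3);
    assert_eq!(sum_lengths_owned(names), 3);
}
```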
--------- Co-authored-by: Yongting You <2010youy01@gmail.com> --- datafusion/expr/src/execution_props.rs | 1 + datafusion/expr/src/expr_rewriter/order_by.rs | 6 +++--- datafusion/expr/src/lib.rs | 3 +++ datafusion/expr/src/literal.rs | 3 +++ datafusion/expr/src/logical_plan/plan.rs | 1 + datafusion/expr/src/utils.rs | 7 ++++--- 6 files changed, 15 insertions(+), 6 deletions(-) diff --git a/datafusion/expr/src/execution_props.rs b/datafusion/expr/src/execution_props.rs index d8a8c6bb49e19..fe20ed9331cbb 100644 --- a/datafusion/expr/src/execution_props.rs +++ b/datafusion/expr/src/execution_props.rs @@ -102,6 +102,7 @@ impl ExecutionProps { } /// Returns the provider for the `var_type`, if any + #[allow(clippy::needless_pass_by_value)] pub fn get_var_provider( &self, var_type: VarType, diff --git a/datafusion/expr/src/expr_rewriter/order_by.rs b/datafusion/expr/src/expr_rewriter/order_by.rs index 6db95555502da..c21c6e6222a05 100644 --- a/datafusion/expr/src/expr_rewriter/order_by.rs +++ b/datafusion/expr/src/expr_rewriter/order_by.rs @@ -52,7 +52,7 @@ fn rewrite_sort_col_by_aggs(expr: Expr, plan: &LogicalPlan) -> Result { // on top of them) if plan_inputs.len() == 1 { let proj_exprs = plan.expressions(); - rewrite_in_terms_of_projection(expr, proj_exprs, plan_inputs[0]) + rewrite_in_terms_of_projection(expr, &proj_exprs, plan_inputs[0]) } else { Ok(expr) } @@ -71,7 +71,7 @@ fn rewrite_sort_col_by_aggs(expr: Expr, plan: &LogicalPlan) -> Result { /// 2. t produces an output schema with two columns "a", "b + c" fn rewrite_in_terms_of_projection( expr: Expr, - proj_exprs: Vec, + proj_exprs: &[Expr], input: &LogicalPlan, ) -> Result { // assumption is that each item in exprs, such as "b + c" is @@ -104,7 +104,7 @@ fn rewrite_in_terms_of_projection( // look for the column named the same as this expr let mut found = None; - for proj_expr in &proj_exprs { + for proj_expr in proj_exprs { proj_expr.apply(|e| { if expr_match(&search_col, e) { found = Some(e.clone()); diff --git a/datafusion/expr/src/lib.rs b/datafusion/expr/src/lib.rs index 2b7cc9d46ad34..885e582ea6d43 100644 --- a/datafusion/expr/src/lib.rs +++ b/datafusion/expr/src/lib.rs @@ -23,6 +23,9 @@ // Make sure fast / cheap clones on Arc are explicit: // https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] +// https://github.com/apache/datafusion/issues/18503 +#![deny(clippy::needless_pass_by_value)] +#![cfg_attr(test, allow(clippy::needless_pass_by_value))] //! [DataFusion](https://github.com/apache/datafusion) //! 
is an extensible query execution framework that uses diff --git a/datafusion/expr/src/literal.rs b/datafusion/expr/src/literal.rs index 335d7b471f5fe..c7345a455a760 100644 --- a/datafusion/expr/src/literal.rs +++ b/datafusion/expr/src/literal.rs @@ -21,10 +21,12 @@ use crate::Expr; use datafusion_common::{metadata::FieldMetadata, ScalarValue}; /// Create a literal expression +#[allow(clippy::needless_pass_by_value)] pub fn lit(n: T) -> Expr { n.lit() } +#[allow(clippy::needless_pass_by_value)] pub fn lit_with_metadata(n: T, metadata: Option) -> Expr { let Some(metadata) = metadata else { return n.lit(); @@ -45,6 +47,7 @@ pub fn lit_with_metadata(n: T, metadata: Option) -> E } /// Create a literal timestamp expression +#[allow(clippy::needless_pass_by_value)] pub fn lit_timestamp_nano(n: T) -> Expr { n.lit_timestamp_nano() } diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 0b89a5250902e..892ab135d6dc4 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -3481,6 +3481,7 @@ impl Aggregate { /// /// This method should only be called when you are absolutely sure that the schema being /// provided is correct for the aggregate. If in doubt, call [try_new](Self::try_new) instead. + #[allow(clippy::needless_pass_by_value)] pub fn try_new_with_schema( input: Arc, group_expr: Vec, diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index cd733e0a130a9..b4e763cdf497b 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -354,7 +354,7 @@ fn get_excluded_columns( /// Returns all `Expr`s in the schema, except the `Column`s in the `columns_to_skip` fn get_exprs_except_skipped( schema: &DFSchema, - columns_to_skip: HashSet, + columns_to_skip: &HashSet, ) -> Vec { if columns_to_skip.is_empty() { schema.iter().map(Expr::from).collect::>() @@ -419,7 +419,7 @@ pub fn expand_wildcard( }; // Add each excluded `Column` to columns_to_skip columns_to_skip.extend(excluded_columns); - Ok(get_exprs_except_skipped(schema, columns_to_skip)) + Ok(get_exprs_except_skipped(schema, &columns_to_skip)) } /// Resolves an `Expr::Wildcard` to a collection of qualified `Expr::Column`'s. @@ -464,7 +464,7 @@ pub fn expand_qualified_wildcard( columns_to_skip.extend(excluded_columns); Ok(get_exprs_except_skipped( &qualified_dfschema, - columns_to_skip, + &columns_to_skip, )) } @@ -928,6 +928,7 @@ pub fn find_valid_equijoin_key_pair( /// round(Float64) /// round(Float32) /// ``` +#[allow(clippy::needless_pass_by_value)] pub fn generate_signature_error_msg( func_name: &str, func_signature: Signature, From d8845a61616dd5f99f0230e51cc395e5849a9dc1 Mon Sep 17 00:00:00 2001 From: Chen Chongchen Date: Tue, 11 Nov 2025 00:05:07 +0800 Subject: [PATCH 0047/1589] feat: support nested key for get_field (#18394) ## Which issue does this PR close? ## Rationale for this change get_field doesn't support nested key ## What changes are included in this PR? support nested key ## Are these changes tested? UT ## Are there any user-facing changes? 
No --------- Co-authored-by: Andrew Lamb --- datafusion/functions/src/core/getfield.rs | 42 +++++++++++++++++++++- datafusion/sqllogictest/test_files/map.slt | 18 +++++++--- 2 files changed, 54 insertions(+), 6 deletions(-) diff --git a/datafusion/functions/src/core/getfield.rs b/datafusion/functions/src/core/getfield.rs index d18bd6e31f72e..3be7dd67981db 100644 --- a/datafusion/functions/src/core/getfield.rs +++ b/datafusion/functions/src/core/getfield.rs @@ -245,6 +245,46 @@ impl ScalarUDFImpl for GetFieldFunc { Ok(ColumnarValue::Array(data)) } + fn process_map_with_nested_key( + array: Arc, + key_array: Arc, + ) -> Result { + let map_array = as_map_array(array.as_ref())?; + + let comparator = make_comparator( + map_array.keys().as_ref(), + key_array.as_ref(), + SortOptions::default(), + )?; + + let original_data = map_array.entries().column(1).to_data(); + let capacity = Capacities::Array(original_data.len()); + let mut mutable = + MutableArrayData::with_capacities(vec![&original_data], true, capacity); + + for entry in 0..map_array.len() { + let start = map_array.value_offsets()[entry] as usize; + let end = map_array.value_offsets()[entry + 1] as usize; + + let mut found_match = false; + for i in start..end { + if comparator(i, 0).is_eq() { + mutable.extend(0, i, i + 1); + found_match = true; + break; + } + } + + if !found_match { + mutable.extend_nulls(1); + } + } + + let data = mutable.freeze(); + let data = make_array(data); + Ok(ColumnarValue::Array(data)) + } + match (array.data_type(), name) { (DataType::Map(_, _), ScalarValue::List(arr)) => { let key_array: Arc = arr; @@ -256,7 +296,7 @@ impl ScalarUDFImpl for GetFieldFunc { (DataType::Map(_, _), other) => { let data_type = other.data_type(); if data_type.is_nested() { - exec_err!("unsupported type {} for map access", data_type) + process_map_with_nested_key(array, other.to_array()?) } else { process_map_array(array, other.to_array()?) } diff --git a/datafusion/sqllogictest/test_files/map.slt b/datafusion/sqllogictest/test_files/map.slt index a3234b4e7ee52..45f8c5d25fbeb 100644 --- a/datafusion/sqllogictest/test_files/map.slt +++ b/datafusion/sqllogictest/test_files/map.slt @@ -544,11 +544,19 @@ SELECT (CASE WHEN 1 > 0 THEN MAP {'x': 100} ELSE MAP {'y': 200} END)['x']; ---- 100 -# TODO(https://github.com/apache/datafusion/issues/11785): fix accessing map with non-string key -# query ? -# SELECT MAP { MAP {1:'a', 2:'b'}:1, MAP {1:'c', 2:'d'}:2 }[MAP {1:'a', 2:'b'}]; -# ---- -# 1 +# fix accessing map with nested key +query I +SELECT MAP { MAP {1:'a', 2:'b'}:1, MAP {1:'c', 2:'d'}:2 }[MAP {1:'a', 2:'b'}]; +---- +1 + +query I +SELECT MAP { MAP {1:'a', 2:'b'}:1, MAP {1:'c', 2:'d'}:2 }[MAP {2:'b', 1:'a'}]; +---- +NULL + +# TODO(https://github.com/apache/datafusion/pull/18394): Test accessing map with empty map as key +# TODO(https://github.com/apache/datafusion/pull/18394): Test accessing map with null map as key # accessing map with non-string key query I From 900ee658d6435b392658f64468a730edd8eaa416 Mon Sep 17 00:00:00 2001 From: Cora Sutton Date: Mon, 10 Nov 2025 10:46:04 -0600 Subject: [PATCH 0048/1589] Support Arrow IPC Stream Files (#18457) ## Which issue does this PR close? - Closes #16688. ## Rationale for this change Currently Datafusion can only read Arrow files if the're in the File format, not the Stream format. I work with a bunch of Stream format files and wanted native support. ## What changes are included in this PR? 
To accomplish the above, this PR splits the Arrow datasource into two separate implementations (`ArrowStream*` and `ArrowFile*`) with a facade on top to differentiate between the formats at query planning time. ## Are these changes tested? Yes, there are end-to-end sqllogictests along with tests for the changes within datasource-arrow. ## Are there any user-facing changes? Technically yes, in that we support a new format now. I'm not sure which documentation would need to be updated? --------- Co-authored-by: Martin Grigorov --- .../part=123/data.arrow | Bin 0 -> 1608 bytes .../part=456/data.arrow | Bin 0 -> 1608 bytes datafusion/core/tests/execution/mod.rs | 1 + .../core/tests/execution/register_arrow.rs | 90 +++ .../schema_adapter_integration_tests.rs | 4 +- .../datasource-arrow/src/file_format.rs | 429 +++++++++---- datafusion/datasource-arrow/src/mod.rs | 2 + datafusion/datasource-arrow/src/source.rs | 591 ++++++++++++++++-- .../tests/data/example_stream.arrow | Bin 0 -> 1480 bytes ...ple_stream_corrupted_metadata_length.arrow | Bin 0 -> 1480 bytes .../tests/data/example_stream_empty.arrow | Bin 0 -> 776 bytes .../sqllogictest/test_files/arrow_files.slt | 260 ++++++++ 12 files changed, 1206 insertions(+), 171 deletions(-) create mode 100644 datafusion/core/tests/data/partitioned_table_arrow_stream/part=123/data.arrow create mode 100644 datafusion/core/tests/data/partitioned_table_arrow_stream/part=456/data.arrow create mode 100644 datafusion/core/tests/execution/register_arrow.rs create mode 100644 datafusion/datasource-arrow/tests/data/example_stream.arrow create mode 100644 datafusion/datasource-arrow/tests/data/example_stream_corrupted_metadata_length.arrow create mode 100644 datafusion/datasource-arrow/tests/data/example_stream_empty.arrow diff --git a/datafusion/core/tests/data/partitioned_table_arrow_stream/part=123/data.arrow b/datafusion/core/tests/data/partitioned_table_arrow_stream/part=123/data.arrow new file mode 100644 index 0000000000000000000000000000000000000000..bad9e3de4a57fc4c0bd169f0275e9ca8b1b9d656 GIT binary patch literal 1608 zcmbtT&2G~`5T3T7L=Y7Ym530BXwMxS4Nu*H!FR0@95pJM+!XuV?DIuFsE%9ul8>bV38VPd#u%rxv3FmImPa9`A$y zen@nH+u;MNW1Q(5bR}pbQXQ-F*}`P2r7p#*(ff_~3=Etoq&`z(JQ7+i6#%`;GDu}5 ziwa_3d6R>UEUTCew;v!le>3Y`dADsZNvg7V*2Z$FV_D9sb$Nm!w-oy&3neR4*am0mypYA3 z`x(MN)M!S-=aXO(2=qYsz&pxl5=}y}rDM#W(-L8<=_6VbY>)_JCR zW--0T?k4&L9OFt!MD!E<5WIhL$L$5%^N+~B3wd8oK_~RUqJ6tRY&QbI_u&)#_u-|7 zZQ`Anf^c|coR5GxB801dj7l+U`yZ8n;+j3sb;ktKa z#{VYso4fqyPZM_P<)#l89sQ-3n_jvK$$ke`O}(SW-$h5Rz5Tvd13A0wt{MrudWr@ z^oyF6T-idEGM1%;`C2Fq^F&{;?}ztqpT8EHono3*hiWLn5# z#U0L9vM93e9POJbmPw`=4PTCeQ6SI-;ZpA?qfs;p#Rl_t3bJ}4j5d7+4M82yFU&j7 zHP0<3_|)D+e}H3TX&w>%2HywoZ0_0JV1ND*8M}~o<{WfJj||#1JH&ni5Mm!c!+#f1 zI@l=Qi!lg?*T%*Oh>)uO3z>8D=qdH-3GCU+#vuf-D`FPuwTs!|mFMcZ@{w{|cU)LeN d-iK@Ob$INy_f-D4t?6F7yVieKf1Y11>>p)(?fC!z literal 0 HcmV?d00001 diff --git a/datafusion/core/tests/execution/mod.rs b/datafusion/core/tests/execution/mod.rs index 8770b2a201051..f33ef87aa3023 100644 --- a/datafusion/core/tests/execution/mod.rs +++ b/datafusion/core/tests/execution/mod.rs @@ -18,3 +18,4 @@ mod coop; mod datasource_split; mod logical_plan; +mod register_arrow; diff --git a/datafusion/core/tests/execution/register_arrow.rs b/datafusion/core/tests/execution/register_arrow.rs new file mode 100644 index 0000000000000..4ce16dc0906c1 --- /dev/null +++ b/datafusion/core/tests/execution/register_arrow.rs @@ -0,0 +1,90 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Integration tests for register_arrow API + +use datafusion::{execution::options::ArrowReadOptions, prelude::*}; +use datafusion_common::Result; + +#[tokio::test] +async fn test_register_arrow_auto_detects_format() -> Result<()> { + let ctx = SessionContext::new(); + + ctx.register_arrow( + "file_format", + "../../datafusion/datasource-arrow/tests/data/example.arrow", + ArrowReadOptions::default(), + ) + .await?; + + ctx.register_arrow( + "stream_format", + "../../datafusion/datasource-arrow/tests/data/example_stream.arrow", + ArrowReadOptions::default(), + ) + .await?; + + let file_result = ctx.sql("SELECT * FROM file_format ORDER BY f0").await?; + let stream_result = ctx.sql("SELECT * FROM stream_format ORDER BY f0").await?; + + let file_batches = file_result.collect().await?; + let stream_batches = stream_result.collect().await?; + + assert_eq!(file_batches.len(), stream_batches.len()); + assert_eq!(file_batches[0].schema(), stream_batches[0].schema()); + + let file_rows: usize = file_batches.iter().map(|b| b.num_rows()).sum(); + let stream_rows: usize = stream_batches.iter().map(|b| b.num_rows()).sum(); + assert_eq!(file_rows, stream_rows); + + Ok(()) +} + +#[tokio::test] +async fn test_register_arrow_join_file_and_stream() -> Result<()> { + let ctx = SessionContext::new(); + + ctx.register_arrow( + "file_table", + "../../datafusion/datasource-arrow/tests/data/example.arrow", + ArrowReadOptions::default(), + ) + .await?; + + ctx.register_arrow( + "stream_table", + "../../datafusion/datasource-arrow/tests/data/example_stream.arrow", + ArrowReadOptions::default(), + ) + .await?; + + let result = ctx + .sql( + "SELECT a.f0, a.f1, b.f0, b.f1 + FROM file_table a + JOIN stream_table b ON a.f0 = b.f0 + WHERE a.f0 <= 2 + ORDER BY a.f0", + ) + .await?; + let batches = result.collect().await?; + + let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); + assert_eq!(total_rows, 2); + + Ok(()) +} diff --git a/datafusion/core/tests/schema_adapter/schema_adapter_integration_tests.rs b/datafusion/core/tests/schema_adapter/schema_adapter_integration_tests.rs index 0b093485c1ce1..1915298164819 100644 --- a/datafusion/core/tests/schema_adapter/schema_adapter_integration_tests.rs +++ b/datafusion/core/tests/schema_adapter/schema_adapter_integration_tests.rs @@ -284,12 +284,12 @@ async fn test_multi_source_schema_adapter_reuse() -> Result<()> { // Create a test factory let factory = Arc::new(UppercaseAdapterFactory {}); - // Test ArrowSource + // Test ArrowFileSource { let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); let table_schema = TableSchema::new(schema, vec![]); - let source = ArrowSource::new(table_schema); + let source = ArrowSource::new_file_source(table_schema); let source_with_adapter = source .clone() .with_schema_adapter_factory(factory.clone()) 
diff --git a/datafusion/datasource-arrow/src/file_format.rs b/datafusion/datasource-arrow/src/file_format.rs index dc1f5cf72da7f..ef478e2688909 100644 --- a/datafusion/datasource-arrow/src/file_format.rs +++ b/datafusion/datasource-arrow/src/file_format.rs @@ -20,15 +20,15 @@ //! Works with files following the [Arrow IPC format](https://arrow.apache.org/docs/format/Columnar.html#ipc-file-format) use std::any::Any; -use std::borrow::Cow; use std::collections::HashMap; use std::fmt::{self, Debug}; +use std::io::{Seek, SeekFrom}; use std::sync::Arc; use arrow::datatypes::{Schema, SchemaRef}; use arrow::error::ArrowError; use arrow::ipc::convert::fb_to_schema; -use arrow::ipc::reader::FileReader; +use arrow::ipc::reader::{FileReader, StreamReader}; use arrow::ipc::writer::IpcWriteOptions; use arrow::ipc::{root_as_message, CompressionType}; use datafusion_common::error::Result; @@ -62,7 +62,9 @@ use datafusion_physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan}; use datafusion_session::Session; use futures::stream::BoxStream; use futures::StreamExt; -use object_store::{GetResultPayload, ObjectMeta, ObjectStore}; +use object_store::{ + path::Path, GetOptions, GetRange, GetResultPayload, ObjectMeta, ObjectStore, +}; use tokio::io::AsyncWriteExt; /// Initial writing buffer size. Note this is just a size hint for efficiency. It @@ -72,8 +74,8 @@ const INITIAL_BUFFER_BYTES: usize = 1048576; /// If the buffered Arrow data exceeds this size, it is flushed to object store const BUFFER_FLUSH_BYTES: usize = 1024000; +/// Factory struct used to create [`ArrowFormat`] #[derive(Default, Debug)] -/// Factory struct used to create [ArrowFormat] pub struct ArrowFormatFactory; impl ArrowFormatFactory { @@ -108,7 +110,7 @@ impl GetExt for ArrowFormatFactory { } } -/// Arrow `FileFormat` implementation. +/// Arrow [`FileFormat`] implementation. #[derive(Default, Debug)] pub struct ArrowFormat; @@ -151,12 +153,23 @@ impl FileFormat for ArrowFormat { let schema = match r.payload { #[cfg(not(target_arch = "wasm32"))] GetResultPayload::File(mut file, _) => { - let reader = FileReader::try_new(&mut file, None)?; - reader.schema() - } - GetResultPayload::Stream(stream) => { - infer_schema_from_file_stream(stream).await? + match FileReader::try_new(&mut file, None) { + Ok(reader) => reader.schema(), + Err(file_error) => { + // not in the file format, but FileReader read some bytes + // while trying to parse the file and so we need to rewind + // it to the beginning of the file + file.seek(SeekFrom::Start(0))?; + match StreamReader::try_new(&mut file, None) { + Ok(reader) => reader.schema(), + Err(stream_error) => { + return Err(internal_datafusion_err!("Failed to parse Arrow file as either file format or stream format. File format error: {file_error}. Stream format error: {stream_error}")); + } + } + } + } } + GetResultPayload::Stream(stream) => infer_stream_schema(stream).await?, }; schemas.push(schema.as_ref().clone()); } @@ -176,14 +189,33 @@ impl FileFormat for ArrowFormat { async fn create_physical_plan( &self, - _state: &dyn Session, + state: &dyn Session, conf: FileScanConfig, ) -> Result> { + let object_store = state.runtime_env().object_store(&conf.object_store_url)?; + let object_location = &conf + .file_groups + .first() + .ok_or_else(|| internal_datafusion_err!("No files found in file group"))? + .files() + .first() + .ok_or_else(|| internal_datafusion_err!("No files found in file group"))? 
+ .object_meta + .location; + let table_schema = TableSchema::new( Arc::clone(conf.file_schema()), conf.table_partition_cols().clone(), ); - let source = Arc::new(ArrowSource::new(table_schema)); + + let source: Arc = + match is_object_in_arrow_ipc_file_format(object_store, object_location).await + { + Ok(true) => Arc::new(ArrowSource::new_file_source(table_schema)), + Ok(false) => Arc::new(ArrowSource::new_stream_file_source(table_schema)), + Err(e) => Err(e)?, + }; + let config = FileScanConfigBuilder::from(conf) .with_source(source) .build(); @@ -208,11 +240,11 @@ impl FileFormat for ArrowFormat { } fn file_source(&self, table_schema: TableSchema) -> Arc { - Arc::new(ArrowSource::new(table_schema)) + Arc::new(ArrowSource::new_file_source(table_schema)) } } -/// Implements [`FileSink`] for writing to arrow_ipc files +/// Implements [`FileSink`] for Arrow IPC files struct ArrowFileSink { config: FileSinkConfig, } @@ -349,94 +381,160 @@ impl DataSink for ArrowFileSink { } } +// Custom implementation of inferring schema. Should eventually be moved upstream to arrow-rs. +// See + const ARROW_MAGIC: [u8; 6] = [b'A', b'R', b'R', b'O', b'W', b'1']; const CONTINUATION_MARKER: [u8; 4] = [0xff; 4]; -/// Custom implementation of inferring schema. Should eventually be moved upstream to arrow-rs. -/// See -async fn infer_schema_from_file_stream( +async fn infer_stream_schema( mut stream: BoxStream<'static, object_store::Result>, ) -> Result { - // Expected format: - // - 6 bytes - // - 2 bytes - // - 4 bytes, not present below v0.15.0 - // - 4 bytes - // - // - - // So in first read we need at least all known sized sections, - // which is 6 + 2 + 4 + 4 = 16 bytes. - let bytes = collect_at_least_n_bytes(&mut stream, 16, None).await?; - - // Files should start with these magic bytes - if bytes[0..6] != ARROW_MAGIC { - return Err(ArrowError::ParseError( - "Arrow file does not contain correct header".to_string(), - ))?; - } - - // Since continuation marker bytes added in later versions - let (meta_len, rest_of_bytes_start_index) = if bytes[8..12] == CONTINUATION_MARKER { - (&bytes[12..16], 16) + // IPC streaming format. + // See https://arrow.apache.org/docs/format/Columnar.html#ipc-streaming-format + // + // + // + // ... + // + // + // ... + // + // ... + // + // ... + // + // + + // The streaming format is made up of a sequence of encapsulated messages. + // See https://arrow.apache.org/docs/format/Columnar.html#encapsulated-message-format + // + // (added in v0.15.0) + // + // + // + // + // + // The first message is the schema. + + // IPC file format is a wrapper around the streaming format with indexing information. + // See https://arrow.apache.org/docs/format/Columnar.html#ipc-file-format + // + // + // + // + //