From 8330375d9a74964493d991a93f2f17ca5a96efab Mon Sep 17 00:00:00 2001 From: Duong Cong Toai Date: Wed, 8 May 2024 22:50:26 +0200 Subject: [PATCH 01/21] feat: extend unnest for struct --- datafusion/common/src/unnest.rs | 8 -- datafusion/core/src/physical_planner.rs | 16 ++- datafusion/expr/src/logical_plan/builder.rs | 59 ++++++---- datafusion/expr/src/logical_plan/plan.rs | 21 +++- datafusion/expr/src/logical_plan/tree_node.rs | 1 + datafusion/physical-plan/src/lib.rs | 1 + datafusion/physical-plan/src/unnest.rs | 103 ++++++++++++++---- 7 files changed, 154 insertions(+), 55 deletions(-) diff --git a/datafusion/common/src/unnest.rs b/datafusion/common/src/unnest.rs index fd92267f9b4c..cf0a9b2b4638 100644 --- a/datafusion/common/src/unnest.rs +++ b/datafusion/common/src/unnest.rs @@ -66,14 +66,6 @@ pub struct UnnestOptions { pub preserve_nulls: bool, } -impl Default for UnnestOptions { - fn default() -> Self { - Self { - // default to true to maintain backwards compatible behavior - preserve_nulls: true, - } - } -} impl UnnestOptions { /// Create a new [`UnnestOptions`] with default values diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index a041ab31f7cf..00eda21d6bbb 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -1156,13 +1156,22 @@ impl DefaultPhysicalPlanner { Arc::new(GlobalLimitExec::new(input, *skip, *fetch)) } LogicalPlan::Unnest(Unnest { - columns, + list_type_columns, + struct_type_columns, schema, options, .. 
}) => { let input = children.one()?; - let column_execs = columns + let list_column_execs = list_type_columns + .iter() + .map(|column| { + schema + .index_of_column(column) + .map(|idx| Column::new(&column.name, idx)) + }) + .collect::>()?; + let struct_column_execs = struct_type_columns .iter() .map(|column| { schema @@ -1173,7 +1182,8 @@ impl DefaultPhysicalPlanner { let schema = SchemaRef::new(schema.as_ref().to_owned().into()); Arc::new(UnnestExec::new( input, - column_execs, + list_type_columns, + struct_column_execs, schema, options.clone(), )) diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 43873cb90cda..13e57cd860bd 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -1545,7 +1545,7 @@ impl TableSource for LogicalTableSource { /// Create a [`LogicalPlan::Unnest`] plan pub fn unnest(input: LogicalPlan, columns: Vec) -> Result { - unnest_with_options(input, columns, UnnestOptions::new()) + unnest_with_options(input, columns, UnnestOptions::default()) } /// Create a [`LogicalPlan::Unnest`] plan with options @@ -1555,39 +1555,59 @@ pub fn unnest_with_options( options: UnnestOptions, ) -> Result { // Extract the type of the nested field in the list. - let mut unnested_fields: HashMap = HashMap::with_capacity(columns.len()); + let mut unnested_fields_map: HashMap = + HashMap::with_capacity(columns.len()); // Add qualifiers to the columns. 
- let mut qualified_columns = Vec::with_capacity(columns.len()); + let mut qualified_list_columns = Vec::with_capacity(columns.len()); + let mut qualified_struct_columns = Vec::with_capacity(columns.len()); for c in &columns { let index = input.schema().index_of_column(c)?; let (unnest_qualifier, unnest_field) = input.schema().qualified_field(index); - let unnested_field = match unnest_field.data_type() { + match unnest_field.data_type() { DataType::List(field) | DataType::FixedSizeList(field, _) - | DataType::LargeList(field) => Arc::new(Field::new( - unnest_field.name(), - field.data_type().clone(), - // Unnesting may produce NULLs even if the list is not null. - // For example: unnset([1], []) -> 1, null - true, - )), + | DataType::LargeList(field) => { + let unnest_field = Arc::new(Field::new( + unnest_field.name(), + field.data_type().clone(), + // Unnesting may produce NULLs even if the list is not null. + // For example: unnset([1], []) -> 1, null + true, + )); + unnested_fields_map.insert(index, unnested_fields); + qualified_columns.extend( + unnested_fields + .iter() + .map(|f| Column::from((unnest_qualifier, f))), + ); + } + DataType::Struct(fields) => { + let unnested_fields = fields.tovec(); + unnested_fields_map.insert(index, unnested_fields); + qualified_columns.extend( + unnested_fields + .iter() + .map(|f| Column::from((unnest_qualifier, f))), + ); + }, _ => { // If the unnest field is not a list type return the input plan. return Ok(input); } - }; - qualified_columns.push(Column::from((unnest_qualifier, &unnested_field))); - unnested_fields.insert(index, unnested_field); + } } - // Update the schema with the unnest column types changed to contain the nested types. 
let input_schema = input.schema(); + let fields = input_schema .iter() .enumerate() - .map(|(index, (q, f))| match unnested_fields.get(&index) { - Some(unnested_field) => (q.cloned(), unnested_field.clone()), - None => (q.cloned(), f.clone()), + .flat_map(|(index, (q, f))| match unnested_fields_map.get(&index) { + Some(unnested_fields) => unnested_fields + .iter() + .map(move |f| (q.cloned(), f.clone())) + .collect(), + None => vec![(q.cloned(), f.clone())], }) .collect::>(); @@ -1598,7 +1618,8 @@ pub fn unnest_with_options( let schema = Arc::new(df_schema.with_functional_dependencies(deps)?); Ok(LogicalPlan::Unnest(Unnest { input: Arc::new(input), - columns: qualified_columns, + list_type_columns: qualified_list_columns, + struct_type_columns: qualified_struct_columns, schema, options, })) diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 64c5b56a4080..e9d03389464b 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -339,6 +339,7 @@ impl LogicalPlan { LogicalPlan::Copy(copy) => vec![©.input], LogicalPlan::Ddl(ddl) => ddl.inputs(), LogicalPlan::Unnest(Unnest { input, .. }) => vec![input], + // TODO: add here LogicalPlan::Prepare(Prepare { input, .. }) => vec![input], LogicalPlan::RecursiveQuery(RecursiveQuery { static_term, @@ -807,11 +808,15 @@ impl LogicalPlan { } LogicalPlan::DescribeTable(_) => Ok(self.clone()), LogicalPlan::Unnest(Unnest { - columns, options, .. + list_type_columns, + struct_type_columns, + options, + .. }) => { // Update schema with unnested column type. let input = inputs.swap_remove(0); - unnest_with_options(input, columns.clone(), options.clone()) + list_type_columns.extend_from_slice(struct_type_columns); + unnest_with_options(input, list_type_columns.clone(), options.clone()) } } } @@ -1541,8 +1546,10 @@ impl LogicalPlan { LogicalPlan::DescribeTable(DescribeTable { .. 
}) => { write!(f, "DescribeTable") } - LogicalPlan::Unnest(Unnest { columns, .. }) => { - write!(f, "Unnest: {}", expr_vec_fmt!(columns)) + LogicalPlan::Unnest(Unnest { + list_type_columns, + struct_type_columns, .. }) => { + write!(f, "Unnest: {}{}", expr_vec_fmt!(list_type_columns),expr_vec_fmt!(struct_type_columns)) } } } @@ -2516,8 +2523,10 @@ pub enum Partitioning { pub struct Unnest { /// The incoming logical plan pub input: Arc, - /// The columns to unnest - pub columns: Vec, + /// The list columns to unnest + pub list_type_columns: Vec, + /// The struct columns to unnest + pub struct_type_columns: Vec, /// The output schema, containing the unnested field column. pub schema: DFSchemaRef, /// Options diff --git a/datafusion/expr/src/logical_plan/tree_node.rs b/datafusion/expr/src/logical_plan/tree_node.rs index 37a36c36ca53..825535fe23e9 100644 --- a/datafusion/expr/src/logical_plan/tree_node.rs +++ b/datafusion/expr/src/logical_plan/tree_node.rs @@ -309,6 +309,7 @@ impl TreeNode for LogicalPlan { } .update_data(LogicalPlan::Ddl) } + // TODO: add here LogicalPlan::Unnest(Unnest { input, columns, diff --git a/datafusion/physical-plan/src/lib.rs b/datafusion/physical-plan/src/lib.rs index cd2be33e86c1..5c91cd0210db 100644 --- a/datafusion/physical-plan/src/lib.rs +++ b/datafusion/physical-plan/src/lib.rs @@ -68,6 +68,7 @@ pub mod streaming; pub mod tree_node; pub mod union; pub mod unnest; +pub mod unnest_struct; pub mod values; pub mod windows; pub mod work_table; diff --git a/datafusion/physical-plan/src/unnest.rs b/datafusion/physical-plan/src/unnest.rs index 06dd8230d39e..4a104cc03b8c 100644 --- a/datafusion/physical-plan/src/unnest.rs +++ b/datafusion/physical-plan/src/unnest.rs @@ -36,28 +36,34 @@ use arrow::compute::kernels::zip::zip; use arrow::compute::{cast, is_not_null, kernels, sum}; use arrow::datatypes::{DataType, Int64Type, Schema, SchemaRef}; use arrow::record_batch::RecordBatch; -use arrow_array::{Int64Array, Scalar}; +use 
arrow_array::{Int64Array, Scalar, StructArray}; use arrow_ord::cmp::lt; -use datafusion_common::{exec_datafusion_err, exec_err, Result, UnnestOptions}; +use datafusion_common::{ + exec_datafusion_err, exec_err, Result, UnnestListOptions, UnnestOptions, +}; use datafusion_execution::TaskContext; use datafusion_physical_expr::EquivalenceProperties; use async_trait::async_trait; use futures::{Stream, StreamExt}; +use hashbrown::HashSet; use log::trace; /// Unnest the given columns by joining the row with each value in the /// nested type. /// /// See [`UnnestOptions`] for more details and an example. +/// TODO: rename into UnnestListExec #[derive(Debug)] pub struct UnnestExec { /// Input execution plan input: Arc, /// The schema once the unnest is applied schema: SchemaRef, - /// The unnest columns - columns: Vec, + /// The unnest list type columns + list_type_columns: Vec, + /// + struct_column_indices: HashSet, /// Options options: UnnestOptions, /// Execution metrics @@ -70,15 +76,20 @@ impl UnnestExec { /// Create a new [UnnestExec]. 
pub fn new( input: Arc, - columns: Vec, + list_type_columns: Vec, + struct_type_columns: Vec, schema: SchemaRef, options: UnnestOptions, ) -> Self { let cache = Self::compute_properties(&input, schema.clone()); + let struct_column_indices: HashSet<_> = + struct_type_columns.iter().map(|column| column.index()).collect(); + UnnestExec { input, schema, - columns, + list_type_columns, + struct_column_indices, options, metrics: Default::default(), cache, @@ -137,7 +148,8 @@ impl ExecutionPlan for UnnestExec { ) -> Result> { Ok(Arc::new(UnnestExec::new( children[0].clone(), - self.columns.clone(), + self.list_type_columns.clone(), + self.struct_column_indices.clone(), self.schema.clone(), self.options.clone(), ))) @@ -158,7 +170,8 @@ impl ExecutionPlan for UnnestExec { Ok(Box::pin(UnnestStream { input, schema: self.schema.clone(), - columns: self.columns.clone(), + list_type_columns: self.list_type_columns.clone(), + struct_column_indices: self.struct_column_indices.clone(), options: self.options.clone(), metrics, })) @@ -214,7 +227,9 @@ struct UnnestStream { /// Unnested schema schema: Arc, /// The unnest columns - columns: Vec, + list_type_columns: Vec, + /// + struct_column_indices: HashSet, /// Options options: UnnestOptions, /// Metrics @@ -251,8 +266,13 @@ impl UnnestStream { .map(|maybe_batch| match maybe_batch { Some(Ok(batch)) => { let timer = self.metrics.elapsed_compute.timer(); - let result = - build_batch(&batch, &self.schema, &self.columns, &self.options); + let result = build_batch( + &batch, + &self.schema, + &self.list_type_columns, + &self.struct_column_indices, + &self.options, + ); self.metrics.input_batches.add(1); self.metrics.input_rows.add(batch.num_rows()); if let Ok(ref batch) = result { @@ -287,10 +307,12 @@ impl UnnestStream { fn build_batch( batch: &RecordBatch, schema: &SchemaRef, - columns: &[Column], + // list type column only + list_type_columns: &[Column], + struct_column_indices: &HashSet, options: &UnnestOptions, ) -> Result { - let 
list_arrays: Vec = columns + let list_arrays: Vec = list_type_columns .iter() .map(|column| column.evaluate(batch)?.into_array(batch.num_rows())) .collect::>()?; @@ -311,16 +333,61 @@ fn build_batch( // Unnest all the list arrays let unnested_arrays = unnest_list_arrays(&list_arrays, unnested_length, total_length)?; + let unnested_array_map: HashMap<_, _> = unnested_arrays .into_iter() - .zip(columns.iter()) + .zip(list_type_columns.iter()) .map(|(array, column)| (column.index(), array)) .collect(); // Create the take indices array for other columns let take_indicies = create_take_indicies(unnested_length, total_length); - batch_from_indices(batch, schema, &unnested_array_map, &take_indicies) + // vertical expansion because of list unnest + let rows_expanded = batch_from_indices(batch, &unnested_array_map, &take_indicies)?; + // horizontal expansion because of struct unnest + let columns_expanded = rows_expanded + .iter() + .enumerate() + .flat_map(|(idx, column_data)| match struct_column_indices.get(&idx) { + Some(_) => match column_data.data_type() { + DataType::Struct(fields) => { + let struct_arr = + column_data.as_any().downcast_ref::().unwrap(); + struct_arr.columns().to_vec() + } + other => exec_err!("Invalid unnest datatype {other}"), + }, + None => { + vec![column_data.clone()] + } + }) + .collect::>>()?; + Ok(RecordBatch::try_new(schema.clone(), columns_expanded)) +} + +fn unnest_struct_arrays( + unnested_indices: HashSet, + input: &[ArrayRef], +) -> Result> { + let unnested_arrays = input + .iter() + .enumerate() + .flat_map(|(idx, column_data)| match unnested_indices.get(&idx) { + Some(_) => match column_data.data_type() { + DataType::Struct(fields) => { + let struct_arr = + column_data.as_any().downcast_ref::().unwrap(); + struct_arr.columns().to_vec() + } + other => exec_err!("Invalid unnest datatype {other }"), + }, + None => { + vec![column_data.clone()] + } + }) + .collect::>>()?; + Ok(unnested_arrays) } /// Find the longest list length among the 
given list arrays for each row. @@ -570,10 +637,9 @@ fn create_take_indicies( /// fn batch_from_indices( batch: &RecordBatch, - schema: &SchemaRef, unnested_list_arrays: &HashMap, indices: &PrimitiveArray, -) -> Result { +) -> Result>> { let arrays = batch .columns() .iter() @@ -583,8 +649,7 @@ fn batch_from_indices( None => Ok(kernels::take::take(arr, indices, None)?), }) .collect::>>()?; - - Ok(RecordBatch::try_new(schema.clone(), arrays.to_vec())?) + return Ok(arrays); } #[cfg(test)] From 78c96a0d878b5e3b8c9880f7929f7bfbc5940c11 Mon Sep 17 00:00:00 2001 From: Duong Cong Toai Date: Thu, 9 May 2024 07:44:12 +0200 Subject: [PATCH 02/21] compile err --- datafusion/common/src/unnest.rs | 8 +++ datafusion/core/src/physical_planner.rs | 26 ++++----- datafusion/expr/src/expr_schema.rs | 2 +- datafusion/expr/src/logical_plan/builder.rs | 38 +++++++------ datafusion/expr/src/logical_plan/display.rs | 5 +- datafusion/expr/src/logical_plan/plan.rs | 15 +++-- datafusion/expr/src/logical_plan/tree_node.rs | 10 +++- datafusion/physical-plan/src/lib.rs | 1 - datafusion/physical-plan/src/unnest.rs | 38 ++----------- datafusion/sql/src/expr/function.rs | 2 +- .../sqllogictest/test_files/unnest_debug.slt | 55 +++++++++++++++++++ 11 files changed, 121 insertions(+), 79 deletions(-) create mode 100644 datafusion/sqllogictest/test_files/unnest_debug.slt diff --git a/datafusion/common/src/unnest.rs b/datafusion/common/src/unnest.rs index cf0a9b2b4638..fd92267f9b4c 100644 --- a/datafusion/common/src/unnest.rs +++ b/datafusion/common/src/unnest.rs @@ -66,6 +66,14 @@ pub struct UnnestOptions { pub preserve_nulls: bool, } +impl Default for UnnestOptions { + fn default() -> Self { + Self { + // default to true to maintain backwards compatible behavior + preserve_nulls: true, + } + } +} impl UnnestOptions { /// Create a new [`UnnestOptions`] with default values diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 00eda21d6bbb..e76bb38b6d1f 
100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -100,6 +100,7 @@ use datafusion_sql::utils::window_expr_common_partition_keys; use async_trait::async_trait; use futures::{StreamExt, TryStreamExt}; +use hashbrown::HashSet; use itertools::{multiunzip, Itertools}; use log::{debug, trace}; use sqlparser::ast::NullTreatment; @@ -1156,6 +1157,7 @@ impl DefaultPhysicalPlanner { Arc::new(GlobalLimitExec::new(input, *skip, *fetch)) } LogicalPlan::Unnest(Unnest { + columns, list_type_columns, struct_type_columns, schema, @@ -1163,27 +1165,25 @@ impl DefaultPhysicalPlanner { .. }) => { let input = children.one()?; - let list_column_execs = list_type_columns + let list_column_exec = list_type_columns .iter() - .map(|column| { - schema - .index_of_column(column) - .map(|idx| Column::new(&column.name, idx)) + .map(|idx| { + let column = &columns[*idx]; + let schema_idx = schema.index_of_column(column)?; + Ok(Column::new(&column.name,schema_idx)) }) .collect::>()?; - let struct_column_execs = struct_type_columns + + let struct_columns_set: HashSet = struct_type_columns .iter() - .map(|column| { - schema - .index_of_column(column) - .map(|idx| Column::new(&column.name, idx)) - }) + .map(|idx| schema.index_of_column(&columns[*idx])) .collect::>()?; + let schema = SchemaRef::new(schema.as_ref().to_owned().into()); Arc::new(UnnestExec::new( input, - list_type_columns, - struct_column_execs, + list_column_exec, + struct_columns_set, schema, options.clone(), )) diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index f93f08574906..240f6a09f332 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -123,7 +123,7 @@ impl ExprSchemable for Expr { Ok(field.data_type().clone()) } DataType::Struct(_) => { - not_impl_err!("unnest() does not support struct yet") + Ok(arg_data_type.clone()) } DataType::Null => { not_impl_err!("unnest() does not support null yet") diff 
--git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 13e57cd860bd..2e15a962979a 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -1558,44 +1558,45 @@ pub fn unnest_with_options( let mut unnested_fields_map: HashMap = HashMap::with_capacity(columns.len()); // Add qualifiers to the columns. - let mut qualified_list_columns = Vec::with_capacity(columns.len()); - let mut qualified_struct_columns = Vec::with_capacity(columns.len()); - for c in &columns { - let index = input.schema().index_of_column(c)?; - let (unnest_qualifier, unnest_field) = input.schema().qualified_field(index); + let mut qualified_columns = Vec::with_capacity(columns.len()); + let mut list_columns = Vec::with_capacity(columns.len()); + let mut struct_columns = Vec::with_capacity(columns.len()); + for (idx,c) in columns.iter().enumerate() { + let schema_idx = input.schema().index_of_column(c)?; + println!("unnesting column {} {}",schema_idx,c.name); + let (unnest_qualifier, unnest_field) = input.schema().qualified_field(schema_idx); match unnest_field.data_type() { DataType::List(field) | DataType::FixedSizeList(field, _) | DataType::LargeList(field) => { - let unnest_field = Arc::new(Field::new( + let new_field = Arc::new(Field::new( unnest_field.name(), field.data_type().clone(), // Unnesting may produce NULLs even if the list is not null. 
// For example: unnset([1], []) -> 1, null true, )); - unnested_fields_map.insert(index, unnested_fields); - qualified_columns.extend( - unnested_fields - .iter() - .map(|f| Column::from((unnest_qualifier, f))), - ); + qualified_columns.push(Column::from((unnest_qualifier, &new_field))); + unnested_fields_map.insert(schema_idx, vec![new_field]); + list_columns.push(idx); } DataType::Struct(fields) => { - let unnested_fields = fields.tovec(); - unnested_fields_map.insert(index, unnested_fields); + let new_fields = fields.to_vec(); qualified_columns.extend( - unnested_fields + new_fields .iter() .map(|f| Column::from((unnest_qualifier, f))), ); - }, + unnested_fields_map.insert(schema_idx, new_fields); + struct_columns.push(idx); + } _ => { // If the unnest field is not a list type return the input plan. return Ok(input); } } } + println!("---"); let input_schema = input.schema(); @@ -1618,8 +1619,9 @@ pub fn unnest_with_options( let schema = Arc::new(df_schema.with_functional_dependencies(deps)?); Ok(LogicalPlan::Unnest(Unnest { input: Arc::new(input), - list_type_columns: qualified_list_columns, - struct_type_columns: qualified_struct_columns, + columns: qualified_columns, + list_type_columns: list_columns, + struct_type_columns: struct_columns, schema, options, })) diff --git a/datafusion/expr/src/logical_plan/display.rs b/datafusion/expr/src/logical_plan/display.rs index 3a2ed9ffc2d8..172e4fa3dc2f 100644 --- a/datafusion/expr/src/logical_plan/display.rs +++ b/datafusion/expr/src/logical_plan/display.rs @@ -638,10 +638,11 @@ impl<'a, 'b> PgJsonVisitor<'a, 'b> { "Node Type": "DescribeTable" }) } - LogicalPlan::Unnest(Unnest { columns, .. }) => { + LogicalPlan::Unnest(Unnest { list_type_columns,struct_type_columns, .. 
}) => { json!({ "Node Type": "Unnest", - "Column": expr_vec_fmt!(columns), + "ListColumn": expr_vec_fmt!(list_type_columns), + "StructColumn": expr_vec_fmt!(struct_type_columns), }) } } diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index e9d03389464b..af5bd16fda71 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -808,15 +808,13 @@ impl LogicalPlan { } LogicalPlan::DescribeTable(_) => Ok(self.clone()), LogicalPlan::Unnest(Unnest { - list_type_columns, - struct_type_columns, + columns, options, .. }) => { // Update schema with unnested column type. let input = inputs.swap_remove(0); - list_type_columns.extend_from_slice(struct_type_columns); - unnest_with_options(input, list_type_columns.clone(), options.clone()) + unnest_with_options(input, columns.clone(), options.clone()) } } } @@ -2523,10 +2521,11 @@ pub enum Partitioning { pub struct Unnest { /// The incoming logical plan pub input: Arc, - /// The list columns to unnest - pub list_type_columns: Vec, - /// The struct columns to unnest - pub struct_type_columns: Vec, + pub columns: Vec, + /// refer to the indices of field columns that have type list + pub list_type_columns: Vec, + /// refer to the indices of field columns that have type struct + pub struct_type_columns: Vec, /// The output schema, containing the unnested field column. 
pub schema: DFSchemaRef, /// Options diff --git a/datafusion/expr/src/logical_plan/tree_node.rs b/datafusion/expr/src/logical_plan/tree_node.rs index 825535fe23e9..25a187411bbc 100644 --- a/datafusion/expr/src/logical_plan/tree_node.rs +++ b/datafusion/expr/src/logical_plan/tree_node.rs @@ -309,16 +309,19 @@ impl TreeNode for LogicalPlan { } .update_data(LogicalPlan::Ddl) } - // TODO: add here LogicalPlan::Unnest(Unnest { input, columns, + list_type_columns, + struct_type_columns, schema, options, }) => rewrite_arc(input, f)?.update_data(|input| { LogicalPlan::Unnest(Unnest { input, columns, + list_type_columns, + struct_type_columns, schema, options, }) @@ -491,7 +494,10 @@ impl LogicalPlan { LogicalPlan::TableScan(TableScan { filters, .. }) => { filters.iter().apply_until_stop(f) } - LogicalPlan::Unnest(Unnest { columns, .. }) => { + LogicalPlan::Unnest(Unnest { + columns, + .. + }) => { let exprs = columns .iter() .map(|c| Expr::Column(c.clone())) diff --git a/datafusion/physical-plan/src/lib.rs b/datafusion/physical-plan/src/lib.rs index 5c91cd0210db..cd2be33e86c1 100644 --- a/datafusion/physical-plan/src/lib.rs +++ b/datafusion/physical-plan/src/lib.rs @@ -68,7 +68,6 @@ pub mod streaming; pub mod tree_node; pub mod union; pub mod unnest; -pub mod unnest_struct; pub mod values; pub mod windows; pub mod work_table; diff --git a/datafusion/physical-plan/src/unnest.rs b/datafusion/physical-plan/src/unnest.rs index 4a104cc03b8c..7a43836ee4f3 100644 --- a/datafusion/physical-plan/src/unnest.rs +++ b/datafusion/physical-plan/src/unnest.rs @@ -38,9 +38,7 @@ use arrow::datatypes::{DataType, Int64Type, Schema, SchemaRef}; use arrow::record_batch::RecordBatch; use arrow_array::{Int64Array, Scalar, StructArray}; use arrow_ord::cmp::lt; -use datafusion_common::{ - exec_datafusion_err, exec_err, Result, UnnestListOptions, UnnestOptions, -}; +use datafusion_common::{exec_datafusion_err, exec_err, Result, UnnestOptions}; use datafusion_execution::TaskContext; use 
datafusion_physical_expr::EquivalenceProperties; @@ -77,13 +75,11 @@ impl UnnestExec { pub fn new( input: Arc, list_type_columns: Vec, - struct_type_columns: Vec, + struct_column_indices: HashSet, schema: SchemaRef, options: UnnestOptions, ) -> Self { let cache = Self::compute_properties(&input, schema.clone()); - let struct_column_indices: HashSet<_> = - struct_type_columns.iter().map(|column| column.index()).collect(); UnnestExec { input, @@ -356,38 +352,14 @@ fn build_batch( column_data.as_any().downcast_ref::().unwrap(); struct_arr.columns().to_vec() } - other => exec_err!("Invalid unnest datatype {other}"), + other => vec![column_data.clone()], }, None => { vec![column_data.clone()] } }) - .collect::>>()?; - Ok(RecordBatch::try_new(schema.clone(), columns_expanded)) -} - -fn unnest_struct_arrays( - unnested_indices: HashSet, - input: &[ArrayRef], -) -> Result> { - let unnested_arrays = input - .iter() - .enumerate() - .flat_map(|(idx, column_data)| match unnested_indices.get(&idx) { - Some(_) => match column_data.data_type() { - DataType::Struct(fields) => { - let struct_arr = - column_data.as_any().downcast_ref::().unwrap(); - struct_arr.columns().to_vec() - } - other => exec_err!("Invalid unnest datatype {other }"), - }, - None => { - vec![column_data.clone()] - } - }) - .collect::>>()?; - Ok(unnested_arrays) + .collect::>(); + Ok(RecordBatch::try_new(schema.clone(), columns_expanded)?) } /// Find the longest list length among the given list arrays for each row. 
diff --git a/datafusion/sql/src/expr/function.rs b/datafusion/sql/src/expr/function.rs index 3adf2960784d..2c9cfa77a9cb 100644 --- a/datafusion/sql/src/expr/function.rs +++ b/datafusion/sql/src/expr/function.rs @@ -357,7 +357,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { | DataType::LargeList(_) | DataType::FixedSizeList(_, _) => Ok(()), DataType::Struct(_) => { - not_impl_err!("unnest() does not support struct yet") + Ok(()) } DataType::Null => { not_impl_err!("unnest() does not support null yet") diff --git a/datafusion/sqllogictest/test_files/unnest_debug.slt b/datafusion/sqllogictest/test_files/unnest_debug.slt new file mode 100644 index 000000000000..0bf3e4fc6b7d --- /dev/null +++ b/datafusion/sqllogictest/test_files/unnest_debug.slt @@ -0,0 +1,55 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +############################ +# Unnest Expressions Tests # +############################ + +statement ok +CREATE TABLE unnest_table +AS VALUES + ([1,2,3], [7], 1, [13, 14]), + ([4,5], [8,9,10], 2, [15, 16]), + ([6], [11,12], 3, null), + ([12], [null, 42, null], null, null), + -- null array to verify the `preserve_nulls` option + (null, null, 4, [17, 18]) +; + +query ?II +select unnest(struct(1 as "name0", 3.14 as name1, 'e', true as 'name3')); +---- + +query ?II +select array_remove(column1, 4), unnest(column2), column3 * 10 from unnest_table; +---- +[1, 2, 3] 7 10 +[5] 8 20 +[5] 9 20 +[5] 10 20 +[6] 11 30 +[6] 12 30 +[12] NULL NULL +[12] 42 NULL +[12] NULL NULL + + + + + +statement ok +drop table unnest_table; From 4217655adf71def8743eb7f55ced56d32f2a91df Mon Sep 17 00:00:00 2001 From: Duong Cong Toai Date: Sat, 11 May 2024 20:16:31 +0200 Subject: [PATCH 03/21] debugging --- datafusion/core/src/dataframe/mod.rs | 10 +- datafusion/core/src/physical_planner.rs | 2 +- datafusion/expr/src/expr_schema.rs | 23 ++++ datafusion/expr/src/logical_plan/builder.rs | 92 ++++++++++------ datafusion/expr/src/logical_plan/plan.rs | 26 ++++- datafusion/expr/src/logical_plan/tree_node.rs | 17 ++- .../optimizer/src/analyzer/type_coercion.rs | 15 ++- .../optimizer/src/optimize_projections/mod.rs | 18 ++- datafusion/physical-plan/src/unnest.rs | 103 ++++++++++-------- datafusion/sql/src/select.rs | 61 +++++++++-- .../sqllogictest/test_files/unnest_debug.slt | 31 +----- 11 files changed, 266 insertions(+), 132 deletions(-) diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index 4644e15febef..b8d1a96c9847 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -297,9 +297,13 @@ impl DataFrame { options: UnnestOptions, ) -> Result { let columns = columns.iter().map(|c| Column::from(*c)).collect(); - let plan = LogicalPlanBuilder::from(self.plan) - .unnest_columns_with_options(columns, options)? 
- .build()?; + let (plan, transformed_columns) = LogicalPlanBuilder::from(self.plan) + .unnest_columns_with_options(columns, options)?; + let new_column_exprs: Vec = transformed_columns + .into_iter() + .map(|col| Expr::Column(col)) + .collect(); + let plan = plan.project(new_column_exprs)?.build()?; Ok(DataFrame { session_state: self.session_state, plan, diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index e76bb38b6d1f..40b5b24c9f44 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -1157,7 +1157,7 @@ impl DefaultPhysicalPlanner { Arc::new(GlobalLimitExec::new(input, *skip, *fetch)) } LogicalPlan::Unnest(Unnest { - columns, + output_columns: columns, list_type_columns, struct_type_columns, schema, diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index 240f6a09f332..06a98bd1446b 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -422,6 +422,9 @@ impl ExprSchemable for Expr { match self { Expr::Column(c) => { let (data_type, nullable) = self.data_type_and_nullable(input_schema)?; + if let Err(_) = self.metadata(input_schema) { + panic!("here"); + } Ok(( c.relation.clone(), Field::new(&c.name, data_type, nullable) @@ -431,6 +434,9 @@ impl ExprSchemable for Expr { } Expr::Alias(Alias { relation, name, .. 
}) => { let (data_type, nullable) = self.data_type_and_nullable(input_schema)?; + if let Err(_) = self.metadata(input_schema) { + panic!("here"); + } Ok(( relation.clone(), Field::new(name, data_type, nullable) @@ -438,8 +444,25 @@ impl ExprSchemable for Expr { .into(), )) } + Expr::Unnest(Unnest { expr }) => { + let st = self.data_type_and_nullable(input_schema); + let (data_type, nullable) = self.data_type_and_nullable(input_schema)?; + if let Err(_) = self.metadata(input_schema) { + panic!("here"); + } + Ok(( + None, + Field::new(self.display_name()?, data_type, nullable) + .with_metadata(self.metadata(input_schema)?) + .into(), + )) + } _ => { + let st = self.data_type_and_nullable(input_schema); let (data_type, nullable) = self.data_type_and_nullable(input_schema)?; + if let Err(_) = self.metadata(input_schema) { + panic!("here"); + } Ok(( None, Field::new(self.display_name()?, data_type, nullable) diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 2e15a962979a..b72055cb9b9e 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -50,9 +50,9 @@ use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaRef}; use datafusion_common::config::FormatOptions; use datafusion_common::display::ToStringifiedPlan; use datafusion_common::{ - get_target_functional_dependencies, not_impl_err, plan_datafusion_err, plan_err, - Column, DFSchema, DFSchemaRef, DataFusionError, Result, ScalarValue, TableReference, - ToDFSchema, UnnestOptions, + get_target_functional_dependencies, not_impl_err, plan_datafusion_err, + plan_err, Column, DFSchema, DFSchemaRef, DataFusionError, Result, ScalarValue, + TableReference, ToDFSchema, UnnestOptions, }; /// Default table name for unnamed table @@ -1111,7 +1111,7 @@ impl LogicalPlanBuilder { /// Unnest the given column. 
pub fn unnest_column(self, column: impl Into) -> Result { - Ok(Self::from(unnest(self.plan, vec![column.into()])?)) + Ok(Self::from(unnest(self.plan,vec![column.into()])?)) } /// Unnest the given column given [`UnnestOptions`] @@ -1120,11 +1120,7 @@ impl LogicalPlanBuilder { column: impl Into, options: UnnestOptions, ) -> Result { - Ok(Self::from(unnest_with_options( - self.plan, - vec![column.into()], - options, - )?)) + Ok(Self::from(unnest_with_options(self.plan, vec![column.into()], options)?)) } /// Unnest the given columns with the given [`UnnestOptions`] @@ -1133,9 +1129,7 @@ impl LogicalPlanBuilder { columns: Vec, options: UnnestOptions, ) -> Result { - Ok(Self::from(unnest_with_options( - self.plan, columns, options, - )?)) + Ok(Self::from(unnest_with_options(self.plan, columns, options)?)) } } pub fn change_redundant_column(fields: &Fields) -> Vec { @@ -1544,10 +1538,45 @@ impl TableSource for LogicalTableSource { } /// Create a [`LogicalPlan::Unnest`] plan -pub fn unnest(input: LogicalPlan, columns: Vec) -> Result { +pub fn unnest( + input: LogicalPlan, + columns: Vec, +) -> Result { unnest_with_options(input, columns, UnnestOptions::default()) } +pub fn projection_after_unnest(c: &Column, schema: &DFSchemaRef) -> Result> { + let mut qualified_columns = Vec::with_capacity(1); + let schema_idx = schema.index_of_column(c)?; + let (unnest_qualifier, unnest_field) = schema.qualified_field(schema_idx); + match unnest_field.data_type() { + DataType::List(field) + | DataType::FixedSizeList(field, _) + | DataType::LargeList(field) => { + let new_field = Arc::new(Field::new( + unnest_field.name(), + field.data_type().clone(), + // Unnesting may produce NULLs even if the list is not null. 
+ // For example: unnset([1], []) -> 1, null + true, + )); + qualified_columns.push(Column::from((unnest_qualifier, &new_field))); + } + DataType::Struct(fields) => { + let new_fields = fields.to_vec(); + qualified_columns.extend( + new_fields + .iter() + .map(|f| Column::from((unnest_qualifier, f))), + ); + } + _ => { + return Ok(vec![c.clone()]); + } + } + Ok(qualified_columns) +} + /// Create a [`LogicalPlan::Unnest`] plan with options pub fn unnest_with_options( input: LogicalPlan, @@ -1561,9 +1590,8 @@ pub fn unnest_with_options( let mut qualified_columns = Vec::with_capacity(columns.len()); let mut list_columns = Vec::with_capacity(columns.len()); let mut struct_columns = Vec::with_capacity(columns.len()); - for (idx,c) in columns.iter().enumerate() { + for (idx, c) in columns.iter().enumerate() { let schema_idx = input.schema().index_of_column(c)?; - println!("unnesting column {} {}",schema_idx,c.name); let (unnest_qualifier, unnest_field) = input.schema().qualified_field(schema_idx); match unnest_field.data_type() { DataType::List(field) @@ -1596,7 +1624,6 @@ pub fn unnest_with_options( } } } - println!("---"); let input_schema = input.schema(); @@ -1617,14 +1644,17 @@ pub fn unnest_with_options( // We can use the existing functional dependencies: let deps = input_schema.functional_dependencies().clone(); let schema = Arc::new(df_schema.with_functional_dependencies(deps)?); - Ok(LogicalPlan::Unnest(Unnest { - input: Arc::new(input), - columns: qualified_columns, - list_type_columns: list_columns, - struct_type_columns: struct_columns, - schema, - options, - })) + Ok( + LogicalPlan::Unnest(Unnest { + input: Arc::new(input), + input_columns: columns, + output_columns: qualified_columns, + list_type_columns: list_columns, + struct_type_columns: struct_columns, + schema, + options, + }) + ) } #[cfg(test)] @@ -2052,16 +2082,14 @@ mod tests { fn plan_builder_unnest() -> Result<()> { // Unnesting a simple column should return the child plan. 
let plan = nested_table_scan("test_table")? - .unnest_column("scalar")? - .build()?; + .unnest_column("scalar")?.build()?; let expected = "TableScan: test_table"; assert_eq!(expected, format!("{plan:?}")); // Unnesting the strings list. - let plan = nested_table_scan("test_table")? - .unnest_column("strings")? - .build()?; + let plan = nested_table_scan("test_table")?.unnest_column("strings")? + .build()?; let expected = "\ Unnest: test_table.strings\ @@ -2076,10 +2104,8 @@ mod tests { assert_eq!(&DataType::Utf8, field.data_type()); // Unnesting multiple fields. - let plan = nested_table_scan("test_table")? - .unnest_column("strings")? - .unnest_column("structs")? - .build()?; + let plan = nested_table_scan("test_table")?.unnest_column("strings")? + .unnest_column("structs")?.build()?; let expected = "\ Unnest: test_table.structs\ diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index af5bd16fda71..2a4646d8e9b0 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -808,13 +808,15 @@ impl LogicalPlan { } LogicalPlan::DescribeTable(_) => Ok(self.clone()), LogicalPlan::Unnest(Unnest { - columns, + input_columns: columns, options, .. }) => { // Update schema with unnested column type. let input = inputs.swap_remove(0); - unnest_with_options(input, columns.clone(), options.clone()) + let new_plan = unnest_with_options(input, + columns.clone(), options.clone())?; + Ok(new_plan) } } } @@ -1686,6 +1688,8 @@ impl Projection { /// produced by the projection operation. If the schema computation is successful, /// the `Result` will contain the schema; otherwise, it will contain an error. 
pub fn projection_schema(input: &LogicalPlan, exprs: &[Expr]) -> Result> { + // println!("display projection input {}\n---with schema {:?}",input.display_indent_schema(),input.schema()); + // println!("exprs being used {:?}",exprs[0]); let mut schema = DFSchema::new_with_metadata( exprlist_to_fields(exprs, input)?, input.schema().metadata().clone(), @@ -2521,7 +2525,10 @@ pub enum Partitioning { pub struct Unnest { /// The incoming logical plan pub input: Arc, - pub columns: Vec, + pub input_columns: Vec, + // TODO: this should be converted into output column + // instead of the columns in the input + pub output_columns: Vec, /// refer to the indices of field columns that have type list pub list_type_columns: Vec, /// refer to the indices of field columns that have type struct @@ -2531,6 +2538,19 @@ pub struct Unnest { /// Options pub options: UnnestOptions, } +impl Unnest{ + // reference to columns field indexed by list_type_columns + pub fn get_list_columns(&self)-> Vec { + self.list_type_columns.iter().map( + |&i| self.output_columns[i].clone() + ).collect() + } + pub fn get_struct_columns(&self) ->Vec{ + self.struct_type_columns.iter().map( + |&i| self.output_columns[i].clone() + ).collect() + } +} #[cfg(test)] mod tests { diff --git a/datafusion/expr/src/logical_plan/tree_node.rs b/datafusion/expr/src/logical_plan/tree_node.rs index 25a187411bbc..9a8e2fba7b9d 100644 --- a/datafusion/expr/src/logical_plan/tree_node.rs +++ b/datafusion/expr/src/logical_plan/tree_node.rs @@ -311,7 +311,8 @@ impl TreeNode for LogicalPlan { } LogicalPlan::Unnest(Unnest { input, - columns, + input_columns, + output_columns: columns, list_type_columns, struct_type_columns, schema, @@ -319,7 +320,8 @@ impl TreeNode for LogicalPlan { }) => rewrite_arc(input, f)?.update_data(|input| { LogicalPlan::Unnest(Unnest { input, - columns, + input_columns, + output_columns: columns, list_type_columns, struct_type_columns, schema, @@ -494,10 +496,13 @@ impl LogicalPlan { 
LogicalPlan::TableScan(TableScan { filters, .. }) => { filters.iter().apply_until_stop(f) } - LogicalPlan::Unnest(Unnest { - columns, - .. - }) => { + LogicalPlan::Unnest(unnest) => { + let columns = unnest.input_columns.clone(); + // let mut columns = unnest.get_list_columns(); + // columns.extend_from_slice( + // unnest.get_struct_columns().as_ref()); + + // TODO: detect struct column let exprs = columns .iter() .map(|c| Expr::Column(c.clone())) diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index b7f95d83e8fc..caf0882d6a1a 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -77,6 +77,9 @@ fn analyze_internal( plan: &LogicalPlan, ) -> Result { // optimize child plans first + println!("\n==============analyzing_internal begins========================"); + println!("external schema: {:?}", external_schema); + println!("current plan\n{:?}\nrecursively transforming input", plan); let new_inputs = plan .inputs() .iter() @@ -103,17 +106,25 @@ fn analyze_internal( schema: Arc::new(schema), }; + println!("===|rewriting children expressions"); + let new_expr = plan .expressions() .into_iter() .map(|expr| { // ensure aggregate names don't change: // https://github.com/apache/datafusion/issues/3555 - rewrite_preserving_name(expr, &mut expr_rewrite) + let new_expr = rewrite_preserving_name(expr.clone(), &mut expr_rewrite); + let ret = new_expr?; + println!("=====|{:?} => {:?}",expr, ret); + Ok(ret) }) .collect::>>()?; - plan.with_new_exprs(new_expr, new_inputs) + + let new_plan = plan.with_new_exprs(new_expr, new_inputs)?; + println!("===|New plan:\n{:?}\n\n===============", new_plan); + Ok(new_plan) } pub(crate) struct TypeCoercionRewriter { diff --git a/datafusion/optimizer/src/optimize_projections/mod.rs b/datafusion/optimizer/src/optimize_projections/mod.rs index 0f2aaa6cbcb3..7f9f2b530abd 100644 --- 
a/datafusion/optimizer/src/optimize_projections/mod.rs +++ b/datafusion/optimizer/src/optimize_projections/mod.rs @@ -109,10 +109,26 @@ fn optimize_projections( indices: RequiredIndicies, ) -> Result> { let child_required_indices: Vec = match plan { + LogicalPlan::Unnest(_) => { + println!("{:?}",indices); + // Pass index requirements from the parent as well as column indices + // that appear in this plan's expressions to its child. All these + // operators benefit from "small" inputs, so the projection_beneficial + // flag is `true`. + vec![RequiredIndicies::new_from_indices(vec![0,0])] + // plan.inputs() + // .into_iter() + // .map(|input| { + // indices + // .clone() + // .with_projection_beneficial() + // .with_plan_exprs(plan, input.schema()) + // }) + // .collect::>()? + } LogicalPlan::Sort(_) | LogicalPlan::Filter(_) | LogicalPlan::Repartition(_) - | LogicalPlan::Unnest(_) | LogicalPlan::Union(_) | LogicalPlan::SubqueryAlias(_) | LogicalPlan::Distinct(Distinct::On(_)) => { diff --git a/datafusion/physical-plan/src/unnest.rs b/datafusion/physical-plan/src/unnest.rs index 7a43836ee4f3..5715fdf31a29 100644 --- a/datafusion/physical-plan/src/unnest.rs +++ b/datafusion/physical-plan/src/unnest.rs @@ -295,54 +295,13 @@ impl UnnestStream { } } -/// For each row in a `RecordBatch`, some list columns need to be unnested. -/// We will expand the values in each list into multiple rows, -/// taking the longest length among these lists, and shorter lists are padded with NULLs. -// -/// For columns that don't need to be unnested, repeat their values until reaching the longest length. 
-fn build_batch( - batch: &RecordBatch, +fn build_batch_vertically( + input_batch: &[Arc], schema: &SchemaRef, - // list type column only - list_type_columns: &[Column], struct_column_indices: &HashSet, - options: &UnnestOptions, ) -> Result { - let list_arrays: Vec = list_type_columns - .iter() - .map(|column| column.evaluate(batch)?.into_array(batch.num_rows())) - .collect::>()?; - - let longest_length = find_longest_length(&list_arrays, options)?; - let unnested_length = longest_length.as_primitive::(); - let total_length = if unnested_length.is_empty() { - 0 - } else { - sum(unnested_length).ok_or_else(|| { - exec_datafusion_err!("Failed to calculate the total unnested length") - })? as usize - }; - if total_length == 0 { - return Ok(RecordBatch::new_empty(schema.clone())); - } - - // Unnest all the list arrays - let unnested_arrays = - unnest_list_arrays(&list_arrays, unnested_length, total_length)?; - - let unnested_array_map: HashMap<_, _> = unnested_arrays - .into_iter() - .zip(list_type_columns.iter()) - .map(|(array, column)| (column.index(), array)) - .collect(); - - // Create the take indices array for other columns - let take_indicies = create_take_indicies(unnested_length, total_length); - - // vertical expansion because of list unnest - let rows_expanded = batch_from_indices(batch, &unnested_array_map, &take_indicies)?; // horizontal expansion because of struct unnest - let columns_expanded = rows_expanded + let columns_expanded = input_batch .iter() .enumerate() .flat_map(|(idx, column_data)| match struct_column_indices.get(&idx) { @@ -362,6 +321,62 @@ fn build_batch( Ok(RecordBatch::try_new(schema.clone(), columns_expanded)?) } +/// For each row in a `RecordBatch`, some list columns need to be unnested. +/// We will expand the values in each list into multiple rows, +/// taking the longest length among these lists, and shorter lists are padded with NULLs. 
+// +/// For columns that don't need to be unnested, repeat their values until reaching the longest length. +fn build_batch( + batch: &RecordBatch, + schema: &SchemaRef, + // list type column only + list_type_columns: &[Column], + struct_column_indices: &HashSet, + options: &UnnestOptions, +) -> Result { + let transformed = match list_type_columns.len() { + 0 => build_batch_vertically(batch.columns(), schema, struct_column_indices), + _ => { + let list_arrays: Vec = list_type_columns + .iter() + .map(|column| column.evaluate(batch)?.into_array(batch.num_rows())) + .collect::>()?; + + let longest_length = find_longest_length(&list_arrays, options)?; + let unnested_length = longest_length.as_primitive::(); + let total_length = if unnested_length.is_empty() { + 0 + } else { + sum(unnested_length).ok_or_else(|| { + exec_datafusion_err!("Failed to calculate the total unnested length") + })? as usize + }; + if total_length == 0 { + return Ok(RecordBatch::new_empty(schema.clone())); + } + + // Unnest all the list arrays + let unnested_arrays = + unnest_list_arrays(&list_arrays, unnested_length, total_length)?; + let unnested_array_map: HashMap<_, _> = unnested_arrays + .into_iter() + .zip(list_type_columns.iter()) + .map(|(array, column)| (column.index(), array)) + .collect(); + + // Create the take indices array for other columns + let take_indicies = create_take_indicies(unnested_length, total_length); + + // vertical expansion because of list unnest + let ret = batch_from_indices(batch, + &unnested_array_map, &take_indicies)?; + build_batch_vertically(&ret, schema, struct_column_indices) + } + }; + transformed + // Ok(RecordBatch::try_new(schema.clone(), transformed)?) +} + /// Find the longest list length among the given list arrays for each row. 
/// /// For example if we have the following two list arrays: diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs index 730e84cd094b..7d8918a2f6a5 100644 --- a/datafusion/sql/src/select.rs +++ b/datafusion/sql/src/select.rs @@ -24,9 +24,13 @@ use crate::utils::{ resolve_columns, resolve_positions_to_exprs, }; +use arrow_schema::DataType; use datafusion_common::tree_node::{Transformed, TreeNode}; -use datafusion_common::{not_impl_err, plan_err, DataFusionError, Result}; +use datafusion_common::{ + internal_err, not_impl_err, plan_err, DataFusionError, ExprSchema, Result, +}; use datafusion_common::{Column, UnnestOptions}; +use datafusion_expr::builder::projection_after_unnest; use datafusion_expr::expr::{Alias, Unnest}; use datafusion_expr::expr_rewriter::{ normalize_col, normalize_col_with_schemas_and_ambiguity_check, normalize_cols, @@ -298,26 +302,57 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { select_exprs: Vec, ) -> Result { let mut unnest_columns = vec![]; + // from which column used for projection, before the unnest happen + // including non unnest column and unnest column let mut inner_projection_exprs = vec![]; - let outer_projection_exprs = select_exprs + let outer_projection_exprs:Vec = select_exprs .into_iter() .map(|expr| { + // For expr at root level, we need to check if it is an unnest on struct + // column to transform the expr unnest(struct(field1, field2)) + // into struct.field1, struct.field2 + if let Expr::Unnest(Unnest { expr: ref arg }) = expr { + // will need to be rewritten into (struct_col.field1, struct_col, + //field2 ...) + let column_name = expr.display_name()?; + let column = Column::from_name(column_name); + unnest_columns.push(column_name.clone()); + // Add alias for the argument expression, to avoid naming conflicts + // with other expressions + // in the select list. For example: `select unnest(col1), col1 from t`. 
+ inner_projection_exprs.push(arg.clone().alias(column_name.clone())); + let schema = input.schema(); + + let outer_projection_columns = projection_after_unnest( + &column, schema)?; + let expr = outer_projection_columns + .iter() + .map(|col| Expr::Column(col.clone())) + .collect::>(); + return Ok(expr); + } let Transformed { data: transformed_expr, transformed, tnr: _, } = expr.transform_up(|expr: Expr| { if let Expr::Unnest(Unnest { expr: ref arg }) = expr { + // TODO: an expr like (unnest(struct_col)) + // will need to be rewritten into (struct_col.field1, struct_col,field2 ...) let column_name = expr.display_name()?; unnest_columns.push(column_name.clone()); + let col = Column::from_name(column_name); // Add alias for the argument expression, to avoid naming conflicts with other expressions // in the select list. For example: `select unnest(col1), col1 from t`. + let schema = input.schema(); + if let DataType::Struct(_)= schema.data_type(&col)? { + internal_err!("unnest on struct can ony be applied at the root level of select expression") + } + inner_projection_exprs .push(arg.clone().alias(column_name.clone())); - Ok(Transformed::yes(Expr::Column(Column::from_name( - column_name, - )))) + Ok(Transformed::yes(Expr::Column(col))) } else { Ok(Transformed::no(expr)) } @@ -326,19 +361,21 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { if !transformed { if matches!(&transformed_expr, Expr::Column(_)) { inner_projection_exprs.push(transformed_expr.clone()); - Ok(transformed_expr) + Ok(vec![transformed_expr]) } else { // We need to evaluate the expr in the inner projection, // outer projection just select its name let column_name = transformed_expr.display_name()?; inner_projection_exprs.push(transformed_expr); - Ok(Expr::Column(Column::from_name(column_name))) + Ok(vec![Expr::Column(Column::from_name(column_name))]) } } else { - Ok(transformed_expr) + Ok(vec![transformed_expr]) } }) - .collect::>>()?; + .collect::>>()? 
+ .into_iter().flatten().collect(); + // Do the final projection if unnest_columns.is_empty() { @@ -349,11 +386,11 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let columns = unnest_columns.into_iter().map(|col| col.into()).collect(); // Set preserve_nulls to false to ensure compatibility with DuckDB and PostgreSQL let unnest_options = UnnestOptions::new().with_preserve_nulls(false); - LogicalPlanBuilder::from(input) + let plan = LogicalPlanBuilder::from(input) .project(inner_projection_exprs)? .unnest_columns_with_options(columns, unnest_options)? - .project(outer_projection_exprs)? - .build() + .project(outer_projection_exprs)?.build(); + plan } } diff --git a/datafusion/sqllogictest/test_files/unnest_debug.slt b/datafusion/sqllogictest/test_files/unnest_debug.slt index 0bf3e4fc6b7d..7a4e9ddc95a4 100644 --- a/datafusion/sqllogictest/test_files/unnest_debug.slt +++ b/datafusion/sqllogictest/test_files/unnest_debug.slt @@ -22,34 +22,11 @@ statement ok CREATE TABLE unnest_table AS VALUES - ([1,2,3], [7], 1, [13, 14]), - ([4,5], [8,9,10], 2, [15, 16]), - ([6], [11,12], 3, null), - ([12], [null, 42, null], null, null), - -- null array to verify the `preserve_nulls` option - (null, null, 4, [17, 18]) + (struct(1 as "name0", 2 as "name1")) ; -query ?II -select unnest(struct(1 as "name0", 3.14 as name1, 'e', true as 'name3')); ----- -query ?II -select array_remove(column1, 4), unnest(column2), column3 * 10 from unnest_table; +query II +select unnest(column1), column1 from unnest_table; ---- -[1, 2, 3] 7 10 -[5] 8 20 -[5] 9 20 -[5] 10 20 -[6] 11 30 -[6] 12 30 -[12] NULL NULL -[12] 42 NULL -[12] NULL NULL - - - - - -statement ok -drop table unnest_table; +1 2 From 3982bb99dd70772fa770de7b1bc5a7ed8ec374fc Mon Sep 17 00:00:00 2001 From: Duong Cong Toai Date: Tue, 14 May 2024 22:38:26 +0200 Subject: [PATCH 04/21] finish basic --- datafusion/core/src/dataframe/mod.rs | 13 +- datafusion/core/src/physical_planner.rs | 11 +- datafusion/expr/src/expr_schema.rs | 2 +- 
datafusion/expr/src/logical_plan/builder.rs | 172 ++++++++++-------- datafusion/expr/src/logical_plan/plan.rs | 18 +- datafusion/expr/src/logical_plan/tree_node.rs | 8 + .../optimizer/src/analyzer/type_coercion.rs | 15 +- .../optimizer/src/optimize_projections/mod.rs | 30 ++- datafusion/sql/src/select.rs | 24 ++- .../sqllogictest/test_files/unnest_debug.slt | 9 +- 10 files changed, 161 insertions(+), 141 deletions(-) diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index b8d1a96c9847..b2f14e43b79c 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -296,14 +296,11 @@ impl DataFrame { columns: &[&str], options: UnnestOptions, ) -> Result { - let columns = columns.iter().map(|c| Column::from(*c)).collect(); - let (plan, transformed_columns) = LogicalPlanBuilder::from(self.plan) - .unnest_columns_with_options(columns, options)?; - let new_column_exprs: Vec = transformed_columns - .into_iter() - .map(|col| Expr::Column(col)) - .collect(); - let plan = plan.project(new_column_exprs)?.build()?; + // TODO: fix me + let cols:Vec = columns.iter().map(|c| Column::from(*c)).collect(); + let plan = LogicalPlanBuilder::from(self.plan) + .unnest_columns_with_options(cols.clone(), options)? 
+ .project(cols)?.build()?; Ok(DataFrame { session_state: self.session_state, plan, diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 40b5b24c9f44..8678cda615a7 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -1170,14 +1170,15 @@ impl DefaultPhysicalPlanner { .map(|idx| { let column = &columns[*idx]; let schema_idx = schema.index_of_column(column)?; - Ok(Column::new(&column.name,schema_idx)) + Ok(Column::new(&column.name, schema_idx)) }) .collect::>()?; - let struct_columns_set: HashSet = struct_type_columns - .iter() - .map(|idx| schema.index_of_column(&columns[*idx])) - .collect::>()?; + let struct_columns_set: HashSet = + struct_type_columns.iter().copied().collect(); + // .iter() + // .map(|idx| schema.index_of_column(&columns[*idx])) + // .collect::>()?; let schema = SchemaRef::new(schema.as_ref().to_owned().into()); Arc::new(UnnestExec::new( diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index 06a98bd1446b..6bd002d6093a 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -447,7 +447,7 @@ impl ExprSchemable for Expr { Expr::Unnest(Unnest { expr }) => { let st = self.data_type_and_nullable(input_schema); let (data_type, nullable) = self.data_type_and_nullable(input_schema)?; - if let Err(_) = self.metadata(input_schema) { + if let Err(_) = st { panic!("here"); } Ok(( diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index b72055cb9b9e..68fe0bfe1bab 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -50,7 +50,7 @@ use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaRef}; use datafusion_common::config::FormatOptions; use datafusion_common::display::ToStringifiedPlan; use datafusion_common::{ - get_target_functional_dependencies, not_impl_err, plan_datafusion_err, 
+ get_target_functional_dependencies, internal_err, not_impl_err, plan_datafusion_err, plan_err, Column, DFSchema, DFSchemaRef, DataFusionError, Result, ScalarValue, TableReference, ToDFSchema, UnnestOptions, }; @@ -1111,7 +1111,7 @@ impl LogicalPlanBuilder { /// Unnest the given column. pub fn unnest_column(self, column: impl Into) -> Result { - Ok(Self::from(unnest(self.plan,vec![column.into()])?)) + Ok(Self::from(unnest(self.plan, vec![column.into()])?)) } /// Unnest the given column given [`UnnestOptions`] @@ -1120,7 +1120,11 @@ impl LogicalPlanBuilder { column: impl Into, options: UnnestOptions, ) -> Result { - Ok(Self::from(unnest_with_options(self.plan, vec![column.into()], options)?)) + Ok(Self::from(unnest_with_options( + self.plan, + vec![column.into()], + options, + )?)) } /// Unnest the given columns with the given [`UnnestOptions`] @@ -1129,7 +1133,9 @@ impl LogicalPlanBuilder { columns: Vec, options: UnnestOptions, ) -> Result { - Ok(Self::from(unnest_with_options(self.plan, columns, options)?)) + Ok(Self::from(unnest_with_options( + self.plan, columns, options, + )?)) } } pub fn change_redundant_column(fields: &Fields) -> Vec { @@ -1538,43 +1544,53 @@ impl TableSource for LogicalTableSource { } /// Create a [`LogicalPlan::Unnest`] plan -pub fn unnest( - input: LogicalPlan, - columns: Vec, -) -> Result { +pub fn unnest(input: LogicalPlan, columns: Vec) -> Result { unnest_with_options(input, columns, UnnestOptions::default()) } +pub enum UnnestType { + Struct, + List, +} -pub fn projection_after_unnest(c: &Column, schema: &DFSchemaRef) -> Result> { +pub fn projection_after_unnest( + c: &Column, + schema: &DFSchemaRef, +) -> Result<(Vec<(Column, Arc)>, UnnestType)> { let mut qualified_columns = Vec::with_capacity(1); let schema_idx = schema.index_of_column(c)?; - let (unnest_qualifier, unnest_field) = schema.qualified_field(schema_idx); - match unnest_field.data_type() { + let (_, unnest_field) = schema.qualified_field(schema_idx); + let unnest_type = 
match unnest_field.data_type() { DataType::List(field) | DataType::FixedSizeList(field, _) | DataType::LargeList(field) => { let new_field = Arc::new(Field::new( + // TODO: append field name by original name + .fieldname unnest_field.name(), field.data_type().clone(), // Unnesting may produce NULLs even if the list is not null. // For example: unnset([1], []) -> 1, null true, )); - qualified_columns.push(Column::from((unnest_qualifier, &new_field))); + let column = Column::from((None, &new_field)); + qualified_columns.push((column, new_field)); + UnnestType::List } DataType::Struct(fields) => { let new_fields = fields.to_vec(); - qualified_columns.extend( - new_fields - .iter() - .map(|f| Column::from((unnest_qualifier, f))), - ); + qualified_columns.extend(new_fields.into_iter().map(|f| { + let column = Column::from((None, &f)); + (column, f) + })); + UnnestType::Struct } _ => { - return Ok(vec![c.clone()]); + return internal_err!( + "trying to unnest on invalid data type {:?}", + unnest_field.data_type() + ); } - } - Ok(qualified_columns) + }; + Ok((qualified_columns, unnest_type)) } /// Create a [`LogicalPlan::Unnest`] plan with options @@ -1584,59 +1600,55 @@ pub fn unnest_with_options( options: UnnestOptions, ) -> Result { // Extract the type of the nested field in the list. - let mut unnested_fields_map: HashMap = - HashMap::with_capacity(columns.len()); + // let mut unnested_fields_map: HashMap = + // HashMap::with_capacity(columns.len()); // Add qualifiers to the columns. 
let mut qualified_columns = Vec::with_capacity(columns.len()); let mut list_columns = Vec::with_capacity(columns.len()); let mut struct_columns = Vec::with_capacity(columns.len()); - for (idx, c) in columns.iter().enumerate() { - let schema_idx = input.schema().index_of_column(c)?; - let (unnest_qualifier, unnest_field) = input.schema().qualified_field(schema_idx); - match unnest_field.data_type() { - DataType::List(field) - | DataType::FixedSizeList(field, _) - | DataType::LargeList(field) => { - let new_field = Arc::new(Field::new( - unnest_field.name(), - field.data_type().clone(), - // Unnesting may produce NULLs even if the list is not null. - // For example: unnset([1], []) -> 1, null - true, - )); - qualified_columns.push(Column::from((unnest_qualifier, &new_field))); - unnested_fields_map.insert(schema_idx, vec![new_field]); - list_columns.push(idx); - } - DataType::Struct(fields) => { - let new_fields = fields.to_vec(); - qualified_columns.extend( - new_fields - .iter() - .map(|f| Column::from((unnest_qualifier, f))), - ); - unnested_fields_map.insert(schema_idx, new_fields); - struct_columns.push(idx); - } - _ => { - // If the unnest field is not a list type return the input plan. 
- return Ok(input); - } - } - } + let column_by_original_index = columns + .iter() + .map(|c| Ok((input.schema().index_of_column(c)?, c))) + .collect::>>()?; let input_schema = input.schema(); + let mut dependency_indices = vec![]; + // Transform input schema into new schema + // e.g int, unnest([]int), unnest(struct(varchar,varchar)) + // becomes int, int, varchar, varchar let fields = input_schema .iter() .enumerate() - .flat_map(|(index, (q, f))| match unnested_fields_map.get(&index) { - Some(unnested_fields) => unnested_fields - .iter() - .map(move |f| (q.cloned(), f.clone())) - .collect(), - None => vec![(q.cloned(), f.clone())], + .map(|(index, (original_qualifier, original_field))| { + match column_by_original_index.get(&index) { + Some(&column_to_unnest) => { + let (flatten_columns, unnest_type) = + projection_after_unnest(column_to_unnest, input_schema)?; + // new columns dependent on the same original index + dependency_indices + .extend(std::iter::repeat(index).take(flatten_columns.len())); + Ok(flatten_columns + .iter() + .map(|col| { + match unnest_type { + UnnestType::List => list_columns.push(index), + UnnestType::Struct => struct_columns.push(index), + } + qualified_columns.push(col.0.to_owned()); + (col.0.relation.to_owned(), col.1.to_owned()) + }) + .collect()) + } + None => { + dependency_indices.push(index); + Ok(vec![(original_qualifier.cloned(), original_field.clone())]) + } + } }) + .collect::>>()? 
+ .into_iter() + .flatten() .collect::>(); let metadata = input_schema.metadata().clone(); @@ -1644,17 +1656,17 @@ pub fn unnest_with_options( // We can use the existing functional dependencies: let deps = input_schema.functional_dependencies().clone(); let schema = Arc::new(df_schema.with_functional_dependencies(deps)?); - Ok( - LogicalPlan::Unnest(Unnest { - input: Arc::new(input), - input_columns: columns, - output_columns: qualified_columns, - list_type_columns: list_columns, - struct_type_columns: struct_columns, - schema, - options, - }) - ) + + Ok(LogicalPlan::Unnest(Unnest { + input: Arc::new(input), + input_columns: columns, + output_columns: qualified_columns, + list_type_columns: list_columns, + struct_type_columns: struct_columns, + dependency_indices, + schema, + options, + })) } #[cfg(test)] @@ -2082,14 +2094,16 @@ mod tests { fn plan_builder_unnest() -> Result<()> { // Unnesting a simple column should return the child plan. let plan = nested_table_scan("test_table")? - .unnest_column("scalar")?.build()?; + .unnest_column("scalar")? + .build()?; let expected = "TableScan: test_table"; assert_eq!(expected, format!("{plan:?}")); // Unnesting the strings list. - let plan = nested_table_scan("test_table")?.unnest_column("strings")? - .build()?; + let plan = nested_table_scan("test_table")? + .unnest_column("strings")? + .build()?; let expected = "\ Unnest: test_table.strings\ @@ -2104,8 +2118,10 @@ mod tests { assert_eq!(&DataType::Utf8, field.data_type()); // Unnesting multiple fields. - let plan = nested_table_scan("test_table")?.unnest_column("strings")? - .unnest_column("structs")?.build()?; + let plan = nested_table_scan("test_table")? + .unnest_column("strings")? + .unnest_column("structs")? 
+ .build()?; let expected = "\ Unnest: test_table.structs\ diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 2a4646d8e9b0..d2bea9d1ffb1 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -813,6 +813,10 @@ impl LogicalPlan { .. }) => { // Update schema with unnested column type. + // TODO: we are not considering new expr rewrite + // If this case happen + // original plan: column1.field1, column1.field2, column2.field3 + // input is optimized so that it only gives the schema of column1 let input = inputs.swap_remove(0); let new_plan = unnest_with_options(input, columns.clone(), options.clone())?; @@ -1688,8 +1692,6 @@ impl Projection { /// produced by the projection operation. If the schema computation is successful, /// the `Result` will contain the schema; otherwise, it will contain an error. pub fn projection_schema(input: &LogicalPlan, exprs: &[Expr]) -> Result> { - // println!("display projection input {}\n---with schema {:?}",input.display_indent_schema(),input.schema()); - // println!("exprs being used {:?}",exprs[0]); let mut schema = DFSchema::new_with_metadata( exprlist_to_fields(exprs, input)?, input.schema().metadata().clone(), @@ -2526,13 +2528,21 @@ pub struct Unnest { /// The incoming logical plan pub input: Arc, pub input_columns: Vec, + // which index of original column in the input is each new column depend on + // e.g original input: (struct_col,list[int]) + // new column after unnest (struct_col.field1, struct_col,field2, int) + // then dependency_map will be {0:0,1:0,2:1} // TODO: this should be converted into output column // instead of the columns in the input pub output_columns: Vec, - /// refer to the indices of field columns that have type list + /// refer to the indices(in the original input schema) of field columns + /// that have type list to run unnest on pub list_type_columns: Vec, - /// refer to the indices of field columns that 
have type struct + /// refer to the indices (in the original input schema) of field columns + /// that have type struct to run unnest on pub struct_type_columns: Vec, + /// for each column in output schema, which column in the input schema it depends on + pub dependency_indices: Vec, /// The output schema, containing the unnested field column. pub schema: DFSchemaRef, /// Options diff --git a/datafusion/expr/src/logical_plan/tree_node.rs b/datafusion/expr/src/logical_plan/tree_node.rs index 9a8e2fba7b9d..5e12decaa77d 100644 --- a/datafusion/expr/src/logical_plan/tree_node.rs +++ b/datafusion/expr/src/logical_plan/tree_node.rs @@ -315,6 +315,7 @@ impl TreeNode for LogicalPlan { output_columns: columns, list_type_columns, struct_type_columns, + dependency_indices, schema, options, }) => rewrite_arc(input, f)?.update_data(|input| { @@ -322,6 +323,7 @@ impl TreeNode for LogicalPlan { input, input_columns, output_columns: columns, + dependency_indices, list_type_columns, struct_type_columns, schema, @@ -498,6 +500,12 @@ impl LogicalPlan { } LogicalPlan::Unnest(unnest) => { let columns = unnest.input_columns.clone(); + // we use output_columns instead of input column + // let output_exprs = unnest + // .output_columns + // .iter() + // .map(|c| Expr::Column(c.clone())) + // .collect::>(); // let mut columns = unnest.get_list_columns(); // columns.extend_from_slice( // unnest.get_struct_columns().as_ref()); diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index caf0882d6a1a..2c1746c1e5f7 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -77,9 +77,6 @@ fn analyze_internal( plan: &LogicalPlan, ) -> Result { // optimize child plans first - println!("\n==============analyzing_internal begins========================"); - println!("external schema: {:?}", external_schema); - println!("current plan\n{:?}\nrecursively transforming input", 
plan); let new_inputs = plan .inputs() .iter() @@ -106,25 +103,17 @@ fn analyze_internal( schema: Arc::new(schema), }; - println!("===|rewriting children expressions"); - let new_expr = plan .expressions() .into_iter() .map(|expr| { // ensure aggregate names don't change: // https://github.com/apache/datafusion/issues/3555 - let new_expr = rewrite_preserving_name(expr.clone(), &mut expr_rewrite); - let ret = new_expr?; - println!("=====|{:?} => {:?}",expr, ret); - Ok(ret) + rewrite_preserving_name(expr.clone(), &mut expr_rewrite) }) .collect::>>()?; - - let new_plan = plan.with_new_exprs(new_expr, new_inputs)?; - println!("===|New plan:\n{:?}\n\n===============", new_plan); - Ok(new_plan) + plan.with_new_exprs(new_expr, new_inputs) } pub(crate) struct TypeCoercionRewriter { diff --git a/datafusion/optimizer/src/optimize_projections/mod.rs b/datafusion/optimizer/src/optimize_projections/mod.rs index 7f9f2b530abd..c0c3d5d9de81 100644 --- a/datafusion/optimizer/src/optimize_projections/mod.rs +++ b/datafusion/optimizer/src/optimize_projections/mod.rs @@ -29,6 +29,7 @@ use datafusion_common::{ get_required_group_by_exprs_indices, internal_err, Column, JoinType, Result, }; use datafusion_expr::expr::{Alias, ScalarFunction}; +use datafusion_expr::Unnest; use datafusion_expr::{ logical_plan::LogicalPlan, projection_schema, Aggregate, BinaryExpr, Cast, Distinct, Expr, Projection, TableScan, Window, @@ -109,22 +110,9 @@ fn optimize_projections( indices: RequiredIndicies, ) -> Result> { let child_required_indices: Vec = match plan { - LogicalPlan::Unnest(_) => { - println!("{:?}",indices); - // Pass index requirements from the parent as well as column indices - // that appear in this plan's expressions to its child. All these - // operators benefit from "small" inputs, so the projection_beneficial - // flag is `true`. 
- vec![RequiredIndicies::new_from_indices(vec![0,0])] - // plan.inputs() - // .into_iter() - // .map(|input| { - // indices - // .clone() - // .with_projection_beneficial() - // .with_plan_exprs(plan, input.schema()) - // }) - // .collect::>()? + LogicalPlan::Unnest(Unnest{dependency_indices,..}) => { + println!("{:?}",dependency_indices); + vec![RequiredIndicies::new_from_indices(dependency_indices.clone())] } LogicalPlan::Sort(_) | LogicalPlan::Filter(_) @@ -402,13 +390,14 @@ fn optimize_projections( // If new_input is `None`, this means child is not changed, so use // `old_child` during construction: .map(|(new_input, old_child)| new_input.unwrap_or_else(|| old_child.clone())) - .collect(); + .collect::>(); + let exprs = plan.expressions(); plan.with_new_exprs(exprs, new_inputs).map(Some) } } -/// Merges consecutive projections. +/// Merges consecutive projectionsDFSchema { inner: Schema { fields: [Field { name: "name0", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "name1", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }], metadata: {} }, field_qualifiers: [None, None], functional_dependencies: FunctionalDependencies { deps: [] } }. /// /// Given a projection `proj`, this function attempts to merge it with a previous /// projection if it exists and if merging is beneficial. Merging is considered @@ -445,6 +434,7 @@ fn merge_consecutive_projections(proj: &Projection) -> Result // projections will benefit from a compute-once approach. For details, see: // https://github.com/apache/datafusion/issues/8296 if column_referral_map.into_iter().any(|(col, usage)| { + let a = prev_projection.schema.index_of_column(&col); usage > 1 && !is_expr_trivial( &prev_projection.expr @@ -743,6 +733,10 @@ fn rewrite_projection_given_requirements( return if let Some(input) = optimize_projections(&proj.input, config, required_indices)? 
{ + if let Err(e) = is_projection_unnecessary(&input, &exprs_used) { + println!("==========\n\nerror checking if projection is necessary, input\n{:?}\nwith schema\n{:?}\n project exprs {:?}", + input,input.schema(),exprs_used); + } if is_projection_unnecessary(&input, &exprs_used)? { Ok(Some(input)) } else { diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs index 7d8918a2f6a5..2acf6c4ba7e5 100644 --- a/datafusion/sql/src/select.rs +++ b/datafusion/sql/src/select.rs @@ -312,23 +312,28 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // For expr at root level, we need to check if it is an unnest on struct // column to transform the expr unnest(struct(field1, field2)) // into struct.field1, struct.field2 - if let Expr::Unnest(Unnest { expr: ref arg }) = expr { + if let Expr::Unnest(Unnest { expr: ref arg }) = expr { // will need to be rewritten into (struct_col.field1, struct_col, //field2 ...) let column_name = expr.display_name()?; - let column = Column::from_name(column_name); + let col_in_unnest = match arg.try_into_col() { + Ok(col) => col, + Err(err) => return internal_err!("failed to convert expression inside unnest into column {},unnest only support expression representing column for now", + err) + }; unnest_columns.push(column_name.clone()); // Add alias for the argument expression, to avoid naming conflicts // with other expressions // in the select list. For example: `select unnest(col1), col1 from t`. 
+ // this extra projection is used to unnest transforming inner_projection_exprs.push(arg.clone().alias(column_name.clone())); let schema = input.schema(); - let outer_projection_columns = projection_after_unnest( - &column, schema)?; + let (outer_projection_columns ,_)= projection_after_unnest( + &col_in_unnest, schema)?; let expr = outer_projection_columns .iter() - .map(|col| Expr::Column(col.clone())) + .map(|col| Expr::Column(col.0.clone())) .collect::>(); return Ok(expr); } @@ -342,12 +347,12 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // will need to be rewritten into (struct_col.field1, struct_col,field2 ...) let column_name = expr.display_name()?; unnest_columns.push(column_name.clone()); - let col = Column::from_name(column_name); + let col = Column::from_name(&column_name); // Add alias for the argument expression, to avoid naming conflicts with other expressions // in the select list. For example: `select unnest(col1), col1 from t`. let schema = input.schema(); if let DataType::Struct(_)= schema.data_type(&col)? { - internal_err!("unnest on struct can ony be applied at the root level of select expression") + return internal_err!("unnest on struct can ony be applied at the root level of select expression"); } inner_projection_exprs @@ -376,7 +381,6 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { .collect::>>()? .into_iter().flatten().collect(); - // Do the final projection if unnest_columns.is_empty() { LogicalPlanBuilder::from(input) @@ -388,8 +392,8 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let unnest_options = UnnestOptions::new().with_preserve_nulls(false); let plan = LogicalPlanBuilder::from(input) .project(inner_projection_exprs)? - .unnest_columns_with_options(columns, unnest_options)? 
- .project(outer_projection_exprs)?.build(); + .unnest_columns_with_options(columns, unnest_options)?; + let plan = plan.project(outer_projection_exprs)?.build(); plan } } diff --git a/datafusion/sqllogictest/test_files/unnest_debug.slt b/datafusion/sqllogictest/test_files/unnest_debug.slt index 7a4e9ddc95a4..a126fdf1a86a 100644 --- a/datafusion/sqllogictest/test_files/unnest_debug.slt +++ b/datafusion/sqllogictest/test_files/unnest_debug.slt @@ -22,11 +22,12 @@ statement ok CREATE TABLE unnest_table AS VALUES - (struct(1 as "name0", 2 as "name1")) + (struct(1 as "name0", 2 as "name1"),[1,2,3]) ; -query II -select unnest(column1), column1 from unnest_table; + +query II? +select unnest(column1), column1, unnest(column2) from unnest_table; ---- -1 2 +1 2 {name0: 1, name1: 2} From dd1dacede912a82df50a6299459d174dbd2e3871 Mon Sep 17 00:00:00 2001 From: Duong Cong Toai Date: Wed, 15 May 2024 23:26:31 +0200 Subject: [PATCH 05/21] chore: complete impl --- datafusion/core/src/physical_planner.rs | 23 +++---- datafusion/expr/src/logical_plan/builder.rs | 35 ++++++----- datafusion/expr/src/logical_plan/plan.rs | 2 +- datafusion/physical-plan/src/unnest.rs | 23 ++++--- datafusion/sql/src/select.rs | 61 ++++++++++++------- .../sqllogictest/test_files/unnest_debug.slt | 11 +++- 6 files changed, 95 insertions(+), 60 deletions(-) diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 8678cda615a7..9e156b568a02 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -49,7 +49,7 @@ use crate::physical_plan::aggregates::{AggregateExec, AggregateMode, PhysicalGro use crate::physical_plan::analyze::AnalyzeExec; use crate::physical_plan::empty::EmptyExec; use crate::physical_plan::explain::ExplainExec; -use crate::physical_plan::expressions::{Column, PhysicalSortExpr}; +use crate::physical_plan::expressions::PhysicalSortExpr; use crate::physical_plan::filter::FilterExec; use 
crate::physical_plan::joins::utils as join_utils; use crate::physical_plan::joins::{ @@ -1157,7 +1157,7 @@ impl DefaultPhysicalPlanner { Arc::new(GlobalLimitExec::new(input, *skip, *fetch)) } LogicalPlan::Unnest(Unnest { - output_columns: columns, + // output_columns: columns, list_type_columns, struct_type_columns, schema, @@ -1165,14 +1165,14 @@ impl DefaultPhysicalPlanner { .. }) => { let input = children.one()?; - let list_column_exec = list_type_columns - .iter() - .map(|idx| { - let column = &columns[*idx]; - let schema_idx = schema.index_of_column(column)?; - Ok(Column::new(&column.name, schema_idx)) - }) - .collect::>()?; + // let list_column_exec = list_type_columns + // .iter() + // .map(|idx| { + // let column = &columns[*idx]; + // let schema_idx = schema.index_of_column(column)?; + // Ok(Column::new(&column.name, schema_idx)) + // }) + // .collect::>()?; let struct_columns_set: HashSet = struct_type_columns.iter().copied().collect(); @@ -1183,7 +1183,8 @@ impl DefaultPhysicalPlanner { let schema = SchemaRef::new(schema.as_ref().to_owned().into()); Arc::new(UnnestExec::new( input, - list_column_exec, + list_type_columns.clone(), + // list_column_exec, struct_columns_set, schema, options.clone(), diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 68fe0bfe1bab..2162a429d217 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -1553,40 +1553,45 @@ pub enum UnnestType { } pub fn projection_after_unnest( - c: &Column, - schema: &DFSchemaRef, + // c: &Column, + col_name: &String, + data_type: &DataType, ) -> Result<(Vec<(Column, Arc)>, UnnestType)> { let mut qualified_columns = Vec::with_capacity(1); - let schema_idx = schema.index_of_column(c)?; - let (_, unnest_field) = schema.qualified_field(schema_idx); - let unnest_type = match unnest_field.data_type() { + // let schema_idx: usize = schema.index_of_column(c)?; + // let (_, unnest_field) = 
schema.qualified_field(schema_idx); + let unnest_type = match data_type { DataType::List(field) | DataType::FixedSizeList(field, _) | DataType::LargeList(field) => { + let name = format!("{}.element", col_name); let new_field = Arc::new(Field::new( // TODO: append field name by original name + .fieldname - unnest_field.name(), + name.clone(), field.data_type().clone(), // Unnesting may produce NULLs even if the list is not null. // For example: unnset([1], []) -> 1, null true, )); - let column = Column::from((None, &new_field)); + let column = Column::from_name(name); + // let column = Column::from((None, &new_field)); qualified_columns.push((column, new_field)); UnnestType::List } DataType::Struct(fields) => { - let new_fields = fields.to_vec(); - qualified_columns.extend(new_fields.into_iter().map(|f| { - let column = Column::from((None, &f)); - (column, f) + qualified_columns.extend(fields.iter().map(|f| { + let new_name = format!("{}.{}", col_name, f.name()); + let column = Column::from_name(&new_name); + let new_field = f.as_ref().clone().with_name(new_name); + // let column = Column::from((None, &f)); + (column, Arc::new(new_field)) })); UnnestType::Struct } _ => { return internal_err!( "trying to unnest on invalid data type {:?}", - unnest_field.data_type() + data_type ); } }; @@ -1623,8 +1628,10 @@ pub fn unnest_with_options( .map(|(index, (original_qualifier, original_field))| { match column_by_original_index.get(&index) { Some(&column_to_unnest) => { - let (flatten_columns, unnest_type) = - projection_after_unnest(column_to_unnest, input_schema)?; + let (flatten_columns, unnest_type) = projection_after_unnest( + &column_to_unnest.name, + original_field.data_type(), + )?; // new columns dependent on the same original index dependency_indices .extend(std::iter::repeat(index).take(flatten_columns.len())); diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index d2bea9d1ffb1..bd5a181c57f5 100644 --- 
a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -1553,7 +1553,7 @@ impl LogicalPlan { LogicalPlan::Unnest(Unnest { list_type_columns, struct_type_columns, .. }) => { - write!(f, "Unnest: {}{}", expr_vec_fmt!(list_type_columns),expr_vec_fmt!(struct_type_columns)) + write!(f, "Unnest: TODO:i'm ugly{}{}", expr_vec_fmt!(list_type_columns),expr_vec_fmt!(struct_type_columns)) } } } diff --git a/datafusion/physical-plan/src/unnest.rs b/datafusion/physical-plan/src/unnest.rs index 5715fdf31a29..8ce789afea93 100644 --- a/datafusion/physical-plan/src/unnest.rs +++ b/datafusion/physical-plan/src/unnest.rs @@ -23,8 +23,8 @@ use std::{any::Any, sync::Arc}; use super::metrics::{self, ExecutionPlanMetricsSet, MetricBuilder, MetricsSet}; use super::{DisplayAs, ExecutionPlanProperties, PlanProperties}; use crate::{ - expressions::Column, DisplayFormatType, Distribution, ExecutionPlan, PhysicalExpr, - RecordBatchStream, SendableRecordBatchStream, + DisplayFormatType, Distribution, ExecutionPlan, RecordBatchStream, + SendableRecordBatchStream, }; use arrow::array::{ @@ -40,6 +40,7 @@ use arrow_array::{Int64Array, Scalar, StructArray}; use arrow_ord::cmp::lt; use datafusion_common::{exec_datafusion_err, exec_err, Result, UnnestOptions}; use datafusion_execution::TaskContext; +use datafusion_expr::ColumnarValue; use datafusion_physical_expr::EquivalenceProperties; use async_trait::async_trait; @@ -59,7 +60,7 @@ pub struct UnnestExec { /// The schema once the unnest is applied schema: SchemaRef, /// The unnest list type columns - list_type_columns: Vec, + list_type_columns: Vec, /// struct_column_indices: HashSet, /// Options @@ -74,7 +75,7 @@ impl UnnestExec { /// Create a new [UnnestExec]. 
pub fn new( input: Arc, - list_type_columns: Vec, + list_type_columns: Vec, struct_column_indices: HashSet, schema: SchemaRef, options: UnnestOptions, @@ -223,7 +224,7 @@ struct UnnestStream { /// Unnested schema schema: Arc, /// The unnest columns - list_type_columns: Vec, + list_type_columns: Vec, /// struct_column_indices: HashSet, /// Options @@ -330,7 +331,7 @@ fn build_batch( batch: &RecordBatch, schema: &SchemaRef, // list type column only - list_type_columns: &[Column], + list_type_columns: &[usize], struct_column_indices: &HashSet, options: &UnnestOptions, ) -> Result { @@ -339,7 +340,10 @@ fn build_batch( _ => { let list_arrays: Vec = list_type_columns .iter() - .map(|column| column.evaluate(batch)?.into_array(batch.num_rows())) + .map(|index| { + ColumnarValue::Array(batch.column(*index).clone()) + .into_array(batch.num_rows()) + }) .collect::>()?; let longest_length = find_longest_length(&list_arrays, options)?; @@ -361,15 +365,14 @@ fn build_batch( let unnested_array_map: HashMap<_, _> = unnested_arrays .into_iter() .zip(list_type_columns.iter()) - .map(|(array, column)| (column.index(), array)) + .map(|(array, column)| (*column, array)) .collect(); // Create the take indices array for other columns let take_indicies = create_take_indicies(unnested_length, total_length); // vertical expansion because of list unnest - let ret = batch_from_indices(batch, - &unnested_array_map, &take_indicies)?; + let ret = batch_from_indices(batch, &unnested_array_map, &take_indicies)?; build_batch_vertically(&ret, schema, struct_column_indices) } }; diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs index 2acf6c4ba7e5..c524fc9eb619 100644 --- a/datafusion/sql/src/select.rs +++ b/datafusion/sql/src/select.rs @@ -26,9 +26,7 @@ use crate::utils::{ use arrow_schema::DataType; use datafusion_common::tree_node::{Transformed, TreeNode}; -use datafusion_common::{ - internal_err, not_impl_err, plan_err, DataFusionError, ExprSchema, Result, -}; +use 
datafusion_common::{internal_err, not_impl_err, plan_err, DataFusionError, Result}; use datafusion_common::{Column, UnnestOptions}; use datafusion_expr::builder::projection_after_unnest; use datafusion_expr::expr::{Alias, Unnest}; @@ -40,7 +38,8 @@ use datafusion_expr::utils::{ find_aggregate_exprs, find_window_exprs, }; use datafusion_expr::{ - Expr, Filter, GroupingSet, LogicalPlan, LogicalPlanBuilder, Partitioning, + Expr, ExprSchemable, Filter, GroupingSet, LogicalPlan, LogicalPlanBuilder, + Partitioning, }; use sqlparser::ast::{ Distinct, Expr as SQLExpr, GroupByExpr, OrderByExpr, ReplaceSelectItem, @@ -316,11 +315,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // will need to be rewritten into (struct_col.field1, struct_col, //field2 ...) let column_name = expr.display_name()?; - let col_in_unnest = match arg.try_into_col() { - Ok(col) => col, - Err(err) => return internal_err!("failed to convert expression inside unnest into column {},unnest only support expression representing column for now", - err) - }; + unnest_columns.push(column_name.clone()); // Add alias for the argument expression, to avoid naming conflicts // with other expressions @@ -329,8 +324,10 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { inner_projection_exprs.push(arg.clone().alias(column_name.clone())); let schema = input.schema(); + let (data_type,_) = expr.data_type_and_nullable(schema)?; + let (outer_projection_columns ,_)= projection_after_unnest( - &col_in_unnest, schema)?; + &column_name, &data_type)?; let expr = outer_projection_columns .iter() .map(|col| Expr::Column(col.0.clone())) @@ -343,21 +340,34 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { tnr: _, } = expr.transform_up(|expr: Expr| { if let Expr::Unnest(Unnest { expr: ref arg }) = expr { - // TODO: an expr like (unnest(struct_col)) - // will need to be rewritten into (struct_col.field1, struct_col,field2 ...) 
- let column_name = expr.display_name()?; - unnest_columns.push(column_name.clone()); - let col = Column::from_name(&column_name); - // Add alias for the argument expression, to avoid naming conflicts with other expressions - // in the select list. For example: `select unnest(col1), col1 from t`. + // This column is the copy of the column that will be unnested + // it will be replaced by the unnested columns + let intermediate_col_name = expr.display_name()?; + unnest_columns.push(intermediate_col_name.clone()); let schema = input.schema(); - if let DataType::Struct(_)= schema.data_type(&col)? { + let (data_type,_) = arg.data_type_and_nullable(schema)?; + + // TODO: things like unnest(struct_col) as alias/unnest(struct_col) > 0 does not make sense + // or do they? + if let DataType::Struct(_)= data_type { return internal_err!("unnest on struct can ony be applied at the root level of select expression"); } + // because unnest logical plan will call the same function + // to transform the column_name into new column(s) + let (outer_projection_columns ,_)= projection_after_unnest( + &intermediate_col_name, &data_type)?; + let unnested_child_exprs = outer_projection_columns + .iter() + .map(|col| Expr::Column(col.0.clone())) + .collect::>(); + + // Add alias for the argument expression, to avoid naming conflicts with other expressions + // in the select list. For example: `select unnest(col1), col1 from t`. 
+ // TODO: this does not work with case like unnest(col1) + unnest(col1) inner_projection_exprs - .push(arg.clone().alias(column_name.clone())); - Ok(Transformed::yes(Expr::Column(col))) + .push(arg.clone().alias(intermediate_col_name.clone())); + Ok(Transformed::yes(unnested_child_exprs[0].clone())) } else { Ok(Transformed::no(expr)) } @@ -391,8 +401,17 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // Set preserve_nulls to false to ensure compatibility with DuckDB and PostgreSQL let unnest_options = UnnestOptions::new().with_preserve_nulls(false); let plan = LogicalPlanBuilder::from(input) - .project(inner_projection_exprs)? + .project(inner_projection_exprs.clone())? .unnest_columns_with_options(columns, unnest_options)?; + println!("unnesting over columns {:?}", inner_projection_exprs); + println!( + "running projection fields {:?}\non plan\n{:?}\n + with schema {:?}", + outer_projection_exprs, + plan, + plan.schema(), + ); + // plan.build() let plan = plan.project(outer_projection_exprs)?.build(); plan } diff --git a/datafusion/sqllogictest/test_files/unnest_debug.slt b/datafusion/sqllogictest/test_files/unnest_debug.slt index a126fdf1a86a..871a53a59dee 100644 --- a/datafusion/sqllogictest/test_files/unnest_debug.slt +++ b/datafusion/sqllogictest/test_files/unnest_debug.slt @@ -27,7 +27,12 @@ AS VALUES -query II? 
-select unnest(column1), column1, unnest(column2) from unnest_table; +query III +select unnest(column1), unnest(column2) + (unnest(column2) as alias) as some_alias from unnest_table; ---- -1 2 {name0: 1, name1: 2} +1 2 2 +1 2 3 +1 2 4 + + + From 70af3834db6360d802590f1c7a0b47d8198b005e Mon Sep 17 00:00:00 2001 From: Duong Cong Toai Date: Thu, 16 May 2024 19:56:17 +0200 Subject: [PATCH 06/21] chore: clean garbage --- datafusion/expr/src/expr_schema.rs | 26 +-- datafusion/expr/src/logical_plan/builder.rs | 33 +-- datafusion/sql/src/select.rs | 208 ++++++++++-------- .../sqllogictest/test_files/unnest_debug.slt | 2 +- 4 files changed, 127 insertions(+), 142 deletions(-) diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index 6bd002d6093a..285f0491b8f0 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -123,7 +123,8 @@ impl ExprSchemable for Expr { Ok(field.data_type().clone()) } DataType::Struct(_) => { - Ok(arg_data_type.clone()) + // TODO: this is not correct, because unnest(struct) wll result into multiple data_type + Ok(arg_data_type) } DataType::Null => { not_impl_err!("unnest() does not support null yet") @@ -422,9 +423,6 @@ impl ExprSchemable for Expr { match self { Expr::Column(c) => { let (data_type, nullable) = self.data_type_and_nullable(input_schema)?; - if let Err(_) = self.metadata(input_schema) { - panic!("here"); - } Ok(( c.relation.clone(), Field::new(&c.name, data_type, nullable) @@ -434,9 +432,6 @@ impl ExprSchemable for Expr { } Expr::Alias(Alias { relation, name, .. 
}) => { let (data_type, nullable) = self.data_type_and_nullable(input_schema)?; - if let Err(_) = self.metadata(input_schema) { - panic!("here"); - } Ok(( relation.clone(), Field::new(name, data_type, nullable) @@ -444,25 +439,8 @@ impl ExprSchemable for Expr { .into(), )) } - Expr::Unnest(Unnest { expr }) => { - let st = self.data_type_and_nullable(input_schema); - let (data_type, nullable) = self.data_type_and_nullable(input_schema)?; - if let Err(_) = st { - panic!("here"); - } - Ok(( - None, - Field::new(self.display_name()?, data_type, nullable) - .with_metadata(self.metadata(input_schema)?) - .into(), - )) - } _ => { - let st = self.data_type_and_nullable(input_schema); let (data_type, nullable) = self.data_type_and_nullable(input_schema)?; - if let Err(_) = self.metadata(input_schema) { - panic!("here"); - } Ok(( None, Field::new(self.display_name()?, data_type, nullable) diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 2162a429d217..cce89b2b2877 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -1547,20 +1547,14 @@ impl TableSource for LogicalTableSource { pub fn unnest(input: LogicalPlan, columns: Vec) -> Result { unnest_with_options(input, columns, UnnestOptions::default()) } -pub enum UnnestType { - Struct, - List, -} -pub fn projection_after_unnest( - // c: &Column, +pub fn get_unnested_columns( col_name: &String, data_type: &DataType, -) -> Result<(Vec<(Column, Arc)>, UnnestType)> { +) -> Result)>> { let mut qualified_columns = Vec::with_capacity(1); - // let schema_idx: usize = schema.index_of_column(c)?; - // let (_, unnest_field) = schema.qualified_field(schema_idx); - let unnest_type = match data_type { + + match data_type { DataType::List(field) | DataType::FixedSizeList(field, _) | DataType::LargeList(field) => { @@ -1576,7 +1570,6 @@ pub fn projection_after_unnest( let column = Column::from_name(name); // let column = 
Column::from((None, &new_field)); qualified_columns.push((column, new_field)); - UnnestType::List } DataType::Struct(fields) => { qualified_columns.extend(fields.iter().map(|f| { @@ -1585,8 +1578,7 @@ pub fn projection_after_unnest( let new_field = f.as_ref().clone().with_name(new_name); // let column = Column::from((None, &f)); (column, Arc::new(new_field)) - })); - UnnestType::Struct + })) } _ => { return internal_err!( @@ -1595,7 +1587,7 @@ pub fn projection_after_unnest( ); } }; - Ok((qualified_columns, unnest_type)) + Ok(qualified_columns) } /// Create a [`LogicalPlan::Unnest`] plan with options @@ -1604,10 +1596,6 @@ pub fn unnest_with_options( columns: Vec, options: UnnestOptions, ) -> Result { - // Extract the type of the nested field in the list. - // let mut unnested_fields_map: HashMap = - // HashMap::with_capacity(columns.len()); - // Add qualifiers to the columns. let mut qualified_columns = Vec::with_capacity(columns.len()); let mut list_columns = Vec::with_capacity(columns.len()); let mut struct_columns = Vec::with_capacity(columns.len()); @@ -1628,7 +1616,7 @@ pub fn unnest_with_options( .map(|(index, (original_qualifier, original_field))| { match column_by_original_index.get(&index) { Some(&column_to_unnest) => { - let (flatten_columns, unnest_type) = projection_after_unnest( + let flatten_columns = get_unnested_columns( &column_to_unnest.name, original_field.data_type(), )?; @@ -1638,9 +1626,10 @@ pub fn unnest_with_options( Ok(flatten_columns .iter() .map(|col| { - match unnest_type { - UnnestType::List => list_columns.push(index), - UnnestType::Struct => struct_columns.push(index), + match original_field.data_type() { + DataType::List(_) => list_columns.push(index), + DataType::Struct(_) => struct_columns.push(index), + _ => {panic!("not reachable, should be caught by get_unnested_columns")} } qualified_columns.push(col.0.to_owned()); (col.0.relation.to_owned(), col.1.to_owned()) diff --git a/datafusion/sql/src/select.rs 
b/datafusion/sql/src/select.rs index c524fc9eb619..c7ac6fcc8e27 100644 --- a/datafusion/sql/src/select.rs +++ b/datafusion/sql/src/select.rs @@ -28,7 +28,7 @@ use arrow_schema::DataType; use datafusion_common::tree_node::{Transformed, TreeNode}; use datafusion_common::{internal_err, not_impl_err, plan_err, DataFusionError, Result}; use datafusion_common::{Column, UnnestOptions}; -use datafusion_expr::builder::projection_after_unnest; +use datafusion_expr::builder::get_unnested_columns; use datafusion_expr::expr::{Alias, Unnest}; use datafusion_expr::expr_rewriter::{ normalize_col, normalize_col_with_schemas_and_ambiguity_check, normalize_cols, @@ -294,6 +294,100 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { Ok(plan) } + fn recursive_transform_unnest_expr( + &self, + input: &LogicalPlan, + unnest_columns: &mut Vec, + inner_projection_exprs: &mut Vec, + expr: Expr, + ) -> Result> { + // expr returned here maybe different from the originals in inner_projection_exprs + // for example: + // - unnest(struct_col) will be transformed into unnest(struct_col).field1, unnest(struct_col).field2 + // - unnest(array_col) will be transformed into unnest(array_col).element + // - unnest(array_col) + 1 will be transformed into unnest(array_col).element +1 + + // Specifically handle root level unnest expr, this is the only place + // unnest(struct_col) can be handled + if let Expr::Unnest(Unnest { expr: ref arg }) = expr { + // will need to be rewritten into (struct_col.field1, struct_col, + //field2 ...) + let column_name = expr.display_name()?; + + unnest_columns.push(column_name.clone()); + // Add alias for the argument expression, to avoid naming conflicts + // with other expressions + // in the select list. For example: `select unnest(col1), col1 from t`. 
+ // this extra projection is used to unnest transforming + inner_projection_exprs.push(arg.clone().alias(column_name.clone())); + let schema = input.schema(); + + let (data_type, _) = expr.data_type_and_nullable(schema)?; + + let outer_projection_columns = + get_unnested_columns(&column_name, &data_type)?; + let expr = outer_projection_columns + .iter() + .map(|col| Expr::Column(col.0.clone())) + .collect::>(); + return Ok(expr); + } + let Transformed { + data: transformed_expr, + transformed, + tnr: _, + } = expr.transform_up(|expr: Expr| { + if let Expr::Unnest(Unnest { expr: ref arg }) = expr { + // This column is the copy of the column that will be unnested + // it will be replaced by the unnested columns + let intermediate_col_name = expr.display_name()?; + unnest_columns.push(intermediate_col_name.clone()); + let schema = input.schema(); + let (data_type, _) = arg.data_type_and_nullable(schema)?; + + // TODO: things like unnest(struct_col) as alias/unnest(struct_col) > 0 + // does not make sense + // or do they? + if let DataType::Struct(_) = data_type { + return internal_err!("unnest on struct can ony be applied at the root level of select expression"); + } + // because unnest logical plan will call the same function + // to transform the column_name into new column(s) + let outer_projection_columns = + get_unnested_columns(&intermediate_col_name, &data_type)?; + let unnested_child_exprs = outer_projection_columns + .iter() + .map(|col| Expr::Column(col.0.clone())) + .collect::>(); + + // Add alias for the argument expression, to avoid naming conflicts with other expressions + // in the select list. For example: `select unnest(col1), col1 from t`. 
+ // TODO: this does not work with case like unnest(col1) + unnest(col1) + inner_projection_exprs + .push(arg.clone().alias(intermediate_col_name.clone())); + // because unnest on list expr will only transform into 1 expr + Ok(Transformed::yes(unnested_child_exprs[0].clone())) + } else { + Ok(Transformed::no(expr)) + } + })?; + + if !transformed { + if matches!(&transformed_expr, Expr::Column(_)) { + inner_projection_exprs.push(transformed_expr.clone()); + Ok(vec![transformed_expr]) + } else { + // We need to evaluate the expr in the inner projection, + // outer projection just select its name + let column_name = transformed_expr.display_name()?; + inner_projection_exprs.push(transformed_expr); + Ok(vec![Expr::Column(Column::from_name(column_name))]) + } + } else { + Ok(vec![transformed_expr]) + } + } + /// Try converting Expr(Unnest(Expr)) to Projection/Unnest/Projection pub(super) fn try_process_unnest( &self, @@ -305,91 +399,25 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // including non unnest column and unnest column let mut inner_projection_exprs = vec![]; - let outer_projection_exprs:Vec = select_exprs + // expr returned here maybe different from the originals in inner_projection_exprs + // for example: + // - unnest(struct_col) will be transformed into unnest(struct_col).field1, unnest(struct_col).field2 + // - unnest(array_col) will be transformed into unnest(array_col).element + // - unnest(array_col) + 1 will be transformed into unnest(array_col).element +1 + let outer_projection_exprs: Vec = select_exprs .into_iter() .map(|expr| { - // For expr at root level, we need to check if it is an unnest on struct - // column to transform the expr unnest(struct(field1, field2)) - // into struct.field1, struct.field2 - if let Expr::Unnest(Unnest { expr: ref arg }) = expr { - // will need to be rewritten into (struct_col.field1, struct_col, - //field2 ...) 
- let column_name = expr.display_name()?; - - unnest_columns.push(column_name.clone()); - // Add alias for the argument expression, to avoid naming conflicts - // with other expressions - // in the select list. For example: `select unnest(col1), col1 from t`. - // this extra projection is used to unnest transforming - inner_projection_exprs.push(arg.clone().alias(column_name.clone())); - let schema = input.schema(); - - let (data_type,_) = expr.data_type_and_nullable(schema)?; - - let (outer_projection_columns ,_)= projection_after_unnest( - &column_name, &data_type)?; - let expr = outer_projection_columns - .iter() - .map(|col| Expr::Column(col.0.clone())) - .collect::>(); - return Ok(expr); - } - let Transformed { - data: transformed_expr, - transformed, - tnr: _, - } = expr.transform_up(|expr: Expr| { - if let Expr::Unnest(Unnest { expr: ref arg }) = expr { - // This column is the copy of the column that will be unnested - // it will be replaced by the unnested columns - let intermediate_col_name = expr.display_name()?; - unnest_columns.push(intermediate_col_name.clone()); - let schema = input.schema(); - let (data_type,_) = arg.data_type_and_nullable(schema)?; - - // TODO: things like unnest(struct_col) as alias/unnest(struct_col) > 0 does not make sense - // or do they? - if let DataType::Struct(_)= data_type { - return internal_err!("unnest on struct can ony be applied at the root level of select expression"); - } - // because unnest logical plan will call the same function - // to transform the column_name into new column(s) - let (outer_projection_columns ,_)= projection_after_unnest( - &intermediate_col_name, &data_type)?; - let unnested_child_exprs = outer_projection_columns - .iter() - .map(|col| Expr::Column(col.0.clone())) - .collect::>(); - - - // Add alias for the argument expression, to avoid naming conflicts with other expressions - // in the select list. For example: `select unnest(col1), col1 from t`. 
- // TODO: this does not work with case like unnest(col1) + unnest(col1) - inner_projection_exprs - .push(arg.clone().alias(intermediate_col_name.clone())); - Ok(Transformed::yes(unnested_child_exprs[0].clone())) - } else { - Ok(Transformed::no(expr)) - } - })?; - - if !transformed { - if matches!(&transformed_expr, Expr::Column(_)) { - inner_projection_exprs.push(transformed_expr.clone()); - Ok(vec![transformed_expr]) - } else { - // We need to evaluate the expr in the inner projection, - // outer projection just select its name - let column_name = transformed_expr.display_name()?; - inner_projection_exprs.push(transformed_expr); - Ok(vec![Expr::Column(Column::from_name(column_name))]) - } - } else { - Ok(vec![transformed_expr]) - } + self.recursive_transform_unnest_expr( + &input, + &mut unnest_columns, + &mut inner_projection_exprs, + expr, + ) }) .collect::>>()? - .into_iter().flatten().collect(); + .into_iter() + .flatten() + .collect(); // Do the final projection if unnest_columns.is_empty() { @@ -400,20 +428,10 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let columns = unnest_columns.into_iter().map(|col| col.into()).collect(); // Set preserve_nulls to false to ensure compatibility with DuckDB and PostgreSQL let unnest_options = UnnestOptions::new().with_preserve_nulls(false); - let plan = LogicalPlanBuilder::from(input) + LogicalPlanBuilder::from(input) .project(inner_projection_exprs.clone())? - .unnest_columns_with_options(columns, unnest_options)?; - println!("unnesting over columns {:?}", inner_projection_exprs); - println!( - "running projection fields {:?}\non plan\n{:?}\n - with schema {:?}", - outer_projection_exprs, - plan, - plan.schema(), - ); - // plan.build() - let plan = plan.project(outer_projection_exprs)?.build(); - plan + .unnest_columns_with_options(columns, unnest_options)? 
+ .project(outer_projection_exprs)?.build() } } diff --git a/datafusion/sqllogictest/test_files/unnest_debug.slt b/datafusion/sqllogictest/test_files/unnest_debug.slt index 871a53a59dee..da2dab56457d 100644 --- a/datafusion/sqllogictest/test_files/unnest_debug.slt +++ b/datafusion/sqllogictest/test_files/unnest_debug.slt @@ -28,7 +28,7 @@ AS VALUES query III -select unnest(column1), unnest(column2) + (unnest(column2) as alias) as some_alias from unnest_table; +select unnest(column1), unnest(column2) as some_alias from unnest_table; ---- 1 2 2 1 2 3 From ebb8558b68a5bdea97dc29d2c7c44cb5eb1cb94c Mon Sep 17 00:00:00 2001 From: Duong Cong Toai Date: Thu, 16 May 2024 20:48:38 +0200 Subject: [PATCH 07/21] chore: more test --- datafusion/expr/src/logical_plan/builder.rs | 4 +- datafusion/sql/src/select.rs | 97 ++++++++----------- datafusion/sqllogictest/test_files/unnest.slt | 35 +++++-- .../sqllogictest/test_files/unnest_debug.slt | 38 -------- 4 files changed, 71 insertions(+), 103 deletions(-) delete mode 100644 datafusion/sqllogictest/test_files/unnest_debug.slt diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index cce89b2b2877..85fa27e3c570 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -1627,7 +1627,9 @@ pub fn unnest_with_options( .iter() .map(|col| { match original_field.data_type() { - DataType::List(_) => list_columns.push(index), + DataType::List(_) + | DataType::FixedSizeList(_,_) + | DataType::LargeList(_) => list_columns.push(index), DataType::Struct(_) => struct_columns.push(index), _ => {panic!("not reachable, should be caught by get_unnested_columns")} } diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs index c7ac6fcc8e27..926f73ad8ded 100644 --- a/datafusion/sql/src/select.rs +++ b/datafusion/sql/src/select.rs @@ -294,79 +294,61 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { Ok(plan) } - fn 
recursive_transform_unnest_expr( + fn recursive_transform_unnest( &self, input: &LogicalPlan, - unnest_columns: &mut Vec, + unnest_placeholder_columns: &mut Vec, inner_projection_exprs: &mut Vec, - expr: Expr, + original_expr: Expr, ) -> Result> { - // expr returned here maybe different from the originals in inner_projection_exprs + let mut transform = + |unnest_expr: &Expr, expr_in_unnest: &Expr| -> Result> { + // Full context, we are trying to plan the execution as InnerProjection->Unnest->OuterProjection + // inside unnest execution, each column inside the inner projection + // will be transformed into new columns. Thus we need to keep track of these placeholding column names + let placeholder_name = unnest_expr.display_name()?; + + unnest_placeholder_columns.push(placeholder_name.clone()); + // Add alias for the argument expression, to avoid naming conflicts + // with other expressions in the select list. For example: `select unnest(col1), col1 from t`. + // this extra projection is used to unnest transforming + inner_projection_exprs + .push(expr_in_unnest.clone().alias(placeholder_name.clone())); + let schema = input.schema(); + + let (data_type, _) = expr_in_unnest.data_type_and_nullable(schema)?; + + let outer_projection_columns = + get_unnested_columns(&placeholder_name, &data_type)?; + let expr = outer_projection_columns + .iter() + .map(|col| Expr::Column(col.0.clone())) + .collect::>(); + Ok(expr) + }; + // expr transformed maybe different from the originals exprs // for example: // - unnest(struct_col) will be transformed into unnest(struct_col).field1, unnest(struct_col).field2 // - unnest(array_col) will be transformed into unnest(array_col).element - // - unnest(array_col) + 1 will be transformed into unnest(array_col).element +1 + // - unnest(array_col) + 1 will be transformed into unnest(array_col).element + 1 // Specifically handle root level unnest expr, this is the only place - // unnest(struct_col) can be handled - if let Expr::Unnest(Unnest { 
expr: ref arg }) = expr { - // will need to be rewritten into (struct_col.field1, struct_col, - //field2 ...) - let column_name = expr.display_name()?; - - unnest_columns.push(column_name.clone()); - // Add alias for the argument expression, to avoid naming conflicts - // with other expressions - // in the select list. For example: `select unnest(col1), col1 from t`. - // this extra projection is used to unnest transforming - inner_projection_exprs.push(arg.clone().alias(column_name.clone())); - let schema = input.schema(); - - let (data_type, _) = expr.data_type_and_nullable(schema)?; - - let outer_projection_columns = - get_unnested_columns(&column_name, &data_type)?; - let expr = outer_projection_columns - .iter() - .map(|col| Expr::Column(col.0.clone())) - .collect::>(); - return Ok(expr); + // unnest on struct can be handled + if let Expr::Unnest(Unnest { expr: ref arg }) = original_expr { + return transform(&original_expr, arg); } let Transformed { data: transformed_expr, transformed, tnr: _, - } = expr.transform_up(|expr: Expr| { + } = original_expr.transform_up(|expr: Expr| { if let Expr::Unnest(Unnest { expr: ref arg }) = expr { - // This column is the copy of the column that will be unnested - // it will be replaced by the unnested columns - let intermediate_col_name = expr.display_name()?; - unnest_columns.push(intermediate_col_name.clone()); - let schema = input.schema(); - let (data_type, _) = arg.data_type_and_nullable(schema)?; - - // TODO: things like unnest(struct_col) as alias/unnest(struct_col) > 0 - // does not make sense - // or do they? 
+ let (data_type, _) = expr.data_type_and_nullable(input.schema())?; if let DataType::Struct(_) = data_type { return internal_err!("unnest on struct can ony be applied at the root level of select expression"); } - // because unnest logical plan will call the same function - // to transform the column_name into new column(s) - let outer_projection_columns = - get_unnested_columns(&intermediate_col_name, &data_type)?; - let unnested_child_exprs = outer_projection_columns - .iter() - .map(|col| Expr::Column(col.0.clone())) - .collect::>(); - - // Add alias for the argument expression, to avoid naming conflicts with other expressions - // in the select list. For example: `select unnest(col1), col1 from t`. - // TODO: this does not work with case like unnest(col1) + unnest(col1) - inner_projection_exprs - .push(arg.clone().alias(intermediate_col_name.clone())); - // because unnest on list expr will only transform into 1 expr - Ok(Transformed::yes(unnested_child_exprs[0].clone())) + let transformed_exprs = transform(&expr, arg)?; + Ok(Transformed::yes(transformed_exprs[0].clone())) } else { Ok(Transformed::no(expr)) } @@ -407,7 +389,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let outer_projection_exprs: Vec = select_exprs .into_iter() .map(|expr| { - self.recursive_transform_unnest_expr( + self.recursive_transform_unnest( &input, &mut unnest_columns, &mut inner_projection_exprs, @@ -431,7 +413,8 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { LogicalPlanBuilder::from(input) .project(inner_projection_exprs.clone())? .unnest_columns_with_options(columns, unnest_options)? - .project(outer_projection_exprs)?.build() + .project(outer_projection_exprs)? 
+ .build() } } diff --git a/datafusion/sqllogictest/test_files/unnest.slt b/datafusion/sqllogictest/test_files/unnest.slt index ca7e73cb87e0..94b144fe6b3b 100644 --- a/datafusion/sqllogictest/test_files/unnest.slt +++ b/datafusion/sqllogictest/test_files/unnest.slt @@ -38,7 +38,13 @@ select unnest([1,2,3]); 2 3 -## Basic unnest expression in from clause +## Basic unnest expression in select struct +query III +select unnest(struct(1,2,3)); +---- +1 2 3 + +## Basic unnest list expression in from clause query I select * from unnest([1,2,3]); ---- @@ -46,6 +52,20 @@ select * from unnest([1,2,3]); 2 3 +## Basic unnest struct expression in from clause +query III +select * from unnest(struct(1,2,3)); +---- +1 2 3 + +## Multiple unnest expression in from clause +query IIII +select * from unnest(struct(1,2,3)),unnest([4,5,6]); +---- +1 2 3 4 +1 2 3 5 +1 2 3 6 + ## Unnest null in select list query error DataFusion error: This feature is not implemented: unnest\(\) does not support null yet @@ -145,10 +165,6 @@ select array_remove(column1, 4), unnest(column2), column3 * 10 from unnest_table [12] NULL NULL -## Unnest column with scalars -query error DataFusion error: Error during planning: unnest\(\) can only be applied to array, struct and null -select unnest(column3) from unnest_table; - ## Unnest doesn't work with untyped nulls query error DataFusion error: This feature is not implemented: unnest\(\) does not support null yet select unnest(null) from unnest_table; @@ -233,12 +249,17 @@ select * from unnest([], NULL::int[]); ## Unnest struct expression in select list -query error DataFusion error: This feature is not implemented: unnest\(\) does not support struct yet +query ? select unnest(struct(null)); +---- +NULL ## Unnest struct expression in from clause -query error DataFusion error: This feature is not implemented: unnest\(\) does not support struct yet +## TODO: unnest a struct col with 2 subfields, with some null rows +query ? 
select * from unnest(struct(null)); +---- +NULL ## Unnest array expression diff --git a/datafusion/sqllogictest/test_files/unnest_debug.slt b/datafusion/sqllogictest/test_files/unnest_debug.slt deleted file mode 100644 index da2dab56457d..000000000000 --- a/datafusion/sqllogictest/test_files/unnest_debug.slt +++ /dev/null @@ -1,38 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -############################ -# Unnest Expressions Tests # -############################ - -statement ok -CREATE TABLE unnest_table -AS VALUES - (struct(1 as "name0", 2 as "name1"),[1,2,3]) -; - - - -query III -select unnest(column1), unnest(column2) as some_alias from unnest_table; ----- -1 2 2 -1 2 3 -1 2 4 - - - From 5dfc41d710251ba387aa1b821a7bceb44854b789 Mon Sep 17 00:00:00 2001 From: Duong Cong Toai Date: Sat, 18 May 2024 09:20:22 +0200 Subject: [PATCH 08/21] test: fix df test --- datafusion/core/src/dataframe/mod.rs | 4 ++-- datafusion/expr/src/logical_plan/builder.rs | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index b2f14e43b79c..b4695ef0e05e 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -296,11 +296,11 @@ impl DataFrame { columns: &[&str], options: UnnestOptions, ) -> Result { - // TODO: fix me let cols:Vec = columns.iter().map(|c| Column::from(*c)).collect(); + let plan = LogicalPlanBuilder::from(self.plan) .unnest_columns_with_options(cols.clone(), options)? - .project(cols)?.build()?; + .build()?; Ok(DataFrame { session_state: self.session_state, plan, diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 85fa27e3c570..5571f122b322 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -1558,16 +1558,16 @@ pub fn get_unnested_columns( DataType::List(field) | DataType::FixedSizeList(field, _) | DataType::LargeList(field) => { - let name = format!("{}.element", col_name); + // let name = format!("{}.element", col_name); let new_field = Arc::new(Field::new( // TODO: append field name by original name + .fieldname - name.clone(), + col_name.clone(), field.data_type().clone(), // Unnesting may produce NULLs even if the list is not null. 
// For example: unnset([1], []) -> 1, null true, )); - let column = Column::from_name(name); + let column = Column::from_name(col_name); // let column = Column::from((None, &new_field)); qualified_columns.push((column, new_field)); } From ff36f7323e59c5773897654895801e3fc740cc2a Mon Sep 17 00:00:00 2001 From: Duong Cong Toai Date: Sat, 18 May 2024 10:24:30 +0200 Subject: [PATCH 09/21] prettify display --- datafusion/core/tests/dataframe/mod.rs | 12 ++++---- datafusion/expr/src/logical_plan/builder.rs | 4 +-- datafusion/expr/src/logical_plan/plan.rs | 30 ++++++++++++++----- datafusion/expr/src/logical_plan/tree_node.rs | 10 +++---- .../optimizer/src/optimize_projections/mod.rs | 5 ---- 5 files changed, 35 insertions(+), 26 deletions(-) diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs index f565fba1db5b..549ae24dafcb 100644 --- a/datafusion/core/tests/dataframe/mod.rs +++ b/datafusion/core/tests/dataframe/mod.rs @@ -1231,11 +1231,11 @@ async fn unnest_aggregate_columns() -> Result<()> { .collect() .await?; let expected = [ - r#"+--------------------+"#, - r#"| COUNT(shapes.tags) |"#, - r#"+--------------------+"#, - r#"| 9 |"#, - r#"+--------------------+"#, + r#"+-------------+"#, + r#"| COUNT(tags) |"#, + r#"+-------------+"#, + r#"| 9 |"#, + r#"+-------------+"#, ]; assert_batches_sorted_eq!(expected, &results); @@ -1384,7 +1384,7 @@ async fn unnest_with_redundant_columns() -> Result<()> { let optimized_plan = df.clone().into_optimized_plan()?; let expected = vec![ "Projection: shapes.shape_id [shape_id:UInt32]", - " Unnest: shape_id2 [shape_id:UInt32, shape_id2:UInt32;N]", + " Unnest: lists[shape_id2] structs[] [shape_id:UInt32, shape_id2:UInt32;N]", " Aggregate: groupBy=[[shapes.shape_id]], aggr=[[ARRAY_AGG(shapes.shape_id) AS shape_id2]] [shape_id:UInt32, shape_id2:List(Field { name: \"item\", data_type: UInt32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N]", " TableScan: shapes 
projection=[shape_id] [shape_id:UInt32]", ]; diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 5571f122b322..564876790888 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -1657,8 +1657,8 @@ pub fn unnest_with_options( Ok(LogicalPlan::Unnest(Unnest { input: Arc::new(input), - input_columns: columns, - output_columns: qualified_columns, + exec_columns: columns, + post_exec_columns: qualified_columns, list_type_columns: list_columns, struct_type_columns: struct_columns, dependency_indices, diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index bd5a181c57f5..07d42ae18888 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -808,7 +808,7 @@ impl LogicalPlan { } LogicalPlan::DescribeTable(_) => Ok(self.clone()), LogicalPlan::Unnest(Unnest { - input_columns: columns, + exec_columns: columns, options, .. }) => { @@ -1551,9 +1551,22 @@ impl LogicalPlan { write!(f, "DescribeTable") } LogicalPlan::Unnest(Unnest { - list_type_columns, - struct_type_columns, .. }) => { - write!(f, "Unnest: TODO:i'm ugly{}{}", expr_vec_fmt!(list_type_columns),expr_vec_fmt!(struct_type_columns)) + input: plan, + list_type_columns: list_col_indices, + struct_type_columns: struct_col_indices, .. 
}) => { + let input_columns = plan.schema().columns(); + let list_type_columns = list_col_indices + .iter() + .map(|i| &input_columns[*i]) + .collect::>(); + let struct_type_columns = struct_col_indices + .iter() + .map(|i| &input_columns[*i]) + .collect::>(); + // get items from input_columns indexed by list_col_indices + write!(f, "Unnest: lists[{}] structs[{}]", + expr_vec_fmt!(list_type_columns), + expr_vec_fmt!(struct_type_columns)) } } } @@ -2527,14 +2540,15 @@ pub enum Partitioning { pub struct Unnest { /// The incoming logical plan pub input: Arc, - pub input_columns: Vec, + /// Columns to run unnest on, can be a list of (List/Struct) columns + pub exec_columns: Vec, // which index of original column in the input is each new column depend on // e.g original input: (struct_col,list[int]) // new column after unnest (struct_col.field1, struct_col,field2, int) // then dependency_map will be {0:0,1:0,2:1} // TODO: this should be converted into output column // instead of the columns in the input - pub output_columns: Vec, + pub post_exec_columns: Vec, /// refer to the indices(in the original input schema) of field columns /// that have type list to run unnest on pub list_type_columns: Vec, @@ -2552,12 +2566,12 @@ impl Unnest{ // reference to columns field indexed by list_type_columns pub fn get_list_columns(&self)-> Vec { self.list_type_columns.iter().map( - |&i| self.output_columns[i].clone() + |&i| self.post_exec_columns[i].clone() ).collect() } pub fn get_struct_columns(&self) ->Vec{ self.struct_type_columns.iter().map( - |&i| self.output_columns[i].clone() + |&i| self.post_exec_columns[i].clone() ).collect() } } diff --git a/datafusion/expr/src/logical_plan/tree_node.rs b/datafusion/expr/src/logical_plan/tree_node.rs index 5e12decaa77d..f3c103dafc76 100644 --- a/datafusion/expr/src/logical_plan/tree_node.rs +++ b/datafusion/expr/src/logical_plan/tree_node.rs @@ -311,8 +311,8 @@ impl TreeNode for LogicalPlan { } LogicalPlan::Unnest(Unnest { input, - 
input_columns, - output_columns: columns, + exec_columns: input_columns, + post_exec_columns: columns, list_type_columns, struct_type_columns, dependency_indices, @@ -321,8 +321,8 @@ impl TreeNode for LogicalPlan { }) => rewrite_arc(input, f)?.update_data(|input| { LogicalPlan::Unnest(Unnest { input, - input_columns, - output_columns: columns, + exec_columns: input_columns, + post_exec_columns: columns, dependency_indices, list_type_columns, struct_type_columns, @@ -499,7 +499,7 @@ impl LogicalPlan { filters.iter().apply_until_stop(f) } LogicalPlan::Unnest(unnest) => { - let columns = unnest.input_columns.clone(); + let columns = unnest.exec_columns.clone(); // we use output_columns instead of input column // let output_exprs = unnest // .output_columns diff --git a/datafusion/optimizer/src/optimize_projections/mod.rs b/datafusion/optimizer/src/optimize_projections/mod.rs index c0c3d5d9de81..cd632c814870 100644 --- a/datafusion/optimizer/src/optimize_projections/mod.rs +++ b/datafusion/optimizer/src/optimize_projections/mod.rs @@ -111,7 +111,6 @@ fn optimize_projections( ) -> Result> { let child_required_indices: Vec = match plan { LogicalPlan::Unnest(Unnest{dependency_indices,..}) => { - println!("{:?}",dependency_indices); vec![RequiredIndicies::new_from_indices(dependency_indices.clone())] } LogicalPlan::Sort(_) @@ -733,10 +732,6 @@ fn rewrite_projection_given_requirements( return if let Some(input) = optimize_projections(&proj.input, config, required_indices)? { - if let Err(e) = is_projection_unnecessary(&input, &exprs_used) { - println!("==========\n\nerror checking if projection is necessary, input\n{:?}\nwith schema\n{:?}\n project exprs {:?}", - input,input.schema(),exprs_used); - } if is_projection_unnecessary(&input, &exprs_used)? 
{ Ok(Some(input)) } else { From dbc041d23913375b923a32b2867c6f025ae33e1e Mon Sep 17 00:00:00 2001 From: Duong Cong Toai Date: Sat, 18 May 2024 18:28:13 +0200 Subject: [PATCH 10/21] fix unit test --- datafusion/core/src/dataframe/mod.rs | 4 +- datafusion/core/src/physical_planner.rs | 19 +--- datafusion/expr/src/logical_plan/builder.rs | 105 ++++++++++++------ datafusion/expr/src/logical_plan/plan.rs | 20 ---- datafusion/expr/src/logical_plan/tree_node.rs | 12 -- datafusion/physical-plan/src/unnest.rs | 4 +- 6 files changed, 78 insertions(+), 86 deletions(-) diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index b4695ef0e05e..74fd51855587 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -296,10 +296,10 @@ impl DataFrame { columns: &[&str], options: UnnestOptions, ) -> Result { - let cols:Vec = columns.iter().map(|c| Column::from(*c)).collect(); + let columns = columns.iter().map(|c| Column::from(*c)).collect(); let plan = LogicalPlanBuilder::from(self.plan) - .unnest_columns_with_options(cols.clone(), options)? + .unnest_columns_with_options(columns, options)? .build()?; Ok(DataFrame { session_state: self.session_state, diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 9e156b568a02..940f4b98c3d4 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -1157,7 +1157,6 @@ impl DefaultPhysicalPlanner { Arc::new(GlobalLimitExec::new(input, *skip, *fetch)) } LogicalPlan::Unnest(Unnest { - // output_columns: columns, list_type_columns, struct_type_columns, schema, @@ -1165,27 +1164,11 @@ impl DefaultPhysicalPlanner { .. 
}) => { let input = children.one()?; - // let list_column_exec = list_type_columns - // .iter() - // .map(|idx| { - // let column = &columns[*idx]; - // let schema_idx = schema.index_of_column(column)?; - // Ok(Column::new(&column.name, schema_idx)) - // }) - // .collect::>()?; - - let struct_columns_set: HashSet = - struct_type_columns.iter().copied().collect(); - // .iter() - // .map(|idx| schema.index_of_column(&columns[*idx])) - // .collect::>()?; - let schema = SchemaRef::new(schema.as_ref().to_owned().into()); Arc::new(UnnestExec::new( input, list_type_columns.clone(), - // list_column_exec, - struct_columns_set, + struct_type_columns, schema, options.clone(), )) diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 564876790888..42b4a8c51f10 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -1596,7 +1596,6 @@ pub fn unnest_with_options( columns: Vec, options: UnnestOptions, ) -> Result { - let mut qualified_columns = Vec::with_capacity(columns.len()); let mut list_columns = Vec::with_capacity(columns.len()); let mut struct_columns = Vec::with_capacity(columns.len()); let column_by_original_index = columns @@ -1620,20 +1619,23 @@ pub fn unnest_with_options( &column_to_unnest.name, original_field.data_type(), )?; + match original_field.data_type() { + DataType::List(_) + | DataType::FixedSizeList(_, _) + | DataType::LargeList(_) => list_columns.push(index), + DataType::Struct(_) => struct_columns.push(index), + _ => { + panic!( + "not reachable, should be caught by get_unnested_columns" + ) + } + } // new columns dependent on the same original index dependency_indices .extend(std::iter::repeat(index).take(flatten_columns.len())); Ok(flatten_columns .iter() - .map(|col| { - match original_field.data_type() { - DataType::List(_) - | DataType::FixedSizeList(_,_) - | DataType::LargeList(_) => list_columns.push(index), - DataType::Struct(_) => 
struct_columns.push(index), - _ => {panic!("not reachable, should be caught by get_unnested_columns")} - } - qualified_columns.push(col.0.to_owned()); + .map(|col: &(Column, Arc)| { (col.0.relation.to_owned(), col.1.to_owned()) }) .collect()) @@ -1658,7 +1660,6 @@ pub fn unnest_with_options( Ok(LogicalPlan::Unnest(Unnest { input: Arc::new(input), exec_columns: columns, - post_exec_columns: qualified_columns, list_type_columns: list_columns, struct_type_columns: struct_columns, dependency_indices, @@ -2091,12 +2092,12 @@ mod tests { #[test] fn plan_builder_unnest() -> Result<()> { // Unnesting a simple column should return the child plan. - let plan = nested_table_scan("test_table")? - .unnest_column("scalar")? - .build()?; - - let expected = "TableScan: test_table"; - assert_eq!(expected, format!("{plan:?}")); + let err = nested_table_scan("test_table")? + .unnest_column("scalar") + .unwrap_err(); + assert!(err + .to_string() + .starts_with("Internal error: trying to unnest on invalid data type UInt32")); // Unnesting the strings list. let plan = nested_table_scan("test_table")? @@ -2104,36 +2105,65 @@ mod tests { .build()?; let expected = "\ - Unnest: test_table.strings\ + Unnest: lists[test_table.strings] structs[]\ \n TableScan: test_table"; assert_eq!(expected, format!("{plan:?}")); // Check unnested field is a scalar - let field = plan - .schema() - .field_with_name(Some(&TableReference::bare("test_table")), "strings") - .unwrap(); + let field = plan.schema().field_with_name(None, "strings").unwrap(); assert_eq!(&DataType::Utf8, field.data_type()); - // Unnesting multiple fields. + // Unnesting the singular struct column result into 2 new columns for each subfield + let plan = nested_table_scan("test_table")? + .unnest_column("struct_singular")? 
+ .build()?; + + let expected = "\ + Unnest: lists[] structs[test_table.struct_singular]\ + \n TableScan: test_table"; + assert_eq!(expected, format!("{plan:?}")); + + for field_name in &["a", "b"] { + // Check unnested struct field is a scalar + let field = plan + .schema() + .field_with_name(None, &format!("struct_singular.{}", field_name)) + .unwrap(); + assert_eq!(&DataType::UInt32, field.data_type()); + } + + // Unnesting multiple fields in separate plans let plan = nested_table_scan("test_table")? .unnest_column("strings")? .unnest_column("structs")? + .unnest_column("struct_singular")? .build()?; let expected = "\ - Unnest: test_table.structs\ - \n Unnest: test_table.strings\ - \n TableScan: test_table"; + Unnest: lists[] structs[test_table.struct_singular]\ + \n Unnest: lists[test_table.structs] structs[]\ + \n Unnest: lists[test_table.strings] structs[]\ + \n TableScan: test_table"; assert_eq!(expected, format!("{plan:?}")); // Check unnested struct list field should be a struct. - let field = plan - .schema() - .field_with_name(Some(&TableReference::bare("test_table")), "structs") - .unwrap(); + let field = plan.schema().field_with_name(None, "structs").unwrap(); assert!(matches!(field.data_type(), DataType::Struct(_))); + // Unnesting multiple fields at the same time + let cols = vec!["strings", "structs", "struct_singular"] + .into_iter() + .map(|c| c.into()) + .collect(); + let plan = nested_table_scan("test_table")? + .unnest_columns_with_options(cols, UnnestOptions::default())? + .build()?; + + let expected = "\ + Unnest: lists[test_table.strings, test_table.structs] structs[test_table.struct_singular]\ + \n TableScan: test_table"; + assert_eq!(expected, format!("{plan:?}")); + // Unnesting missing column should fail. 
let plan = nested_table_scan("test_table")?.unnest_column("missing"); assert!(plan.is_err()); @@ -2142,8 +2172,9 @@ mod tests { } fn nested_table_scan(table_name: &str) -> Result { - // Create a schema with a scalar field, a list of strings, and a list of structs. - let struct_field = Field::new_struct( + // Create a schema with a scalar field, a list of strings, a list of structs + // and a singular struct + let struct_field_in_list = Field::new_struct( "item", vec![ Field::new("a", DataType::UInt32, false), @@ -2155,7 +2186,15 @@ mod tests { let schema = Schema::new(vec![ Field::new("scalar", DataType::UInt32, false), Field::new_list("strings", string_field, false), - Field::new_list("structs", struct_field, false), + Field::new_list("structs", struct_field_in_list.clone(), false), + Field::new( + "struct_singular", + DataType::Struct(Fields::from(vec![ + Field::new("a", DataType::UInt32, false), + Field::new("b", DataType::UInt32, false), + ])), + false, + ), ]); table_scan(Some(table_name), &schema, None) diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 07d42ae18888..354beb70fda2 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -2542,13 +2542,6 @@ pub struct Unnest { pub input: Arc, /// Columns to run unnest on, can be a list of (List/Struct) columns pub exec_columns: Vec, - // which index of original column in the input is each new column depend on - // e.g original input: (struct_col,list[int]) - // new column after unnest (struct_col.field1, struct_col,field2, int) - // then dependency_map will be {0:0,1:0,2:1} - // TODO: this should be converted into output column - // instead of the columns in the input - pub post_exec_columns: Vec, /// refer to the indices(in the original input schema) of field columns /// that have type list to run unnest on pub list_type_columns: Vec, @@ -2562,19 +2555,6 @@ pub struct Unnest { /// Options pub options: 
UnnestOptions, } -impl Unnest{ - // reference to columns field indexed by list_type_columns - pub fn get_list_columns(&self)-> Vec { - self.list_type_columns.iter().map( - |&i| self.post_exec_columns[i].clone() - ).collect() - } - pub fn get_struct_columns(&self) ->Vec{ - self.struct_type_columns.iter().map( - |&i| self.post_exec_columns[i].clone() - ).collect() - } -} #[cfg(test)] mod tests { diff --git a/datafusion/expr/src/logical_plan/tree_node.rs b/datafusion/expr/src/logical_plan/tree_node.rs index f3c103dafc76..462fb62efe7a 100644 --- a/datafusion/expr/src/logical_plan/tree_node.rs +++ b/datafusion/expr/src/logical_plan/tree_node.rs @@ -312,7 +312,6 @@ impl TreeNode for LogicalPlan { LogicalPlan::Unnest(Unnest { input, exec_columns: input_columns, - post_exec_columns: columns, list_type_columns, struct_type_columns, dependency_indices, @@ -322,7 +321,6 @@ impl TreeNode for LogicalPlan { LogicalPlan::Unnest(Unnest { input, exec_columns: input_columns, - post_exec_columns: columns, dependency_indices, list_type_columns, struct_type_columns, @@ -500,17 +498,7 @@ impl LogicalPlan { } LogicalPlan::Unnest(unnest) => { let columns = unnest.exec_columns.clone(); - // we use output_columns instead of input column - // let output_exprs = unnest - // .output_columns - // .iter() - // .map(|c| Expr::Column(c.clone())) - // .collect::>(); - // let mut columns = unnest.get_list_columns(); - // columns.extend_from_slice( - // unnest.get_struct_columns().as_ref()); - // TODO: detect struct column let exprs = columns .iter() .map(|c| Expr::Column(c.clone())) diff --git a/datafusion/physical-plan/src/unnest.rs b/datafusion/physical-plan/src/unnest.rs index 8ce789afea93..a53512a9161b 100644 --- a/datafusion/physical-plan/src/unnest.rs +++ b/datafusion/physical-plan/src/unnest.rs @@ -76,11 +76,13 @@ impl UnnestExec { pub fn new( input: Arc, list_type_columns: Vec, - struct_column_indices: HashSet, + struct_type_columns: Vec, schema: SchemaRef, options: UnnestOptions, ) -> Self 
{ let cache = Self::compute_properties(&input, schema.clone()); + let struct_column_indices: HashSet = + struct_type_columns.iter().copied().collect(); UnnestExec { input, From 07e003a34753b4496f82f7195fcf8d193badea77 Mon Sep 17 00:00:00 2001 From: Duong Cong Toai Date: Sat, 18 May 2024 18:59:17 +0200 Subject: [PATCH 11/21] chore: compile err --- datafusion/core/src/physical_planner.rs | 3 +- .../optimizer/src/optimize_projections/mod.rs | 15 ++++--- datafusion/physical-plan/src/unnest.rs | 42 +++++++++++-------- 3 files changed, 34 insertions(+), 26 deletions(-) diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 0477c11fbb40..cc4851444402 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -100,7 +100,6 @@ use datafusion_sql::utils::window_expr_common_partition_keys; use async_trait::async_trait; use futures::{StreamExt, TryStreamExt}; -use hashbrown::HashSet; use itertools::{multiunzip, Itertools}; use log::{debug, trace}; use sqlparser::ast::NullTreatment; @@ -1148,7 +1147,7 @@ impl DefaultPhysicalPlanner { Arc::new(UnnestExec::new( input, list_type_columns.clone(), - struct_type_columns, + struct_type_columns.clone(), schema, options.clone(), )) diff --git a/datafusion/optimizer/src/optimize_projections/mod.rs b/datafusion/optimizer/src/optimize_projections/mod.rs index 474423e76b33..3695f4fd78a8 100644 --- a/datafusion/optimizer/src/optimize_projections/mod.rs +++ b/datafusion/optimizer/src/optimize_projections/mod.rs @@ -29,7 +29,7 @@ use datafusion_common::{ get_required_group_by_exprs_indices, internal_datafusion_err, internal_err, Column, JoinType, Result, }; -use datafusion_expr::expr::{Alias, ScalarFunction}; +use datafusion_expr::expr::Alias; use datafusion_expr::Unnest; use datafusion_expr::{ logical_plan::LogicalPlan, projection_schema, Aggregate, Distinct, Expr, Projection, @@ -398,9 +398,13 @@ fn optimize_projections( return internal_err!( 
"OptimizeProjection: should have handled in the match statement above" ); - }, - LogicalPlan::Unnest(Unnest{dependency_indices,..}) => { - vec![RequiredIndicies::new_from_indices(dependency_indices.clone())] + } + LogicalPlan::Unnest(Unnest { + dependency_indices, .. + }) => { + vec![RequiredIndicies::new_from_indices( + dependency_indices.clone(), + )] } }; @@ -443,7 +447,7 @@ fn optimize_projections( } } -/// Merges consecutive projectionsDFSchema { inner: Schema { fields: [Field { name: "name0", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "name1", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }], metadata: {} }, field_qualifiers: [None, None], functional_dependencies: FunctionalDependencies { deps: [] } }. +/// Merges consecutive projections. /// /// Given a projection `proj`, this function attempts to merge it with a previous /// projection if it exists and if merging is beneficial. Merging is considered @@ -486,7 +490,6 @@ fn merge_consecutive_projections(proj: Projection) -> Result 1 && !is_expr_trivial( &prev_projection.expr diff --git a/datafusion/physical-plan/src/unnest.rs b/datafusion/physical-plan/src/unnest.rs index a53512a9161b..f189ee880d09 100644 --- a/datafusion/physical-plan/src/unnest.rs +++ b/datafusion/physical-plan/src/unnest.rs @@ -38,7 +38,9 @@ use arrow::datatypes::{DataType, Int64Type, Schema, SchemaRef}; use arrow::record_batch::RecordBatch; use arrow_array::{Int64Array, Scalar, StructArray}; use arrow_ord::cmp::lt; -use datafusion_common::{exec_datafusion_err, exec_err, Result, UnnestOptions}; +use datafusion_common::{ + exec_datafusion_err, exec_err, internal_err, Result, UnnestOptions, +}; use datafusion_execution::TaskContext; use datafusion_expr::ColumnarValue; use datafusion_physical_expr::EquivalenceProperties; @@ -60,9 +62,9 @@ pub struct UnnestExec { /// The schema once the unnest is applied schema: SchemaRef, /// The unnest list type 
columns - list_type_columns: Vec, - /// - struct_column_indices: HashSet, + list_column_indices: Vec, + /// The unnest list type columns + struct_column_indices: Vec, /// Options options: UnnestOptions, /// Execution metrics @@ -75,19 +77,17 @@ impl UnnestExec { /// Create a new [UnnestExec]. pub fn new( input: Arc, - list_type_columns: Vec, - struct_type_columns: Vec, + list_column_indices: Vec, + struct_column_indices: Vec, schema: SchemaRef, options: UnnestOptions, ) -> Self { let cache = Self::compute_properties(&input, schema.clone()); - let struct_column_indices: HashSet = - struct_type_columns.iter().copied().collect(); UnnestExec { input, schema, - list_type_columns, + list_column_indices, struct_column_indices, options, metrics: Default::default(), @@ -147,7 +147,7 @@ impl ExecutionPlan for UnnestExec { ) -> Result> { Ok(Arc::new(UnnestExec::new( children[0].clone(), - self.list_type_columns.clone(), + self.list_column_indices.clone(), self.struct_column_indices.clone(), self.schema.clone(), self.options.clone(), @@ -169,8 +169,8 @@ impl ExecutionPlan for UnnestExec { Ok(Box::pin(UnnestStream { input, schema: self.schema.clone(), - list_type_columns: self.list_type_columns.clone(), - struct_column_indices: self.struct_column_indices.clone(), + list_type_columns: self.struct_column_indices.clone(), + struct_column_indices: self.struct_column_indices.iter().copied().collect(), options: self.options.clone(), metrics, })) @@ -307,20 +307,26 @@ fn build_batch_vertically( let columns_expanded = input_batch .iter() .enumerate() - .flat_map(|(idx, column_data)| match struct_column_indices.get(&idx) { + .map(|(idx, column_data)| match struct_column_indices.get(&idx) { Some(_) => match column_data.data_type() { - DataType::Struct(fields) => { + DataType::Struct(_) => { let struct_arr = column_data.as_any().downcast_ref::().unwrap(); - struct_arr.columns().to_vec() + Ok(struct_arr.columns().to_vec()) } - other => vec![column_data.clone()], + data_type => 
internal_err!( + "expecting column {} from input plan to be a struct, got {:?}", + idx, + data_type + ), }, None => { - vec![column_data.clone()] + Ok(vec![column_data.clone()]) } }) - .collect::>(); + .collect::>>()? + .into_iter() + .flatten().collect(); Ok(RecordBatch::try_new(schema.clone(), columns_expanded)?) } From 9f88c7ae1dd59661493ce8cf9b21d678a604c815 Mon Sep 17 00:00:00 2001 From: Duong Cong Toai Date: Sat, 18 May 2024 19:24:55 +0200 Subject: [PATCH 12/21] chore: fix physical exec err --- datafusion/physical-plan/src/unnest.rs | 2 +- datafusion/sqllogictest/bin/sqllogictests.rs | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/datafusion/physical-plan/src/unnest.rs b/datafusion/physical-plan/src/unnest.rs index f189ee880d09..c21830e4f827 100644 --- a/datafusion/physical-plan/src/unnest.rs +++ b/datafusion/physical-plan/src/unnest.rs @@ -169,7 +169,7 @@ impl ExecutionPlan for UnnestExec { Ok(Box::pin(UnnestStream { input, schema: self.schema.clone(), - list_type_columns: self.struct_column_indices.clone(), + list_type_columns: self.list_column_indices.clone(), struct_column_indices: self.struct_column_indices.iter().copied().collect(), options: self.options.clone(), metrics, diff --git a/datafusion/sqllogictest/bin/sqllogictests.rs b/datafusion/sqllogictest/bin/sqllogictests.rs index 560328ee8619..b010f094d4f3 100644 --- a/datafusion/sqllogictest/bin/sqllogictests.rs +++ b/datafusion/sqllogictest/bin/sqllogictests.rs @@ -123,7 +123,6 @@ async fn run_tests() -> Result<()> { // report on any errors if !errors.is_empty() { for e in &errors { - println!("{e}"); } exec_err!("{} failures", errors.len()) } else { From e4c12f845eb1817d49f5a6870f49014aed3d1046 Mon Sep 17 00:00:00 2001 From: Duong Cong Toai Date: Sat, 18 May 2024 21:02:25 +0200 Subject: [PATCH 13/21] add sqllogic test --- datafusion/core/src/dataframe/mod.rs | 1 - datafusion/physical-plan/src/unnest.rs | 9 ++++---- datafusion/sqllogictest/bin/sqllogictests.rs | 1 + 
datafusion/sqllogictest/test_files/unnest.slt | 23 ++++++++++++++----- 4 files changed, 22 insertions(+), 12 deletions(-) diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index a25289191322..04aaf5a890a8 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -297,7 +297,6 @@ impl DataFrame { options: UnnestOptions, ) -> Result { let columns = columns.iter().map(|c| Column::from(*c)).collect(); - let plan = LogicalPlanBuilder::from(self.plan) .unnest_columns_with_options(columns, options)? .build()?; diff --git a/datafusion/physical-plan/src/unnest.rs b/datafusion/physical-plan/src/unnest.rs index c21830e4f827..e804d4fa3abd 100644 --- a/datafusion/physical-plan/src/unnest.rs +++ b/datafusion/physical-plan/src/unnest.rs @@ -320,13 +320,12 @@ fn build_batch_vertically( data_type ), }, - None => { - Ok(vec![column_data.clone()]) - } + None => Ok(vec![column_data.clone()]), }) .collect::>>()? .into_iter() - .flatten().collect(); + .flatten() + .collect(); Ok(RecordBatch::try_new(schema.clone(), columns_expanded)?) 
} @@ -647,7 +646,7 @@ fn batch_from_indices( None => Ok(kernels::take::take(arr, indices, None)?), }) .collect::>>()?; - return Ok(arrays); + Ok(arrays) } #[cfg(test)] diff --git a/datafusion/sqllogictest/bin/sqllogictests.rs b/datafusion/sqllogictest/bin/sqllogictests.rs index b010f094d4f3..560328ee8619 100644 --- a/datafusion/sqllogictest/bin/sqllogictests.rs +++ b/datafusion/sqllogictest/bin/sqllogictests.rs @@ -123,6 +123,7 @@ async fn run_tests() -> Result<()> { // report on any errors if !errors.is_empty() { for e in &errors { + println!("{e}"); } exec_err!("{} failures", errors.len()) } else { diff --git a/datafusion/sqllogictest/test_files/unnest.slt b/datafusion/sqllogictest/test_files/unnest.slt index 94b144fe6b3b..5d1cbc7aa6a7 100644 --- a/datafusion/sqllogictest/test_files/unnest.slt +++ b/datafusion/sqllogictest/test_files/unnest.slt @@ -22,12 +22,12 @@ statement ok CREATE TABLE unnest_table AS VALUES - ([1,2,3], [7], 1, [13, 14]), - ([4,5], [8,9,10], 2, [15, 16]), - ([6], [11,12], 3, null), - ([12], [null, 42, null], null, null), + ([1,2,3], [7], 1, [13, 14], struct(1,2)), + ([4,5], [8,9,10], 2, [15, 16], struct(3,4)), + ([6], [11,12], 3, null, null), + ([12], [null, 42, null], null, null, struct(7,8)), -- null array to verify the `preserve_nulls` option - (null, null, 4, [17, 18]) + (null, null, 4, [17, 18], null) ; ## Basic unnest expression in select list @@ -255,7 +255,6 @@ select unnest(struct(null)); NULL ## Unnest struct expression in from clause -## TODO: unnest a struct col with 2 subfields, with some null rows query ? select * from unnest(struct(null)); ---- @@ -309,6 +308,18 @@ select unnest(array_remove(column1, 12)) from unnest_table; 5 6 +## unnest struct-typed column and list-typed column at the same time +query I?II? 
+select unnest(column1), column1, unnest(column5), column5 from unnest_table; +---- +1 [1, 2, 3] 1 2 {c0: 1, c1: 2} +2 [1, 2, 3] 1 2 {c0: 1, c1: 2} +3 [1, 2, 3] 1 2 {c0: 1, c1: 2} +4 [4, 5] 3 4 {c0: 3, c1: 4} +5 [4, 5] 3 4 {c0: 3, c1: 4} +6 [6] NULL NULL NULL +12 [12] 7 8 {c0: 7, c1: 8} + ## Unnest in from clause with alias query I From 2c70a4b3235c72c51e97e139593472ca4c59e5bd Mon Sep 17 00:00:00 2001 From: Duong Cong Toai Date: Sat, 18 May 2024 21:34:14 +0200 Subject: [PATCH 14/21] chore: more doc --- datafusion/expr/src/logical_plan/plan.rs | 7 ++-- datafusion/physical-plan/src/unnest.rs | 41 ++++++++++++++++-------- 2 files changed, 31 insertions(+), 17 deletions(-) diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 7f26a9906f7a..e0db031b1698 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -2809,13 +2809,14 @@ pub struct Unnest { pub input: Arc, /// Columns to run unnest on, can be a list of (List/Struct) columns pub exec_columns: Vec, - /// refer to the indices(in the original input schema) of field columns + /// refer to the indices(in the input schema) of columns /// that have type list to run unnest on pub list_type_columns: Vec, - /// refer to the indices (in the original input schema) of field columns + /// refer to the indices (in the input schema) of columns /// that have type struct to run unnest on pub struct_type_columns: Vec, - /// for each column in output schema, which column in the input schema it depends on + /// Having items aligned with the output columns + /// representing which column in the input schema each output column depends on pub dependency_indices: Vec, /// The output schema, containing the unnested field column. 
pub schema: DFSchemaRef, diff --git a/datafusion/physical-plan/src/unnest.rs b/datafusion/physical-plan/src/unnest.rs index e804d4fa3abd..5fd3ae4a6322 100644 --- a/datafusion/physical-plan/src/unnest.rs +++ b/datafusion/physical-plan/src/unnest.rs @@ -50,20 +50,21 @@ use futures::{Stream, StreamExt}; use hashbrown::HashSet; use log::trace; -/// Unnest the given columns by joining the row with each value in the -/// nested type. +/// Unnest the given columns (either with type struct or list) +/// For list unnesting, each rows is vertically transformed into multiple rows +/// For struct unnesting, each columns is horizontally transformed into multiple columns, +/// Thus the original RecordBatch with dimension (n x m) may have new dimension (n' x m') /// /// See [`UnnestOptions`] for more details and an example. -/// TODO: rename into UnnestListExec #[derive(Debug)] pub struct UnnestExec { /// Input execution plan input: Arc, /// The schema once the unnest is applied schema: SchemaRef, - /// The unnest list type columns + /// indices of the list-typed columns in the input schema list_column_indices: Vec, - /// The unnest list type columns + /// indices of the struct-typed columns in the input schema struct_column_indices: Vec, /// Options options: UnnestOptions, @@ -298,7 +299,20 @@ impl UnnestStream { } } -fn build_batch_vertically( + +/// Given a set of struct column indices to flatten +/// try converting the column in input into multiple subfield columns +/// For example +/// ```ignore +/// struct_col: [a: struct(item: int, name: string), b: int] +/// with a batch +/// {a: {item: 1, name: "a"}, b: 2}, +/// {a: {item: 3, name: "b"}, b: 4] +/// will be converted into +/// {a.item: 1, a.name: "a", b: 2}, +/// {a.item: 3, a.name: "b", b: 4} +/// ``` +fn flatten_struct_cols( input_batch: &[Arc], schema: &SchemaRef, struct_column_indices: &HashSet, @@ -329,21 +343,20 @@ fn build_batch_vertically( Ok(RecordBatch::try_new(schema.clone(), columns_expanded)?) 
} -/// For each row in a `RecordBatch`, some list columns need to be unnested. -/// We will expand the values in each list into multiple rows, +/// For each row in a `RecordBatch`, some list/struct columns need to be unnested. +/// - For list columns: We will expand the values in each list into multiple rows, /// taking the longest length among these lists, and shorter lists are padded with NULLs. -// +/// - For struct columns: We will expand the struct columns into multiple subfield columns. /// For columns that don't need to be unnested, repeat their values until reaching the longest length. fn build_batch( batch: &RecordBatch, schema: &SchemaRef, - // list type column only list_type_columns: &[usize], struct_column_indices: &HashSet, options: &UnnestOptions, ) -> Result { let transformed = match list_type_columns.len() { - 0 => build_batch_vertically(batch.columns(), schema, struct_column_indices), + 0 => flatten_struct_cols(batch.columns(), schema, struct_column_indices), _ => { let list_arrays: Vec = list_type_columns .iter() @@ -379,8 +392,8 @@ fn build_batch( let take_indicies = create_take_indicies(unnested_length, total_length); // vertical expansion because of list unnest - let ret = batch_from_indices(batch, &unnested_array_map, &take_indicies)?; - build_batch_vertically(&ret, schema, struct_column_indices) + let ret = flatten_list_cols_from_indices(batch, &unnested_array_map, &take_indicies)?; + flatten_struct_cols(&ret, schema, struct_column_indices) } }; transformed @@ -632,7 +645,7 @@ fn create_take_indicies( /// c2: 'a', 'b', 'c', 'c', 'c', null, 'd', 'd' /// ``` /// -fn batch_from_indices( +fn flatten_list_cols_from_indices( batch: &RecordBatch, unnested_list_arrays: &HashMap, indices: &PrimitiveArray, From bdcefe1e2c070b118df83d2650735fbedadd7784 Mon Sep 17 00:00:00 2001 From: Duong Cong Toai Date: Sat, 18 May 2024 21:47:07 +0200 Subject: [PATCH 15/21] chore: refactor --- datafusion/expr/src/logical_plan/display.rs | 18 ++++++++++++++++-- 
datafusion/expr/src/logical_plan/plan.rs | 1 - datafusion/physical-plan/src/unnest.rs | 1 - datafusion/sql/src/expr/function.rs | 6 ++---- datafusion/sql/src/select.rs | 2 +- 5 files changed, 19 insertions(+), 9 deletions(-) diff --git a/datafusion/expr/src/logical_plan/display.rs b/datafusion/expr/src/logical_plan/display.rs index 172e4fa3dc2f..f3765fb184bb 100644 --- a/datafusion/expr/src/logical_plan/display.rs +++ b/datafusion/expr/src/logical_plan/display.rs @@ -30,7 +30,7 @@ use crate::dml::CopyTo; use arrow::datatypes::Schema; use datafusion_common::display::GraphvizBuilder; use datafusion_common::tree_node::{TreeNodeRecursion, TreeNodeVisitor}; -use datafusion_common::DataFusionError; +use datafusion_common::{Column, DataFusionError}; use serde_json::json; /// Formats plans with a single line per node. For example: @@ -638,7 +638,21 @@ impl<'a, 'b> PgJsonVisitor<'a, 'b> { "Node Type": "DescribeTable" }) } - LogicalPlan::Unnest(Unnest { list_type_columns,struct_type_columns, .. }) => { + LogicalPlan::Unnest(Unnest { + input: plan, + list_type_columns: list_col_indices, + struct_type_columns: struct_col_indices, + .. + }) => { + let input_columns = plan.schema().columns(); + let list_type_columns = list_col_indices + .iter() + .map(|i| &input_columns[*i]) + .collect::>(); + let struct_type_columns = struct_col_indices + .iter() + .map(|i| &input_columns[*i]) + .collect::>(); json!({ "Node Type": "Unnest", "ListColumn": expr_vec_fmt!(list_type_columns), diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index e0db031b1698..1fa9f9a67951 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -340,7 +340,6 @@ impl LogicalPlan { LogicalPlan::Copy(copy) => vec![©.input], LogicalPlan::Ddl(ddl) => ddl.inputs(), LogicalPlan::Unnest(Unnest { input, .. }) => vec![input], - // TODO: add here LogicalPlan::Prepare(Prepare { input, .. 
}) => vec![input], LogicalPlan::RecursiveQuery(RecursiveQuery { static_term, diff --git a/datafusion/physical-plan/src/unnest.rs b/datafusion/physical-plan/src/unnest.rs index 5fd3ae4a6322..075ba7da31e6 100644 --- a/datafusion/physical-plan/src/unnest.rs +++ b/datafusion/physical-plan/src/unnest.rs @@ -397,7 +397,6 @@ fn build_batch( } }; transformed - // Ok(RecordBatch::try_new(schema.clone(), transformed)?) } /// Find the longest list length among the given list arrays for each row. diff --git a/datafusion/sql/src/expr/function.rs b/datafusion/sql/src/expr/function.rs index fcdfd81376a8..cfe7da61dbfc 100644 --- a/datafusion/sql/src/expr/function.rs +++ b/datafusion/sql/src/expr/function.rs @@ -358,10 +358,8 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { match arg.get_type(schema)? { DataType::List(_) | DataType::LargeList(_) - | DataType::FixedSizeList(_, _) => Ok(()), - DataType::Struct(_) => { - Ok(()) - } + | DataType::FixedSizeList(_, _) + | DataType::Struct(_) => Ok(()), DataType::Null => { not_impl_err!("unnest() does not support null yet") } diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs index 926f73ad8ded..1df7bd843c3d 100644 --- a/datafusion/sql/src/select.rs +++ b/datafusion/sql/src/select.rs @@ -411,7 +411,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // Set preserve_nulls to false to ensure compatibility with DuckDB and PostgreSQL let unnest_options = UnnestOptions::new().with_preserve_nulls(false); LogicalPlanBuilder::from(input) - .project(inner_projection_exprs.clone())? + .project(inner_projection_exprs)? .unnest_columns_with_options(columns, unnest_options)? .project(outer_projection_exprs)? 
.build() From 5bba38d9433171341870cab186ea612f3bf06533 Mon Sep 17 00:00:00 2001 From: Duong Cong Toai Date: Sat, 18 May 2024 21:53:18 +0200 Subject: [PATCH 16/21] fix doc --- datafusion/expr/src/logical_plan/builder.rs | 4 +--- datafusion/physical-plan/src/unnest.rs | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 9049cd01be41..bed56a20904a 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -1605,9 +1605,7 @@ pub fn get_unnested_columns( DataType::List(field) | DataType::FixedSizeList(field, _) | DataType::LargeList(field) => { - // let name = format!("{}.element", col_name); let new_field = Arc::new(Field::new( - // TODO: append field name by original name + .fieldname col_name.clone(), field.data_type().clone(), // Unnesting may produce NULLs even if the list is not null. @@ -2138,7 +2136,7 @@ mod tests { #[test] fn plan_builder_unnest() -> Result<()> { - // Unnesting a simple column should return the child plan. + // Cannot unnest on a scalar column let err = nested_table_scan("test_table")? .unnest_column("scalar") .unwrap_err(); diff --git a/datafusion/physical-plan/src/unnest.rs b/datafusion/physical-plan/src/unnest.rs index 075ba7da31e6..8a97ae4c6389 100644 --- a/datafusion/physical-plan/src/unnest.rs +++ b/datafusion/physical-plan/src/unnest.rs @@ -581,7 +581,7 @@ fn unnest_list_array( )?) } -/// Creates take indicies that will be used to expand all columns except for the unnest [`columns`](UnnestExec::columns). +/// Creates take indicies that will be used to expand all columns except for the unnest [`columns`](UnnestExec::exec_columns). /// Every column value needs to be repeated multiple times according to the length array. 
/// /// If the length array looks like this: From ab212c00da6423ebc2aa8dcb85482a1ae9cfef53 Mon Sep 17 00:00:00 2001 From: Duong Cong Toai Date: Sat, 18 May 2024 22:15:27 +0200 Subject: [PATCH 17/21] fmt --- datafusion/expr/src/logical_plan/plan.rs | 18 +++++++----------- datafusion/physical-plan/src/unnest.rs | 14 ++++++++------ 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 1fa9f9a67951..3f0a6ded411b 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -1022,13 +1022,9 @@ impl LogicalPlan { .. }) => { // Update schema with unnested column type. - // TODO: we are not considering new expr rewrite - // If this case happen - // original plan: column1.field1, column1.field2, column2.field3 - // input is optimized so that it only gives the schema of column1 let input = inputs.swap_remove(0); - let new_plan = unnest_with_options(input, - columns.clone(), options.clone())?; + let new_plan = + unnest_with_options(input, columns.clone(), options.clone())?; Ok(new_plan) } } @@ -1798,9 +1794,9 @@ impl LogicalPlan { LogicalPlan::DescribeTable(DescribeTable { .. }) => { write!(f, "DescribeTable") } - LogicalPlan::Unnest(Unnest { + LogicalPlan::Unnest(Unnest { input: plan, - list_type_columns: list_col_indices, + list_type_columns: list_col_indices, struct_type_columns: struct_col_indices, .. 
}) => { let input_columns = plan.schema().columns(); let list_type_columns = list_col_indices @@ -2808,13 +2804,13 @@ pub struct Unnest { pub input: Arc, /// Columns to run unnest on, can be a list of (List/Struct) columns pub exec_columns: Vec, - /// refer to the indices(in the input schema) of columns - /// that have type list to run unnest on + /// refer to the indices(in the input schema) of columns + /// that have type list to run unnest on pub list_type_columns: Vec, /// refer to the indices (in the input schema) of columns /// that have type struct to run unnest on pub struct_type_columns: Vec, - /// Having items aligned with the output columns + /// Having items aligned with the output columns /// representing which column in the input schema each output column depends on pub dependency_indices: Vec, /// The output schema, containing the unnested field column. diff --git a/datafusion/physical-plan/src/unnest.rs b/datafusion/physical-plan/src/unnest.rs index 8a97ae4c6389..f29b8d212d09 100644 --- a/datafusion/physical-plan/src/unnest.rs +++ b/datafusion/physical-plan/src/unnest.rs @@ -228,7 +228,6 @@ struct UnnestStream { schema: Arc, /// The unnest columns list_type_columns: Vec, - /// struct_column_indices: HashSet, /// Options options: UnnestOptions, @@ -299,14 +298,13 @@ impl UnnestStream { } } - /// Given a set of struct column indices to flatten /// try converting the column in input into multiple subfield columns /// For example /// ```ignore -/// struct_col: [a: struct(item: int, name: string), b: int] -/// with a batch -/// {a: {item: 1, name: "a"}, b: 2}, +/// struct_col: [a: struct(item: int, name: string), b: int] +/// with a batch +/// {a: {item: 1, name: "a"}, b: 2}, /// {a: {item: 3, name: "b"}, b: 4] /// will be converted into /// {a.item: 1, a.name: "a", b: 2}, @@ -392,7 +390,11 @@ fn build_batch( let take_indicies = create_take_indicies(unnested_length, total_length); // vertical expansion because of list unnest - let ret = 
flatten_list_cols_from_indices(batch, &unnested_array_map, &take_indicies)?; + let ret = flatten_list_cols_from_indices( + batch, + &unnested_array_map, + &take_indicies, + )?; flatten_struct_cols(&ret, schema, struct_column_indices) } }; From dbebd44120bcf5543cc3d5d313bb86dc0e336340 Mon Sep 17 00:00:00 2001 From: Duong Cong Toai Date: Sun, 19 May 2024 20:23:06 +0200 Subject: [PATCH 18/21] fix doc --- datafusion/core/src/dataframe/mod.rs | 6 +++--- datafusion/core/tests/data/unnest.json | 2 ++ datafusion/physical-plan/src/unnest.rs | 5 ++--- 3 files changed, 7 insertions(+), 6 deletions(-) create mode 100644 datafusion/core/tests/data/unnest.json diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index 04aaf5a890a8..d4626134acbf 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -263,7 +263,7 @@ impl DataFrame { self.unnest_columns_with_options(&[column], options) } - /// Expand multiple list columns into a set of rows. + /// Expand multiple list/struct columns into a set of rows and new columns. 
/// /// See also: /// @@ -277,8 +277,8 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; - /// let df = df.unnest_columns(&["a", "b"])?; + /// let df = ctx.read_json("tests/data/unnest.json", NdJsonReadOptions::default()).await?; + /// let df = df.unnest_columns(&["b","c","d"])?; /// # Ok(()) /// # } /// ``` diff --git a/datafusion/core/tests/data/unnest.json b/datafusion/core/tests/data/unnest.json new file mode 100644 index 000000000000..5999171c2886 --- /dev/null +++ b/datafusion/core/tests/data/unnest.json @@ -0,0 +1,2 @@ +{"a":1, "b":[2.0, 1.3, -6.1], "c":[false, true],"d":{"e":1,"f":2}} +{"a":2, "b":[3.0, 2.3, -7.1], "c":[false, true]} diff --git a/datafusion/physical-plan/src/unnest.rs b/datafusion/physical-plan/src/unnest.rs index f29b8d212d09..a8151fe0220b 100644 --- a/datafusion/physical-plan/src/unnest.rs +++ b/datafusion/physical-plan/src/unnest.rs @@ -301,7 +301,6 @@ impl UnnestStream { /// Given a set of struct column indices to flatten /// try converting the column in input into multiple subfield columns /// For example -/// ```ignore /// struct_col: [a: struct(item: int, name: string), b: int] /// with a batch /// {a: {item: 1, name: "a"}, b: 2}, @@ -309,7 +308,6 @@ impl UnnestStream { /// will be converted into /// {a.item: 1, a.name: "a", b: 2}, /// {a.item: 3, a.name: "b", b: 4} -/// ``` fn flatten_struct_cols( input_batch: &[Arc], schema: &SchemaRef, @@ -583,7 +581,8 @@ fn unnest_list_array( )?) } -/// Creates take indicies that will be used to expand all columns except for the unnest [`columns`](UnnestExec::exec_columns). +/// Creates take indicies that will be used to expand all columns except for the list type +/// [`columns`](UnnestExec::list_column_indices) that is being unnested. /// Every column value needs to be repeated multiple times according to the length array. 
/// /// If the length array looks like this: From e5a81720b814a3d0b4c067ec4fd943647761d33a Mon Sep 17 00:00:00 2001 From: Duong Cong Toai Date: Sun, 19 May 2024 21:28:53 +0200 Subject: [PATCH 19/21] ut for recursive transform unnest --- datafusion/sql/src/select.rs | 93 ++--------------- datafusion/sql/src/utils.rs | 190 ++++++++++++++++++++++++++++++++++- 2 files changed, 195 insertions(+), 88 deletions(-) diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs index 1df7bd843c3d..d2cd1bcf3a06 100644 --- a/datafusion/sql/src/select.rs +++ b/datafusion/sql/src/select.rs @@ -20,16 +20,14 @@ use std::sync::Arc; use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; use crate::utils::{ - check_columns_satisfy_exprs, extract_aliases, rebase_expr, resolve_aliases_to_exprs, - resolve_columns, resolve_positions_to_exprs, + check_columns_satisfy_exprs, extract_aliases, rebase_expr, + recursive_transform_unnest, resolve_aliases_to_exprs, resolve_columns, + resolve_positions_to_exprs, }; -use arrow_schema::DataType; -use datafusion_common::tree_node::{Transformed, TreeNode}; -use datafusion_common::{internal_err, not_impl_err, plan_err, DataFusionError, Result}; +use datafusion_common::{not_impl_err, plan_err, DataFusionError, Result}; use datafusion_common::{Column, UnnestOptions}; -use datafusion_expr::builder::get_unnested_columns; -use datafusion_expr::expr::{Alias, Unnest}; +use datafusion_expr::expr::Alias; use datafusion_expr::expr_rewriter::{ normalize_col, normalize_col_with_schemas_and_ambiguity_check, normalize_cols, }; @@ -38,8 +36,7 @@ use datafusion_expr::utils::{ find_aggregate_exprs, find_window_exprs, }; use datafusion_expr::{ - Expr, ExprSchemable, Filter, GroupingSet, LogicalPlan, LogicalPlanBuilder, - Partitioning, + Expr, Filter, GroupingSet, LogicalPlan, LogicalPlanBuilder, Partitioning, }; use sqlparser::ast::{ Distinct, Expr as SQLExpr, GroupByExpr, OrderByExpr, ReplaceSelectItem, @@ -294,82 +291,6 @@ impl<'a, S: 
ContextProvider> SqlToRel<'a, S> { Ok(plan) } - fn recursive_transform_unnest( - &self, - input: &LogicalPlan, - unnest_placeholder_columns: &mut Vec, - inner_projection_exprs: &mut Vec, - original_expr: Expr, - ) -> Result> { - let mut transform = - |unnest_expr: &Expr, expr_in_unnest: &Expr| -> Result> { - // Full context, we are trying to plan the execution as InnerProjection->Unnest->OuterProjection - // inside unnest execution, each column inside the inner projection - // will be transformed into new columns. Thus we need to keep track of these placeholding column names - let placeholder_name = unnest_expr.display_name()?; - - unnest_placeholder_columns.push(placeholder_name.clone()); - // Add alias for the argument expression, to avoid naming conflicts - // with other expressions in the select list. For example: `select unnest(col1), col1 from t`. - // this extra projection is used to unnest transforming - inner_projection_exprs - .push(expr_in_unnest.clone().alias(placeholder_name.clone())); - let schema = input.schema(); - - let (data_type, _) = expr_in_unnest.data_type_and_nullable(schema)?; - - let outer_projection_columns = - get_unnested_columns(&placeholder_name, &data_type)?; - let expr = outer_projection_columns - .iter() - .map(|col| Expr::Column(col.0.clone())) - .collect::>(); - Ok(expr) - }; - // expr transformed maybe different from the originals exprs - // for example: - // - unnest(struct_col) will be transformed into unnest(struct_col).field1, unnest(struct_col).field2 - // - unnest(array_col) will be transformed into unnest(array_col).element - // - unnest(array_col) + 1 will be transformed into unnest(array_col).element + 1 - - // Specifically handle root level unnest expr, this is the only place - // unnest on struct can be handled - if let Expr::Unnest(Unnest { expr: ref arg }) = original_expr { - return transform(&original_expr, arg); - } - let Transformed { - data: transformed_expr, - transformed, - tnr: _, - } = 
original_expr.transform_up(|expr: Expr| { - if let Expr::Unnest(Unnest { expr: ref arg }) = expr { - let (data_type, _) = expr.data_type_and_nullable(input.schema())?; - if let DataType::Struct(_) = data_type { - return internal_err!("unnest on struct can ony be applied at the root level of select expression"); - } - let transformed_exprs = transform(&expr, arg)?; - Ok(Transformed::yes(transformed_exprs[0].clone())) - } else { - Ok(Transformed::no(expr)) - } - })?; - - if !transformed { - if matches!(&transformed_expr, Expr::Column(_)) { - inner_projection_exprs.push(transformed_expr.clone()); - Ok(vec![transformed_expr]) - } else { - // We need to evaluate the expr in the inner projection, - // outer projection just select its name - let column_name = transformed_expr.display_name()?; - inner_projection_exprs.push(transformed_expr); - Ok(vec![Expr::Column(Column::from_name(column_name))]) - } - } else { - Ok(vec![transformed_expr]) - } - } - /// Try converting Expr(Unnest(Expr)) to Projection/Unnest/Projection pub(super) fn try_process_unnest( &self, @@ -389,7 +310,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let outer_projection_exprs: Vec = select_exprs .into_iter() .map(|expr| { - self.recursive_transform_unnest( + recursive_transform_unnest( &input, &mut unnest_columns, &mut inner_projection_exprs, diff --git a/datafusion/sql/src/utils.rs b/datafusion/sql/src/utils.rs index 2c50d3af1f5e..c58b2ba5ef1b 100644 --- a/datafusion/sql/src/utils.rs +++ b/datafusion/sql/src/utils.rs @@ -26,9 +26,10 @@ use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; use datafusion_common::{ exec_err, internal_err, plan_err, Column, DataFusionError, Result, ScalarValue, }; -use datafusion_expr::expr::{Alias, GroupingSet, WindowFunction}; +use datafusion_expr::builder::get_unnested_columns; +use datafusion_expr::expr::{Alias, GroupingSet, Unnest, WindowFunction}; use datafusion_expr::utils::{expr_as_column_expr, find_column_exprs}; -use 
datafusion_expr::{expr_vec_fmt, Expr, LogicalPlan}; +use datafusion_expr::{expr_vec_fmt, Expr, ExprSchemable, LogicalPlan}; use sqlparser::ast::Ident; /// Make a best-effort attempt at resolving all columns in the expression tree @@ -255,3 +256,188 @@ pub(crate) fn normalize_ident(id: Ident) -> String { None => id.value.to_ascii_lowercase(), } } + +/// The context is we want to rewrite unnest() into InnerProjection->Unnest->OuterProjection +/// Given an expression which contains unnest expr as one of its children, +/// the transformation applied depends on the unnest type +/// - For list column: unnest(col) with type list -> unnest(col) with type list::item +/// - For struct column: unnest(struct(field1, field2)) -> unnest(struct).field1, unnest(struct).field2 +/// The transformed exprs will be used in the outer projection +pub(crate) fn recursive_transform_unnest( +    input: &LogicalPlan, +    unnest_placeholder_columns: &mut Vec, +    inner_projection_exprs: &mut Vec, +    original_expr: Expr, +) -> Result> { +    let mut transform = +        |unnest_expr: &Expr, expr_in_unnest: &Expr| -> Result> { +            // Full context, we are trying to plan the execution as InnerProjection->Unnest->OuterProjection +            // inside unnest execution, each column inside the inner projection +            // will be transformed into new columns. Thus we need to keep track of these placeholder column names +            let placeholder_name = unnest_expr.display_name()?; + +            unnest_placeholder_columns.push(placeholder_name.clone()); +            // Add alias for the argument expression, to avoid naming conflicts +            // with other expressions in the select list. For example: `select unnest(col1), col1 from t`. 
+ // this extra projection is used to unnest transforming + inner_projection_exprs + .push(expr_in_unnest.clone().alias(placeholder_name.clone())); + let schema = input.schema(); + + let (data_type, _) = expr_in_unnest.data_type_and_nullable(schema)?; + + let outer_projection_columns = + get_unnested_columns(&placeholder_name, &data_type)?; + let expr = outer_projection_columns + .iter() + .map(|col| Expr::Column(col.0.clone())) + .collect::>(); + Ok(expr) + }; + // expr transformed maybe either the same, or different from the originals exprs + // for example: + // - unnest(struct_col) will be transformed into unnest(struct_col).field1, unnest(struct_col).field2 + // - unnest(array_col) will be transformed into unnest(array_col) + // - unnest(array_col) + 1 will be transformed into unnest(array_col) + 1 + + // Specifically handle root level unnest expr, this is the only place + // unnest on struct can be handled + if let Expr::Unnest(Unnest { expr: ref arg }) = original_expr { + return transform(&original_expr, arg); + } + let Transformed { + data: transformed_expr, + transformed, + tnr: _, + } = original_expr.transform_up(|expr: Expr| { + if let Expr::Unnest(Unnest { expr: ref arg }) = expr { + let (data_type, _) = expr.data_type_and_nullable(input.schema())?; + if let DataType::Struct(_) = data_type { + return internal_err!("unnest on struct can ony be applied at the root level of select expression"); + } + let transformed_exprs = transform(&expr, arg)?; + Ok(Transformed::yes(transformed_exprs[0].clone())) + } else { + Ok(Transformed::no(expr)) + } + })?; + + if !transformed { + if matches!(&transformed_expr, Expr::Column(_)) { + inner_projection_exprs.push(transformed_expr.clone()); + Ok(vec![transformed_expr]) + } else { + // We need to evaluate the expr in the inner projection, + // outer projection just select its name + let column_name = transformed_expr.display_name()?; + inner_projection_exprs.push(transformed_expr); + 
Ok(vec![Expr::Column(Column::from_name(column_name))]) + } + } else { + Ok(vec![transformed_expr]) + } +} + +// write test for recursive_transform_unnest +#[cfg(test)] +mod tests { + use std::{ops::Add, sync::Arc}; + + use super::*; + use arrow::datatypes::{DataType as ArrowDataType, Field, Schema}; + use arrow_schema::Fields; + use datafusion_common::DFSchema; + use datafusion_expr::EmptyRelation; + + #[test] + fn test_recursive_transform_unnest() -> Result<()> { + let schema = Schema::new(vec![ + Field::new( + "struct_col", + ArrowDataType::Struct(Fields::from(vec![ + Field::new("field1", ArrowDataType::Int32, false), + Field::new("field2", ArrowDataType::Int32, false), + ])), + false, + ), + Field::new( + "array_col", + ArrowDataType::List(Arc::new(Field::new("item", DataType::Int64, true))), + true, + ), + Field::new("int_col", ArrowDataType::Int32, false), + ]); + + let dfschema = DFSchema::try_from(schema)?; + + let input = LogicalPlan::EmptyRelation(EmptyRelation { + produce_one_row: false, + schema: Arc::new(dfschema), + }); + + let mut unnest_placeholder_columns = vec![]; + let mut inner_projection_exprs = vec![]; + + // unnest(struct_col) + let original_expr = Expr::Unnest(Unnest { + expr: Box::new(Expr::Column(Column::from_name("struct_col"))), + }); + let transformed_exprs = recursive_transform_unnest( + &input, + &mut unnest_placeholder_columns, + &mut inner_projection_exprs, + original_expr, + )?; + assert_eq!( + transformed_exprs, + vec![ + Expr::Column(Column::from_name("unnest(struct_col).field1")), + Expr::Column(Column::from_name("unnest(struct_col).field2")), + ] + ); + assert_eq!(unnest_placeholder_columns, vec!["unnest(struct_col)"]); + // still reference struct_col in original schema but with alias, + // to avoid colliding with the projection on the column itself if any + assert_eq!( + inner_projection_exprs, + vec![ + Expr::Column(Column::from_name("struct_col")).alias("unnest(struct_col)"), + ] + ); + + // unnest(array_col ) + 1 + let 
original_expr = Expr::Unnest(Unnest { + expr: Box::new(Expr::Column(Column::from_name("array_col"))), + }) + .add(Expr::Literal(ScalarValue::Int64(Some(1)))); + let transformed_exprs = recursive_transform_unnest( + &input, + &mut unnest_placeholder_columns, + &mut inner_projection_exprs, + original_expr, + )?; + assert_eq!( + unnest_placeholder_columns, + vec!["unnest(struct_col)", "unnest(array_col)"] + ); + // only transform the unnest children + assert_eq!( + transformed_exprs, + vec![Expr::Column(Column::from_name("unnest(array_col)")) + .add(Expr::Literal(ScalarValue::Int64(Some(1))))] + ); + + // keep appending to the current vector + // still reference array_col in original schema but with alias, + // to avoid colliding with the projection on the column itself if any + assert_eq!( + inner_projection_exprs, + vec![ + Expr::Column(Column::from_name("struct_col")).alias("unnest(struct_col)"), + Expr::Column(Column::from_name("array_col")).alias("unnest(array_col)"), + ] + ); + + Ok(()) + } +} From 390d4ac1d4b8fa7c5a6141097ada763d7984e590 Mon Sep 17 00:00:00 2001 From: Duong Cong Toai Date: Sun, 19 May 2024 22:03:56 +0200 Subject: [PATCH 20/21] a small integration test --- datafusion/sql/tests/sql_integration.rs | 38 +++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index af4dac5f3f89..8f3564d40747 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -2905,6 +2905,21 @@ impl ContextProvider for MockContextProvider { Field::new("Id", DataType::UInt32, false), Field::new("lower", DataType::UInt32, false), ])), + "unnest_table" => Ok(Schema::new(vec![ + Field::new( + "array_col", + DataType::List(Arc::new(Field::new("item", DataType::Int64, true))), + false, + ), + Field::new( + "struct_col", + DataType::Struct(Fields::from(vec![ + Field::new("field1", DataType::Int64, true), + Field::new("field2", DataType::Utf8, 
true), + ])), + false, + ), + ])), _ => plan_err!("No table named: {} found", name.table()), }; @@ -4707,6 +4722,29 @@ fn roundtrip_crossjoin() -> Result<()> { Ok(()) } +#[test] +fn test_unnest_logical_plan() -> Result<()> { + let query = "select unnest(struct_col), unnest(array_col), struct_col, array_col from unnest_table"; + + let dialect = GenericDialect {}; + let statement = Parser::new(&dialect) + .try_with_sql(query)? + .parse_statement()?; + + let context = MockContextProvider::default(); + let sql_to_rel = SqlToRel::new(&context); + let plan = sql_to_rel.sql_statement_to_plan(statement).unwrap(); + + let expected = "Projection: unnest(unnest_table.struct_col).field1, unnest(unnest_table.struct_col).field2, unnest(unnest_table.array_col), unnest_table.struct_col, unnest_table.array_col\ + \n Unnest: lists[unnest(unnest_table.array_col)] structs[unnest(unnest_table.struct_col)]\ + \n Projection: unnest_table.struct_col AS unnest(unnest_table.struct_col), unnest_table.array_col AS unnest(unnest_table.array_col), unnest_table.struct_col, unnest_table.array_col\ + \n TableScan: unnest_table"; + + assert_eq!(format!("{plan:?}"), expected); + + Ok(()) +} + #[cfg(test)] #[ctor::ctor] fn init() { From 7d5d268c540323fc77d27f2e3b4f788353de4da5 Mon Sep 17 00:00:00 2001 From: Duong Cong Toai Date: Wed, 22 May 2024 22:03:18 +0200 Subject: [PATCH 21/21] fix comment --- datafusion/expr/src/expr_fn.rs | 9 ++++- datafusion/expr/src/expr_schema.rs | 1 - datafusion/expr/src/logical_plan/builder.rs | 6 +++ datafusion/sql/src/utils.rs | 39 +++++++++---------- datafusion/sqllogictest/test_files/unnest.slt | 25 ++++++++++++ 5 files changed, 57 insertions(+), 23 deletions(-) diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs index 64763a973687..2a2bb75f1884 100644 --- a/datafusion/expr/src/expr_fn.rs +++ b/datafusion/expr/src/expr_fn.rs @@ -19,7 +19,7 @@ use crate::expr::{ AggregateFunction, BinaryExpr, Cast, Exists, GroupingSet, InList, InSubquery, - 
Placeholder, TryCast, + Placeholder, TryCast, Unnest, }; use crate::function::{ AccumulatorArgs, AccumulatorFactoryFunction, PartitionEvaluatorFactory, @@ -489,6 +489,13 @@ pub fn when(when: Expr, then: Expr) -> CaseBuilder { CaseBuilder::new(None, vec![when], vec![then], None) } +/// Create a Unnest expression +pub fn unnest(expr: Expr) -> Expr { + Expr::Unnest(Unnest { + expr: Box::new(expr), + }) +} + /// Convenience method to create a new user defined scalar function (UDF) with a /// specific signature and specific return type. /// diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index c910cfd03f4b..387f4f11e0c9 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -123,7 +123,6 @@ impl ExprSchemable for Expr { Ok(field.data_type().clone()) } DataType::Struct(_) => { - // TODO: this is not correct, because unnest(struct) wll result into multiple data_type Ok(arg_data_type) } DataType::Null => { diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index bed56a20904a..8483525d7f55 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -1595,6 +1595,12 @@ pub fn unnest(input: LogicalPlan, columns: Vec) -> Result { unnest_with_options(input, columns, UnnestOptions::default()) } +// Based on data type, either struct or a variant of list +// return a set of columns as the result of unnesting +// the input columns. 
+// For example, given a column with name "a", +// - List(Element) returns ["a"] with data type Element +// - Struct(field1, field2) returns ["a.field1","a.field2"] pub fn get_unnested_columns( col_name: &String, data_type: &DataType, diff --git a/datafusion/sql/src/utils.rs b/datafusion/sql/src/utils.rs index c58b2ba5ef1b..4ae486ef1a59 100644 --- a/datafusion/sql/src/utils.rs +++ b/datafusion/sql/src/utils.rs @@ -343,11 +343,12 @@ pub(crate) fn recursive_transform_unnest( mod tests { use std::{ops::Add, sync::Arc}; - use super::*; use arrow::datatypes::{DataType as ArrowDataType, Field, Schema}; use arrow_schema::Fields; - use datafusion_common::DFSchema; - use datafusion_expr::EmptyRelation; + use datafusion_common::{DFSchema, Result}; + use datafusion_expr::{col, lit, unnest, EmptyRelation, LogicalPlan}; + + use crate::utils::recursive_transform_unnest; #[test] fn test_recursive_transform_unnest() -> Result<()> { @@ -362,7 +363,11 @@ mod tests { ), Field::new( "array_col", - ArrowDataType::List(Arc::new(Field::new("item", DataType::Int64, true))), + ArrowDataType::List(Arc::new(Field::new( + "item", + ArrowDataType::Int64, + true, + ))), true, ), Field::new("int_col", ArrowDataType::Int32, false), @@ -379,9 +384,7 @@ mod tests { let mut inner_projection_exprs = vec![]; // unnest(struct_col) - let original_expr = Expr::Unnest(Unnest { - expr: Box::new(Expr::Column(Column::from_name("struct_col"))), - }); + let original_expr = unnest(col("struct_col")); let transformed_exprs = recursive_transform_unnest( &input, &mut unnest_placeholder_columns, @@ -391,8 +394,8 @@ mod tests { assert_eq!( transformed_exprs, vec![ - Expr::Column(Column::from_name("unnest(struct_col).field1")), - Expr::Column(Column::from_name("unnest(struct_col).field2")), + col("unnest(struct_col).field1"), + col("unnest(struct_col).field2"), ] ); assert_eq!(unnest_placeholder_columns, vec!["unnest(struct_col)"]); @@ -400,16 +403,11 @@ mod tests { // to avoid colliding with the projection on the 
column itself if any assert_eq!( inner_projection_exprs, - vec![ - Expr::Column(Column::from_name("struct_col")).alias("unnest(struct_col)"), - ] + vec![col("struct_col").alias("unnest(struct_col)"),] ); - // unnest(array_col ) + 1 - let original_expr = Expr::Unnest(Unnest { - expr: Box::new(Expr::Column(Column::from_name("array_col"))), - }) - .add(Expr::Literal(ScalarValue::Int64(Some(1)))); + // unnest(array_col) + 1 + let original_expr = unnest(col("array_col")).add(lit(1i64)); let transformed_exprs = recursive_transform_unnest( &input, &mut unnest_placeholder_columns, @@ -423,8 +421,7 @@ mod tests { // only transform the unnest children assert_eq!( transformed_exprs, - vec![Expr::Column(Column::from_name("unnest(array_col)")) - .add(Expr::Literal(ScalarValue::Int64(Some(1))))] + vec![col("unnest(array_col)").add(lit(1i64))] ); // keep appending to the current vector @@ -433,8 +430,8 @@ mod tests { assert_eq!( inner_projection_exprs, vec![ - Expr::Column(Column::from_name("struct_col")).alias("unnest(struct_col)"), - Expr::Column(Column::from_name("array_col")).alias("unnest(array_col)"), + col("struct_col").alias("unnest(struct_col)"), + col("array_col").alias("unnest(array_col)") ] ); diff --git a/datafusion/sqllogictest/test_files/unnest.slt b/datafusion/sqllogictest/test_files/unnest.slt index 5d1cbc7aa6a7..7b7249d6d5fa 100644 --- a/datafusion/sqllogictest/test_files/unnest.slt +++ b/datafusion/sqllogictest/test_files/unnest.slt @@ -30,6 +30,13 @@ AS VALUES (null, null, 4, [17, 18], null) ; +statement ok +CREATE TABLE nested_unnest_table +AS VALUES + (struct('a', 'b', struct('c')), (struct('a', 'b', [10,20]))), + (struct('d', 'e', struct('f')), (struct('x', 'y', [30,40, 50]))) +; + ## Basic unnest expression in select list query I select unnest([1,2,3]); @@ -415,8 +422,26 @@ select unnest(array_remove(column1, 3)) - 1 as c1, column3 from unnest_table; 5 3 11 NULL +## unnest for nested struct(struct) +query TT? 
+select unnest(column1) from nested_unnest_table; +---- +a b {c0: c} +d e {c0: f} + +## unnest for nested(struct(list)) +query TT? +select unnest(column2) from nested_unnest_table; +---- +a b [10, 20] +x y [30, 40, 50] + query error DataFusion error: type_coercion\ncaused by\nThis feature is not implemented: Unnest should be rewritten to LogicalPlan::Unnest before type coercion select sum(unnest(generate_series(1,10))); +## TODO: support unnest as a child expr +query error DataFusion error: Internal error: unnest on struct can ony be applied at the root level of select expression +select arrow_typeof(unnest(column5)) from unnest_table; + statement ok drop table unnest_table;