diff --git a/datafusion/sql/src/expr/function.rs b/datafusion/sql/src/expr/function.rs index 68cba15634d5..3adf2960784d 100644 --- a/datafusion/sql/src/expr/function.rs +++ b/datafusion/sql/src/expr/function.rs @@ -150,6 +150,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { planner_context, // Numeric literals in window function ORDER BY are treated as constants false, + None, )?; let func_deps = schema.functional_dependencies(); @@ -219,8 +220,13 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { } else { // User defined aggregate functions (UDAF) have precedence in case it has the same name as a scalar built-in function if let Some(fm) = self.context_provider.get_aggregate_meta(&name) { - let order_by = - self.order_by_to_sort_expr(&order_by, schema, planner_context, true)?; + let order_by = self.order_by_to_sort_expr( + &order_by, + schema, + planner_context, + true, + None, + )?; let order_by = (!order_by.is_empty()).then_some(order_by); let args = self.function_args_to_expr(args, schema, planner_context)?; // TODO: Support filter and distinct for UDAFs @@ -236,8 +242,13 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // next, aggregate built-ins if let Ok(fun) = AggregateFunction::from_str(&name) { - let order_by = - self.order_by_to_sort_expr(&order_by, schema, planner_context, true)?; + let order_by = self.order_by_to_sort_expr( + &order_by, + schema, + planner_context, + true, + None, + )?; let order_by = (!order_by.is_empty()).then_some(order_by); let args = self.function_args_to_expr(args, schema, planner_context)?; let filter: Option> = filter diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs index 13f559a0ebc7..ed5421edfbb0 100644 --- a/datafusion/sql/src/expr/mod.rs +++ b/datafusion/sql/src/expr/mod.rs @@ -699,6 +699,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { input_schema, planner_context, true, + None, )?) } else { None diff --git a/datafusion/sql/src/expr/order_by.rs b/datafusion/sql/src/expr/order_by.rs index 4ccdf6c2d418..4dd81517e958 100644 --- a/datafusion/sql/src/expr/order_by.rs +++ b/datafusion/sql/src/expr/order_by.rs @@ -24,16 +24,39 @@ use sqlparser::ast::{Expr as SQLExpr, OrderByExpr, Value}; impl<'a, S: ContextProvider> SqlToRel<'a, S> { /// Convert sql [OrderByExpr] to `Vec`. /// - /// If `literal_to_column` is true, treat any numeric literals (e.g. `2`) as a 1 based index - /// into the SELECT list (e.g. `SELECT a, b FROM table ORDER BY 2`). + /// `input_schema` and `additional_schema` are used to resolve column references in the order-by expressions. + /// `input_schema` is the schema of the input logical plan, typically derived from the SELECT list. + /// + /// Usually order-by expressions can only reference the input plan's columns. + /// But the `SELECT ... FROM ... ORDER BY ...` syntax is a special case. Besides the input schema, + /// it can reference an `additional_schema` derived from the `FROM` clause. + /// + /// If `literal_to_column` is true, treat any numeric literals (e.g. `2`) as a 1 based index into the + /// SELECT list (e.g. `SELECT a, b FROM table ORDER BY 2`). Literals only reference the `input_schema`. + /// /// If false, interpret numeric literals as constant values. pub(crate) fn order_by_to_sort_expr( &self, exprs: &[OrderByExpr], - schema: &DFSchema, + input_schema: &DFSchema, planner_context: &mut PlannerContext, literal_to_column: bool, + additional_schema: Option<&DFSchema>, ) -> Result> { + if exprs.is_empty() { + return Ok(vec![]); + } + + let mut combined_schema; + let order_by_schema = match additional_schema { + Some(schema) => { + combined_schema = input_schema.clone(); + combined_schema.merge(schema); + &combined_schema + } + None => input_schema, + }; + let mut expr_vec = vec![]; for e in exprs { let OrderByExpr { @@ -52,17 +75,23 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { return plan_err!( "Order by index starts at 1 for column indexes" ); - } else if schema.fields().len() < field_index { + } else if input_schema.fields().len() < field_index { return plan_err!( "Order by column out of bounds, specified: {}, max: {}", field_index, - schema.fields().len() + input_schema.fields().len() ); } - Expr::Column(Column::from(schema.qualified_field(field_index - 1))) + Expr::Column(Column::from( + input_schema.qualified_field(field_index - 1), + )) } - e => self.sql_expr_to_logical_expr(e.clone(), schema, planner_context)?, + e => self.sql_expr_to_logical_expr( + e.clone(), + order_by_schema, + planner_context, + )?, }; let asc = asc.unwrap_or(true); expr_vec.push(Expr::Sort(Sort::new( diff --git a/datafusion/sql/src/query.rs b/datafusion/sql/src/query.rs index d5d3bcc4a13b..fdc739646a46 100644 --- a/datafusion/sql/src/query.rs +++ b/datafusion/sql/src/query.rs @@ -25,7 +25,7 @@ use datafusion_expr::{ Operator, }; use sqlparser::ast::{ - Expr as SQLExpr, Offset as SQLOffset, OrderByExpr, Query, SetExpr, Value, + Expr as SQLExpr, Offset as SQLOffset, Query, SelectInto, SetExpr, Value, }; impl<'a, S: ContextProvider> SqlToRel<'a, S> { @@ -46,29 +46,35 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { query: Query, planner_context: &mut PlannerContext, ) -> Result { - let mut set_expr = query.body; if let Some(with) = query.with { self.plan_with_clause(with, planner_context)?; } - // Take the `SelectInto` for later processing. - let select_into = match set_expr.as_mut() { - SetExpr::Select(select) => select.into.take(), - _ => None, - }; - let plan = self.set_expr_to_plan(*set_expr, planner_context)?; - let plan = self.order_by(plan, query.order_by, planner_context)?; - let mut plan = self.limit(plan, query.offset, query.limit)?; - if let Some(into) = select_into { - plan = LogicalPlan::Ddl(DdlStatement::CreateMemoryTable(CreateMemoryTable { - name: self.object_name_to_table_reference(into.name)?, - constraints: Constraints::empty(), - input: Arc::new(plan), - if_not_exists: false, - or_replace: false, - column_defaults: vec![], - })) + + let set_expr = *query.body; + match set_expr { + SetExpr::Select(mut select) => { + let select_into = select.into.take(); + // Order-by expressions may refer to columns in the `FROM` clause, + // so we need to process `SELECT` and `ORDER BY` together. + let plan = + self.select_to_plan(*select, query.order_by, planner_context)?; + let plan = self.limit(plan, query.offset, query.limit)?; + // Process the `SELECT INTO` after `LIMIT`. + self.select_into(plan, select_into) + } + other => { + let plan = self.set_expr_to_plan(other, planner_context)?; + let order_by_rex = self.order_by_to_sort_expr( + &query.order_by, + plan.schema(), + planner_context, + true, + None, + )?; + let plan = self.order_by(plan, order_by_rex)?; + self.limit(plan, query.offset, query.limit) + } } - Ok(plan) } /// Wrap a plan in a limit @@ -114,26 +120,43 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { } /// Wrap the logical in a sort - fn order_by( + pub(super) fn order_by( &self, plan: LogicalPlan, - order_by: Vec, - planner_context: &mut PlannerContext, + order_by: Vec, ) -> Result { if order_by.is_empty() { return Ok(plan); } - let order_by_rex = - self.order_by_to_sort_expr(&order_by, plan.schema(), planner_context, true)?; - if let LogicalPlan::Distinct(Distinct::On(ref distinct_on)) = plan { // In case of `DISTINCT ON` we must capture the sort expressions since during the plan // optimization we're effectively doing a `first_value` aggregation according to them. - let distinct_on = distinct_on.clone().with_sort_expr(order_by_rex)?; + let distinct_on = distinct_on.clone().with_sort_expr(order_by)?; Ok(LogicalPlan::Distinct(Distinct::On(distinct_on))) } else { - LogicalPlanBuilder::from(plan).sort(order_by_rex)?.build() + LogicalPlanBuilder::from(plan).sort(order_by)?.build() + } + } + + /// Wrap the logical plan in a `SelectInto` + fn select_into( + &self, + plan: LogicalPlan, + select_into: Option, + ) -> Result { + match select_into { + Some(into) => Ok(LogicalPlan::Ddl(DdlStatement::CreateMemoryTable( + CreateMemoryTable { + name: self.object_name_to_table_reference(into.name)?, + constraints: Constraints::empty(), + input: Arc::new(plan), + if_not_exists: false, + or_replace: false, + column_defaults: vec![], + }, + ))), + _ => Ok(plan), } } } diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs index fdcef0ef6acc..730e84cd094b 100644 --- a/datafusion/sql/src/select.rs +++ b/datafusion/sql/src/select.rs @@ -29,7 +29,7 @@ use datafusion_common::{not_impl_err, plan_err, DataFusionError, Result}; use datafusion_common::{Column, UnnestOptions}; use datafusion_expr::expr::{Alias, Unnest}; use datafusion_expr::expr_rewriter::{ - normalize_col, normalize_col_with_schemas_and_ambiguity_check, + normalize_col, normalize_col_with_schemas_and_ambiguity_check, normalize_cols, }; use datafusion_expr::utils::{ expand_qualified_wildcard, expand_wildcard, expr_as_column_expr, expr_to_columns, @@ -39,8 +39,8 @@ use datafusion_expr::{ Expr, Filter, GroupingSet, LogicalPlan, LogicalPlanBuilder, Partitioning, }; use sqlparser::ast::{ - Distinct, Expr as SQLExpr, GroupByExpr, ReplaceSelectItem, WildcardAdditionalOptions, - WindowType, + Distinct, Expr as SQLExpr, GroupByExpr, OrderByExpr, ReplaceSelectItem, + WildcardAdditionalOptions, WindowType, }; use sqlparser::ast::{NamedWindowDefinition, Select, SelectItem, TableWithJoins}; @@ -49,6 +49,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { pub(super) fn select_to_plan( &self, mut select: Select, + order_by: Vec, planner_context: &mut PlannerContext, ) -> Result { // check for unsupported syntax first @@ -94,6 +95,17 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let mut combined_schema = base_plan.schema().as_ref().clone(); combined_schema.merge(projected_plan.schema()); + // Order-by expressions prioritize referencing columns from the select list, + // then from the FROM clause. + let order_by_rex = self.order_by_to_sort_expr( + &order_by, + projected_plan.schema().as_ref(), + planner_context, + true, + Some(base_plan.schema().as_ref()), + )?; + let order_by_rex = normalize_cols(order_by_rex, &projected_plan)?; + // this alias map is resolved and looked up in both having exprs and group by exprs let alias_map = extract_aliases(&select_exprs); @@ -248,9 +260,9 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { .collect::>>()?; // Build the final plan - return LogicalPlanBuilder::from(base_plan) + LogicalPlanBuilder::from(base_plan) .distinct_on(on_expr, select_exprs, None)? - .build(); + .build() } }?; @@ -274,6 +286,8 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { plan }; + let plan = self.order_by(plan, order_by_rex)?; + Ok(plan) } diff --git a/datafusion/sql/src/set_expr.rs b/datafusion/sql/src/set_expr.rs index cbe41c33c729..248aad846996 100644 --- a/datafusion/sql/src/set_expr.rs +++ b/datafusion/sql/src/set_expr.rs @@ -27,7 +27,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { planner_context: &mut PlannerContext, ) -> Result { match set_expr { - SetExpr::Select(s) => self.select_to_plan(*s, planner_context), + SetExpr::Select(s) => self.select_to_plan(*s, vec![], planner_context), SetExpr::Values(v) => self.sql_values_to_plan(v, planner_context), SetExpr::SetOperation { op, diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs index c81217aa7017..137bb5fb20b7 100644 --- a/datafusion/sql/src/statement.rs +++ b/datafusion/sql/src/statement.rs @@ -942,7 +942,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { for expr in order_exprs { // Convert each OrderByExpr to a SortExpr: let expr_vec = - self.order_by_to_sort_expr(&expr, schema, planner_context, true)?; + self.order_by_to_sort_expr(&expr, schema, planner_context, true, None)?; // Verify that columns of all SortExprs exist in the schema: for expr in expr_vec.iter() { for column in expr.to_columns()?.iter() { diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index 319aa5b5fd30..0af060496312 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -3646,7 +3646,7 @@ fn test_select_distinct_order_by() { let sql = "SELECT distinct '1' from person order by id"; let expected = - "Error during planning: For SELECT DISTINCT, ORDER BY expressions id must appear in select list"; + "Error during planning: For SELECT DISTINCT, ORDER BY expressions person.id must appear in select list"; // It should return error. let result = logical_plan(sql); diff --git a/datafusion/sqllogictest/test_files/joins.slt b/datafusion/sqllogictest/test_files/joins.slt index 5ef33307b521..0844cb94ace6 100644 --- a/datafusion/sqllogictest/test_files/joins.slt +++ b/datafusion/sqllogictest/test_files/joins.slt @@ -2010,7 +2010,7 @@ set datafusion.explain.logical_plan_only = false; statement ok set datafusion.execution.target_partitions = 4; -# Planning inner nested loop join +# Planning inner nested loop join # inputs are swapped due to inexact statistics + join reordering caused additional projection query TT diff --git a/datafusion/sqllogictest/test_files/order.slt b/datafusion/sqllogictest/test_files/order.slt index 4121de91cb8d..5ff7df50ad38 100644 --- a/datafusion/sqllogictest/test_files/order.slt +++ b/datafusion/sqllogictest/test_files/order.slt @@ -374,11 +374,11 @@ select * from t SORT BY time; # distinct on a column not in the select list should not work -statement error For SELECT DISTINCT, ORDER BY expressions time must appear in select list +statement error DataFusion error: Error during planning: For SELECT DISTINCT, ORDER BY expressions t\.time must appear in select list SELECT DISTINCT value FROM t ORDER BY time; # distinct on an expression of a column not in the select list should not work -statement error For SELECT DISTINCT, ORDER BY expressions time must appear in select list +statement error DataFusion error: Error during planning: For SELECT DISTINCT, ORDER BY expressions t\.time must appear in select list SELECT DISTINCT date_trunc('hour', time) FROM t ORDER BY time; # distinct on a column that is in the select list but aliasted should work @@ -888,6 +888,69 @@ select column2, column1 from foo ORDER BY column2 desc, column1 desc; [1] 4 [0] 2 +# Test issue: https://github.com/apache/datafusion/issues/10013 +# There is both a HAVING clause and an ORDER BY clause in the query. +query I +SELECT + SUM(column1) +FROM foo +HAVING SUM(column1) > 0 +ORDER BY SUM(column1) +---- +16 + +# ORDER BY with a GROUP BY clause +query I +SELECT SUM(column1) + FROM foo +GROUP BY column2 +ORDER BY SUM(column1) +---- +0 +2 +2 +2 +3 +3 +4 + +# ORDER BY with a GROUP BY clause and a HAVING clause +query I +SELECT + SUM(column1) +FROM foo +GROUP BY column2 +HAVING SUM(column1) < 3 +ORDER BY SUM(column1) +---- +0 +2 +2 +2 + +# ORDER BY without a HAVING clause +query I +SELECT SUM(column1) FROM foo ORDER BY SUM(column1) +---- +16 + +# Order by unprojected aggregate expressions is not supported +query error DataFusion error: This feature is not implemented: Physical plan does not support logical expression AggregateFunction +SELECT column2 FROM foo ORDER BY SUM(column1) + +statement ok +create table ambiguity_test(a int, b int) as values (1,20), (2,10); + +# Order-by expressions prioritize referencing columns from the select list. +query II +select a as b, b as c2 from ambiguity_test order by b +---- +1 20 +2 10 + # Cleanup statement ok drop table foo; + +statement ok +drop table ambiguity_test; diff --git a/datafusion/sqllogictest/test_files/select.slt b/datafusion/sqllogictest/test_files/select.slt index a3a4b3bfc584..092c79e361e6 100644 --- a/datafusion/sqllogictest/test_files/select.slt +++ b/datafusion/sqllogictest/test_files/select.slt @@ -488,7 +488,7 @@ select '1' from foo order by column1; 1 # foo distinct order by -statement error DataFusion error: Error during planning: For SELECT DISTINCT, ORDER BY expressions column1 must appear in select list +statement error DataFusion error: Error during planning: For SELECT DISTINCT, ORDER BY expressions foo\.column1 must appear in select list select distinct '1' from foo order by column1; # distincts for float nan