diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index a96caa03d611..3872f959c315 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -231,6 +231,67 @@ impl LogicalPlanBuilder { Ok(Self::from(LogicalPlan::Values(Values { schema, values }))) } + /// Create a values list based relation using the provided column datatypes. + /// + /// By default, it assigns the names column1, column2, etc. to the columns of a VALUES table. + /// The column names are not specified by the SQL standard and different database systems do it differently, + /// so it's usually better to override the default names with a table alias list. + /// + /// If the values include params/binders such as $1, $2, $3, etc, then the `param_data_types` should be provided. + pub fn values_with_types( + mut values: Vec>, + field_types: &[DataType], + ) -> Result { + if values.is_empty() { + return plan_err!("Values list cannot be empty"); + } + let n_cols = values[0].len(); + if n_cols == 0 { + return plan_err!("Values list cannot be zero length"); + } + if n_cols != field_types.len() { + return plan_err!( + "Values list does not match the provided number of data types: got {} values but expected {}", n_cols, + field_types.len() + ); + } + for (i, row) in values.iter().enumerate() { + if row.len() != n_cols { + return plan_err!( + "Inconsistent data length across values list: got {} values in row {} but expected {}", + row.len(), + i, + n_cols + ); + } + } + + let empty_schema = DFSchema::empty(); + // wrap cast if data type is not same as common type. + for row in &mut values { + for (j, field_type) in field_types.iter().enumerate() { + if let Expr::Literal(ScalarValue::Null) = row[j] { + row[j] = Expr::Literal(ScalarValue::try_from(field_type.clone())?); + } else { + row[j] = + std::mem::take(&mut row[j]).cast_to(field_type, &empty_schema)?; + } + } + } + let fields = field_types + .iter() + .enumerate() + .map(|(j, data_type)| { + // naming is following convention https://www.postgresql.org/docs/current/queries-values.html + let name = &format!("column{}", j + 1); + Field::new(name, data_type.clone(), true) + }) + .collect::>(); + let dfschema = DFSchema::from_unqualified_fields(fields.into(), HashMap::new())?; + let schema = DFSchemaRef::new(dfschema); + Ok(Self::from(LogicalPlan::Values(Values { schema, values }))) + } + /// Convert a table provider into a builder with a TableScan /// /// Note that if you pass a string as `table_name`, it is treated diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index 5cbe1d7c014a..558aca29dffc 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -130,6 +130,8 @@ pub struct PlannerContext { /// Data types for numbered parameters ($1, $2, etc), if supplied /// in `PREPARE` statement prepare_param_data_types: Arc>, + /// Column types for VALUES tuples during an INSERT. + values_column_data_types: Arc>, /// Map of CTE name to logical plan of the WITH clause. /// Use `Arc` to allow cheap cloning ctes: HashMap>, @@ -151,6 +153,7 @@ impl PlannerContext { pub fn new() -> Self { Self { prepare_param_data_types: Arc::new(vec![]), + values_column_data_types: Arc::new(vec![]), ctes: HashMap::new(), outer_query_schema: None, outer_from_schema: None, @@ -166,6 +169,14 @@ impl PlannerContext { self } + pub fn with_values_column_data_types( + mut self, + values_column_data_types: Vec, + ) -> Self { + self.values_column_data_types = values_column_data_types.into(); + self + } + // return a reference to the outer queries schema pub fn outer_query_schema(&self) -> Option<&DFSchema> { self.outer_query_schema.as_ref().map(|s| s.as_ref()) @@ -209,6 +220,11 @@ impl PlannerContext { &self.prepare_param_data_types } + /// Return the types of columns in VALUES tuples if known + pub fn values_column_data_types(&self) -> &[DataType] { + &self.values_column_data_types + } + /// returns true if there is a Common Table Expression (CTE) / /// Subquery for the specified name pub fn contains_cte(&self, cte_name: &str) -> bool { diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs index e75a96e78d48..093061edbf55 100644 --- a/datafusion/sql/src/statement.rs +++ b/datafusion/sql/src/statement.rs @@ -1468,10 +1468,15 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { } } let prepare_param_data_types = prepare_param_data_types.into_values().collect(); + let values_column_data_types = fields + .iter() + .map(|f| f.data_type().clone()) + .collect::>(); // Projection - let mut planner_context = - PlannerContext::new().with_prepare_param_data_types(prepare_param_data_types); + let mut planner_context = PlannerContext::new() + .with_prepare_param_data_types(prepare_param_data_types) + .with_values_column_data_types(values_column_data_types); let source = self.query_to_plan(*source, &mut planner_context)?; if fields.len() != source.schema().fields().len() { plan_err!("Column count doesn't match insert query!")?; diff --git a/datafusion/sql/src/values.rs b/datafusion/sql/src/values.rs index 9efb75bd60e4..5557dfe4b4b7 100644 --- a/datafusion/sql/src/values.rs +++ b/datafusion/sql/src/values.rs @@ -41,6 +41,11 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { .collect::>>() }) .collect::>>()?; - LogicalPlanBuilder::values(values)?.build() + let column_types = planner_context.values_column_data_types(); + if column_types.is_empty() { + LogicalPlanBuilder::values(values)?.build() + } else { + LogicalPlanBuilder::values_with_types(values, column_types)?.build() + } } } diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index 5685e09c9c9f..f6711dde88cb 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -3195,7 +3195,7 @@ fn lateral_left_join() { #[test] fn lateral_nested_left_join() { - let sql = "SELECT * FROM + let sql = "SELECT * FROM j1, \ (j2 LEFT JOIN LATERAL (SELECT * FROM j3 WHERE j1_id + j2_id = j3_id) AS j3 ON(true))"; let expected = "Projection: *\ diff --git a/datafusion/sqllogictest/test_files/insert.slt b/datafusion/sqllogictest/test_files/insert.slt index 230ea4d98fc3..b253566551c0 100644 --- a/datafusion/sqllogictest/test_files/insert.slt +++ b/datafusion/sqllogictest/test_files/insert.slt @@ -255,7 +255,7 @@ insert into table_without_values(id, id) values(3, 3); statement error Arrow error: Cast error: Cannot cast string 'zoo' to value of Int64 type insert into table_without_values(name, id) values(4, 'zoo'); -statement error Error during planning: Column count doesn't match insert query! +statement error Error during planning: Values list does not match the provided number of data types: got 2 values but expected 1 insert into table_without_values(id) values(4, 'zoo'); # insert NULL values for the missing column (name) @@ -433,3 +433,17 @@ drop table test_column_defaults statement error DataFusion error: Error during planning: Column reference is not allowed in the DEFAULT expression : Schema error: No field named a. create table test_column_defaults(a int, b int default a+1) + +# test value casting +statement ok +create table test_column_insert_cast( + a BIGINT UNSIGNED +); + +query I +insert into test_column_insert_cast(a) values (0), (12775823699315690233); +---- +2 + +statement ok +drop table test_column_insert_cast diff --git a/datafusion/sqllogictest/test_files/insert_to_external.slt b/datafusion/sqllogictest/test_files/insert_to_external.slt index c40f62c3ba80..9c92519941ee 100644 --- a/datafusion/sqllogictest/test_files/insert_to_external.slt +++ b/datafusion/sqllogictest/test_files/insert_to_external.slt @@ -500,7 +500,7 @@ insert into table_without_values(id, id) values(3, 3); statement error Arrow error: Cast error: Cannot cast string 'zoo' to value of Int64 type insert into table_without_values(name, id) values(4, 'zoo'); -statement error Error during planning: Column count doesn't match insert query! +statement error Error during planning: Values list does not match the provided number of data types: got 2 values but expected 1 insert into table_without_values(id) values(4, 'zoo'); # insert NULL values for the missing column (name)