Move sql_compound_identifier_to_expr to ExprPlanner #11487

Merged: 20 commits, Jul 21, 2024
Changes from 18 commits
147 changes: 146 additions & 1 deletion datafusion/core/tests/optimizer_integration.rs
@@ -44,6 +44,8 @@ use datafusion_sql::sqlparser::parser::Parser;
use datafusion_sql::TableReference;

use chrono::DateTime;
use datafusion_expr::planner::ExprPlanner;
use datafusion_functions::core::planner::CoreFunctionPlanner;
use datafusion_functions::datetime;

#[cfg(test)]
@@ -115,6 +117,139 @@ fn concat_ws_literals() -> Result<()> {
Ok(())
}

#[test]
fn anti_join_with_join_filter() -> Result<()> {
Contributor:
Why were these tests moved? Does that mean we can't plan SQL with identifiers like test.col_int32 without an ExprPlanner?

Contributor Author:
In the earlier implementation I had also moved planning of basic SQL, such as column references, into the expr planner, so we had to move these test cases from the optimizer crate to core to avoid the optimizer crate depending on the functions crate.

Since we have moved that back out of the planner, I should have moved the test cases back as well. My bad, I missed them. Thanks for catching this. I have fixed it.

// regression test for https://github.com/apache/datafusion/issues/2888
let sql = "SELECT col_utf8 FROM test WHERE NOT EXISTS (\
SELECT col_utf8 FROM test t2 WHERE test.col_int32 = t2.col_int32 \
AND test.col_uint32 != t2.col_uint32)";
let plan = test_sql(sql)?;
let expected = "Projection: test.col_utf8\
\n LeftAnti Join: test.col_int32 = __correlated_sq_1.col_int32 Filter: test.col_uint32 != __correlated_sq_1.col_uint32\
\n TableScan: test projection=[col_int32, col_uint32, col_utf8]\
\n SubqueryAlias: __correlated_sq_1\
\n SubqueryAlias: t2\
\n TableScan: test projection=[col_int32, col_uint32]";
assert_eq!(expected, format!("{plan:?}"));
Ok(())
}
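For illustration of the distinction discussed in the thread above (not part of this PR's diff): a plain qualified column still plans without any ExprPlanner, because plan_compound_identifier is only invoked when non-empty nested names remain after resolving the qualifier and field. The struct_col / field_a names below are hypothetical, invented for the example.

// Illustration only (not from this PR): which identifiers need an ExprPlanner.
// `struct_col` / `field_a` are hypothetical names used for the example.
fn example_queries() -> (&'static str, &'static str) {
    // Plans without any ExprPlanner: qualifier + column, no nested names remain.
    let plain = "SELECT test.col_int32 FROM test";
    // Goes through plan_compound_identifier (CoreFunctionPlanner rewrites it to
    // get_field): `field_a` is a nested name on the struct column `struct_col`.
    let nested = "SELECT test.struct_col.field_a FROM test";
    (plain, nested)
}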

#[test]
fn where_exists_distinct() -> Result<()> {
let sql = "SELECT col_int32 FROM test WHERE EXISTS (\
SELECT DISTINCT col_int32 FROM test t2 WHERE test.col_int32 = t2.col_int32)";
let plan = test_sql(sql)?;
let expected = "LeftSemi Join: test.col_int32 = __correlated_sq_1.col_int32\
\n TableScan: test projection=[col_int32]\
\n SubqueryAlias: __correlated_sq_1\
\n Aggregate: groupBy=[[t2.col_int32]], aggr=[[]]\
\n SubqueryAlias: t2\
\n TableScan: test projection=[col_int32]";
assert_eq!(expected, format!("{plan:?}"));
Ok(())
}

#[test]
fn propagate_empty_relation() {
let sql = "SELECT test.col_int32 FROM test JOIN ( SELECT col_int32 FROM test WHERE false ) AS ta1 ON test.col_int32 = ta1.col_int32;";
let plan = test_sql(sql).unwrap();
// when a child is an EmptyRelation, it propagates bottom-up.
let expected = "EmptyRelation";
assert_eq!(expected, format!("{plan:?}"));
}

#[test]
fn join_keys_in_subquery_alias() {
let sql = "SELECT * FROM test AS A, ( SELECT col_int32 as key FROM test ) AS B where A.col_int32 = B.key;";
let plan = test_sql(sql).unwrap();
let expected = "Inner Join: a.col_int32 = b.key\
\n SubqueryAlias: a\
\n Filter: test.col_int32 IS NOT NULL\
\n TableScan: test projection=[col_int32, col_uint32, col_utf8, col_date32, col_date64, col_ts_nano_none, col_ts_nano_utc]\
\n SubqueryAlias: b\
\n Projection: test.col_int32 AS key\
\n Filter: test.col_int32 IS NOT NULL\
\n TableScan: test projection=[col_int32]";

assert_eq!(expected, format!("{plan:?}"));
}

#[test]
fn join_keys_in_subquery_alias_1() {
let sql = "SELECT * FROM test AS A, ( SELECT test.col_int32 AS key FROM test JOIN test AS C on test.col_int32 = C.col_int32 ) AS B where A.col_int32 = B.key;";
let plan = test_sql(sql).unwrap();
let expected = "Inner Join: a.col_int32 = b.key\
\n SubqueryAlias: a\
\n Filter: test.col_int32 IS NOT NULL\
\n TableScan: test projection=[col_int32, col_uint32, col_utf8, col_date32, col_date64, col_ts_nano_none, col_ts_nano_utc]\
\n SubqueryAlias: b\
\n Projection: test.col_int32 AS key\
\n Inner Join: test.col_int32 = c.col_int32\
\n Filter: test.col_int32 IS NOT NULL\
\n TableScan: test projection=[col_int32]\
\n SubqueryAlias: c\
\n Filter: test.col_int32 IS NOT NULL\
\n TableScan: test projection=[col_int32]";
assert_eq!(expected, format!("{plan:?}"));
}

#[test]
// issue: https://github.com/apache/datafusion/issues/5334
fn test_same_name_but_not_ambiguous() {
let sql = "SELECT t1.col_int32 AS col_int32 FROM test t1 intersect SELECT col_int32 FROM test t2";
let plan = test_sql(sql).unwrap();
let expected = "LeftSemi Join: t1.col_int32 = t2.col_int32\
\n Aggregate: groupBy=[[t1.col_int32]], aggr=[[]]\
\n SubqueryAlias: t1\
\n TableScan: test projection=[col_int32]\
\n SubqueryAlias: t2\
\n TableScan: test projection=[col_int32]";
assert_eq!(expected, format!("{plan:?}"));
}

#[test]
fn test_propagate_empty_relation_inner_join_and_unions() {
let sql = "\
SELECT A.col_int32 FROM test AS A \
INNER JOIN ( \
SELECT col_int32 FROM test WHERE 1 = 0 \
) AS B ON A.col_int32 = B.col_int32 \
UNION ALL \
SELECT test.col_int32 FROM test WHERE 1 = 1 \
UNION ALL \
SELECT test.col_int32 FROM test WHERE 0 = 0 \
UNION ALL \
SELECT test.col_int32 FROM test WHERE test.col_int32 < 0 \
UNION ALL \
SELECT test.col_int32 FROM test WHERE 1 = 0";

let plan = test_sql(sql).unwrap();
let expected = "\
Union\
\n TableScan: test projection=[col_int32]\
\n TableScan: test projection=[col_int32]\
\n Filter: test.col_int32 < Int32(0)\
\n TableScan: test projection=[col_int32]";
assert_eq!(expected, format!("{plan:?}"));
}

#[test]
fn semi_join_with_join_filter() -> Result<()> {
// regression test for https://github.com/apache/datafusion/issues/2888
let sql = "SELECT col_utf8 FROM test WHERE EXISTS (\
SELECT col_utf8 FROM test t2 WHERE test.col_int32 = t2.col_int32 \
AND test.col_uint32 != t2.col_uint32)";
let plan = test_sql(sql)?;
let expected = "Projection: test.col_utf8\
\n LeftSemi Join: test.col_int32 = __correlated_sq_1.col_int32 Filter: test.col_uint32 != __correlated_sq_1.col_uint32\
\n TableScan: test projection=[col_int32, col_uint32, col_utf8]\
\n SubqueryAlias: __correlated_sq_1\
\n SubqueryAlias: t2\
\n TableScan: test projection=[col_int32, col_uint32]";
assert_eq!(expected, format!("{plan:?}"));
Ok(())
}

fn quick_test(sql: &str, expected_plan: &str) {
let plan = test_sql(sql).unwrap();
assert_eq!(expected_plan, format!("{:?}", plan));
@@ -131,7 +266,8 @@ fn test_sql(sql: &str) -> Result<LogicalPlan> {
.with_udf(datetime::now())
.with_udf(datafusion_functions::core::arrow_cast())
.with_udf(datafusion_functions::string::concat())
.with_udf(datafusion_functions::string::concat_ws());
.with_udf(datafusion_functions::string::concat_ws())
.with_expr_planner(Arc::new(CoreFunctionPlanner::default()));
let sql_to_rel = SqlToRel::new(&context_provider);
let plan = sql_to_rel.sql_statement_to_plan(statement.clone()).unwrap();

@@ -151,13 +287,18 @@ fn test_sql(sql: &str) -> Result<LogicalPlan> {
struct MyContextProvider {
options: ConfigOptions,
udfs: HashMap<String, Arc<ScalarUDF>>,
expr_planners: Vec<Arc<dyn ExprPlanner>>,
}

impl MyContextProvider {
fn with_udf(mut self, udf: Arc<ScalarUDF>) -> Self {
self.udfs.insert(udf.name().to_string(), udf);
self
}
fn with_expr_planner(mut self, planner: Arc<dyn ExprPlanner>) -> Self {
self.expr_planners.push(planner);
self
}
}

impl ContextProvider for MyContextProvider {
@@ -226,6 +367,10 @@ impl ContextProvider for MyContextProvider {
fn udwf_names(&self) -> Vec<String> {
Vec::new()
}

fn get_expr_planners(&self) -> &[Arc<dyn ExprPlanner>] {
&self.expr_planners
}
}

struct MyTableSource {
19 changes: 18 additions & 1 deletion datafusion/expr/src/planner.rs
@@ -19,7 +19,7 @@

use std::sync::Arc;

use arrow::datatypes::{DataType, SchemaRef};
use arrow::datatypes::{DataType, Field, SchemaRef};
use datafusion_common::{
config::ConfigOptions, file_options::file_type::FileType, not_impl_err, DFSchema,
Result, TableReference,
@@ -180,6 +180,23 @@ pub trait ExprPlanner: Send + Sync {
fn plan_make_map(&self, args: Vec<Expr>) -> Result<PlannerResult<Vec<Expr>>> {
Ok(PlannerResult::Original(args))
}

/// Plans a compound identifier, e.g. `db.schema.table`, for non-empty nested names
///
/// Note:
/// Compound identifiers that refer to the outer query schema are currently not supported.
///
/// Returns the planned expression
fn plan_compound_identifier(
&self,
_field: &Field,
_qualifier: Option<&TableReference>,
_nested_names: &[String],
) -> Result<PlannerResult<Vec<Expr>>> {
not_impl_err!(
"Default planner compound identifier hasn't been implemented for ExprPlanner"
)
}
}

/// An operator with two arguments to plan
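As a minimal sketch of the new extension point (not part of this PR), a downstream crate could now override plan_compound_identifier itself. RejectNestedPlanner and its behaviour are illustrative assumptions; only the trait signature comes from the diff above, and the trait's other hooks are assumed to keep their default implementations.

use arrow::datatypes::Field;
use datafusion_common::{plan_err, Result, TableReference};
use datafusion_expr::planner::{ExprPlanner, PlannerResult};
use datafusion_expr::Expr;

/// Hypothetical planner that refuses nested field access instead of
/// rewriting it, e.g. `t.struct_col.a` becomes a planning error.
#[derive(Default)]
struct RejectNestedPlanner;

impl ExprPlanner for RejectNestedPlanner {
    fn plan_compound_identifier(
        &self,
        field: &Field,
        _qualifier: Option<&TableReference>,
        nested_names: &[String],
    ) -> Result<PlannerResult<Vec<Expr>>> {
        // Instead of delegating to get_field, reject the nested access outright.
        plan_err!(
            "nested access {:?} on column {} is disabled by this planner",
            nested_names,
            field.name()
        )
    }
}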
1 change: 0 additions & 1 deletion datafusion/functions/src/core/mod.rs
@@ -100,7 +100,6 @@ pub fn functions() -> Vec<Arc<ScalarUDF>> {
nvl2(),
arrow_typeof(),
named_struct(),
get_field(),
coalesce(),
map(),
]
27 changes: 25 additions & 2 deletions datafusion/functions/src/core/planner.rs
@@ -15,11 +15,12 @@
// specific language governing permissions and limitations
// under the License.

use datafusion_common::DFSchema;
use arrow::datatypes::Field;
use datafusion_common::Result;
use datafusion_common::{not_impl_err, Column, DFSchema, ScalarValue, TableReference};
use datafusion_expr::expr::ScalarFunction;
use datafusion_expr::planner::{ExprPlanner, PlannerResult, RawDictionaryExpr};
use datafusion_expr::Expr;
use datafusion_expr::{lit, Expr};

use super::named_struct;

@@ -62,4 +63,26 @@ impl ExprPlanner for CoreFunctionPlanner {
ScalarFunction::new_udf(crate::string::overlay(), args),
)))
}

fn plan_compound_identifier(
&self,
field: &Field,
qualifier: Option<&TableReference>,
nested_names: &[String],
) -> Result<PlannerResult<Vec<Expr>>> {
// TODO: remove when multiple nested identifiers are supported
if nested_names.len() > 1 {
return not_impl_err!(
"Nested identifiers not yet supported for column {}",
Column::from((qualifier, field)).quoted_flat_name()
);
}
let nested_name = nested_names[0].to_string();

let col = Expr::Column(Column::from((qualifier, field)));
let get_field_args = vec![col, lit(ScalarValue::from(nested_name))];
Ok(PlannerResult::Planned(Expr::ScalarFunction(
ScalarFunction::new_udf(crate::core::get_field(), get_field_args),
)))
}
}
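For a rough usage sketch (not part of the PR), the rewrite above could be exercised directly as follows; the struct_col schema is an assumption invented for the example.

use arrow::datatypes::{DataType, Field, Fields};
use datafusion_common::{Result, TableReference};
use datafusion_expr::planner::{ExprPlanner, PlannerResult};
use datafusion_functions::core::planner::CoreFunctionPlanner;

fn plan_nested_access() -> Result<()> {
    // Hypothetical struct column `test.struct_col` with a single field `a`.
    let inner = Fields::from(vec![Field::new("a", DataType::Int32, true)]);
    let field = Field::new("struct_col", DataType::Struct(inner), true);
    let qualifier = TableReference::bare("test");

    // Plan `test.struct_col.a`: `a` is the remaining nested name.
    let planner = CoreFunctionPlanner::default();
    let planned = planner.plan_compound_identifier(
        &field,
        Some(&qualifier),
        &["a".to_string()],
    )?;

    // CoreFunctionPlanner rewrites the access to `get_field(test.struct_col, 'a')`.
    if let PlannerResult::Planned(expr) = planned {
        println!("{expr}");
    }
    Ok(())
}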