apache · jayzhan211 · Jun 3, 2024 · May 5, 2024 · May 5, 2024 · May 5, 2024
diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock
diff --git a/datafusion-examples/examples/advanced_udaf.rs b/datafusion-examples/examples/advanced_udaf.rs
@@ -105,7 +105,10 @@ impl AggregateUDFImpl for GeoMeanUdaf {
         true
     }
 
-    fn create_groups_accumulator(&self) -> Result<Box<dyn GroupsAccumulator>> {
+    fn create_groups_accumulator(
+        &self,
+        _args: AccumulatorArgs,
+    ) -> Result<Box<dyn GroupsAccumulator>> {
         Ok(Box::new(GeometricMeanGroupsAccumulator::new()))
     }
 }

diff --git a/datafusion-examples/examples/simplify_udaf_expression.rs b/datafusion-examples/examples/simplify_udaf_expression.rs
@@ -78,9 +78,13 @@ impl AggregateUDFImpl for BetterAvgUdaf {
         true
     }
 
-    fn create_groups_accumulator(&self) -> Result<Box<dyn GroupsAccumulator>> {
+    fn create_groups_accumulator(
+        &self,
+        _args: AccumulatorArgs,
+    ) -> Result<Box<dyn GroupsAccumulator>> {
         unimplemented!("should not get here");
     }
+
     // we override method, to return new expression which would substitute
     // user defined function call
     fn simplify(&self) -> Option<AggregateFunctionSimplification> {

diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs
@@ -53,8 +53,9 @@ use datafusion_expr::{
     avg, count, max, min, stddev, utils::COUNT_STAR_EXPANSION,
     TableProviderFilterPushDown, UNNAMED_TABLE,
 };
-use datafusion_expr::{case, is_null, sum};
+use datafusion_expr::{case, is_null};
 use datafusion_functions_aggregate::expr_fn::median;
+use datafusion_functions_aggregate::expr_fn::sum;
 
 use async_trait::async_trait;
 
@@ -1593,9 +1594,8 @@ mod tests {
     use datafusion_common::{Constraint, Constraints};
     use datafusion_common_runtime::SpawnedTask;
     use datafusion_expr::{
-        array_agg, cast, count_distinct, create_udf, expr, lit, sum,
-        BuiltInWindowFunction, ScalarFunctionImplementation, Volatility, WindowFrame,
-        WindowFunctionDefinition,
+        array_agg, cast, count_distinct, create_udf, expr, lit, BuiltInWindowFunction,
+        ScalarFunctionImplementation, Volatility, WindowFrame, WindowFunctionDefinition,
     };
     use datafusion_physical_expr::expressions::Column;
     use datafusion_physical_plan::{get_plan_string, ExecutionPlanProperties};

diff --git a/datafusion/core/src/datasource/listing/helpers.rs b/datafusion/core/src/datasource/listing/helpers.rs
@@ -786,7 +786,7 @@ mod tests {
         assert_eq!(
             evaluate_partition_prefix(
                 partitions,
-                &[col("a").eq(lit("foo")).and((col("b").eq(lit("bar"))))],
+                &[col("a").eq(lit("foo")).and(col("b").eq(lit("bar")))],
             ),
             Some(Path::from("a=foo/b=bar")),
         );

diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs
@@ -2257,9 +2257,8 @@ mod tests {
     use datafusion_common::{assert_contains, DFSchemaRef, TableReference};
     use datafusion_execution::runtime_env::RuntimeEnv;
     use datafusion_execution::TaskContext;
-    use datafusion_expr::{
-        col, lit, sum, LogicalPlanBuilder, UserDefinedLogicalNodeCore,
-    };
+    use datafusion_expr::{col, lit, LogicalPlanBuilder, UserDefinedLogicalNodeCore};
+    use datafusion_functions_aggregate::expr_fn::sum;
     use datafusion_physical_expr::EquivalenceProperties;
 
     fn make_session_state() -> SessionState {

diff --git a/datafusion/core/src/prelude.rs b/datafusion/core/src/prelude.rs
@@ -39,7 +39,6 @@ pub use datafusion_expr::{
     Expr,
 };
 pub use datafusion_functions::expr_fn::*;
-pub use datafusion_functions_aggregate::expr_fn::*;
 #[cfg(feature = "array_expressions")]
 pub use datafusion_functions_array::expr_fn::*;
 

diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs
@@ -52,10 +52,10 @@ use datafusion_expr::expr::{GroupingSet, Sort};
 use datafusion_expr::var_provider::{VarProvider, VarType};
 use datafusion_expr::{
     array_agg, avg, cast, col, count, exists, expr, in_subquery, lit, max, out_ref_col,
-    placeholder, scalar_subquery, sum, when, wildcard, AggregateFunction, Expr,
-    ExprSchemable, WindowFrame, WindowFrameBound, WindowFrameUnits,
-    WindowFunctionDefinition,
+    placeholder, scalar_subquery, when, wildcard, AggregateFunction, Expr, ExprSchemable,
+    WindowFrame, WindowFrameBound, WindowFrameUnits, WindowFunctionDefinition,
 };
+use datafusion_functions_aggregate::expr_fn::sum;
 
 #[tokio::test]
 async fn test_count_wildcard_on_sort() -> Result<()> {

diff --git a/datafusion/core/tests/fuzz_cases/window_fuzz.rs b/datafusion/core/tests/fuzz_cases/window_fuzz.rs
@@ -33,10 +33,12 @@ use datafusion::prelude::{SessionConfig, SessionContext};
 use datafusion_common::{Result, ScalarValue};
 use datafusion_common_runtime::SpawnedTask;
 use datafusion_expr::type_coercion::aggregates::coerce_types;
+use datafusion_expr::type_coercion::functions::data_types_with_aggregate_udf;
 use datafusion_expr::{
     AggregateFunction, BuiltInWindowFunction, WindowFrame, WindowFrameBound,
     WindowFrameUnits, WindowFunctionDefinition,
 };
+use datafusion_functions_aggregate::sum::sum_udaf;
 use datafusion_physical_expr::expressions::{cast, col, lit};
 use datafusion_physical_expr::{PhysicalExpr, PhysicalSortExpr};
 use test_utils::add_empty_batches;
@@ -341,7 +343,7 @@ fn get_random_function(
     window_fn_map.insert(
         "sum",
         (
-            WindowFunctionDefinition::AggregateFunction(AggregateFunction::Sum),
+            WindowFunctionDefinition::AggregateUDF(sum_udaf()),
             vec![arg.clone()],
         ),
     );
@@ -468,6 +470,14 @@ fn get_random_function(
             let coerced = coerce_types(f, &[dt], &sig).unwrap();
             args[0] = cast(a, schema, coerced[0].clone()).unwrap();
         }
+    } else if let WindowFunctionDefinition::AggregateUDF(udf) = window_fn {
+        if !args.is_empty() {
+            // Do type coercion first argument
+            let a = args[0].clone();
+            let dt = a.data_type(schema.as_ref()).unwrap();
+            let coerced = data_types_with_aggregate_udf(&[dt], udf).unwrap();
+            args[0] = cast(a, schema, coerced[0].clone()).unwrap();
+        }
     }
 
     (window_fn.clone(), args, fn_name.to_string())

diff --git a/datafusion/core/tests/user_defined/user_defined_aggregates.rs b/datafusion/core/tests/user_defined/user_defined_aggregates.rs
@@ -142,7 +142,7 @@ async fn test_udaf_as_window_with_frame_without_retract_batch() {
     let sql = "SELECT time_sum(time) OVER(ORDER BY time ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) as time_sum from t";
     // Note if this query ever does start working
     let err = execute(&ctx, sql).await.unwrap_err();
-    assert_contains!(err.to_string(), "This feature is not implemented: Aggregate can not be used as a sliding accumulator because `retract_batch` is not implemented: AggregateUDF { inner: AggregateUDF { name: \"time_sum\", signature: Signature { type_signature: Exact([Timestamp(Nanosecond, None)]), volatility: Immutable }, fun: \"<FUNC>\" } }(t.time) ORDER BY [t.time ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING");
+    assert_contains!(err.to_string(), "This feature is not implemented: Aggregate can not be used as a sliding accumulator because `retract_batch` is not implemented: time_sum(t.time) ORDER BY [t.time ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING");
 }
 
 /// Basic query for with a udaf returning a structure
@@ -729,7 +729,10 @@ impl AggregateUDFImpl for TestGroupsAccumulator {
         true
     }
 
-    fn create_groups_accumulator(&self) -> Result<Box<dyn GroupsAccumulator>> {
+    fn create_groups_accumulator(
+        &self,
+        _args: AccumulatorArgs,
+    ) -> Result<Box<dyn GroupsAccumulator>> {
         Ok(Box::new(self.clone()))
     }
 }

diff --git a/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs b/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs
@@ -378,8 +378,8 @@ async fn udaf_as_window_func() -> Result<()> {
     context.register_udaf(my_acc);
 
     let sql = "SELECT a, MY_ACC(b) OVER(PARTITION BY a) FROM my_table";
-    let expected = r#"Projection: my_table.a, AggregateUDF { inner: AggregateUDF { name: "my_acc", signature: Signature { type_signature: Exact([Int32]), volatility: Immutable }, fun: "<FUNC>" } }(my_table.b) PARTITION BY [my_table.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
-  WindowAggr: windowExpr=[[AggregateUDF { inner: AggregateUDF { name: "my_acc", signature: Signature { type_signature: Exact([Int32]), volatility: Immutable }, fun: "<FUNC>" } }(my_table.b) PARTITION BY [my_table.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
+    let expected = r#"Projection: my_table.a, my_acc(my_table.b) PARTITION BY [my_table.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
+  WindowAggr: windowExpr=[[my_acc(my_table.b) PARTITION BY [my_table.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
     TableScan: my_table"#;
 
     let dataframe = context.sql(sql).await.unwrap();

diff --git a/datafusion/expr/src/built_in_window_function.rs b/datafusion/expr/src/built_in_window_function.rs
@@ -82,8 +82,8 @@ impl BuiltInWindowFunction {
             Ntile => "NTILE",
             Lag => "LAG",
             Lead => "LEAD",
-            FirstValue => "FIRST_VALUE",
-            LastValue => "LAST_VALUE",
+            FirstValue => "first_value",
+            LastValue => "last_value",
             NthValue => "NTH_VALUE",
         }
     }

diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs
@@ -754,10 +754,14 @@ impl WindowFunctionDefinition {
 impl fmt::Display for WindowFunctionDefinition {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         match self {
-            WindowFunctionDefinition::AggregateFunction(fun) => fun.fmt(f),
-            WindowFunctionDefinition::BuiltInWindowFunction(fun) => fun.fmt(f),
-            WindowFunctionDefinition::AggregateUDF(fun) => std::fmt::Debug::fmt(fun, f),
-            WindowFunctionDefinition::WindowUDF(fun) => fun.fmt(f),
+            WindowFunctionDefinition::AggregateFunction(fun) => {
+                std::fmt::Display::fmt(fun, f)
+            }
+            WindowFunctionDefinition::BuiltInWindowFunction(fun) => {
+                std::fmt::Display::fmt(fun, f)
+            }
+            WindowFunctionDefinition::AggregateUDF(fun) => std::fmt::Display::fmt(fun, f),
+            WindowFunctionDefinition::WindowUDF(fun) => std::fmt::Display::fmt(fun, f),
         }
     }
 }
@@ -2263,7 +2267,11 @@ mod test {
             let fun = find_df_window_func(name).unwrap();
             let fun2 = find_df_window_func(name.to_uppercase().as_str()).unwrap();
             assert_eq!(fun, fun2);
-            assert_eq!(fun.to_string(), name.to_uppercase());
+            if fun.to_string() == "first_value" || fun.to_string() == "last_value" {
+                assert_eq!(fun.to_string(), name);
+            } else {
+                assert_eq!(fun.to_string(), name.to_uppercase());
+            }
         }
         Ok(())
     }

diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs
@@ -169,6 +169,8 @@ pub fn max(expr: Expr) -> Expr {
 }
 
 /// Create an expression to represent the sum() aggregate function
+///
+/// TODO: Remove this function and use `sum` from `datafusion_functions_aggregate::expr_fn` instead
 pub fn sum(expr: Expr) -> Expr {
     Expr::AggregateFunction(AggregateFunction::new(
         aggregate_function::AggregateFunction::Sum,

diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs
@@ -21,8 +21,10 @@ use crate::expr::{
     InSubquery, Placeholder, ScalarFunction, Sort, TryCast, Unnest, WindowFunction,
 };
 use crate::type_coercion::binary::get_result_type;
-use crate::type_coercion::functions::data_types_with_scalar_udf;
-use crate::{utils, LogicalPlan, Projection, Subquery};
+use crate::type_coercion::functions::{
+    data_types_with_aggregate_udf, data_types_with_scalar_udf,
+};
+use crate::{utils, LogicalPlan, Projection, Subquery, WindowFunctionDefinition};
 use arrow::compute::can_cast_types;
 use arrow::datatypes::{DataType, Field};
 use datafusion_common::{
@@ -158,7 +160,25 @@ impl ExprSchemable for Expr {
                     .iter()
                     .map(|e| e.get_type(schema))
                     .collect::<Result<Vec<_>>>()?;
-                fun.return_type(&data_types)
+                match fun {
+                    WindowFunctionDefinition::AggregateUDF(udf) => {
+                        let new_types = data_types_with_aggregate_udf(&data_types, udf).map_err(|err| {
+                            plan_datafusion_err!(
+                                "{} and {}",
+                                err,
+                                utils::generate_signature_error_msg(
+                                    fun.name(),
+                                    fun.signature().clone(),
+                                    &data_types
+                                )
+                            )
+                        })?;
+                        Ok(fun.return_type(&new_types)?)
+                    }
+                    _ => {
+                        fun.return_type(&data_types)
+                    }
+                }
             }
             Expr::AggregateFunction(AggregateFunction { func_def, args, .. }) => {
                 let data_types = args
@@ -170,7 +190,18 @@ impl ExprSchemable for Expr {
                         fun.return_type(&data_types)
                     }
                     AggregateFunctionDefinition::UDF(fun) => {
-                        Ok(fun.return_type(&data_types)?)
+                        let new_types = data_types_with_aggregate_udf(&data_types, fun).map_err(|err| {
+                            plan_datafusion_err!(
+                                "{} and {}",
+                                err,
+                                utils::generate_signature_error_msg(
+                                    fun.name(),
+                                    fun.signature().clone(),
+                                    &data_types
+                                )
+                            )
+                        })?;
+                        Ok(fun.return_type(&new_types)?)
                     }
                 }
             }

diff --git a/datafusion/expr/src/function.rs b/datafusion/expr/src/function.rs
@@ -70,6 +70,9 @@ pub struct AccumulatorArgs<'a> {
     /// If no `ORDER BY` is specified, `sort_exprs`` will be empty.
     pub sort_exprs: &'a [Expr],
 
+    /// The name of the aggregate expression
+    pub name: &'a str,
+
     /// Whether the aggregate function is distinct.
     ///
     /// ```sql
@@ -82,9 +85,6 @@ pub struct AccumulatorArgs<'a> {
 
     /// The number of arguments the aggregate function takes.
     pub args_num: usize,
-
-    /// The name of the expression
-    pub name: &'a str,
 }
 
 /// [`StateFieldsArgs`] contains information about the fields that an