apache · alamb · Feb 26, 2025 · Feb 22, 2025 · Feb 22, 2025 · Feb 22, 2025
diff --git a/datafusion/expr-common/src/type_coercion/aggregates.rs b/datafusion/expr-common/src/type_coercion/aggregates.rs
@@ -129,6 +129,11 @@ pub fn check_arg_count(
                 );
             }
         }
+        TypeSignature::Nullary => {
+            if !input_types.is_empty() {
+                return plan_err!("The function {func_name} expects no arguments");
+            }
+        }
         TypeSignature::UserDefined
         | TypeSignature::Numeric(_)
         | TypeSignature::Coercible(_) => {

diff --git a/datafusion/expr/src/udaf.rs b/datafusion/expr/src/udaf.rs
@@ -100,6 +100,7 @@ impl fmt::Display for AggregateUDF {
 }
 
 /// Arguments passed to [`AggregateUDFImpl::value_from_stats`]
+#[derive(Debug)]
 pub struct StatisticsArgs<'a> {
     /// The statistics of the aggregate input
     pub statistics: &'a Statistics,

diff --git a/datafusion/functions-aggregate/src/count.rs b/datafusion/functions-aggregate/src/count.rs
@@ -148,6 +148,15 @@ impl AggregateUDFImpl for Count {
         "count"
     }
 
+    // In AggregateFunctionPlanner, wildcard is converted to count() (no args)
+    //
+    // count() -> count()
+    // count(*) -> count()
+    // count(1) -> count(1)
+    // count(2) -> count(2)
+    //
+    // count() (empty args) is named as count(*) in schema_name
+    // constant arguments keep their own name, e.g. count(Int64(1))
     fn schema_name(&self, params: &AggregateFunctionParams) -> Result<String> {
         let AggregateFunctionParams {
             args,
@@ -511,6 +520,11 @@ impl AggregateUDFImpl for Count {
             return None;
         }
         if let Precision::Exact(num_rows) = statistics_args.statistics.num_rows {
+            // handle count()
+            if statistics_args.exprs.is_empty() {
+                return Some(ScalarValue::Int64(Some(num_rows as i64)));
+            }
+
             if statistics_args.exprs.len() == 1 {
                 // TODO optimize with exprs other than Column
                 if let Some(col_expr) = statistics_args.exprs[0]
@@ -550,8 +564,6 @@ impl AggregateUDFImpl for Count {
 fn is_count_wildcard(args: &[Expr]) -> bool {
     match args {
         [] => true, // count()
-        // All const should be coerced to int64 or rejected by the signature
-        [Expr::Literal(ScalarValue::Int64(Some(_)))] => true, // count(1)
         _ => false, // More than one argument or non-matching cases
     }
 }

diff --git a/datafusion/functions-aggregate/src/planner.rs b/datafusion/functions-aggregate/src/planner.rs
@@ -20,9 +20,7 @@
 use datafusion_common::Result;
 use datafusion_expr::{
     expr::AggregateFunction,
-    lit,
     planner::{ExprPlanner, PlannerResult, RawAggregateExpr},
-    utils::COUNT_STAR_EXPANSION,
     Expr,
 };
 
@@ -49,7 +47,7 @@ impl ExprPlanner for AggregateFunctionPlanner {
             return Ok(PlannerResult::Planned(Expr::AggregateFunction(
                 AggregateFunction::new_udf(
                     func,
-                    vec![lit(COUNT_STAR_EXPANSION)],
+                    vec![],
                     distinct,
                     filter,
                     order_by,

diff --git a/datafusion/physical-expr/src/aggregate.rs b/datafusion/physical-expr/src/aggregate.rs
@@ -102,7 +102,8 @@ impl AggregateExprBuilder {
             is_distinct,
             is_reversed,
         } = self;
-        if args.is_empty() {
+        // only count function can have empty args
+        if args.is_empty() && fun.name() != "count" {
             return internal_err!("args should not be empty");
         }
 

diff --git a/datafusion/physical-plan/src/aggregates/mod.rs b/datafusion/physical-plan/src/aggregates/mod.rs
@@ -34,7 +34,9 @@ use crate::{
     SendableRecordBatchStream, Statistics,
 };
 
-use arrow::array::{ArrayRef, UInt16Array, UInt32Array, UInt64Array, UInt8Array};
+use arrow::array::{
+    ArrayRef, Int64Array, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
+};
 use arrow::datatypes::{Field, Schema, SchemaRef};
 use arrow::record_batch::RecordBatch;
 use datafusion_common::stats::Precision;
@@ -1231,6 +1233,13 @@ fn evaluate(
     expr: &[Arc<dyn PhysicalExpr>],
     batch: &RecordBatch,
 ) -> Result<Vec<ArrayRef>> {
+    // handle count() case
+    if expr.is_empty() {
+        return Ok(vec![
+            Arc::new(Int64Array::from(vec![1; batch.num_rows()])) as ArrayRef
+        ]);
+    }
+
     expr.iter()
         .map(|expr| {
             expr.evaluate(batch)

diff --git a/datafusion/physical-plan/src/aggregates/no_grouping.rs b/datafusion/physical-plan/src/aggregates/no_grouping.rs
@@ -23,6 +23,7 @@ use crate::aggregates::{
 };
 use crate::metrics::{BaselineMetrics, RecordOutput};
 use crate::{RecordBatchStream, SendableRecordBatchStream};
+use arrow::array::{ArrayRef, Int64Array};
 use arrow::datatypes::SchemaRef;
 use arrow::record_batch::RecordBatch;
 use datafusion_common::Result;
@@ -219,23 +220,26 @@ fn aggregate_batch(
                 None => Cow::Borrowed(&batch),
             };
 
+            let n_rows = batch.num_rows();
+
             // 1.3
-            let values = &expr
-                .iter()
-                .map(|e| {
-                    e.evaluate(&batch)
-                        .and_then(|v| v.into_array(batch.num_rows()))
-                })
-                .collect::<Result<Vec<_>>>()?;
+            // Handle count(*) case
+            let values = if expr.is_empty() {
+                vec![Arc::new(Int64Array::from(vec![1; n_rows])) as ArrayRef]
+            } else {
+                expr.iter()
+                    .map(|e| e.evaluate(&batch).and_then(|v| v.into_array(n_rows)))
+                    .collect::<Result<Vec<_>>>()?
+            };
 
             // 1.4
             let size_pre = accum.size();
             let res = match mode {
                 AggregateMode::Partial
                 | AggregateMode::Single
-                | AggregateMode::SinglePartitioned => accum.update_batch(values),
+                | AggregateMode::SinglePartitioned => accum.update_batch(&values),
                 AggregateMode::Final | AggregateMode::FinalPartitioned => {
-                    accum.merge_batch(values)
+                    accum.merge_batch(&values)
                 }
             };
             let size_post = accum.size();

diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt
@@ -6276,6 +6276,9 @@ physical_plan
 05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
 06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c5], file_type=csv, has_header=true
 
+statement count 0
+drop table aggregate_test_100;
+
 # test count(null) case (null with type)
 
 statement count 0
@@ -6296,6 +6299,54 @@ physical_plan
 01)AggregateExec: mode=Single, gby=[], aggr=[count(NULL)]
 02)--DataSourceExec: partitions=1, partition_sizes=[1]
 
+statement count 0
+drop table t;
+
+# test duplicated schema name issue
+
+statement count 0
+create table t (a int) as values (1), (2);
+
+query I
+select count() from t;
+----
+2
+
+query I
+select count(1) * count(2) from t;
+----
+4
+
+query I
+select count(1) * count(*) from t;
+----
+4
+
+query I
+select count(*) * count(*) from t;
+----
+4
+
+query I
+select count(1) * count(1) from t;
+----
+4
+
+query TT
+explain select count(1) * count(2) from t;
+----
+logical_plan
+01)Projection: count(Int64(1)) * count(Int64(2))
+02)--Aggregate: groupBy=[[]], aggr=[[count(Int64(1)), count(Int64(2))]]
+03)----TableScan: t projection=[]
+physical_plan
+01)ProjectionExec: expr=[count(Int64(1))@0 * count(Int64(2))@1 as count(Int64(1)) * count(Int64(2))]
+02)--AggregateExec: mode=Single, gby=[], aggr=[count(Int64(1)), count(Int64(2))]
+03)----DataSourceExec: partitions=1, partition_sizes=[1]
+
+statement count 0
+drop table t;
+
 #######
 # Group median test
 #######

diff --git a/datafusion/sqllogictest/test_files/count_star_rule.slt b/datafusion/sqllogictest/test_files/count_star_rule.slt
@@ -80,12 +80,12 @@ query TT
 EXPLAIN SELECT a, COUNT() OVER (PARTITION BY a) AS count_a FROM t1;
 ----
 logical_plan
-01)Projection: t1.a, count(*) PARTITION BY [t1.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS count_a
-02)--WindowAggr: windowExpr=[[count(*) PARTITION BY [t1.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
+01)Projection: t1.a, count(Int64(1)) PARTITION BY [t1.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS count_a
+02)--WindowAggr: windowExpr=[[count(Int64(1)) PARTITION BY [t1.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
 03)----TableScan: t1 projection=[a]
 physical_plan
-01)ProjectionExec: expr=[a@0 as a, count(*) PARTITION BY [t1.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@1 as count_a]
-02)--WindowAggExec: wdw=[count(*) PARTITION BY [t1.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "count(*) PARTITION BY [t1.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+01)ProjectionExec: expr=[a@0 as a, count(Int64(1)) PARTITION BY [t1.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@1 as count_a]
+02)--WindowAggExec: wdw=[count(Int64(1)) PARTITION BY [t1.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "count(Int64(1)) PARTITION BY [t1.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
 03)----SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[false]
 04)------DataSourceExec: partitions=1, partition_sizes=[1]