apache · Ngone51 · May 11, 2020 · May 11, 2020 · May 11, 2020 · May 11, 2020
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala
@@ -129,7 +129,7 @@ case class HashAggregateExec(
             resultExpressions,
             (expressions, inputSchema) =>
               MutableProjection.create(expressions, inputSchema),
-            child.output,
+            inputAttributes,
             iter,
             testFallbackStartsAt,
             numOutputRows,
@@ -331,10 +331,30 @@ case class HashAggregateExec(
     }
   }
 
+  private def inputAttributes: Seq[Attribute] = {
+    if (modes.contains(Final) || modes.contains(PartialMerge)) {
+      // SPARK-31620: when planning aggregates, the partial aggregate uses aggregate function's
+      // `inputAggBufferAttributes` as its output. And Final and PartialMerge aggregate rely on the
+      // output to bind references for `DeclarativeAggregate.mergeExpressions`. But if we copy the
+      // aggregate function somehow after aggregate planning, like `PlanSubqueries`, the
+      // `DeclarativeAggregate` will be replaced by a new instance with new
+      // `inputAggBufferAttributes` and `mergeExpressions`. Then Final and PartialMerge aggregate
+      // can't bind the `mergeExpressions` with the output of the partial aggregate, as they use
+      // the `inputAggBufferAttributes` of the original `DeclarativeAggregate` before copy. Instead,
+      // we shall use `inputAggBufferAttributes` after copy to match the new `mergeExpressions`.
+      val aggAttrs = aggregateExpressions
+        .filter(a => a.mode == Final || !a.isDistinct).map(_.aggregateFunction)
+        .flatMap(_.inputAggBufferAttributes)
+      child.output.dropRight(aggAttrs.length) ++ aggAttrs
+    } else {
+      child.output
+    }
+  }
+
   private def doConsumeWithoutKeys(ctx: CodegenContext, input: Seq[ExprCode]): String = {
     // only have DeclarativeAggregate
     val functions = aggregateExpressions.map(_.aggregateFunction.asInstanceOf[DeclarativeAggregate])
-    val inputAttrs = functions.flatMap(_.aggBufferAttributes) ++ child.output
+    val inputAttrs = functions.flatMap(_.aggBufferAttributes) ++ inputAttributes
     // To individually generate code for each aggregate function, an element in `updateExprs` holds
     // all the expressions for the buffer of an aggregation function.
     val updateExprs = aggregateExpressions.map { e =>
@@ -931,7 +951,7 @@ case class HashAggregateExec(
       }
     }
 
-    val inputAttr = aggregateBufferAttributes ++ child.output
+    val inputAttr = aggregateBufferAttributes ++ inputAttributes
     // Here we set `currentVars(0)` to `currentVars(numBufferSlots)` to null, so that when
     // generating code for buffer columns, we use `INPUT_ROW`(will be the buffer row), while
     // generating input columns, we use `currentVars`.

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
@@ -973,4 +973,21 @@ class DataFrameAggregateSuite extends QueryTest
       assert(error.message.contains("function count_if requires boolean type"))
     }
   }
+
+  Seq(true, false).foreach { value =>
+    test(s"SPARK-31620: agg with subquery (whole-stage-codegen = $value)") {
+      withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> value.toString) {
+        withTempView("t1", "t2") {
+          sql("create temporary view t1 as select * from values (1, 2) as t1(a, b)")
+          sql("create temporary view t2 as select * from values (3, 4) as t2(c, d)")
+          // test without grouping keys
+          checkAnswer(sql("select sum(if(c > (select a from t1), d, 0)) as csum from t2"),
+            Row(4) :: Nil)
+          // test with grouping keys
+          checkAnswer(sql("select c, sum(if(c > (select a from t1), d, 0)) as csum from " +
+            "t2 group by c"), Row(3, 4) :: Nil)
+        }
+      }
+    }
+  }
 }