
Commit 338efee

Ngone51 authored and cloud-fan committed
[SPARK-32031][SQL] Fix the wrong references of the PartialMerge/Final AggregateExpression
### What changes were proposed in this pull request?

This PR changes the references of a `PartialMerge`/`Final` `AggregateExpression` from `aggBufferAttributes` to `inputAggBufferAttributes`. After this change, the tests of `SPARK-31620` can fail on the assertion `QueryTest.assertEmptyMissingInput`, so this PR also fixes that by overriding `producedAttributes` on the Aggregate operators to account for their `inputAggBufferAttributes`.

### Why are the changes needed?

With my understanding of the Aggregate framework, and especially of the planning logic in `AggUtils.planAggXXX`, the right references for a `PartialMerge`/`Final` `AggregateExpression` are the `inputAggBufferAttributes`, i.e. the buffer attributes produced by the child operator.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Before this patch, an Aggregate operator's input attributes were always equal to or a superset of its referenced attributes, because it referred to its own buffer attributes instead of the attributes coming from its child. Its missing inputs were therefore always empty and nothing broke, which made a dedicated UT for this patch impossible. After correcting the references, the problem is exposed by `QueryTest.assertEmptyMissingInput` in the UT for SPARK-31620, since missing inputs are no longer guaranteed to be empty; this PR fixes that failure as well.

Closes #28869 from Ngone51/fix-agg-reference.

Authored-by: yi.wu <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
Parent: 6293c38 · Commit: 338efee

5 files changed: +25 −35 lines
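To make the two-phase flow described in the commit message concrete, here is a minimal, self-contained Scala sketch. All names are illustrative stand-ins, not Spark's actual API: the Partial stage reads the raw input column, while the Final stage reads only the buffers the Partial stage emitted — the role played by `inputAggBufferAttributes` in the real planner.

```scala
// A minimal, self-contained model of two-phase aggregation (illustrative only).
object TwoPhaseAggSketch {
  // Partial stage: reads the raw input column of one partition and emits a
  // partial aggregation buffer (here, just a running sum).
  def partialSum(partition: Seq[Int]): Long =
    partition.foldLeft(0L)(_ + _)

  // Final stage: reads only the buffers produced by the partial stage -- the
  // analogue of `inputAggBufferAttributes` -- never the raw input column.
  def finalSum(partialBuffers: Seq[Long]): Long =
    partialBuffers.sum

  def main(args: Array[String]): Unit = {
    val partitions = Seq(Seq(1, 2, 3), Seq(4, 5))
    val buffers = partitions.map(partialSum) // Seq(6L, 9L)
    assert(finalSum(buffers) == 15L)
    println(s"total = ${finalSum(buffers)}")
  }
}
```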

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala

Lines changed: 1 addition & 1 deletion
```diff
@@ -140,7 +140,7 @@ case class AggregateExpression(
   override lazy val references: AttributeSet = {
     val aggAttributes = mode match {
       case Partial | Complete => aggregateFunction.references
-      case PartialMerge | Final => AttributeSet(aggregateFunction.aggBufferAttributes)
+      case PartialMerge | Final => AttributeSet(aggregateFunction.inputAggBufferAttributes)
     }
     aggAttributes ++ filterAttributes
   }
```
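The mode dispatch above can be modeled with a small self-contained sketch; every type and attribute name below is a made-up stand-in, not the real Catalyst API:

```scala
// A simplified model of the fixed `references` logic: which attribute set an
// aggregate expression refers to, depending on its mode.
object AggReferencesSketch {
  sealed trait AggregateMode
  case object Partial extends AggregateMode
  case object PartialMerge extends AggregateMode
  case object Final extends AggregateMode
  case object Complete extends AggregateMode

  // Stand-in for an aggregate function such as `sum(x)`; attribute names are
  // invented for illustration.
  case class AggFunction(
      inputRefs: Set[String],                // columns the function reads, e.g. "x"
      aggBufferAttributes: Set[String],      // this operator's own buffer slots
      inputAggBufferAttributes: Set[String]) // buffer slots produced by the child

  def references(mode: AggregateMode, f: AggFunction): Set[String] = mode match {
    case Partial | Complete   => f.inputRefs
    case PartialMerge | Final => f.inputAggBufferAttributes // was aggBufferAttributes
  }

  def main(args: Array[String]): Unit = {
    val sum = AggFunction(Set("x"), Set("sum#1"), Set("sum#2"))
    assert(references(Partial, sum) == Set("x"))
    // A Final aggregate now refers to the child's buffer column, not its own.
    assert(references(Final, sum) == Set("sum#2"))
  }
}
```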

sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/BaseAggregateExec.scala

Lines changed: 24 additions & 7 deletions
```diff
@@ -17,7 +17,7 @@
 
 package org.apache.spark.sql.execution.aggregate
 
-import org.apache.spark.sql.catalyst.expressions.{Attribute, NamedExpression}
+import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, AttributeSet, NamedExpression}
 import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Final, PartialMerge}
 import org.apache.spark.sql.execution.{ExplainUtils, UnaryExecNode}
 
@@ -53,15 +53,32 @@ trait BaseAggregateExec extends UnaryExecNode {
       // can't bind the `mergeExpressions` with the output of the partial aggregate, as they use
       // the `inputAggBufferAttributes` of the original `DeclarativeAggregate` before copy. Instead,
       // we shall use `inputAggBufferAttributes` after copy to match the new `mergeExpressions`.
-      val aggAttrs = aggregateExpressions
-        // there're exactly four cases needs `inputAggBufferAttributes` from child according to the
-        // agg planning in `AggUtils`: Partial -> Final, PartialMerge -> Final,
-        // Partial -> PartialMerge, PartialMerge -> PartialMerge.
-        .filter(a => a.mode == Final || a.mode == PartialMerge).map(_.aggregateFunction)
-        .flatMap(_.inputAggBufferAttributes)
+      val aggAttrs = inputAggBufferAttributes
       child.output.dropRight(aggAttrs.length) ++ aggAttrs
     } else {
       child.output
     }
   }
+
+  private val inputAggBufferAttributes: Seq[Attribute] = {
+    aggregateExpressions
+      // there're exactly four cases needs `inputAggBufferAttributes` from child according to the
+      // agg planning in `AggUtils`: Partial -> Final, PartialMerge -> Final,
+      // Partial -> PartialMerge, PartialMerge -> PartialMerge.
+      .filter(a => a.mode == Final || a.mode == PartialMerge)
+      .flatMap(_.aggregateFunction.inputAggBufferAttributes)
+  }
+
+  protected val aggregateBufferAttributes: Seq[AttributeReference] = {
+    aggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes)
+  }
+
+  override def producedAttributes: AttributeSet =
+    AttributeSet(aggregateAttributes) ++
+      AttributeSet(resultExpressions.diff(groupingExpressions).map(_.toAttribute)) ++
+      AttributeSet(aggregateBufferAttributes) ++
+      // it's not empty when the inputAggBufferAttributes is not equal to the aggregate buffer
+      // attributes of the child Aggregate, when the child Aggregate contains the subquery in
+      // AggregateFunction. See SPARK-31620 for more details.
+      AttributeSet(inputAggBufferAttributes.filterNot(child.output.contains))
 }
```
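For the `producedAttributes` override, a toy model of the invariant that `QueryTest.assertEmptyMissingInput` enforces may help: whatever an operator references must either come from its child's output or be produced by the operator itself. All names below are hypothetical; this mirrors the idea, not Spark's classes.

```scala
// A toy model of the invariant behind `QueryTest.assertEmptyMissingInput`.
object MissingInputSketch {
  case class Operator(
      references: Set[String],         // attributes the operator reads
      childOutput: Set[String],        // attributes supplied by the child
      producedAttributes: Set[String]) // attributes the operator itself produces

  // Analogue of Catalyst's `missingInput`: references that nobody provides.
  def missingInput(op: Operator): Set[String] =
    op.references -- op.childOutput -- op.producedAttributes

  def main(args: Array[String]): Unit = {
    // The SPARK-31620 shape: after planning, the Final aggregate's own copy of
    // the buffer attribute ("sum#3") differs from the child's ("sum#2"), so the
    // override above must claim it via producedAttributes.
    val finalAgg = Operator(
      references = Set("sum#3"),
      childOutput = Set("sum#2"),
      producedAttributes = Set("sum(x)#4", "sum#1", "sum#3"))
    assert(missingInput(finalAgg).isEmpty)
  }
}
```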

sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala

Lines changed: 0 additions & 9 deletions
```diff
@@ -57,10 +57,6 @@ case class HashAggregateExec(
   with BlockingOperatorWithCodegen
   with AliasAwareOutputPartitioning {
 
-  private[this] val aggregateBufferAttributes = {
-    aggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes)
-  }
-
   require(HashAggregateExec.supportsAggregate(aggregateBufferAttributes))
 
   override lazy val allAttributes: AttributeSeq =
@@ -79,11 +75,6 @@ case class HashAggregateExec(
 
   override protected def outputExpressions: Seq[NamedExpression] = resultExpressions
 
-  override def producedAttributes: AttributeSet =
-    AttributeSet(aggregateAttributes) ++
-    AttributeSet(resultExpressions.diff(groupingExpressions).map(_.toAttribute)) ++
-    AttributeSet(aggregateBufferAttributes)
-
   override def requiredChildDistribution: List[Distribution] = {
     requiredChildDistributionExpressions match {
       case Some(exprs) if exprs.isEmpty => AllTuples :: Nil
```

sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ObjectHashAggregateExec.scala

Lines changed: 0 additions & 9 deletions
```diff
@@ -69,10 +69,6 @@ case class ObjectHashAggregateExec(
     child: SparkPlan)
   extends BaseAggregateExec with AliasAwareOutputPartitioning {
 
-  private[this] val aggregateBufferAttributes = {
-    aggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes)
-  }
-
   override lazy val allAttributes: AttributeSeq =
     child.output ++ aggregateBufferAttributes ++ aggregateAttributes ++
       aggregateExpressions.flatMap(_.aggregateFunction.inputAggBufferAttributes)
@@ -84,11 +80,6 @@ case class ObjectHashAggregateExec(
 
   override def output: Seq[Attribute] = resultExpressions.map(_.toAttribute)
 
-  override def producedAttributes: AttributeSet =
-    AttributeSet(aggregateAttributes) ++
-    AttributeSet(resultExpressions.diff(groupingExpressions).map(_.toAttribute)) ++
-    AttributeSet(aggregateBufferAttributes)
-
   override def requiredChildDistribution: List[Distribution] = {
     requiredChildDistributionExpressions match {
       case Some(exprs) if exprs.isEmpty => AllTuples :: Nil
```

sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortAggregateExec.scala

Lines changed: 0 additions & 9 deletions
```diff
@@ -42,15 +42,6 @@ case class SortAggregateExec(
   with AliasAwareOutputPartitioning
   with AliasAwareOutputOrdering {
 
-  private[this] val aggregateBufferAttributes = {
-    aggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes)
-  }
-
-  override def producedAttributes: AttributeSet =
-    AttributeSet(aggregateAttributes) ++
-    AttributeSet(resultExpressions.diff(groupingExpressions).map(_.toAttribute)) ++
-    AttributeSet(aggregateBufferAttributes)
-
   override lazy val metrics = Map(
     "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"))
```
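The deletions in `HashAggregateExec`, `ObjectHashAggregateExec`, and `SortAggregateExec` are one and the same refactoring: `aggregateBufferAttributes` and `producedAttributes` now live once in `BaseAggregateExec` instead of being duplicated in each operator. A generic sketch of the pattern, with illustrative names only:

```scala
// Sketch of the refactoring pattern applied here: hoist members that were
// copy-pasted across concrete operators into a shared base trait.
trait BaseExecSketch {
  def inputs: Seq[String]
  // Defined once here instead of in every concrete operator.
  val bufferNames: Seq[String] = inputs.map(_ + "#buf")
}

case class HashExecSketch(inputs: Seq[String]) extends BaseExecSketch
case class SortExecSketch(inputs: Seq[String]) extends BaseExecSketch

object TraitDedupDemo {
  def main(args: Array[String]): Unit = {
    // Both operators share the single definition from the trait.
    assert(HashExecSketch(Seq("x")).bufferNames == Seq("x#buf"))
    assert(SortExecSketch(Seq("y")).bufferNames == Seq("y#buf"))
  }
}
```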
