Commit 58e07e0

tanelk authored and cloud-fan committed
[SPARK-32940][SQL] Collect, first and last should be deterministic aggregate functions
### What changes were proposed in this pull request?

Collect, first and last have mistakenly been marked as non-deterministic. They are actually deterministic iff their child expression is deterministic.

Collect, for example, was marked as non-deterministic in #14749, on the grounds that its output depends on the actual order of input rows. That is true, but it does not make these aggregators non-deterministic. The `EliminateSorts` optimizer rule has a method `isOrderIrrelevantAggs` that lists the aggregators whose results do not depend on input row order; collect, first and last are correctly absent from that list. An aggregator would only be non-deterministic if its output for a group depended on the previous groups it had aggregated - I can't think of a practical example of such an aggregator in Spark. An analogous case is sum over float or double columns: its result depends on the order of its inputs, yet it is deterministic. `max_by` and `min_by` are similar - deterministic functions that can return different results when the order of rows changes.

### Why are the changes needed?

The optimizer rule `PushPredicateThroughNonJoin` can work in more cases.

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

UT

Closes #29810 from tanelk/SPARK-32940.

Lead-authored-by: [email protected] <[email protected]>
Co-authored-by: Tanel Kiis <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
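The "deterministic iff their child expression is deterministic" behaviour falls out of Catalyst's defaults: `Expression` derives `deterministic` from its children, so deleting the hard-coded `deterministic = false` overrides (below) is the whole fix. A minimal self-contained sketch of that inheritance rule - not Spark's actual classes; `Expr`, `Attr`, `FirstAgg` and `RandExpr` are invented for illustration:

```scala
// Sketch of Catalyst's default: an expression is deterministic iff all of
// its children are. First/Last/Collect now inherit this instead of
// overriding it to false.
trait Expr {
  def children: Seq[Expr]
  lazy val deterministic: Boolean = children.forall(_.deterministic)
}

case class Attr(name: String) extends Expr { def children = Nil }
case class FirstAgg(child: Expr) extends Expr { def children = Seq(child) }
case class RandExpr(seed: Long) extends Expr {
  def children = Nil
  override lazy val deterministic: Boolean = false // truly non-deterministic
}

object DeterminismDemo extends App {
  println(FirstAgg(Attr("b")).deterministic)   // true: child is deterministic
  println(FirstAgg(RandExpr(0)).deterministic) // false: child is not
}
```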
1 parent 3f3201a commit 58e07e0

File tree

9 files changed: +58 -24 lines

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala

Lines changed: 5 additions & 0 deletions

```diff
@@ -218,6 +218,11 @@ package object dsl {
       BitOrAgg(e).toAggregateExpression(isDistinct = false, filter = filter)
     def bitXor(e: Expression, filter: Option[Expression] = None): Expression =
       BitXorAgg(e).toAggregateExpression(isDistinct = false, filter = filter)
+    def collectList(e: Expression, filter: Option[Expression] = None): Expression =
+      CollectList(e).toAggregateExpression(isDistinct = false, filter = filter)
+    def collectSet(e: Expression, filter: Option[Expression] = None): Expression =
+      CollectSet(e).toAggregateExpression(isDistinct = false, filter = filter)
+
     def upper(e: Expression): Expression = Upper(e)
     def lower(e: Expression): Expression = Lower(e)
     def coalesce(args: Expression*): Expression = Coalesce(args)
```
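For illustration, here is how the new DSL helpers might be used in an optimizer test. The relation is hypothetical; `LocalRelation` and the symbol-to-attribute implicits are the standard catalyst test ingredients:

```scala
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation

// A two-column relation, in the style of the catalyst test suites.
val testRelation = LocalRelation('a.int, 'b.int)

// collectList/collectSet wrap CollectList/CollectSet in an
// AggregateExpression, mirroring the existing bitAnd/bitOr/bitXor helpers.
val plan = testRelation
  .groupBy('a)(collectList('b).as('bs), collectSet('b).as('bset))
  .analyze
```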

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/First.scala

Lines changed: 0 additions & 3 deletions

```diff
@@ -62,9 +62,6 @@ case class First(child: Expression, ignoreNulls: Boolean)

   override def nullable: Boolean = true

-  // First is not a deterministic function.
-  override lazy val deterministic: Boolean = false
-
   // Return data type.
   override def dataType: DataType = child.dataType

```
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Last.scala

Lines changed: 0 additions & 3 deletions

```diff
@@ -61,9 +61,6 @@ case class Last(child: Expression, ignoreNulls: Boolean)

   override def nullable: Boolean = true

-  // Last is not a deterministic function.
-  override lazy val deterministic: Boolean = false
-
   // Return data type.
   override def dataType: DataType = child.dataType

```

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala

Lines changed: 0 additions & 4 deletions

```diff
@@ -43,10 +43,6 @@ abstract class Collect[T <: Growable[Any] with Iterable[Any]] extends TypedImperativeAggregate[T]

   override def dataType: DataType = ArrayType(child.dataType, false)

-  // Both `CollectList` and `CollectSet` are non-deterministic since their results depend on the
-  // actual order of input rows.
-  override lazy val deterministic: Boolean = false
-
   override def defaultResult: Option[Literal] = Option(Literal.create(Array(), dataType))

   protected def convertToBufferElement(value: Any): Any
```
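The commit message's floating-point analogy applies directly here: summing doubles is order-dependent yet deterministic, which is exactly the status `Collect`, `First` and `Last` now have. A quick plain-Scala demonstration:

```scala
// Floating-point addition is not associative, so the order of inputs
// changes the result - but each order reproducibly gives the same answer.
val a = (0.1 + 0.2) + 0.3       // 0.6000000000000001
val b = 0.1 + (0.2 + 0.3)       // 0.6
println(a == b)                 // false: order matters
println(a == (0.1 + 0.2) + 0.3) // true: still deterministic for a fixed order
```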

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala

Lines changed: 2 additions & 0 deletions

```diff
@@ -423,6 +423,8 @@ object EliminateDistinct extends Rule[LogicalPlan] {
     case _: BitAndAgg => true
     case _: BitOrAgg => true
     case _: CollectSet => true
+    case _: First => true
+    case _: Last => true
     case _ => false
   }
 }
```
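The shape of the rewrite this enables, paraphrased - the guard method being extended above is the one the rule consults; the name `isDuplicateAgnostic` and the exact code are recalled from the surrounding rule, so treat this as a sketch:

```scala
// Paraphrased: for duplicate-agnostic aggregates such as max, min, and now
// first and last, DISTINCT cannot change the result, so the flag is dropped:
//   first(DISTINCT b)  ==>  first(b)
plan transformExpressions {
  case ae: AggregateExpression
      if ae.isDistinct && isDuplicateAgnostic(ae.aggregateFunction) =>
    ae.copy(isDistinct = false)
}
```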

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/EliminateDistinctSuite.scala

Lines changed: 5 additions & 1 deletion

```diff
@@ -39,11 +39,15 @@ class EliminateDistinctSuite extends PlanTest {
       Min(_),
       BitAndAgg(_),
       BitOrAgg(_),
+      First(_, ignoreNulls = true),
+      First(_, ignoreNulls = false),
+      Last(_, ignoreNulls = true),
+      Last(_, ignoreNulls = false),
       CollectSet(_: Expression)
     ).foreach {
       aggBuilder =>
         val agg = aggBuilder('a)
-        test(s"Eliminate Distinct in ${agg.prettyName}") {
+        test(s"Eliminate Distinct in $agg") {
           val query = testRelation
             .select(agg.toAggregateExpression(isDistinct = true).as('result))
             .analyze
```

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala

Lines changed: 23 additions & 0 deletions

```diff
@@ -840,6 +840,29 @@ class FilterPushdownSuite extends PlanTest {
     comparePlans(optimized, correctAnswer)
   }

+  test("SPARK-32940: aggregate: push filters through first, last and collect") {
+    Seq(
+      first(_: Expression),
+      last(_: Expression),
+      collectList(_: Expression),
+      collectSet(_: Expression)
+    ).foreach { agg =>
+      val originalQuery = testRelation
+        .groupBy('a)(agg('b))
+        .where('a > 42)
+        .analyze
+
+      val optimized = Optimize.execute(originalQuery)
+
+      val correctAnswer = testRelation
+        .where('a > 42)
+        .groupBy('a)(agg('b))
+        .analyze
+
+      comparePlans(optimized, correctAnswer)
+    }
+  }
+
   test("union") {
     val testRelation2 = LocalRelation('d.int, 'e.int, 'f.int)

```
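At the user level, the new test corresponds to plans like the following - a hypothetical `DataFrame` sketch assuming an active `SparkSession` named `spark`; the pushdown itself is done by `PushPredicateThroughNonJoin`:

```scala
import org.apache.spark.sql.functions.first
import spark.implicits._ // assumes an active SparkSession named spark

val df = spark.range(100).selectExpr("id % 10 AS a", "id AS b")

// With first() now deterministic, the filter on the grouping key a can be
// pushed below the aggregate, so this...
df.groupBy($"a").agg(first($"b")).where($"a" > 5)
// ...is planned like this, filtering before aggregating:
df.where($"a" > 5).groupBy($"a").agg(first($"b"))
```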

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/UnwrapCastInBinaryComparisonSuite.scala

Lines changed: 3 additions & 4 deletions

```diff
@@ -23,7 +23,6 @@ import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.catalyst.dsl.plans.DslLogicalPlan
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.IntegralLiteralTestUtils._
-import org.apache.spark.sql.catalyst.expressions.aggregate.First
 import org.apache.spark.sql.catalyst.optimizer.UnwrapCastInBinaryComparison._
 import org.apache.spark.sql.catalyst.plans.PlanTest
 import org.apache.spark.sql.catalyst.plans.logical._
@@ -183,10 +182,10 @@ class UnwrapCastInBinaryComparisonSuite extends PlanTest with ExpressionEvalHelper {
   }

   test("unwrap cast should skip when expression is non-deterministic or foldable") {
-    Seq(positiveInt, negativeInt).foreach(v => {
-      val e = Cast(First(f, ignoreNulls = true), IntegerType) <=> v
+    Seq(positiveLong, negativeLong).foreach (v => {
+      val e = Cast(Rand(0), LongType) <=> v
       assertEquivalent(e, e, evaluate = false)
-      val e2 = Cast(Literal(30.toShort), IntegerType) >= v
+      val e2 = Cast(Literal(30), LongType) >= v
       assertEquivalent(e2, e2, evaluate = false)
     })
   }
```
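The swap is needed because `First` can no longer stand in for "a non-deterministic expression" in this test, while `Rand` still can. A REPL-style sketch of the property the test relies on, using the constructors as they appear in the diff above:

```scala
import org.apache.spark.sql.catalyst.expressions.{Literal, Rand}
import org.apache.spark.sql.catalyst.expressions.aggregate.First

First(Literal(1), ignoreNulls = true).deterministic // now true
Rand(0).deterministic                               // still false
```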

sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala

Lines changed: 20 additions & 9 deletions

```diff
@@ -35,7 +35,7 @@ import org.apache.spark.sql.catalyst.plans.logical.{LocalLimit, Project, Reparti
 import org.apache.spark.sql.catalyst.util.StringUtils
 import org.apache.spark.sql.execution.{CommandResultExec, UnionExec}
 import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper
-import org.apache.spark.sql.execution.aggregate.{HashAggregateExec, ObjectHashAggregateExec, SortAggregateExec}
+import org.apache.spark.sql.execution.aggregate._
 import org.apache.spark.sql.execution.columnar.InMemoryTableScanExec
 import org.apache.spark.sql.execution.command.{DataWritingCommandExec, FunctionsCommand}
 import org.apache.spark.sql.execution.datasources.{InsertIntoHadoopFsRelationCommand, LogicalRelation}
@@ -44,6 +44,7 @@ import org.apache.spark.sql.execution.datasources.v2.orc.OrcScan
 import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetScan
 import org.apache.spark.sql.execution.exchange.ReusedExchangeExec
 import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, CartesianProductExec, SortMergeJoinExec}
+import org.apache.spark.sql.expressions.Aggregator
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SharedSparkSession
@@ -2790,15 +2791,25 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSparkPlanHelper {
   }

   test("Non-deterministic aggregate functions should not be deduplicated") {
-    val query = "SELECT a, first_value(b), first_value(b) + 1 FROM testData2 GROUP BY a"
-    val df = sql(query)
-    val physical = df.queryExecution.sparkPlan
-    val aggregateExpressions = physical.collectFirst {
-      case agg : HashAggregateExec => agg.aggregateExpressions
-      case agg : SortAggregateExec => agg.aggregateExpressions
+    withUserDefinedFunction("sumND" -> true) {
+      spark.udf.register("sumND", udaf(new Aggregator[Long, Long, Long] {
+        def zero: Long = 0L
+        def reduce(b: Long, a: Long): Long = b + a
+        def merge(b1: Long, b2: Long): Long = b1 + b2
+        def finish(r: Long): Long = r
+        def bufferEncoder: Encoder[Long] = Encoders.scalaLong
+        def outputEncoder: Encoder[Long] = Encoders.scalaLong
+      }).asNondeterministic())
+
+      val query = "SELECT a, sumND(b), sumND(b) + 1 FROM testData2 GROUP BY a"
+      val df = sql(query)
+      val physical = df.queryExecution.sparkPlan
+      val aggregateExpressions = physical.collectFirst {
+        case agg: BaseAggregateExec => agg.aggregateExpressions
+      }
+      assert(aggregateExpressions.isDefined)
+      assert(aggregateExpressions.get.size == 2)
     }
-    assert (aggregateExpressions.isDefined)
-    assert (aggregateExpressions.get.size == 2)
   }

   test("SPARK-22356: overlapped columns between data and partition schema in data source tables") {
```
