apache · viirya · Jul 21, 2019 · Jul 22, 2019 · Jul 22, 2019 · Jul 24, 2019
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/PythonUDF.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/PythonUDF.scala
@@ -67,4 +67,10 @@ case class PythonUDF(
     exprId = resultId)
 
   override def nullable: Boolean = true
+
+  override lazy val canonicalized: Expression = {
+    // `resultId` is only a cosmetic difference between PythonUDFs: it does not affect the
+    // result, so it is replaced by a fixed id before the expression is canonicalized.
+    val withFixedResultId = this.copy(resultId = ExprId(-1))
+    Canonicalize.execute(withFixedResultId.withNewChildren(children.map(_.canonicalized)))
+  }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala
@@ -22,7 +22,7 @@ import org.apache.spark.sql.catalyst.catalog.SessionCatalog
 import org.apache.spark.sql.catalyst.optimizer.{ColumnPruning, Optimizer, PushPredicateThroughNonJoin, RemoveNoopOperators}
 import org.apache.spark.sql.execution.datasources.PruneFileSourcePartitions
 import org.apache.spark.sql.execution.datasources.SchemaPruning
-import org.apache.spark.sql.execution.python.{ExtractPythonUDFFromAggregate, ExtractPythonUDFs}
+import org.apache.spark.sql.execution.python.{ExtractGroupingPythonUDFFromAggregate, ExtractPythonUDFFromAggregate, ExtractPythonUDFs}
 
 class SparkOptimizer(
     catalog: SessionCatalog,
@@ -33,6 +33,7 @@ class SparkOptimizer(
     Batch("Optimize Metadata Only Query", Once, OptimizeMetadataOnlyQuery(catalog)) :+
     Batch("Extract Python UDFs", Once,
       ExtractPythonUDFFromAggregate,
+      ExtractGroupingPythonUDFFromAggregate,
       ExtractPythonUDFs,
       // The eval-python node may be between Project/Filter and the scan node, which breaks
       // column pruning and filter push-down. Here we rerun the related optimizer rules.

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ExtractPythonUDFs.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ExtractPythonUDFs.scala
@@ -81,6 +81,64 @@ object ExtractPythonUDFFromAggregate extends Rule[LogicalPlan] {
   }
 }
 
+/**
+ * Extracts PythonUDFs in logical aggregate, which are used in grouping keys, and evaluates
+ * them in a `Project` placed below the aggregate. PythonUDFs with the same canonicalized form
+ * are projected only once and share the resulting attribute.
+ * This must be executed after `ExtractPythonUDFFromAggregate` rule and before `ExtractPythonUDFs`.
+ */
+object ExtractGroupingPythonUDFFromAggregate extends Rule[LogicalPlan] {
+  /** Returns true if `e` contains at least one scalar PythonUDF anywhere in its tree. */
+  private def hasScalarPythonUDF(e: Expression): Boolean = {
+    e.find(PythonUDF.isScalarPythonUDF).isDefined
+  }
+
+  /**
+   * Rewrites `agg` so that every PythonUDF found in a grouping expression that contains a
+   * scalar PythonUDF is computed by a `Project` inserted below the aggregate. The grouping
+   * expressions, and deterministic PythonUDFs in the aggregate expressions that have the same
+   * canonicalized form, are rewritten to reference the projected attribute.
+   */
+  private def extract(agg: Aggregate): LogicalPlan = {
+    val projList = new ArrayBuffer[NamedExpression]()
+    val groupingExpr = new ArrayBuffer[Expression]()
+    // Maps a canonicalized PythonUDF to the alias that evaluates it below the aggregate.
+    val attributeMap = mutable.HashMap[PythonUDF, NamedExpression]()
+
+    agg.groupingExpressions.foreach { expr =>
+      if (hasScalarPythonUDF(expr)) {
+        val newE = expr transformDown {
+          case p: PythonUDF =>
+            // This is just a sanity check, the rule PullOutNondeterministic should
+            // already pull out those nondeterministic expressions.
+            assert(p.udfDeterministic, "Non-determinstic PythonUDFs should not appear " +
+              "in grouping expression")
+            val canonicalized = p.canonicalized.asInstanceOf[PythonUDF]
+            // An equivalent UDF may occur in several grouping keys. Alias and project it only
+            // the first time it is seen so that it is not evaluated more than once.
+            attributeMap.getOrElseUpdate(canonicalized, {
+              val alias = Alias(p, "groupingPythonUDF")()
+              projList += alias
+              alias
+            }).toAttribute
+        }
+        groupingExpr += newE
+      } else {
+        groupingExpr += expr
+      }
+    }
+    val aggExpr = agg.aggregateExpressions.map { expr =>
+      expr.transformUp {
+        // PythonUDF over aggregate was pulled out by ExtractPythonUDFFromAggregate.
+        // PythonUDF here should be either
+        // 1. Argument of an aggregate function.
+        //    CheckAnalysis guarantees the arguments are deterministic.
+        // 2. PythonUDF in grouping key. Grouping key must be deterministic.
+        // 3. PythonUDF not in grouping key. It either has no arguments or has grouping keys
+        //    in its arguments. Such PythonUDF was pulled out by ExtractPythonUDFFromAggregate,
+        //    too.
+        case p: PythonUDF if p.udfDeterministic =>
+          val canonicalized = p.canonicalized.asInstanceOf[PythonUDF]
+          attributeMap.get(canonicalized).map(_.toAttribute).getOrElse(p)
+      }.asInstanceOf[NamedExpression]
+    }
+    // `.toSeq` hands the plan nodes a `Seq` rather than the mutable buffers built above.
+    agg.copy(
+      groupingExpressions = groupingExpr.toSeq,
+      aggregateExpressions = aggExpr,
+      child = Project((projList ++ agg.child.output).toSeq, agg.child))
+  }
+
+  /** Applies [[extract]] bottom-up to every aggregate with a scalar PythonUDF grouping key. */
+  def apply(plan: LogicalPlan): LogicalPlan = plan transformUp {
+    case agg: Aggregate if agg.groupingExpressions.exists(hasScalarPythonUDF(_)) =>
+      extract(agg)
+  }
+}
 
 /**
  * Extracts PythonUDFs from operators, rewriting the query plan so that the UDF can be evaluated

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -2189,4 +2189,50 @@ class DataFrameSuite extends QueryTest with SharedSQLContext {
           |*(1) Range (0, 10, step=1, splits=2)""".stripMargin))
     }
   }
+
+  test("SPARK-28445: PythonUDF in grouping key and aggregate expressions") {
+    import IntegratedUDFTestUtils._
+
+    val scalaTestUDF = TestScalaUDF(name = "scalaUDF")
+    val pythonTestUDF = TestPythonUDF(name = "pyUDF")
+    assume(shouldTestPythonUDFs)
+
+    withTempView("testData") {
+      sql(
+        """CREATE OR REPLACE TEMPORARY VIEW testData AS
+          |SELECT * FROM VALUES
+          |(1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (null, 1), (3, null), (null, null)
+          |AS testData(a, b)""".stripMargin)
+
+      val base = spark.table("testData")
+      // Every query below is built twice, once with the Scala UDF and once with the Python
+      // UDF, and the two variants are checked to return the same answer.
+      val keyInput = base("a") + 1
+
+      // The grouping key is repeated unchanged as an aggregate expression.
+      val scalaSameKey = base
+        .groupBy(scalaTestUDF(keyInput))
+        .agg(scalaTestUDF(keyInput), scalaTestUDF(count(base("b"))))
+      val pythonSameKey = base
+        .groupBy(pythonTestUDF(keyInput))
+        .agg(pythonTestUDF(keyInput), pythonTestUDF(count(base("b"))))
+      checkAnswer(scalaSameKey, pythonSameKey)
+
+      // The grouping key is used inside a larger aggregate expression.
+      val scalaKeyPlusOne = base
+        .groupBy(scalaTestUDF(keyInput))
+        .agg(scalaTestUDF(keyInput) + 1, scalaTestUDF(count(base("b"))))
+      val pythonKeyPlusOne = base
+        .groupBy(pythonTestUDF(keyInput))
+        .agg(pythonTestUDF(keyInput) + 1, pythonTestUDF(count(base("b"))))
+      checkAnswer(scalaKeyPlusOne, pythonKeyPlusOne)
+
+      // The UDF in the aggregate expression takes the grouping key as its argument.
+      val scalaUdfOverKey = base
+        .groupBy(scalaTestUDF(keyInput))
+        .agg(scalaTestUDF(scalaTestUDF(keyInput)), scalaTestUDF(count(base("b"))))
+      val pythonUdfOverKey = base
+        .groupBy(pythonTestUDF(keyInput))
+        .agg(pythonTestUDF(pythonTestUDF(keyInput)), pythonTestUDF(count(base("b"))))
+      checkAnswer(scalaUdfOverKey, pythonUdfOverKey)
+
+      // The grouping key is additionally passed as the argument of the aggregate function.
+      val scalaKeyInAggFunc = base
+        .groupBy(scalaTestUDF(keyInput))
+        .agg(
+          scalaTestUDF(scalaTestUDF(keyInput)),
+          scalaTestUDF(count(scalaTestUDF(keyInput))))
+      val pythonKeyInAggFunc = base
+        .groupBy(pythonTestUDF(keyInput))
+        .agg(
+          pythonTestUDF(pythonTestUDF(keyInput)),
+          pythonTestUDF(count(pythonTestUDF(keyInput))))
+      checkAnswer(scalaKeyInAggFunc, pythonKeyInAggFunc)
+    }
+  }
 }