@@ -67,4 +67,10 @@ case class PythonUDF(
    exprId = resultId)

  override def nullable: Boolean = true
+
+  override lazy val canonicalized: Expression = {
+    val canonicalizedChildren = children.map(_.canonicalized)
+    // `resultId` can be seen as a cosmetic variation in PythonUDF, as it doesn't affect the result.
+    this.copy(resultId = ExprId(-1)).withNewChildren(canonicalizedChildren)
+  }
}
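A note on why the override helps: each PythonUDF instance is created with a fresh `resultId`, so two occurrences of the same UDF call would otherwise never compare equal after canonicalization. A minimal sketch of the intended behavior, assuming `udf` is some deterministic PythonUDF instance (illustrative only; uses Spark's internal `ExprId` API):

  // Two copies of the same UDF call that differ only in `resultId`
  // should compare equal after canonicalization, so optimizer rules can
  // recognize repeated occurrences of the same call.
  val u1 = udf.copy(resultId = ExprId(100))
  val u2 = udf.copy(resultId = ExprId(200))
  assert(u1.canonicalized == u2.canonicalized)  // resultId is normalized to ExprId(-1)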
@@ -22,7 +22,7 @@ import org.apache.spark.sql.catalyst.catalog.SessionCatalog
import org.apache.spark.sql.catalyst.optimizer.{ColumnPruning, Optimizer, PushPredicateThroughNonJoin, RemoveNoopOperators}
import org.apache.spark.sql.execution.datasources.PruneFileSourcePartitions
import org.apache.spark.sql.execution.datasources.SchemaPruning
-import org.apache.spark.sql.execution.python.{ExtractPythonUDFFromAggregate, ExtractPythonUDFs}
+import org.apache.spark.sql.execution.python.{ExtractGroupingPythonUDFFromAggregate, ExtractPythonUDFFromAggregate, ExtractPythonUDFs}

class SparkOptimizer(
    catalog: SessionCatalog,
@@ -33,6 +33,8 @@
Batch("Optimize Metadata Only Query", Once, OptimizeMetadataOnlyQuery(catalog)) :+
Batch("Extract Python UDFs", Once,
ExtractPythonUDFFromAggregate,
// This must be executed after `ExtractPythonUDFFromAggregate` and before `ExtractPythonUDFs`.
ExtractGroupingPythonUDFFromAggregate,
ExtractPythonUDFs,
// The eval-python node may be between Project/Filter and the scan node, which breaks
// column pruning and filter push-down. Here we rerun the related optimizer rules.
@@ -45,7 +47,7 @@
Batch("User Provided Optimizers", fixedPoint, experimentalMethods.extraOptimizations: _*)

override def nonExcludableRules: Seq[String] = super.nonExcludableRules :+
ExtractPythonUDFFromAggregate.ruleName :+
ExtractPythonUDFFromAggregate.ruleName :+ ExtractGroupingPythonUDFFromAggregate.ruleName :+
ExtractPythonUDFs.ruleName
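For context, rules listed in nonExcludableRules cannot be disabled through the optimizer-exclusion config, so a setting like the following sketch would be silently ignored for these rules (assumes a SparkSession named `spark`):

  // Attempting to exclude a non-excludable rule has no effect:
  // SparkOptimizer keeps it in the "Extract Python UDFs" batch regardless.
  spark.conf.set("spark.sql.optimizer.excludedRules",
    "org.apache.spark.sql.execution.python.ExtractGroupingPythonUDFFromAggregate")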

@@ -81,6 +81,69 @@ object ExtractPythonUDFFromAggregate extends Rule[LogicalPlan] {
  }
}

+/**
+ * Extracts PythonUDFs in a logical Aggregate that are used as grouping keys, so that they can be
+ * evaluated before the aggregate.
+ * This rule must be executed after `ExtractPythonUDFFromAggregate` and before `ExtractPythonUDFs`.
+ */
+object ExtractGroupingPythonUDFFromAggregate extends Rule[LogicalPlan] {
[Comment — Contributor]
What's the difference between this rule and ExtractPythonUDFFromAggregate?

[Reply — Member (author)]
ExtractPythonUDFFromAggregate pulls out Python UDFs that take an aggregate expression or a grouping key as input, such as udf(sum(c)), as well as Python UDFs that take no input. The UDFs it pulls out are evaluated after the aggregate.

This rule, ExtractGroupingPythonUDFFromAggregate, pulls out Python UDFs that are used in grouping keys, as in SELECT count(*) FROM table GROUP BY udf(id). This kind of Python UDF is evaluated before the aggregate.
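To illustrate the split, a small DSL sketch (hypothetical names: `udf` is a deterministic scalar Python UDF, `df` has columns `key`, `c`, and `id`):

  // Handled by ExtractPythonUDFFromAggregate: the UDF consumes an aggregate
  // result, so it must be evaluated after the Aggregate node.
  df.groupBy($"key").agg(udf(sum($"c")))

  // Handled by ExtractGroupingPythonUDFFromAggregate: the UDF produces the
  // grouping key itself, so it must be evaluated before the Aggregate node.
  df.groupBy(udf($"id")).agg(count(lit(1)))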

+  private def hasScalarPythonUDF(e: Expression): Boolean = {
+    e.find(PythonUDF.isScalarPythonUDF).isDefined
+  }
+
+  private def extract(agg: Aggregate): LogicalPlan = {
+    val projList = new ArrayBuffer[NamedExpression]()
+    val groupingExpr = new ArrayBuffer[Expression]()
+    val attributeMap = mutable.HashMap[PythonUDF, NamedExpression]()
+
+    agg.groupingExpressions.foreach { expr =>
+      if (hasScalarPythonUDF(expr)) {
+        val newE = expr transformDown {
+          case p: PythonUDF =>
+            // This is just a sanity check; the rule PullOutNondeterministic should
+            // already pull out those nondeterministic expressions.
+            assert(p.udfDeterministic, "Non-deterministic PythonUDFs should not appear " +
+              "in grouping expression")
+            val canonicalized = p.canonicalized.asInstanceOf[PythonUDF]
+            if (attributeMap.contains(canonicalized)) {
+              attributeMap(canonicalized)
+            } else {
+              val alias = Alias(p, "groupingPythonUDF")()
+              projList += alias
+              attributeMap += ((canonicalized, alias.toAttribute))
+              alias.toAttribute
+            }
+        }
+        groupingExpr += newE
+      } else {
+        groupingExpr += expr
+      }
+    }
+    val aggExpr = agg.aggregateExpressions.map { expr =>
+      expr.transformUp {
+        // A PythonUDF over an aggregate was pulled out by ExtractPythonUDFFromAggregate.
+        // A PythonUDF here should be either:
+        // 1. An argument of an aggregate function.
+        //    CheckAnalysis guarantees the arguments are deterministic.
+        // 2. A PythonUDF in a grouping key. Grouping keys must be deterministic.
+        // 3. A PythonUDF not in a grouping key. It either has no arguments or has grouping
+        //    keys among its arguments. Such PythonUDFs were pulled out by
+        //    ExtractPythonUDFFromAggregate, too.
[Comment — Member]
Tiny nit: spacing.
+        case p: PythonUDF if p.udfDeterministic =>
+          val canonicalized = p.canonicalized.asInstanceOf[PythonUDF]
+          attributeMap.getOrElse(canonicalized, p)
[Comment — Contributor]
Nit: if we can't replace the Python UDF here, we can't run the query. Maybe it's better to do attributeMap.get(...).getOrElse(fail)?

[Reply — Member (author), Aug 2, 2019]
If we can't replace the Python UDF here, the query can still run. For example:

  val df = base.groupBy(pythonTestUDF(base("a")))
    .agg(sum(pythonTestUDF(base("a") + 1)))

ExtractPythonUDFs will extract such UDFs.

+      }.asInstanceOf[NamedExpression]
+    }
+    agg.copy(
+      groupingExpressions = groupingExpr,
+      aggregateExpressions = aggExpr,
+      child = Project(projList ++ agg.child.output, agg.child))
+  }
+
+  def apply(plan: LogicalPlan): LogicalPlan = plan transformUp {
+    case agg: Aggregate if agg.groupingExpressions.exists(hasScalarPythonUDF(_)) =>
+      extract(agg)
+  }
+}
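To make the rewrite concrete, here is a rough sketch of what `extract` does for a grouping-key UDF (plan shapes simplified; `pyUDF` and the attribute names are illustrative):

  val q = df.groupBy(pyUDF($"id")).agg(count(lit(1)))
  // Before the rule:
  //   Aggregate [pyUDF(id)], [count(1)]
  //   +- Relation [id, ...]
  // After the rule: the UDF is evaluated once in a Project below the Aggregate,
  // and the grouping key references the projected attribute instead.
  //   Aggregate [groupingPythonUDF], [count(1)]
  //   +- Project [pyUDF(id) AS groupingPythonUDF, id, ...]
  //      +- Relation [id, ...]

Repeated occurrences of the same canonicalized UDF reuse a single alias, which is what the attributeMap lookup provides.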

/**
* Extracts PythonUDFs from operators, rewriting the query plan so that the UDF can be evaluated
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.python
+
+import org.apache.spark.sql.{IntegratedUDFTestUtils, QueryTest}
+import org.apache.spark.sql.functions.count
+import org.apache.spark.sql.test.SharedSQLContext
+
+class PythonUDFSuite extends QueryTest with SharedSQLContext {
+  import testImplicits._
+
+  import IntegratedUDFTestUtils._
+
+  val scalaTestUDF = TestScalaUDF(name = "scalaUDF")
+  val pythonTestUDF = TestPythonUDF(name = "pyUDF")
+  assume(shouldTestPythonUDFs)
+
+  lazy val base = Seq(
+    (Some(1), Some(1)), (Some(1), Some(2)), (Some(2), Some(1)),
+    (Some(2), Some(2)), (Some(3), Some(1)), (Some(3), Some(2)),
+    (None, Some(1)), (Some(3), None), (None, None)).toDF("a", "b")
+
+  test("SPARK-28445: PythonUDF as grouping key and aggregate expressions") {
+    val df1 = base.groupBy(scalaTestUDF(base("a") + 1))
[Comment — Member]
BTW, thanks for changing this into DSL. It was rather a nit that needed some effort.

+      .agg(scalaTestUDF(base("a") + 1), scalaTestUDF(count(base("b"))))
+    val df2 = base.groupBy(pythonTestUDF(base("a") + 1))
+      .agg(pythonTestUDF(base("a") + 1), pythonTestUDF(count(base("b"))))
+    checkAnswer(df1, df2)
+  }
+
+  test("SPARK-28445: PythonUDF as grouping key and used in aggregate expressions") {
+    val df1 = base.groupBy(scalaTestUDF(base("a") + 1))
+      .agg(scalaTestUDF(base("a") + 1) + 1, scalaTestUDF(count(base("b"))))
+    val df2 = base.groupBy(pythonTestUDF(base("a") + 1))
+      .agg(pythonTestUDF(base("a") + 1) + 1, pythonTestUDF(count(base("b"))))
+    checkAnswer(df1, df2)
+  }
+
+  test("SPARK-28445: PythonUDF in aggregate expression has grouping key in its arguments") {
+    val df1 = base.groupBy(scalaTestUDF(base("a") + 1))
+      .agg(scalaTestUDF(scalaTestUDF(base("a") + 1)), scalaTestUDF(count(base("b"))))
+    val df2 = base.groupBy(pythonTestUDF(base("a") + 1))
+      .agg(pythonTestUDF(pythonTestUDF(base("a") + 1)), pythonTestUDF(count(base("b"))))
+    checkAnswer(df1, df2)
+  }
+
+  test("SPARK-28445: PythonUDF over grouping key is argument to aggregate function") {
+    val df1 = base.groupBy(scalaTestUDF(base("a") + 1))
+      .agg(scalaTestUDF(scalaTestUDF(base("a") + 1)),
+        scalaTestUDF(count(scalaTestUDF(base("a") + 1))))
+    val df2 = base.groupBy(pythonTestUDF(base("a") + 1))
+      .agg(pythonTestUDF(pythonTestUDF(base("a") + 1)),
+        pythonTestUDF(count(pythonTestUDF(base("a") + 1))))
+    checkAnswer(df1, df2)
+  }
+}
}