Commits (83)
e0e39d5
[SPARK-34079][SQL] Merging non-correlated scalar subqueries to multi-…
peter-toth Apr 21, 2021
0a7e0e2
no need for the whole plan traversal in this PR
peter-toth Apr 23, 2021
e35cdc1
Merge commit 'fdccd88c2a6dd18c9d446b63fccd5c6188ea125c' into SPARK-34…
peter-toth Apr 23, 2021
0cff7b2
add MULTI_SCALAR_SUBQUERY pattern
peter-toth Apr 23, 2021
22e833d
Merge commit '9af338cd685bce26abbc2dd4d077bde5068157b1' into SPARK-34…
peter-toth Apr 23, 2021
42add09
Merge commit '132cbf0c8c1a382f33d8d212f931f5956f85a2f9' into SPARK-34…
peter-toth Apr 30, 2021
e63111d
add some tests, add more docs
peter-toth Apr 30, 2021
c84f0ee
Merge commit '2634dbac35c5e8d5216b38fd4256f5fd059f341f' into SPARK-34…
peter-toth May 7, 2021
ee8f12a
fix test
peter-toth May 7, 2021
17fd666
rename mergePlans() to tryMergePlans()
peter-toth May 6, 2021
6134fa9
add test to cover aggregate and group expression merge
peter-toth May 7, 2021
2828345
do not merge different aggregate implementations and add test
peter-toth May 6, 2021
1f2f75c
drop MultiScalarSubquery, use ScalarSubquery(CreateStruct()) instead
peter-toth May 7, 2021
a3e84a4
refactor, add support for filter and join, add new tests, add…
peter-toth May 16, 2021
0fe66dc
minor fixes
peter-toth May 23, 2021
100cb9c
extract common scalar subqueries
peter-toth May 28, 2021
9d8dd6b
Merge commit 'cd2ef9cb43f4e27302906ac2f605df4b66a72a2f' into SPARK-34…
peter-toth May 31, 2021
f83f22b
add and update docs
peter-toth Jun 2, 2021
41c0f0a
Merge branch 'master' into SPARK-34079-multi-column-scalar-subquery
peter-toth Jun 20, 2021
d10a8be
Merge branch 'master' into SPARK-34079-multi-column-scalar-subquery
peter-toth Jun 23, 2021
db34640
Merge branch 'master' into SPARK-34079-multi-column-scalar-subquery
peter-toth Jun 28, 2021
d081885
Clean up code, add more comments
peter-toth Jun 29, 2021
e98754a
temp
peter-toth Jun 29, 2021
bb623cf
Merge branch 'master' into SPARK-34079-multi-column-scalar-subquery
peter-toth Sep 14, 2021
ae1d84e
fix messages
peter-toth Sep 16, 2021
2eb14f1
regenerate expected plans
peter-toth Sep 16, 2021
060e4b7
add more comments
peter-toth Sep 20, 2021
d86d2c4
use Header alias type
peter-toth Sep 20, 2021
0a97c8b
Merge commit '6e8a4626117f0cb5535875f7181f56350ad4f195' into SPARK-34…
peter-toth Oct 5, 2021
61f2b34
Merge commit '8ae88d01b46d581367d0047b50fcfb65078ab972' into SPARK-34…
peter-toth Nov 9, 2021
532d05e
remove dependency on `spark.sql.execution.reuseSubquery`, remove unnec…
peter-toth Nov 16, 2021
c488377
refactor
attilapiros Nov 16, 2021
e0a7610
accept Attila's suggestion but keep the `merged` flag, minor name cha…
peter-toth Nov 17, 2021
63c3709
fix review findings
peter-toth Nov 17, 2021
dabbea4
add negative test case to general join matching where only a non-chil…
peter-toth Nov 17, 2021
4d97de5
amend a test to cover extra projects on both sides
peter-toth Nov 17, 2021
cc8690e
improve generic node merging
peter-toth Nov 17, 2021
83c78ca
minor fix
peter-toth Nov 24, 2021
3130913
fix merging logic if merging a plan into a merged plan
peter-toth Nov 28, 2021
3e8f7fa
remove unused mapAttributes()
peter-toth Nov 28, 2021
252c9b1
do not merge nondeterministic plans
peter-toth Dec 6, 2021
fa5e786
fix test and check adaptive path as well
peter-toth Dec 6, 2021
e292732
check test results
peter-toth Dec 6, 2021
963c423
check for same instance of subqueries
peter-toth Dec 6, 2021
9efaf2a
use the new `isCorrelated()`
peter-toth Dec 10, 2021
96a502d
move deterministic check as early as possible
peter-toth Dec 10, 2021
5b91d61
Merge branch 'master' into SPARK-34079-multi-column-scalar-subquery
peter-toth Mar 2, 2022
8bcf515
Merge branch 'master' into SPARK-34079-multi-column-scalar-subquery
peter-toth Mar 16, 2022
87ba289
fix comments
peter-toth Mar 18, 2022
6d5a124
rephrase general node merging
peter-toth Mar 18, 2022
851ca29
Merge branch 'master' into SPARK-34079-multi-column-scalar-subquery
peter-toth Mar 24, 2022
96d0cab
use CTE nodes
peter-toth Mar 23, 2022
a57ed32
no need for extra shuffle with subquery `CTERelationDef`s
peter-toth Mar 24, 2022
0b34d83
regenerate expected plan stability output
peter-toth Mar 24, 2022
4985d43
remove obsolete assert
peter-toth Mar 24, 2022
de9b312
fix LogicalPlanTagInSparkPlanSuite, for logical scan plan trees consi…
peter-toth Mar 24, 2022
13a2fad
fix row-level runtime filtering as after subquery merging bloom filte…
peter-toth Mar 24, 2022
92ce6e5
Merge branch 'master' into SPARK-34079-multi-column-scalar-subquery
peter-toth Mar 29, 2022
67ffae6
fix header scaladoc
peter-toth Apr 6, 2022
a32a85c
rename subquery flag to mergedScalarSubquery, fix CTERelationRef scal…
peter-toth Apr 6, 2022
1bc8a45
fix test name
peter-toth Apr 6, 2022
224edef
add new testcase "Merge non-correlated scalar subqueries in a subquery"
peter-toth Apr 6, 2022
a7fd1c5
add test "Merge non-correlated scalar subqueries with conflicting names"
peter-toth Apr 6, 2022
a5eb5df
add test "Merging subqueries from different places"
peter-toth Apr 6, 2022
8457148
add test "Do not merge subqueries with different join conditions", fi…
peter-toth Apr 6, 2022
1ff64e4
add test "Do not merge subqueries with different filter conditions"
peter-toth Apr 6, 2022
13a1cdb
simplify do not merge test cases
peter-toth Apr 6, 2022
4da3fe6
drop general node merging code path
peter-toth Apr 6, 2022
96ed6fd
use canonicalized form in Filter and Join condition comparison
peter-toth Apr 7, 2022
dbe81e2
simplify aggregate check
peter-toth Apr 18, 2022
ba299d5
fix aggregate grouping compare
peter-toth Apr 18, 2022
65f3425
simplify header
peter-toth Apr 18, 2022
dc5e9b9
Merge branch 'master' into SPARK-34079-multi-column-scalar-subquery
peter-toth Apr 19, 2022
c64373b
rebase on top of https://github.com/apache/spark/pull/34929
peter-toth Apr 19, 2022
3993eab
revert regenerated q5 expected output
peter-toth Apr 19, 2022
f93283d
fix nested subqueries, add test
peter-toth Apr 19, 2022
8c5c9ac
fix comment
peter-toth Apr 19, 2022
3b7ad2c
rename method
peter-toth Apr 19, 2022
c268580
fix test name
peter-toth Apr 19, 2022
169fd6b
fix removeReferences
peter-toth Apr 19, 2022
19128ff
rename merged subquery flag in cte def
peter-toth Apr 20, 2022
1c4d14b
simplify removeReferences, fix tests
peter-toth Apr 20, 2022
2590edf
fix scala 2.13
peter-toth Apr 20, 2022
@@ -22,8 +22,9 @@ import scala.collection.mutable.ArrayBuffer
import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression
import org.apache.spark.sql.catalyst.plans.QueryPlan
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan}
import org.apache.spark.sql.catalyst.trees.LeafLike
import org.apache.spark.sql.catalyst.trees.TreePattern.{EXISTS_SUBQUERY, LIST_SUBQUERY,
PLAN_EXPRESSION, SCALAR_SUBQUERY, TreePattern}
MULTI_SCALAR_SUBQUERY, PLAN_EXPRESSION, SCALAR_SUBQUERY, TreePattern}
import org.apache.spark.sql.types._
import org.apache.spark.util.collection.BitSet

@@ -267,6 +268,33 @@ object ScalarSubquery {
}
}

/**
* A subquery that is capable of returning multiple scalar values.
*/
case class MultiScalarSubquery(
Contributor:

Do we have to create a new subquery expression? It seems like we can just use CreateNamedStruct in ScalarSubquery.plan
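A hedged sketch of that suggestion, assuming resolved aggregate expressions `avgExpr` and `sumExpr` and a merged `aggregatePlan` (none of these names are from this PR): the merged output is wrapped into a single named-struct column so that a plain ScalarSubquery can carry it:

```scala
// Hedged sketch: wrap the merged aggregate's outputs into one struct column.
// `avgExpr`, `sumExpr` and `aggregatePlan` are assumed, not from this PR.
val structPlan = Project(
  Seq(Alias(
    CreateNamedStruct(Seq(
      Literal("avg_a"), avgExpr,
      Literal("sum_b"), sumExpr)),
    "mergedValue")()),
  aggregatePlan)
// A plain ScalarSubquery can then carry the struct; callers read the fields
// back with GetStructField, so no new subquery expression type is needed.
val merged = ScalarSubquery(structPlan)
```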

Contributor Author (peter-toth):

I think you are right.

Contributor Author (peter-toth):

Dropped `MultiScalarSubquery` in 1f2f75c; will change the docs and the PR description soon.

plan: LogicalPlan,
exprId: ExprId = NamedExpression.newExprId)
extends SubqueryExpression(plan, Seq.empty, exprId) with LeafLike[Expression] with Unevaluable {
override def dataType: DataType = {
assert(plan.schema.nonEmpty, "Multi-column scalar subquery should have columns")
plan.schema
}

override def nullable: Boolean = true

override def withNewPlan(plan: LogicalPlan): MultiScalarSubquery = copy(plan = plan)

override def toString: String = s"multi-scalar-subquery#${exprId.id}"

override lazy val canonicalized: Expression = {
MultiScalarSubquery(
plan.canonicalized,
ExprId(0))
}

final override def nodePatternsInternal: Seq[TreePattern] = Seq(MULTI_SCALAR_SUBQUERY)
}
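For orientation, a hedged sketch of how a consumer reads individual values back out of this expression (`mergedPlan` is an assumed merged subquery plan; the ordinal-based GetStructField access mirrors the MergeScalarSubqueries rule below):

```scala
// `mergedPlan` is an assumed LogicalPlan produced by merging two subqueries,
// e.g. with output columns (avg(a), sum(b)). dataType is its full schema
// (a StructType), so each original scalar value is read back by ordinal.
val mss = MultiScalarSubquery(mergedPlan)
val first = GetStructField(mss, 0)  // e.g. the original avg(a) value
val second = GetStructField(mss, 1) // e.g. the original sum(b) value
```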

/**
* A [[ListQuery]] expression defines the query which we want to search in an IN subquery
* expression. It should and can only be used in conjunction with an IN expression.
@@ -0,0 +1,184 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.catalyst.optimizer

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LeafNode, LogicalPlan, Project}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.catalyst.trees.TreePattern.{MULTI_SCALAR_SUBQUERY, SCALAR_SUBQUERY}

/**
* This rule tries to merge multiple non-correlated [[ScalarSubquery]]s into a
* [[MultiScalarSubquery]] to compute multiple scalar values at once.
*
* The process is the following:
* - While traversing the plan, we try to merge each [[ScalarSubquery]] plan into the cache of
* already seen subquery plans. If the merge succeeds, the cache is updated with the merged
* subquery plan; if not, the new subquery plan is added to the cache.
* - The original [[ScalarSubquery]] expression is replaced with a reference pointing to its cached
* version in this form: `GetStructField(MultiScalarSubquery(SubqueryReference(...)))`.
* - A second traversal checks if a [[SubqueryReference]] points to a subquery plan that
* returns multiple values, and either replaces only the [[SubqueryReference]] with the cached
* plan or restores the whole expression to its original [[ScalarSubquery]] form.
* - The [[ReuseSubquery]] rule makes sure that merged subqueries are computed only once.
*
* E.g. the following query:
*
* SELECT
* (SELECT avg(a) FROM t GROUP BY b),
* (SELECT sum(b) FROM t GROUP BY b)
*
* is optimized from:
*
* Project [scalar-subquery#231 [] AS scalarsubquery()#241,
* scalar-subquery#232 [] AS scalarsubquery()#242L]
* : :- Aggregate [b#234], [avg(a#233) AS avg(a)#236]
* : : +- Relation default.t[a#233,b#234] parquet
* : +- Aggregate [b#240], [sum(b#240) AS sum(b)#238L]
* : +- Project [b#240]
* : +- Relation default.t[a#239,b#240] parquet
@sigmod (Contributor), May 6, 2021:

Is it possible to rewrite them into a single scalar subquery using struct, e.g.,

SELECT
(SELECT avg(a) FROM t ),
(SELECT sum(b) FROM t )
FROM R

=>

SELECT st.avg_a, st.sum_b
FROM (
SELECT (SELECT STRUCT(avg(a) AS avg_a, sum(b) AS sum_b) FROM t) AS st
FROM R
)

This way,

  • (1) you don't need to rely on ReuseExchangeAndSubquery;
  • (2) the rewrite can then work for any subqueries, regardless of uncorrelated vs. correlated, reading from tables vs. reading from an array column?

Contributor Author (peter-toth):

I think that is a smart way to rewrite it, but let's check a more complex example. E.g. can we rewrite

SELECT *
FROM r
JOIN r2 ON r2.x = r.x
WHERE r.y = (SELECT sum(b) FROM t) AND r2.y = (SELECT avg(b) FROM t)

? Maybe

SELECT *
FROM (
  SELECT (
    SELECT STRUCT(sum(b) AS sum_b, avg(b) AS avg_b) FROM t
  ) AS st, x, y
  FROM r
) AS r
JOIN r2 ON r2.x = r.x
WHERE r.y = r.st.sum_b AND r2.y = r.st.avg_b

? Does this work with outer joins? And isn't this more complex than the reuse way in this PR?

I was also thinking about "whole plan subquery merge" (similar to my "whole plan reuse" suggestion: #28885) where subqueries at "different level" could be merged (and reused) as a possible improvement to this PR in the future.

BTW, the ReuseExchangeAndSubquery rule you mentioned is suggested in my "whole plan reuse" PR, which got stuck a bit due to lack of reviews. Do you also have a similar rule in production, or did you just see my PR? If you have some time, any feedback is appreciated there as well. :)

I didn't check how correlated subqueries could benefit from rewriting the query (this PR focuses on uncorrelated ones), but I think at this point in the optimizer those have been transformed to joins.
Can you please elaborate on the "reading from tables vs. reading from an array column" part?

@sigmod (Contributor), May 6, 2021:

> And isn't this more complex than the reuse way in this PR?

IIUC, I think this PR's complexity is in extra dependencies. The proposed rule augments two subqueries, makes them look identical, and hopes (a) column-pruning doesn't prune too aggressively and (b) physical de-dup could dedup them. In case (a) changes later and the two aggregate trees are not deduped in the physical plan, there could potentially be regressions -- each aggregation then becomes more expensive.

Therefore, I think there're two directions that you can go:

  • (1) Add a top plan node that contains the main plan tree as well as CommonSubplan trees. For each merged subquery, put it into a CommonSubplan. A node in the main plan can then reference a CommonSubplan and convert its output relation into a scalar value. This approach only works for uncorrelated cases (see the hedged sketch after this list).
  • (2) Only extract common subqueries within the same logical node, and put merged scalar subqueries into a Project node below the logical node. It may be less ambitious than what you proposed, but it at least does not have extra dependencies. This approach works for both correlated and uncorrelated cases.
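A minimal logical-plan sketch of direction (1); all names here (CommonSubplanDef, WithCommonSubplans, CommonSubplanRef) are hypothetical illustrations, not code from this PR (which later realized the idea with CTE nodes, per the "use CTE nodes" commit):

```scala
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, UnaryNode}

// One entry per merged subquery; planned and executed exactly once.
case class CommonSubplanDef(id: Long, plan: LogicalPlan)

// Top-level node owning the merged common subplans next to the main plan tree.
case class WithCommonSubplans(
    child: LogicalPlan,
    subplans: Seq[CommonSubplanDef]) extends UnaryNode {
  override def output: Seq[Attribute] = child.output
  protected def withNewChildInternal(newChild: LogicalPlan): WithCommonSubplans =
    copy(child = newChild)
}

// Leaf reference placed in the main plan wherever a merged subplan's
// (single-row) result is consumed as scalar values.
case class CommonSubplanRef(id: Long, output: Seq[Attribute]) extends LeafNode
```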

> Can you please elaborate on the "reading from tables vs. reading from an array column" part?

This is an example:

SELECT y
FROM LATERAL VIEW explode(ARRAY(ARRAY(1), ARRAY(1, 2), ARRAY(1, 2, 3))) AS y
WHERE
( SELECT COUNT(*) FROM LATERAL VIEW explode(y) AS element ) > 1
AND
( SELECT SUM(element) FROM LATERAL VIEW explode(y) AS element ) > 3

I noticed that such subqueries do not work for now. But they align with the language spec and have well-defined semantics. Once we support them, we want your proposed rule to be able to speed them up as well.

> Does this work with outer joins?

I don't think outer joins are relevant to this problem:

  • if we go with (1), it doesn't matter;
  • if we go with (2), pushing projection expressions through join/outer join is a separate topic/rule. A subquery expression doesn't have much difference from an ordinary expression in this regard.

@peter-toth (Contributor Author), May 7, 2021:

> The proposed rule augments two subqueries, makes them look identical, and hopes (a) column-pruning doesn't prune too aggressively and (b) physical de-dup could dedup them. In case (a) changes later and the two aggregate trees are not deduped in the physical plan, there could potentially be regressions -- each aggregation then becomes more expensive.

In this PR the new MergeScalarSubqueries rule runs in a separate batch after column pruning, close to the end of optimization. This is by design, to make sure no subsequent rule changes the structure of different instances of a merged subquery plan at different places in the logical plan differently. So the physical planning creates the same physical plan for these instances and there shouldn't be any dedup issues.
Update: I need to recheck this part as the current PR might not do what I wanted it to.

I think the downside of my current PR is probably that the physical planning of merged subqueries happens multiple times (as many times as they appear in the logical plan) and physical dedup comes only after that. This could be improved if we had subquery references in the logical plan as well (something like ReuseSubqueryExec). But I think that's what your (1) is about. Move the merged subqueries to a special top logical plan node and add subquery references at places where they are actually used.

SELECT y
FROM LATERAL VIEW explode(ARRAY(ARRAY(1), ARRAY(1, 2), ARRAY(1, 2, 3))) AS y
WHERE
( SELECT COUNT(*) FROM LATERAL VIEW explode(y) AS element ) > 1
AND
( SELECT SUM(element) FROM LATERAL VIEW explode(y) AS element ) > 3
> I noticed that such subqueries do not work for now. But they align with the language spec and have well-defined semantics. Once we support them, we want your proposed rule to be able to speed them up as well.

Ah ok, but what should the optimized plan of that query be? This looks like we have 2 correlated subqueries, and your (2) makes perfect sense for merging them. But I don't think we need lateral views; just take the following query:

SELECT
  (SELECT avg(a) FROM t WHERE t.a = outer.a),
  (SELECT sum(b) FROM t WHERE t.a = outer.a)
FROM t AS outer

which is

Project [scalar-subquery#231 [a#233] AS scalarsubquery(a)#243, scalar-subquery#232 [a#233] AS scalarsubquery(a)#244L]
:  :- Aggregate [avg(a#239) AS avg(a)#236]
:  :  +- Filter (a#239 = outer(a#233))
:  :     +- SubqueryAlias spark_catalog.default.t
:  :        +- Relation default.t[a#239,b#240] parquet
:  +- Aggregate [sum(b#242) AS sum(b)#238L]
:     +- Filter (a#241 = outer(a#233))
:        +- SubqueryAlias spark_catalog.default.t
:           +- Relation default.t[a#241,b#242] parquet
+- SubqueryAlias outer
   +- SubqueryAlias spark_catalog.default.t
      +- Relation default.t[a#233,b#234] parquet

which, after the correlated subqueries are rewritten to joins, currently becomes:

Project [avg(a)#236 AS scalarsubquery(a)#243, sum(b)#238L AS scalarsubquery(a)#244L]
+- Join LeftOuter, (a#241 = a#233)
   :- Project [a#233, avg(a)#236]
   :  +- Join LeftOuter, (a#239 = a#233)
   :     :- Project [a#233]
   :     :  +- Relation default.t[a#233,b#234] parquet
   :     +- Aggregate [a#239], [avg(a#239) AS avg(a)#236, a#239]
   :        +- Project [a#239]
   :           +- Filter isnotnull(a#239)
   :              +- Relation default.t[a#239,b#240] parquet
   +- Aggregate [a#241], [sum(b#242) AS sum(b)#238L, a#241]
      +- Filter isnotnull(a#241)
         +- Relation default.t[a#241,b#242] parquet

and this PR doesn't help here at all, but it could be optimized using your (2).

I wonder if the following steps (tickets/PRs) would make sense:

  1. Finish this PR and support only non-correlated subqueries. Mainly focus on merging plans and keep the physical reuse dependency for simplicity. This supports subquery merging within a plan regardless of whether the subqueries are in the same logical node.
  2. Add a performance improvement to 1. so as to physically plan a merged subquery only once. This is your (1) basically. Move the merged subqueries to a top node and introduce subquery references in the logical plan.
  3. Add support for correlated subqueries using your (2). As you mentioned this will only support subqueries within the same logical node.

Probably we should implement separate rules for 1. + 2. and 3., but the plan merging logic can be common.

@sigmod (Contributor), May 8, 2021:

> In this PR the new MergeScalarSubqueries rule runs in a separate batch after column pruning, close to the end of optimization. This is by design to make sure no subsequent rule changes the structure

I don't see how we can guarantee that. I think the hidden, inter-rule dependency can add complexities for future development and maintenance.

For instance, someone could implement a new Strategy that internally calls ColumnPruning after exploring one logical plan alternative. By the time such a Strategy is implemented, the authors wouldn't be aware of the fact that ColumnPruning should not be called after MergeScalarSubqueries.

  • First of all, such issues can hardly be detected by the author's new unit/query tests, as an effective test has to have both effective patterns to trigger the Strategy and MergeScalarSubqueries. However, such a case can happen in prod traffic;
  • Second, if they do find that's an issue, they would then have to either (a) add some hacks in the Aggregate to mark that MergeScalarSubqueries has been applied and hence ColumnPruning should not go through it, or (b) re-implement MergeScalarSubqueries per my proposal (1).

I'm wondering whether we can pursue (2) for now, if it meets your need. It's less ambitious but may address most of your issue? If you indeed have to extract subqueries over the entire tree, I don't have a clean approach in mind other than (1).

> Add a performance improvement to 1. so as to physically plan a merged subquery only once. This is your (1) basically.

The performance was not my initial concern; rather, I think we'd better make MergeScalarSubqueries self-contained so that it does not depend on an assumption that could later change.

> But I don't think we need lateral views

Subquerying over arrays is an important use case, for which we don't want to decorrelate. In this case, a subquery is more like an ordinary expression and should be evaluated within the operator node.

Contributor:

> Doesn't that mean that (2) also assumes that no subsequent transformation changes the 2 instances differently and ReuseSubquery does the dedup?

The difference is that transformation (2) is a self-contained transformation that results in a better plan, while this PR's MergeScalarSubqueries is not. If for some reason ReuseSubquery does not trigger, applying (2) does not make the plan worse than not applying (2). However, iiuc, in the same situation, MergeScalarSubqueries would make the plan worse because each aggregate pipeline runs additional computations.

Contributor Author (peter-toth):

> If for some reason ReuseSubquery does not trigger, applying (2) does not make the plan worse than not applying (2).

In the above example the final optimized plan of (2) is the very same as with this PR. There are 2 aggregates in both subqueries, so without dedup both (2) and this PR could cause regressions.
I agree that (2) is self-contained and this PR is not, but IMO it looks like there are inter-rule dependencies currently in Spark (like PushDownPredicates relying on ReuseSubquery) that overall don't make (2) safer than this PR.

I think this means that your (1) suggestion is probably the right approach and we need to move common non-correlated subqueries to a top node and reference to them in logical plan.

I also think that (2) is a good improvement for correlated subqueries, but I would pursue (1) in this PR first and maybe (2) in a separate one. Does this sound acceptable?

@cloud-fan, @maropu do you have any thoughts on this topic?

@sigmod (Contributor), May 11, 2021:

> I would pursue (1) in this PR first and maybe (2) in a separate one. Does this sound acceptable?

Yeah, that sounds great. Thanks a lot, @peter-toth!

> There are 2 aggregates in both subqueries, so without dedup both (2) and this PR could cause regressions.

IIUC, I think it sounds like an existing bug (or missing feature) for struct subfield pruning, which could be blocking (2) but is orthogonal to (2). For instance, if I write your example join query manually, I'd expect the struct subfield pruning to happen to the struct constructor, regardless of the existence of subqueries.

> I've never seen such transformations in SparkStrategies.

It's not uncommon in exploration Strategies such as index selection, common subplan dedup etc., where we substitute the subtree of a tree node T with another subtree (from somewhere else in the plan or a different access path) that may contain columns unneeded by T. Spark doesn't have those strategies for now, but I won't be surprised if some contributors add them down the road.

Contributor Author (peter-toth):

Thanks @sigmod. I will try to update the PR by end of this week.

Contributor Author (peter-toth):

Sorry, I got a bit distracted, but I have updated the PR to follow your (1) suggestion now. Please let me know your thoughts.

* +- OneRowRelation
*
* to:
*
* Project [multi-scalar-subquery#231.avg(a) AS scalarsubquery()#241,
* multi-scalar-subquery#232.sum(b) AS scalarsubquery()#242L]
* : :- Aggregate [b#234], [avg(a#233) AS avg(a)#236, sum(b#234) AS sum(b)#238L]
* : : +- Project [a#233, b#234]
* : : +- Relation default.t[a#233,b#234] parquet
* : +- Aggregate [b#234], [avg(a#233) AS avg(a)#236, sum(b#234) AS sum(b)#238L]
* : +- Project [a#233, b#234]
* : +- Relation default.t[a#233,b#234] parquet
* +- OneRowRelation
*/
object MergeScalarSubqueries extends Rule[LogicalPlan] with PredicateHelper {
def apply(plan: LogicalPlan): LogicalPlan = {
if (conf.scalarSubqueryMergeEnabled && conf.subqueryReuseEnabled) {
val mergedSubqueries = ArrayBuffer.empty[LogicalPlan]
removeReferences(mergeAndInsertReferences(plan, mergedSubqueries), mergedSubqueries)
} else {
plan
}
}

private def mergeAndInsertReferences(
plan: LogicalPlan,
mergedSubqueries: ArrayBuffer[LogicalPlan]): LogicalPlan = {
plan.transformAllExpressionsWithPruning(_.containsAnyPattern(SCALAR_SUBQUERY), ruleId) {
case s: ScalarSubquery if s.children.isEmpty =>
val (mergedPlan, ordinal) = mergeAndGetReference(s.plan, mergedSubqueries)
GetStructField(MultiScalarSubquery(mergedPlan, s.exprId), ordinal)
}
}

case class SubqueryReference(
index: Int,
mergedSubqueries: ArrayBuffer[LogicalPlan]) extends LeafNode {
override def stringArgs: Iterator[Any] = Iterator(index)

override def output: Seq[Attribute] = mergedSubqueries(index).output
}

private def mergeAndGetReference(
plan: LogicalPlan,
mergedSubqueries: ArrayBuffer[LogicalPlan]): (SubqueryReference, Int) = {
mergedSubqueries.zipWithIndex.collectFirst {
Function.unlift { case (s, i) => mergePlans(plan, s).map(_ -> i) }
}.map { case ((mergedPlan, outputMap), i) =>
mergedSubqueries(i) = mergedPlan
SubqueryReference(i, mergedSubqueries) ->
mergedPlan.output.indexOf(outputMap(plan.output.head))
}.getOrElse {
mergedSubqueries += plan
SubqueryReference(mergedSubqueries.length - 1, mergedSubqueries) -> 0
}
}

private def mergePlans(
newPlan: LogicalPlan,
existingPlan: LogicalPlan): Option[(LogicalPlan, AttributeMap[Attribute])] = {
(newPlan, existingPlan) match {
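// The two plans are identical up to canonicalization: keep the existing plan
// and map the new plan's output attributes onto the existing ones.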
case (np, ep) if np.canonicalized == ep.canonicalized =>
Some(ep -> AttributeMap(np.output.zip(ep.output)))
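// Both sides are Projects: merge their children first, then combine the two
// project lists, de-duplicated.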
case (np: Project, ep: Project) =>
mergePlans(np.child, ep.child).map { case (mergedChild, outputMap) =>
val newProjectList = replaceAttributes(np.projectList, outputMap)
val newOutputMap = createOutputMap(np.projectList, newProjectList)
Project(distinctExpressions(ep.projectList ++ newProjectList), mergedChild) ->
newOutputMap
}
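// Only the existing side has an extra Project: merge below it and extend the
// project list with the new plan's mapped output.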
case (np, ep: Project) =>
mergePlans(np, ep.child).map { case (mergedChild, outputMap) =>
Project(distinctExpressions(ep.projectList ++ outputMap.values), mergedChild) -> outputMap
}
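// Only the new side has an extra Project: merge its child with the existing
// plan and expose the existing output alongside the rewritten project list.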
case (np: Project, ep) =>
mergePlans(np.child, ep).map { case (mergedChild, outputMap) =>
val newProjectList = replaceAttributes(np.projectList, outputMap)
val newOutputMap = createOutputMap(np.projectList, newProjectList)
Project(distinctExpressions(ep.output ++ newProjectList), mergedChild) -> newOutputMap
}
Member:

Which kind of queries does this case handle? (Does this PR already have a test for this code path?) Is it safe to accept any plan node if the other side is a Project?

Member:

BTW, could we use a whitelist here to check whether it can merge plans or not?

Contributor Author (peter-toth):

A good example is in the description of this PR:

SELECT
  (SELECT avg(a) FROM t GROUP BY b),
  (SELECT sum(b) FROM t GROUP BY b)

Where we have an additional Project [b#240] in the second subquery due to column pruning:

Project [scalar-subquery#231 [] AS scalarsubquery()#241, scalar-subquery#232 [] AS scalarsubquery()#242L]
:  :- Aggregate [b#234], [avg(a#233) AS avg(a)#236]
:  :  +- Relation default.t[a#233,b#234] parquet
:  +- Aggregate [b#240], [sum(b#240) AS sum(b)#238L]
:     +- Project [b#240]
:        +- Relation default.t[a#239,b#240] parquet
+- OneRowRelation

and this rule can merge the 2 queries as:

Project [multi-scalar-subquery#231.avg(a) AS scalarsubquery()#241, multi-scalar-subquery#232.sum(b) AS scalarsubquery()#242L]
:  :- Aggregate [b#234], [avg(a#233) AS avg(a)#236, sum(b#234) AS sum(b)#238L]
:  :  +- Project [a#233, b#234]
:  :     +- Relation default.t[a#233,b#234] parquet
:  +- Aggregate [b#234], [avg(a#233) AS avg(a)#236, sum(b#234) AS sum(b)#238L]
:     +- Project [a#233, b#234]
:        +- Relation default.t[a#233,b#234] parquet
+- OneRowRelation

IMO the above 2 cases are safe; they just handle the case when there is an extra Project node in one of the plans, but the child plan under the Project and the plan of the other side are mergeable. In these cases the merged plan should contain the Project node, but it should also contain the output of the other side transparently.

I will add this test case to SubquerySuite.

case (np: Aggregate, ep: Aggregate) =>
Member:

Can we always assume that merging two (or more) aggregates makes performance better? For example, we could have two aggregates in a plan where one side is a hash aggregate and the other side is an object-hash aggregate. In this case, the merged plan node seems to be an object-hash aggregate. If this is true, this rewrite can easily cause high memory pressure.

Member:

IMHO, since this rewrite itself is not an optimization but a pre-process to reuse subqueries, it might be better to implement this logic on the ReuseSubquery side, and merge the subqueries if their physical plans are the same.

Contributor Author (peter-toth):

> Can we always assume that merging two (or more) aggregates makes performance better? For example, we could have two aggregates in a plan where one side is a hash aggregate and the other side is an object-hash aggregate. In this case, the merged plan node seems to be an object-hash aggregate. If this is true, this rewrite can easily cause high memory pressure.

Thanks, this is a very good question, let me look into this...

> IMHO, since this rewrite itself is not an optimization but a pre-process to reuse subqueries, it might be better to implement this logic on the ReuseSubquery side, and merge the subqueries if their physical plans are the same.

The reason why I implemented this feature as an Optimizer rule is that merging LogicalPlans seems much easier than merging physical ones. The example in the description has the following physical subquery plans:

*(1) Project [Subquery scalar-subquery#231, [id=#110] AS scalarsubquery()#241, Subquery scalar-subquery#232, [id=#132] AS scalarsubquery()#242L]
:  :- Subquery scalar-subquery#231, [id=#110]
:  :  +- *(2) HashAggregate(keys=[b#234], functions=[avg(a#233)], output=[avg(a)#236])
:  :     +- Exchange hashpartitioning(b#234, 5), ENSURE_REQUIREMENTS, [id=#106]
:  :        +- *(1) HashAggregate(keys=[b#234], functions=[partial_avg(a#233)], output=[b#234, sum#247, count#248L])
:  :           +- *(1) ColumnarToRow
:  :              +- FileScan parquet default.t[a#233,b#234] Batched: true, DataFilters: [], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/Users/petertoth/git/apache/spark/spark-warehouse/org.apache.spar..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<a:int,b:int>
:  +- Subquery scalar-subquery#232, [id=#132]
:     +- *(2) HashAggregate(keys=[b#240], functions=[sum(b#240)], output=[sum(b)#238L])
:        +- Exchange hashpartitioning(b#240, 5), ENSURE_REQUIREMENTS, [id=#128]
:           +- *(1) HashAggregate(keys=[b#240], functions=[partial_sum(b#240)], output=[b#240, sum#250L])
:              +- *(1) ColumnarToRow
:                 +- FileScan parquet default.t[b#240] Batched: true, DataFilters: [], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/Users/petertoth/git/apache/spark/spark-warehouse/org.apache.spar..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<b:int>

Merging these 2 physical subqueries would require a much more complex mergePlans() function that can handle Exchange and Scan nodes.

Contributor Author (peter-toth):

Fixed the hash/objecthash/sorted aggregate merge issue in 2828345 and I also added a test.
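As a hedged illustration of the non-merge case (assuming a table t(a INT, b INT), an active SparkSession `spark`, and that collect_list, a TypedImperativeAggregate, plans as an object-hash aggregate while sum plans as a hash aggregate):

```scala
// Two scalar subqueries over the same table whose physical aggregate
// implementations would differ; after the fix, the rule should NOT merge them.
val df = spark.sql(
  """SELECT
    |  (SELECT sum(a) FROM t),          -- expected to plan as a hash aggregate
    |  (SELECT collect_list(b) FROM t)  -- expected to plan as an object-hash aggregate
    |""".stripMargin)
df.explain() // expect two independent scalar subqueries, no merged plan
```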

mergePlans(np.child, ep.child).flatMap { case (mergedChild, outputMap) =>
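// Two Aggregates can merge only if their grouping expressions are equivalent
// after mapping the new plan's attributes onto the merged child.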
val newGroupingExpression = replaceAttributes(np.groupingExpressions, outputMap)
if (ExpressionSet(newGroupingExpression) == ExpressionSet(ep.groupingExpressions)) {
val newAggregateExpressions = replaceAttributes(np.aggregateExpressions, outputMap)
val newOutputMap = createOutputMap(np.aggregateExpressions, newAggregateExpressions)
Member:

In the tests added in this PR, it seems this map always has the same exprId mapping, e.g., sum(a)#3 -> sum(a)#3. So, could you add more tests?

Contributor Author (peter-toth):

Sure, added a new test in 6134fa9 to cover this.

Some(Aggregate(ep.groupingExpressions,
distinctExpressions(ep.aggregateExpressions ++ newAggregateExpressions),
mergedChild) -> newOutputMap)
} else {
None
}
}
case _ =>
None
}
}

private def replaceAttributes[T <: Expression](
expressions: Seq[T],
outputMap: AttributeMap[Attribute]) = {
expressions.map(_.transform {
case a: Attribute => outputMap.getOrElse(a, a)
}.asInstanceOf[T])
}

private def createOutputMap(from: Seq[NamedExpression], to: Seq[NamedExpression]) = {
AttributeMap(from.map(_.toAttribute).zip(to.map(_.toAttribute)))
}

private def distinctExpressions(expressions: Seq[NamedExpression]) = {
ExpressionSet(expressions).toSeq.asInstanceOf[Seq[NamedExpression]]
}

private def removeReferences(
plan: LogicalPlan,
mergedSubqueries: ArrayBuffer[LogicalPlan]): LogicalPlan = {
plan.transformAllExpressionsWithPruning(_.containsAnyPattern(MULTI_SCALAR_SUBQUERY), ruleId) {
case gsf @ GetStructField(mss @ MultiScalarSubquery(sr: SubqueryReference, _), _, _) =>
val dereferencedPlan = removeReferences(mergedSubqueries(sr.index), mergedSubqueries)
if (dereferencedPlan.outputSet.size > 1) {
gsf.copy(child = mss.copy(plan = dereferencedPlan))
} else {
ScalarSubquery(dereferencedPlan, exprId = mss.exprId)
}
}
}
}
@@ -231,6 +231,8 @@ abstract class Optimizer(catalogManager: CatalogManager)
ColumnPruning,
CollapseProject,
RemoveNoopOperators) :+
Batch("MergeScalarSubqueries", Once,
MergeScalarSubqueries) :+
// This batch must be executed after the `RewriteSubquery` batch, which creates joins.
Batch("NormalizeFloatingNumbers", Once, NormalizeFloatingNumbers) :+
Batch("ReplaceUpdateFieldsExpression", Once, ReplaceUpdateFieldsExpression)
@@ -57,6 +57,7 @@ object RuleIdCollection {
"org.apache.spark.sql.catalyst.optimizer.NullPropagation" ::
"org.apache.spark.sql.catalyst.optimizer.OptimizeIn" ::
"org.apache.spark.sql.catalyst.optimizer.Optimizer$OptimizeSubqueries" ::
"org.apache.spark.sql.catalyst.optimizer.MergeScalarSubqueries" ::
"org.apache.spark.sql.catalyst.optimizer.PushDownLeftSemiAntiJoin" ::
"org.apache.spark.sql.catalyst.optimizer.PushExtraPredicateThroughJoin" ::
"org.apache.spark.sql.catalyst.optimizer.PushFoldableIntoBranches" ::
@@ -41,6 +41,7 @@ object TreePattern extends Enumeration {
val LIKE_FAMLIY: Value = Value
val LIST_SUBQUERY: Value = Value
val LITERAL: Value = Value
val MULTI_SCALAR_SUBQUERY: Value = Value
val NOT: Value = Value
val NULL_CHECK: Value = Value
val NULL_LITERAL: Value = Value
@@ -1353,6 +1353,14 @@ object SQLConf {
.booleanConf
.createWithDefault(true)

val SCALAR_SUBQUERY_MERGE_ENABLED =
buildConf("spark.sql.scalarSubqueyMerge.enabled")
Member:

spark.sql.optimizer.scalarSubqueryMerging.enabled (or spark.sql.optimizer.mergeScalarSubqueries.enabled) instead?

Member:

I think we need to describe more, e.g., "To enable this feature, spark.sql.execution.reuseSubquery needs to be true", etc.

Contributor Author (peter-toth):

I'm no longer sure we actually need this flag, as we can disable this rule in the optimizer with spark.sql.optimizer.excludedRules too.

Contributor Author (peter-toth):

I haven't removed this flag yet, but I still think it is not needed.
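For reference, a hedged sketch of disabling the rule without a dedicated flag, via the existing excludedRules mechanism (rule name as registered in this PR; `spark` is an assumed active SparkSession):

```scala
// Exclude the rule per-session instead of using a dedicated config flag.
spark.conf.set(
  "spark.sql.optimizer.excludedRules",
  "org.apache.spark.sql.catalyst.optimizer.MergeScalarSubqueries")
```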

.internal()
.doc("When true, the planner will try to merge scalar subqueries and re-use them.")
.version("3.2.0")
.booleanConf
.createWithDefault(true)

val REMOVE_REDUNDANT_PROJECTS_ENABLED = buildConf("spark.sql.execution.removeRedundantProjects")
.internal()
.doc("Whether to remove redundant project exec node based on children's output and " +
@@ -3481,6 +3489,8 @@ class SQLConf extends Serializable with Logging {

def subqueryReuseEnabled: Boolean = getConf(SUBQUERY_REUSE_ENABLED)

def scalarSubqueryMergeEnabled: Boolean = getConf(SCALAR_SUBQUERY_MERGE_ENABLED)

def caseSensitiveAnalysis: Boolean = getConf(SQLConf.CASE_SENSITIVE)

def constraintPropagationEnabled: Boolean = getConf(CONSTRAINT_PROPAGATION_ENABLED)
@@ -0,0 +1,54 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.catalyst.optimizer

import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.expressions.{GetStructField, MultiScalarSubquery, ScalarSubquery}
import org.apache.spark.sql.catalyst.plans._
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.rules._

class MergeScalarSubqueriesSuite extends PlanTest {

private object Optimize extends RuleExecutor[LogicalPlan] {
val batches =
Batch("MergeScalarSubqueries", Once, MergeScalarSubqueries) :: Nil
}

test("Simple non-correlated scalar subquery merge") {
val testRelation = LocalRelation('a.int, 'b.int)

val subquery1 = testRelation
.groupBy('b)(max('a))
val subquery2 = testRelation
.groupBy('b)(sum('a))
val originalQuery = testRelation
.select(ScalarSubquery(subquery1), ScalarSubquery(subquery2))

val multiSubquery = testRelation
.groupBy('b)(max('a), sum('a)).analyze
val correctAnswer = testRelation
.select(GetStructField(MultiScalarSubquery(multiSubquery), 0).as("scalarsubquery()"),
GetStructField(MultiScalarSubquery(multiSubquery), 1).as("scalarsubquery()"))

// checkAnalysis is disabled because `Analyzer` is not prepared for `MultiScalarSubquery` nodes,
// as only the `Optimizer` can insert such a node into the plan
comparePlans(Optimize.execute(originalQuery.analyze), correctAnswer, false)
}
}
@@ -73,6 +73,8 @@ trait PlanTestBase extends PredicateHelper with SQLHelper with SQLConfHelper { s
plan transformAllExpressions {
case s: ScalarSubquery =>
s.copy(plan = normalizeExprIds(s.plan), exprId = ExprId(0))
case s: MultiScalarSubquery =>
s.copy(plan = normalizeExprIds(s.plan), exprId = ExprId(0))
case e: Exists =>
e.copy(exprId = ExprId(0))
case l: ListQuery =>
@@ -24,7 +24,8 @@ import org.apache.spark.sql.catalyst.expressions.{ListQuery, SubqueryExpression}
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.plans.physical.UnspecifiedDistribution
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.catalyst.trees.TreePattern.{DYNAMIC_PRUNING_SUBQUERY, IN_SUBQUERY, SCALAR_SUBQUERY}
import org.apache.spark.sql.catalyst.trees.TreePattern.{DYNAMIC_PRUNING_SUBQUERY, IN_SUBQUERY,
MULTI_SCALAR_SUBQUERY, SCALAR_SUBQUERY}
import org.apache.spark.sql.execution._
import org.apache.spark.sql.execution.command.{DataWritingCommandExec, ExecutedCommandExec}
import org.apache.spark.sql.execution.datasources.v2.V2CommandExec
@@ -114,7 +115,8 @@ case class InsertAdaptiveSparkPlan(
*/
private def buildSubqueryMap(plan: SparkPlan): Map[Long, BaseSubqueryExec] = {
val subqueryMap = mutable.HashMap.empty[Long, BaseSubqueryExec]
if (!plan.containsAnyPattern(SCALAR_SUBQUERY, IN_SUBQUERY, DYNAMIC_PRUNING_SUBQUERY)) {
if (!plan.containsAnyPattern(SCALAR_SUBQUERY, MULTI_SCALAR_SUBQUERY, IN_SUBQUERY,
DYNAMIC_PRUNING_SUBQUERY)) {
return subqueryMap.toMap
}
plan.foreach(_.expressions.foreach(_.foreach {
@@ -125,6 +127,13 @@
val subquery = SubqueryExec.createForScalarSubquery(
s"subquery#${exprId.id}", executedPlan)
subqueryMap.put(exprId.id, subquery)
case expressions.MultiScalarSubquery(p, exprId)
if !subqueryMap.contains(exprId.id) =>
val executedPlan = compileSubquery(p)
verifyAdaptivePlan(executedPlan, p)
val subquery = SubqueryExec.createForScalarSubquery(
s"subquery#${exprId.id}", executedPlan)
subqueryMap.put(exprId.id, subquery)
case expressions.InSubquery(_, ListQuery(query, _, exprId, _))
if !subqueryMap.contains(exprId.id) =>
val executedPlan = compileSubquery(query)
@@ -21,7 +21,7 @@ import org.apache.spark.sql.catalyst.expressions
import org.apache.spark.sql.catalyst.expressions.{CreateNamedStruct, DynamicPruningExpression, ListQuery, Literal}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.catalyst.trees.TreePattern.{DYNAMIC_PRUNING_SUBQUERY, IN_SUBQUERY,
SCALAR_SUBQUERY}
MULTI_SCALAR_SUBQUERY, SCALAR_SUBQUERY}
import org.apache.spark.sql.execution
import org.apache.spark.sql.execution.{BaseSubqueryExec, InSubqueryExec, SparkPlan}

@@ -30,9 +30,12 @@ case class PlanAdaptiveSubqueries(

def apply(plan: SparkPlan): SparkPlan = {
plan.transformAllExpressionsWithPruning(
_.containsAnyPattern(SCALAR_SUBQUERY, IN_SUBQUERY, DYNAMIC_PRUNING_SUBQUERY)) {
_.containsAnyPattern(SCALAR_SUBQUERY, MULTI_SCALAR_SUBQUERY, IN_SUBQUERY,
DYNAMIC_PRUNING_SUBQUERY)) {
case expressions.ScalarSubquery(_, _, exprId) =>
execution.ScalarSubquery(subqueryMap(exprId.id), exprId)
case expressions.MultiScalarSubquery(_, exprId) =>
execution.MultiScalarSubqueryExec(subqueryMap(exprId.id), exprId)
case expressions.InSubquery(values, ListQuery(_, _, exprId, _)) =>
val expr = if (values.length == 1) {
values.head