Commit e0e39d5 (parent: bb5459f)

[SPARK-34079][SQL] Merging non-correlated scalar subqueries to multi-column scalar subqueries for better reuse
11 files changed: +556 -1386 lines
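In a minimal sketch of the targeted scenario (the SparkSession `spark` and the table `t(a INT, b INT)` are assumptions for illustration), two non-correlated scalar subqueries that aggregate the same relation can now be planned as one merged, multi-column subquery:

// Minimal sketch, assuming a SparkSession `spark` and a table `t(a INT, b INT)`.
val df = spark.sql(
  """SELECT
    |  (SELECT avg(a) FROM t) AS avg_a,
    |  (SELECT sum(b) FROM t) AS sum_b
    |""".stripMargin)
df.explain()  // with this patch, both subqueries should share one reused plan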

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/subquery.scala

Lines changed: 23 additions & 0 deletions
@@ -22,6 +22,7 @@ import scala.collection.mutable.ArrayBuffer
 import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression
 import org.apache.spark.sql.catalyst.plans.QueryPlan
 import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan}
+import org.apache.spark.sql.catalyst.trees.LeafLike
 import org.apache.spark.sql.types._
 import org.apache.spark.util.collection.BitSet

@@ -258,6 +259,28 @@ object ScalarSubquery {
   }
 }

+case class MultiScalarSubquery(
+    plan: LogicalPlan,
+    exprId: ExprId = NamedExpression.newExprId)
+  extends SubqueryExpression(plan, Seq.empty, exprId) with LeafLike[Expression] with Unevaluable {
+  override def dataType: DataType = {
+    assert(plan.schema.nonEmpty, "Multi-column scalar subquery should have columns")
+    plan.schema
+  }
+
+  override def nullable: Boolean = true
+
+  override def withNewPlan(plan: LogicalPlan): MultiScalarSubquery = copy(plan = plan)
+
+  override def toString: String = s"multi-scalar-subquery#${exprId.id}"
+
+  override lazy val canonicalized: Expression = {
+    MultiScalarSubquery(
+      plan.canonicalized,
+      ExprId(0))
+  }
+}
+
 /**
  * A [[ListQuery]] expression defines the query which we want to search in an IN subquery
  * expression. It should and can only be used in conjunction with an IN expression.
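The design hinge above is that `dataType` is the subquery plan's whole schema, a `StructType`, rather than a single column's type, so one merged subquery can feed several output columns and each value is then picked out with a `GetStructField`. A hedged spark-shell sketch (this patch on the classpath and a table `t(a INT, b INT)` are assumptions):

// Sketch only: build the expression by hand to inspect its struct type.
import org.apache.spark.sql.catalyst.expressions.{GetStructField, MultiScalarSubquery}

val mergedPlan = spark.sql("SELECT avg(a) AS avg_a, sum(b) AS sum_b FROM t")
  .queryExecution.optimizedPlan
val mss = MultiScalarSubquery(mergedPlan)
mss.dataType            // struct<avg_a:double, sum_b:bigint>
GetStructField(mss, 0)  // references the first merged column, avg_a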
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/MergeScalarSubqueries.scala

Lines changed: 176 additions & 0 deletions

@@ -0,0 +1,176 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.optimizer
+
+import scala.collection.mutable.ArrayBuffer
+
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LeafNode, LogicalPlan, Project}
+import org.apache.spark.sql.catalyst.rules.Rule
+
+/**
+ * This rule tries to merge non-correlated [[ScalarSubquery]]s into [[MultiScalarSubquery]]s.
+ * Mergeable [[ScalarSubquery]]s are then replaced with their corresponding
+ * [[MultiScalarSubquery]] and the [[ReuseSubquery]] rule makes sure that merged subqueries are
+ * computed only once.
+ *
+ * E.g. the following query:
+ *
+ * SELECT
+ *   (SELECT avg(a) FROM t GROUP BY b),
+ *   (SELECT sum(b) FROM t GROUP BY b)
+ *
+ * is optimized from:
+ *
+ * Project [scalar-subquery#231 [] AS scalarsubquery()#241,
+ *          scalar-subquery#232 [] AS scalarsubquery()#242L]
+ * :  :- Aggregate [b#234], [avg(a#233) AS avg(a)#236]
+ * :  :  +- Relation default.t[a#233,b#234] parquet
+ * :  +- Aggregate [b#240], [sum(b#240) AS sum(b)#238L]
+ * :     +- Project [b#240]
+ * :        +- Relation default.t[a#239,b#240] parquet
+ * +- OneRowRelation
+ *
+ * to:
+ *
+ * Project [multi-scalar-subquery#231.avg(a) AS scalarsubquery()#241,
+ *          multi-scalar-subquery#232.sum(b) AS scalarsubquery()#242L]
+ * :  :- Aggregate [b#234], [avg(a#233) AS avg(a)#236, sum(b#234) AS sum(b)#238L]
+ * :  :  +- Project [a#233, b#234]
+ * :  :     +- Relation default.t[a#233,b#234] parquet
+ * :  +- Aggregate [b#234], [avg(a#233) AS avg(a)#236, sum(b#234) AS sum(b)#238L]
+ * :     +- Project [a#233, b#234]
+ * :        +- Relation default.t[a#233,b#234] parquet
+ * +- OneRowRelation
+ */
+object MergeScalarSubqueries extends Rule[LogicalPlan] with PredicateHelper {
+  def apply(plan: LogicalPlan): LogicalPlan = {
+    if (conf.scalarSubqueryMergeEnabled && conf.subqueryReuseEnabled) {
+      val mergedSubqueries = ArrayBuffer.empty[LogicalPlan]
+      removeReferences(mergeAndInsertReferences(plan, mergedSubqueries), mergedSubqueries)
+    } else {
+      plan
+    }
+  }
+
+  private def mergeAndInsertReferences(
+      plan: LogicalPlan,
+      mergedSubqueries: ArrayBuffer[LogicalPlan]): LogicalPlan = {
+    plan.transformAllExpressions {
+      case s: ScalarSubquery if s.children.isEmpty =>
+        val (mergedPlan, ordinal) = mergeAndGetReference(s.plan, mergedSubqueries)
+        GetStructField(MultiScalarSubquery(mergedPlan, s.exprId), ordinal)
+    }
+  }
+
+  case class SubqueryReference(
+      index: Int,
+      mergedSubqueries: ArrayBuffer[LogicalPlan]) extends LeafNode {
+    override def stringArgs: Iterator[Any] = Iterator(index)
+
+    override def output: Seq[Attribute] = mergedSubqueries(index).output
+  }
+
+  private def mergeAndGetReference(
+      plan: LogicalPlan,
+      mergedSubqueries: ArrayBuffer[LogicalPlan]): (SubqueryReference, Int) = {
+    mergedSubqueries.zipWithIndex.collectFirst {
+      Function.unlift { case (s, i) => mergePlans(plan, s).map(_ -> i) }
+    }.map { case ((mergedPlan, outputMap), i) =>
+      mergedSubqueries(i) = mergedPlan
+      SubqueryReference(i, mergedSubqueries) ->
+        mergedPlan.output.indexOf(outputMap(plan.output.head))
+    }.getOrElse {
+      mergedSubqueries += plan
+      SubqueryReference(mergedSubqueries.length - 1, mergedSubqueries) -> 0
+    }
+  }
+
+  private def mergePlans(
+      newPlan: LogicalPlan,
+      existingPlan: LogicalPlan): Option[(LogicalPlan, AttributeMap[Attribute])] = {
+    (newPlan, existingPlan) match {
+      case (np, ep) if np.canonicalized == ep.canonicalized =>
+        Some(ep -> AttributeMap(np.output.zip(ep.output)))
+      case (np: Project, ep: Project) =>
+        mergePlans(np.child, ep.child).map { case (mergedChild, outputMap) =>
+          val newProjectList = replaceAttributes(np.projectList, outputMap)
+          val newOutputMap = createOutputMap(np.projectList, newProjectList)
+          Project(distinctExpressions(ep.projectList ++ newProjectList), mergedChild) ->
+            newOutputMap
+        }
+      case (np, ep: Project) =>
+        mergePlans(np, ep.child).map { case (mergedChild, outputMap) =>
+          Project(distinctExpressions(ep.projectList ++ outputMap.values), mergedChild) -> outputMap
+        }
+      case (np: Project, ep) =>
+        mergePlans(np.child, ep).map { case (mergedChild, outputMap) =>
+          val newProjectList = replaceAttributes(np.projectList, outputMap)
+          val newOutputMap = createOutputMap(np.projectList, newProjectList)
+          Project(distinctExpressions(ep.output ++ newProjectList), mergedChild) -> newOutputMap
+        }
+      case (np: Aggregate, ep: Aggregate) =>
+        mergePlans(np.child, ep.child).flatMap { case (mergedChild, outputMap) =>
+          val newGroupingExpression = replaceAttributes(np.groupingExpressions, outputMap)
+          if (ExpressionSet(newGroupingExpression) == ExpressionSet(ep.groupingExpressions)) {
+            val newAggregateExpressions = replaceAttributes(np.aggregateExpressions, outputMap)
+            val newOutputMap = createOutputMap(np.aggregateExpressions, newAggregateExpressions)
+            Some(Aggregate(ep.groupingExpressions,
+              distinctExpressions(ep.aggregateExpressions ++ newAggregateExpressions),
+              mergedChild) -> newOutputMap)
+          } else {
+            None
+          }
+        }
+      case _ =>
+        None
+    }
+  }
+
+  private def replaceAttributes[T <: Expression](
+      expressions: Seq[T],
+      outputMap: AttributeMap[Attribute]) = {
+    expressions.map(_.transform {
+      case a: Attribute => outputMap.getOrElse(a, a)
+    }.asInstanceOf[T])
+  }
+
+  private def createOutputMap(from: Seq[NamedExpression], to: Seq[NamedExpression]) = {
+    AttributeMap(from.map(_.toAttribute).zip(to.map(_.toAttribute)))
+  }
+
+  private def distinctExpressions(expressions: Seq[NamedExpression]) = {
+    ExpressionSet(expressions).toSeq.asInstanceOf[Seq[NamedExpression]]
+  }
+
+  private def removeReferences(
+      plan: LogicalPlan,
+      mergedSubqueries: ArrayBuffer[LogicalPlan]): LogicalPlan = {
+    plan.transformUp {
+      case other => other.transformExpressionsUp {
+        case gsf @ GetStructField(mss @ MultiScalarSubquery(sr: SubqueryReference, _), _, _) =>
+          val dereferencedPlan = removeReferences(mergedSubqueries(sr.index), mergedSubqueries)
+          if (dereferencedPlan.outputSet.size > 1) {
+            gsf.copy(child = mss.copy(plan = dereferencedPlan))
+          } else {
+            ScalarSubquery(dereferencedPlan, exprId = mss.exprId)
+          }
+        case s: SubqueryExpression => s.withNewPlan(removeReferences(s.plan, mergedSubqueries))
+      }
+    }
+  }
+}
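To check that the rule fired end to end, a hedged sketch (the table `t(a, b)` is assumed and the exact plan text varies by version):

// Hedged check: with both flags at their defaults, the optimized plan should
// contain multi-scalar-subquery references instead of two independent
// scalar-subquery nodes.
val qe = spark.sql(
  "SELECT (SELECT avg(a) FROM t), (SELECT sum(b) FROM t)").queryExecution
assert(qe.optimizedPlan.toString.contains("multi-scalar-subquery"))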

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala

Lines changed: 2 additions & 0 deletions
@@ -232,6 +232,8 @@ abstract class Optimizer(catalogManager: CatalogManager)
       ColumnPruning,
       CollapseProject,
       RemoveNoopOperators) :+
+    Batch("MergeScalarSubqueries", Once,
+      MergeScalarSubqueries) :+
     // This batch must be executed after the `RewriteSubquery` batch, which creates joins.
     Batch("NormalizeFloatingNumbers", Once, NormalizeFloatingNumbers) :+
     Batch("ReplaceUpdateFieldsExpression", Once, ReplaceUpdateFieldsExpression)

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 10 additions & 0 deletions
@@ -1353,6 +1353,14 @@ object SQLConf {
     .booleanConf
     .createWithDefault(true)

+  val SCALAR_SUBQUERY_MERGE_ENABLED =
+    buildConf("spark.sql.scalarSubqueryMerge.enabled")
+      .internal()
+      .doc("When true, the planner will try to merge scalar subqueries and reuse them.")
+      .version("3.2.0")
+      .booleanConf
+      .createWithDefault(true)
+
   val REMOVE_REDUNDANT_PROJECTS_ENABLED = buildConf("spark.sql.execution.removeRedundantProjects")
     .internal()
     .doc("Whether to remove redundant project exec node based on children's output and " +

@@ -3473,6 +3481,8 @@ class SQLConf extends Serializable with Logging {

   def subqueryReuseEnabled: Boolean = getConf(SUBQUERY_REUSE_ENABLED)

+  def scalarSubqueryMergeEnabled: Boolean = getConf(SCALAR_SUBQUERY_MERGE_ENABLED)
+
   def caseSensitiveAnalysis: Boolean = getConf(SQLConf.CASE_SENSITIVE)

   def constraintPropagationEnabled: Boolean = getConf(CONSTRAINT_PROPAGATION_ENABLED)
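Merging only happens when both this flag and subquery reuse are on (see the guard in `MergeScalarSubqueries.apply`); a minimal runtime toggle, shown only to make the dependency explicit:

// Both default to true; the second key is the existing subquery-reuse flag.
spark.conf.set("spark.sql.scalarSubqueryMerge.enabled", "true")
spark.conf.set("spark.sql.execution.reuseSubquery", "true")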

sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/InsertAdaptiveSparkPlan.scala

Lines changed: 7 additions & 0 deletions
@@ -121,6 +121,13 @@ case class InsertAdaptiveSparkPlan(
         val subquery = SubqueryExec.createForScalarSubquery(
           s"subquery#${exprId.id}", executedPlan)
         subqueryMap.put(exprId.id, subquery)
+      case expressions.MultiScalarSubquery(p, exprId)
+          if !subqueryMap.contains(exprId.id) =>
+        val executedPlan = compileSubquery(p)
+        verifyAdaptivePlan(executedPlan, p)
+        val subquery = SubqueryExec.createForScalarSubquery(
+          s"subquery#${exprId.id}", executedPlan)
+        subqueryMap.put(exprId.id, subquery)
       case expressions.InSubquery(_, ListQuery(query, _, exprId, _))
           if !subqueryMap.contains(exprId.id) =>
         val executedPlan = compileSubquery(query)

sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/PlanAdaptiveSubqueries.scala

Lines changed: 2 additions & 0 deletions
@@ -30,6 +30,8 @@ case class PlanAdaptiveSubqueries(
     plan.transformAllExpressions {
       case expressions.ScalarSubquery(_, _, exprId) =>
         execution.ScalarSubquery(subqueryMap(exprId.id), exprId)
+      case expressions.MultiScalarSubquery(_, exprId) =>
+        execution.MultiScalarSubqueryExec(subqueryMap(exprId.id), exprId)
       case expressions.InSubquery(values, ListQuery(_, _, exprId, _)) =>
         val expr = if (values.length == 1) {
           values.head
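Together, these two AQE hooks mirror the non-adaptive `PlanSubqueries` path below: `InsertAdaptiveSparkPlan` compiles each merged plan once per `exprId`, and `PlanAdaptiveSubqueries` rewires the expression to the cached exec. A hedged sketch of exercising it (table `t(a, b)` assumed):

// Hedged sketch: the merge should survive adaptive query execution as well.
spark.conf.set("spark.sql.adaptive.enabled", "true")
spark.sql("SELECT (SELECT avg(a) FROM t), (SELECT sum(b) FROM t)").collect()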

sql/core/src/main/scala/org/apache/spark/sql/execution/subquery.scala

Lines changed: 50 additions & 0 deletions
@@ -106,6 +106,50 @@ case class ScalarSubquery(
   }
 }

+case class MultiScalarSubqueryExec(
+    plan: BaseSubqueryExec,
+    exprId: ExprId)
+  extends ExecSubqueryExpression with LeafLike[Expression] {
+
+  override def dataType: DataType = plan.schema
+  override def nullable: Boolean = true
+  override def toString: String = plan.simpleString(SQLConf.get.maxToStringFields)
+  override def withNewPlan(query: BaseSubqueryExec): MultiScalarSubqueryExec = copy(plan = query)
+
+  override def semanticEquals(other: Expression): Boolean = other match {
+    case s: MultiScalarSubqueryExec => plan.sameResult(s.plan)
+    case _ => false
+  }
+
+  // The first row collected from `plan`, kept as a single struct value.
+  @volatile private var result: Any = _
+  @volatile private var updated: Boolean = false
+
+  def updateResult(): Unit = {
+    val rows = plan.executeCollect()
+    if (rows.length > 1) {
+      sys.error(s"more than one row returned by a subquery used as an expression:\n$plan")
+    }
+    if (rows.length == 1) {
+      result = rows(0)
+    } else {
+      // If no rows are returned, the result should be null.
+      result = null
+    }
+    updated = true
+  }
+
+  override def eval(input: InternalRow): Any = {
+    require(updated, s"$this has not finished")
+    result
+  }
+
+  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
+    require(updated, s"$this has not finished")
+    Literal.create(result, dataType).doGenCode(ctx, ev)
+  }
+}
+
 /**
  * The physical node of in-subquery. This is for Dynamic Partition Pruning only, as in-subquery
  * coming from the original query will always be converted to joins.

@@ -183,6 +227,12 @@ case class PlanSubqueries(sparkSession: SparkSession) extends Rule[SparkPlan] {
         SubqueryExec.createForScalarSubquery(
           s"scalar-subquery#${subquery.exprId.id}", executedPlan),
         subquery.exprId)
+      case subquery: expressions.MultiScalarSubquery =>
+        val executedPlan = QueryExecution.prepareExecutedPlan(sparkSession, subquery.plan)
+        MultiScalarSubqueryExec(
+          SubqueryExec.createForScalarSubquery(
+            s"multi-scalar-subquery#${subquery.exprId.id}", executedPlan),
+          subquery.exprId)
       case expressions.InSubquery(values, ListQuery(query, _, exprId, _)) =>
         val expr = if (values.length == 1) {
           values.head
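`eval` hands back the entire first row as one struct value; the `GetStructField` planted by the optimizer then selects a column by ordinal. A self-contained sketch of that contract, with made-up values:

// Illustration only: the shape of the value updateResult() stores and eval() returns.
import org.apache.spark.sql.catalyst.InternalRow

val row: InternalRow = InternalRow(4.5, 45L)  // e.g. struct<avg(a):double, sum(b):bigint>
row.getDouble(0)  // 4.5 -> feeds multi-scalar-subquery#X.avg(a)
row.getLong(1)    // 45  -> feeds multi-scalar-subquery#X.sum(b)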
